Commit f7dfc15b, authored by Cody Zacharias and committed by GitHub

Merge pull request #21 from hpiedcoq/patch-1

add hashtags extraction and cleanse text (urls)
parents fa2bd816 51ecfe59
...@@ -8,6 +8,7 @@ import async_timeout ...@@ -8,6 +8,7 @@ import async_timeout
import csv import csv
import datetime import datetime
import json import json
import re
import sys import sys
async def getUrl(init): async def getUrl(init):
...@@ -77,7 +78,8 @@ async def getTweets(init): ...@@ -77,7 +78,8 @@ async def getTweets(init):
time = t.strftime("%H:%M:%S") time = t.strftime("%H:%M:%S")
username = tweet.find("span", "username").text.replace("@", "") username = tweet.find("span", "username").text.replace("@", "")
timezone = strftime("%Z", gmtime()) timezone = strftime("%Z", gmtime())
text = tweet.find("p", "tweet-text").text.replace("\n", " ") text = tweet.find("p", "tweet-text").text.replace("\n", " ").replace("http"," http").replace("pic.twitter"," pic.twitter")
hashtags = ",".join(re.findall(r'(?i)\#\w+', text, flags=re.UNICODE))
try: try:
mentions = tweet.find("div", "js-original-tweet")["data-mentions"].split(" ") mentions = tweet.find("div", "js-original-tweet")["data-mentions"].split(" ")
for i in range(len(mentions)): for i in range(len(mentions)):
...@@ -93,10 +95,12 @@ async def getTweets(init): ...@@ -93,10 +95,12 @@ async def getTweets(init):
output = tweets output = tweets
else: else:
output = "{} {} {} {} <{}> {}".format(tweetid, date, time, timezone, username, text) output = "{} {} {} {} <{}> {}".format(tweetid, date, time, timezone, username, text)
if arg.hashtags:
output+= " {}".format(hashtags)
if arg.o != None: if arg.o != None:
if arg.csv: if arg.csv:
dat = [tweetid, date, time, timezone, username, text] dat = [tweetid, date, time, timezone, username, text, hashtags]
with open(arg.o, "a", newline='') as csv_file: with open(arg.o, "a", newline='') as csv_file:
writer = csv.writer(csv_file, delimiter="|") writer = csv.writer(csv_file, delimiter="|")
writer.writerow(dat) writer.writerow(dat)
...@@ -128,6 +132,7 @@ if __name__ == "__main__": ...@@ -128,6 +132,7 @@ if __name__ == "__main__":
ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).", action="store_true") ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).", action="store_true")
ap.add_argument("--users", help="Display users only (Use with -s).", action="store_true") ap.add_argument("--users", help="Display users only (Use with -s).", action="store_true")
ap.add_argument("--csv", help="Write as .csv file.", action="store_true") ap.add_argument("--csv", help="Write as .csv file.", action="store_true")
ap.add_argument("--hashtags", help="Output hashtags in seperate column.", action="store_true")
arg = ap.parse_args() arg = ap.parse_args()
if arg.u is not None: if arg.u is not None:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment