add hashtags extraction and cleanse text (urls)

I added automatic hashtag extraction, and the text column now gets a space inserted before each "http"/"https" and "pic.twitter" URL so that the text can be parsed more conveniently.

add hashtags extraction and cleanse text (urls)
I added automatic hashtag extraction, and the text column now gets a space inserted before each "http"/"https" and "pic.twitter" URL so that the text can be parsed more conveniently.
1c2819c5 · @hpiedcoq · GitHub · fa2bd816 · 1c2819c5
Commit 1c2819c5 authored Feb 05, 2018 by @hpiedcoq Committed by GitHub Feb 05, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 6 additions and 3 deletions

tweep.py tweep.py +6 -3

No files found.
--- a/tweep.py
+++ b/tweep.py
@@ -9,6 +9,8 @@ import csv
 import datetime
 import json
 import sys
+import re
+

 async def getUrl(init):
 	if init == -1:
@@ -77,7 +79,8 @@ async def getTweets(init):
 		time = t.strftime("%H:%M:%S")
 		username = tweet.find("span", "username").text.replace("@", "")
 		timezone = strftime("%Z", gmtime())
-		text = tweet.find("p", "tweet-text").text.replace("\n", " ")
+		text = tweet.find("p", "tweet-text").text.replace("\n", " ").replace("http"," http").replace("pic.twitter"," pic.twitter")
+		hashtags = ",".join(re.findall(r'(?i)\#\w+', text, flags=re.UNICODE))
 		try:
 			mentions = tweet.find("div", "js-original-tweet")["data-mentions"].split(" ")
 			for i in range(len(mentions)):
@@ -92,11 +95,11 @@ async def getTweets(init):
 		elif arg.tweets:
 			output = tweets
 		else:
-			output = "{} {} {} {} <{}> {}".format(tweetid, date, time, timezone, username, text)
+			output = "{} {} {} {} <{}> {} {}".format(tweetid, date, time, timezone, username, text, hashtags)

 		if arg.o != None:
 			if arg.csv:
-				dat = [tweetid, date, time, timezone, username, text]
+				dat = [tweetid, date, time, timezone, username, text, hashtags]
 				with open(arg.o, "a", newline='') as csv_file:
 					writer = csv.writer(csv_file, delimiter="|")
 					writer.writerow(dat)