Commit 1c2819c5 authored by @hpiedcoq, committed by GitHub

Add hashtag extraction and clean up text (URLs)

I added automatic extraction of hashtags, and inserted a space character before http, https and pic.twitter URLs in the text column so that the text can be parsed more conveniently.
parent fa2bd816
...@@ -9,6 +9,8 @@ import csv ...@@ -9,6 +9,8 @@ import csv
import datetime import datetime
import json import json
import sys import sys
import re
async def getUrl(init): async def getUrl(init):
if init == -1: if init == -1:
...@@ -77,7 +79,8 @@ async def getTweets(init): ...@@ -77,7 +79,8 @@ async def getTweets(init):
time = t.strftime("%H:%M:%S") time = t.strftime("%H:%M:%S")
username = tweet.find("span", "username").text.replace("@", "") username = tweet.find("span", "username").text.replace("@", "")
timezone = strftime("%Z", gmtime()) timezone = strftime("%Z", gmtime())
text = tweet.find("p", "tweet-text").text.replace("\n", " ") text = tweet.find("p", "tweet-text").text.replace("\n", " ").replace("http"," http").replace("pic.twitter"," pic.twitter")
hashtags = ",".join(re.findall(r'(?i)\#\w+', text, flags=re.UNICODE))
try: try:
mentions = tweet.find("div", "js-original-tweet")["data-mentions"].split(" ") mentions = tweet.find("div", "js-original-tweet")["data-mentions"].split(" ")
for i in range(len(mentions)): for i in range(len(mentions)):
...@@ -92,11 +95,11 @@ async def getTweets(init): ...@@ -92,11 +95,11 @@ async def getTweets(init):
elif arg.tweets: elif arg.tweets:
output = tweets output = tweets
else: else:
output = "{} {} {} {} <{}> {}".format(tweetid, date, time, timezone, username, text) output = "{} {} {} {} <{}> {} {}".format(tweetid, date, time, timezone, username, text, hashtags)
if arg.o != None: if arg.o != None:
if arg.csv: if arg.csv:
dat = [tweetid, date, time, timezone, username, text] dat = [tweetid, date, time, timezone, username, text, hashtags]
with open(arg.o, "a", newline='') as csv_file: with open(arg.o, "a", newline='') as csv_file:
writer = csv.writer(csv_file, delimiter="|") writer = csv.writer(csv_file, delimiter="|")
writer.writerow(dat) writer.writerow(dat)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment