Commit 5cd65d7c authored by Francesco Poldi's avatar Francesco Poldi Committed by GitHub

Patch csv (#276)

* Fix for csv output

Now it writes to different files with custom format

* Added dict to avoid duplicate users

* Fixed duplicate

* Fixed json error, fixed favorites scraping

While scraping favorites, some tweets are missing

* Update (#275)

* Fix json

* Added new data to meta

* Fix Target profile infos are scraped at each round

* Remove UserId()

* single user info

* clean comments

* clean

* add userlist lookup

* Fix search query

* add .csv, .json to .gitignore

* modify tweet timestamp

* removed --query

* fix date

* Update README

* Readme update

* Dynamic user agent + fixing retry bug (#271)

* Add dynamic user agent and bug fix

* Changed error message

Changed the error message to clarify that these kinds of errors are not expected and that we are trying to solve them. It seems that Twitter tries to block our requests and lies about the existence of tweets.

* Updated issue_template

* Changed name to a correct one

* Fixed csv/json

Now "output" is the name of the directory to save files in
parent 55b73b08
...@@ -16,7 +16,7 @@ class Config: ...@@ -16,7 +16,7 @@ class Config:
Verified = False Verified = False
Store_csv = False Store_csv = False
Store_json = False Store_json = False
Custom = False Custom = {"tweet": None, "user": None, "username": None}
Show_hashtags = False Show_hashtags = False
Limit = None Limit = None
Count = None Count = None
......
...@@ -12,7 +12,7 @@ def Follow(response): ...@@ -12,7 +12,7 @@ def Follow(response):
try: try:
cursor = findall(r'cursor=(.*?)">', str(cursor))[0] cursor = findall(r'cursor=(.*?)">', str(cursor))[0]
except Exception as e: except Exception as e:
print(e) print(str(e) + " [x] feed.Follow")
return follow, cursor return follow, cursor
...@@ -24,7 +24,7 @@ def Mobile(response): ...@@ -24,7 +24,7 @@ def Mobile(response):
try: try:
max_id = findall(r'max_id=(.*?)">', str(max_id))[0] max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
except Exception as e: except Exception as e:
print(e) print(str(e) + " [x] feed.Mobile")
return tweets, max_id return tweets, max_id
......
...@@ -6,6 +6,7 @@ from .storage import db, elasticsearch, write, panda ...@@ -6,6 +6,7 @@ from .storage import db, elasticsearch, write, panda
#import logging #import logging
_duplicate_dict = {}
follow_object = {} follow_object = {}
tweets_object = [] tweets_object = []
...@@ -99,6 +100,7 @@ async def tweetUserData(tweet,config, conn): ...@@ -99,6 +100,7 @@ async def tweetUserData(tweet,config, conn):
async def Tweets(tw, location, config, conn): async def Tweets(tw, location, config, conn):
#logging.info("[<] " + str(datetime.now()) + ':: output+Tweets') #logging.info("[<] " + str(datetime.now()) + ':: output+Tweets')
global _duplicate_dict
copyright = tw.find("div", "StreamItemContent--withheld") copyright = tw.find("div", "StreamItemContent--withheld")
if copyright is None and is_tweet(tw): if copyright is None and is_tweet(tw):
tweet = Tweet(tw, location, config) tweet = Tweet(tw, location, config)
...@@ -106,6 +108,32 @@ async def Tweets(tw, location, config, conn): ...@@ -106,6 +108,32 @@ async def Tweets(tw, location, config, conn):
if config.Database is not None and config.User_info: if config.Database is not None and config.User_info:
await tweetUserData(tweet, config, conn) await tweetUserData(tweet, config, conn)
if config.User_info:
for user in tweet.mentions:
try:
_duplicate_dict[user["screen_name"]]
except KeyError:
_duplicate_dict[user["screen_name"]] = True
_user = user["screen_name"]
url = f"http://twitter.com/{_user}?lang=en"
await get.User(url, config, conn)
for user in tweet.tags:
try:
_duplicate_dict[user["screen_name"]]
except KeyError:
_duplicate_dict[user["screen_name"]] = True
_user = user["screen_name"]
url = f"http://twitter.com/{_user}?lang=en"
await get.User(url, config, conn)
for user in tweet.replies:
try:
_duplicate_dict[user["screen_name"]]
except KeyError:
_duplicate_dict[user["screen_name"]] = True
_user = user["screen_name"]
url = f"http://twitter.com/{_user}?lang=en"
await get.User(url, config, conn)
if datecheck(tweet.datestamp, config): if datecheck(tweet.datestamp, config):
output = format.Tweet(config, tweet) output = format.Tweet(config, tweet)
......
...@@ -11,7 +11,7 @@ class Twint: ...@@ -11,7 +11,7 @@ class Twint:
if config.Resume is not None and config.TwitterSearch: if config.Resume is not None and config.TwitterSearch:
self.init = f"TWEET-{config.Resume}-0" self.init = f"TWEET-{config.Resume}-0"
else: else:
if config.Profile_full | config.Retweets: if config.Profile_full | config.Retweets | config.Favorites:
self.init = -1 self.init = -1
else: else:
self.init = "-1" self.init = "-1"
......
...@@ -28,21 +28,41 @@ def struct(obj, custom, _type): ...@@ -28,21 +28,41 @@ def struct(obj, custom, _type):
return fieldnames, row return fieldnames, row
def createDirIfMissing(dirname):
    """Create the directory `dirname` (and any missing parents) if needed.

    Args:
        dirname: Path of the directory to create.

    Raises:
        FileExistsError: If `dirname` already exists but is not a directory.
        OSError: If the directory cannot be created for any other reason.
    """
    # exist_ok=True makes creation atomic with respect to the existence
    # check, so a directory created concurrently between "check" and
    # "create" no longer raises.
    os.makedirs(dirname, exist_ok=True)
def Csv(obj, config):
    """Append one record to the CSV file that matches the record's type.

    The record type is taken from the class name of `obj`; a plain `str`
    is stored as a "username" record. Each type is written to its own file
    (tweets.csv, users.csv or usernames.csv) inside the directory named by
    `config.Output` with any trailing file extension removed.

    Args:
        obj: The record to write.
        config: Configuration object; `config.Output` names the output
            location and `config.Custom` maps each record type to the
            custom format passed on to `struct`.

    Raises:
        KeyError: If the record type is not one of "tweet", "user" or
            "username" (or is missing from `config.Custom`).
    """
    _obj_type = obj.__class__.__name__
    if _obj_type == "str":
        _obj_type = "username"

    # splitext removes only a trailing extension. Splitting on the first "."
    # mangles paths such as "./out" or "data.v2/out" (yielding "" or "data").
    base_dir = os.path.splitext(config.Output)[0]
    Output_csv = {"tweet": os.path.join(base_dir, "tweets.csv"),
                  "user": os.path.join(base_dir, "users.csv"),
                  "username": os.path.join(base_dir, "usernames.csv")}

    fieldnames, row = struct(obj, config.Custom[_obj_type], _obj_type)

    createDirIfMissing(base_dir)

    csv_path = Output_csv[_obj_type]
    # The header is written only when this call creates the file.
    is_new_file = not os.path.exists(csv_path)
    with open(csv_path, "a", newline='', encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        if is_new_file:
            writer.writeheader()
        writer.writerow(row)
def Json(obj, config):
    """Append one record, as a single JSON line, to the file for its type.

    The record type is taken from the class name of `obj`; a plain `str`
    is stored as a "username" record. Each type is written to its own file
    (tweets.json, users.json or usernames.json) inside the directory named
    by `config.Output` with any trailing file extension removed.

    Args:
        obj: The record to write.
        config: Configuration object; `config.Output` names the output
            location and `config.Custom` maps each record type to the
            custom format passed on to `struct`.

    Raises:
        KeyError: If the record type is not one of "tweet", "user" or
            "username" (or is missing from `config.Custom`).
    """
    _obj_type = obj.__class__.__name__
    if _obj_type == "str":
        _obj_type = "username"

    # splitext removes only a trailing extension. Splitting on the first "."
    # mangles paths such as "./out" or "data.v2/out" (yielding "" or "data").
    base_dir = os.path.splitext(config.Output)[0]
    Output_json = {"tweet": os.path.join(base_dir, "tweets.json"),
                   "user": os.path.join(base_dir, "users.json"),
                   "username": os.path.join(base_dir, "usernames.json")}

    # Only the data part of struct's result is needed for JSON output.
    _, data = struct(obj, config.Custom[_obj_type], _obj_type)

    createDirIfMissing(base_dir)

    with open(Output_json[_obj_type], "a", newline='', encoding="utf-8") as json_file:
        json.dump(data, json_file, ensure_ascii=False)
        # One JSON document per line.
        json_file.write("\n")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment