Commit 78d76ba4 authored by Francesco Poldi, committed by GitHub

Patch csv (#276)

* Fix for csv output

Now it writes each object type to a separate file, with a custom format per type

* Added dict to avoid duplicate users

* Fixed duplicate

* Fixed json error, fixed favorites scraping

While scraping favorites, some tweets were missing

* Update (#275)

* Fix json

* Added new data to meta

* Fix: target profile info was scraped at each round

* Remove UserId()

* single user info

* clean comments

* clean

* add userlist lookup

* Fix search query

* add .csv, .json to .gitignore

* modify tweet timestamp

* removed --query

* fix date

* Update README

* Readme update

* Dynamic user agent + fixing retry bug (#271)

* Add dynamic user agent and bug fix

* Changed error message

Changed the error message to clarify that this kind of error is not expected and that we are trying to solve it; it seems that Twitter tries to block our requests and lies about the existence of tweets

* Updated issue_template

* Changed name to a correct one

* Fixed csv/json

Now "output" is the name of the directory to save files in
parent 86bfd28f
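
The net effect of the final fix, for users of the library: `Output` now names a directory rather than a single file, and tweets, users, and usernames land in separate files inside it. A minimal usage sketch, assuming the twint API of this era (`twint.Config`, `twint.run.Search`) and a made-up target account:

    import twint

    c = twint.Config()
    c.Username = "example_user"   # hypothetical target account
    c.Store_csv = True            # or c.Store_json = True
    c.Output = "results"          # now a directory: results/tweets.csv, results/users.csv, ...
    twint.run.Search(c)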
@@ -16,7 +16,7 @@ class Config:
     Verified = False
     Store_csv = False
     Store_json = False
-    Custom = False
+    Custom = {"tweet": None, "user": None, "username": None}
     Show_hashtags = False
     Limit = None
     Count = None
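
`Custom` turns from a single flag into one entry per object type, so tweet, user, and username output can each be limited to its own field set. A sketch of how a caller might fill it in, assuming custom fields are passed as lists of column names (the names below are illustrative, not prescribed by the commit):

    import twint

    c = twint.Config()
    c.Custom["tweet"] = ["id", "date", "username", "tweet"]   # assumed column names
    c.Custom["user"] = ["id", "name", "followers"]            # assumed column names
    # entries left as None keep the default full field set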
@@ -12,7 +12,7 @@ def Follow(response):
     try:
         cursor = findall(r'cursor=(.*?)">', str(cursor))[0]
     except Exception as e:
-        print(e)
+        print(str(e) + " [x] feed.Follow")
     return follow, cursor
@@ -24,7 +24,7 @@ def Mobile(response):
     try:
         max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
     except Exception as e:
-        print(e)
+        print(str(e) + " [x] feed.Mobile")
     return tweets, max_id
@@ -6,6 +6,7 @@ from .storage import db, elasticsearch, write, panda
 #import logging
+_duplicate_dict = {}
 follow_object = {}
 tweets_object = []
@@ -99,6 +100,7 @@ async def tweetUserData(tweet,config, conn):
 async def Tweets(tw, location, config, conn):
     #logging.info("[<] " + str(datetime.now()) + ':: output+Tweets')
+    global _duplicate_dict
     copyright = tw.find("div", "StreamItemContent--withheld")
     if copyright is None and is_tweet(tw):
         tweet = Tweet(tw, location, config)
@@ -106,6 +108,32 @@ async def Tweets(tw, location, config, conn):
         if config.Database is not None and config.User_info:
             await tweetUserData(tweet, config, conn)
+        if config.User_info:
+            for user in tweet.mentions:
+                try:
+                    _duplicate_dict[user["screen_name"]]
+                except KeyError:
+                    _duplicate_dict[user["screen_name"]] = True
+                    _user = user["screen_name"]
+                    url = f"http://twitter.com/{_user}?lang=en"
+                    await get.User(url, config, conn)
+            for user in tweet.tags:
+                try:
+                    _duplicate_dict[user["screen_name"]]
+                except KeyError:
+                    _duplicate_dict[user["screen_name"]] = True
+                    _user = user["screen_name"]
+                    url = f"http://twitter.com/{_user}?lang=en"
+                    await get.User(url, config, conn)
+            for user in tweet.replies:
+                try:
+                    _duplicate_dict[user["screen_name"]]
+                except KeyError:
+                    _duplicate_dict[user["screen_name"]] = True
+                    _user = user["screen_name"]
+                    url = f"http://twitter.com/{_user}?lang=en"
+                    await get.User(url, config, conn)
         if datecheck(tweet.datestamp, config):
             output = format.Tweet(config, tweet)
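
The three new loops repeat one pattern over mentions, tags, and replies: look the screen name up in `_duplicate_dict` and fetch the profile only when the lookup raises `KeyError`, i.e. the first time that name is seen. A minimal sketch of the same idea factored into a helper; the helper name is hypothetical, and `get` is the module output.py already uses:

    _duplicate_dict = {}

    async def _fetch_user_once(user, config, conn):
        # Fetch a profile only the first time its screen name appears.
        screen_name = user["screen_name"]
        if screen_name not in _duplicate_dict:  # same effect as the try/except KeyError
            _duplicate_dict[screen_name] = True
            url = f"http://twitter.com/{screen_name}?lang=en"
            await get.User(url, config, conn)

    # in Tweets(), the three loops would then collapse to:
    #     for user in tweet.mentions + tweet.tags + tweet.replies:
    #         await _fetch_user_once(user, config, conn)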
@@ -11,7 +11,7 @@ class Twint:
         if config.Resume is not None and config.TwitterSearch:
             self.init = f"TWEET-{config.Resume}-0"
         else:
-            if config.Profile_full | config.Retweets:
+            if config.Profile_full | config.Retweets | config.Favorites:
                 self.init = -1
             else:
                 self.init = "-1"
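
Note that `|` is Python's bitwise OR: on plain `bool` operands it yields the same truth value as `or`, but evaluates every operand instead of short-circuiting. Any of the three active modes therefore selects the integer sentinel `-1` instead of the string `"-1"`. A quick illustration:

    profile_full, retweets, favorites = False, False, True   # example flag values
    init = -1 if profile_full | retweets | favorites else "-1"
    print(init)   # -1 (the int), because one flag is set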
@@ -28,21 +28,41 @@ def struct(obj, custom, _type):
     return fieldnames, row
 
+def createDirIfMissing(dirname):
+    if not os.path.exists(dirname):
+        os.makedirs(dirname)
+
 def Csv(obj, config):
-    fieldnames, row = struct(obj, config.Custom, Type(config))
+    _obj_type = obj.__class__.__name__
+    if _obj_type == "str": _obj_type = "username"
+    Output_csv = {"tweet": config.Output.split(".")[0] + "/tweets.csv",
+                  "user": config.Output.split(".")[0] + "/users.csv",
+                  "username": config.Output.split(".")[0] + "/usernames.csv"}
+    fieldnames, row = struct(obj, config.Custom[_obj_type], _obj_type)
+    createDirIfMissing(config.Output.split(".")[0])
-    if not (os.path.exists(config.Output)):
-        with open(config.Output, "w", newline='', encoding="utf-8") as csv_file:
+    if not (os.path.exists(Output_csv[_obj_type])):
+        with open(Output_csv[_obj_type], "w", newline='', encoding="utf-8") as csv_file:
             writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
             writer.writeheader()
-    with open(config.Output, "a", newline='', encoding="utf-8") as csv_file:
+    with open(Output_csv[_obj_type], "a", newline='', encoding="utf-8") as csv_file:
         writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
         writer.writerow(row)
 
 def Json(obj, config):
-    null, data = struct(obj, config.Custom, Type(config))
+    _obj_type = obj.__class__.__name__
+    if _obj_type == "str": _obj_type = "username"
+    Output_json = {"tweet": config.Output.split(".")[0] + "/tweets.json",
+                   "user": config.Output.split(".")[0] + "/users.json",
+                   "username": config.Output.split(".")[0] + "/usernames.json"}
+    null, data = struct(obj, config.Custom[_obj_type], _obj_type)
+    createDirIfMissing(config.Output.split(".")[0])
-    with open(config.Output, "a", newline='', encoding="utf-8") as json_file:
+    with open(Output_json[_obj_type], "a", newline='', encoding="utf-8") as json_file:
         json.dump(data, json_file, ensure_ascii=False)
         json_file.write("\n")
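
Both writers derive the target directory the same way: `split(".")[0]` drops any extension the caller put on `config.Output`, the directory is created if missing, and each object type gets its own file inside it. A small sketch of just the path logic, with an assumed `Output` value:

    output = "results.csv"           # assumed example; a supplied extension is dropped
    dirname = output.split(".")[0]   # -> "results"
    paths = {t: dirname + "/" + t + "s.csv" for t in ("tweet", "user", "username")}
    print(paths["username"])         # results/usernames.csv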