Commit 78d76ba4 authored by Francesco Poldi, committed by GitHub

Patch csv (#276)

* Fix for csv output

Now it writes each object type to a separate file, with a custom format per type

* Added dict to avoid duplicate users

* Fixed duplicate

* Fixed json error, fixed favorites scraping

While scraping favorites, some tweets were missing

* Update (#275)

* Fix json

* Added new data to meta

* Fix: target profile info was scraped at each round

* Remove UserId()

* single user info

* clean comments

* clean

* add userlist lookup

* Fix search query

* add .csv, .json to .gitignore

* modify tweet timestamp

* removed --query

* fix date

* Update README

* Readme update

* Dynamic user agent + fixing retry bug (#271)

* Add dynamic user agent and bug fix

* Changed error message

Changed the error message to clarify that this kind of error is not expected and that we are trying to solve it; it seems that Twitter tries to block our requests and lies about the existence of tweets

* Updated issue_template

* Changed name to a correct one

* Fixed csv/json

Now "output" is the name of the directory to save files in
parent 86bfd28f
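
The net effect of the final fix, for users of the library: `Output` now names a directory rather than a single file, and tweets, users, and usernames land in separate files inside it. A minimal usage sketch, assuming the twint API of this era (`twint.Config`, `twint.run.Search`) and a made-up target account:

    import twint

    c = twint.Config()
    c.Username = "example_user"   # hypothetical target account
    c.Store_csv = True            # or c.Store_json = True
    c.Output = "results"          # now a directory: results/tweets.csv, results/users.csv, ...
    twint.run.Search(c)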
@@ -16,7 +16,7 @@ class Config:
     Verified = False
     Store_csv = False
     Store_json = False
-    Custom = False
+    Custom = {"tweet": None, "user": None, "username": None}
     Show_hashtags = False
     Limit = None
     Count = None
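
`Custom` turns from a single flag into one entry per object type, so tweet, user, and username output can each be limited to its own field set. A sketch of how a caller might fill it in, assuming custom fields are passed as lists of column names (the names below are illustrative, not prescribed by the commit):

    import twint

    c = twint.Config()
    c.Custom["tweet"] = ["id", "date", "username", "tweet"]   # assumed column names
    c.Custom["user"] = ["id", "name", "followers"]            # assumed column names
    # entries left as None keep the default full field set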
@@ -12,7 +12,7 @@ def Follow(response):
     try:
         cursor = findall(r'cursor=(.*?)">', str(cursor))[0]
     except Exception as e:
-        print(e)
+        print(str(e) + " [x] feed.Follow")
     return follow, cursor
@@ -24,7 +24,7 @@ def Mobile(response):
     try:
         max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
     except Exception as e:
-        print(e)
+        print(str(e) + " [x] feed.Mobile")
     return tweets, max_id
@@ -6,6 +6,7 @@ from .storage import db, elasticsearch, write, panda
 #import logging
+_duplicate_dict = {}
 follow_object = {}
 tweets_object = []
@@ -99,6 +100,7 @@ async def tweetUserData(tweet,config, conn):
 async def Tweets(tw, location, config, conn):
     #logging.info("[<] " + str(datetime.now()) + ':: output+Tweets')
+    global _duplicate_dict
     copyright = tw.find("div", "StreamItemContent--withheld")
     if copyright is None and is_tweet(tw):
         tweet = Tweet(tw, location, config)
@@ -106,6 +108,32 @@ async def Tweets(tw, location, config, conn):
         if config.Database is not None and config.User_info:
             await tweetUserData(tweet, config, conn)
+        if config.User_info:
+            for user in tweet.mentions:
+                try:
+                    _duplicate_dict[user["screen_name"]]
+                except KeyError:
+                    _duplicate_dict[user["screen_name"]] = True
+                    _user = user["screen_name"]
+                    url = f"http://twitter.com/{_user}?lang=en"
+                    await get.User(url, config, conn)
+            for user in tweet.tags:
+                try:
+                    _duplicate_dict[user["screen_name"]]
+                except KeyError:
+                    _duplicate_dict[user["screen_name"]] = True
+                    _user = user["screen_name"]
+                    url = f"http://twitter.com/{_user}?lang=en"
+                    await get.User(url, config, conn)
+            for user in tweet.replies:
+                try:
+                    _duplicate_dict[user["screen_name"]]
+                except KeyError:
+                    _duplicate_dict[user["screen_name"]] = True
+                    _user = user["screen_name"]
+                    url = f"http://twitter.com/{_user}?lang=en"
+                    await get.User(url, config, conn)
         if datecheck(tweet.datestamp, config):
             output = format.Tweet(config, tweet)
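
The three new loops repeat one pattern over mentions, tags, and replies: look the screen name up in `_duplicate_dict` and fetch the profile only when the lookup raises `KeyError`, i.e. the first time that name is seen. A minimal sketch of the same idea factored into a helper; the helper name is hypothetical, and `get` is the module output.py already uses:

    _duplicate_dict = {}

    async def _fetch_user_once(user, config, conn):
        # Fetch a profile only the first time its screen name appears.
        screen_name = user["screen_name"]
        if screen_name not in _duplicate_dict:  # same effect as the try/except KeyError
            _duplicate_dict[screen_name] = True
            url = f"http://twitter.com/{screen_name}?lang=en"
            await get.User(url, config, conn)

    # in Tweets(), the three loops would then collapse to:
    #     for user in tweet.mentions + tweet.tags + tweet.replies:
    #         await _fetch_user_once(user, config, conn)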
@@ -11,7 +11,7 @@ class Twint:
         if config.Resume is not None and config.TwitterSearch:
             self.init = f"TWEET-{config.Resume}-0"
         else:
-            if config.Profile_full | config.Retweets:
+            if config.Profile_full | config.Retweets | config.Favorites:
                 self.init = -1
             else:
                 self.init = "-1"
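
Note that `|` is Python's bitwise OR: on plain `bool` operands it yields the same truth value as `or`, but evaluates every operand instead of short-circuiting. Any of the three active modes therefore selects the integer sentinel `-1` instead of the string `"-1"`. A quick illustration:

    profile_full, retweets, favorites = False, False, True   # example flag values
    init = -1 if profile_full | retweets | favorites else "-1"
    print(init)   # -1 (the int), because one flag is set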
@@ -28,21 +28,41 @@ def struct(obj, custom, _type):
     return fieldnames, row
 
+def createDirIfMissing(dirname):
+    if not os.path.exists(dirname):
+        os.makedirs(dirname)
+
 def Csv(obj, config):
-    fieldnames, row = struct(obj, config.Custom, Type(config))
+    _obj_type = obj.__class__.__name__
+    if _obj_type == "str": _obj_type = "username"
+    Output_csv = {"tweet": config.Output.split(".")[0] + "/tweets.csv",
+                  "user": config.Output.split(".")[0] + "/users.csv",
+                  "username": config.Output.split(".")[0] + "/usernames.csv"}
+    fieldnames, row = struct(obj, config.Custom[_obj_type], _obj_type)
+    createDirIfMissing(config.Output.split(".")[0])
-    if not (os.path.exists(config.Output)):
-        with open(config.Output, "w", newline='', encoding="utf-8") as csv_file:
+    if not (os.path.exists(Output_csv[_obj_type])):
+        with open(Output_csv[_obj_type], "w", newline='', encoding="utf-8") as csv_file:
             writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
             writer.writeheader()
-    with open(config.Output, "a", newline='', encoding="utf-8") as csv_file:
+    with open(Output_csv[_obj_type], "a", newline='', encoding="utf-8") as csv_file:
         writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
         writer.writerow(row)
 
 def Json(obj, config):
-    null, data = struct(obj, config.Custom, Type(config))
+    _obj_type = obj.__class__.__name__
+    if _obj_type == "str": _obj_type = "username"
+    Output_json = {"tweet": config.Output.split(".")[0] + "/tweets.json",
+                   "user": config.Output.split(".")[0] + "/users.json",
+                   "username": config.Output.split(".")[0] + "/usernames.json"}
+    null, data = struct(obj, config.Custom[_obj_type], _obj_type)
+    createDirIfMissing(config.Output.split(".")[0])
-    with open(config.Output, "a", newline='', encoding="utf-8") as json_file:
+    with open(Output_json[_obj_type], "a", newline='', encoding="utf-8") as json_file:
         json.dump(data, json_file, ensure_ascii=False)
         json_file.write("\n")
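
Both writers derive the target directory the same way: `split(".")[0]` drops any extension the caller put on `config.Output`, the directory is created if missing, and each object type gets its own file inside it. A small sketch of just the path logic, with an assumed `Output` value:

    output = "results.csv"           # assumed example; a supplied extension is dropped
    dirname = output.split(".")[0]   # -> "results"
    paths = {t: dirname + "/" + t + "s.csv" for t in ("tweet", "user", "username")}
    print(paths["username"])         # results/usernames.csv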