Commit 84962bb6 authored by Francesco Poldi's avatar Francesco Poldi Committed by GitHub

Finished DataFrame storing option (#224)

* Update (#174)

* add function to clean accumulated pandas storage data

* Fixed typo, dataname, removed attributes

* Added config options and config var

* Added autoclean

Works for search now

* Added Elasticsearch count options

* Added silent output and objects for users and followers

* Update

* Clean following/followers attr

* Final construct of object

* Redesign

* Little fix

* Debug

* Debug

* Globals

* Removed debug

* Globals pt 2

* Mix

* Added _old_obj to store previous scrape

* Prefix

* Pre fix pt 2

* commented

* Fix for object follow

* Update

* Update

* Completed follow_object

* Pandas object for followers and following

* Finished pandas object for followers and following

* Added docstrings in Twint.py

* Added lowercase

#170

* Finished lower case

Close #170

* Fix defaults

* Added some edits

In `panda.py`, changed the structure of the DataFrame for the users one is following/followed by; in `config.py`, added autoupdate so that one does not have to call `storage.panda.get()` at every run; in `output.py`, edited `follow_object`; in `run.py`, added the panda autoupdate calls; in `tweet.py`, just some docstrings.
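
A minimal sketch of how the new Pandas options fit together, using a placeholder account name and assuming the `Search()` helper changed in `run.py` is exposed as `twint.run.Search`:

```python
import twint

# Sketch only: "someuser" is a placeholder, and twint.run.Search is assumed
# to be the public entry point for the Search() helper edited in run.py.
c = twint.Config()
c.Username = "someuser"
c.Pandas = True           # store scraped objects for the Pandas backend
c.Pandas_au = True        # auto-populate the module-level DataFrames after a run
c.Pandas_clean = True     # wipe any blocks accumulated by a previous scrape

twint.run.Search(c)

# After the run, Tweets end up in the module-level DataFrame in storage/panda.py.
tweets = twint.storage.panda.Tweets_df
print(tweets[["date", "username", "tweet"]].head())
```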

* Update for panda and objects

* Finished storing data into dataframes #173

Now followers, following, tweets, and user details are saved in dataframes
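
A similar sketch for the follow side, again with a placeholder account and the assumed `twint.run` entry points; `save()`'s default dataname is outside the hunks shown here, so it is passed explicitly:

```python
import twint

c = twint.Config()
c.Username = "someuser"   # placeholder account
c.Pandas = True
c.Pandas_au = True
c.User_full = True        # also collect full user details into User_df

twint.run.Followers(c)    # Followers() in run.py calls storage.panda._autoget("followers")

follows = twint.storage.panda.Follow_df   # follower relations for the account
users = twint.storage.panda.User_df       # user details (only with User_full)

# Persist with the storage helpers; save() appends ".h5" by default,
# or ".pkl" when type="Pickle" is passed.
twint.storage.panda.save("followers_data", follows, dataname="followers")
```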
parent 421205c2
......@@ -8,17 +8,20 @@ https://github.com/haccer/twint/wiki
Licensed under MIT License
Copyright (c) 2018 Cody Zacharias
'''
import argparse
import twint
import sys
import os
import argparse
import twint
def error(error, message):
print("[-] {}: {}".format(error, message))
def error(_error, message):
""" Print errors to stdout
"""
print("[-] {}: {}".format(_error, message))
sys.exit(0)
def check(args):
# Error checking
""" Error checking
"""
if args.username is not None:
if args.verified:
error("Contradicting Args",
......@@ -42,8 +45,9 @@ def check(args):
# Proxy stuff
if args.proxy_host is not None:
import socks
import socket
if args.proxy_host.lower() == "tor":
import socks, socket
socks.set_default_proxy(socks.SOCKS5, "localhost", 9050)
socket.socket = socks.socksocket
elif args.proxy_port and args.proxy_type:
......@@ -55,7 +59,6 @@ def check(args):
_type = socks.HTTP
else:
error("Error", "Proxy types allowed are: socks5, socks4, and http.")
import socks, socket
socks.set_default_proxy(_type, args.proxy_host, int(args.proxy_port))
socket.socket = socks.socksocket
else:
......@@ -64,20 +67,23 @@ def check(args):
if args.proxy_port or args.proxy_type:
error("Error", "Please specify --proxy-host, --proxy-port, and --proxy-type")
def loadUserList(ul, type):
def loadUserList(ul, _type):
""" Concatenate users
"""
if os.path.exists(os.path.abspath(ul)):
userlist = open(os.path.abspath(ul), "r").read().splitlines()
else:
userlist = ul.split(",")
if type == "search":
if _type == "search":
un = ""
for user in userlist:
un += "%20OR%20from%3A" + user
return un[15:]
else:
return userlist
def initialize(args):
""" Set default values for config from args
"""
c = twint.Config()
c.Username = args.username
c.User_id = args.userid
......@@ -123,9 +129,12 @@ def initialize(args):
c.Media = args.media
c.Replies = args.replies
c.Pandas_clean = args.pandas_clean
c.ES_count = {"likes":True, "replies":True, "retweets":True}
return c
def options():
""" Parse arguments
"""
ap = argparse.ArgumentParser(prog="Twint.py",
usage="python3 %(prog)s [options]",
description="TWINT - An Advanced Twitter Scraping Tool.")
......@@ -151,7 +160,8 @@ def options():
ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")
ap.add_argument("--count", help="Display number of Tweets scraped at the end of session.",
action="store_true")
ap.add_argument("--stats", help="Show number of replies, retweets, and likes.", action="store_true")
ap.add_argument("--stats", help="Show number of replies, retweets, and likes.",
action="store_true")
ap.add_argument("--hostname", help="Store the mysql database host")
ap.add_argument("-db", "--database", help="Store Tweets in a sqlite3 or mysql database.")
ap.add_argument("--DB_user", help="Store the mysql database user")
......@@ -164,53 +174,71 @@ def options():
ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.")
ap.add_argument("--proxy-host", help="Proxy hostname or IP.")
ap.add_argument("--proxy-port", help="The port of the proxy server.")
ap.add_argument("--essid", help="Elasticsearch Session ID, use this to differentiate scraping sessions.")
ap.add_argument("--essid",
help="Elasticsearch Session ID, use this to differentiate scraping sessions.",
nargs="?", default="")
ap.add_argument("--userlist", help="Userlist from list or file.")
ap.add_argument("--retweets", help="Include user's Retweets (Warning: limited).", action="store_true")
ap.add_argument("--retweets",
help="Include user's Retweets (Warning: limited).",
action="store_true")
ap.add_argument("--format", help="Custom output format (See wiki for details).")
ap.add_argument("--user-full", help="Collect all user information (Use with followers or following only).",
ap.add_argument("--user-full",
help="Collect all user information (Use with followers or following only).",
action="store_true")
ap.add_argument("--profile-full",
help="Slow, but effective method of collecting a user's Tweets (Including Retweets).",
help="Slow, but effective method of collecting a user's Tweets and RT.",
action="store_true")
ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.")
ap.add_argument("--pandas-type", help="Specify HDF5 or Pickle (HDF5 as default)")
ap.add_argument("--search_name", help="Name for identify the search like -3dprinter stuff- only for mysql")
ap.add_argument("-it", "--index-tweets", help="Custom Elasticsearch Index name for Tweets.")
ap.add_argument("-if", "--index-follow", help="Custom Elasticsearch Index name for Follows.")
ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.")
ap.add_argument("--debug", help="Store information in debug logs", action="store_true")
ap.add_argument("--pandas-type",
help="Specify HDF5 or Pickle (HDF5 as default)", nargs="?", default="HDF5")
ap.add_argument("--search_name",
help="Name for identify the search like -3dprinter stuff- only for mysql")
ap.add_argument("-it", "--index-tweets",
help="Custom Elasticsearch Index name for Tweets.", nargs="?", default="twint")
ap.add_argument("-if", "--index-follow",
help="Custom Elasticsearch Index name for Follows.",
nargs="?", default="twintGraph")
ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.",
nargs="?", default="twintUser")
ap.add_argument("--debug",
help="Store information in debug logs", action="store_true")
ap.add_argument("--resume", help="Resume from Tweet ID.")
ap.add_argument("--videos", help="Display only Tweets with videos.", action="store_true")
ap.add_argument("--images", help="Display only Tweets with images.", action="store_true")
ap.add_argument("--media", help="Display Tweets with only images or videos.", action="store_true")
ap.add_argument("--media",
help="Display Tweets with only images or videos.", action="store_true")
ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
ap.add_argument("-pc", "--pandas-clean",
help="Automatically clean Pandas dataframe at every scrape.")
ap.add_argument("-ec", "--es-count", nargs="?", default="",
help="What NOT to count: likes, replies, retweets; only for Elasticsearch.")
ap.add_argument("-pc","--pandas-clean", help="Automatically clean Pandas dataframe at every scrape.")
args = ap.parse_args()
return args
def main():
""" Main
"""
args = options()
check(args)
if args.userlist:
args.username = loadUserList(args.userlist, "search")
if not args.pandas_type:
args.pandas_type = "HDF5"
if args.pandas_clean:
twint.storage.panda.clean()
if not args.index_tweets:
args.index_tweets = "twint"
c = initialize(args)
if not args.index_follow:
args.index_follow = "twintGraph"
if "likes" in str(args.es_count):
c.ES_count["likes"] = False
if not args.index_users:
args.index_users = "twintUser"
if "replies" in str(args.es_count):
c.ES_count["replies"] = False
if not args.essid:
args.essid = ""
if "retweets" in str(args.es_count):
c.ES_count["retweets"] = False
if args.pandas_clean:
twint.storage.panda.clean()
......
......@@ -49,3 +49,6 @@ class Config:
Media = False
Replies = False
Pandas_clean = True
ES_count = {"likes":True,"replies":True,"retweets":True}
Lowercase = False
Pandas_au = True
......@@ -4,7 +4,16 @@ from .user import User
from datetime import datetime
from .storage import db, elasticsearch, write, panda
follow_object = {}
tweets_object = []
user_object = []
_follow_list = []
def clean_follow_list():
global _follow_list
_follow_list = []
def datecheck(datestamp, config):
if config.Since and config.Until:
......@@ -21,7 +30,13 @@ def is_tweet(tw):
except:
return False
def _output(obj, output, config):
def _output(obj, output, config, **extra):
if config.Lowercase:
obj.username = obj.username.lower()
for i in range(len(obj.mentions)):
obj.mentions[i] = obj.mentions[i].lower()
for i in range(len(obj.hashtags)):
obj.hashtags[i] = obj.hashtags[i].lower()
if config.Output != None:
if config.Store_csv:
try :
......@@ -33,13 +48,15 @@ def _output(obj, output, config):
else:
write.Text(output, config.Output)
if config.Pandas:
panda.update(obj, config.Essid)
if config.Pandas and config.User_full:
panda.update(obj, config)
if extra.get("follow_list"):
follow_object.username = config.Username
follow_object.action = config.Following*"following" + config.Followers*"followers"
follow_object.users = _follow_list
panda.update(follow_object, config.Essid)
if config.Elasticsearch:
if config.Store_object:
tweets_object.append(obj)
else:
print(output, end=".", flush=True)
print("", end=".", flush=True)
else:
if config.Store_object:
tweets_object.append(obj)
......@@ -63,9 +80,14 @@ async def Tweets(tw, location, config, conn):
if config.Elasticsearch:
elasticsearch.Tweet(tweet, config)
if config.Store_object:
tweets_object.append(tweet) #twint.tweet.tweet
_output(tweet, output, config)
async def Users(u, config, conn):
global user_object
user = User(u)
output = format.User(config.Format, user)
......@@ -81,13 +103,27 @@ async def Users(u, config, conn):
user.join_date = _save_date
user.join_time = _save_time
if config.Store_object:
user_object.append(user) # twint.user.user
_output(user, output, config)
async def Username(username, config, conn):
global follow_object
follow_var = config.Following*"following" + config.Followers*"followers"
if config.Database:
db.follow(conn, config.Username, config.Followers, username)
if config.Elasticsearch:
elasticsearch.Follow(username, config)
_output(username, username, config)
if config.Store_object or config.Pandas:
try:
_ = follow_object[config.Username][follow_var]
except KeyError:
follow_object.update({config.Username: {follow_var: []}})
follow_object[config.Username][follow_var].append(username)
if config.Pandas_au:
panda.update(follow_object[config.Username], config)
_output(username, username, config, follow_list=_follow_list)
......@@ -16,6 +16,9 @@ class Twint:
self.d = datelock.Set(self.config.Until, self.config.Since)
verbose.Elastic(config.Elasticsearch)
if self.config.Store_object:
output.clean_follow_list()
if self.config.Pandas_clean:
storage.panda.clean()
......@@ -124,17 +127,35 @@ def Favorites(config):
run(config)
def Followers(config):
output.clean_follow_list()
config.Followers = True
config.Following = False
run(config)
if config.Pandas_au:
storage.panda._autoget("followers")
if config.User_full:
storage.panda._autoget("user")
storage.panda.clean()
def Following(config):
output.clean_follow_list()
config.Following = True
config.Followers = False
run(config)
if config.Pandas_au:
storage.panda._autoget("following")
if config.User_full:
storage.panda._autoget("user")
storage.panda.clean()
def Profile(config):
config.Profile = True
run(config)
def Search(config):
config.TwitterSearch = True
config.Following = False
config.Followers = False
run(config)
if config.Pandas_au:
storage.panda._autoget("tweet")
......@@ -73,6 +73,7 @@ def Tweet(Tweet, config):
}
actions.append(j_data)
if config.ES_count["likes"] is not False:
for l in range(int(Tweet.likes)):
j_data = {
"_index": config.Index_tweets,
......@@ -99,6 +100,7 @@ def Tweet(Tweet, config):
actions.append(j_data)
nLikes += 1
if config.ES_count["replies"] is not False:
for rep in range(int(Tweet.replies)):
j_data = {
"_index": config.Index_tweets,
......@@ -125,6 +127,7 @@ def Tweet(Tweet, config):
actions.append(j_data)
nReplies += 1
if config.ES_count["retweets"] is not False:
for ret in range(int(Tweet.retweets)):
j_data = {
"_index": config.Index_tweets,
......
from .elasticsearch import *
from time import strftime, localtime
import pandas as pd
import warnings
from .elasticsearch import *
Tweets_df = None
Follow_df = None
User_df = None
_object_blocks = {
"tweet": [],
"user": [],
"following": [],
"followers": []
}
_type = ""
def _concat(df, type):
if df is None:
df = pd.DataFrame(_object_blocks[type])
else:
_df = pd.DataFrame(_object_blocks[type])
df = pd.concat([df, _df], sort=True)
return df
def _autoget(type):
global Tweets_df
global Follow_df
global User_df
if type == "tweet":
Tweets_df = _concat(Tweets_df, type)
if type == "followers" or type == "following":
Follow_df = _concat(Follow_df, type)
if type == "user":
User_df = _concat(User_df, type)
_blocks = []
def update(Tweet, session):
dt = f"{Tweet.datestamp} {Tweet.timestamp}"
def update(object, config):
global _type
try:
_type = ((object.type == "tweet")*"tweet" +
(object.type == "user")*"user")
except AttributeError:
_type = config.Following*"following" + config.Followers*"followers"
if _type == "tweet":
dt = f"{object.datestamp} {object.timestamp}"
_data = {
"id": Tweet.id,
"id": object.id,
"date": dt,
"timezone": Tweet.timezone,
"location": Tweet.location,
"tweet": Tweet.tweet,
"hashtags": Tweet.hashtags,
"user_id": Tweet.user_id,
"username": Tweet.username,
"link": Tweet.link,
"retweet": Tweet.retweet,
"user_rt": Tweet.user_rt,
"essid": str(session),
'mentions': Tweet.mentions
"timezone": object.timezone,
"location": object.location,
"tweet": object.tweet,
"hashtags": object.hashtags,
"user_id": object.user_id,
"username": object.username,
"link": object.link,
"retweet": object.retweet,
"user_rt": object.user_rt,
"essid": config.Essid,
'mentions': object.mentions
}
_object_blocks[_type].append(_data)
elif _type == "user":
_data = {
"id": object.id,
"name": object.name,
"username": object.username,
"bio": object.bio,
"location": object.location,
"url": object.url,
"join_datetime": object.join_date + " " + object.join_time,
"join_date": object.join_date,
"join_time": object.join_time,
"tweets": object.tweets,
"following": object.following,
"followers": object.followers,
"likes": object.likes,
"media": object.media_count,
"private": object.is_private,
"verified": object.is_verified,
"avatar": object.avatar,
"session": str(config.Essid)
}
_object_blocks[_type].append(_data)
elif _type == "followers" or _type == "following":
_data = {
config.Following*"following" + config.Followers*"followers" :
{config.Username: object[_type]}
}
_blocks.append(_data)
_object_blocks[_type] = _data
else:
print("Wrong type of object passed!")
def get():
df = pd.DataFrame(_blocks)
return df
def clean():
_blocks.clear()
_object_blocks["tweet"].clear()
_object_blocks["following"].clear()
_object_blocks["followers"].clear()
_object_blocks["user"].clear()
def save(_filename, _dataframe, **options):
if options.get("dataname"):
......@@ -41,15 +110,16 @@ def save(_filename, _dataframe, **options):
if not options.get("type"):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
_store = pd.HDFStore(_filename)
_store = pd.HDFStore(_filename + ".h5")
_store[_dataname] = _dataframe
_store.close()
elif options.get("type") == "Pickle":
with warnings.catch_warnings():
warnings.simplefilter("ignore")
_dataframe.to_pickle(_filename)
_dataframe.to_pickle(_filename + ".pkl")
else:
print("Please specify: filename, DataFrame, DataFrame name and type (HDF5, default, or Pickle")
print("""Please specify: filename, DataFrame, DataFrame name and type
(HDF5, default, or Pickle)""")
def read(_filename, **options):
if not options.get("dataname"):
......@@ -58,11 +128,12 @@ def read(_filename, **options):
_dataname = options.get("dataname")
if not options.get("type"):
_store = pd.HDFStore(_filename)
df = _store[_dataname]
return df
_store = pd.HDFStore(_filename + ".h5")
_df = _store[_dataname]
return _df
elif options.get("type") == "Pickle":
df = pd.read_pickle(_filename)
return df
_df = pd.read_pickle(_filename + ".pkl")
return _df
else:
print("Please specify: DataFrame, DataFrame name (twint as default), filename and type (HDF5, default, or Pickle")
print("""Please specify: DataFrame, DataFrame name (twint as default),
filename and type (HDF5, default, or Pickle""")
......@@ -2,9 +2,16 @@ from time import strftime, localtime
import re
class tweet:
"""Define Tweet class
"""
type = "tweet"
def __init__(self):
pass
def getMentions(tw):
"""Extract ment from tweet
"""
try:
mentions = tw.find("div", "js-original-tweet")["data-mentions"].split(" ")
except:
......@@ -13,6 +20,8 @@ def getMentions(tw):
return mentions
def getText(tw):
"""Replace some text
"""
text = tw.find("p", "tweet-text").text
text = text.replace("\n", " ")
text = text.replace("http", " http")
......@@ -33,9 +42,13 @@ def getTweet(tw, mentions):
return text
def getHashtags(text):
"""Get hashtags of tweet
"""
return re.findall(r'(?i)\#\w+', text, flags=re.UNICODE)
def getStat(tw, _type):
"""Get stats about Tweet
"""
st = f"ProfileTweet-action--{_type} u-hiddenVisually"
return tw.find("span", st).find("span")["data-tweet-stat-count"]
......@@ -44,6 +57,8 @@ def getRetweet(profile, username, user):
return True
def getUser_rt(profile, username, user):
"""Get username that retweeted
"""
if getRetweet(profile, username, user):
user_rt = user
else:
......@@ -52,6 +67,8 @@ def getUser_rt(profile, username, user):
return user_rt
def Tweet(tw, location, config):
"""Create Tweet object
"""
t = tweet()
t.id = tw.find("div")["data-item-id"]
t.datetime = int(tw.find("span", "_timestamp")["data-time"])
......
class user:
type = "user"
def __init__(self):
pass
def inf(ur, _type):
......