Merge remote-tracking branch 'origin/master'

bf671f19 · Francesco Poldi · cd5c92b6 · 59b3bdd0 · bf671f19 · bf671f19
Commit bf671f19 authored Jul 28, 2018 by Francesco Poldi
7 changed files
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ A few simple examples to help you understand the basics:
 - `python3 twint.py -u username --following` - Scrape who a Twitter user follows.
 - `python3 twint.py -u username --favorites` - Collect all the Tweets a user has favorited.
 - `python3 twint.py -u username --following --user-full` - Collect full user information for every account a person follows.
- `python3 twint.py -u username --profile-full` - Use a slow, but effective method to gather all the Tweets from a user's profile (Including Retweets).
+- `python3 twint.py -u username --profile-full` - Use a slow but effective method to gather Tweets from a user's profile (gathers ~3200 Tweets, including Retweets).
 - `python3 twint.py -u username --retweets` - Use a quick method to gather the last 900 Tweets (that includes retweets) from a user's profile.
 - `python3 twint.py -u username --resume 10940389583058` - Resume a search starting from the specified Tweet ID.

--- a/Twint.py
+++ b/Twint.py
@@ -122,6 +122,7 @@ def initialize(args):
    c.Videos = args.videos
    c.Media = args.media
    c.Replies = args.replies
+    c.Pandas_clean = args.pandas_clean
    return c
 def options():
@@ -184,6 +185,7 @@ def options():
    ap.add_argument("--images", help="Display only Tweets with images.", action="store_true")
    ap.add_argument("--media", help="Display Tweets with only images or videos.", action="store_true")
    ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
+    ap.add_argument("-pc","--pandas-clean", help="Automatically clean Pandas dataframe at every scrape.")
    args = ap.parse_args()
    return args
@@ -210,6 +212,9 @@ def main():
    if not args.essid:
        args.essid = ""
+    if args.pandas_clean:
+        twint.storage.panda.clean()
    c = initialize(args)
    if args.favorites:

--- a/twint/config.py
+++ b/twint/config.py
@@ -47,3 +47,4 @@ class Config:
    Videos = False
    Media = False
    Replies = False
+    Pandas_clean = True
--- a/twint/get.py
+++ b/twint/get.py
@@ -49,7 +49,7 @@ async def Response(session, url):
 async def Username(_id):
    url = f"https://twitter.com/intent/user?user_id={_id}&lang=en"
-    r = Request(url)
+    r = await Request(url)
    soup = BeautifulSoup(r, "html.parser")
    return soup.find("a", "fn url alternate-context")["href"].replace("/", "")

--- a/twint/run.py
+++ b/twint/run.py
-from . import datelock, feed, get, output, verbose
+from . import datelock, feed, get, output, verbose, storage
 from asyncio import get_event_loop
 from datetime import timedelta
 from .storage import db
@@ -16,6 +16,9 @@ class Twint:
        self.d = datelock.Set(self.config.Until, self.config.Since)
        verbose.Elastic(config.Elasticsearch)
+        if self.config.Pandas_clean:
+            storage.panda.clean()
        if not self.config.Timedelta:
            if (self.d._until - self.d._since).days > 30:
                self.config.Timedelta = 30
@@ -78,7 +81,7 @@ class Twint:
    async def main(self):
        if self.config.User_id is not None:
            self.config.Username = await get.Username(self.config.User_id)
        if self.config.TwitterSearch and self.config.Since and self.config.Until:
            _days = timedelta(days=int(self.config.Timedelta))
            while self.d._since < self.d._until:
@@ -106,10 +109,10 @@ class Twint:
                        await self.tweets()
                else:
                    break
                if get.Limit(self.config.Limit, self.count):
                    break
        if self.config.Count:
            verbose.Count(self.count, self.config)

--- a/twint/storage/panda.py
+++ b/twint/storage/panda.py
@@ -6,7 +6,6 @@ import warnings
 _blocks = []
 def update(Tweet, session):
-    day = weekday(strftime("%A", localtime(Tweet.datetime)))
    dt = f"{Tweet.datestamp} {Tweet.timestamp}"
    _data = {
@@ -18,8 +17,6 @@ def update(Tweet, session):
                "hashtags": Tweet.hashtags,
                "user_id": Tweet.user_id,
                "username": Tweet.username,
-                "day": day,
-                "hour": hour(Tweet.datetime),
                "link": Tweet.link,
                "retweet": Tweet.retweet,
                "user_rt": Tweet.user_rt,
@@ -32,6 +29,9 @@ def get():
    df = pd.DataFrame(_blocks)
    return df
+def clean():
+    _blocks.clear()
 def save(_filename, _dataframe, **options):
    if options.get("dataname"):
        _dataname = options.get("dataname")
@@ -53,7 +53,9 @@ def save(_filename, _dataframe, **options):
 def read(_filename, **options):
    if not options.get("dataname"):
-        _dataname = "Twint"
+        _dataname = "twint"
+    else:
+        _dataname = options.get("dataname")
    if not options.get("type"):
        _store = pd.HDFStore(_filename)

--- a/twint/url.py
+++ b/twint/url.py
@@ -66,10 +66,10 @@ async def Search(config, init):
    if config.Until:
        url += f"%20until%3A{config.Until}"
    if config.Fruit:
-        url += "%20myspace.com%20OR%20last.fm%20OR"
+        url += "%20%22myspace.com%22%20OR%20%22last.fm%22%20OR"
-        url += "%20mail%20OR%20email%20OR%20gmail%20OR%20e-mail"
+        url += "%20%22mail%22%20OR%20%22email%22%20OR%20%22gmail%22%20OR%20%22e-mail%22"
-        url += "%20OR%20phone%20OR%20call%20me%20OR%20text%20me"
+        url += "%20OR%20%22phone%22%20OR%20%22call%20me%22%20OR%20%22text%20me%22"
-        url += "%20OR%20keybase"
+        url += "%20OR%20%22keybase%22"
    if config.Verified:
        url += "%20filter%3Averified"
    if config.To: