Fix missing commits (#258)

2cc218cd · andytnt · Francesco Poldi · 685078ff · 2cc218cd · 2cc218cd
Commit 2cc218cd, authored Oct 25, 2018 by andytnt; committed by Francesco Poldi on Oct 25, 2018
8 changed files
--- a/elasticsearch/index-user.json
+++ b/elasticsearch/index-user.json
@@ -17,9 +17,10 @@ PUT twintuser
        "followers": {"type": "integer"},
        "likes": {"type": "integer"},
        "media": {"type": "integer"},
-        "private": {"type": "boolean"},
+        "private": {"type": "integer"},
-        "verified": {"type": "boolean"},
+        "verified": {"type": "integer"},
        "avatar": {"type": "text"},
+        "background_image": {"type": "text"},
        "session": {"type": "keyword"}
      }
    }
@@ -28,4 +29,4 @@ PUT twintuser
  "settings": {
    "number_of_shards": 1
  }
 }
\ No newline at end of file
--- a/twint/get.py
+++ b/twint/get.py
@@ -10,6 +10,7 @@ from aiohttp_socks import SocksConnector, SocksVer
 from . import url
 from .output import Tweets, Users
+from .user import inf
 #import logging
@@ -102,6 +103,14 @@ async def Username(_id):
    return soup.find("a", "fn url alternate-context")["href"].replace("/", "")
+async def UserId(username):
+    """Resolve a Twitter screen name to its numeric user id.
+
+    Downloads the profile page for *username* through ``Request``, parses it
+    with BeautifulSoup and converts the ``"id"`` value extracted by ``inf``
+    to an ``int``.
+    """
+    #logging.info("[<] " + str(datetime.now()) + ':: get+UserId')
+    # Named profile_url so the imported ``url`` module is not shadowed here.
+    profile_url = f"http://twitter.com/{username}?lang=en"
+    page = BeautifulSoup(await Request(profile_url), "html.parser")
+    user_id = inf(page, "id")
+    return int(user_id)
 async def Tweet(url, config, conn):
    #loggin.info("[<] " + str(datetime.now()) + ':: Tweet')
    try:

--- a/twint/output.py
+++ b/twint/output.py
-from . import format
+from . import format, get
 from .tweet import Tweet
 from .user import User
 from datetime import datetime
@@ -78,13 +78,34 @@ def _output(obj, output, config, **extra):
            except UnicodeEncodeError:
                print("unicode error [x] output._output")
+async def tweetUserData(tweet, config, conn):
+    """Fetch the profiles of users referenced by *tweet* that are not stored yet.
+
+    Walks ``tweet.mentions``, ``tweet.tags`` and ``tweet.replies``; every
+    entry is read through its ``"id"`` and ``"screen_name"`` keys. A user is
+    queued when ``db.get_user_id(conn, id)`` returns ``-1``
+    (NOTE(review): presumably the "not in database" sentinel - confirm in
+    storage/db.py). ``get.User`` is then awaited once per queued user, in
+    first-seen order.
+    """
+    seen_ids = set()
+    usernames = []
+    for group in (tweet.mentions, tweet.tags, tweet.replies):
+        for user in group:
+            user_id = user["id"]
+            # Check the in-memory set first so a user referenced several
+            # times in one tweet costs a single database lookup.
+            if user_id in seen_ids:
+                continue
+            seen_ids.add(user_id)
+            if db.get_user_id(conn, user_id) == -1:
+                usernames.append(user["screen_name"])
+    for screen_name in usernames:
+        url = f"http://twitter.com/{screen_name}?lang=en"
+        await get.User(url, config, conn)
 async def Tweets(tw, location, config, conn):
    #logging.info("[<] " + str(datetime.now()) + ':: output+Tweets')
    copyright = tw.find("div", "StreamItemContent--withheld")
    if copyright is None and is_tweet(tw):
        tweet = Tweet(tw, location, config)
        if config.Database is not None and config.User_info:
            await tweetUserData(tweet, config, conn)
        if datecheck(tweet.datestamp, config):
            output = format.Tweet(config, tweet)
@@ -110,7 +131,7 @@ async def Users(u, config, conn):
    output = format.User(config.Format, user)
    if config.Database:
-        db.user(conn, config.Username, config.Followers, user)
+        db.user(conn, config, user)
    if config.Elasticsearch:
        _save_date = user.join_date

--- a/twint/run.py
+++ b/twint/run.py
@@ -101,6 +101,9 @@ class Twint:
        if self.config.User_id is not None:
            self.config.Username = await get.Username(self.config.User_id)
+        if self.config.Username is not None:
+            self.config.User_id = await get.UserId(self.config.Username)
        if self.config.TwitterSearch and self.config.Since and self.config.Until:
            _days = timedelta(days=int(self.config.Timedelta))
            while self.d._since < self.d._until:

--- a/twint/storage/db.py
+++ b/twint/storage/db.py
--- a/twint/storage/elasticsearch.py
+++ b/twint/storage/elasticsearch.py
--- a/twint/tweet.py
+++ b/twint/tweet.py
 from time import strftime, localtime
-import re
+import json
 #from datetime import datetime
 #import logging
@@ -11,16 +11,84 @@ class tweet:
    def __init__(self):
        pass
+def getRawURLS(tw, link, config):
+    """Derive GIF / video media fields from a tweet's playable-media nodes.
+
+    Scans the inline ``style`` of every ``div.PlayableMedia-player`` under
+    *tw* for a token starting with ``background``. When the background URL
+    contains ``tweet_video_thumb`` it is kept as the GIF thumbnail and the
+    ``.mp4`` URL is rebuilt from it. Any other background only flags the
+    tweet as carrying a video by returning the placeholder strings
+    ``"video"`` and ``"video_thumb"``; no real video URL is resolved here.
+
+    *link* and *config* are accepted but not used.
+
+    Returns the tuple ``(gif_url, gif_thumb, video_url, video_thumb)``;
+    fields that do not apply are empty strings.
+    """
+    gif_url, gif_thumb, video_url, video_thumb = "", "", "", ""
+    prefix = "background-image:url('"
+    for node in tw.find_all("div","PlayableMedia-player"):
+        # A player node without an inline style has no thumbnail to inspect;
+        # skip it instead of raising KeyError.
+        for style in node.attrs.get('style', '').split():
+            if not style.startswith('background'):
+                continue
+            style = style.replace(prefix, "")
+            if "tweet_video_thumb" in style:
+                gif_thumb = style.replace("')", "")
+                # Thumbnail -> media URL: swap extension and host, then drop
+                # the "_thumb" marker from the path.
+                gif_url = gif_thumb.replace('.jpg','.mp4')
+                gif_url = gif_url.replace('https://pbs','https://video')
+                gif_url = gif_url.replace("_thumb", "")
+            else:
+                video_url, video_thumb = "video","video_thumb"
+    return gif_url, gif_thumb, video_url, video_thumb
 def getMentions(tw):
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getMentions')
    """Extract ment from tweet
    """
+    mentions = [{"id":int(mention["data-mentioned-user-id"]),"id_str": mention["data-mentioned-user-id"],"screen_name":mention.get('href').split("/")[-1]} for mention in tw.find_all('a',{'class':'twitter-atreply'})]
+    return mentions
+def getReplies(tw):
+    """Return the users listed in the tweet's reply-to metadata.
+
+    Decodes the JSON array held in the ``data-reply-to-users-json`` attribute
+    of the first ``div`` under *tw* and returns one dict per entry with the
+    keys ``id`` (int), ``id_str`` and ``screen_name``.
+    """
+    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getReplies')
+    raw_users = tw.find("div")["data-reply-to-users-json"]
+    replies = []
+    for entry in json.loads(raw_users):
+        user_id = entry["id_str"]
+        replies.append({
+            "id": int(user_id),
+            "id_str": user_id,
+            "screen_name": entry["screen_name"],
+        })
+    return replies
+def getTags(tw):
+    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getTags')
+    """Extract tags from tweet
+    """
+    tags = []
    try:
-        mentions = tw.find("div", "js-original-tweet")["data-mentions"].split(" ")
+        tag_links = tw.find("div","media-tagging-block").find_all("a","js-user-profile-link")
+        for tag in tag_links:
+            if tag.has_attr("data-user-id"):
+                tmpData = {
+                    "id":int(tag["data-user-id"]),
+                    "id_str": tag["data-user-id"],
+                    "screen_name":tag.get('href').split("/")[-1]
+                }
+                tags.append(tmpData)
    except:
-        mentions = ""
+        tags = []
-    return mentions
+    return tags
+def getQuoteInfo(tw):
+    """Extract quoted-tweet information from a tweet.
+
+    Looks up the ``div.QuoteTweet-innerContainer`` element under *tw* and
+    reads its ``data-item-id`` and ``href`` attributes.
+
+    Returns the tuple ``(quote_status, quote_id, quote_id_str, quote_url)``.
+    ``quote_status`` is 1 only when every field could be read; in all other
+    cases the whole tuple is ``(0, 0, "", "")``.
+    """
+    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getQuoteInfo')
+    base_twitter = "https://twitter.com"
+    no_quote = (0, 0, "", "")
+    quote = tw.find("div","QuoteTweet-innerContainer")
+    if quote is None:
+        return no_quote
+    try:
+        quote_id_str = quote["data-item-id"]
+        quote_id = int(quote_id_str)
+        quote_url = base_twitter + quote.get("href")
+    except (KeyError, TypeError, ValueError):
+        # Missing attribute, missing href (str + None) or non-numeric id:
+        # report "no quote" as a whole rather than a half-filled tuple.
+        return no_quote
+    return 1, quote_id, quote_id_str, quote_url
 def getText(tw):
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getText')
@@ -33,25 +101,6 @@ def getText(tw):
    return text
-def getTweet(tw, mentions):
-    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getTweet')
-    try:
-        text = getText(tw)
-        for i in range(len(mentions)):
-            mention = f"@{mentions[i]}"
-            if mention not in text:
-                text = f"{mention} {text}"
-    except:
-        text = getText(tw)
-    return text
-def getHashtags(text):
-    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getHashtags')
-    """Get hashtags of tweet
-    """
-    return re.findall(r'(?i)\#\w+', text, flags=re.UNICODE)
 def getStat(tw, _type):
    """Get stats about Tweet
    """
@@ -61,42 +110,49 @@ def getStat(tw, _type):
 def getRetweet(profile, username, user):
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getRetweet')
-    if profile and username.lower() != user:
+    if profile and username.lower() != user.lower():
-        return True
+        return 1
-def getUser_rt(profile, username, user):
-    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getUser_rt')
-    """Get username that retweeted
-    """
-    if getRetweet(profile, username, user):
-        user_rt = user
-    else:
-        user_rt = "None"
-    return user_rt
 def Tweet(tw, location, config):
    """Create Tweet object
    """
    ##logging.info("[<] " + str(datetime.now()) + ':: tweet+Tweet')
    t = tweet()
-    t.id = tw.find("div")["data-item-id"]
+    # Identifiers are stored twice: as int and as the original attribute string.
+    t.id = int(tw.find("div")["data-item-id"])
+    t.id_str = tw.find("div")["data-item-id"]
+    t.conversation_id = tw.find("div")["data-conversation-id"]
    t.datetime = int(tw.find("span", "_timestamp")["data-time"])
    t.datestamp = strftime("%Y-%m-%d", localtime(t.datetime))
    t.timestamp = strftime("%H:%M:%S", localtime(t.datetime))
-    t.user_id = tw.find("a", "account-group js-account-group js-action-profile js-user-profile-link js-nav")["data-user-id"]
+    t.user_id = int(tw.find("div")["data-user-id"])
-    t.username = tw.find("span", "username").text.replace("@", "")
+    t.user_id_str = tw.find("div")["data-user-id"]
+    t.username = tw.find("div")["data-screen-name"]
+    t.name = tw.find("div")["data-name"]
+    t.profile_image_url = tw.find("img", "js-action-profile-avatar").get('src').replace("_bigger","")
+    t.place = tw.find("a","js-geo-pivot-link").text.strip() if tw.find("a","js-geo-pivot-link") else None
    t.timezone = strftime("%Z", localtime())
    for img in tw.findAll("img", "Emoji Emoji--forText"):
        img.replaceWith(img["alt"])
    t.mentions = getMentions(tw)
-    t.tweet = getTweet(tw, t.mentions)
+    t.tags = getTags(tw)
+    t.replies = getReplies(tw)
+    t.urls = [link.attrs["data-expanded-url"] for link in tw.find_all('a',{'class':'twitter-timeline-link'}) if link.has_attr("data-expanded-url")]
+    t.photos = [photo_node.attrs['data-image-url'] for photo_node in tw.find_all("div", "AdaptiveMedia-photoContainer")]
+    t.tweet = getText(tw)
    t.location = location
-    t.hashtags = getHashtags(t.tweet)
+    t.hashtags = [hashtag.text for hashtag in tw.find_all("a","twitter-hashtag")]
-    t.replies = getStat(tw, "reply")
+    t.replies_count = getStat(tw, "reply")
-    t.retweets = getStat(tw, "retweet")
+    t.retweets_count = getStat(tw, "retweet")
-    t.likes = getStat(tw, "favorite")
+    t.likes_count = getStat(tw, "favorite")
    t.link = f"https://twitter.com/{t.username}/status/{t.id}"
    t.retweet = getRetweet(config.Profile, t.username, config.Username)
-    t.user_rt = getUser_rt(config.Profile, t.username, config.Username)
+    t.gif_url, t.gif_thumb, t.video_url, t.video_thumb = getRawURLS(tw, t.link, config)
-    return t
+    t.is_quote_status, t.quote_id, t.quote_id_str, t.quote_url = getQuoteInfo(tw)
+    # Attribute values are strings and bool("false") is True, so compare the
+    # text instead of relying on truthiness; a missing attribute counts as 0.
+    falsy_values = ("", "false", "0")
+    t.is_reply_to = int(tw.find("div").get("data-is-reply-to", "").strip().lower() not in falsy_values)
+    t.has_parent_tweet = int(tw.find("div").get("data-has-parent-tweet", "").strip().lower() not in falsy_values)
+    # Reply-target fields are only initialised to empty defaults here.
+    t.in_reply_to_screen_name = ""
+    t.in_reply_to_status_id = 0
+    t.in_reply_to_status_id_str = ""
+    t.in_reply_to_user_id = 0
+    t.in_reply_to_user_id_str = ""
+    return t
\ No newline at end of file
--- a/twint/user.py
+++ b/twint/user.py
@@ -20,6 +20,10 @@ def inf(ur, _type):
        ret = group["data-screen-name"]
    elif _type == "private":
        ret = group["data-protected"]
+        if ret == 'true':
+            ret = 1
+        else:
+            ret = 0
    return ret
@@ -28,18 +32,18 @@ def card(ur, _type):
        try:
            ret = ur.find("p", "ProfileHeaderCard-bio u-dir").text.replace("\n", " ")
        except:
-            ret = "None"
+            ret = None
    elif _type == "location":
        try:
            ret = ur.find("span", "ProfileHeaderCard-locationText u-dir").text
            ret = ret[15:].replace("\n", " ")[:-10]
        except:
-            ret = "None"
+            ret = None
    elif _type == "url":
        try:
            ret = ur.find("span", "ProfileHeaderCard-urlText u-dir").find("a")["title"]
        except:
-            ret = "None"
+            ret = None
    return ret
@@ -54,11 +58,13 @@ def convertToInt(x):
        "b" : 1000000000,
    }
    try :
+        if ',' in x:
+            x = x.replace(',', '')
        y = int(x)
        return y
    except :
        pass
    try :
        y = float(str(x)[:-1])
        y = y * multDict[str(x)[-1:].lower()]
@@ -79,11 +85,10 @@ def stat(ur, _type):
 def media(ur):
    try:
-        media_count = ur.find("a", "PhotoRail-headingWithCount js-nav").text
+      media_count = ur.find("a", "PhotoRail-headingWithCount js-nav").text.strip().split(" ")[0]
-        media_count = media_count.replace("\n", "")[32:].split(" ")[0]
+      media_count = convertToInt(media_count)
-        media_count = convertToInt(media_count)
    except:
-        media_count = "0"
+      media_count = 0
    return media_count
@@ -91,11 +96,11 @@ def verified(ur):
    try:
        is_verified = ur.find("span", "ProfileHeaderCard-badges").text
        if "Verified account" in is_verified:
-            is_verified = "true"
+            is_verified = 1
        else:
-            is_verified = "false"
+            is_verified = 0
    except:
-        is_verified = "false"
+        is_verified = 0
    return is_verified
@@ -119,4 +124,5 @@ def User(ur):
    u.is_private = inf(ur, "private")
    u.is_verified = verified(ur)
    u.avatar = ur.find("img", "ProfileAvatar-image")["src"]
-    return u
+    u.background_image = ur.find('div',{'class':'ProfileCanopy-headerBg'}).find('img').get('src')
+    return u
\ No newline at end of file