Commit 2cc218cd authored by andytnt, committed by Francesco Poldi

Fix missing commits (#258)

parent 685078ff
...@@ -17,9 +17,10 @@ PUT twintuser ...@@ -17,9 +17,10 @@ PUT twintuser
"followers": {"type": "integer"}, "followers": {"type": "integer"},
"likes": {"type": "integer"}, "likes": {"type": "integer"},
"media": {"type": "integer"}, "media": {"type": "integer"},
"private": {"type": "boolean"}, "private": {"type": "integer"},
"verified": {"type": "boolean"}, "verified": {"type": "integer"},
"avatar": {"type": "text"}, "avatar": {"type": "text"},
"background_image": {"type": "text"},
"session": {"type": "keyword"} "session": {"type": "keyword"}
} }
} }
...@@ -28,4 +29,4 @@ PUT twintuser ...@@ -28,4 +29,4 @@ PUT twintuser
"settings": { "settings": {
"number_of_shards": 1 "number_of_shards": 1
} }
} }
\ No newline at end of file
...@@ -10,6 +10,7 @@ from aiohttp_socks import SocksConnector, SocksVer ...@@ -10,6 +10,7 @@ from aiohttp_socks import SocksConnector, SocksVer
from . import url from . import url
from .output import Tweets, Users from .output import Tweets, Users
from .user import inf
#import logging #import logging
...@@ -102,6 +103,14 @@ async def Username(_id): ...@@ -102,6 +103,14 @@ async def Username(_id):
return soup.find("a", "fn url alternate-context")["href"].replace("/", "") return soup.find("a", "fn url alternate-context")["href"].replace("/", "")
async def UserId(username):
    #logging.info("[<] " + str(datetime.now()) + ':: get+UserId')
    """Look up the numeric user id for a username.

    Requests the user's profile page, parses it with BeautifulSoup and
    returns ``inf(soup, "id")`` converted to ``int``.
    """
    # Named profile_url so the local does not shadow the imported `url` module.
    profile_url = f"http://twitter.com/{username}?lang=en"
    response = await Request(profile_url)
    soup = BeautifulSoup(response, "html.parser")
    return int(inf(soup, "id"))
async def Tweet(url, config, conn): async def Tweet(url, config, conn):
#loggin.info("[<] " + str(datetime.now()) + ':: Tweet') #loggin.info("[<] " + str(datetime.now()) + ':: Tweet')
try: try:
......
from . import format from . import format, get
from .tweet import Tweet from .tweet import Tweet
from .user import User from .user import User
from datetime import datetime from datetime import datetime
...@@ -78,13 +78,34 @@ def _output(obj, output, config, **extra): ...@@ -78,13 +78,34 @@ def _output(obj, output, config, **extra):
except UnicodeEncodeError: except UnicodeEncodeError:
print("unicode error [x] output._output") print("unicode error [x] output._output")
async def tweetUserData(tweet, config, conn):
    """Fetch the profile of every user referenced by a tweet that is not stored yet.

    Walks ``tweet.mentions``, ``tweet.tags`` and ``tweet.replies`` (each an
    iterable of dicts with ``"id"`` and ``"screen_name"`` keys), collects the
    screen names whose id makes ``db.get_user_id`` return ``-1``, and then
    awaits ``get.User`` once per collected screen name.
    """
    seen_ids = set()
    usernames = []
    for group in (tweet.mentions, tweet.tags, tweet.replies):
        for user in group:
            user_id = user["id"]
            # Test the in-memory set first so an id already collected does not
            # trigger another database query.
            # NOTE(review): assumes db.get_user_id is a read-only lookup and
            # that -1 is its "not found" result - confirm against db module.
            if user_id in seen_ids or db.get_user_id(conn, user_id) != -1:
                continue
            seen_ids.add(user_id)
            usernames.append(user["screen_name"])

    for username in usernames:
        profile_url = f"http://twitter.com/{username}?lang=en"
        await get.User(profile_url, config, conn)
async def Tweets(tw, location, config, conn): async def Tweets(tw, location, config, conn):
#logging.info("[<] " + str(datetime.now()) + ':: output+Tweets') #logging.info("[<] " + str(datetime.now()) + ':: output+Tweets')
copyright = tw.find("div", "StreamItemContent--withheld") copyright = tw.find("div", "StreamItemContent--withheld")
if copyright is None and is_tweet(tw): if copyright is None and is_tweet(tw):
tweet = Tweet(tw, location, config) tweet = Tweet(tw, location, config)
if config.Database is not None and config.User_info: if config.Database is not None and config.User_info:
await tweetUserData(tweet, config, conn) await tweetUserData(tweet, config, conn)
if datecheck(tweet.datestamp, config): if datecheck(tweet.datestamp, config):
output = format.Tweet(config, tweet) output = format.Tweet(config, tweet)
...@@ -110,7 +131,7 @@ async def Users(u, config, conn): ...@@ -110,7 +131,7 @@ async def Users(u, config, conn):
output = format.User(config.Format, user) output = format.User(config.Format, user)
if config.Database: if config.Database:
db.user(conn, config.Username, config.Followers, user) db.user(conn, config, user)
if config.Elasticsearch: if config.Elasticsearch:
_save_date = user.join_date _save_date = user.join_date
......
...@@ -101,6 +101,9 @@ class Twint: ...@@ -101,6 +101,9 @@ class Twint:
if self.config.User_id is not None: if self.config.User_id is not None:
self.config.Username = await get.Username(self.config.User_id) self.config.Username = await get.Username(self.config.User_id)
if self.config.Username is not None:
self.config.User_id = await get.UserId(self.config.Username)
if self.config.TwitterSearch and self.config.Since and self.config.Until: if self.config.TwitterSearch and self.config.Since and self.config.Until:
_days = timedelta(days=int(self.config.Timedelta)) _days = timedelta(days=int(self.config.Timedelta))
while self.d._since < self.d._until: while self.d._since < self.d._until:
......
This diff is collapsed.
This diff is collapsed.
from time import strftime, localtime from time import strftime, localtime
import re import json
#from datetime import datetime #from datetime import datetime
#import logging #import logging
...@@ -11,16 +11,84 @@ class tweet: ...@@ -11,16 +11,84 @@ class tweet:
def __init__(self): def __init__(self):
pass pass
def getRawURLS(tw, link, config):
    """Derive GIF/video media URLs from the tweet's playable-media nodes.

    Scans every ``div.PlayableMedia-player`` under ``tw`` and inspects the
    whitespace-separated tokens of its ``style`` attribute. For a token that
    starts with ``background``:

    * if it references ``tweet_video_thumb``, the thumbnail URL is kept as
      ``gif_thumb`` and rewritten (``.jpg`` -> ``.mp4``, ``https://pbs`` ->
      ``https://video``, ``_thumb`` removed) into ``gif_url``;
    * otherwise ``video_url``/``video_thumb`` are set to the placeholder
      strings ``"video"`` and ``"video_thumb"``.

    ``link`` and ``config`` are accepted but not used here.

    Returns the tuple ``(gif_url, gif_thumb, video_url, video_thumb)``;
    fields with no matching media stay ``""``.
    """
    css_prefix = "background-image:url('"
    gif_url = gif_thumb = video_url = video_thumb = ""

    for node in tw.find_all("div", "PlayableMedia-player"):
        # A player node without a style attribute is skipped instead of
        # raising KeyError.
        for style in node.attrs.get("style", "").split():
            if not style.startswith("background"):
                continue
            image = style.replace(css_prefix, "")
            if "tweet_video_thumb" in image:
                gif_thumb = image.replace("')", "")
                gif_url = gif_thumb.replace(".jpg", ".mp4")
                gif_url = gif_url.replace("https://pbs", "https://video")
                gif_url = gif_url.replace("_thumb", "")
            else:
                video_url, video_thumb = "video", "video_thumb"

    return gif_url, gif_thumb, video_url, video_thumb
def getMentions(tw): def getMentions(tw):
#logging.info("[<] " + str(datetime.now()) + ':: tweet+getMentions') #logging.info("[<] " + str(datetime.now()) + ':: tweet+getMentions')
"""Extract ment from tweet """Extract ment from tweet
""" """
mentions = [{"id":int(mention["data-mentioned-user-id"]),"id_str": mention["data-mentioned-user-id"],"screen_name":mention.get('href').split("/")[-1]} for mention in tw.find_all('a',{'class':'twitter-atreply'})]
return mentions
def getReplies(tw):
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getReplies')
    """Extract replies from tweet

    Reads the JSON stored in the ``data-reply-to-users-json`` attribute of
    the first ``div`` under ``tw`` and returns one dict per entry with the
    keys ``id`` (int), ``id_str`` and ``screen_name``.

    Returns an empty list when the div or the attribute is absent, in line
    with ``getTags``, which also falls back to ``[]``.
    """
    root = tw.find("div")
    raw_json = root.get("data-reply-to-users-json") if root is not None else None
    if not raw_json:
        return []
    return [
        {
            "id": int(reply["id_str"]),
            "id_str": reply["id_str"],
            "screen_name": reply["screen_name"],
        }
        for reply in json.loads(raw_json)
    ]
def getTags(tw):
#logging.info("[<] " + str(datetime.now()) + ':: tweet+getTags')
"""Extract tags from tweet
"""
tags = []
try: try:
mentions = tw.find("div", "js-original-tweet")["data-mentions"].split(" ") tag_links = tw.find("div","media-tagging-block").find_all("a","js-user-profile-link")
for tag in tag_links:
if tag.has_attr("data-user-id"):
tmpData = {
"id":int(tag["data-user-id"]),
"id_str": tag["data-user-id"],
"screen_name":tag.get('href').split("/")[-1]
}
tags.append(tmpData)
except: except:
mentions = "" tags = []
return mentions return tags
def getQuoteInfo(tw):
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getQuoteInfo')
    """Extract quote from tweet

    Looks for ``div.QuoteTweet-innerContainer`` under ``tw`` and returns
    ``(quote_status, quote_id, quote_id_str, quote_url)``:

    * ``(1, int(id), id, "https://twitter.com" + href)`` when the container
      exists and carries both ``data-item-id`` and ``href``;
    * ``(0, 0, "", "")`` otherwise.

    The four fields are produced together, so a status of 0 is never paired
    with a half-filled quote id.
    """
    base_twitter = "https://twitter.com"
    no_quote = (0, 0, "", "")

    quote = tw.find("div", "QuoteTweet-innerContainer")
    if quote is None:
        return no_quote

    try:
        quote_id_str = quote["data-item-id"]
        quote_id = int(quote_id_str)
        # quote.get("href") is None when the attribute is missing; the
        # concatenation then raises TypeError, which is handled below.
        quote_url = base_twitter + quote.get("href")
    except (KeyError, TypeError, ValueError):
        return no_quote

    return 1, quote_id, quote_id_str, quote_url
def getText(tw): def getText(tw):
#logging.info("[<] " + str(datetime.now()) + ':: tweet+getText') #logging.info("[<] " + str(datetime.now()) + ':: tweet+getText')
...@@ -33,25 +101,6 @@ def getText(tw): ...@@ -33,25 +101,6 @@ def getText(tw):
return text return text
def getTweet(tw, mentions):
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getTweet')
    """Return the tweet text with every missing ``@mention`` prepended.

    Starts from ``getText(tw)``; for each entry of ``mentions`` whose
    ``@entry`` form does not already occur in the text, that mention is put
    in front of the text. If processing the mentions fails, the plain
    ``getText(tw)`` result is returned instead.
    """
    text = getText(tw)
    try:
        for name in mentions:
            mention = f"@{name}"
            if mention not in text:
                text = f"{mention} {text}"
    except Exception:
        # Best effort: drop any partially prepended mentions and fall back
        # to the unmodified text.
        text = getText(tw)
    return text
def getHashtags(text):
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getHashtags')
    """Get hashtags of tweet

    Returns every ``#`` followed by one or more word characters found in
    ``text``, in order of appearance and including the leading ``#``.
    """
    # For str patterns \w already matches Unicode word characters of either
    # case, so no case-insensitive or UNICODE flag is needed, and '#' needs
    # no escaping outside verbose mode.
    return re.findall(r"#\w+", text)
def getStat(tw, _type): def getStat(tw, _type):
"""Get stats about Tweet """Get stats about Tweet
""" """
...@@ -61,42 +110,49 @@ def getStat(tw, _type): ...@@ -61,42 +110,49 @@ def getStat(tw, _type):
def getRetweet(profile, username, user): def getRetweet(profile, username, user):
#logging.info("[<] " + str(datetime.now()) + ':: tweet+getRetweet') #logging.info("[<] " + str(datetime.now()) + ':: tweet+getRetweet')
if profile and username.lower() != user: if profile and username.lower() != user.lower():
return True return 1
def getUser_rt(profile, username, user):
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getUser_rt')
    """Get username that retweeted

    Returns ``user`` when ``getRetweet(profile, username, user)`` is truthy,
    otherwise the string ``"None"``.
    """
    return user if getRetweet(profile, username, user) else "None"
def Tweet(tw, location, config): def Tweet(tw, location, config):
"""Create Tweet object """Create Tweet object
""" """
##logging.info("[<] " + str(datetime.now()) + ':: tweet+Tweet') ##logging.info("[<] " + str(datetime.now()) + ':: tweet+Tweet')
t = tweet() t = tweet()
t.id = tw.find("div")["data-item-id"] t.id = int(tw.find("div")["data-item-id"])
t.id_str = tw.find("div")["data-item-id"]
t.conversation_id = tw.find("div")["data-conversation-id"]
t.datetime = int(tw.find("span", "_timestamp")["data-time"]) t.datetime = int(tw.find("span", "_timestamp")["data-time"])
t.datestamp = strftime("%Y-%m-%d", localtime(t.datetime)) t.datestamp = strftime("%Y-%m-%d", localtime(t.datetime))
t.timestamp = strftime("%H:%M:%S", localtime(t.datetime)) t.timestamp = strftime("%H:%M:%S", localtime(t.datetime))
t.user_id = tw.find("a", "account-group js-account-group js-action-profile js-user-profile-link js-nav")["data-user-id"] t.user_id = int(tw.find("div")["data-user-id"])
t.username = tw.find("span", "username").text.replace("@", "") t.user_id_str = tw.find("div")["data-user-id"]
t.username = tw.find("div")["data-screen-name"]
t.name = tw.find("div")["data-name"]
t.profile_image_url = tw.find("img", "js-action-profile-avatar").get('src').replace("_bigger","")
t.place = tw.find("a","js-geo-pivot-link").text.strip() if tw.find("a","js-geo-pivot-link") else None
t.timezone = strftime("%Z", localtime()) t.timezone = strftime("%Z", localtime())
for img in tw.findAll("img", "Emoji Emoji--forText"): for img in tw.findAll("img", "Emoji Emoji--forText"):
img.replaceWith(img["alt"]) img.replaceWith(img["alt"])
t.mentions = getMentions(tw) t.mentions = getMentions(tw)
t.tweet = getTweet(tw, t.mentions) t.tags = getTags(tw)
t.replies = getReplies(tw)
t.urls = [link.attrs["data-expanded-url"] for link in tw.find_all('a',{'class':'twitter-timeline-link'}) if link.has_attr("data-expanded-url")]
t.photos = [photo_node.attrs['data-image-url'] for photo_node in tw.find_all("div", "AdaptiveMedia-photoContainer")]
t.tweet = getText(tw)
t.location = location t.location = location
t.hashtags = getHashtags(t.tweet) t.hashtags = [hashtag.text for hashtag in tw.find_all("a","twitter-hashtag")]
t.replies = getStat(tw, "reply") t.replies_count = getStat(tw, "reply")
t.retweets = getStat(tw, "retweet") t.retweets_count = getStat(tw, "retweet")
t.likes = getStat(tw, "favorite") t.likes_count = getStat(tw, "favorite")
t.link = f"https://twitter.com/{t.username}/status/{t.id}" t.link = f"https://twitter.com/{t.username}/status/{t.id}"
t.retweet = getRetweet(config.Profile, t.username, config.Username) t.retweet = getRetweet(config.Profile, t.username, config.Username)
t.user_rt = getUser_rt(config.Profile, t.username, config.Username) t.gif_url, t.gif_thumb, t.video_url, t.video_thumb = getRawURLS(tw, t.link, config)
return t t.is_quote_status, t.quote_id, t.quote_id_str, t.quote_url = getQuoteInfo(tw)
t.is_reply_to = int(bool(tw.find("div")["data-is-reply-to"])) if tw.find("div").has_attr("data-is-reply-to") else 0
t.has_parent_tweet = int(bool(tw.find("div")["data-has-parent-tweet"])) if tw.find("div").has_attr("data-has-parent-tweet") else 0
t.in_reply_to_screen_name = ""
t.in_reply_to_status_id = 0
t.in_reply_to_status_id_str = ""
t.in_reply_to_user_id = 0
t.in_reply_to_user_id_str = ""
return t
\ No newline at end of file
...@@ -20,6 +20,10 @@ def inf(ur, _type): ...@@ -20,6 +20,10 @@ def inf(ur, _type):
ret = group["data-screen-name"] ret = group["data-screen-name"]
elif _type == "private": elif _type == "private":
ret = group["data-protected"] ret = group["data-protected"]
if ret == 'true':
ret = 1
else:
ret = 0
return ret return ret
...@@ -28,18 +32,18 @@ def card(ur, _type): ...@@ -28,18 +32,18 @@ def card(ur, _type):
try: try:
ret = ur.find("p", "ProfileHeaderCard-bio u-dir").text.replace("\n", " ") ret = ur.find("p", "ProfileHeaderCard-bio u-dir").text.replace("\n", " ")
except: except:
ret = "None" ret = None
elif _type == "location": elif _type == "location":
try: try:
ret = ur.find("span", "ProfileHeaderCard-locationText u-dir").text ret = ur.find("span", "ProfileHeaderCard-locationText u-dir").text
ret = ret[15:].replace("\n", " ")[:-10] ret = ret[15:].replace("\n", " ")[:-10]
except: except:
ret = "None" ret = None
elif _type == "url": elif _type == "url":
try: try:
ret = ur.find("span", "ProfileHeaderCard-urlText u-dir").find("a")["title"] ret = ur.find("span", "ProfileHeaderCard-urlText u-dir").find("a")["title"]
except: except:
ret = "None" ret = None
return ret return ret
...@@ -54,11 +58,13 @@ def convertToInt(x): ...@@ -54,11 +58,13 @@ def convertToInt(x):
"b" : 1000000000, "b" : 1000000000,
} }
try : try :
if ',' in x:
x = x.replace(',', '')
y = int(x) y = int(x)
return y return y
except : except :
pass pass
try : try :
y = float(str(x)[:-1]) y = float(str(x)[:-1])
y = y * multDict[str(x)[-1:].lower()] y = y * multDict[str(x)[-1:].lower()]
...@@ -79,11 +85,10 @@ def stat(ur, _type): ...@@ -79,11 +85,10 @@ def stat(ur, _type):
def media(ur): def media(ur):
try: try:
media_count = ur.find("a", "PhotoRail-headingWithCount js-nav").text media_count = ur.find("a", "PhotoRail-headingWithCount js-nav").text.strip().split(" ")[0]
media_count = media_count.replace("\n", "")[32:].split(" ")[0] media_count = convertToInt(media_count)
media_count = convertToInt(media_count)
except: except:
media_count = "0" media_count = 0
return media_count return media_count
...@@ -91,11 +96,11 @@ def verified(ur): ...@@ -91,11 +96,11 @@ def verified(ur):
try: try:
is_verified = ur.find("span", "ProfileHeaderCard-badges").text is_verified = ur.find("span", "ProfileHeaderCard-badges").text
if "Verified account" in is_verified: if "Verified account" in is_verified:
is_verified = "true" is_verified = 1
else: else:
is_verified = "false" is_verified = 0
except: except:
is_verified = "false" is_verified = 0
return is_verified return is_verified
...@@ -119,4 +124,5 @@ def User(ur): ...@@ -119,4 +124,5 @@ def User(ur):
u.is_private = inf(ur, "private") u.is_private = inf(ur, "private")
u.is_verified = verified(ur) u.is_verified = verified(ur)
u.avatar = ur.find("img", "ProfileAvatar-image")["src"] u.avatar = ur.find("img", "ProfileAvatar-image")["src"]
return u u.background_image = ur.find('div',{'class':'ProfileCanopy-headerBg'}).find('img').get('src')
return u
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment