Commit 8b7e5a1a authored by andytnt's avatar andytnt Committed by Francesco Poldi

new info (#235)

* Update index-tweets.json

* Update index-user.json

* Update get.py

* Update output.py

* Update run.py

* Update tweet.py

* Update user.py

* Update db.py

* Update elasticsearch.py

* Update _count variable
parent e80ecb83
...@@ -4,8 +4,11 @@ PUT twinttweets ...@@ -4,8 +4,11 @@ PUT twinttweets
"items": { "items": {
"properties": { "properties": {
"id": {"type": "long"}, "id": {"type": "long"},
"conversation_id": {"type": "text"},
"created_at": {"type":"text"},
"date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"}, "date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
"timezone": {"type": "text"}, "timezone": {"type": "text"},
"place": {"type": "text"},
"location": {"type": "text"}, "location": {"type": "text"},
"hashtags": {"type": "text"}, "hashtags": {"type": "text"},
"tweet": {"type": "text"}, "tweet": {"type": "text"},
...@@ -13,16 +16,28 @@ PUT twinttweets ...@@ -13,16 +16,28 @@ PUT twinttweets
"retweets": {"type": "boolean"}, "retweets": {"type": "boolean"},
"likes": {"type": "boolean"}, "likes": {"type": "boolean"},
"user_id": {"type": "keyword"}, "user_id": {"type": "keyword"},
"user_id_str": {"type":"text"},
"username": {"type": "keyword"}, "username": {"type": "keyword"},
"name": {"type":"text"},
"profile_image_url": {"type":"text"},
"day": {"type": "integer"}, "day": {"type": "integer"},
"hour": {"type": "integer"}, "hour": {"type": "integer"},
"link": {"type": "text"}, "link": {"type": "text"},
"gif_url": {"type": "text"},
"gif_thumb": {"type": "text"},
"video_url": {"type": "text"},
"video_thumb": {"type": "text"},
"is_reply_to": {"type": "integer"},
"has_parent_tweet": {"type": "integer"},
"retweet": {"type": "text"}, "retweet": {"type": "text"},
"user_rt": {"type": "text"},
"essid": {"type": "keyword"}, "essid": {"type": "keyword"},
"nlikes": {"type": "integer"}, "nlikes": {"type": "integer"},
"nreplies": {"type": "integer"}, "nreplies": {"type": "integer"},
"nretweets": {"type": "integer"}, "nretweets": {"type": "integer"},
"is_quote_status": {"type": "integer"},
"quote_id": {"type": "long"},
"quote_id_str": {"type":"text"},
"quote_url": {"type":"text"},
"search": {"type": "text"} "search": {"type": "text"}
} }
} }
......
...@@ -17,9 +17,10 @@ PUT twintuser ...@@ -17,9 +17,10 @@ PUT twintuser
"followers": {"type": "integer"}, "followers": {"type": "integer"},
"likes": {"type": "integer"}, "likes": {"type": "integer"},
"media": {"type": "integer"}, "media": {"type": "integer"},
"private": {"type": "boolean"}, "private": {"type": "integer"},
"verified": {"type": "boolean"}, "verified": {"type": "integer"},
"avatar": {"type": "text"}, "avatar": {"type": "text"},
"background_image": {"type": "text"},
"session": {"type": "keyword"} "session": {"type": "keyword"}
} }
} }
......
...@@ -10,6 +10,7 @@ from aiohttp_socks import SocksConnector, SocksVer ...@@ -10,6 +10,7 @@ from aiohttp_socks import SocksConnector, SocksVer
from . import url from . import url
from .output import Tweets, Users from .output import Tweets, Users
from .user import inf
#import logging #import logging
...@@ -102,6 +103,14 @@ async def Username(_id): ...@@ -102,6 +103,14 @@ async def Username(_id):
return soup.find("a", "fn url alternate-context")["href"].replace("/", "") return soup.find("a", "fn url alternate-context")["href"].replace("/", "")
async def UserId(username):
    """Resolve a Twitter username to its numeric user id.

    Downloads the user's profile page through ``Request``, parses it with
    BeautifulSoup and returns ``inf(page, "id")`` converted to ``int``.
    """
    #logging.info("[<] " + str(datetime.now()) + ':: get+UserId')
    # Named profile_url so the local does not shadow the imported `url` module.
    profile_url = f"http://twitter.com/{username}?lang=en"
    page = BeautifulSoup(await Request(profile_url), "html.parser")
    return int(inf(page, "id"))
async def Tweet(url, config, conn): async def Tweet(url, config, conn):
#loggin.info("[<] " + str(datetime.now()) + ':: Tweet') #loggin.info("[<] " + str(datetime.now()) + ':: Tweet')
try: try:
...@@ -146,7 +155,7 @@ async def Multi(feed, config, conn): ...@@ -146,7 +155,7 @@ async def Multi(feed, config, conn):
else: else:
link = tweet.find("a", "tweet-timestamp js-permalink js-nav js-tooltip")["href"] link = tweet.find("a", "tweet-timestamp js-permalink js-nav js-tooltip")["href"]
url = f"https://twitter.com{link}?lang=en" url = f"https://twitter.com{link}?lang=en"
if config.User_full: if config.User_full:
futures.append(loop.run_in_executor(executor, await User(url, futures.append(loop.run_in_executor(executor, await User(url,
config, conn))) config, conn)))
......
from . import format from . import format, get
from .tweet import Tweet from .tweet import Tweet
from .user import User from .user import User
from datetime import datetime from datetime import datetime
...@@ -78,17 +78,37 @@ def _output(obj, output, config, **extra): ...@@ -78,17 +78,37 @@ def _output(obj, output, config, **extra):
except UnicodeEncodeError: except UnicodeEncodeError:
print("unicode error [x] output._output") print("unicode error [x] output._output")
async def tweetUserData(tweet, config, conn):
    """Fetch the profile of every user referenced by ``tweet`` that is not stored yet.

    Users are collected from ``tweet.mentions``, ``tweet.tags`` and
    ``tweet.replies`` (in that order). A user is queued when
    ``db.get_user_id`` returns ``-1`` for its id and the id has not already
    been queued for this tweet. Each queued profile page is then retrieved
    with ``get.User``, one after another.
    """
    queued_ids = set()
    usernames = []
    # The three lists are read through the same "id" / "screen_name" keys,
    # so a single pass replaces the three copy-pasted loops.
    for group in (tweet.mentions, tweet.tags, tweet.replies):
        for user in group:
            if db.get_user_id(conn, user["id"]) == -1 and user["id"] not in queued_ids:
                queued_ids.add(user["id"])
                usernames.append(user["screen_name"])

    for screen_name in usernames:
        url = f"http://twitter.com/{screen_name}?lang=en"
        await get.User(url, config, conn)
async def Tweets(tw, location, config, conn): async def Tweets(tw, location, config, conn):
#logging.info("[<] " + str(datetime.now()) + ':: output+Tweets') #logging.info("[<] " + str(datetime.now()) + ':: output+Tweets')
copyright = tw.find("div", "StreamItemContent--withheld") copyright = tw.find("div", "StreamItemContent--withheld")
if copyright is None and is_tweet(tw): if copyright is None and is_tweet(tw):
tweet = Tweet(tw, location, config) tweet = Tweet(tw, location, config)
await tweetUserData(tweet, config, conn)
if datecheck(tweet.datestamp, config): if datecheck(tweet.datestamp, config):
output = format.Tweet(config, tweet) output = format.Tweet(config, tweet)
if config.Database: if config.Database:
db.tweets(conn, tweet, config) db.tweets(conn, tweet, config)
if config.Pandas: if config.Pandas:
panda.update(tweet, config) panda.update(tweet, config)
...@@ -108,7 +128,8 @@ async def Users(u, config, conn): ...@@ -108,7 +128,8 @@ async def Users(u, config, conn):
output = format.User(config.Format, user) output = format.User(config.Format, user)
if config.Database: if config.Database:
db.user(conn, config.Username, config.Followers, user) #db.user(conn, config.Username, config.Followers, user)
db.user(conn, config, user)
if config.Elasticsearch: if config.Elasticsearch:
_save_date = user.join_date _save_date = user.join_date
......
...@@ -94,6 +94,9 @@ class Twint: ...@@ -94,6 +94,9 @@ class Twint:
if self.config.User_id is not None: if self.config.User_id is not None:
self.config.Username = await get.Username(self.config.User_id) self.config.Username = await get.Username(self.config.User_id)
if self.config.Username is not None:
self.config.User_id = await get.UserId(self.config.Username)
if self.config.TwitterSearch and self.config.Since and self.config.Until: if self.config.TwitterSearch and self.config.Since and self.config.Until:
_days = timedelta(days=int(self.config.Timedelta)) _days = timedelta(days=int(self.config.Timedelta))
while self.d._since < self.d._until: while self.d._since < self.d._until:
......
This diff is collapsed.
...@@ -56,22 +56,37 @@ def Tweet(Tweet, config): ...@@ -56,22 +56,37 @@ def Tweet(Tweet, config):
"_id": Tweet.id + "_raw_" + config.Essid, "_id": Tweet.id + "_raw_" + config.Essid,
"_source": { "_source": {
"id": Tweet.id, "id": Tweet.id,
"conversation_id": Tweet.conversation_id,
"created_at": Tweet.created_at,
"date": dt, "date": dt,
"timezone": Tweet.timezone, "timezone": Tweet.timezone,
"place": Tweet.place,
"location": Tweet.location, "location": Tweet.location,
"tweet": Tweet.tweet, "tweet": Tweet.tweet,
"hashtags": Tweet.hashtags, "hashtags": Tweet.hashtags,
"user_id": Tweet.user_id, "user_id": Tweet.user_id,
"user_id_str": Tweet.user_id_str,
"username": Tweet.username, "username": Tweet.username,
"name": Tweet.name,
"profile_image_url": Tweet.profile_image_url,
"day": day, "day": day,
"hour": hour(Tweet.datetime), "hour": hour(Tweet.datetime),
"link": Tweet.link, "link": Tweet.link,
"gif_url": Tweet.gif_url,
"gif_thumb": Tweet.gif_thumb,
"video_url": Tweet.video_url,
"video_thumb": Tweet.video_thumb,
"is_reply_to": Tweet.is_reply_to,
"has_parent_tweet": Tweet.has_parent_tweet,
"retweet": Tweet.retweet, "retweet": Tweet.retweet,
"user_rt": Tweet.user_rt,
"essid": config.Essid, "essid": config.Essid,
"nlikes": int(Tweet.likes), "nlikes": int(Tweet.likes_count),
"nreplies": int(Tweet.replies), "nreplies": int(Tweet.replies_count),
"nretweets": int(Tweet.retweets), "nretweets": int(Tweet.retweets_count),
"is_quote_status": Tweet.is_quote_status,
"quote_id": Tweet.quote_id,
"quote_id_str": Tweet.quote_id_str,
"quote_url": Tweet.quote_url,
"search": str(config.Search) "search": str(config.Search)
} }
} }
...@@ -208,6 +223,7 @@ def UserProfile(user, config): ...@@ -208,6 +223,7 @@ def UserProfile(user, config):
"private": user.is_private, "private": user.is_private,
"verified": user.is_verified, "verified": user.is_verified,
"avatar": user.avatar, "avatar": user.avatar,
"background_image": user.background_image,
"session": config.Essid "session": config.Essid
} }
} }
......
from time import strftime, localtime from time import strftime, localtime
import re import json
#from datetime import datetime #from datetime import datetime
#import logging #import logging
...@@ -11,16 +11,84 @@ class tweet: ...@@ -11,16 +11,84 @@ class tweet:
def __init__(self): def __init__(self):
pass pass
def getRawURLS(tw, link, config):
    """Extract GIF/video media URLs from a tweet's playable-media nodes.

    Every ``div`` with class ``PlayableMedia-player`` is inspected; the
    whitespace-separated tokens of its inline ``style`` attribute that start
    with ``background`` are treated as the media thumbnail declaration.

    * If the thumbnail URL contains ``tweet_video_thumb`` the media is a GIF:
      ``gif_thumb`` is the thumbnail URL and ``gif_url`` is derived from it
      (``.jpg`` -> ``.mp4``, ``https://pbs`` -> ``https://video``, ``_thumb``
      removed).
    * Otherwise ``video_url`` / ``video_thumb`` are set to the literal
      placeholder strings ``"video"`` / ``"video_thumb"``.

    ``link`` and ``config`` are not used; they are kept so existing callers
    keep working.

    Returns a tuple ``(gif_url, gif_thumb, video_url, video_thumb)`` whose
    members are empty strings when nothing was found.
    """
    css_prefix = "background-image:url('"
    gif_url, gif_thumb, video_url, video_thumb = "", "", "", ""
    for node in tw.find_all("div", "PlayableMedia-player"):
        # A player node without an inline style has no thumbnail to parse;
        # .get() avoids the KeyError the plain subscript raised.
        for style in node.attrs.get("style", "").split():
            if not style.startswith("background"):
                continue
            style = style.replace(css_prefix, "")
            if "tweet_video_thumb" in style:
                gif_thumb = style.replace("')", "")
                gif_url = (
                    gif_thumb.replace(".jpg", ".mp4")
                    .replace("https://pbs", "https://video")
                    .replace("_thumb", "")
                )
            else:
                video_url, video_thumb = "video", "video_thumb"
    return gif_url, gif_thumb, video_url, video_thumb
def getMentions(tw): def getMentions(tw):
#logging.info("[<] " + str(datetime.now()) + ':: tweet+getMentions') #logging.info("[<] " + str(datetime.now()) + ':: tweet+getMentions')
"""Extract ment from tweet """Extract ment from tweet
""" """
mentions = [{"id":int(mention["data-mentioned-user-id"]),"id_str": mention["data-mentioned-user-id"],"screen_name":mention.get('href').split("/")[-1]} for mention in tw.find_all('a',{'class':'twitter-atreply'})]
return mentions
def getReplies(tw):
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getReplies')
    """Extract replies from tweet

    Reads the JSON stored in the ``data-reply-to-users-json`` attribute of
    the first ``div`` in ``tw`` and returns one dict per entry with the keys
    ``id`` (int), ``id_str`` and ``screen_name``.

    Returns an empty list when there is no ``div`` or the attribute is
    absent, instead of raising.
    """
    container = tw.find("div")
    raw_json = container.get("data-reply-to-users-json") if container is not None else None
    if not raw_json:
        return []
    return [
        {
            "id": int(reply["id_str"]),
            "id_str": reply["id_str"],
            "screen_name": reply["screen_name"],
        }
        for reply in json.loads(raw_json)
    ]
def getTags(tw):
#logging.info("[<] " + str(datetime.now()) + ':: tweet+getTags')
"""Extract tags from tweet
"""
tags = []
try: try:
mentions = tw.find("div", "js-original-tweet")["data-mentions"].split(" ") tag_links = tw.find("div","media-tagging-block").find_all("a","js-user-profile-link")
for tag in tag_links:
if tag.has_attr("data-user-id"):
tmpData = {
"id":int(tag["data-user-id"]),
"id_str": tag["data-user-id"],
"screen_name":tag.get('href').split("/")[-1]
}
tags.append(tmpData)
except: except:
mentions = "" tags = []
return mentions return tags
def getQuoteInfo(tw):
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getQuoteInfo')
    """Extract quote from tweet

    Looks for the ``div`` with class ``QuoteTweet-innerContainer`` and reads
    its ``data-item-id`` and ``href`` attributes.

    Returns a tuple ``(quote_status, quote_id, quote_id_str, quote_url)``:
    ``(1, <int id>, <id as str>, "https://twitter.com" + href)`` for a quote
    tweet, and ``(0, 0, "", "")`` when the node is missing or incomplete, so
    the fields never end up half-filled.
    """
    base_twitter = "https://twitter.com"
    no_quote = (0, 0, "", "")

    quote = tw.find("div", "QuoteTweet-innerContainer")
    if quote is None:
        return no_quote

    quote_id_str = quote.get("data-item-id")
    href = quote.get("href")
    if quote_id_str is None or href is None:
        return no_quote

    try:
        quote_id = int(quote_id_str)
    except ValueError:
        # A non-numeric item id cannot be a usable quote reference.
        return no_quote

    return 1, quote_id, quote_id_str, base_twitter + href
def getText(tw): def getText(tw):
#logging.info("[<] " + str(datetime.now()) + ':: tweet+getText') #logging.info("[<] " + str(datetime.now()) + ':: tweet+getText')
...@@ -33,25 +101,6 @@ def getText(tw): ...@@ -33,25 +101,6 @@ def getText(tw):
return text return text
def getTweet(tw, mentions):
#logging.info("[<] " + str(datetime.now()) + ':: tweet+getTweet')
try:
text = getText(tw)
for i in range(len(mentions)):
mention = f"@{mentions[i]}"
if mention not in text:
text = f"{mention} {text}"
except:
text = getText(tw)
return text
def getHashtags(text):
#logging.info("[<] " + str(datetime.now()) + ':: tweet+getHashtags')
"""Get hashtags of tweet
"""
return re.findall(r'(?i)\#\w+', text, flags=re.UNICODE)
def getStat(tw, _type): def getStat(tw, _type):
"""Get stats about Tweet """Get stats about Tweet
""" """
...@@ -61,42 +110,50 @@ def getStat(tw, _type): ...@@ -61,42 +110,50 @@ def getStat(tw, _type):
def getRetweet(profile, username, user): def getRetweet(profile, username, user):
#logging.info("[<] " + str(datetime.now()) + ':: tweet+getRetweet') #logging.info("[<] " + str(datetime.now()) + ':: tweet+getRetweet')
if profile and username.lower() != user: if profile and username.lower() != user.lower():
return True return 1
def getUser_rt(profile, username, user):
#logging.info("[<] " + str(datetime.now()) + ':: tweet+getUser_rt')
"""Get username that retweeted
"""
if getRetweet(profile, username, user):
user_rt = user
else:
user_rt = "None"
return user_rt
def Tweet(tw, location, config): def Tweet(tw, location, config):
"""Create Tweet object """Create Tweet object
""" """
##logging.info("[<] " + str(datetime.now()) + ':: tweet+Tweet') ##logging.info("[<] " + str(datetime.now()) + ':: tweet+Tweet')
t = tweet() t = tweet()
t.id = tw.find("div")["data-item-id"] t.id = int(tw.find("div")["data-item-id"])
t.id_str = tw.find("div")["data-item-id"]
t.conversation_id = tw.find("div")["data-conversation-id"]
t.datetime = int(tw.find("span", "_timestamp")["data-time"]) t.datetime = int(tw.find("span", "_timestamp")["data-time"])
t.datestamp = strftime("%Y-%m-%d", localtime(t.datetime)) t.datestamp = strftime("%Y-%m-%d", localtime(t.datetime))
t.timestamp = strftime("%H:%M:%S", localtime(t.datetime)) t.timestamp = strftime("%H:%M:%S", localtime(t.datetime))
t.user_id = tw.find("a", "account-group js-account-group js-action-profile js-user-profile-link js-nav")["data-user-id"] t.user_id = int(tw.find("div")["data-user-id"])
t.username = tw.find("span", "username").text.replace("@", "") t.user_id_str = tw.find("div")["data-user-id"]
t.username = tw.find("div")["data-screen-name"]
t.name = tw.find("div")["data-name"]
t.profile_image_url = tw.find("img", "js-action-profile-avatar").get('src').replace("_bigger","")
t.place = tw.find("a","js-geo-pivot-link").text.strip() if tw.find("a","js-geo-pivot-link") else None
t.timezone = strftime("%Z", localtime()) t.timezone = strftime("%Z", localtime())
for img in tw.findAll("img", "Emoji Emoji--forText"): for img in tw.findAll("img", "Emoji Emoji--forText"):
img.replaceWith(img["alt"]) img.replaceWith(img["alt"])
t.mentions = getMentions(tw) t.mentions = getMentions(tw)
t.tweet = getTweet(tw, t.mentions) t.tags = getTags(tw)
t.replies = getReplies(tw)
t.urls = [link.attrs["data-expanded-url"] for link in tw.find_all('a',{'class':'twitter-timeline-link'}) if link.has_attr("data-expanded-url")]
t.photos = [photo_node.attrs['data-image-url'] for photo_node in tw.find_all("div", "AdaptiveMedia-photoContainer")]
t.tweet = getText(tw)
t.location = location t.location = location
t.hashtags = getHashtags(t.tweet) t.hashtags = [hashtag.text for hashtag in tw.find_all("a","twitter-hashtag")]
t.replies = getStat(tw, "reply") t.replies_count = getStat(tw, "reply")
t.retweets = getStat(tw, "retweet") t.retweets_count = getStat(tw, "retweet")
t.likes = getStat(tw, "favorite") t.likes_count = getStat(tw, "favorite")
t.link = f"https://twitter.com/{t.username}/status/{t.id}" t.link = f"https://twitter.com/{t.username}/status/{t.id}"
t.retweet = getRetweet(config.Profile, t.username, config.Username) t.retweet = getRetweet(config.Profile, t.username, config.Username)
t.user_rt = getUser_rt(config.Profile, t.username, config.Username) t.gif_url, t.gif_thumb, t.video_url, t.video_thumb = getRawURLS(tw, t.link, config)
t.is_quote_status, t.quote_id, t.quote_id_str, t.quote_url = getQuoteInfo(tw)
t.is_reply_to = int(bool(tw.find("div")["data-is-reply-to"])) if tw.find("div").has_attr("data-is-reply-to") else 0
t.has_parent_tweet = int(bool(tw.find("div")["data-has-parent-tweet"])) if tw.find("div").has_attr("data-has-parent-tweet") else 0
t.in_reply_to_screen_name = ""
t.in_reply_to_status_id = 0
t.in_reply_to_status_id_str = ""
t.in_reply_to_user_id = 0
t.in_reply_to_user_id_str = ""
return t return t
...@@ -20,6 +20,10 @@ def inf(ur, _type): ...@@ -20,6 +20,10 @@ def inf(ur, _type):
ret = group["data-screen-name"] ret = group["data-screen-name"]
elif _type == "private": elif _type == "private":
ret = group["data-protected"] ret = group["data-protected"]
if ret == 'true':
ret = 1
else:
ret = 0
return ret return ret
...@@ -28,18 +32,18 @@ def card(ur, _type): ...@@ -28,18 +32,18 @@ def card(ur, _type):
try: try:
ret = ur.find("p", "ProfileHeaderCard-bio u-dir").text.replace("\n", " ") ret = ur.find("p", "ProfileHeaderCard-bio u-dir").text.replace("\n", " ")
except: except:
ret = "None" ret = None
elif _type == "location": elif _type == "location":
try: try:
ret = ur.find("span", "ProfileHeaderCard-locationText u-dir").text ret = ur.find("span", "ProfileHeaderCard-locationText u-dir").text
ret = ret[15:].replace("\n", " ")[:-10] ret = ret[15:].replace("\n", " ")[:-10]
except: except:
ret = "None" ret = None
elif _type == "url": elif _type == "url":
try: try:
ret = ur.find("span", "ProfileHeaderCard-urlText u-dir").find("a")["title"] ret = ur.find("span", "ProfileHeaderCard-urlText u-dir").find("a")["title"]
except: except:
ret = "None" ret = None
return ret return ret
...@@ -54,11 +58,13 @@ def convertToInt(x): ...@@ -54,11 +58,13 @@ def convertToInt(x):
"b" : 1000000000, "b" : 1000000000,
} }
try : try :
if ',' in x:
x = x.replace(',', '')
y = int(x) y = int(x)
return y return y
except : except :
pass pass
try : try :
y = float(str(x)[:-1]) y = float(str(x)[:-1])
y = y * multDict[str(x)[-1:].lower()] y = y * multDict[str(x)[-1:].lower()]
...@@ -79,11 +85,10 @@ def stat(ur, _type): ...@@ -79,11 +85,10 @@ def stat(ur, _type):
def media(ur): def media(ur):
try: try:
media_count = ur.find("a", "PhotoRail-headingWithCount js-nav").text media_count = ur.find("a", "PhotoRail-headingWithCount js-nav").text.strip().split(" ")[0]
media_count = media_count.replace("\n", "")[32:].split(" ")[0] media_count = convertToInt(media_count)
media_count = convertToInt(media_count)
except: except:
media_count = "0" media_count = 0
return media_count return media_count
...@@ -91,11 +96,11 @@ def verified(ur): ...@@ -91,11 +96,11 @@ def verified(ur):
try: try:
is_verified = ur.find("span", "ProfileHeaderCard-badges").text is_verified = ur.find("span", "ProfileHeaderCard-badges").text
if "Verified account" in is_verified: if "Verified account" in is_verified:
is_verified = "true" is_verified = 1
else: else:
is_verified = "false" is_verified = 0
except: except:
is_verified = "false" is_verified = 0
return is_verified return is_verified
...@@ -119,4 +124,5 @@ def User(ur): ...@@ -119,4 +124,5 @@ def User(ur):
u.is_private = inf(ur, "private") u.is_private = inf(ur, "private")
u.is_verified = verified(ur) u.is_verified = verified(ur)
u.avatar = ur.find("img", "ProfileAvatar-image")["src"] u.avatar = ur.find("img", "ProfileAvatar-image")["src"]
u.background_image = ur.find('div',{'class':'ProfileCanopy-headerBg'}).find('img').get('src')
return u return u
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment