Commit 528a2506 authored by Francesco Poldi

Revert "Merge remote-tracking branch 'origin/master'"

This reverts commit 4c27b2b82cb84e7b8d5fac206b0b8502a5faba4e.
parent 182d37f2
......@@ -4,11 +4,8 @@ PUT twinttweets
"items": {
"properties": {
"id": {"type": "long"},
"conversation_id": {"type": "text"},
"created_at": {"type":"text"},
"date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
"timezone": {"type": "text"},
"place": {"type": "text"},
"location": {"type": "text"},
"hashtags": {"type": "text"},
"tweet": {"type": "text"},
......@@ -16,28 +13,16 @@ PUT twinttweets
"retweets": {"type": "boolean"},
"likes": {"type": "boolean"},
"user_id": {"type": "keyword"},
"user_id_str": {"type":"text"},
"username": {"type": "keyword"},
"name": {"type":"text"},
"profile_image_url": {"type":"text"},
"day": {"type": "integer"},
"hour": {"type": "integer"},
"link": {"type": "text"},
"gif_url": {"type": "text"},
"gif_thumb": {"type": "text"},
"video_url": {"type": "text"},
"video_thumb": {"type": "text"},
"is_reply_to": {"type": "integer"},
"has_parent_tweet": {"type": "integer"},
"retweet": {"type": "text"},
"user_rt": {"type": "text"},
"essid": {"type": "keyword"},
"nlikes": {"type": "integer"},
"nreplies": {"type": "integer"},
"nretweets": {"type": "integer"},
"is_quote_status": {"type": "integer"},
"quote_id": {"type": "long"},
"quote_id_str": {"type":"text"},
"quote_url": {"type":"text"},
"search": {"type": "text"}
}
}
......
......@@ -17,10 +17,9 @@ PUT twintuser
"followers": {"type": "integer"},
"likes": {"type": "integer"},
"media": {"type": "integer"},
"private": {"type": "integer"},
"verified": {"type": "integer"},
"private": {"type": "boolean"},
"verified": {"type": "boolean"},
"avatar": {"type": "text"},
"background_image": {"type": "text"},
"session": {"type": "keyword"}
}
}
......
......@@ -10,7 +10,6 @@ from aiohttp_socks import SocksConnector, SocksVer
from . import url
from .output import Tweets, Users
from .user import inf
#import logging
......@@ -103,14 +102,6 @@ async def Username(_id):
return soup.find("a", "fn url alternate-context")["href"].replace("/", "")
async def UserId(username):
    """Resolve a Twitter username to its numeric user id.

    The user's profile page is fetched through ``Request``, parsed with
    BeautifulSoup, and the value returned by ``inf(soup, "id")`` is
    converted to ``int``.

    :param username: screen name used to build the profile URL
    :return: the user id as an ``int``
    """
    #logging.info("[<] " + str(datetime.now()) + ':: get+UserId')
    # Deliberately not called ``url``: this module imports a sibling module
    # under that name and a local of the same name would shadow it here.
    profile_url = f"http://twitter.com/{username}?lang=en"
    response = await Request(profile_url)
    soup = BeautifulSoup(response, "html.parser")
    return int(inf(soup, "id"))
async def Tweet(url, config, conn):
#loggin.info("[<] " + str(datetime.now()) + ':: Tweet')
try:
......
from . import format, get
from . import format
from .tweet import Tweet
from .user import User
from datetime import datetime
......@@ -78,31 +78,11 @@ def _output(obj, output, config, **extra):
except UnicodeEncodeError:
print("unicode error [x] output._output")
async def tweetUserData(tweet, config, conn):
    """Request the profile of every user referenced by ``tweet`` that
    ``db.get_user_id`` reports as ``-1``.

    ``tweet.mentions``, ``tweet.tags`` and ``tweet.replies`` are scanned in
    that order. Each entry is expected to be a mapping with ``"id"`` and
    ``"screen_name"`` keys. A user id is queued at most once, and every
    queued screen name is then passed to ``get.User`` as a profile URL.
    NOTE(review): ``-1`` presumably means "not stored yet" -- confirm
    against ``db.get_user_id``.
    """
    seen_ids = set()
    pending = []
    for group in (tweet.mentions, tweet.tags, tweet.replies):
        for entry in group:
            if db.get_user_id(conn, entry["id"]) == -1 and entry["id"] not in seen_ids:
                seen_ids.add(entry["id"])
                pending.append(entry["screen_name"])
    for screen_name in pending:
        profile_url = f"http://twitter.com/{screen_name}?lang=en"
        await get.User(profile_url, config, conn)
async def Tweets(tw, location, config, conn):
#logging.info("[<] " + str(datetime.now()) + ':: output+Tweets')
copyright = tw.find("div", "StreamItemContent--withheld")
if copyright is None and is_tweet(tw):
tweet = Tweet(tw, location, config)
await tweetUserData(tweet, config, conn)
if datecheck(tweet.datestamp, config):
output = format.Tweet(config, tweet)
......@@ -128,8 +108,7 @@ async def Users(u, config, conn):
output = format.User(config.Format, user)
if config.Database:
#db.user(conn, config.Username, config.Followers, user)
db.user(conn, config, user)
db.user(conn, config.Username, config.Followers, user)
if config.Elasticsearch:
_save_date = user.join_date
......
......@@ -94,9 +94,6 @@ class Twint:
if self.config.User_id is not None:
self.config.Username = await get.Username(self.config.User_id)
if self.config.Username is not None:
self.config.User_id = await get.UserId(self.config.Username)
if self.config.TwitterSearch and self.config.Since and self.config.Until:
_days = timedelta(days=int(self.config.Timedelta))
while self.d._since < self.d._until:
......
This diff is collapsed.
......@@ -171,37 +171,22 @@ def Tweet(Tweet, config):
"_id": Tweet.id + "_raw_" + config.Essid,
"_source": {
"id": Tweet.id,
"conversation_id": Tweet.conversation_id,
"created_at": Tweet.created_at,
"date": dt,
"timezone": Tweet.timezone,
"place": Tweet.place,
"location": Tweet.location,
"tweet": Tweet.tweet,
"hashtags": Tweet.hashtags,
"user_id": Tweet.user_id,
"user_id_str": Tweet.user_id_str,
"username": Tweet.username,
"name": Tweet.name,
"profile_image_url": Tweet.profile_image_url,
"day": day,
"hour": hour(Tweet.datetime),
"link": Tweet.link,
"gif_url": Tweet.gif_url,
"gif_thumb": Tweet.gif_thumb,
"video_url": Tweet.video_url,
"video_thumb": Tweet.video_thumb,
"is_reply_to": Tweet.is_reply_to,
"has_parent_tweet": Tweet.has_parent_tweet,
"retweet": Tweet.retweet,
"user_rt": Tweet.user_rt,
"essid": config.Essid,
"nlikes": int(Tweet.likes_count),
"nreplies": int(Tweet.replies_count),
"nretweets": int(Tweet.retweets_count),
"is_quote_status": Tweet.is_quote_status,
"quote_id": Tweet.quote_id,
"quote_id_str": Tweet.quote_id_str,
"quote_url": Tweet.quote_url,
"nlikes": int(Tweet.likes),
"nreplies": int(Tweet.replies),
"nretweets": int(Tweet.retweets),
"search": str(config.Search)
}
}
......@@ -345,7 +330,6 @@ def UserProfile(user, config):
"private": user.is_private,
"verified": user.is_verified,
"avatar": user.avatar,
"background_image": user.background_image,
"session": config.Essid
}
}
......
from time import strftime, localtime
import json
import re
#from datetime import datetime
#import logging
......@@ -11,84 +11,16 @@ class tweet:
def __init__(self):
pass
def getRawURLS(tw, link, config):
    """Derive GIF/video URLs from the inline styles of a tweet's media players.

    Every ``div.PlayableMedia-player`` node is inspected. Each
    whitespace-separated token of its ``style`` attribute that starts with
    ``background`` is treated as a thumbnail reference:

    * if it contains ``tweet_video_thumb`` it is a GIF: the thumbnail URL is
      kept and the ``.mp4`` URL is derived from it by string substitution;
    * otherwise the placeholder strings ``"video"`` / ``"video_thumb"`` are
      returned for the video fields.

    :param tw: parsed tweet node exposing ``find_all``
    :param link: unused, kept for interface compatibility
    :param config: unused, kept for interface compatibility
    :return: tuple ``(gif_url, gif_thumb, video_url, video_thumb)``; fields
        that were not found are empty strings
    """
    gif_url, gif_thumb, video_url, video_thumb = "", "", "", ""
    prefix = "background-image:url('"
    for node in tw.find_all("div", "PlayableMedia-player"):
        # A player node without a style attribute used to raise KeyError;
        # treat it as having no style tokens instead.
        for style in node.attrs.get("style", "").split():
            if not style.startswith("background"):
                continue
            style = style.replace(prefix, "")
            if "tweet_video_thumb" in style:
                gif_thumb = style.replace("')", "")
                gif_url = gif_thumb.replace(".jpg", ".mp4")
                gif_url = gif_url.replace("https://pbs", "https://video")
                gif_url = gif_url.replace("_thumb", "")
            else:
                video_url, video_thumb = "video", "video_thumb"
    return gif_url, gif_thumb, video_url, video_thumb
def getMentions(tw):
    """Extract the users mentioned in a tweet.

    One dict is produced per ``a.twitter-atreply`` anchor, with the keys
    ``id`` (int), ``id_str`` (the raw ``data-mentioned-user-id`` value) and
    ``screen_name`` (last path segment of the anchor's ``href``).
    """
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getMentions')
    mentions = []
    for anchor in tw.find_all('a', {'class': 'twitter-atreply'}):
        mentions.append({
            "id": int(anchor["data-mentioned-user-id"]),
            "id_str": anchor["data-mentioned-user-id"],
            "screen_name": anchor.get('href').split("/")[-1],
        })
    return mentions
def getReplies(tw):
    """Extract the users a tweet replies to.

    The JSON array stored in the ``data-reply-to-users-json`` attribute of
    the first ``div`` is decoded. Each element becomes a dict with the keys
    ``id`` (int), ``id_str`` and ``screen_name``.
    """
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getReplies')
    raw_json = tw.find("div")["data-reply-to-users-json"]
    replies = []
    for entry in json.loads(raw_json):
        replies.append({
            "id": int(entry["id_str"]),
            "id_str": entry["id_str"],
            "screen_name": entry["screen_name"],
        })
    return replies
def getTags(tw):
    """Extract the users tagged in a tweet's media.

    Looks up ``div.media-tagging-block`` and collects every
    ``a.js-user-profile-link`` inside it that carries a ``data-user-id``
    attribute. Each hit becomes a dict with the keys ``id`` (int),
    ``id_str`` and ``screen_name`` (last path segment of ``href``).

    Extraction is best-effort: if the block is absent, or any link cannot be
    parsed, an empty list is returned.
    """
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getTags')
    tags = []
    try:
        block = tw.find("div", "media-tagging-block")
        if block is None:
            # No media tags on this tweet.
            return []
        for tag in block.find_all("a", "js-user-profile-link"):
            if tag.has_attr("data-user-id"):
                tags.append({
                    "id": int(tag["data-user-id"]),
                    "id_str": tag["data-user-id"],
                    "screen_name": tag.get('href').split("/")[-1],
                })
    except (AttributeError, KeyError, TypeError, ValueError):
        # Only lookup/parsing failures are swallowed; a bare ``except`` would
        # also hide KeyboardInterrupt and SystemExit.
        return []
    return tags
def getQuoteInfo(tw):
#logging.info("[<] " + str(datetime.now()) + ':: tweet+getQuoteInfo')
"""Extract quote from tweet
"""
base_twitter = "https://twitter.com"
quote_status = 0
quote_id = 0
quote_id_str = ""
quote_url = ""
try:
quote = tw.find("div","QuoteTweet-innerContainer")
quote_status = 1
quote_id = int(quote["data-item-id"])
quote_id_str = quote["data-item-id"]
quote_url = base_twitter + quote.get("href")
mentions = tw.find("div", "js-original-tweet")["data-mentions"].split(" ")
except:
quote_status = 0
mentions = ""
return quote_status, quote_id, quote_id_str, quote_url
return mentions
def getText(tw):
#logging.info("[<] " + str(datetime.now()) + ':: tweet+getText')
......@@ -101,6 +33,25 @@ def getText(tw):
return text
def getTweet(tw, mentions):
    """Return the tweet text with every missing ``@mention`` prepended.

    The text comes from ``getText(tw)``. For each entry of ``mentions`` the
    string ``@<entry>`` is put in front of the text unless the text already
    contains it.

    :param tw: parsed tweet node, passed through to ``getText``
    :param mentions: iterable of screen names; if it cannot be iterated or
        checked against the text (``TypeError``), the unmodified text is
        returned
    :return: the (possibly prefixed) tweet text
    """
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getTweet')
    text = getText(tw)
    try:
        for name in mentions:
            mention = f"@{name}"
            if mention not in text:
                text = f"{mention} {text}"
    except TypeError:
        # Unusable ``mentions`` (e.g. None): fall back to the plain text.
        text = getText(tw)
    return text
def getHashtags(text):
    """Return all hashtags found in ``text``, in order of appearance.

    A hashtag is a ``#`` immediately followed by one or more word
    characters; a lone ``#`` is not matched.
    """
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getHashtags')
    hashtag_pattern = re.compile(r'(?i)\#\w+', flags=re.UNICODE)
    return hashtag_pattern.findall(text)
def getStat(tw, _type):
"""Get stats about Tweet
"""
......@@ -110,50 +61,42 @@ def getStat(tw, _type):
def getRetweet(profile, username, user):
#logging.info("[<] " + str(datetime.now()) + ':: tweet+getRetweet')
if profile and username.lower() != user.lower():
return 1
if profile and username.lower() != user:
return True
def getUser_rt(profile, username, user):
    """Get username that retweeted

    Returns ``user`` when ``getRetweet(profile, username, user)`` is truthy,
    otherwise the literal string ``"None"``.
    """
    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getUser_rt')
    return user if getRetweet(profile, username, user) else "None"
def Tweet(tw, location, config):
"""Create Tweet object
"""
##logging.info("[<] " + str(datetime.now()) + ':: tweet+Tweet')
t = tweet()
t.id = int(tw.find("div")["data-item-id"])
t.id_str = tw.find("div")["data-item-id"]
t.conversation_id = tw.find("div")["data-conversation-id"]
t.id = tw.find("div")["data-item-id"]
t.datetime = int(tw.find("span", "_timestamp")["data-time"])
t.datestamp = strftime("%Y-%m-%d", localtime(t.datetime))
t.timestamp = strftime("%H:%M:%S", localtime(t.datetime))
t.user_id = int(tw.find("div")["data-user-id"])
t.user_id_str = tw.find("div")["data-user-id"]
t.username = tw.find("div")["data-screen-name"]
t.name = tw.find("div")["data-name"]
t.profile_image_url = tw.find("img", "js-action-profile-avatar").get('src').replace("_bigger","")
t.place = tw.find("a","js-geo-pivot-link").text.strip() if tw.find("a","js-geo-pivot-link") else None
t.user_id = tw.find("a", "account-group js-account-group js-action-profile js-user-profile-link js-nav")["data-user-id"]
t.username = tw.find("span", "username").text.replace("@", "")
t.timezone = strftime("%Z", localtime())
for img in tw.findAll("img", "Emoji Emoji--forText"):
img.replaceWith(img["alt"])
t.mentions = getMentions(tw)
t.tags = getTags(tw)
t.replies = getReplies(tw)
t.urls = [link.attrs["data-expanded-url"] for link in tw.find_all('a',{'class':'twitter-timeline-link'}) if link.has_attr("data-expanded-url")]
t.photos = [photo_node.attrs['data-image-url'] for photo_node in tw.find_all("div", "AdaptiveMedia-photoContainer")]
t.tweet = getText(tw)
t.tweet = getTweet(tw, t.mentions)
t.location = location
t.hashtags = [hashtag.text for hashtag in tw.find_all("a","twitter-hashtag")]
t.replies_count = getStat(tw, "reply")
t.retweets_count = getStat(tw, "retweet")
t.likes_count = getStat(tw, "favorite")
t.hashtags = getHashtags(t.tweet)
t.replies = getStat(tw, "reply")
t.retweets = getStat(tw, "retweet")
t.likes = getStat(tw, "favorite")
t.link = f"https://twitter.com/{t.username}/status/{t.id}"
t.retweet = getRetweet(config.Profile, t.username, config.Username)
t.gif_url, t.gif_thumb, t.video_url, t.video_thumb = getRawURLS(tw, t.link, config)
t.is_quote_status, t.quote_id, t.quote_id_str, t.quote_url = getQuoteInfo(tw)
t.is_reply_to = int(bool(tw.find("div")["data-is-reply-to"])) if tw.find("div").has_attr("data-is-reply-to") else 0
t.has_parent_tweet = int(bool(tw.find("div")["data-has-parent-tweet"])) if tw.find("div").has_attr("data-has-parent-tweet") else 0
t.in_reply_to_screen_name = ""
t.in_reply_to_status_id = 0
t.in_reply_to_status_id_str = ""
t.in_reply_to_user_id = 0
t.in_reply_to_user_id_str = ""
t.user_rt = getUser_rt(config.Profile, t.username, config.Username)
return t
......@@ -20,10 +20,6 @@ def inf(ur, _type):
ret = group["data-screen-name"]
elif _type == "private":
ret = group["data-protected"]
if ret == 'true':
ret = 1
else:
ret = 0
return ret
......@@ -32,18 +28,18 @@ def card(ur, _type):
try:
ret = ur.find("p", "ProfileHeaderCard-bio u-dir").text.replace("\n", " ")
except:
ret = None
ret = "None"
elif _type == "location":
try:
ret = ur.find("span", "ProfileHeaderCard-locationText u-dir").text
ret = ret[15:].replace("\n", " ")[:-10]
except:
ret = None
ret = "None"
elif _type == "url":
try:
ret = ur.find("span", "ProfileHeaderCard-urlText u-dir").find("a")["title"]
except:
ret = None
ret = "None"
return ret
......@@ -58,13 +54,11 @@ def convertToInt(x):
"b" : 1000000000,
}
try :
if ',' in x:
x = x.replace(',', '')
y = int(x)
return y
except :
pass
try :
y = float(str(x)[:-1])
y = y * multDict[str(x)[-1:].lower()]
......@@ -85,10 +79,11 @@ def stat(ur, _type):
def media(ur):
try:
media_count = ur.find("a", "PhotoRail-headingWithCount js-nav").text.strip().split(" ")[0]
media_count = convertToInt(media_count)
media_count = ur.find("a", "PhotoRail-headingWithCount js-nav").text
media_count = media_count.replace("\n", "")[32:].split(" ")[0]
media_count = convertToInt(media_count)
except:
media_count = 0
media_count = "0"
return media_count
......@@ -96,11 +91,11 @@ def verified(ur):
try:
is_verified = ur.find("span", "ProfileHeaderCard-badges").text
if "Verified account" in is_verified:
is_verified = 1
is_verified = "true"
else:
is_verified = 0
is_verified = "false"
except:
is_verified = 0
is_verified = "false"
return is_verified
......@@ -124,5 +119,4 @@ def User(ur):
u.is_private = inf(ur, "private")
u.is_verified = verified(ur)
u.avatar = ur.find("img", "ProfileAvatar-image")["src"]
u.background_image = ur.find('div',{'class':'ProfileCanopy-headerBg'}).find('img').get('src')
return u
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment