Commit a340a1dc authored by andytnt's avatar andytnt Committed by GitHub

Merge pull request #280 from andytnt/master

#273
parents bc94da50 c3b76170
......@@ -134,11 +134,11 @@ Twitter can shadow-ban accounts, which means that their tweets will not be avail
#### userlist
> get only user info of user
> To get only user info of user
`python Twint.py -u username --user-full`
> get user info of users from a userlist
> To get user info of users from a userlist
`python Twint.py --userlist inputlist --user-full`
......@@ -146,17 +146,17 @@ Twitter can shadow-ban accounts, which means that their tweets will not be avail
> To get only tweets without user info
`python Twint.py -u username --profile-full --user-info` or `set c.User_info = False`
`python Twint.py -u username --profile-full` or `set c.User_info = False`
`python Twint.py -u username --user-info` or `set c.User_info = False`
`python Twint.py -u username` or `set c.User_info = False`
#### Tweets with user info work ONLY with a database (currently)
> To get tweets along with user info of users mentioned in tweet/replied to
`python Twint.py -u username -db database.db`
`python Twint.py -u username --user-info -db database.db`
`python Twint.py -u username --profile-full -db database.db`
`python Twint.py -u username --profile-full --user-info -db database.db`
## Contact
......
......@@ -105,6 +105,7 @@ def initialize(args):
c.Proxy_port = args.proxy_port
c.Proxy_type = args.proxy_type
c.Retweets = args.retweets
c.Get_replies = args.get_replies
return c
def options():
......@@ -186,6 +187,7 @@ def options():
ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
ap.add_argument("-pc", "--pandas-clean",
help="Automatically clean Pandas dataframe at every scrape.")
ap.add_argument("--get-replies", help="All replies to the tweet.", action="store_true")
args = ap.parse_args()
return args
......
......@@ -59,3 +59,4 @@ class Config:
Retweets = False
Query = None
Hide_output = False
Get_replies = False
......@@ -33,7 +33,7 @@ def profile(response):
json_response = loads(response)
html = json_response["items_html"]
soup = BeautifulSoup(html, "html.parser")
feed = soup.find_all("li", "js-stream-item")
feed = soup.find_all("div", "tweet")
return feed, feed[-1]["data-item-id"]
......@@ -42,5 +42,5 @@ def Json(response):
json_response = loads(response)
html = json_response["items_html"]
soup = BeautifulSoup(html, "html.parser")
feed = soup.find_all("li", "js-stream-item")
feed = soup.find_all("div", "tweet")
return feed, json_response["min_position"]
......@@ -113,10 +113,10 @@ async def Tweet(url, config, conn):
try:
response = await Request(url)
soup = BeautifulSoup(response, "html.parser")
tweet = soup.find("div", "permalink-inner permalink-tweet-container")
location = soup.find("span", "ProfileHeaderCard-locationText u-dir").text
location = location[15:].replace("\n", " ")[:-10]
await Tweets(tweet, location, config, conn)
tweets = soup.find_all("div", "tweet")
await Tweets(tweets, location, config, conn, url)
except Exception as e:
print(str(e) + " [x] get.Tweet")
......
......@@ -31,7 +31,7 @@ def datecheck(datestamp, config):
def is_tweet(tw):
    """Return True when *tw* is a real tweet node.

    Tweet nodes (the ``div.tweet`` elements the parser now collects) carry a
    ``data-item-id`` attribute; withheld/non-tweet stream items do not, so the
    attribute lookup raises and we report False.
    """
    #logging.info("[<] " + str(datetime.now()) + ':: output+is_tweet')
    try:
        # NOTE(review): the pre-refactor lookup tw.find("div")["data-item-id"]
        # was left interleaved here by the merge; the node itself is queried now.
        tw["data-item-id"]
        return True
    except (KeyError, TypeError):
        # KeyError: attribute absent; TypeError: tw is None / not subscriptable.
        # Narrowed from a bare except so real bugs are not silently swallowed.
        return False
......@@ -79,77 +79,89 @@ def _output(obj, output, config, **extra):
except UnicodeEncodeError:
print("unicode error [x] output._output")
async def tweetUserData(tweet,config, conn):
user_ids = set()
async def checkData(tweet, location, config, conn):
usernames = []
for user in tweet.mentions:
if db.get_user_id(conn, user["id"]) == -1 and user["id"] not in user_ids:
user_ids = set()
global _duplicate_dict
copyright = tweet.find("div", "StreamItemContent--withheld")
if copyright is None and is_tweet(tweet):
tweet = Tweet(tweet, location, config)
if config.Database is not None and config.User_info:
for user in tweet.mentions:
if db.get_user_id(conn, user["id"]) == -1 and user["id"] not in user_ids:
user_ids.add(user["id"])
usernames.append(user["screen_name"])
for user in tweet.tags:
if db.get_user_id(conn, user["id"]) == -1 and user["id"] not in user_ids:
for user in tweet.tags:
if db.get_user_id(conn, user["id"]) == -1 and user["id"] not in user_ids:
user_ids.add(user["id"])
usernames.append(user["screen_name"])
for user in tweet.replies:
if db.get_user_id(conn, user["id"]) == -1 and user["id"] not in user_ids:
for user in tweet.replies:
if db.get_user_id(conn, user["id"]) == -1 and user["id"] not in user_ids:
user_ids.add(user["id"])
usernames.append(user["screen_name"])
for user in usernames:
url = f"http://twitter.com/{user}?lang=en"
await get.User(url, config, conn)
async def Tweets(tw, location, config, conn):
#logging.info("[<] " + str(datetime.now()) + ':: output+Tweets')
global _duplicate_dict
copyright = tw.find("div", "StreamItemContent--withheld")
if copyright is None and is_tweet(tw):
tweet = Tweet(tw, location, config)
if config.Database is not None and config.User_info:
for user in usernames:
url = f"http://twitter.com/{user}?lang=en"
await get.User(url, config, conn)
if config.User_info:
for user in tweet.mentions:
try:
_duplicate_dict[user["screen_name"]]
except KeyError:
_duplicate_dict[user["screen_name"]] = True
_user = user["screen_name"]
url = f"http://twitter.com/{_user}?lang=en"
await get.User(url, config, conn)
for user in tweet.tags:
try:
_duplicate_dict[user["screen_name"]]
except KeyError:
_duplicate_dict[user["screen_name"]] = True
_user = user["screen_name"]
url = f"http://twitter.com/{_user}?lang=en"
await get.User(url, config, conn)
for user in tweet.replies:
try:
_duplicate_dict[user["screen_name"]]
except KeyError:
_duplicate_dict[user["screen_name"]] = True
_user = user["screen_name"]
url = f"http://twitter.com/{_user}?lang=en"
await get.User(url, config, conn)
if datecheck(tweet.datestamp, config):
output = format.Tweet(config, tweet)
if config.Database:
db.tweets(conn, tweet, config)
if config.Pandas:
panda.update(tweet, config)
if config.Elasticsearch:
elasticsearch.Tweet(tweet, config)
if config.Database is not None and config.User_info:
await tweetUserData(tweet, config, conn)
if config.User_info:
for user in tweet.mentions:
try:
_duplicate_dict[user["screen_name"]]
except KeyError:
_duplicate_dict[user["screen_name"]] = True
_user = user["screen_name"]
url = f"http://twitter.com/{_user}?lang=en"
await get.User(url, config, conn)
for user in tweet.tags:
try:
_duplicate_dict[user["screen_name"]]
except KeyError:
_duplicate_dict[user["screen_name"]] = True
_user = user["screen_name"]
url = f"http://twitter.com/{_user}?lang=en"
await get.User(url, config, conn)
for user in tweet.replies:
try:
_duplicate_dict[user["screen_name"]]
except KeyError:
_duplicate_dict[user["screen_name"]] = True
_user = user["screen_name"]
url = f"http://twitter.com/{_user}?lang=en"
await get.User(url, config, conn)
if datecheck(tweet.datestamp, config):
output = format.Tweet(config, tweet)
if config.Database:
db.tweets(conn, tweet, config)
if config.Pandas:
panda.update(tweet, config)
if config.Elasticsearch:
elasticsearch.Tweet(tweet, config)
if config.Store_object:
tweets_object.append(tweet) #twint.tweet.tweet
_output(tweet, output, config)
if config.Store_object:
tweets_object.append(tweet) #twint.tweet.tweet
_output(tweet, output, config)
async def Tweets(tweets, location, config, conn, url=''):
    """Dispatch scraped tweet nodes to checkData according to the scrape mode.

    ``tweets`` appears to be an iterable of nodes on the profile/favorites/
    location paths but a single node on the search and default (user-id)
    paths — TODO confirm against the callers.
    """
    want_all_replies = config.Get_replies and (config.Profile_full or config.Location)
    if want_all_replies:
        # Reply mode: every node on the page is processed.
        for node in tweets:
            await checkData(node, location, config, conn)
    elif config.Favorites or config.Profile_full or config.Location:
        # Process only the node whose id matches the permalink URL.
        target_id = url.split('?')[0].split('/')[-1]
        for node in tweets:
            if node['data-item-id'] == target_id:
                await checkData(node, location, config, conn)
    elif config.TwitterSearch:
        await checkData(tweets, location, config, conn)
    elif int(tweets["data-user-id"]) == config.User_id:
        # Default path: accept the node only when it belongs to the target user.
        await checkData(tweets, location, config, conn)
async def Users(u, config, conn):
#logging.info("[<] " + str(datetime.now()) + ':: output+Users')
......
......@@ -44,7 +44,7 @@ def getReplies(tw):
#logging.info("[<] " + str(datetime.now()) + ':: tweet+getReplies')
"""Extract replies from tweet
"""
replyToUsersJSON = json.loads(tw.find("div")["data-reply-to-users-json"])
replyToUsersJSON = json.loads(tw["data-reply-to-users-json"])
replies = [{"id":int(reply["id_str"]),"id_str": reply["id_str"],"screen_name":reply["screen_name"]} for reply in replyToUsersJSON]
......@@ -118,16 +118,16 @@ def Tweet(tw, location, config):
"""
##logging.info("[<] " + str(datetime.now()) + ':: tweet+Tweet')
t = tweet()
t.id = int(tw.find("div")["data-item-id"])
t.id_str = tw.find("div")["data-item-id"]
t.conversation_id = tw.find("div")["data-conversation-id"]
t.id = int(tw["data-item-id"])
t.id_str = tw["data-item-id"]
t.conversation_id = tw["data-conversation-id"]
t.datetime = int(tw.find("span", "_timestamp")["data-time-ms"])
t.datestamp = strftime("%Y-%m-%d", localtime(t.datetime/1000.0))
t.timestamp = strftime("%H:%M:%S", localtime(t.datetime/1000.0))
t.user_id = int(tw.find("div")["data-user-id"])
t.user_id_str = tw.find("div")["data-user-id"]
t.username = tw.find("div")["data-screen-name"]
t.name = tw.find("div")["data-name"]
t.user_id = int(tw["data-user-id"])
t.user_id_str = tw["data-user-id"]
t.username = tw["data-screen-name"]
t.name = tw["data-name"]
t.profile_image_url = tw.find("img", "js-action-profile-avatar").get('src').replace("_bigger","")
t.place = tw.find("a","js-geo-pivot-link").text.strip() if tw.find("a","js-geo-pivot-link") else None
t.timezone = strftime("%Z", localtime())
......@@ -148,8 +148,8 @@ def Tweet(tw, location, config):
t.retweet = getRetweet(config.Profile, t.username, config.Username)
t.gif_url, t.gif_thumb, t.video_url, t.video_thumb = getRawURLS(tw, t.link, config)
t.is_quote_status, t.quote_id, t.quote_id_str, t.quote_url = getQuoteInfo(tw)
t.is_reply_to = int(bool(tw.find("div")["data-is-reply-to"])) if tw.find("div").has_attr("data-is-reply-to") else 0
t.has_parent_tweet = int(bool(tw.find("div")["data-has-parent-tweet"])) if tw.find("div").has_attr("data-has-parent-tweet") else 0
t.is_reply_to = int(bool(tw["data-is-reply-to"])) if tw.has_attr("data-is-reply-to") else 0
t.has_parent_tweet = int(bool(tw["data-has-parent-tweet"])) if tw.has_attr("data-has-parent-tweet") else 0
t.in_reply_to_screen_name = ""
t.in_reply_to_status_id = 0
t.in_reply_to_status_id_str = ""
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment