Commit d998edb1 authored by Cody Zacharias, committed by GitHub

Rewritten for Python3

parent db08c905
tweep.py

#!/usr/bin/python3
from bs4 import BeautifulSoup
from time import gmtime, strftime
import argparse
import aiohttp
import asyncio
import async_timeout
import datetime
import json
import sys
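The rewrite swaps requests, Queue, and threading for aiohttp, asyncio, and async_timeout. Assuming a stock Python 3 environment, the third-party dependencies install with:

pip3 install aiohttp async-timeout beautifulsoup4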
async def getUrl(Min):
    # The first request (Min == -1) hits the regular search page; every
    # request after that pages through the timeline endpoint, passing the
    # max_position cursor along.
    if Min == -1:
        url = "https://twitter.com/search?f=tweets&vertical=default&lang=en&q="
    else:
        url = "https://twitter.com/i/search/timeline?f=tweets&vertical=default"
        url+= "&lang=en&include_available_features=1&include_entities=1&reset_"
        url+= "error_state=false&src=typd&max_position={}&q=".format(Min)
    if arg.u is not None:
        url+= "from%3A{0.u}".format(arg)
    if arg.s is not None:
        arg.s = arg.s.replace(" ", "%20").replace("#", "%23")
        url+= "%20{0.s}".format(arg)
    if arg.year is not None:
        url+= "%20until%3A{0.year}-1-1".format(arg)
    if arg.fruit:
        # "Low-hanging fruit": tweets mentioning e-mail addresses or
        # accounts on other services.
        url+= "%20myspace.com%20OR%20last.fm%20OR"
        url+= "%20mail%20OR%20email%20OR%20gmail%20OR%20e-mail"
        url+= "%20OR%20keybase"
    if arg.verified:
        url+= "%20filter%3Averified"
    return url
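A quick way to sanity-check the query builder is to call it with a stub in place of the parsed arguments. A minimal sketch, run at module level after getUrl is defined; the username and search phrase are made up for illustration:

import argparse
import asyncio

arg = argparse.Namespace(u="jack", s="open source", year=2017,
                         fruit=False, verified=False)
print(asyncio.get_event_loop().run_until_complete(getUrl(-1)))
# https://twitter.com/search?f=tweets&vertical=default&lang=en&q=from%3Ajack%20open%20source%20until%3A2017-1-1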
async def fetch(session, url):
    # Give up on a page after 30 seconds.
    with async_timeout.timeout(30):
        async with session.get(url) as response:
            return await response.text()

async def getFeed(Min):
    async with aiohttp.ClientSession() as session:
        r = await fetch(session, await getUrl(Min))
    feed = []
    try:
        if Min == -1:
            # The first response is a full HTML search page.
            html = r
        else:
            # Later responses are JSON; the tweets live in "items_html".
            json_response = json.loads(r)
            html = json_response["items_html"]
        soup = BeautifulSoup(html, "html.parser")
        feed = soup.find_all("li", "js-stream-item")
        if Min == -1:
            # Build the first cursor from the oldest and newest ids on the page.
            Min = "TWEET-{}-{}".format(feed[-1]["data-item-id"], feed[0]["data-item-id"])
        else:
            # Advance the cursor: swap in the oldest id from this page.
            minsplit = json_response["min_position"].split("-")
            minsplit[1] = feed[-1]["data-item-id"]
            Min = "-".join(minsplit)
    except:
        pass
    return feed, Min
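The cursor logic is the subtle part: the first page is plain HTML, so getFeed fabricates a cursor of the form TWEET-<oldest id>-<newest id>, while every later page is JSON whose min_position field is carried forward with the oldest id swapped in. A sketch of just that arithmetic, with invented tweet ids:

# Invented ids: how the cursor advances between pages.
page_one_ids = ["903", "902", "901"]           # newest first, as on the page
Min = "TWEET-{}-{}".format(page_one_ids[-1], page_one_ids[0])
print(Min)                                     # TWEET-901-903

min_position = "TWEET-901-903"                 # echoed back by the JSON endpoint
minsplit = min_position.split("-")
minsplit[1] = "850"                            # oldest id on the next page
print("-".join(minsplit))                      # TWEET-850-903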
async def getTweets(Min):
    feed, Min = await getFeed(Min)
    for tweet in feed:
        tweetid = tweet["data-item-id"]
        datestamp = tweet.find("a", "tweet-timestamp")["title"].rpartition(" - ")[-1]
        d = datetime.datetime.strptime(datestamp, "%d %b %Y")
        date = d.strftime("%Y-%m-%d")
        timestamp = str(datetime.timedelta(seconds=int(tweet.find("span", "_timestamp")["data-time"]))).rpartition(", ")[-1]
        t = datetime.datetime.strptime(timestamp, "%H:%M:%S")
        time = t.strftime("%H:%M:%S")
        username = tweet.find("span", "username").text.replace("@", "")
        timezone = strftime("%Z", gmtime())
        text = tweet.find("p", "tweet-text").text.replace("\n", " ")
        try:
            # Prepend any mentions Twitter strips out of the visible text.
            mentions = tweet.find("div", "js-original-tweet")["data-mentions"].split(" ")
            for i in range(len(mentions)):
                mention = "@{}".format(mentions[i])
                if mention not in text:
                    text = "{} {}".format(mention, text)
        except:
            pass
        if arg.users:
            output = username
        elif arg.tweets:
            output = text
        else:
            output = "{} {} {} {} <{}> {}".format(tweetid, date, time, timezone, username, text)
        # Echo to stdout and, if -o was given, also append to the output file.
        if arg.o is not None:
            print(output, file=open(arg.o, "a"))
        print(output)
    return feed, Min
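To see those selectors in isolation, the sketch below runs the same BeautifulSoup lookups against a hand-written stream item. The markup is a stripped-down imitation of Twitter's, not captured output:

from bs4 import BeautifulSoup

html = '''<li class="js-stream-item" data-item-id="123">
  <a class="tweet-timestamp" title="12:00 PM - 1 Jan 2017"></a>
  <span class="_timestamp" data-time="43200"></span>
  <span class="username">@example</span>
  <p class="tweet-text">hello world</p>
</li>'''
tweet = BeautifulSoup(html, "html.parser").find("li", "js-stream-item")
print(tweet["data-item-id"])                                              # 123
print(tweet.find("a", "tweet-timestamp")["title"].rpartition(" - ")[-1])  # 1 Jan 2017
print(tweet.find("span", "username").text.replace("@", ""))               # example
print(tweet.find("p", "tweet-text").text)                                 # hello world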
async def main():
    feed = [-1]
    Min = -1
    # Keep paging until a request comes back with no tweets.
    while True:
        if len(feed) > 0:
            feed, Min = await getTweets(Min)
        else:
            break
if __name__ == "__main__":
    ap = argparse.ArgumentParser(prog="tweep.py", usage="python3 %(prog)s [options]", description="tweep.py - An Advanced Twitter Scraping Tool")
    ap.add_argument("-u", help="User's tweets you want to scrape.")
    ap.add_argument("-s", help="Search for tweets containing this word or phrase.")
    ap.add_argument("-o", help="Save output to a file.")
    ap.add_argument("--year", help="Filter tweets before specified year.")
    ap.add_argument("--fruit", help="Display 'low-hanging-fruit' tweets.", action="store_true")
    ap.add_argument("--tweets", help="Display tweets only.", action="store_true")
    ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).", action="store_true")
    ap.add_argument("--users", help="Display users only (Use with -s).", action="store_true")
    arg = ap.parse_args()

    if arg.u is not None:
        if arg.users:
            print("[-] Contradicting Args: Please use --users in combination with -s.")
            sys.exit(0)
        if arg.verified:
            print("[-] Contradicting Args: Please use --verified in combination with -s.")
            sys.exit(0)
    if arg.tweets and arg.users:
        print("[-] Contradicting Args: --users and --tweets cannot be used together.")
        sys.exit(0)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
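Typical invocations are unchanged apart from the interpreter; the account name below is hypothetical:

python3 tweep.py -u jack --year 2016
python3 tweep.py -s "open source" --verified
python3 tweep.py -u jack --tweets -o tweets.txt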