Commit d998edb1 authored by Cody Zacharias, committed by GitHub

Rewritten for Python3

parent db08c905
tweep.py

#!/usr/bin/python3
from bs4 import BeautifulSoup
from time import gmtime, strftime
import argparse
import aiohttp
import asyncio
import async_timeout
import datetime
import json
import sys
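The rewrite swaps requests, Queue, and threading for aiohttp, asyncio, and async_timeout. Assuming a stock Python 3 environment, the third-party dependencies install with:

pip3 install aiohttp async-timeout beautifulsoup4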
async def getUrl(Min):
    # The first request (Min == -1) hits the regular search page; every
    # request after that pages through the timeline endpoint, passing the
    # max_position cursor along.
    if Min == -1:
        url = "https://twitter.com/search?f=tweets&vertical=default&lang=en&q="
    else:
        url = "https://twitter.com/i/search/timeline?f=tweets&vertical=default"
        url+= "&lang=en&include_available_features=1&include_entities=1&reset_"
        url+= "error_state=false&src=typd&max_position={}&q=".format(Min)
    if arg.u is not None:
        url+= "from%3A{0.u}".format(arg)
    if arg.s is not None:
        arg.s = arg.s.replace(" ", "%20").replace("#", "%23")
        url+= "%20{0.s}".format(arg)
    if arg.year is not None:
        url+= "%20until%3A{0.year}-1-1".format(arg)
    if arg.fruit:
        # "Low-hanging fruit": tweets mentioning e-mail addresses or
        # accounts on other services.
        url+= "%20myspace.com%20OR%20last.fm%20OR"
        url+= "%20mail%20OR%20email%20OR%20gmail%20OR%20e-mail"
        url+= "%20OR%20keybase"
    if arg.verified:
        url+= "%20filter%3Averified"
    return url
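A quick way to sanity-check the query builder is to call it with a stub in place of the parsed arguments. A minimal sketch, run at module level after getUrl is defined; the username and search phrase are made up for illustration:

import argparse
import asyncio

arg = argparse.Namespace(u="jack", s="open source", year=2017,
                         fruit=False, verified=False)
print(asyncio.get_event_loop().run_until_complete(getUrl(-1)))
# https://twitter.com/search?f=tweets&vertical=default&lang=en&q=from%3Ajack%20open%20source%20until%3A2017-1-1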
async def fetch(session, url):
    # Give up on a page after 30 seconds.
    with async_timeout.timeout(30):
        async with session.get(url) as response:
            return await response.text()

async def getFeed(Min):
    async with aiohttp.ClientSession() as session:
        r = await fetch(session, await getUrl(Min))
    feed = []
    try:
        if Min == -1:
            # The first response is a full HTML search page.
            html = r
        else:
            # Later responses are JSON; the tweets live in "items_html".
            json_response = json.loads(r)
            html = json_response["items_html"]
        soup = BeautifulSoup(html, "html.parser")
        feed = soup.find_all("li", "js-stream-item")
        if Min == -1:
            # Build the first cursor from the oldest and newest ids on the page.
            Min = "TWEET-{}-{}".format(feed[-1]["data-item-id"], feed[0]["data-item-id"])
        else:
            # Advance the cursor: swap in the oldest id from this page.
            minsplit = json_response["min_position"].split("-")
            minsplit[1] = feed[-1]["data-item-id"]
            Min = "-".join(minsplit)
    except:
        pass
    return feed, Min
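The cursor logic is the subtle part: the first page is plain HTML, so getFeed fabricates a cursor of the form TWEET-<oldest id>-<newest id>, while every later page is JSON whose min_position field is carried forward with the oldest id swapped in. A sketch of just that arithmetic, with invented tweet ids:

# Invented ids: how the cursor advances between pages.
page_one_ids = ["903", "902", "901"]           # newest first, as on the page
Min = "TWEET-{}-{}".format(page_one_ids[-1], page_one_ids[0])
print(Min)                                     # TWEET-901-903

min_position = "TWEET-901-903"                 # echoed back by the JSON endpoint
minsplit = min_position.split("-")
minsplit[1] = "850"                            # oldest id on the next page
print("-".join(minsplit))                      # TWEET-850-903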
async def getTweets(Min):
    feed, Min = await getFeed(Min)
    for tweet in feed:
        tweetid = tweet["data-item-id"]
        datestamp = tweet.find("a", "tweet-timestamp")["title"].rpartition(" - ")[-1]
        d = datetime.datetime.strptime(datestamp, "%d %b %Y")
        date = d.strftime("%Y-%m-%d")
        timestamp = str(datetime.timedelta(seconds=int(tweet.find("span", "_timestamp")["data-time"]))).rpartition(", ")[-1]
        t = datetime.datetime.strptime(timestamp, "%H:%M:%S")
        time = t.strftime("%H:%M:%S")
        username = tweet.find("span", "username").text.replace("@", "")
        timezone = strftime("%Z", gmtime())
        text = tweet.find("p", "tweet-text").text.replace("\n", " ")
        try:
            # Prepend any mentions Twitter strips out of the visible text.
            mentions = tweet.find("div", "js-original-tweet")["data-mentions"].split(" ")
            for i in range(len(mentions)):
                mention = "@{}".format(mentions[i])
                if mention not in text:
                    text = "{} {}".format(mention, text)
        except:
            pass
        if arg.users:
            output = username
        elif arg.tweets:
            output = text
        else:
            output = "{} {} {} {} <{}> {}".format(tweetid, date, time, timezone, username, text)
        # Echo to stdout and, if -o was given, also append to the output file.
        if arg.o is not None:
            print(output, file=open(arg.o, "a"))
        print(output)
    return feed, Min
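To see those selectors in isolation, the sketch below runs the same BeautifulSoup lookups against a hand-written stream item. The markup is a stripped-down imitation of Twitter's, not captured output:

from bs4 import BeautifulSoup

html = '''<li class="js-stream-item" data-item-id="123">
  <a class="tweet-timestamp" title="12:00 PM - 1 Jan 2017"></a>
  <span class="_timestamp" data-time="43200"></span>
  <span class="username">@example</span>
  <p class="tweet-text">hello world</p>
</li>'''
tweet = BeautifulSoup(html, "html.parser").find("li", "js-stream-item")
print(tweet["data-item-id"])                                              # 123
print(tweet.find("a", "tweet-timestamp")["title"].rpartition(" - ")[-1])  # 1 Jan 2017
print(tweet.find("span", "username").text.replace("@", ""))               # example
print(tweet.find("p", "tweet-text").text)                                 # hello world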
async def main():
    feed = [-1]
    Min = -1
    # Keep paging until a request comes back with no tweets.
    while True:
        if len(feed) > 0:
            feed, Min = await getTweets(Min)
        else:
            break
if __name__ == "__main__":
    ap = argparse.ArgumentParser(prog="tweep.py", usage="python3 %(prog)s [options]", description="tweep.py - An Advanced Twitter Scraping Tool")
    ap.add_argument("-u", help="User's tweets you want to scrape.")
    ap.add_argument("-s", help="Search for tweets containing this word or phrase.")
    ap.add_argument("-o", help="Save output to a file.")
    ap.add_argument("--year", help="Filter tweets before specified year.")
    ap.add_argument("--fruit", help="Display 'low-hanging-fruit' tweets.", action="store_true")
    ap.add_argument("--tweets", help="Display tweets only.", action="store_true")
    ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).", action="store_true")
    ap.add_argument("--users", help="Display users only (Use with -s).", action="store_true")
    arg = ap.parse_args()

    if arg.u is not None:
        if arg.users:
            print("[-] Contradicting Args: Please use --users in combination with -s.")
            sys.exit(0)
        if arg.verified:
            print("[-] Contradicting Args: Please use --verified in combination with -s.")
            sys.exit(0)
    if arg.tweets and arg.users:
        print("[-] Contradicting Args: --users and --tweets cannot be used together.")
        sys.exit(0)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
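Typical invocations are unchanged apart from the interpreter; the account name below is hypothetical:

python3 tweep.py -u jack --year 2016
python3 tweep.py -s "open source" --verified
python3 tweep.py -u jack --tweets -o tweets.txt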