Commit d998edb1 authored by Cody Zacharias, committed via GitHub

Rewritten for Python3

parent db08c905
#!/usr/bin/env python3
# tweep.py preamble.  NOTE(review): this file is a garbled interleave of two
# versions (a Python 2 threaded implementation and a Python 3 asyncio
# rewrite); this merged import block serves both.

# Standard library
import argparse
import asyncio
import datetime
import json
import os
import queue  # Python 3 fix: the Python 2 module was named `Queue`
import re
import sys
import threading
from io import BytesIO
from time import gmtime, strftime

# Third-party
import aiohttp
import async_timeout
import requests
from bs4 import BeautifulSoup
from PIL import Image

# Work queue shared between the picture-download worker threads.
q = queue.Queue()
# NOTE(review): diff artifact - this coroutine is truncated here; its
# continuation (the query filters and the final `return url`) appears
# further down the file, interleaved with the older implementation.
async def getUrl(Min):
# Min == -1 means "first page": plain search URL; otherwise use the JSON
# timeline endpoint with a max_position cursor for pagination.
if Min == -1:
url = "https://twitter.com/search?f=tweets&vertical=default&lang=en&q="
else:
url = "https://twitter.com/i/search/timeline?f=tweets&vertical=default"
url+= "&lang=en&include_available_features=1&include_entities=1&reset_"
url+= "error_state=false&src=typd&max_position={}&q=".format(Min)
# Older (Python 2 era) threaded implementation; its methods are scattered
# through this interleaved diff.
class tweep:
def __init__(self):
# Pagination cursor: -1 requests the first results page.
self.min = -1
# Query parameters, taken from the module-level CLI namespace `arg`.
self.author = arg.u
self.search = arg.s
self.year = arg.year
# Sentinel non-empty feed so the main loop runs at least once.
self.feed = [-1]
self.tweets = 0
self.tweet_urls = []
self.pic_count = 0
# NOTE(review): diff artifact - these lines are the displaced continuation
# of the async getUrl() defined earlier; they append query filters to `url`.
if arg.u != None:
url+= "from%3A{0.u}".format(arg)
if arg.s != None:
# URL-encode spaces and hashtags in the search phrase.
arg.s = arg.s.replace(" ", "%20").replace("#", "%23")
url+= "%20{0.s}".format(arg)
if arg.year != None:
# Restrict to tweets before Jan 1 of the given year.
url+= "%20until%3A{0.year}-1-1".format(arg)
if arg.fruit:
# "Low-hanging fruit": tweets likely to expose contact details.
url+= "%20myspace.com%20OR%20last.fm%20OR"
url+= "%20mail%20OR%20email%20OR%20gmail%20OR%20e-mail"
url+= "%20OR%20phone%20OR%20call%20me%20OR%20text%20me"
url+= "%20OR%20keybase"
if arg.verified:
url+= "%20filter%3Averified"
def get_url(self):
    """Build the Twitter search/timeline URL for the current query state.

    Uses the plain search endpoint for the first page (self.min == -1) and
    the JSON timeline endpoint with a max_position cursor afterwards.
    Also reads CLI flags from the module-level `arg` namespace.
    """
    url_1 = "https://twitter.com/search?f=tweets&vertical=default&lang=en&q="
    url_2 = "https://twitter.com/i/search/timeline?f=tweets&vertical=default"
    url_2 += "&lang=en&include_available_features=1&include_entities=1&reset_error_state=false&src=typd"
    url = url_1 if self.min == -1 else "{0}&max_position={1.min}&q=".format(url_2, self)
    if self.author is not None:
        url += "from%3A{0.author}".format(self)
    if self.search is not None:
        # URL-encode spaces and hashtags in the search phrase.
        search = self.search.replace(' ', '%20').replace('#', '%23')
        url += "%20{}".format(search)
    if self.year is not None:
        # Restrict to tweets before Jan 1 of the given year.
        url += "%20until%3A{0.year}-1-1".format(self)
    if arg.pics:
        url += "%20filter%3Aimages"
    if arg.fruit:
        # "Low-hanging fruit": tweets likely to expose contact details.
        url += "%20myspace.com%20OR%20last.fm%20OR"
        url += "%20mail%20OR%20email%20OR%20gmail%20OR%20e-mail"
        url += "%20OR%20phone%20OR%20call%20me%20OR%20text%20me"
        url += "%20OR%20keybase"
    if arg.verified:
        url += "%20filter%3Averified"
    return url
    # Bug fix: the original had a second, unreachable `return url` here -
    # a diff artifact from the interleaved async rewrite; removed.
def get_feed(self):
    """Fetch one page of results and return the list of tweet <li> nodes.

    Side effects: updates self.feed and advances the pagination cursor
    self.min.  On fetch/parse failure self.feed stays [] (best effort,
    matching the original behaviour) so the caller's loop terminates.
    """
    r = requests.get(self.get_url(), headers=agent)
    self.feed = []
    try:
        if self.min == -1:
            # First page is a full HTML document.
            html = r.text
        else:
            # Subsequent pages come back as JSON wrapping an HTML fragment.
            json_response = json.loads(r.text)
            html = json_response['items_html']
        soup = BeautifulSoup(html, "lxml")
        self.feed = soup.find_all('li', 'js-stream-item')
        lastid = self.feed[-1]['data-item-id']
        firstid = self.feed[0]['data-item-id']
        if self.min == -1:
            self.min = "TWEET-{}-{}".format(lastid, firstid)
        else:
            # Keep Twitter's cursor format, replacing only the "min" id.
            minsplit = json_response['min_position'].split('-')
            minsplit[1] = lastid
            self.min = "-".join(minsplit)
    except (ValueError, KeyError, IndexError):
        # ValueError: bad JSON; KeyError: missing fields; IndexError: empty
        # feed.  The original swallowed everything with a bare `except`.
        pass
    return self.feed
# Async HTTP GET helper: return the response body as text, aborting after
# 30 seconds.  NOTE(review): modern async_timeout requires `async with`;
# the plain `with` form here only works on old versions of that library -
# confirm the pinned dependency version.
async def fetch(session, url):
with async_timeout.timeout(30):
async with session.get(url) as response:
return await response.text()
def get_tweets(self):
    """Parse the current feed page and print (or collect) each tweet.

    Increments self.tweets per tweet.  In --pics mode, collects the tweet's
    photo-page URL into self.tweet_urls instead of printing.
    """
    for tweet in self.get_feed():
        self.tweets += 1
        tweetid = tweet['data-item-id']
        # The timestamp title looks like "12:34 PM - 1 Jan 2017".
        datestamp = tweet.find('a', 'tweet-timestamp')['title'].rpartition(' - ')[-1]
        d = datetime.datetime.strptime(datestamp, '%d %b %Y')
        date = d.strftime('%Y-%m-%d')
        timestamp = str(datetime.timedelta(seconds=int(tweet.find('span', '_timestamp')['data-time']))).rpartition(', ')[-1]
        t = datetime.datetime.strptime(timestamp, '%H:%M:%S')
        time = t.strftime('%H:%M:%S')
        # Python 3 fix: .text is already str; the original's .encode('utf8')
        # produced bytes, making the str-argument .replace() raise TypeError.
        username = tweet.find('span', 'username').text.replace('@', '')
        timezone = strftime("%Z", gmtime())
        text = tweet.find('p', 'tweet-text').text.replace('\n', ' ')
        try:
            mentions = tweet.find("div", "js-original-tweet")['data-mentions'].split(" ")
            for mention in mentions:
                # Re-attach mentions stripped from the rendered text, but do
                # not duplicate ones already present (consistent with the
                # async rewrite elsewhere in this file).
                handle = "@{}".format(mention)
                if handle not in text:
                    text = "{} {}".format(handle, text)
        except (TypeError, KeyError):
            # Tweet node has no data-mentions attribute.
            pass
        if arg.pics:
            tweet_url = "https://twitter.com/{0}/status/{1}/photo/1".format(username, tweetid)
            self.tweet_urls.append(tweet_url)
        else:
            if arg.users:
                print(username)
            elif arg.tweets:
                print(text)
            else:
                print("{} {} {} {} <{}> {}".format(tweetid, date, time, timezone, username, text))
async def getFeed(Min):
    """Fetch one page of search results asynchronously.

    Returns (feed, Min): the list of tweet <li> nodes and the advanced
    pagination cursor.  On any fetch/parse failure the feed stays [] so the
    caller's loop terminates.
    """
    async with aiohttp.ClientSession() as session:
        r = await fetch(session, await getUrl(Min))
        feed = []
        try:
            if Min == -1:
                # First page is a full HTML document.
                html = r
            else:
                # Later pages are a JSON envelope around an HTML fragment.
                json_response = json.loads(r)
                html = json_response["items_html"]
            soup = BeautifulSoup(html, "html.parser")
            feed = soup.find_all("li", "js-stream-item")
            if Min == -1:
                Min = "TWEET-{}-{}".format(feed[-1]["data-item-id"], feed[0]["data-item-id"])
            else:
                # Keep Twitter's cursor format, replacing only the "min" id.
                minsplit = json_response["min_position"].split("-")
                minsplit[1] = feed[-1]["data-item-id"]
                Min = "-".join(minsplit)
        except (ValueError, KeyError, IndexError):
            # Bad JSON / missing keys / empty feed: leave feed == [].
            pass
        # Bug fix: the original's `return feed, Min` was displaced by the
        # interleaved diff (it appears mid-file), so this coroutine returned
        # None and the caller's tuple-unpack raised TypeError.
        return feed, Min
def save_pic(self, picture):
    """Download one media URL into tweep_img/<author>/, skipping files that
    already exist.  Increments self.pic_count per new download.
    """
    save_dir = 'tweep_img/{0.author}'.format(self)
    # makedirs creates intermediate directories, so one call replaces the
    # original's two exists()/makedirs() pairs; exist_ok avoids the
    # check-then-create race between worker threads.
    os.makedirs(save_dir, exist_ok=True)
    filename = picture[len('https://pbs.twimg.com/media/'):]
    if not os.path.isfile(os.path.join(save_dir, filename)):
        r = requests.get(picture, headers=agent)
        i = Image.open(BytesIO(r.content))
        i.save(os.path.join(save_dir, filename))
        print(" Downloading: {}".format(filename))
        self.pic_count += 1
    # NOTE(review): a stray `return feed, Min` (a diff artifact belonging to
    # the async getFeed) followed this method in the original; removed here.
def get_pics(self, tweet_url):
    """Fetch a tweet's photo page and save its image, if one is present."""
    response = requests.get(tweet_url, headers=agent)
    page = BeautifulSoup(response.text, "lxml")
    photo = page.find('div', 'AdaptiveMedia-photoContainer js-adaptive-photo ')
    if photo is None:
        return
    image_url = photo['data-image-url'].replace(' ', '')
    self.save_pic(image_url)
# NOTE(review): diff artifact - this coroutine is truncated; the code that
# builds and prints `output`, and its `return feed, Min`, appear displaced
# further down the file.
async def getTweets(Min):
feed, Min = await getFeed(Min)
for tweet in feed:
tweetid = tweet["data-item-id"]
# The timestamp title looks like "12:34 PM - 1 Jan 2017".
datestamp = tweet.find("a", "tweet-timestamp")["title"].rpartition(" - ")[-1]
d = datetime.datetime.strptime(datestamp, "%d %b %Y")
date = d.strftime("%Y-%m-%d")
timestamp = str(datetime.timedelta(seconds=int(tweet.find("span", "_timestamp")["data-time"]))).rpartition(", ")[-1]
t = datetime.datetime.strptime(timestamp, "%H:%M:%S")
time = t.strftime("%H:%M:%S")
username = tweet.find("span", "username").text.replace("@", "")
timezone = strftime("%Z", gmtime())
text = tweet.find("p", "tweet-text").text.replace("\n", " ")
try:
mentions = tweet.find("div", "js-original-tweet")["data-mentions"].split(" ")
for i in range(len(mentions)):
mention = "@{}".format(mentions[i])
# Re-attach stripped mentions without duplicating ones already in the text.
if mention not in text:
text = "{} {}".format(mention, text)
except:
pass
# Worker-thread loop: pull photo-page URLs off the shared module-level queue
# `q` forever and download each one.  Runs as a daemon thread, so it exits
# with the process once q.join() in main() returns.
def fetch_pics(self):
while True:
tweet_url = q.get()
self.get_pics(tweet_url)
q.task_done()
# NOTE(review): diff artifact - displaced body of the async getTweets() loop
# that formats one line of output per tweet.
if arg.users:
    output = username
elif arg.tweets:
    # Bug fix: the original assigned the undefined name `tweets` here, which
    # would raise NameError; the tweet text is what gets displayed.
    output = text
else:
    output = "{} {} {} {} <{}> {}".format(tweetid, date, time, timezone, username, text)
def main(self):
    """Drive the scrape: page through tweets, then optionally download pics."""
    if arg.pics:
        print("[+] Searching Tweets For Photos.")
    # Keep fetching pages until one comes back empty.  The original condition
    # also tested `self.tweets < float('inf')`, which is always true, so it
    # reduces to this.
    while len(self.feed) > 0:
        self.get_tweets()
    if arg.pics:
        # Bug fix: the original reported len(self.tweet_urls) - 1, understating
        # the picture count by one.
        total = len(self.tweet_urls)
        print("[+] {} pictures found. Collecting Pictures.".format(total))
        # Ten daemon workers drain the shared queue of photo-page URLs.
        for _ in range(10):
            t = threading.Thread(target=self.fetch_pics)
            t.daemon = True
            t.start()
        for tweet_url in self.tweet_urls:
            q.put(tweet_url)
        q.join()
        print("[+] Done. {t.pic_count} pictures saved from {t.author}.".format(t=self))
    # NOTE(review): two stray lines appending `output` to the arg.o file (a
    # diff artifact from the async rewrite) followed this method; removed.
def check():
    """Validate mutually-exclusive CLI flag combinations, exiting on conflict."""
    if arg.u is not None and arg.users:
        print("Please use --users in combination with -s.")
        sys.exit(0)
    if arg.u is not None and arg.verified:
        print("Please use --verified in combination with -s.")
        sys.exit(0)
    if arg.tweets and arg.users:
        print("--users and --tweets cannot be used together.")
        sys.exit(0)
# NOTE(review): diff artifact - this stray print belongs to the displaced
# output logic of the async getTweets() coroutine.
print(output)
# Entry point of the older threaded implementation (superseded by the asyncio
# entry point further down in this interleaved diff).
if __name__ == '__main__':
agent = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
ap = argparse.ArgumentParser(prog='tweep.py',usage='python %(prog)s [options]',description="tweep.py - An Advanced Twitter Scraping Tool")
ap.add_argument('-u',help="User's tweets you want to scrape.")
ap.add_argument('-s',help='Search for tweets containing this word or phrase.')
ap.add_argument('--year',help='Filter tweets before specified year.')
ap.add_argument('--pics',help='Save pictures.',action='store_true')
ap.add_argument('--fruit',help='Display "low-hanging-fruit" tweets.',action='store_true')
ap.add_argument('--tweets',help='Display tweets only.',action='store_true')
ap.add_argument('--verified',help='Display Tweets only from verified users (Use with -s).',action='store_true')
ap.add_argument('--users',help='Display users only (Use with -s).',action='store_true')
arg = ap.parse_args()
check()
tweep().main()
# NOTE(review): diff artifact - this `return` is the displaced tail of the
# async getTweets() coroutine; at module level it is a SyntaxError.
return feed, Min
async def main():
    """Page through all search results until an empty feed ends the scrape."""
    cursor = -1
    feed = [-1]  # sentinel so the loop body runs at least once
    while feed:
        feed, cursor = await getTweets(cursor)
if __name__ == "__main__":
ap = argparse.ArgumentParser(prog="tweep.py", usage="python3 %(prog)s [options]", description="tweep.py - An Advanced Twitter Scraping Tool")
ap.add_argument("-u", help="User's tweets you want to scrape.")
ap.add_argument("-s", help="Search for tweets containing this word or phrase.")
ap.add_argument("-o", help="Save output to a file.")
ap.add_argument("--year", help="Filter tweets before specified year.")
ap.add_argument("--fruit", help="Display 'low-hanging-fruit' tweets.", action="store_true")
ap.add_argument("--tweets", help="Display tweets only.", action="store_true")
ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).", action="store_true")
ap.add_argument("--users", help="Display users only (Use with -s).", action="store_true")
arg = ap.parse_args()
if arg.u is not None:
if arg.users:
print("[-] Contradicting Args: Please use --users in combination with -s.")
sys.exit(0)
if arg.verified:
print("[-] Contradicting Args: Please use --verified in combination with -s.")
if arg.tweets and arg.users:
print("[-] Contradicting Args: --users and --tweets cannot be used together.")
sys.exit(0)
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment