Commit 9bbf03ea authored by haccer, committed by GitHub

Update tweep.py

parent dc35be9c
#!/usr/bin/env python
from bs4 import BeautifulSoup
from time import gmtime, strftime
from PIL import Image
from io import BytesIO
import argparse
import datetime
import json
import os
import Queue
import re
import requests
import threading
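# Shared work queue: main() fills it with photo tweet URLs and the daemon
# threads started there drain it via fetch_pics().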
q = Queue.Queue()
class tweep:
    def __init__(self):
...@@ -15,6 +22,8 @@ class tweep:
        self.year = arg.year
        self.feed = [-1]
        self.tweets = 0
        self.tweet_urls = []
        self.pic_count = 0
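    # Build the Twitter search URL from the CLI filters (user, search term,
    # year cutoff, and the new pic.twitter.com / "fruit" keywords).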
    def get_url(self):
        url_1 = "https://twitter.com/search?f=tweets&vertical=default&lang=en&q="
...@@ -27,8 +36,12 @@ class tweep:
            url+= "%20{0.search}".format(self)
        if self.year != None:
            url+= "%20until%3A{0.year}-1-1".format(self)
        if arg.pics:
            url+= "%20pic.twitter.com"
        if arg.fruit:
            url+= "%20myspace.com%20OR%20last.fm%20OR"
            url+= "%20mail%20OR%20email%20OR%20gmail%20OR%20e-mail"
            url+= "%20OR%20phone"
        return url
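    # get_feed() is unchanged in this commit; its body (presumably fetching
    # the search-results feed for the URL built above) is collapsed in this hunk.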
    def get_feed(self):
...@@ -66,17 +79,63 @@ class tweep:
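            # Context, likely within get_tweets(): extract the username, local
            # timezone and tweet text from each parsed tweet element.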
            username = tweet.find('span','username').text.encode('utf8').replace('@','')
            timezone = strftime("%Z", gmtime())
            text = tweet.find('p','tweet-text').text.encode('utf8').replace('\n',' ')
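            # New: with --pics, collect the tweet's photo-page URL for later
            # download instead of printing the tweet.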
            if arg.pics:
                tweet_url = "https://twitter.com/{0}/status/{1}/photo/1".format(username,tweetid)
                self.tweet_urls.append(tweet_url)
            else:
                print("{} {} {} {} <{}> {}".format(tweetid, date, time, timezone, username, text))
    def save_pic(self,picture):
        if not os.path.exists('tweep_img'):
            os.makedirs('tweep_img')
        if not os.path.exists('tweep_img/{0.author}'.format(self)):
            os.makedirs('tweep_img/{0.author}'.format(self))
        filename = picture[len('https://pbs.twimg.com/media/'):]
        save_dir = 'tweep_img/{0.author}'.format(self)
        if not os.path.isfile('{}/{}'.format(save_dir,filename)):
            r = requests.get(picture,headers=agent)
            i = Image.open(BytesIO(r.content))
            i.save(os.path.join(save_dir, filename))
            print(" Downloading: {}".format(filename))
            self.pic_count += 1
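    # Fetch the tweet's photo page and pull the image URL out of the
    # AdaptiveMedia photo container, if one is present.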
    def get_pics(self,tweet_url):
        r = requests.get(tweet_url,headers=agent)
        soup = BeautifulSoup(r.text,"lxml")
        picture = soup.find('div','AdaptiveMedia-photoContainer js-adaptive-photo ')
        if picture is not None:
            picture = picture['data-image-url'].replace(' ','')
            self.save_pic(picture)
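    # Worker loop run in each daemon thread: take a tweet URL off the queue,
    # download its photo, and mark the task done.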
    def fetch_pics(self):
        while True:
            tweet_url = q.get()
            self.get_pics(tweet_url)
            q.task_done()
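    # Drive the scrape: page through results, then (with --pics) hand the
    # collected photo URLs to a pool of downloader threads.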
    def main(self):
        if arg.pics:
            print("[+] Searching Tweets For Photos.")
        while True if (self.tweets < float('inf')) and len(self.feed)>0 else False:
            self.get_tweets()
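        # New: spin up 20 daemon threads, feed them the collected URLs through
        # the shared queue, and wait for the downloads to finish.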
        if arg.pics:
            total = len(self.tweet_urls) - 1
            print("[+] {} pictures found. Collecting Pictures.".format(total))
            for i in range(20):
                t = threading.Thread(target=self.fetch_pics)
                t.daemon = True
                t.start()
            for tweet_url in self.tweet_urls:
                q.put(tweet_url)
            q.join()
            print("[+] Done. {t.pic_count} pictures saved from {t.author}.".format(t=self))
if __name__ == '__main__':
    ap = argparse.ArgumentParser(prog='tweep.py',usage='python %(prog)s [options]',description="tweep.py - An Advanced Twitter Scraping Tool")
    ap.add_argument('-u',help="User's tweets you want to scrape.")
    ap.add_argument('-s',help='Search for tweets containing this word or phrase.')
    ap.add_argument('--year',help='Filter tweets before specified year.')
    ap.add_argument('--pics',help='Save pictures.',action='store_true')
    ap.add_argument('--fruit',help='Display "low-hanging-fruit" tweets.',action='store_true')
    arg = ap.parse_args()
    agent = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
...