Commit 9bbf03ea authored by haccer, committed by GitHub

Update tweep.py

parent dc35be9c
#!/usr/bin/env python
from bs4 import BeautifulSoup
from time import gmtime, strftime
from PIL import Image
from io import BytesIO
import argparse
import datetime
import json
import os
import Queue
import re
import requests
import threading
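# Shared work queue: main() fills it with photo tweet URLs and the daemon
# threads started there drain it via fetch_pics().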
q = Queue.Queue()
class tweep:
    def __init__(self):
...@@ -15,6 +22,8 @@ class tweep:
        self.year = arg.year
        self.feed = [-1]
        self.tweets = 0
        self.tweet_urls = []
        self.pic_count = 0
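    # Build the Twitter search URL from the CLI filters (user, search term,
    # year cutoff, and the new pic.twitter.com / "fruit" keywords).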
    def get_url(self):
        url_1 = "https://twitter.com/search?f=tweets&vertical=default&lang=en&q="
...@@ -27,8 +36,12 @@ class tweep:
            url+= "%20{0.search}".format(self)
        if self.year != None:
            url+= "%20until%3A{0.year}-1-1".format(self)
        if arg.pics:
            url+= "%20pic.twitter.com"
        if arg.fruit:
            url+= "%20myspace.com%20OR%20last.fm%20OR"
            url+= "%20mail%20OR%20email%20OR%20gmail%20OR%20e-mail"
            url+= "%20OR%20phone"
        return url
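    # get_feed() is unchanged in this commit; its body (presumably fetching
    # the search-results feed for the URL built above) is collapsed in this hunk.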
    def get_feed(self):
...@@ -66,17 +79,63 @@ class tweep:
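            # Context, likely within get_tweets(): extract the username, local
            # timezone and tweet text from each parsed tweet element.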
            username = tweet.find('span','username').text.encode('utf8').replace('@','')
            timezone = strftime("%Z", gmtime())
            text = tweet.find('p','tweet-text').text.encode('utf8').replace('\n',' ')
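            # New: with --pics, collect the tweet's photo-page URL for later
            # download instead of printing the tweet.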
            if arg.pics:
                tweet_url = "https://twitter.com/{0}/status/{1}/photo/1".format(username,tweetid)
                self.tweet_urls.append(tweet_url)
            else:
                print("{} {} {} {} <{}> {}".format(tweetid, date, time, timezone, username, text))
    def save_pic(self,picture):
        if not os.path.exists('tweep_img'):
            os.makedirs('tweep_img')
        if not os.path.exists('tweep_img/{0.author}'.format(self)):
            os.makedirs('tweep_img/{0.author}'.format(self))
        filename = picture[len('https://pbs.twimg.com/media/'):]
        save_dir = 'tweep_img/{0.author}'.format(self)
        if not os.path.isfile('{}/{}'.format(save_dir,filename)):
            r = requests.get(picture,headers=agent)
            i = Image.open(BytesIO(r.content))
            i.save(os.path.join(save_dir, filename))
            print(" Downloading: {}".format(filename))
            self.pic_count += 1
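    # Fetch the tweet's photo page and pull the image URL out of the
    # AdaptiveMedia photo container, if one is present.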
    def get_pics(self,tweet_url):
        r = requests.get(tweet_url,headers=agent)
        soup = BeautifulSoup(r.text,"lxml")
        picture = soup.find('div','AdaptiveMedia-photoContainer js-adaptive-photo ')
        if picture is not None:
            picture = picture['data-image-url'].replace(' ','')
            self.save_pic(picture)
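    # Worker loop run in each daemon thread: take a tweet URL off the queue,
    # download its photo, and mark the task done.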
    def fetch_pics(self):
        while True:
            tweet_url = q.get()
            self.get_pics(tweet_url)
            q.task_done()
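    # Drive the scrape: page through results, then (with --pics) hand the
    # collected photo URLs to a pool of downloader threads.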
    def main(self):
        if arg.pics:
            print("[+] Searching Tweets For Photos.")
        while True if (self.tweets < float('inf')) and len(self.feed)>0 else False:
            self.get_tweets()
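        # New: spin up 20 daemon threads, feed them the collected URLs through
        # the shared queue, and wait for the downloads to finish.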
        if arg.pics:
            total = len(self.tweet_urls) - 1
            print("[+] {} pictures found. Collecting Pictures.".format(total))
            for i in range(20):
                t = threading.Thread(target=self.fetch_pics)
                t.daemon = True
                t.start()
            for tweet_url in self.tweet_urls:
                q.put(tweet_url)
            q.join()
            print("[+] Done. {t.pic_count} pictures saved from {t.author}.".format(t=self))
if __name__ == '__main__':
    ap = argparse.ArgumentParser(prog='tweep.py',usage='python %(prog)s [options]',description="tweep.py - An Advanced Twitter Scraping Tool")
    ap.add_argument('-u',help="User's tweets you want to scrape.")
    ap.add_argument('-s',help='Search for tweets containing this word or phrase.')
    ap.add_argument('--year',help='Filter tweets before specified year.')
    ap.add_argument('--pics',help='Save pictures.',action='store_true')
    ap.add_argument('--fruit',help='Display "low-hanging-fruit" tweets.',action='store_true')
    arg = ap.parse_args()
    agent = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
...