Merge pull request #41 from pielco11/master

Added elasticsearch fun stuff

Merge pull request #41 from pielco11/master
Added elasticsearch fun stuff
a54ec5ab · Cody Zacharias · GitHub · 155acf70 · c847f5e9 · a54ec5ab
Commit a54ec5ab authored Mar 15, 2018 by Cody Zacharias Committed by GitHub Mar 15, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 60 additions and 10 deletions

README.md README.md +3 -1

index.json index.json +21 -0

requirements.txt requirements.txt +1 -0

tweep.py tweep.py +35 -9

No files found.
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ Some of the benefits of using Tweep vs Twitter API:
 - `-s` Search for Tweets containing this word or phrase.
 - `-g` Retrieve tweets by geolocation. Format of the argument is lat,lon,range(km or mi).
 - `-o` Save output to a file.
+- `-es` Output to Elasticsearch.
 - `--year` Filter Tweets before the specified year. 
 - `--fruit` Display Tweets with "low-hanging-fruit".
 - `--tweets` Display Tweets only.
@@ -53,6 +54,7 @@ A few simple examples to help you understand the basics:
 - `python3 tweep.py -u username --fruit` - Show Tweets with low-hanging fruit.
 - `python3 tweep.py -s "Donald Trump" --verified --users` - List verified users that Tweet about Donald Trump.
 - `python3 tweep.py -g="48.880048,2.385939,1km" -o file.csv --csv` - Scrape Tweets from a radius of 1km around a place in Paris and export them to a csv file.
+- `python3 tweep.py -u username -es localhost:9200` - Output Tweets to Elasticsearch.

 ## Example String
 `955511208597184512 2018-01-22 18:43:19 GMT <now> pineapples are the best fruit`
@@ -61,7 +63,7 @@ A few simple examples to help you understand the basics:
 <img src="https://i.imgur.com/RKdBrHr.png" />

 ## Thanks
-Thanks to [@hpiedcoq](https://github.com/hpiedcoq) for contributing several features!
+Thanks to [@hpiedcoq](https://github.com/hpiedcoq) & [@pielco11](https://github.com/pielco11) for contributing several features!

 ## Contact
 Shout me out on Twitter: [@now](https://twitter.com/now)
--- a/index.json
+++ b/index.json
+PUT tweep
+{
+  "mappings" : {
+    "items": {
+      "properties": { 
+        "tweetid": {"type": "long"},
+        "datestamp": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
+        "timezone": {"type": "text"},
+        "hashtags": {"type": "text"},
+        "replies": {"type": "integer"},
+        "retweets": {"type": "integer"},
+        "likes": {"type": "integer"},
+        "username": {"type": "keyword"}
+      }
+    }
+  }
+  ,
+  "settings": {
+    "number_of_shards": 1
+  }
+}
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ aiohttp
 aiodns
 beautifulsoup4
 cchardet
+elasticsearch
--- a/tweep.py
+++ b/tweep.py
 #!/usr/bin/python3
 from bs4 import BeautifulSoup
+from elasticsearch import Elasticsearch
 from time import gmtime, strftime
 import argparse
 import aiohttp
@@ -7,6 +8,7 @@ import asyncio
 import async_timeout
 import csv
 import datetime
+import hashlib
 import json
 import re
 import sys
@@ -14,7 +16,7 @@ import sys
 async def getUrl(init):
    '''
    URL Decision:
-    Tweep utilizes positions of Tweet's from Twitter's search feature to 
+    Tweep utilizes positions of Tweet's from Twitter's search feature to
    iterate through a user's Twitter feed. This section decides whether
    this is the first URL request or not and develops the URL based on the
    args given.
@@ -99,7 +101,7 @@ async def getFeed(init):
        if init == -1:
            feed, init = await initial(response)
        else:
-            feed, init = await cont(response) 
+            feed, init = await cont(response)
    except:
        # Tweep will realize that it's done scraping.
        pass
@@ -109,7 +111,7 @@ async def getFeed(init):
 async def outTweet(tweet):
    '''
    Parsing Section:
-    This function will create the desired output string and 
+    This function will create the desired output string and
    write it to a file or csv if specified.

    Returns output.
@@ -147,7 +149,7 @@ async def outTweet(tweet):
                text = "{} {}".format(mention, text)
    except:
        pass
-
+    
    # Preparing to output

    '''
@@ -156,7 +158,23 @@ async def outTweet(tweet):
    generated list into Tweep. That's why these
    modes exist.
    '''
-    if arg.users:
+    if arg.elasticsearch:
+        jObject = {
+            "tweetid": tweetid,
+            "datestamp": date + " " + time,
+            "timezone": timezone,
+            "text": text,
+            "hashtags": re.findall(r'(?i)\#\w+', text, flags=re.UNICODE),
+            "replies": replies,
+            "retweets": retweets,
+            "likes": likes,
+            "username": username
+        }
+        
+        es = Elasticsearch(arg.elasticsearch)
+        es.index(index="tweep", doc_type="items", id=tweetid, body=json.dumps(jObject))
+        output = ""
+    elif arg.users:
        output = username
    elif arg.tweets:
        output = tweets
@@ -174,7 +192,7 @@ async def outTweet(tweet):
        if arg.stats:
            output+= " | {} replies {} retweets {} likes".format(replies, retweets, likes)

-    # Output section
+        # Output section

    if arg.o != None:
        if arg.csv:
@@ -207,7 +225,10 @@ async def getTweets(init):
        copyright = tweet.find("div","StreamItemContent--withheld")
        if copyright is None:
            count +=1
-            print(await outTweet(tweet))
+            if arg.elasticsearch:
+                print(await outTweet(tweet),end=".", flush=True)
+            else:
+                print(await outTweet(tweet))

    return tweets, init, count

@@ -225,6 +246,10 @@ async def main():
    '''
    Putting it all together.
    '''
+
+    if arg.elasticsearch:
+        print("Indexing to Elasticsearch @" + str(arg.elasticsearch))
+
    if arg.userid is not None:
        arg.u = await getUsername()

@@ -254,7 +279,7 @@ def Error(error, message):
    sys.exit(0)

 def check():
-    # Performs main argument checks so nothing unintended happens. 
+    # Performs main argument checks so nothing unintended happens.
    if arg.u is not None:
        if arg.users:
            Error("Contradicting Args", "Please use --users in combination with -s.")
@@ -271,8 +296,9 @@ if __name__ == "__main__":
    ap = argparse.ArgumentParser(prog="tweep.py", usage="python3 %(prog)s [options]", description="tweep.py - An Advanced Twitter Scraping Tool")
    ap.add_argument("-u", help="User's Tweets you want to scrape.")
    ap.add_argument("-s", help="Search for Tweets containing this word or phrase.")
-    ap.add_argument("-o", help="Save output to a file.")
    ap.add_argument("-g", help="Search for geocoded tweets.")
+    ap.add_argument("-o", help="Save output to a file.")
+    ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch")
    ap.add_argument("--year", help="Filter Tweets before specified year.")
    ap.add_argument("--since", help="Filter Tweets sent since date (Example: 2017-12-27).")
    ap.add_argument("--fruit", help="Display 'low-hanging-fruit' Tweets.", action="store_true")