Commit e89ae5fa authored by pielco11

Added elasticsearch fun stuff

parent 41967ed6
PUT tweep
{
  "mappings": {
    "items": {
      "properties": {
        "tweetid": {"type": "long"},
        "datestamp": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
        "timezone": {"type": "text"},
        "text": {"type": "text"},
        "hashtags": {"type": "text"},
        "replies": {"type": "integer"},
        "retweets": {"type": "integer"},
        "likes": {"type": "integer"},
        "username": {"type": "keyword"}
      }
    }
  },
  "settings": {
    "number_of_shards": 1
  }
}
...@@ -10,6 +10,9 @@ import datetime ...@@ -10,6 +10,9 @@ import datetime
import json import json
import re import re
import sys import sys
import hashlib
from elasticsearch import Elasticsearch
async def getUrl(init): async def getUrl(init):
''' '''
...@@ -148,6 +151,18 @@ async def outTweet(tweet): ...@@ -148,6 +151,18 @@ async def outTweet(tweet):
except: except:
pass pass
jObject = {
"tweetid": tweetid,
"datestamp": date + " " + time,
"timezone": timezone,
"text": text,
"hashtags": re.findall(r'(?i)\#\w+', text, flags=re.UNICODE),
"replies": replies,
"retweets": retweets,
"likes": likes,
"username": username
}
# Preparing to output # Preparing to output
''' '''
...@@ -156,6 +171,12 @@ async def outTweet(tweet): ...@@ -156,6 +171,12 @@ async def outTweet(tweet):
generated list into Tweep. That's why these generated list into Tweep. That's why these
modes exist. modes exist.
''' '''
if arg.elasticsearch:
es = Elasticsearch(arg.elasticsearch)
es.index(index="tweep", doc_type="items", id=tweetid, body=json.dumps(jObject))
return ""
else:
if arg.users: if arg.users:
output = username output = username
elif arg.tweets: elif arg.tweets:
...@@ -207,6 +228,9 @@ async def getTweets(init): ...@@ -207,6 +228,9 @@ async def getTweets(init):
copyright = tweet.find("div","StreamItemContent--withheld") copyright = tweet.find("div","StreamItemContent--withheld")
if copyright is None: if copyright is None:
count +=1 count +=1
if arg.elasticsearch:
print(await outTweet(tweet),end=".", flush=True)
else:
print(await outTweet(tweet)) print(await outTweet(tweet))
return tweets, init, count return tweets, init, count
...@@ -285,9 +309,12 @@ if __name__ == "__main__": ...@@ -285,9 +309,12 @@ if __name__ == "__main__":
ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).") ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")
ap.add_argument("--count", help="Display number Tweets scraped at the end of session.", action="store_true") ap.add_argument("--count", help="Display number Tweets scraped at the end of session.", action="store_true")
ap.add_argument("--stats", help="Show number of replies, retweets, and likes", action="store_true") ap.add_argument("--stats", help="Show number of replies, retweets, and likes", action="store_true")
ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch")
arg = ap.parse_args() arg = ap.parse_args()
check() check()
if arg.elasticsearch:
print("Indexing to Elasticsearch @" + str(arg.elasticsearch))
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
loop.run_until_complete(main()) loop.run_until_complete(main())
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment