Commit e89ae5fa authored by pielco11

Added elasticsearch fun stuff

parent 41967ed6
PUT tweep
{
  "mappings": {
    "items": {
      "properties": {
        "tweetid":   {"type": "long"},
        "datestamp": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
        "timezone":  {"type": "text"},
        "hashtags":  {"type": "text"},
        "replies":   {"type": "integer"},
        "retweets":  {"type": "integer"},
        "likes":     {"type": "integer"},
        "username":  {"type": "keyword"}
      }
    }
  },
  "settings": {
    "number_of_shards": 1
  }
}
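The same index can also be created from Python. A minimal sketch using the elasticsearch-py client that the diff below imports; the host URL is a placeholder, and the body simply mirrors the mapping request above:

from elasticsearch import Elasticsearch

mapping = {
    "mappings": {
        "items": {
            "properties": {
                "tweetid":   {"type": "long"},
                "datestamp": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
                "timezone":  {"type": "text"},
                "hashtags":  {"type": "text"},
                "replies":   {"type": "integer"},
                "retweets":  {"type": "integer"},
                "likes":     {"type": "integer"},
                "username":  {"type": "keyword"}
            }
        }
    },
    "settings": {"number_of_shards": 1}
}

es = Elasticsearch("localhost:9200")  # placeholder host
if not es.indices.exists(index="tweep"):
    es.indices.create(index="tweep", body=mapping)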
@@ -10,11 +10,14 @@ import datetime
 import json
 import re
 import sys
+import hashlib
+from elasticsearch import Elasticsearch
+
 
 async def getUrl(init):
     '''
     URL Decision:
     Tweep utilizes positions of Tweets from Twitter's search feature to
     iterate through a user's Twitter feed. This section decides whether
     this is the first URL request or not and develops the URL based on the
     args given.
@@ -99,7 +102,7 @@ async def getFeed(init):
         if init == -1:
             feed, init = await initial(response)
         else:
             feed, init = await cont(response)
     except:
         # Tweep will realize that it's done scraping.
         pass
@@ -109,7 +112,7 @@ async def getFeed(init):
 
 async def outTweet(tweet):
     '''
     Parsing Section:
     This function will create the desired output string and
     write it to a file or csv if specified.
     Returns output.
@@ -147,6 +150,18 @@ async def outTweet(tweet):
             text = "{} {}".format(mention, text)
     except:
         pass
+
+    jObject = {
+        "tweetid": tweetid,
+        "datestamp": date + " " + time,
+        "timezone": timezone,
+        "text": text,
+        "hashtags": re.findall(r'(?i)\#\w+', text, flags=re.UNICODE),
+        "replies": replies,
+        "retweets": retweets,
+        "likes": likes,
+        "username": username
+    }
 
     # Preparing to output
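The hashtags field is rebuilt with a plain re.findall over the tweet text. A quick sanity check with a made-up tweet (the sample string is hypothetical); note that text itself is not declared in the mapping above, so Elasticsearch will add it through dynamic mapping when the first document is indexed:

import re

text = "Scraping with #Tweep and indexing to #elasticsearch"
print(re.findall(r'(?i)\#\w+', text, flags=re.UNICODE))
# ['#Tweep', '#elasticsearch']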
@@ -156,38 +171,44 @@ async def outTweet(tweet):
     generated list into Tweep. That's why these
     modes exist.
     '''
-    if arg.users:
-        output = username
-    elif arg.tweets:
-        output = tweets
-    else:
-        '''
-        The standard output is how I like it, although
-        this can be modified to your desire. Uncomment
-        the bottom line and add in the variables in the
-        order you want them or how you want it to look.
-        '''
-        # output = ""
-        output = "{} {} {} {} <{}> {}".format(tweetid, date, time, timezone, username, text)
-        if arg.hashtags:
-            output+= " {}".format(hashtags)
-        if arg.stats:
-            output+= " | {} replies {} retweets {} likes".format(replies, retweets, likes)
-        # Output section
-        if arg.o != None:
-            if arg.csv:
-                # Write all variables scraped to CSV
-                dat = [tweetid, date, time, timezone, username, text, replies, retweets, likes, hashtags]
-                with open(arg.o, "a", newline='') as csv_file:
-                    writer = csv.writer(csv_file, delimiter="|")
-                    writer.writerow(dat)
-            else:
-                # Writes or appends to a file.
-                print(output, file=open(arg.o, "a"))
+    if arg.elasticsearch:
+        es = Elasticsearch(arg.elasticsearch)
+        es.index(index="tweep", doc_type="items", id=tweetid, body=json.dumps(jObject))
+        return ""
+    else:
+        if arg.users:
+            output = username
+        elif arg.tweets:
+            output = tweets
+        else:
+            '''
+            The standard output is how I like it, although
+            this can be modified to your desire. Uncomment
+            the bottom line and add in the variables in the
+            order you want them or how you want it to look.
+            '''
+            # output = ""
+            output = "{} {} {} {} <{}> {}".format(tweetid, date, time, timezone, username, text)
+            if arg.hashtags:
+                output+= " {}".format(hashtags)
+            if arg.stats:
+                output+= " | {} replies {} retweets {} likes".format(replies, retweets, likes)
+            # Output section
+            if arg.o != None:
+                if arg.csv:
+                    # Write all variables scraped to CSV
+                    dat = [tweetid, date, time, timezone, username, text, replies, retweets, likes, hashtags]
+                    with open(arg.o, "a", newline='') as csv_file:
+                        writer = csv.writer(csv_file, delimiter="|")
+                        writer.writerow(dat)
+                else:
+                    # Writes or appends to a file.
+                    print(output, file=open(arg.o, "a"))
     return output
 
 async def getTweets(init):
     '''
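Two notes on the branch above. Because the document id is the tweet id, re-running a scrape overwrites existing documents rather than duplicating them. Also, the commit constructs a new Elasticsearch client for every tweet; a sketch of one alternative (not in the commit) that lazily reuses a single client:

from elasticsearch import Elasticsearch

_es = None  # module-level client, created once

def get_es(hosts):
    # Hypothetical helper, not part of the commit: create the client
    # on first use and reuse it for every subsequent tweet.
    global _es
    if _es is None:
        _es = Elasticsearch(hosts)
    return _es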
@@ -207,7 +228,10 @@ async def getTweets(init):
         copyright = tweet.find("div","StreamItemContent--withheld")
         if copyright is None:
             count +=1
-            print(await outTweet(tweet))
+            if arg.elasticsearch:
+                print(await outTweet(tweet), end=".", flush=True)
+            else:
+                print(await outTweet(tweet))
     return tweets, init, count
@@ -254,7 +278,7 @@ def Error(error, message):
     sys.exit(0)
 
 def check():
     # Performs main argument checks so nothing unintended happens.
     if arg.u is not None:
         if arg.users:
             Error("Contradicting Args", "Please use --users in combination with -s.")
@@ -285,9 +309,12 @@ if __name__ == "__main__":
     ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")
     ap.add_argument("--count", help="Display number Tweets scraped at the end of session.", action="store_true")
     ap.add_argument("--stats", help="Show number of replies, retweets, and likes", action="store_true")
+    ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch")
     arg = ap.parse_args()
     check()
+    if arg.elasticsearch:
+        print("Indexing to Elasticsearch @" + str(arg.elasticsearch))
     loop = asyncio.get_event_loop()
     loop.run_until_complete(main())
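With the new flag, a run might look like python tweep.py -u someuser -es localhost:9200 (the script name and host are assumptions, not shown in this diff); in Elasticsearch mode each tweet prints a dot as a progress marker instead of the full output line. A sketch for checking that documents arrived, assuming the index and mapping above:

from elasticsearch import Elasticsearch

es = Elasticsearch("localhost:9200")  # placeholder host
res = es.search(index="tweep", body={
    "query": {"term": {"username": "someuser"}},  # keyword field in the mapping
    "sort": [{"datestamp": {"order": "desc"}}],
    "size": 5
})
for hit in res["hits"]["hits"]:
    doc = hit["_source"]
    print(doc["datestamp"], doc["username"], doc["likes"])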