Commit a54ec5ab authored by Cody Zacharias, committed by GitHub

Merge pull request #41 from pielco11/master

Added elasticsearch fun stuff
parents 155acf70 c847f5e9
@@ -21,6 +21,7 @@ Some of the benefits of using Tweep vs Twitter API:
- `-s` Search for Tweets containing this word or phrase.
- `-g` Retrieve Tweets by geolocation. The argument format is lat,lon,range (km or mi).
- `-o` Save output to a file.
- `-es` Output to Elasticsearch.
- `--year` Filter Tweets before the specified year.
- `--fruit` Display Tweets with "low-hanging-fruit".
- `--tweets` Display Tweets only.
@@ -53,6 +54,7 @@ A few simple examples to help you understand the basics:
- `python3 tweep.py -u username --fruit` - Show Tweets with low-hanging fruit.
- `python3 tweep.py -s "Donald Trump" --verified --users` - List verified users that Tweet about Donald Trump.
- `python3 tweep.py -g="48.880048,2.385939,1km" -o file.csv --csv` - Scrape Tweets from a radius of 1km around a place in Paris and export them to a csv file.
- `python3 tweep.py -u username -es localhost:9200` - Output Tweets to Elasticsearch.
## Example String
`955511208597184512 2018-01-22 18:43:19 GMT <now> pineapples are the best fruit`
@@ -61,7 +63,7 @@ A few simple examples to help you understand the basics:
<img src="https://i.imgur.com/RKdBrHr.png" />
## Thanks
Thanks to [@hpiedcoq](https://github.com/hpiedcoq) & [@pielco11](https://github.com/pielco11) for contributing several features!
## Contact
Shout me out on Twitter: [@now](https://twitter.com/now)
PUT tweep
{
  "mappings": {
    "items": {
      "properties": {
        "tweetid": {"type": "long"},
        "datestamp": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
        "timezone": {"type": "text"},
        "hashtags": {"type": "text"},
        "replies": {"type": "integer"},
        "retweets": {"type": "integer"},
        "likes": {"type": "integer"},
        "username": {"type": "keyword"}
      }
    }
  },
  "settings": {
    "number_of_shards": 1
  }
}
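The `PUT tweep` request above is written in Kibana console syntax. For anyone applying it from Python instead, here is a minimal sketch using the same `elasticsearch` client the script imports; the `localhost:9200` host is an assumption, and the `mapping` dict simply mirrors the request body above:

```python
from elasticsearch import Elasticsearch

# Assumed local node; point this at your own cluster.
es = Elasticsearch("localhost:9200")

# Same body as the PUT request above.
mapping = {
    "mappings": {
        "items": {
            "properties": {
                "tweetid": {"type": "long"},
                "datestamp": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
                "timezone": {"type": "text"},
                "hashtags": {"type": "text"},
                "replies": {"type": "integer"},
                "retweets": {"type": "integer"},
                "likes": {"type": "integer"},
                "username": {"type": "keyword"},
            }
        }
    },
    "settings": {"number_of_shards": 1},
}

# indices.create issues the same PUT /tweep call.
es.indices.create(index="tweep", body=mapping)
```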
#!/usr/bin/python3
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
from time import gmtime, strftime
import argparse
import aiohttp
@@ -7,6 +8,7 @@ import asyncio
import async_timeout
import csv
import datetime
import hashlib
import json
import re
import sys
@@ -14,7 +16,7 @@ import sys
async def getUrl(init):
    '''
    URL Decision:
    Tweep utilizes positions of Tweets from Twitter's search feature to
    iterate through a user's Twitter feed. This section decides whether
    this is the first URL request or not and develops the URL based on the
    args given.
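To make the position-based paging the docstring describes concrete, here is a rough illustrative sketch of the loop; the field names (`items_html`, `min_position`) and the `fetch_page` helper are assumptions for illustration, not the script's exact code:

```python
# Illustrative only: each search-timeline page carries a position token;
# feeding it back into the next request advances through the feed.
async def paginate(fetch_page):
    position = None  # the first request carries no position
    while True:
        page = await fetch_page(position)  # hypothetical fetcher
        if not page["items_html"].strip():
            break  # empty page: Tweep is done scraping
        yield page["items_html"]
        position = page["min_position"]  # token for the next page
```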
@@ -99,7 +101,7 @@ async def getFeed(init):
        if init == -1:
            feed, init = await initial(response)
        else:
            feed, init = await cont(response)
    except:
        # Tweep will realize that it's done scraping.
        pass
@@ -109,7 +111,7 @@ async def getFeed(init):
async def outTweet(tweet):
    '''
    Parsing Section:
    This function will create the desired output string and
    write it to a file or csv if specified.
    Returns output.
@@ -147,7 +149,7 @@ async def outTweet(tweet):
        text = "{} {}".format(mention, text)
    except:
        pass

    # Preparing to output
    '''
@@ -156,7 +158,23 @@ async def outTweet(tweet):
    generated list into Tweep. That's why these
    modes exist.
    '''
    if arg.elasticsearch:
        # Build the document and index it into the "tweep" index.
        jObject = {
            "tweetid": tweetid,
            "datestamp": date + " " + time,
            "timezone": timezone,
            "text": text,
            "hashtags": re.findall(r'(?i)\#\w+', text, flags=re.UNICODE),
            "replies": replies,
            "retweets": retweets,
            "likes": likes,
            "username": username
        }
        es = Elasticsearch(arg.elasticsearch)
        es.index(index="tweep", doc_type="items", id=tweetid, body=json.dumps(jObject))
        output = ""
    elif arg.users:
        output = username
    elif arg.tweets:
        output = tweets
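One caveat about the Elasticsearch branch above: it constructs a new `Elasticsearch` client and issues one `index` call per Tweet. A minimal sketch of a batched alternative using the library's `helpers.bulk`, assuming the same document shape as `jObject`; the `collect`/`flush` names and the batch size of 500 are hypothetical, not part of this PR:

```python
from elasticsearch import Elasticsearch, helpers

# Assumed host; reuse a single client instead of one per Tweet.
es = Elasticsearch("localhost:9200")
buffer = []

def collect(doc):
    # Queue one parsed Tweet (same fields as jObject above).
    buffer.append({
        "_index": "tweep",
        "_type": "items",
        "_id": doc["tweetid"],
        "_source": doc,
    })
    if len(buffer) >= 500:  # hypothetical batch size
        flush()

def flush():
    # Ship every queued document in a single bulk request.
    if buffer:
        helpers.bulk(es, buffer)
        buffer.clear()
```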
@@ -174,7 +192,7 @@ async def outTweet(tweet):
    if arg.stats:
        output += " | {} replies {} retweets {} likes".format(replies, retweets, likes)

    # Output section
    if arg.o != None:
        if arg.csv:
@@ -207,7 +225,10 @@ async def getTweets(init):
        copyright = tweet.find("div", "StreamItemContent--withheld")
        if copyright is None:
            count += 1
            if arg.elasticsearch:
                # Progress indicator: one dot per indexed Tweet.
                print(await outTweet(tweet), end=".", flush=True)
            else:
                print(await outTweet(tweet))
    return tweets, init, count
@@ -225,6 +246,10 @@ async def main():
    '''
    Putting it all together.
    '''
    if arg.elasticsearch:
        print("Indexing to Elasticsearch @" + str(arg.elasticsearch))
    if arg.userid is not None:
        arg.u = await getUsername()
@@ -254,7 +279,7 @@ def Error(error, message):
    sys.exit(0)

def check():
    # Performs main argument checks so nothing unintended happens.
    if arg.u is not None:
        if arg.users:
            Error("Contradicting Args", "Please use --users in combination with -s.")
@@ -271,8 +296,9 @@ if __name__ == "__main__":
    ap = argparse.ArgumentParser(prog="tweep.py", usage="python3 %(prog)s [options]", description="tweep.py - An Advanced Twitter Scraping Tool")
    ap.add_argument("-u", help="User's Tweets you want to scrape.")
    ap.add_argument("-s", help="Search for Tweets containing this word or phrase.")
ap.add_argument("-o", help="Save output to a file.")
ap.add_argument("-g", help="Search for geocoded tweets.") ap.add_argument("-g", help="Search for geocoded tweets.")
ap.add_argument("-o", help="Save output to a file.")
ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch")
ap.add_argument("--year", help="Filter Tweets before specified year.") ap.add_argument("--year", help="Filter Tweets before specified year.")
ap.add_argument("--since", help="Filter Tweets sent since date (Example: 2017-12-27).") ap.add_argument("--since", help="Filter Tweets sent since date (Example: 2017-12-27).")
ap.add_argument("--fruit", help="Display 'low-hanging-fruit' Tweets.", action="store_true") ap.add_argument("--fruit", help="Display 'low-hanging-fruit' Tweets.", action="store_true")
...