Commit a54ec5ab authored by Cody Zacharias, committed by GitHub

Merge pull request #41 from pielco11/master

Added elasticsearch fun stuff
parents 155acf70 c847f5e9
......@@ -21,6 +21,7 @@ Some of the benefits of using Tweep vs Twitter API:
- `-s` Search for Tweets containing this word or phrase.
- `-g` Retrieve tweets by geolocation. Format of the argument is lat,lon,range(km or mi).
- `-o` Save output to a file.
- `-es` Index Tweets into Elasticsearch; the argument is the Elasticsearch host (e.g. `localhost:9200`).
- `--year` Filter Tweets before the specified year.
- `--fruit` Display Tweets with "low-hanging-fruit".
- `--tweets` Display Tweets only.
......@@ -53,6 +54,7 @@ A few simple examples to help you understand the basics:
- `python3 tweep.py -u username --fruit` - Show Tweets with low-hanging fruit.
- `python3 tweep.py -s "Donald Trump" --verified --users` - List verified users that Tweet about Donald Trump.
- `python3 tweep.py -g="48.880048,2.385939,1km" -o file.csv --csv` - Scrape Tweets from a radius of 1km around a place in Paris and export them to a csv file.
- `python3 tweep.py -u username -es localhost:9200` - Output Tweets to Elasticsearch.
## Example String
`955511208597184512 2018-01-22 18:43:19 GMT <now> pineapples are the best fruit`
......@@ -61,7 +63,7 @@ A few simple examples to help you understand the basics:
<img src="https://i.imgur.com/RKdBrHr.png" />
## Thanks
Thanks to [@hpiedcoq](https://github.com/hpiedcoq) for contributing several features!
Thanks to [@hpiedcoq](https://github.com/hpiedcoq) & [@pielco11](https://github.com/pielco11) for contributing several features!
## Contact
Shout me out on Twitter: [@now](https://twitter.com/now)
PUT tweep
{
"mappings" : {
"items": {
"properties": {
"tweetid": {"type": "long"},
"datestamp": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
"timezone": {"type": "text"},
"hashtags": {"type": "text"},
"replies": {"type": "integer"},
"retweets": {"type": "integer"},
"likes": {"type": "integer"},
"username": {"type": "keyword"}
}
}
}
,
"settings": {
"number_of_shards": 1
}
}
#!/usr/bin/python3
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
from time import gmtime, strftime
import argparse
import aiohttp
......@@ -7,6 +8,7 @@ import asyncio
import async_timeout
import csv
import datetime
import hashlib
import json
import re
import sys
......@@ -156,7 +158,23 @@ async def outTweet(tweet):
generated list into Tweep. That's why these
modes exist.
'''
if arg.users:
if arg.elasticsearch:
jObject = {
"tweetid": tweetid,
"datestamp": date + " " + time,
"timezone": timezone,
"text": text,
"hashtags": re.findall(r'(?i)\#\w+', text, flags=re.UNICODE),
"replies": replies,
"retweets": retweets,
"likes": likes,
"username": username
}
es = Elasticsearch(arg.elasticsearch)
es.index(index="tweep", doc_type="items", id=tweetid, body=json.dumps(jObject))
output = ""
elif arg.users:
output = username
elif arg.tweets:
output = tweets
......@@ -207,6 +225,9 @@ async def getTweets(init):
copyright = tweet.find("div","StreamItemContent--withheld")
if copyright is None:
count +=1
if arg.elasticsearch:
print(await outTweet(tweet),end=".", flush=True)
else:
print(await outTweet(tweet))
return tweets, init, count
......@@ -225,6 +246,10 @@ async def main():
'''
Putting it all together.
'''
if arg.elasticsearch:
print("Indexing to Elasticsearch @" + str(arg.elasticsearch))
if arg.userid is not None:
arg.u = await getUsername()
......@@ -271,8 +296,9 @@ if __name__ == "__main__":
ap = argparse.ArgumentParser(prog="tweep.py", usage="python3 %(prog)s [options]", description="tweep.py - An Advanced Twitter Scraping Tool")
ap.add_argument("-u", help="User's Tweets you want to scrape.")
ap.add_argument("-s", help="Search for Tweets containing this word or phrase.")
ap.add_argument("-o", help="Save output to a file.")
ap.add_argument("-g", help="Search for geocoded tweets.")
ap.add_argument("-o", help="Save output to a file.")
ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch")
ap.add_argument("--year", help="Filter Tweets before specified year.")
ap.add_argument("--since", help="Filter Tweets sent since date (Example: 2017-12-27).")
ap.add_argument("--fruit", help="Display 'low-hanging-fruit' Tweets.", action="store_true")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment