Commit 80a8f6bc authored by Cody Zacharias, committed by GitHub

Merge pull request #45 from pielco11/master

Added database storing option
parents 445ef7bb dc70c89b
...
@@ -36,6 +36,18 @@ Command|Usage
`--count`|Display the number of Tweets scraped at the end of the session.
`--stats`|Show the number of replies, retweets, and likes.
## Elasticsearch Setup
1. Go [here](https://www.elastic.co/downloads), download `Elasticsearch` and `Kibana`, and install both (do this once);
2. Run `Elasticsearch` and then `Kibana`; in the Kibana output you should see "[info][status][plugin:elasticsearch@6.2.2] Status changed from yellow to green - Ready";
3. Go to `http://localhost:5601`, open `Dev Tools`, copy and paste the contents of `index.json`, and select the **green arrow** (do this once);
4. Index some data: `python3.6 tweep.py --elasticsearch localhost:9200 -u whatsoever`;
5. Back in Kibana's interface, go to `Management`, `Index Pattern`, `Create Index Pattern`, type `tweep`, and choose `datestamp` as the Time filter (do this once);
6. Go back to `Management`, `Saved Objects`, and import `dashboard.json` and then `visualization.json` (do this once);
7. Have fun.

If something goes wrong, a quick sanity check like the sketch below can confirm the setup; if problems persist, don't hesitate to write to the maintainer [@pielco11](https://github.com/pielco11) or open an issue.
Feel free to edit the dashboard, and don't hesitate to share it if you want.
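For a quick sanity check before step 4, something like the following should confirm that the setup is in place (a minimal sketch using the official `elasticsearch` Python client, which Tweep already uses; it assumes Elasticsearch is listening on `localhost:9200` and that you created the `tweep` index from `index.json`):

```python
# Sanity check: is Elasticsearch reachable, and does the "tweep" index exist?
from elasticsearch import Elasticsearch

es = Elasticsearch(["localhost:9200"])  # default local address (assumption)
if not es.ping():
    raise SystemExit("Elasticsearch is not reachable on localhost:9200")
if not es.indices.exists(index="tweep"):
    raise SystemExit("No 'tweep' index yet -- paste index.json into Dev Tools first")
print("Elasticsearch is up and the 'tweep' index exists.")
```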
## Low-Hanging Fruit
The `--fruit` feature will display Tweets that *might* contain sensitive info such as:
- Profiles from leaked databases (Myspace or LastFM)
...
...
@@ -7,10 +7,12 @@ PUT tweep
       "datestamp": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
       "timezone": {"type": "text"},
       "hashtags": {"type": "text"},
-      "replies": {"type": "integer"},
-      "retweets": {"type": "integer"},
-      "likes": {"type": "integer"},
-      "username": {"type": "keyword"}
+      "replies": {"type": "boolean"},
+      "retweets": {"type": "boolean"},
+      "likes": {"type": "boolean"},
+      "username": {"type": "keyword"},
+      "day": {"type": "keyword"},
+      "hour": {"type": "keyword"}
     }
   }
 }
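Note the type change above: `replies`, `retweets`, and `likes` switch from `integer` to `boolean` because the new indexing code in `tweep.py` (below) stores one document per engagement event rather than per-tweet totals, so engagement counts in Kibana become plain document counts. A minimal illustration of querying such a total (same assumptions as the sketch above):

```python
# Count total likes by counting the documents flagged likes=true.
from elasticsearch import Elasticsearch

es = Elasticsearch(["localhost:9200"])  # default local address (assumption)
resp = es.count(index="tweep", body={"query": {"term": {"likes": True}}})
print("Total likes indexed:", resp["count"])
```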
...
#!/usr/bin/python3
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch, helpers
from time import gmtime, strftime
import argparse
import aiohttp
import asyncio
import async_timeout
import contextlib
import csv
import datetime
import hashlib
import json
import re
import sys
import sqlite3

## Clean some output: swallow writes to stdout.
class RecycleObject(object):
    def write(self, junk): pass

@contextlib.contextmanager
def nostdout():
    savestdout = sys.stdout
    sys.stdout = RecycleObject()
    yield
    sys.stdout = savestdout
def initdb(db):
    '''
    Creates a new SQLite database, or connects to it if it already exists.
    '''
    try:
        conn = sqlite3.connect(db)
        cursor = conn.cursor()
        table_tweets = """
            CREATE TABLE IF NOT EXISTS
                tweets (
                    id integer primary key,
                    date text not null,
                    time text not null,
                    timezone text not null,
                    user text not null,
                    tweet text not null,
                    replies integer,
                    likes integer,
                    retweets integer,
                    hashtags text
                );
            """
        cursor.execute(table_tweets)
        table_users = """
            CREATE TABLE IF NOT EXISTS
                users (
                    user text primary key,
                    date_update text not null,
                    num_tweets integer
                );
            """
        cursor.execute(table_users)
        return conn
    except Exception as e:
        return str(e)
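# Illustrative usage, not part of the script's flow: initdb returns a
# sqlite3.Connection on success, or the error message as a plain string on
# failure, so callers must type-check the result (as main() does below):
#
#   conn = initdb("tweets.db")        # hypothetical path
#   if isinstance(conn, str):
#       sys.exit("Could not open database: " + conn)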
async def getUrl(init):
    '''
...
@@ -161,21 +210,108 @@ async def outTweet(tweet):
    generated list into Tweep. That's why these
    modes exist.
    '''
    if arg.database:
        try:
            cursor = conn.cursor()
            entry = (tweetid, date, time, timezone, username, text, replies, likes, retweets, hashtags,)
            cursor.execute('INSERT INTO tweets VALUES(?,?,?,?,?,?,?,?,?,?)', entry)
            conn.commit()
        except sqlite3.IntegrityError:
            # This happens if the tweet is already in the db.
            return ""
    if arg.elasticsearch:
        # ISO weekday: Monday = 1 ... Sunday = 7.
        _day = d.isoweekday()
        hashtags = re.findall(r'(?i)\#\w+', text, flags=re.UNICODE)
        actions = []
        # Index one document per engagement event (like, reply, retweet),
        # each flagged with a boolean, so engagement totals in Kibana are
        # simple document counts.
        for field, count in (("likes", likes), ("replies", replies), ("retweets", retweets)):
            for n in range(int(count)):
                jObject = {
                    "tweetid": tweetid,
                    "datestamp": date + " " + time,
                    "timezone": timezone,
                    "text": text,
                    "hashtags": hashtags,
                    field: True,
                    "username": username,
                    "day": _day,
                    "hour": time.split(":")[0]
                }
                actions.append({
                    "_index": "tweep",
                    "_type": "items",
                    "_id": tweetid + "_" + field + "_" + str(n),
                    "_source": jObject
                })
        es = Elasticsearch(arg.elasticsearch)
        # Tweets used to be indexed one at a time with es.index(); bulk
        # indexing, with stdout silenced via nostdout(), is much faster.
        with nostdout():
            helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
        actions = []
        output = ""
    elif arg.users:
        output = username
...
@@ -259,6 +395,14 @@ async def main():
    if arg.elasticsearch:
        print("Indexing to Elasticsearch @" + str(arg.elasticsearch))
    if arg.database:
        print("Inserting into Database: " + str(arg.database))
        global conn
        conn = initdb(arg.database)
        if isinstance(conn, str):
            # initdb returned an error message instead of a connection.
            print(conn)
            sys.exit(1)
    if arg.userid is not None:
        arg.u = await getUsername()
...
@@ -279,6 +423,14 @@ async def main():
        # Control when we want to stop scraping.
        if arg.limit is not None and num <= int(arg.limit):
            break
    if arg.database:
        # Record (or refresh) the per-user scrape summary, then close.
        cursor = conn.cursor()
        entry = (str(arg.u), str(datetime.datetime.now()), num,)
        cursor.execute('INSERT OR REPLACE INTO users VALUES(?,?,?)', entry)
        conn.commit()
        conn.close()
    if arg.count:
        print("Finished: Successfully collected {} Tweets.".format(num))
...
@@ -321,6 +473,7 @@ if __name__ == "__main__":
ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).") ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")
ap.add_argument("--count", help="Display number Tweets scraped at the end of session.", action="store_true") ap.add_argument("--count", help="Display number Tweets scraped at the end of session.", action="store_true")
ap.add_argument("--stats", help="Show number of replies, retweets, and likes", action="store_true") ap.add_argument("--stats", help="Show number of replies, retweets, and likes", action="store_true")
ap.add_argument("--database", help="Store tweets in the database")
arg = ap.parse_args() arg = ap.parse_args()
check() check()
...
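After a run with `--database`, the stored data can be inspected with Python's standard `sqlite3` module. A minimal sketch (the path `tweets.db` is hypothetical; use whatever you passed to `--database`):

```python
# Inspect a database produced by e.g.: python3.6 tweep.py -u whatsoever --database tweets.db
import sqlite3

conn = sqlite3.connect("tweets.db")  # hypothetical path; match your --database value
for user, updated, n in conn.execute("SELECT user, date_update, num_tweets FROM users"):
    print("{}: {} tweets collected as of {}".format(user, n, updated))
print("Rows in tweets table:", conn.execute("SELECT count(*) FROM tweets").fetchone()[0])
conn.close()
```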