Commit 71096ca6 authored by Francesco Poldi

Added database storing option

Added two tables: users and tweets.
The first stores, per user: who was scraped, when, and how many tweets were collected. The second stores every tweet.
If a tweet is already stored, it is skipped and scraping moves on to the next one, so no duplicates are created.
parent c847f5e9
...@@ -12,6 +12,43 @@ import hashlib ...@@ -12,6 +12,43 @@ import hashlib
import json import json
import re import re
import sys import sys
import sqlite3
def initdb(db):
    '''
    Creates a new SQLite database or connects to it if exists

    Makes sure the two tables used for storage are present:
    ``tweets`` (one row per tweet, keyed by the tweet id) and
    ``users`` (one row per user, with the last update date and the
    number of tweets collected).

    :param db: database location handed straight to ``sqlite3.connect``
    :return: an open ``sqlite3.Connection`` on success, or the error
             message as a ``str`` on failure (callers tell the two apart
             with ``isinstance(result, str)``)
    '''
    conn = None
    try:
        conn = sqlite3.connect(db)
        cursor = conn.cursor()
        table_tweets = """
            CREATE TABLE IF NOT EXISTS
                tweets (
                    id integer primary key,
                    date text not null,
                    time text not null,
                    timezone text not null,
                    user text not null,
                    tweet text not null,
                    replies integer,
                    likes integer,
                    retweets integer,
                    hashtags text
                );
        """
        cursor.execute(table_tweets)
        table_users = """
            CREATE TABLE IF NOT EXISTS
                users (
                    user text primary key,
                    date_update text not null,
                    num_tweets integer
                );
        """
        cursor.execute(table_users)
        return conn
    except Exception as e:
        # The connection may already be open when table creation fails
        # (e.g. the file is not a SQLite database); close it so the
        # handle is not leaked, since only the message is handed back.
        if conn is not None:
            conn.close()
        # Errors are reported as text rather than raised, to keep the
        # contract the caller depends on.
        return str(e)
async def getUrl(init): async def getUrl(init):
''' '''
...@@ -149,7 +186,7 @@ async def outTweet(tweet): ...@@ -149,7 +186,7 @@ async def outTweet(tweet):
text = "{} {}".format(mention, text) text = "{} {}".format(mention, text)
except: except:
pass pass
# Preparing to output # Preparing to output
''' '''
...@@ -158,6 +195,15 @@ async def outTweet(tweet): ...@@ -158,6 +195,15 @@ async def outTweet(tweet):
generated list into Tweep. That's why these generated list into Tweep. That's why these
modes exist. modes exist.
''' '''
if arg.database:
try:
cursor = conn.cursor()
entry = (tweetid, date, time, timezone, username, text, replies, likes, retweets, hashtags,)
cursor.execute('INSERT INTO tweets VALUES(?,?,?,?,?,?,?,?,?,?)', entry)
conn.commit()
except sqlite3.IntegrityError: # this happens if the tweet is already in the db
return ""
if arg.elasticsearch: if arg.elasticsearch:
jObject = { jObject = {
"tweetid": tweetid, "tweetid": tweetid,
...@@ -170,7 +216,7 @@ async def outTweet(tweet): ...@@ -170,7 +216,7 @@ async def outTweet(tweet):
"likes": likes, "likes": likes,
"username": username "username": username
} }
es = Elasticsearch(arg.elasticsearch) es = Elasticsearch(arg.elasticsearch)
es.index(index="tweep", doc_type="items", id=tweetid, body=json.dumps(jObject)) es.index(index="tweep", doc_type="items", id=tweetid, body=json.dumps(jObject))
output = "" output = ""
...@@ -250,6 +296,14 @@ async def main(): ...@@ -250,6 +296,14 @@ async def main():
if arg.elasticsearch: if arg.elasticsearch:
print("Indexing to Elasticsearch @" + str(arg.elasticsearch)) print("Indexing to Elasticsearch @" + str(arg.elasticsearch))
if arg.database:
print("Inserting into Database: " + str(arg.database))
global conn
conn = initdb(arg.database)
if isinstance(conn, str):
print(str)
sys.exit(1)
if arg.userid is not None: if arg.userid is not None:
arg.u = await getUsername() arg.u = await getUsername()
...@@ -270,6 +324,13 @@ async def main(): ...@@ -270,6 +324,13 @@ async def main():
# Control when we want to stop scraping. # Control when we want to stop scraping.
if arg.limit is not None and num <= int(arg.limit): if arg.limit is not None and num <= int(arg.limit):
break break
cursor = conn.cursor()
entry = (str(arg.u), str(datetime.datetime.now()), num,)
cursor.execute('INSERT OR REPLACE INTO users VALUES(?,?,?)', entry)
conn.commit()
conn.close()
if arg.count: if arg.count:
print("Finished: Successfully collected {} Tweets.".format(num)) print("Finished: Successfully collected {} Tweets.".format(num))
...@@ -311,6 +372,7 @@ if __name__ == "__main__": ...@@ -311,6 +372,7 @@ if __name__ == "__main__":
ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).") ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")
ap.add_argument("--count", help="Display number Tweets scraped at the end of session.", action="store_true") ap.add_argument("--count", help="Display number Tweets scraped at the end of session.", action="store_true")
ap.add_argument("--stats", help="Show number of replies, retweets, and likes", action="store_true") ap.add_argument("--stats", help="Show number of replies, retweets, and likes", action="store_true")
ap.add_argument("--database", help="Store tweets in the database")
arg = ap.parse_args() arg = ap.parse_args()
check() check()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment