Commit 84962bb6 authored by Francesco Poldi's avatar Francesco Poldi Committed by GitHub

Finished DataFrame storing option (#224)

* Update (#174)

* add function to clean accumulated pandas storage data

* Fixed typo, dataname, removed attributes

* Added config options and config var

* Added autoclean

Works for search now

* Added Elasticsearch count options

* Added silent output and objects for users and followers

* Update

* Clean following/followers attr

* Final construct of object

* Redesign

* Little fix

* Debug

* Debug

* Globals

* Removed debug

* Globals pt 2

* Mix

* Added _old_obj to store previous scrape

* Prefix

* Pre fix pt 2

* commented

* Fix for object follow

* Update

* Update

* Completed follow_object

* Pandas object for followers and following

* Finished pandas object for followers and following

* Added docstrings in Twint.py

* Added lowercase

#170

* Finished lower case

Close #170

* Fix defaults

* Added some edits

In `panda.py`, changed the structure of the DataFrame for the users one is following/followed by; in `config.py`, added autoupdate so that one does not have to call `storage.panda.get()` at every run; in `output.py`, edited `follow_object`; in `run.py`, added the panda autoupdate calls; in `tweet.py`, just some docstrings.
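
A minimal sketch of how the new Pandas options fit together, using a placeholder account name and assuming the `Search()` helper changed in `run.py` is exposed as `twint.run.Search`:

```python
import twint

# Sketch only: "someuser" is a placeholder, and twint.run.Search is assumed
# to be the public entry point for the Search() helper edited in run.py.
c = twint.Config()
c.Username = "someuser"
c.Pandas = True           # store scraped objects for the Pandas backend
c.Pandas_au = True        # auto-populate the module-level DataFrames after a run
c.Pandas_clean = True     # wipe any blocks accumulated by a previous scrape

twint.run.Search(c)

# After the run, Tweets end up in the module-level DataFrame in storage/panda.py.
tweets = twint.storage.panda.Tweets_df
print(tweets[["date", "username", "tweet"]].head())
```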

* Update for panda and objects

* Finished storing data into dataframes #173

Now followers, following, tweets, and user details are saved in dataframes
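
A similar sketch for the follow side, again with a placeholder account and the assumed `twint.run` entry points; `save()`'s default dataname is outside the hunks shown here, so it is passed explicitly:

```python
import twint

c = twint.Config()
c.Username = "someuser"   # placeholder account
c.Pandas = True
c.Pandas_au = True
c.User_full = True        # also collect full user details into User_df

twint.run.Followers(c)    # Followers() in run.py calls storage.panda._autoget("followers")

follows = twint.storage.panda.Follow_df   # follower relations for the account
users = twint.storage.panda.User_df       # user details (only with User_full)

# Persist with the storage helpers; save() appends ".h5" by default,
# or ".pkl" when type="Pickle" is passed.
twint.storage.panda.save("followers_data", follows, dataname="followers")
```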
parent 421205c2
......@@ -8,17 +8,20 @@ https://github.com/haccer/twint/wiki
Licensed under MIT License
Copyright (c) 2018 Cody Zacharias
'''
import argparse
import twint
import sys
import os
import argparse
import twint
def error(error, message):
print("[-] {}: {}".format(error, message))
def error(_error, message):
""" Print errors to stdout
"""
print("[-] {}: {}".format(_error, message))
sys.exit(0)
def check(args):
# Error checking
""" Error checking
"""
if args.username is not None:
if args.verified:
error("Contradicting Args",
......@@ -42,8 +45,9 @@ def check(args):
# Proxy stuff
if args.proxy_host is not None:
import socks
import socket
if args.proxy_host.lower() == "tor":
import socks, socket
socks.set_default_proxy(socks.SOCKS5, "localhost", 9050)
socket.socket = socks.socksocket
elif args.proxy_port and args.proxy_type:
......@@ -55,7 +59,6 @@ def check(args):
_type = socks.HTTP
else:
error("Error", "Proxy types allowed are: socks5, socks4, and http.")
import socks, socket
socks.set_default_proxy(_type, args.proxy_host, int(args.proxy_port))
socket.socket = socks.socksocket
else:
......@@ -64,20 +67,23 @@ def check(args):
if args.proxy_port or args.proxy_type:
error("Error", "Please specify --proxy-host, --proxy-port, and --proxy-type")
def loadUserList(ul, type):
def loadUserList(ul, _type):
""" Concatenate users
"""
if os.path.exists(os.path.abspath(ul)):
userlist = open(os.path.abspath(ul), "r").read().splitlines()
else:
userlist = ul.split(",")
if type == "search":
if _type == "search":
un = ""
for user in userlist:
un += "%20OR%20from%3A" + user
return un[15:]
else:
return userlist
def initialize(args):
""" Set default values for config from args
"""
c = twint.Config()
c.Username = args.username
c.User_id = args.userid
......@@ -123,9 +129,12 @@ def initialize(args):
c.Media = args.media
c.Replies = args.replies
c.Pandas_clean = args.pandas_clean
c.ES_count = {"likes":True, "replies":True, "retweets":True}
return c
def options():
""" Parse arguments
"""
ap = argparse.ArgumentParser(prog="Twint.py",
usage="python3 %(prog)s [options]",
description="TWINT - An Advanced Twitter Scraping Tool.")
......@@ -151,7 +160,8 @@ def options():
ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")
ap.add_argument("--count", help="Display number of Tweets scraped at the end of session.",
action="store_true")
ap.add_argument("--stats", help="Show number of replies, retweets, and likes.", action="store_true")
ap.add_argument("--stats", help="Show number of replies, retweets, and likes.",
action="store_true")
ap.add_argument("--hostname", help="Store the mysql database host")
ap.add_argument("-db", "--database", help="Store Tweets in a sqlite3 or mysql database.")
ap.add_argument("--DB_user", help="Store the mysql database user")
......@@ -164,53 +174,71 @@ def options():
ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.")
ap.add_argument("--proxy-host", help="Proxy hostname or IP.")
ap.add_argument("--proxy-port", help="The port of the proxy server.")
ap.add_argument("--essid", help="Elasticsearch Session ID, use this to differentiate scraping sessions.")
ap.add_argument("--essid",
help="Elasticsearch Session ID, use this to differentiate scraping sessions.",
nargs="?", default="")
ap.add_argument("--userlist", help="Userlist from list or file.")
ap.add_argument("--retweets", help="Include user's Retweets (Warning: limited).", action="store_true")
ap.add_argument("--retweets",
help="Include user's Retweets (Warning: limited).",
action="store_true")
ap.add_argument("--format", help="Custom output format (See wiki for details).")
ap.add_argument("--user-full", help="Collect all user information (Use with followers or following only).",
ap.add_argument("--user-full",
help="Collect all user information (Use with followers or following only).",
action="store_true")
ap.add_argument("--profile-full",
help="Slow, but effective method of collecting a user's Tweets (Including Retweets).",
help="Slow, but effective method of collecting a user's Tweets and RT.",
action="store_true")
ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.")
ap.add_argument("--pandas-type", help="Specify HDF5 or Pickle (HDF5 as default)")
ap.add_argument("--search_name", help="Name for identify the search like -3dprinter stuff- only for mysql")
ap.add_argument("-it", "--index-tweets", help="Custom Elasticsearch Index name for Tweets.")
ap.add_argument("-if", "--index-follow", help="Custom Elasticsearch Index name for Follows.")
ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.")
ap.add_argument("--debug", help="Store information in debug logs", action="store_true")
ap.add_argument("--pandas-type",
help="Specify HDF5 or Pickle (HDF5 as default)", nargs="?", default="HDF5")
ap.add_argument("--search_name",
help="Name for identify the search like -3dprinter stuff- only for mysql")
ap.add_argument("-it", "--index-tweets",
help="Custom Elasticsearch Index name for Tweets.", nargs="?", default="twint")
ap.add_argument("-if", "--index-follow",
help="Custom Elasticsearch Index name for Follows.",
nargs="?", default="twintGraph")
ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.",
nargs="?", default="twintUser")
ap.add_argument("--debug",
help="Store information in debug logs", action="store_true")
ap.add_argument("--resume", help="Resume from Tweet ID.")
ap.add_argument("--videos", help="Display only Tweets with videos.", action="store_true")
ap.add_argument("--images", help="Display only Tweets with images.", action="store_true")
ap.add_argument("--media", help="Display Tweets with only images or videos.", action="store_true")
ap.add_argument("--media",
help="Display Tweets with only images or videos.", action="store_true")
ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
ap.add_argument("-pc", "--pandas-clean",
help="Automatically clean Pandas dataframe at every scrape.")
ap.add_argument("-ec", "--es-count", nargs="?", default="",
help="What NOT to count: likes, replies, retweets; only for Elasticsearch.")
ap.add_argument("-pc","--pandas-clean", help="Automatically clean Pandas dataframe at every scrape.")
args = ap.parse_args()
return args
def main():
""" Main
"""
args = options()
check(args)
if args.userlist:
args.username = loadUserList(args.userlist, "search")
if not args.pandas_type:
args.pandas_type = "HDF5"
if args.pandas_clean:
twint.storage.panda.clean()
if not args.index_tweets:
args.index_tweets = "twint"
c = initialize(args)
if not args.index_follow:
args.index_follow = "twintGraph"
if "likes" in str(args.es_count):
c.ES_count["likes"] = False
if not args.index_users:
args.index_users = "twintUser"
if "replies" in str(args.es_count):
c.ES_count["replies"] = False
if not args.essid:
args.essid = ""
if "retweets" in str(args.es_count):
c.ES_count["retweets"] = False
if args.pandas_clean:
twint.storage.panda.clean()
......
......@@ -49,3 +49,6 @@ class Config:
Media = False
Replies = False
Pandas_clean = True
ES_count = {"likes":True,"replies":True,"retweets":True}
Lowercase = False
Pandas_au = True
......@@ -4,7 +4,16 @@ from .user import User
from datetime import datetime
from .storage import db, elasticsearch, write, panda
follow_object = {}
tweets_object = []
user_object = []
_follow_list = []
def clean_follow_list():
global _follow_list
_follow_list = []
def datecheck(datestamp, config):
if config.Since and config.Until:
......@@ -21,7 +30,13 @@ def is_tweet(tw):
except:
return False
def _output(obj, output, config):
def _output(obj, output, config, **extra):
if config.Lowercase:
obj.username = obj.username.lower()
for i in range(len(obj.mentions)):
obj.mentions[i] = obj.mentions[i].lower()
for i in range(len(obj.hashtags)):
obj.hashtags[i] = obj.hashtags[i].lower()
if config.Output != None:
if config.Store_csv:
try :
......@@ -33,13 +48,15 @@ def _output(obj, output, config):
else:
write.Text(output, config.Output)
if config.Pandas:
panda.update(obj, config.Essid)
if config.Pandas and config.User_full:
panda.update(obj, config)
if extra.get("follow_list"):
follow_object.username = config.Username
follow_object.action = config.Following*"following" + config.Followers*"followers"
follow_object.users = _follow_list
panda.update(follow_object, config.Essid)
if config.Elasticsearch:
if config.Store_object:
tweets_object.append(obj)
else:
print(output, end=".", flush=True)
print("", end=".", flush=True)
else:
if config.Store_object:
tweets_object.append(obj)
......@@ -63,9 +80,14 @@ async def Tweets(tw, location, config, conn):
if config.Elasticsearch:
elasticsearch.Tweet(tweet, config)
if config.Store_object:
tweets_object.append(tweet) #twint.tweet.tweet
_output(tweet, output, config)
async def Users(u, config, conn):
global user_object
user = User(u)
output = format.User(config.Format, user)
......@@ -81,13 +103,27 @@ async def Users(u, config, conn):
user.join_date = _save_date
user.join_time = _save_time
if config.Store_object:
user_object.append(user) # twint.user.user
_output(user, output, config)
async def Username(username, config, conn):
global follow_object
follow_var = config.Following*"following" + config.Followers*"followers"
if config.Database:
db.follow(conn, config.Username, config.Followers, username)
if config.Elasticsearch:
elasticsearch.Follow(username, config)
_output(username, username, config)
if config.Store_object or config.Pandas:
try:
_ = follow_object[config.Username][follow_var]
except KeyError:
follow_object.update({config.Username: {follow_var: []}})
follow_object[config.Username][follow_var].append(username)
if config.Pandas_au:
panda.update(follow_object[config.Username], config)
_output(username, username, config, follow_list=_follow_list)
......@@ -16,6 +16,9 @@ class Twint:
self.d = datelock.Set(self.config.Until, self.config.Since)
verbose.Elastic(config.Elasticsearch)
if self.config.Store_object:
output.clean_follow_list()
if self.config.Pandas_clean:
storage.panda.clean()
......@@ -124,17 +127,35 @@ def Favorites(config):
run(config)
def Followers(config):
output.clean_follow_list()
config.Followers = True
config.Following = False
run(config)
if config.Pandas_au:
storage.panda._autoget("followers")
if config.User_full:
storage.panda._autoget("user")
storage.panda.clean()
def Following(config):
output.clean_follow_list()
config.Following = True
config.Followers = False
run(config)
if config.Pandas_au:
storage.panda._autoget("following")
if config.User_full:
storage.panda._autoget("user")
storage.panda.clean()
def Profile(config):
config.Profile = True
run(config)
def Search(config):
config.TwitterSearch = True
config.Following = False
config.Followers = False
run(config)
if config.Pandas_au:
storage.panda._autoget("tweet")
......@@ -73,6 +73,7 @@ def Tweet(Tweet, config):
}
actions.append(j_data)
if config.ES_count["likes"] is not False:
for l in range(int(Tweet.likes)):
j_data = {
"_index": config.Index_tweets,
......@@ -99,6 +100,7 @@ def Tweet(Tweet, config):
actions.append(j_data)
nLikes += 1
if config.ES_count["replies"] is not False:
for rep in range(int(Tweet.replies)):
j_data = {
"_index": config.Index_tweets,
......@@ -125,6 +127,7 @@ def Tweet(Tweet, config):
actions.append(j_data)
nReplies += 1
if config.ES_count["retweets"] is not False:
for ret in range(int(Tweet.retweets)):
j_data = {
"_index": config.Index_tweets,
......
from .elasticsearch import *
from time import strftime, localtime
import pandas as pd
import warnings
from .elasticsearch import *
Tweets_df = None
Follow_df = None
User_df = None
_object_blocks = {
"tweet": [],
"user": [],
"following": [],
"followers": []
}
_type = ""
def _concat(df, type):
if df is None:
df = pd.DataFrame(_object_blocks[type])
else:
_df = pd.DataFrame(_object_blocks[type])
df = pd.concat([df, _df], sort=True)
return df
def _autoget(type):
global Tweets_df
global Follow_df
global User_df
if type == "tweet":
Tweets_df = _concat(Tweets_df, type)
if type == "followers" or type == "following":
Follow_df = _concat(Follow_df, type)
if type == "user":
User_df = _concat(User_df, type)
_blocks = []
def update(Tweet, session):
dt = f"{Tweet.datestamp} {Tweet.timestamp}"
def update(object, config):
global _type
try:
_type = ((object.type == "tweet")*"tweet" +
(object.type == "user")*"user")
except AttributeError:
_type = config.Following*"following" + config.Followers*"followers"
if _type == "tweet":
dt = f"{object.datestamp} {object.timestamp}"
_data = {
"id": Tweet.id,
"id": object.id,
"date": dt,
"timezone": Tweet.timezone,
"location": Tweet.location,
"tweet": Tweet.tweet,
"hashtags": Tweet.hashtags,
"user_id": Tweet.user_id,
"username": Tweet.username,
"link": Tweet.link,
"retweet": Tweet.retweet,
"user_rt": Tweet.user_rt,
"essid": str(session),
'mentions': Tweet.mentions
"timezone": object.timezone,
"location": object.location,
"tweet": object.tweet,
"hashtags": object.hashtags,
"user_id": object.user_id,
"username": object.username,
"link": object.link,
"retweet": object.retweet,
"user_rt": object.user_rt,
"essid": config.Essid,
'mentions': object.mentions
}
_object_blocks[_type].append(_data)
elif _type == "user":
_data = {
"id": object.id,
"name": object.name,
"username": object.username,
"bio": object.bio,
"location": object.location,
"url": object.url,
"join_datetime": object.join_date + " " + object.join_time,
"join_date": object.join_date,
"join_time": object.join_time,
"tweets": object.tweets,
"following": object.following,
"followers": object.followers,
"likes": object.likes,
"media": object.media_count,
"private": object.is_private,
"verified": object.is_verified,
"avatar": object.avatar,
"session": str(config.Essid)
}
_object_blocks[_type].append(_data)
elif _type == "followers" or _type == "following":
_data = {
config.Following*"following" + config.Followers*"followers" :
{config.Username: object[_type]}
}
_blocks.append(_data)
_object_blocks[_type] = _data
else:
print("Wrong type of object passed!")
def get():
df = pd.DataFrame(_blocks)
return df
def clean():
_blocks.clear()
_object_blocks["tweet"].clear()
_object_blocks["following"].clear()
_object_blocks["followers"].clear()
_object_blocks["user"].clear()
def save(_filename, _dataframe, **options):
if options.get("dataname"):
......@@ -41,15 +110,16 @@ def save(_filename, _dataframe, **options):
if not options.get("type"):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
_store = pd.HDFStore(_filename)
_store = pd.HDFStore(_filename + ".h5")
_store[_dataname] = _dataframe
_store.close()
elif options.get("type") == "Pickle":
with warnings.catch_warnings():
warnings.simplefilter("ignore")
_dataframe.to_pickle(_filename)
_dataframe.to_pickle(_filename + ".pkl")
else:
print("Please specify: filename, DataFrame, DataFrame name and type (HDF5, default, or Pickle")
print("""Please specify: filename, DataFrame, DataFrame name and type
(HDF5, default, or Pickle)""")
def read(_filename, **options):
if not options.get("dataname"):
......@@ -58,11 +128,12 @@ def read(_filename, **options):
_dataname = options.get("dataname")
if not options.get("type"):
_store = pd.HDFStore(_filename)
df = _store[_dataname]
return df
_store = pd.HDFStore(_filename + ".h5")
_df = _store[_dataname]
return _df
elif options.get("type") == "Pickle":
df = pd.read_pickle(_filename)
return df
_df = pd.read_pickle(_filename + ".pkl")
return _df
else:
print("Please specify: DataFrame, DataFrame name (twint as default), filename and type (HDF5, default, or Pickle")
print("""Please specify: DataFrame, DataFrame name (twint as default),
filename and type (HDF5, default, or Pickle""")
......@@ -2,9 +2,16 @@ from time import strftime, localtime
import re
class tweet:
"""Define Tweet class
"""
type = "tweet"
def __init__(self):
pass
def getMentions(tw):
"""Extract ment from tweet
"""
try:
mentions = tw.find("div", "js-original-tweet")["data-mentions"].split(" ")
except:
......@@ -13,6 +20,8 @@ def getMentions(tw):
return mentions
def getText(tw):
"""Replace some text
"""
text = tw.find("p", "tweet-text").text
text = text.replace("\n", " ")
text = text.replace("http", " http")
......@@ -33,9 +42,13 @@ def getTweet(tw, mentions):
return text
def getHashtags(text):
"""Get hashtags of tweet
"""
return re.findall(r'(?i)\#\w+', text, flags=re.UNICODE)
def getStat(tw, _type):
"""Get stats about Tweet
"""
st = f"ProfileTweet-action--{_type} u-hiddenVisually"
return tw.find("span", st).find("span")["data-tweet-stat-count"]
......@@ -44,6 +57,8 @@ def getRetweet(profile, username, user):
return True
def getUser_rt(profile, username, user):
"""Get username that retweeted
"""
if getRetweet(profile, username, user):
user_rt = user
else:
......@@ -52,6 +67,8 @@ def getUser_rt(profile, username, user):
return user_rt
def Tweet(tw, location, config):
"""Create Tweet object
"""
t = tweet()
t.id = tw.find("div")["data-item-id"]
t.datetime = int(tw.find("span", "_timestamp")["data-time"])
......
class user:
type = "user"
def __init__(self):
pass
def inf(ur, _type):
......