Commit 561297f5 authored by Francesco Poldi, committed by GitHub

Finished dataframes storing option (#224)

* Update (#174)

* add function to clean accumulated pandas storage data

* Fixed typo, dataname, removed attributes

* Added config options and config var

* Added autoclean

Works for search now

* Added Elasticsearch count options

* Added silent output and objects for users and followers

* Update

* Clean following/followers attr

* Final construct of object

* Redesign

* Little fix

* Debug

* Debug

* Globals

* Removed debug

* Globals pt 2

* Mix

* Added _old_obj to store previous scrape

* Prefix

* Pre fix pt 2

* commented

* Fix for object follow

* Update

* Update

* Completed follow_object

* Pandas object for followers and following

* Finished pandas object for followers and following

* Added docstrings in Twint.py

* Added lowercase

#170

* Finished lower case

Close #170

* Fix defaults

* Added some edits

In `panda.py`, changed the structure of the dataframe for the users one is following/followed by; in `config.py`, added autoupdate so that one does not have to call `storage.panda.get()` at every run; in `output.py`, edited `follow_object`; in `run.py`, added an autoupdate function for panda; in `tweet.py`, just some docstrings
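
To make the autoupdate flow concrete, here is a minimal sketch of the intended usage (the username is hypothetical, and the exact flag interplay is an assumption based on this diff): with `Pandas_au` left at its default of `True`, `run.Search` calls `storage.panda._autoget("tweet")` after the scrape, so results land in `Tweets_df` without an explicit `storage.panda.get()` call.

```python
import twint

c = twint.Config()
c.Username = "some_user"  # hypothetical account name
c.Pandas = True           # accumulate scraped tweets in the panda storage blocks

# Pandas_au defaults to True (see config.py below), so run.Search()
# triggers storage.panda._autoget("tweet") once the scrape finishes.
twint.run.Search(c)

# The accumulated tweets are already concatenated into a DataFrame:
df = twint.storage.panda.Tweets_df
print(df[["date", "username", "tweet"]].head())
```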

* Update for panda and objects

* Finished storing data into dataframes #173

Now followers, following, tweets, and user details are saved in dataframes
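
A similar hedged sketch for the follow direction (again with a hypothetical username): per this diff, `run.Followers` resets the follow list, scrapes, and, when `Pandas_au` is set, folds the results into `Follow_df` (and into `User_df` as well when `User_full` is enabled).

```python
import twint

c = twint.Config()
c.Username = "some_user"  # hypothetical account name
c.Pandas = True           # store results for the DataFrame step

# run.Followers() ends by calling storage.panda._autoget("followers"),
# so the scraped follower list is already in Follow_df:
twint.run.Followers(c)
print(twint.storage.panda.Follow_df)
```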
parent bdd1fe42
Twint.py
@@ -8,24 +8,27 @@ https://github.com/haccer/twint/wiki
 Licensed under MIT License
 Copyright (c) 2018 Cody Zacharias
 '''
-import argparse
-import twint
 import sys
 import os
+import argparse
+import twint

-def error(error, message):
-    print("[-] {}: {}".format(error, message))
+def error(_error, message):
+    """ Print errors to stdout
+    """
+    print("[-] {}: {}".format(_error, message))
     sys.exit(0)

 def check(args):
-    # Error checking
+    """ Error checking
+    """
     if args.username is not None:
         if args.verified:
             error("Contradicting Args",
                   "Please use --verified in combination with -s.")
         if args.userid:
             error("Contradicting Args",
                   "--userid and -u cannot be used together.")
     if args.output is None:
         if args.csv:
             error("Error", "Please specify an output file (Example: -o file.csv).")
@@ -42,8 +45,9 @@ def check(args):
     # Proxy stuff
     if args.proxy_host is not None:
+        import socks
+        import socket
         if args.proxy_host.lower() == "tor":
-            import socks, socket
             socks.set_default_proxy(socks.SOCKS5, "localhost", 9050)
             socket.socket = socks.socksocket
         elif args.proxy_port and args.proxy_type:
@@ -55,7 +59,6 @@ def check(args):
                 _type = socks.HTTP
             else:
                 error("Error", "Proxy types allowed are: socks5, socks4, and http.")
-            import socks, socket
             socks.set_default_proxy(_type, args.proxy_host, int(args.proxy_port))
             socket.socket = socks.socksocket
         else:
@@ -64,20 +67,23 @@ def check(args):
         if args.proxy_port or args.proxy_type:
             error("Error", "Please specify --proxy-host, --proxy-port, and --proxy-type")

-def loadUserList(ul, type):
+def loadUserList(ul, _type):
+    """ Concatenate users
+    """
     if os.path.exists(os.path.abspath(ul)):
         userlist = open(os.path.abspath(ul), "r").read().splitlines()
     else:
         userlist = ul.split(",")
-    if type == "search":
+    if _type == "search":
         un = ""
         for user in userlist:
             un += "%20OR%20from%3A" + user
         return un[15:]
-    else:
-        return userlist
+    return userlist

 def initialize(args):
+    """ Set default values for config from args
+    """
     c = twint.Config()
     c.Username = args.username
     c.User_id = args.userid
@@ -123,12 +129,15 @@ def initialize(args):
     c.Media = args.media
     c.Replies = args.replies
     c.Pandas_clean = args.pandas_clean
+    c.ES_count = {"likes":True, "replies":True, "retweets":True}
     return c

 def options():
+    """ Parse arguments
+    """
     ap = argparse.ArgumentParser(prog="Twint.py",
                                  usage="python3 %(prog)s [options]",
                                  description="TWINT - An Advanced Twitter Scraping Tool.")
     ap.add_argument("-u", "--username", help="User's Tweets you want to scrape.")
     ap.add_argument("-s", "--search", help="Search for Tweets containing this word or phrase.")
     ap.add_argument("-g", "--geo", help="Search for geocoded Tweets.")
@@ -143,15 +152,16 @@ def options():
     ap.add_argument("--until", help="Filter Tweets sent until date (Example: 2017-12-27).")
     ap.add_argument("--fruit", help="Display 'low-hanging-fruit' Tweets.", action="store_true")
     ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).",
                     action="store_true")
     ap.add_argument("--csv", help="Write as .csv file.", action="store_true")
     ap.add_argument("--json", help="Write as .json file", action="store_true")
     ap.add_argument("--hashtags", help="Output hashtags in seperate column.", action="store_true")
     ap.add_argument("--userid", help="Twitter user id.")
     ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")
     ap.add_argument("--count", help="Display number of Tweets scraped at the end of session.",
                     action="store_true")
-    ap.add_argument("--stats", help="Show number of replies, retweets, and likes.", action="store_true")
+    ap.add_argument("--stats", help="Show number of replies, retweets, and likes.",
+                    action="store_true")
     ap.add_argument("--hostname", help="Store the mysql database host")
     ap.add_argument("-db", "--database", help="Store Tweets in a sqlite3 or mysql database.")
     ap.add_argument("--DB_user", help="Store the mysql database user")
@@ -164,53 +174,71 @@ def options():
     ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.")
     ap.add_argument("--proxy-host", help="Proxy hostname or IP.")
     ap.add_argument("--proxy-port", help="The port of the proxy server.")
-    ap.add_argument("--essid", help="Elasticsearch Session ID, use this to differentiate scraping sessions.")
+    ap.add_argument("--essid",
+                    help="Elasticsearch Session ID, use this to differentiate scraping sessions.",
+                    nargs="?", default="")
     ap.add_argument("--userlist", help="Userlist from list or file.")
-    ap.add_argument("--retweets", help="Include user's Retweets (Warning: limited).", action="store_true")
+    ap.add_argument("--retweets",
+                    help="Include user's Retweets (Warning: limited).",
+                    action="store_true")
     ap.add_argument("--format", help="Custom output format (See wiki for details).")
-    ap.add_argument("--user-full", help="Collect all user information (Use with followers or following only).",
-                    action="store_true")
+    ap.add_argument("--user-full",
+                    help="Collect all user information (Use with followers or following only).",
+                    action="store_true")
     ap.add_argument("--profile-full",
-                    help="Slow, but effective method of collecting a user's Tweets (Including Retweets).",
+                    help="Slow, but effective method of collecting a user's Tweets and RT.",
                     action="store_true")
     ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.")
-    ap.add_argument("--pandas-type", help="Specify HDF5 or Pickle (HDF5 as default)")
-    ap.add_argument("--search_name", help="Name for identify the search like -3dprinter stuff- only for mysql")
-    ap.add_argument("-it", "--index-tweets", help="Custom Elasticsearch Index name for Tweets.")
-    ap.add_argument("-if", "--index-follow", help="Custom Elasticsearch Index name for Follows.")
-    ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.")
-    ap.add_argument("--debug", help="Store information in debug logs", action="store_true")
+    ap.add_argument("--pandas-type",
+                    help="Specify HDF5 or Pickle (HDF5 as default)", nargs="?", default="HDF5")
+    ap.add_argument("--search_name",
+                    help="Name to identify the search, like -3dprinter stuff-; only for mysql")
+    ap.add_argument("-it", "--index-tweets",
+                    help="Custom Elasticsearch Index name for Tweets.", nargs="?", default="twint")
+    ap.add_argument("-if", "--index-follow",
+                    help="Custom Elasticsearch Index name for Follows.",
+                    nargs="?", default="twintGraph")
+    ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.",
+                    nargs="?", default="twintUser")
+    ap.add_argument("--debug",
+                    help="Store information in debug logs", action="store_true")
     ap.add_argument("--resume", help="Resume from Tweet ID.")
     ap.add_argument("--videos", help="Display only Tweets with videos.", action="store_true")
     ap.add_argument("--images", help="Display only Tweets with images.", action="store_true")
-    ap.add_argument("--media", help="Display Tweets with only images or videos.", action="store_true")
+    ap.add_argument("--media",
+                    help="Display Tweets with only images or videos.", action="store_true")
     ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
-    ap.add_argument("-pc","--pandas-clean", help="Automatically clean Pandas dataframe at every scrape.")
+    ap.add_argument("-pc", "--pandas-clean",
+                    help="Automatically clean Pandas dataframe at every scrape.")
+    ap.add_argument("-ec", "--es-count", nargs="?", default="",
+                    help="What NOT to count: likes, replies, retweets; only for Elasticsearch.")
     args = ap.parse_args()
     return args
 def main():
+    """ Main
+    """
     args = options()
     check(args)

     if args.userlist:
         args.username = loadUserList(args.userlist, "search")

-    if not args.pandas_type:
-        args.pandas_type = "HDF5"
-    if not args.index_tweets:
-        args.index_tweets = "twint"
-    if not args.index_follow:
-        args.index_follow = "twintGraph"
-    if not args.index_users:
-        args.index_users = "twintUser"
-    if not args.essid:
-        args.essid = ""
+    if args.pandas_clean:
+        twint.storage.panda.clean()
+
+    c = initialize(args)
+
+    if "likes" in str(args.es_count):
+        c.ES_count["likes"] = False
+    if "replies" in str(args.es_count):
+        c.ES_count["replies"] = False
+    if "retweets" in str(args.es_count):
+        c.ES_count["retweets"] = False
     if args.pandas_clean:
         twint.storage.panda.clean()

twint/config.py
@@ -49,3 +49,6 @@ class Config:
     Media = False
     Replies = False
     Pandas_clean = True
+    ES_count = {"likes":True,"replies":True,"retweets":True}
+    Lowercase = False
+    Pandas_au = True

twint/output.py
@@ -4,7 +4,16 @@ from .user import User
 from datetime import datetime
 from .storage import db, elasticsearch, write, panda

+follow_object = {}
 tweets_object = []
+user_object = []
+
+_follow_list = []
+
+def clean_follow_list():
+    global _follow_list
+    _follow_list = []

 def datecheck(datestamp, config):
     if config.Since and config.Until:
@@ -21,7 +30,13 @@ def is_tweet(tw):
     except:
         return False

-def _output(obj, output, config):
+def _output(obj, output, config, **extra):
+    if config.Lowercase:
+        obj.username = obj.username.lower()
+        for i in range(len(obj.mentions)):
+            obj.mentions[i] = obj.mentions[i].lower()
+        for i in range(len(obj.hashtags)):
+            obj.hashtags[i] = obj.hashtags[i].lower()
     if config.Output != None:
         if config.Store_csv:
             try :
@@ -33,13 +48,15 @@ def _output(obj, output, config):
         else:
             write.Text(output, config.Output)

-    if config.Pandas:
-        panda.update(obj, config.Essid)
+    if config.Pandas and config.User_full:
+        panda.update(obj, config)
+    if extra.get("follow_list"):
+        follow_object.username = config.Username
+        follow_object.action = config.Following*"following" + config.Followers*"followers"
+        follow_object.users = _follow_list
+        panda.update(follow_object, config.Essid)

     if config.Elasticsearch:
-        if config.Store_object:
-            tweets_object.append(obj)
-        else:
-            print(output, end=".", flush=True)
+        print("", end=".", flush=True)
     else:
         if config.Store_object:
             tweets_object.append(obj)
@@ -56,24 +73,29 @@ async def Tweets(tw, location, config, conn):
     tweet = Tweet(tw, location, config)
     if datecheck(tweet.datestamp, config):
         output = format.Tweet(config, tweet)

         if config.Database:
             db.tweets(conn, tweet, config)

         if config.Elasticsearch:
             elasticsearch.Tweet(tweet, config)

+        if config.Store_object:
+            tweets_object.append(tweet)  # twint.tweet.tweet
+
         _output(tweet, output, config)

 async def Users(u, config, conn):
+    global user_object
     user = User(u)
     output = format.User(config.Format, user)

     if config.Database:
         db.user(conn, config.Username, config.Followers, user)

     if config.Elasticsearch:
         _save_date = user.join_date
         _save_time = user.join_time
         user.join_date = str(datetime.strptime(user.join_date, "%d %b %Y")).split()[0]
         user.join_time = str(datetime.strptime(user.join_time, "%I:%M %p")).split()[1]
@@ -81,13 +103,27 @@ async def Users(u, config, conn):
         user.join_date = _save_date
         user.join_time = _save_time

+    if config.Store_object:
+        user_object.append(user)  # twint.user.user
+
     _output(user, output, config)

 async def Username(username, config, conn):
+    global follow_object
+    follow_var = config.Following*"following" + config.Followers*"followers"
+
     if config.Database:
         db.follow(conn, config.Username, config.Followers, username)

     if config.Elasticsearch:
         elasticsearch.Follow(username, config)

-    _output(username, username, config)
+    if config.Store_object or config.Pandas:
+        try:
+            _ = follow_object[config.Username][follow_var]
+        except KeyError:
+            follow_object.update({config.Username: {follow_var: []}})
+        follow_object[config.Username][follow_var].append(username)
+        if config.Pandas_au:
+            panda.update(follow_object[config.Username], config)
+
+    _output(username, username, config, follow_list=_follow_list)

twint/run.py
@@ -16,6 +16,9 @@ class Twint:
         self.d = datelock.Set(self.config.Until, self.config.Since)
         verbose.Elastic(config.Elasticsearch)

+        if self.config.Store_object:
+            output.clean_follow_list()
+
         if self.config.Pandas_clean:
             storage.panda.clean()
@@ -124,17 +127,35 @@ def Favorites(config):
     run(config)

 def Followers(config):
+    output.clean_follow_list()
     config.Followers = True
+    config.Following = False
     run(config)
+    if config.Pandas_au:
+        storage.panda._autoget("followers")
+        if config.User_full:
+            storage.panda._autoget("user")
+        storage.panda.clean()

 def Following(config):
+    output.clean_follow_list()
     config.Following = True
+    config.Followers = False
     run(config)
+    if config.Pandas_au:
+        storage.panda._autoget("following")
+        if config.User_full:
+            storage.panda._autoget("user")
+        storage.panda.clean()

 def Profile(config):
+    config.Profile = True
     run(config)

 def Search(config):
     config.TwitterSearch = True
+    config.Following = False
+    config.Followers = False
     run(config)
+    if config.Pandas_au:
+        storage.panda._autoget("tweet")

twint/storage/elasticsearch.py
@@ -73,8 +73,9 @@ def Tweet(Tweet, config):
             }
         actions.append(j_data)

-    for l in range(int(Tweet.likes)):
-        j_data = {
+    if config.ES_count["likes"] is not False:
+        for l in range(int(Tweet.likes)):
+            j_data = {
                 "_index": config.Index_tweets,
                 "_type": config.Index_type,
                 "_id": Tweet.id + "_likes_" + str(nLikes) + "_" + config.Essid,
@@ -96,11 +97,12 @@ def Tweet(Tweet, config):
                 "essid": config.Essid
                 }
             }
             actions.append(j_data)
             nLikes += 1

-    for rep in range(int(Tweet.replies)):
-        j_data = {
+    if config.ES_count["replies"] is not False:
+        for rep in range(int(Tweet.replies)):
+            j_data = {
                 "_index": config.Index_tweets,
                 "_type": config.Index_type,
                 "_id": Tweet.id + "_replies_" + str(nReplies) + "_" + config.Essid,
@@ -122,11 +124,12 @@ def Tweet(Tweet, config):
                 "essid": config.Essid
                 }
             }
             actions.append(j_data)
             nReplies += 1

-    for ret in range(int(Tweet.retweets)):
-        j_data = {
+    if config.ES_count["retweets"] is not False:
+        for ret in range(int(Tweet.retweets)):
+            j_data = {
                 "_index": config.Index_tweets,
                 "_type": config.Index_type,
                 "_id": Tweet.id + "_retweets_" + str(nRetweets) + "_" + config.Essid,
@@ -148,8 +151,8 @@ def Tweet(Tweet, config):
                 "essid": config.Essid
                 }
             }
             actions.append(j_data)
             nRetweets += 1

     es = Elasticsearch(config.Elasticsearch)
     with nostdout():

twint/storage/panda.py
-from .elasticsearch import *
 from time import strftime, localtime
 import pandas as pd
 import warnings
+from .elasticsearch import *

-_blocks = []
+Tweets_df = None
+Follow_df = None
+User_df = None

-def update(Tweet, session):
-    dt = f"{Tweet.datestamp} {Tweet.timestamp}"
-    _data = {
-        "id": Tweet.id,
-        "date": dt,
-        "timezone": Tweet.timezone,
-        "location": Tweet.location,
-        "tweet": Tweet.tweet,
-        "hashtags": Tweet.hashtags,
-        "user_id": Tweet.user_id,
-        "username": Tweet.username,
-        "link": Tweet.link,
-        "retweet": Tweet.retweet,
-        "user_rt": Tweet.user_rt,
-        "essid": str(session),
-        'mentions': Tweet.mentions
-    }
-    _blocks.append(_data)
+_object_blocks = {
+    "tweet": [],
+    "user": [],
+    "following": [],
+    "followers": []
+}

-def get():
-    df = pd.DataFrame(_blocks)
-    return df
+_type = ""
+
+def _concat(df, type):
+    if df is None:
+        df = pd.DataFrame(_object_blocks[type])
+    else:
+        _df = pd.DataFrame(_object_blocks[type])
+        df = pd.concat([df, _df], sort=True)
+    return df
+
+def _autoget(type):
+    global Tweets_df
+    global Follow_df
+    global User_df
+
+    if type == "tweet":
+        Tweets_df = _concat(Tweets_df, type)
+    if type == "followers" or type == "following":
+        Follow_df = _concat(Follow_df, type)
+    if type == "user":
+        User_df = _concat(User_df, type)
+
+def update(object, config):
+    global _type
+    try:
+        _type = ((object.type == "tweet")*"tweet" +
+                 (object.type == "user")*"user")
+    except AttributeError:
+        _type = config.Following*"following" + config.Followers*"followers"
+
+    if _type == "tweet":
+        dt = f"{object.datestamp} {object.timestamp}"
+        _data = {
+            "id": object.id,
+            "date": dt,
+            "timezone": object.timezone,
+            "location": object.location,
+            "tweet": object.tweet,
+            "hashtags": object.hashtags,
+            "user_id": object.user_id,
+            "username": object.username,
+            "link": object.link,
+            "retweet": object.retweet,
+            "user_rt": object.user_rt,
+            "essid": config.Essid,
+            'mentions': object.mentions
+        }
+        _object_blocks[_type].append(_data)
+    elif _type == "user":
+        _data = {
+            "id": object.id,
+            "name": object.name,
+            "username": object.username,
+            "bio": object.bio,
+            "location": object.location,
+            "url": object.url,
+            "join_datetime": object.join_date + " " + object.join_time,
+            "join_date": object.join_date,
+            "join_time": object.join_time,
+            "tweets": object.tweets,
+            "following": object.following,
+            "followers": object.followers,
+            "likes": object.likes,
+            "media": object.media_count,
+            "private": object.is_private,
+            "verified": object.is_verified,
+            "avatar": object.avatar,
+            "session": str(config.Essid)
+        }
+        _object_blocks[_type].append(_data)
+    elif _type == "followers" or _type == "following":
+        _data = {
+            config.Following*"following" + config.Followers*"followers":
+            {config.Username: object[_type]}
+        }
+        _object_blocks[_type] = _data
+    else:
+        print("Wrong type of object passed!")

 def clean():
-    _blocks.clear()
+    _object_blocks["tweet"].clear()
+    _object_blocks["following"].clear()
+    _object_blocks["followers"].clear()
+    _object_blocks["user"].clear()

 def save(_filename, _dataframe, **options):
     if options.get("dataname"):
@@ -41,15 +110,16 @@ def save(_filename, _dataframe, **options):
     if not options.get("type"):
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
-            _store = pd.HDFStore(_filename)
+            _store = pd.HDFStore(_filename + ".h5")
             _store[_dataname] = _dataframe
             _store.close()
     elif options.get("type") == "Pickle":
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
-            _dataframe.to_pickle(_filename)
+            _dataframe.to_pickle(_filename + ".pkl")
     else:
-        print("Please specify: filename, DataFrame, DataFrame name and type (HDF5, default, or Pickle")
+        print("""Please specify: filename, DataFrame, DataFrame name and type
+              (HDF5, default, or Pickle)""")

 def read(_filename, **options):
     if not options.get("dataname"):
@@ -58,11 +128,12 @@ def read(_filename, **options):
     _dataname = options.get("dataname")

     if not options.get("type"):
-        _store = pd.HDFStore(_filename)
-        df = _store[_dataname]
-        return df
+        _store = pd.HDFStore(_filename + ".h5")
+        _df = _store[_dataname]
+        return _df
     elif options.get("type") == "Pickle":
-        df = pd.read_pickle(_filename)
-        return df
+        _df = pd.read_pickle(_filename + ".pkl")
+        return _df
     else:
-        print("Please specify: DataFrame, DataFrame name (twint as default), filename and type (HDF5, default, or Pickle")
+        print("""Please specify: DataFrame, DataFrame name (twint as default),
+              filename and type (HDF5, default, or Pickle)""")

twint/tweet.py
@@ -2,9 +2,16 @@ from time import strftime, localtime
 import re

 class tweet:
-    pass
+    """Define Tweet class
+    """
+    type = "tweet"
+
+    def __init__(self):
+        pass

 def getMentions(tw):
+    """Extract mentions from tweet
+    """
     try:
         mentions = tw.find("div", "js-original-tweet")["data-mentions"].split(" ")
     except:
@@ -13,6 +20,8 @@ def getMentions(tw):
     return mentions

 def getText(tw):
+    """Replace some text
+    """
     text = tw.find("p", "tweet-text").text
     text = text.replace("\n", " ")
     text = text.replace("http", " http")
@@ -33,9 +42,13 @@ def getTweet(tw, mentions):
     return text

 def getHashtags(text):
+    """Get hashtags of tweet
+    """
     return re.findall(r'(?i)\#\w+', text, flags=re.UNICODE)

 def getStat(tw, _type):
+    """Get stats about Tweet
+    """
     st = f"ProfileTweet-action--{_type} u-hiddenVisually"
     return tw.find("span", st).find("span")["data-tweet-stat-count"]
@@ -44,14 +57,18 @@ def getRetweet(profile, username, user):
     return True

 def getUser_rt(profile, username, user):
+    """Get username that retweeted
+    """
     if getRetweet(profile, username, user):
         user_rt = user
     else:
         user_rt = "None"
     return user_rt

 def Tweet(tw, location, config):
+    """Create Tweet object
+    """
     t = tweet()
     t.id = tw.find("div")["data-item-id"]
     t.datetime = int(tw.find("span", "_timestamp")["data-time"])

twint/user.py
 class user:
-    pass
+    type = "user"
+
+    def __init__(self):
+        pass

 def inf(ur, _type):
     try:
@@ -17,7 +20,7 @@ def inf(ur, _type):
         ret = group["data-screen-name"]
     elif _type == "private":
         ret = group["data-protected"]
     return ret

 def card(ur, _type):
@@ -37,7 +40,7 @@ def card(ur, _type):
         ret = ur.find("span", "ProfileHeaderCard-urlText u-dir").find("a")["title"]
     except:
         ret = "None"
     return ret

 def join(ur):