Commit 3b273006 authored by Francesco Poldi's avatar Francesco Poldi Committed by GitHub

Merge pull request #172 from wardnath/reset_pandas

add function to clean accumulated pandas storage data
#169 
parents e8e1f695 cccc412b
...@@ -122,6 +122,7 @@ def initialize(args): ...@@ -122,6 +122,7 @@ def initialize(args):
c.Videos = args.videos c.Videos = args.videos
c.Media = args.media c.Media = args.media
c.Replies = args.replies c.Replies = args.replies
c.Pandas_clean = args.pandas_clean
return c return c
def options(): def options():
...@@ -184,6 +185,7 @@ def options(): ...@@ -184,6 +185,7 @@ def options():
ap.add_argument("--images", help="Display only Tweets with images.", action="store_true") ap.add_argument("--images", help="Display only Tweets with images.", action="store_true")
ap.add_argument("--media", help="Display Tweets with only images or videos.", action="store_true") ap.add_argument("--media", help="Display Tweets with only images or videos.", action="store_true")
ap.add_argument("--replies", help="Display replies to a subject.", action="store_true") ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
ap.add_argument("-pc","--pandas-clean", help="Automatically clean Pandas dataframe at every scrape.")
args = ap.parse_args() args = ap.parse_args()
return args return args
...@@ -210,6 +212,9 @@ def main(): ...@@ -210,6 +212,9 @@ def main():
if not args.essid: if not args.essid:
args.essid = "" args.essid = ""
if args.pandas_clean:
twint.storage.panda.clean()
c = initialize(args) c = initialize(args)
if args.favorites: if args.favorites:
......
...@@ -47,3 +47,4 @@ class Config: ...@@ -47,3 +47,4 @@ class Config:
Videos = False Videos = False
Media = False Media = False
Replies = False Replies = False
Pandas_clean = True
from . import datelock, feed, get, output, verbose from . import datelock, feed, get, output, verbose, storage
from asyncio import get_event_loop from asyncio import get_event_loop
from datetime import timedelta from datetime import timedelta
from .storage import db from .storage import db
...@@ -16,6 +16,9 @@ class Twint: ...@@ -16,6 +16,9 @@ class Twint:
self.d = datelock.Set(self.config.Until, self.config.Since) self.d = datelock.Set(self.config.Until, self.config.Since)
verbose.Elastic(config.Elasticsearch) verbose.Elastic(config.Elasticsearch)
if self.config.Pandas_clean:
storage.panda.clean()
if not self.config.Timedelta: if not self.config.Timedelta:
if (self.d._until - self.d._since).days > 30: if (self.d._until - self.d._since).days > 30:
self.config.Timedelta = 30 self.config.Timedelta = 30
......
...@@ -6,7 +6,6 @@ import warnings ...@@ -6,7 +6,6 @@ import warnings
_blocks = [] _blocks = []
def update(Tweet, session): def update(Tweet, session):
day = weekday(strftime("%A", localtime(Tweet.datetime)))
dt = f"{Tweet.datestamp} {Tweet.timestamp}" dt = f"{Tweet.datestamp} {Tweet.timestamp}"
_data = { _data = {
...@@ -18,8 +17,6 @@ def update(Tweet, session): ...@@ -18,8 +17,6 @@ def update(Tweet, session):
"hashtags": Tweet.hashtags, "hashtags": Tweet.hashtags,
"user_id": Tweet.user_id, "user_id": Tweet.user_id,
"username": Tweet.username, "username": Tweet.username,
"day": day,
"hour": hour(Tweet.datetime),
"link": Tweet.link, "link": Tweet.link,
"retweet": Tweet.retweet, "retweet": Tweet.retweet,
"user_rt": Tweet.user_rt, "user_rt": Tweet.user_rt,
...@@ -32,6 +29,9 @@ def get(): ...@@ -32,6 +29,9 @@ def get():
df = pd.DataFrame(_blocks) df = pd.DataFrame(_blocks)
return df return df
def clean():
    """Empty the module-level ``_blocks`` list in place.

    The list object is kept and only its contents are removed, so any
    other reference to the same list also sees it emptied.
    """
    del _blocks[:]
def save(_filename, _dataframe, **options): def save(_filename, _dataframe, **options):
if options.get("dataname"): if options.get("dataname"):
_dataname = options.get("dataname") _dataname = options.get("dataname")
...@@ -53,7 +53,9 @@ def save(_filename, _dataframe, **options): ...@@ -53,7 +53,9 @@ def save(_filename, _dataframe, **options):
def read(_filename, **options): def read(_filename, **options):
if not options.get("dataname"): if not options.get("dataname"):
_dataname = "Twint" _dataname = "twint"
else:
_dataname = options.get("dataname")
if not options.get("type"): if not options.get("type"):
_store = pd.HDFStore(_filename) _store = pd.HDFStore(_filename)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment