Commit 3b273006 authored by Francesco Poldi, committed by GitHub

Merge pull request #172 from wardnath/reset_pandas

Add function to clean accumulated Pandas storage data
#169 
parents e8e1f695 cccc412b
...@@ -122,6 +122,7 @@ def initialize(args): ...@@ -122,6 +122,7 @@ def initialize(args):
c.Videos = args.videos c.Videos = args.videos
c.Media = args.media c.Media = args.media
c.Replies = args.replies c.Replies = args.replies
c.Pandas_clean = args.pandas_clean
return c return c
def options(): def options():
...@@ -184,6 +185,7 @@ def options(): ...@@ -184,6 +185,7 @@ def options():
ap.add_argument("--images", help="Display only Tweets with images.", action="store_true") ap.add_argument("--images", help="Display only Tweets with images.", action="store_true")
ap.add_argument("--media", help="Display Tweets with only images or videos.", action="store_true") ap.add_argument("--media", help="Display Tweets with only images or videos.", action="store_true")
ap.add_argument("--replies", help="Display replies to a subject.", action="store_true") ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
ap.add_argument("-pc","--pandas-clean", help="Automatically clean Pandas dataframe at every scrape.")
args = ap.parse_args() args = ap.parse_args()
return args return args
...@@ -210,6 +212,9 @@ def main(): ...@@ -210,6 +212,9 @@ def main():
if not args.essid: if not args.essid:
args.essid = "" args.essid = ""
if args.pandas_clean:
twint.storage.panda.clean()
c = initialize(args) c = initialize(args)
if args.favorites: if args.favorites:
......
...@@ -47,3 +47,4 @@ class Config: ...@@ -47,3 +47,4 @@ class Config:
Videos = False Videos = False
Media = False Media = False
Replies = False Replies = False
Pandas_clean = True
from . import datelock, feed, get, output, verbose from . import datelock, feed, get, output, verbose, storage
from asyncio import get_event_loop from asyncio import get_event_loop
from datetime import timedelta from datetime import timedelta
from .storage import db from .storage import db
...@@ -16,6 +16,9 @@ class Twint: ...@@ -16,6 +16,9 @@ class Twint:
self.d = datelock.Set(self.config.Until, self.config.Since) self.d = datelock.Set(self.config.Until, self.config.Since)
verbose.Elastic(config.Elasticsearch) verbose.Elastic(config.Elasticsearch)
if self.config.Pandas_clean:
storage.panda.clean()
if not self.config.Timedelta: if not self.config.Timedelta:
if (self.d._until - self.d._since).days > 30: if (self.d._until - self.d._since).days > 30:
self.config.Timedelta = 30 self.config.Timedelta = 30
...@@ -78,7 +81,7 @@ class Twint: ...@@ -78,7 +81,7 @@ class Twint:
async def main(self): async def main(self):
if self.config.User_id is not None: if self.config.User_id is not None:
self.config.Username = await get.Username(self.config.User_id) self.config.Username = await get.Username(self.config.User_id)
if self.config.TwitterSearch and self.config.Since and self.config.Until: if self.config.TwitterSearch and self.config.Since and self.config.Until:
_days = timedelta(days=int(self.config.Timedelta)) _days = timedelta(days=int(self.config.Timedelta))
while self.d._since < self.d._until: while self.d._since < self.d._until:
...@@ -106,10 +109,10 @@ class Twint: ...@@ -106,10 +109,10 @@ class Twint:
await self.tweets() await self.tweets()
else: else:
break break
if get.Limit(self.config.Limit, self.count): if get.Limit(self.config.Limit, self.count):
break break
if self.config.Count: if self.config.Count:
verbose.Count(self.count, self.config.Username) verbose.Count(self.count, self.config.Username)
......
...@@ -6,7 +6,6 @@ import warnings ...@@ -6,7 +6,6 @@ import warnings
_blocks = [] _blocks = []
def update(Tweet, session): def update(Tweet, session):
day = weekday(strftime("%A", localtime(Tweet.datetime)))
dt = f"{Tweet.datestamp} {Tweet.timestamp}" dt = f"{Tweet.datestamp} {Tweet.timestamp}"
_data = { _data = {
...@@ -18,8 +17,6 @@ def update(Tweet, session): ...@@ -18,8 +17,6 @@ def update(Tweet, session):
"hashtags": Tweet.hashtags, "hashtags": Tweet.hashtags,
"user_id": Tweet.user_id, "user_id": Tweet.user_id,
"username": Tweet.username, "username": Tweet.username,
"day": day,
"hour": hour(Tweet.datetime),
"link": Tweet.link, "link": Tweet.link,
"retweet": Tweet.retweet, "retweet": Tweet.retweet,
"user_rt": Tweet.user_rt, "user_rt": Tweet.user_rt,
...@@ -32,6 +29,9 @@ def get(): ...@@ -32,6 +29,9 @@ def get():
df = pd.DataFrame(_blocks) df = pd.DataFrame(_blocks)
return df return df
def clean():
    """Empty the module-level ``_blocks`` list in place."""
    # Slice deletion mutates the existing list object rather than rebinding
    # the name, so other references to ``_blocks`` see the emptied list.
    del _blocks[:]
def save(_filename, _dataframe, **options): def save(_filename, _dataframe, **options):
if options.get("dataname"): if options.get("dataname"):
_dataname = options.get("dataname") _dataname = options.get("dataname")
...@@ -53,7 +53,9 @@ def save(_filename, _dataframe, **options): ...@@ -53,7 +53,9 @@ def save(_filename, _dataframe, **options):
def read(_filename, **options): def read(_filename, **options):
if not options.get("dataname"): if not options.get("dataname"):
_dataname = "Twint" _dataname = "twint"
else:
_dataname = options.get("dataname")
if not options.get("type"): if not options.get("type"):
_store = pd.HDFStore(_filename) _store = pd.HDFStore(_filename)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment