Commit bf671f19 authored by Francesco Poldi

Merge remote-tracking branch 'origin/master'

parents cd5c92b6 59b3bdd0
...@@ -44,7 +44,7 @@ A few simple examples to help you understand the basics: ...@@ -44,7 +44,7 @@ A few simple examples to help you understand the basics:
- `python3 twint.py -u username --following` - Scrape who a Twitter user follows. - `python3 twint.py -u username --following` - Scrape who a Twitter user follows.
- `python3 twint.py -u username --favorites` - Collect all the Tweets a user has favorited. - `python3 twint.py -u username --favorites` - Collect all the Tweets a user has favorited.
- `python3 twint.py -u username --following --user-full` - Collect full user information for everyone a person follows. - `python3 twint.py -u username --following --user-full` - Collect full user information for everyone a person follows.
- `python3 twint.py -u username --profile-full` - Use a slow, but effective method to gather all the Tweets from a user's profile (Including Retweets). - `python3 twint.py -u username --profile-full` - Use a slow, but effective method to gather Tweets from a user's profile (Gathers ~3200 Tweets, Including Retweets).
- `python3 twint.py -u username --retweets` - Use a quick method to gather the last 900 Tweets (that includes retweets) from a user's profile. - `python3 twint.py -u username --retweets` - Use a quick method to gather the last 900 Tweets (that includes retweets) from a user's profile.
- `python3 twint.py -u username --resume 10940389583058` - Resume a search starting from the specified Tweet ID. - `python3 twint.py -u username --resume 10940389583058` - Resume a search starting from the specified Tweet ID.
......
...@@ -122,6 +122,7 @@ def initialize(args): ...@@ -122,6 +122,7 @@ def initialize(args):
c.Videos = args.videos c.Videos = args.videos
c.Media = args.media c.Media = args.media
c.Replies = args.replies c.Replies = args.replies
c.Pandas_clean = args.pandas_clean
return c return c
def options(): def options():
...@@ -184,6 +185,7 @@ def options(): ...@@ -184,6 +185,7 @@ def options():
ap.add_argument("--images", help="Display only Tweets with images.", action="store_true") ap.add_argument("--images", help="Display only Tweets with images.", action="store_true")
ap.add_argument("--media", help="Display Tweets with only images or videos.", action="store_true") ap.add_argument("--media", help="Display Tweets with only images or videos.", action="store_true")
ap.add_argument("--replies", help="Display replies to a subject.", action="store_true") ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
ap.add_argument("-pc","--pandas-clean", help="Automatically clean Pandas dataframe at every scrape.")
args = ap.parse_args() args = ap.parse_args()
return args return args
...@@ -210,6 +212,9 @@ def main(): ...@@ -210,6 +212,9 @@ def main():
if not args.essid: if not args.essid:
args.essid = "" args.essid = ""
if args.pandas_clean:
twint.storage.panda.clean()
c = initialize(args) c = initialize(args)
if args.favorites: if args.favorites:
......
...@@ -47,3 +47,4 @@ class Config: ...@@ -47,3 +47,4 @@ class Config:
Videos = False Videos = False
Media = False Media = False
Replies = False Replies = False
Pandas_clean = True
...@@ -49,7 +49,7 @@ async def Response(session, url): ...@@ -49,7 +49,7 @@ async def Response(session, url):
async def Username(_id): async def Username(_id):
url = f"https://twitter.com/intent/user?user_id={_id}&lang=en" url = f"https://twitter.com/intent/user?user_id={_id}&lang=en"
r = Request(url) r = await Request(url)
soup = BeautifulSoup(r, "html.parser") soup = BeautifulSoup(r, "html.parser")
return soup.find("a", "fn url alternate-context")["href"].replace("/", "") return soup.find("a", "fn url alternate-context")["href"].replace("/", "")
......
from . import datelock, feed, get, output, verbose from . import datelock, feed, get, output, verbose, storage
from asyncio import get_event_loop from asyncio import get_event_loop
from datetime import timedelta from datetime import timedelta
from .storage import db from .storage import db
...@@ -16,6 +16,9 @@ class Twint: ...@@ -16,6 +16,9 @@ class Twint:
self.d = datelock.Set(self.config.Until, self.config.Since) self.d = datelock.Set(self.config.Until, self.config.Since)
verbose.Elastic(config.Elasticsearch) verbose.Elastic(config.Elasticsearch)
if self.config.Pandas_clean:
storage.panda.clean()
if not self.config.Timedelta: if not self.config.Timedelta:
if (self.d._until - self.d._since).days > 30: if (self.d._until - self.d._since).days > 30:
self.config.Timedelta = 30 self.config.Timedelta = 30
...@@ -78,7 +81,7 @@ class Twint: ...@@ -78,7 +81,7 @@ class Twint:
async def main(self): async def main(self):
if self.config.User_id is not None: if self.config.User_id is not None:
self.config.Username = await get.Username(self.config.User_id) self.config.Username = await get.Username(self.config.User_id)
if self.config.TwitterSearch and self.config.Since and self.config.Until: if self.config.TwitterSearch and self.config.Since and self.config.Until:
_days = timedelta(days=int(self.config.Timedelta)) _days = timedelta(days=int(self.config.Timedelta))
while self.d._since < self.d._until: while self.d._since < self.d._until:
...@@ -106,10 +109,10 @@ class Twint: ...@@ -106,10 +109,10 @@ class Twint:
await self.tweets() await self.tweets()
else: else:
break break
if get.Limit(self.config.Limit, self.count): if get.Limit(self.config.Limit, self.count):
break break
if self.config.Count: if self.config.Count:
verbose.Count(self.count, self.config) verbose.Count(self.count, self.config)
......
...@@ -6,7 +6,6 @@ import warnings ...@@ -6,7 +6,6 @@ import warnings
_blocks = [] _blocks = []
def update(Tweet, session): def update(Tweet, session):
day = weekday(strftime("%A", localtime(Tweet.datetime)))
dt = f"{Tweet.datestamp} {Tweet.timestamp}" dt = f"{Tweet.datestamp} {Tweet.timestamp}"
_data = { _data = {
...@@ -18,8 +17,6 @@ def update(Tweet, session): ...@@ -18,8 +17,6 @@ def update(Tweet, session):
"hashtags": Tweet.hashtags, "hashtags": Tweet.hashtags,
"user_id": Tweet.user_id, "user_id": Tweet.user_id,
"username": Tweet.username, "username": Tweet.username,
"day": day,
"hour": hour(Tweet.datetime),
"link": Tweet.link, "link": Tweet.link,
"retweet": Tweet.retweet, "retweet": Tweet.retweet,
"user_rt": Tweet.user_rt, "user_rt": Tweet.user_rt,
...@@ -32,6 +29,9 @@ def get(): ...@@ -32,6 +29,9 @@ def get():
df = pd.DataFrame(_blocks) df = pd.DataFrame(_blocks)
return df return df
def clean():
_blocks.clear()
def save(_filename, _dataframe, **options): def save(_filename, _dataframe, **options):
if options.get("dataname"): if options.get("dataname"):
_dataname = options.get("dataname") _dataname = options.get("dataname")
...@@ -53,7 +53,9 @@ def save(_filename, _dataframe, **options): ...@@ -53,7 +53,9 @@ def save(_filename, _dataframe, **options):
def read(_filename, **options): def read(_filename, **options):
if not options.get("dataname"): if not options.get("dataname"):
_dataname = "Twint" _dataname = "twint"
else:
_dataname = options.get("dataname")
if not options.get("type"): if not options.get("type"):
_store = pd.HDFStore(_filename) _store = pd.HDFStore(_filename)
......
...@@ -66,10 +66,10 @@ async def Search(config, init): ...@@ -66,10 +66,10 @@ async def Search(config, init):
if config.Until: if config.Until:
url += f"%20until%3A{config.Until}" url += f"%20until%3A{config.Until}"
if config.Fruit: if config.Fruit:
url += "%20myspace.com%20OR%20last.fm%20OR" url += "%20%22myspace.com%22%20OR%20%22last.fm%22%20OR"
url += "%20mail%20OR%20email%20OR%20gmail%20OR%20e-mail" url += "%20%22mail%22%20OR%20%22email%22%20OR%20%22gmail%22%20OR%20%22e-mail%22"
url += "%20OR%20phone%20OR%20call%20me%20OR%20text%20me" url += "%20OR%20%22phone%22%20OR%20%22call%20me%22%20OR%20%22text%20me%22"
url += "%20OR%20keybase" url += "%20OR%20%22keybase%22"
if config.Verified: if config.Verified:
url += "%20filter%3Averified" url += "%20filter%3Averified"
if config.To: if config.To:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment