Commit 85a2bfed authored by Francesco Poldi's avatar Francesco Poldi Committed by GitHub

Devextra (#253)

* Update index-tweets.json

* Update index-user.json

* Update get.py

* Update output.py

* Update run.py

* Update tweet.py

* Update user.py

* Update db.py

* Update elasticsearch.py

* Update _count variable

* Update elasticsearch.py

* Set to update (#246)

* Added option for custom type

* Create automate.py

* Finished dataframes storing option (#224)

* Update (#174)

* add function to clean accumulated pandas storage data

* Fixed typo, dataname, removed attributes

* Added config options and config var

* Added autoclean

Works for search now

* Added Elasticsearch count options

* Added silent output and objects for users and followers

* Update

* Clean following/followers attr

* Final construct of object

* Redesign

* Little fix

* Debug

* Debug

* Globals

* Removed debug

* Globals pt 2

* Mix

* Added _old_obj to store previous scrape

* Prefix

* Pre fix pt 2

* commented

* Fix for object follow

* Update

* Update

* Completed follow_object

* Pandas object for followers and following

* Finished pandas object for followers and following

* Added docstrings in Twint.py

* Added lowercase

#170

* Finished lower case

Close #170

* Fix defaults

* Added some edits

In `panda.py`, changed the structure of the dataframe for users that one is following/followed by; in `config.py`, added autoupdate so that one does not have to call `storage.panda.get()` at every run; in `output.py`, edited follow_object; in `run.py`, added an autoupdate function for panda; in `tweet.py`, just some docstrings

* Update for panda and objects

* Finished storing data into dataframes #173

Now followers, following, tweets, and user details are saved in dataframes

* Added proxy support (#225)

* Added proxy #139

* Added new requirement, fixed proxy, added proxy config

* Changed index names, removed duplicate arg

* Updated default CLI args

* Added visualizations and dashboard

* Typo fix

* Added logging options, fixed retweets

* Update README.md

Added examples and how-to

* Updated index and fixes

* Update

* Update dashboards

* Update

* Update index-tweets, fixed visualizations and new dashboard

* Update doc

* Fixed errors with user_full

* Fixed a quite hidden issue

* Added print error

* Added other print error

* Update

* #173

* Fix non-latin chars #229

* Added auto-index-creation 

Making Elasticsearch setup easier

* Revert "Set to update (#246)" (#247)

This reverts commit 69cbcf58f5cd36a84003c792315e9455bed55a2b.

* Update output.py

* Update output.py

* Elasticsearch update

* Added user_info option

* Removed ES count

* Get userinfo of username passed
parent a869fcdb
...@@ -91,6 +91,7 @@ def initialize(args): ...@@ -91,6 +91,7 @@ def initialize(args):
c.Essid = args.essid c.Essid = args.essid
c.Format = args.format c.Format = args.format
c.User_full = args.user_full c.User_full = args.user_full
c.User_info = args.user_info
c.Profile_full = args.profile_full c.Profile_full = args.profile_full
c.Store_pandas = args.store_pandas c.Store_pandas = args.store_pandas
c.Pandas_type = args.pandas_type c.Pandas_type = args.pandas_type
...@@ -164,6 +165,7 @@ def options(): ...@@ -164,6 +165,7 @@ def options():
ap.add_argument("--user-full", ap.add_argument("--user-full",
help="Collect all user information (Use with followers or following only).", help="Collect all user information (Use with followers or following only).",
action="store_true") action="store_true")
ap.add_argument("--user-info", help="Scrape user's info in tweet", action="store_false")
ap.add_argument("--profile-full", ap.add_argument("--profile-full",
help="Slow, but effective method of collecting a user's Tweets and RT.", help="Slow, but effective method of collecting a user's Tweets and RT.",
action="store_true") action="store_true")
...@@ -189,8 +191,6 @@ def options(): ...@@ -189,8 +191,6 @@ def options():
ap.add_argument("--replies", help="Display replies to a subject.", action="store_true") ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
ap.add_argument("-pc", "--pandas-clean", ap.add_argument("-pc", "--pandas-clean",
help="Automatically clean Pandas dataframe at every scrape.") help="Automatically clean Pandas dataframe at every scrape.")
ap.add_argument("-ec", "--es-count", nargs="?", default="",
help="What NOT to count: likes, replies, retweets; only for Elasticsearch.")
args = ap.parse_args() args = ap.parse_args()
return args return args
...@@ -209,15 +209,6 @@ def main(): ...@@ -209,15 +209,6 @@ def main():
c = initialize(args) c = initialize(args)
if "likes" in str(args.es_count):
c.ES_count["likes"] = True
if "replies" in str(args.es_count):
c.ES_count["replies"] = True
if "retweets" in str(args.es_count):
c.ES_count["retweets"] = True
if args.pandas_clean: if args.pandas_clean:
twint.storage.panda.clean() twint.storage.panda.clean()
......
...@@ -33,6 +33,7 @@ class Config: ...@@ -33,6 +33,7 @@ class Config:
Favorites = False Favorites = False
TwitterSearch = False TwitterSearch = False
User_full = False User_full = False
User_info = True
Profile_full = False Profile_full = False
Store_object = False Store_object = False
Store_pandas = False Store_pandas = False
...@@ -49,7 +50,6 @@ class Config: ...@@ -49,7 +50,6 @@ class Config:
Media = False Media = False
Replies = False Replies = False
Pandas_clean = True Pandas_clean = True
ES_count = {"likes":False,"replies":False,"retweets":False}
Lowercase = True Lowercase = True
Pandas_au = True Pandas_au = True
Proxy_host = None Proxy_host = None
......
...@@ -45,7 +45,7 @@ def _output(obj, output, config, **extra): ...@@ -45,7 +45,7 @@ def _output(obj, output, config, **extra):
else: else:
obj.username = obj.username.lower() obj.username = obj.username.lower()
for i in range(len(obj.mentions)): for i in range(len(obj.mentions)):
obj.mentions[i] = obj.mentions[i].lower() obj.mentions[i] = obj.mentions[i]["screen_name"].lower()
for i in range(len(obj.hashtags)): for i in range(len(obj.hashtags)):
obj.hashtags[i] = obj.hashtags[i].lower() obj.hashtags[i] = obj.hashtags[i].lower()
if config.Output != None: if config.Output != None:
...@@ -83,6 +83,8 @@ async def Tweets(tw, location, config, conn): ...@@ -83,6 +83,8 @@ async def Tweets(tw, location, config, conn):
copyright = tw.find("div", "StreamItemContent--withheld") copyright = tw.find("div", "StreamItemContent--withheld")
if copyright is None and is_tweet(tw): if copyright is None and is_tweet(tw):
tweet = Tweet(tw, location, config) tweet = Tweet(tw, location, config)
if config.Database is not None and config.User_info:
await tweetUserData(tweet, config, conn)
if datecheck(tweet.datestamp, config): if datecheck(tweet.datestamp, config):
output = format.Tweet(config, tweet) output = format.Tweet(config, tweet)
......
...@@ -113,6 +113,8 @@ class Twint: ...@@ -113,6 +113,8 @@ class Twint:
while True: while True:
if len(self.feed) > 0: if len(self.feed) > 0:
if self.config.Followers or self.config.Following: if self.config.Followers or self.config.Following:
url = f"http://twitter.com/{self.config.Username}?lang=en"
await get.User(url, self.config, self.conn)
await self.follow() await self.follow()
elif self.config.Favorites: elif self.config.Favorites:
await self.favorite() await self.favorite()
......
...@@ -159,18 +159,17 @@ def Tweet(Tweet, config): ...@@ -159,18 +159,17 @@ def Tweet(Tweet, config):
day = weekdays[strftime("%A", localtime(Tweet.datetime))] day = weekdays[strftime("%A", localtime(Tweet.datetime))]
actions = [] actions = []
nLikes = 1
nReplies = 1
nRetweets = 1
dt = f"{Tweet.datestamp} {Tweet.timestamp}" dt = f"{Tweet.datestamp} {Tweet.timestamp}"
j_data = { j_data = {
"_index": config.Index_tweets, "_index": config.Index_tweets,
"_type": config.Index_type, "_type": config.Index_type,
"_id": Tweet.id + "_raw_" + config.Essid, "_id": str(Tweet.id) + "_raw_" + config.Essid,
"_source": { "_source": {
"id": Tweet.id, "id": str(Tweet.id),
"conversation_id": Tweet.conversation_id,
"created_at": Tweet.datetime,
"date": dt, "date": dt,
"timezone": Tweet.timezone, "timezone": Tweet.timezone,
"location": Tweet.location, "location": Tweet.location,
...@@ -192,87 +191,6 @@ def Tweet(Tweet, config): ...@@ -192,87 +191,6 @@ def Tweet(Tweet, config):
} }
actions.append(j_data) actions.append(j_data)
if config.ES_count["likes"]:
for l in range(int(Tweet.likes)):
j_data = {
"_index": config.Index_tweets,
"_type": config.Index_type,
"_id": Tweet.id + "_likes_" + str(nLikes) + "_" + config.Essid,
"_source": {
"id": Tweet.id,
"date": dt,
"timezone": Tweet.timezone,
"location": Tweet.location,
"tweet": Tweet.tweet,
"hashtags": Tweet.hashtags,
"likes": True,
"user_id": Tweet.user_id,
"username": Tweet.username,
"day": day,
"hour": hour(Tweet.datetime),
"link": Tweet.link,
"retweet": Tweet.retweet,
"user_rt": Tweet.user_rt,
"essid": config.Essid
}
}
actions.append(j_data)
nLikes += 1
if config.ES_count["replies"]:
for rep in range(int(Tweet.replies)):
j_data = {
"_index": config.Index_tweets,
"_type": config.Index_type,
"_id": Tweet.id + "_replies_" + str(nReplies) + "_" + config.Essid,
"_source": {
"id": Tweet.id,
"date": dt,
"timezone": Tweet.timezone,
"location": Tweet.location,
"tweet": Tweet.tweet,
"hashtags": Tweet.hashtags,
"replies": True,
"user_id": Tweet.user_id,
"username": Tweet.username,
"day": day,
"hour": hour(Tweet.datetime),
"link": Tweet.link,
"retweet": Tweet.retweet,
"user_rt": Tweet.user_rt,
"essid": config.Essid
}
}
actions.append(j_data)
nReplies += 1
if config.ES_count["retweets"]:
for ret in range(int(Tweet.retweets)):
j_data = {
"_index": config.Index_tweets,
"_type": config.Index_type,
"_id": Tweet.id + "_retweets_" + str(nRetweets) + "_" + config.Essid,
"_source": {
"id": Tweet.id,
"date": dt,
"timezone": Tweet.timezone,
"location": Tweet.location,
"tweet": Tweet.tweet,
"hashtags": Tweet.hashtags,
"retweets": True,
"user_id": Tweet.user_id,
"username": Tweet.username,
"day": day,
"hour": hour(Tweet.datetime),
"link": Tweet.link,
"retweet": Tweet.retweet,
"user_rt": Tweet.user_rt,
"essid": config.Essid
}
}
actions.append(j_data)
nRetweets += 1
es = Elasticsearch(config.Elasticsearch) es = Elasticsearch(config.Elasticsearch)
if not _index_tweet_status: if not _index_tweet_status:
_index_tweet_status = createIndex(config, es, scope="tweet") _index_tweet_status = createIndex(config, es, scope="tweet")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment