Commit e80ecb83 authored by Francesco Poldi's avatar Francesco Poldi Committed by GitHub

Merge master to devextra (#230)

* Added option for custom type

* Create automate.py

* Finished dataframes storing option (#224)

* Update (#174)

* add function to clean accumulated pandas storage data

* Fixed typo, dataname, removed attributes

* Added config options and config var

* Added autoclean

Works for search now

* Added Elasticsearch count options

* Added silent output and objects for users and followers

* Update

* Clean following/followers attr

* Final construct of object

* Redesign

* Little fix

* Debug

* Debug

* Globals

* Removed debug

* Globals pt 2

* Mix

* Added _old_obj to store previous scrape

* Prefix

* Pre fix pt 2

* commented

* Fix for object follow

* Update

* Update

* Completed follow_object

* Pandas object for followers and following

* Finished pandas object for followers and following

* Added docstrings in Twint.py

* Added lowercase

#170

* Finished lower case

Close #170

* Fix defaults

* Added some edits

In `panda.py`, changed the structure of the dataframe for the users one is following/followed by; in `config.py`, added autoupdate so that `storage.panda.get()` does not have to be called at every run; in `output.py`, edited `follow_object`; in `run.py`, added the autoupdate function for panda; in `tweet.py`, just some docstrings
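A minimal sketch of how this autoupdate flow is meant to be used, assuming the API introduced in this merge (the username is a placeholder):

```python
import twint

c = twint.Config()
c.Username = "someuser"   # placeholder handle
c.Pandas = True           # accumulate scraped tweets in twint.storage.panda
c.Pandas_clean = False    # keep accumulated blocks between runs

# run.Search() now ends with storage.panda._autoget("tweet"),
# which concatenates the collected block into Tweets_df
twint.run.Search(c)

df = twint.storage.panda.Tweets_df  # no manual storage.panda.get() needed
print(df[["date", "username", "tweet"]].head())
```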

* Update for panda and objects

* Finished storing data into dataframes #173

Now followers, following, tweets, and user details are saved in dataframes

* Added proxy support (#225)

* Added proxy #139

* Added new requirement, fixed proxy, added proxy config

* Changed index names, removed duplicate arg

* Updated default CLI args

* Added visualizations and dashboard

* Typo fix

* Added logging options, fixed retweets

* Update README.md

Added examples and how-to

* Updated index and fixes

* Update

* Update dashboards

* Update

* Update index-tweets, fixed visualizations and new dashboard

* Update doc

* Fixed errors with user_full

* Fixed a quite hidden issue

* Added print error

* Added other print error

* Update

* #173

* Fix non-latin chars #229
parent 9fa7b75e
# TWINT - Twitter Intelligence Tool
+![2](https://i.imgur.com/iaH3s7z.png)
+![3](https://i.imgur.com/hVeCrqL.png)
[![PyPI](https://img.shields.io/pypi/v/twint.svg)](https://pypi.org/project/twint/) [![Build Status](https://travis-ci.org/haccer/twint.svg?branch=master)](https://travis-ci.org/haccer/twint/) [![Python 3.5|3.6](https://img.shields.io/badge/Python-3.5%2F3.6-blue.svg)](https://www.python.org/download/releases/3.0/) [![GitHub license](https://img.shields.io/github/license/haccer/tweep.svg)](https://github.com/haccer/tweep/blob/master/LICENSE)
>No authentication. No API. No limits.
......
@@ -8,17 +8,20 @@ https://github.com/haccer/twint/wiki
Licensed under MIT License
Copyright (c) 2018 Cody Zacharias
'''
-import argparse
-import twint
import sys
import os
+import argparse
+import twint

-def error(error, message):
-    print("[-] {}: {}".format(error, message))
+def error(_error, message):
+    """ Print errors to stdout
+    """
+    print("[-] {}: {}".format(_error, message))
    sys.exit(0)

def check(args):
-    # Error checking
+    """ Error checking
+    """
    if args.username is not None:
        if args.verified:
            error("Contradicting Args",
@@ -40,44 +43,23 @@ def check(args):
    if args.user_full:
        error("Error", "Please use --user-full with --followers or --following.")
-    # Proxy stuff
-    if args.proxy_host is not None:
-        if args.proxy_host.lower() == "tor":
-            import socks, socket
-            socks.set_default_proxy(socks.SOCKS5, "localhost", 9050)
-            socket.socket = socks.socksocket
-        elif args.proxy_port and args.proxy_type:
-            if args.proxy_type.lower() == "socks5":
-                _type = socks.SOCKS5
-            elif args.proxy_type.lower() == "socks4":
-                _type = socks.SOCKS4
-            elif args.proxy_type.lower() == "http":
-                _type = socks.HTTP
-            else:
-                error("Error", "Proxy types allowed are: socks5, socks4, and http.")
-            import socks, socket
-            socks.set_default_proxy(_type, args.proxy_host, int(args.proxy_port))
-            socket.socket = socks.socksocket
-        else:
-            error("Error", "Please specify --proxy-host, --proxy-port, and --proxy-type")
-    else:
-        if args.proxy_port or args.proxy_type:
-            error("Error", "Please specify --proxy-host, --proxy-port, and --proxy-type")

-def loadUserList(ul, type):
+def loadUserList(ul, _type):
+    """ Concatenate users
+    """
    if os.path.exists(os.path.abspath(ul)):
        userlist = open(os.path.abspath(ul), "r").read().splitlines()
    else:
        userlist = ul.split(",")
-    if type == "search":
+    if _type == "search":
        un = ""
        for user in userlist:
            un += "%20OR%20from%3A" + user
        return un[15:]
-    else:
-        return userlist
+    return userlist

def initialize(args):
+    """ Set default values for config from args
+    """
    c = twint.Config()
    c.Username = args.username
    c.User_id = args.userid
@@ -123,9 +105,12 @@ def initialize(args):
    c.Media = args.media
    c.Replies = args.replies
    c.Pandas_clean = args.pandas_clean
+    c.ES_count = {"likes":True, "replies":True, "retweets":True}
    return c

def options():
+    """ Parse arguments
+    """
    ap = argparse.ArgumentParser(prog="Twint.py",
                                 usage="python3 %(prog)s [options]",
                                 description="TWINT - An Advanced Twitter Scraping Tool.")
@@ -151,7 +136,8 @@ def options():
    ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")
    ap.add_argument("--count", help="Display number of Tweets scraped at the end of session.",
                    action="store_true")
-    ap.add_argument("--stats", help="Show number of replies, retweets, and likes.", action="store_true")
+    ap.add_argument("--stats", help="Show number of replies, retweets, and likes.",
+                    action="store_true")
    ap.add_argument("--hostname", help="Store the mysql database host")
    ap.add_argument("-db", "--database", help="Store Tweets in a sqlite3 or mysql database.")
    ap.add_argument("--DB_user", help="Store the mysql database user")
@@ -164,59 +150,74 @@ def options():
    ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.")
    ap.add_argument("--proxy-host", help="Proxy hostname or IP.")
    ap.add_argument("--proxy-port", help="The port of the proxy server.")
-    ap.add_argument("--essid", help="Elasticsearch Session ID, use this to differentiate scraping sessions.")
+    ap.add_argument("--essid",
+                    help="Elasticsearch Session ID, use this to differentiate scraping sessions.",
+                    nargs="?", default="")
    ap.add_argument("--userlist", help="Userlist from list or file.")
-    ap.add_argument("--retweets", help="Include user's Retweets (Warning: limited).", action="store_true")
+    ap.add_argument("--retweets",
+                    help="Include user's Retweets (Warning: limited).",
+                    action="store_true")
    ap.add_argument("--format", help="Custom output format (See wiki for details).")
-    ap.add_argument("--user-full", help="Collect all user information (Use with followers or following only).",
+    ap.add_argument("--user-full",
+                    help="Collect all user information (Use with followers or following only).",
                    action="store_true")
    ap.add_argument("--profile-full",
-                    help="Slow, but effective method of collecting a user's Tweets (Including Retweets).",
+                    help="Slow, but effective method of collecting a user's Tweets and RT.",
                    action="store_true")
    ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.")
-    ap.add_argument("--pandas-type", help="Specify HDF5 or Pickle (HDF5 as default)")
+    ap.add_argument("--pandas-type",
+                    help="Specify HDF5 or Pickle (HDF5 as default)", nargs="?", default="HDF5")
-    ap.add_argument("--search_name", help="Name for identify the search like -3dprinter stuff- only for mysql")
+    ap.add_argument("--search_name",
+                    help="Name for identify the search like -3dprinter stuff- only for mysql")
-    ap.add_argument("-it", "--index-tweets", help="Custom Elasticsearch Index name for Tweets.")
+    ap.add_argument("-it", "--index-tweets",
+                    help="Custom Elasticsearch Index name for Tweets.", nargs="?", default="twinttweets")
-    ap.add_argument("-if", "--index-follow", help="Custom Elasticsearch Index name for Follows.")
+    ap.add_argument("-if", "--index-follow",
+                    help="Custom Elasticsearch Index name for Follows.",
+                    nargs="?", default="twintgraph")
-    ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.")
+    ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.",
+                    nargs="?", default="twintuser")
-    ap.add_argument("--debug", help="Store information in debug logs", action="store_true")
+    ap.add_argument("--debug",
+                    help="Store information in debug logs", action="store_true")
    ap.add_argument("--resume", help="Resume from Tweet ID.")
    ap.add_argument("--videos", help="Display only Tweets with videos.", action="store_true")
    ap.add_argument("--images", help="Display only Tweets with images.", action="store_true")
-    ap.add_argument("--media", help="Display Tweets with only images or videos.", action="store_true")
+    ap.add_argument("--media",
+                    help="Display Tweets with only images or videos.", action="store_true")
    ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
-    ap.add_argument("-pc","--pandas-clean", help="Automatically clean Pandas dataframe at every scrape.")
+    ap.add_argument("-pc", "--pandas-clean",
+                    help="Automatically clean Pandas dataframe at every scrape.")
+    ap.add_argument("-ec", "--es-count", nargs="?", default="",
+                    help="What NOT to count: likes, replies, retweets; only for Elasticsearch.")
    args = ap.parse_args()
    return args

def main():
+    """ Main
+    """
    args = options()
    check(args)
    if args.userlist:
        args.username = loadUserList(args.userlist, "search")
-    if not args.pandas_type:
-        args.pandas_type = "HDF5"
-    if not args.index_tweets:
-        args.index_tweets = "twint"
-    if not args.index_follow:
-        args.index_follow = "twintGraph"
-    if not args.index_users:
-        args.index_users = "twintUser"
-    if not args.essid:
-        args.essid = ""
+    if args.pandas_clean:
+        twint.storage.panda.clean()
+
+    c = initialize(args)
+
+    if "likes" in str(args.es_count):
+        c.ES_count["likes"] = True
+    if "replies" in str(args.es_count):
+        c.ES_count["replies"] = True
+    if "retweets" in str(args.es_count):
+        c.ES_count["retweets"] = True
    if args.pandas_clean:
        twint.storage.panda.clean()
-    c = initialize(args)
    if args.favorites:
        if args.userlist:
            _userlist = loadUserList(args.userlist, "favorites")
......
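For reference, a rough library-side sketch of what the new `--es-count` plumbing amounts to (roughly equivalent to something like `python3 Twint.py -u user --elasticsearch localhost:9200 --es-count "likes retweets"`; the username and host are placeholders):

```python
import twint

c = twint.Config()
c.Username = "user"                 # placeholder
c.Elasticsearch = "localhost:9200"  # default local instance
# index extra counting documents for likes and retweets, skip replies
c.ES_count = {"likes": True, "replies": False, "retweets": True}
twint.run.Search(c)
```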
import twint
import schedule
import time
# you can change the name of each "job" after "def" if you'd like.
def jobone():
print ("Fetching Tweets")
c = twint.Config()
# choose username (optional)
c.Username = "insert username here"
# choose search term (optional)
c.Search = "insert search term here"
# choose beginning time (narrow results)
c.Since = "2018-01-01"
# set limit on total tweets
c.Limit = 1000
# enable CSV storage so the results below are written as a CSV file
c.Store_csv = True
# format of the csv
c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
# change the name of the csv file
c.Output = "filename.csv"
twint.run.Search(c)
def jobtwo():
print ("Fetching Tweets")
c = twint.Config()
# choose username (optional)
c.Username = "insert username here"
# choose search term (optional)
c.Search = "insert search term here"
# choose beginning time (narrow results)
c.Since = "2018-01-01"
# set limit on total tweets
c.Limit = 1000
# enable CSV storage so the results below are written as a CSV file
c.Store_csv = True
# format of the csv
c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
# change the name of the csv file
c.Output = "filename2.csv"
twint.run.Search(c)
# run once when you start the program
jobone()
jobtwo()
# Run every N minutes, every hour, every day at a given time, on a given day of the week, or on a day of the week at a given time. Keep the "#" in front of the schedules you don't want; remove it to activate one. Also, replace "jobone" and "jobtwo" with your new function names (if applicable).
# schedule.every(1).minutes.do(jobone)
schedule.every().hour.do(jobone)
# schedule.every().day.at("10:30").do(jobone)
# schedule.every().monday.do(jobone)
# schedule.every().wednesday.at("13:15").do(jobone)
# schedule.every(1).minutes.do(jobtwo)
schedule.every().hour.do(jobtwo)
# schedule.every().day.at("10:30").do(jobtwo)
# schedule.every().monday.do(jobtwo)
# schedule.every().wednesday.at("13:15").do(jobtwo)
while True:
schedule.run_pending()
time.sleep(1)
@@ -30,24 +30,55 @@ If you are not getting these outputs I suggest you to dig in the corresponding d
Now that everything is up and running:
-1. Index some data: `python3.6 Twint.py --elasticsearch localhost:9200 -u user --database twint.db` (the `--database` arg is optional, `--elasticsearch` is mandatory and its value is for default settings, as in our case;
+1. Index some data: `python3.6 Twint.py --elasticsearch localhost:9200 -u user` (in this case `--elasticsearch` is a mandatory argument and its value is the host:port combination that the Elasticsearch instance is binding to);
-2. Now we can create the index (that I already created): open your browser and go to `http://localhost:5601` (again, this is a default value), `Dev Tools` tab, copy&paste `index-tweets.json` and then click the green arrow. Expected output is
+2. Now we can create the index (that I already built): open your browser and go to `http://localhost:5601` (again, this is a default value), `Dev Tools` tab, copy&paste `index-tweets.json` and then click the green arrow. Expected output is
```json
{
  "acknowledged": true,
  "shards_acknowledged": true,
-  "index": "twint"
+  "index": "twinttweets"
}
```
3. Go to the `Management` tab, `Index Patterns`, `Create Index Pattern`, `Index Pattern: twint`, and choose `datestamp` as the time field;
-4. Go to the `Discover` tab, choose `twint` and you should see something like this:
+4. Go to the `Discover` tab, choose `twinttweets` and you should see something like this:
![1](https://i.imgur.com/Ut9173J.png)
-### Notes
-Different indexes can have different visualizations so there is not a general rule; with the basics provided in the Wiki you should be able to create visualizations. In any case, for every question, don't hesitate to ask.
\ No newline at end of file
+PS: this screenshot has the index named `tweep`; you will see `twinttweets`
+### Query How-to
+1. Filter out "multiplied" data and analyze only your own tweets. If you specified the `--es-count` param during the indexing phase, you may need to filter out the like/retweet/reply counting documents; to achieve this, type `NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies` in the `Search` bar;
+2. Filter tweets for a specific username: `username: handle`, where `handle` is the user's `@handle`;
+3. Filter tweets for a specific user_id: `user_id: 0123456`;
+4. Filter tweets for a specific word in the tweet: `tweet: osint`;
+5. Define specific timestamp intervals: click on the clock in the top right corner;
+6. Concatenate conditions: Lucene syntax has some logic built in; operators like `AND` and `OR` are useful to restrict the data that you want to study.
+[Here](https://www.elastic.co/guide/en/kibana/current/lucene-query.html) is a short article about Lucene Query Syntax.
+### Examples
+Search for every tweet from "@John" and "@Janet":
+`username: John AND username: Janet`
+Search for tweets from "myearthquakeapp" and restrict the results to earthquakes with magnitude between 5.0 and 5.9:
+`username: myearthquakeapp AND tweet: 5.?`
+Search for tweets with at least 5 likes:
+`nlikes: [5 TO *]`, and similarly tweets with at least 1 like but no more than 10: `nlikes: [1 TO 10]` (`[]` includes the extremes, `{}` excludes them)
+### Ready-to-Use Visualizations
+With the newest versions of Kibana, users can export objects such as (but not limited to) visualizations and dashboards.
+Making visualizations is a simple but not easy process: you have to combine how you want to index the data with how you want to visualize it.
+To help you get started with Twint and Elasticsearch, I made some basic visualizations and a dashboard. To use them you just have to import them: go to the `Management` tab (the gear), `Saved Objects`, `Import`, and select `visualizations.json`; repeat the process for `dashboard.json`.
+After this, just go to the `Dashboard` tab and click on `Twint Dashboard`.
+![2](https://i.imgur.com/iaH3s7z.png)
+![3](https://i.imgur.com/hVeCrqL.png)
[
{
"_id": "e6d65380-bfe2-11e8-961a-d371b24d5d1d",
"_type": "dashboard",
"_source": {
"title": "Twint Dashboard",
"hits": 0,
"description": "",
"panelsJSON": "[{\"panelIndex\":\"1\",\"gridData\":{\"x\":0,\"y\":0,\"w\":40,\"h\":17,\"i\":\"1\"},\"embeddableConfig\":{},\"id\":\"d47421c0-bfd5-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":40,\"y\":6,\"w\":8,\"h\":11,\"i\":\"2\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"e2b89640-bfd4-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"3\",\"gridData\":{\"x\":0,\"y\":32,\"w\":20,\"h\":17,\"i\":\"3\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"8a8bb420-bfd9-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"4\",\"gridData\":{\"x\":0,\"y\":17,\"w\":33,\"h\":15,\"i\":\"4\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"a8d3ee70-bfd9-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"6\",\"gridData\":{\"x\":40,\"y\":0,\"w\":8,\"h\":6,\"i\":\"6\"},\"embeddableConfig\":{},\"id\":\"37cd72e0-bfe4-11e8-961a-d371b24d5d1d\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"7\",\"gridData\":{\"x\":33,\"y\":17,\"w\":15,\"h\":15,\"i\":\"7\"},\"embeddableConfig\":{},\"id\":\"149ecbc0-bfe4-11e8-961a-d371b24d5d1d\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"8\",\"gridData\":{\"x\":20,\"y\":32,\"w\":28,\"h\":17,\"i\":\"8\"},\"version\":\"6.3.2\",\"type\":\"visualization\",\"id\":\"b45ec590-c267-11e8-bcd4-3956fe930db7\",\"embeddableConfig\":{}}]",
"optionsJSON": "{\"darkTheme\":true,\"hidePanelTitles\":true,\"useMargins\":true}",
"version": 1,
"timeRestore": false,
"kibanaSavedObjectMeta": {
"searchSourceJSON": "{\"query\":{\"language\":\"lucene\",\"query\":\"\"},\"filter\":[],\"highlightAll\":true,\"version\":true}"
}
}
}
]
\ No newline at end of file
-PUT twintGraph
+PUT twintgraph
{
  "mappings": {
    "items": {
......
-PUT twint
+PUT twinttweets
{
  "mappings": {
    "items": {
@@ -14,12 +14,16 @@ PUT twint
      "likes": {"type": "boolean"},
      "user_id": {"type": "keyword"},
      "username": {"type": "keyword"},
-      "day": {"type": "keyword"},
-      "hour": {"type": "keyword"},
+      "day": {"type": "integer"},
+      "hour": {"type": "integer"},
      "link": {"type": "text"},
      "retweet": {"type": "text"},
      "user_rt": {"type": "text"},
-      "essid": {"type": "keyword"}
+      "essid": {"type": "keyword"},
+      "nlikes": {"type": "integer"},
+      "nreplies": {"type": "integer"},
+      "nretweets": {"type": "integer"},
+      "search": {"type": "text"}
    }
  }
}
......
-PUT twintUser
+PUT twintuser
{
  "mappings": {
    "items": {
......
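The guide above applies these mappings through Kibana's Dev Tools; as an alternative sketch, the index can also be created from Python with the `elasticsearch` client already listed in `requirements.txt` (the mapping body is abbreviated here; extend it with the remaining fields from `index-tweets.json`):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch(["localhost:9200"])  # default host:port from the guide

# abbreviated subset of the index-tweets.json mapping
mapping = {
    "mappings": {
        "items": {
            "properties": {
                "username": {"type": "keyword"},
                "nlikes": {"type": "integer"},
                "nreplies": {"type": "integer"},
                "nretweets": {"type": "integer"},
                "search": {"type": "text"},
            }
        }
    }
}
es.indices.create(index="twinttweets", body=mapping)
```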
......
@@ -5,3 +5,5 @@ cchardet
elasticsearch
pysocks
pandas
+aiohttp_socks
+schedule
\ No newline at end of file
@@ -9,3 +9,12 @@ Copyright (c) 2018 Cody Zacharias
'''
from .config import Config
from . import run
+#import logging
+#logger = logging.getLogger()
+#handler = logging.FileHandler('twint.log')
+#formatter = logging.Formatter(
+#    '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
+#handler.setFormatter(formatter)
+#logger.addHandler(handler)
+#logger.setLevel(logging.DEBUG)
\ No newline at end of file
@@ -38,9 +38,10 @@ class Config:
    Store_pandas = False
    Pandas_type = None
    Pandas = False
-    Index_tweets = "twint"
-    Index_follow = "twintGraph"
-    Index_users = "twintUser"
+    Index_tweets = "twinttweets"
+    Index_follow = "twintgraph"
+    Index_users = "twintuser"
+    Index_type = "items"
    Debug = False
    Resume = None
    Images = False
@@ -48,3 +49,9 @@ class Config:
    Media = False
    Replies = False
    Pandas_clean = True
+    ES_count = {"likes":False,"replies":False,"retweets":False}
+    Lowercase = True
+    Pandas_au = True
+    Proxy_host = None
+    Proxy_port = 0
+    Proxy_type = None
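A short sketch of the new `Config` attributes in use, assuming the proxy handling added to `get.py` below (values are placeholders; only socks5 and socks4 are accepted there):

```python
import twint

c = twint.Config()
c.Username = "someuser"     # placeholder
c.Lowercase = True          # lowercase usernames, mentions, and hashtags on output
c.Proxy_host = "127.0.0.1"  # route requests through a local SOCKS proxy
c.Proxy_port = 9050
c.Proxy_type = "socks5"     # "socks4" is the other supported value
twint.run.Search(c)
```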
import datetime
+#import logging

class Datelock:
    _until = None
    _since = None
    _since_def_user = None

def Set(Until, Since):
+    #logging.info("[<] " + str(datetime.datetime.now()) + ':: datelock+Set')
    d = Datelock()
    if Until:
......
from bs4 import BeautifulSoup
from re import findall
from json import loads
+#import logging
+#from datetime import datetime

def Follow(response):
+    #logging.info("[<] " + str(datetime.now()) + ':: feed+Follow')
    soup = BeautifulSoup(response, "html.parser")
    follow = soup.find_all("td", "info fifty screenname")
    cursor = soup.find_all("div", "w-button-more")
    try:
        cursor = findall(r'cursor=(.*?)">', str(cursor))[0]
-    except:
-        pass
+    except Exception as e:
+        print(e)
    return follow, cursor

def Mobile(response):
+    #logging.info("[<] " + str(datetime.now()) + ':: feed+Mobile')
    soup = BeautifulSoup(response, "html.parser")
    tweets = soup.find_all("span", "metadata")
    max_id = soup.find_all("div", "w-button-more")
    try:
        max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
-    except:
-        pass
+    except Exception as e:
+        print(e)
    return tweets, max_id

def profile(response):
+    #logging.info("[<] " + str(datetime.now()) + ':: feed+profile')
    json_response = loads(response)
    html = json_response["items_html"]
    soup = BeautifulSoup(html, "html.parser")
@@ -33,6 +38,7 @@ def profile(response):
    return feed, feed[-1]["data-item-id"]

def Json(response):
+    #logging.info("[<] " + str(datetime.now()) + ':: feed+Json')
    json_response = loads(response)
    html = json_response["items_html"]
    soup = BeautifulSoup(html, "html.parser")
......
+#import logging
+#from datetime import datetime

def Tweet(config, t):
+    #logging.info("[<] " + str(datetime.now()) + ':: format+Tweet')
    if config.Format:
        output = config.Format.replace("{id}", t.id)
        output = output.replace("{date}", t.datestamp)
@@ -35,6 +39,7 @@ def Tweet(config, t):
    return output

def User(_format, u):
+    #logging.info("[<] " + str(datetime.now()) + ':: format+User')
    if _format:
        output = _format.replace("{id}", u.id)
        output += output.replace("{name}", u.name)
......
-from . import url
-from .output import Tweets, Users
from async_timeout import timeout
-from datetime import datetime
from bs4 import BeautifulSoup
+import sys
import aiohttp
import asyncio
import concurrent.futures
+from aiohttp_socks import SocksConnector, SocksVer
+from . import url
+from .output import Tweets, Users
+#import logging

async def RequestUrl(config, init):
+    #logging.info("[<] " + str(datetime.now()) + ':: get+requestURL')
+    _connector = None
+    if config.Proxy_host is not None:
+        if config.Proxy_host.lower() == "tor":
+            _connector = SocksConnector(
+                socks_ver=SocksVer.SOCKS5,
+                host='127.0.0.1',
+                port=9050,
+                rdns=True)
+        elif config.Proxy_port and config.Proxy_type:
+            if config.Proxy_type.lower() == "socks5":
+                _type = SocksVer.SOCKS5
+            elif config.Proxy_type.lower() == "socks4":
+                _type = SocksVer.SOCKS4
+            else:
+                print("Error: Proxy types allowed are: socks5 and socks4.")
+                sys.exit(1)
+            _connector = SocksConnector(
+                socks_ver=_type,
+                host=config.Proxy_host,
+                port=config.Proxy_port,
+                rdns=True)
+        else:
+            print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
+            sys.exit(1)
+    else:
+        if config.Proxy_port or config.Proxy_type:
+            print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
+            sys.exit(1)
    if config.Profile:
        if config.Profile_full:
            _url = await url.MobileProfile(config.Username, init)
-            response = await MobileRequest(_url)
+            response = await MobileRequest(_url, connector=_connector)
        else:
            _url = await url.Profile(config.Username, init)
-            response = await Request(_url)
+            response = await Request(_url, connector=_connector)
    elif config.TwitterSearch:
        _url = await url.Search(config, init)
-        response = await Request(_url)
+        response = await Request(_url, connector=_connector)
    else:
        if config.Following:
            _url = await url.Following(config.Username, init)
@@ -24,30 +62,40 @@ async def RequestUrl(config, init):
            _url = await url.Followers(config.Username, init)
        else:
            _url = await url.Favorites(config.Username, init)
-        response = await MobileRequest(_url)
+        response = await MobileRequest(_url, connector=_connector)
    if config.Debug:
        print(_url, file=open("twint-request_urls.log", "a", encoding="utf-8"))
    return response

-async def MobileRequest(url):
+async def MobileRequest(url, **options):
+    #logging.info("[<] " + str(datetime.now()) + ':: get+MobileRequest')
    ua = {'User-Agent': 'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/0.8.12'}
-    connect = aiohttp.TCPConnector(verify_ssl=False)
-    async with aiohttp.ClientSession(headers=ua, connector=connect) as session:
+    connector = options.get("connector")
+    if connector:
+        async with aiohttp.ClientSession(headers=ua, connector=connector) as session:
+            return await Response(session, url)
+    async with aiohttp.ClientSession(headers=ua) as session:
        return await Response(session, url)

-async def Request(url):
-    connect = aiohttp.TCPConnector(verify_ssl=False)
-    async with aiohttp.ClientSession(connector=connect) as session:
+async def Request(url, **options):
+    #logging.info("[<] " + str(datetime.now()) + ':: get+Request')
+    connector = options.get("connector")
+    if connector:
+        async with aiohttp.ClientSession(connector=connector) as session:
+            return await Response(session, url)
+    async with aiohttp.ClientSession() as session:
        return await Response(session, url)

async def Response(session, url):
+    #logging.info("[<] " + str(datetime.now()) + ':: get+Response')
    with timeout(30):
-        async with session.get(url) as response:
+        async with session.get(url, ssl=False) as response:
            return await response.text()

async def Username(_id):
+    #logging.info("[<] " + str(datetime.now()) + ':: get+Username')
    url = f"https://twitter.com/intent/user?user_id={_id}&lang=en"
    r = await Request(url)
    soup = BeautifulSoup(r, "html.parser")
@@ -55,6 +103,7 @@ async def Username(_id):
    return soup.find("a", "fn url alternate-context")["href"].replace("/", "")

async def Tweet(url, config, conn):
+    #logging.info("[<] " + str(datetime.now()) + ':: Tweet')
    try:
        response = await Request(url)
        soup = BeautifulSoup(response, "html.parser")
@@ -62,22 +111,25 @@ async def Tweet(url, config, conn):
        location = soup.find("span", "ProfileHeaderCard-locationText u-dir").text
        location = location[15:].replace("\n", " ")[:-10]
        await Tweets(tweet, location, config, conn)
-    except:
-        pass
+    except Exception as e:
+        print(str(e) + " [x] get.Tweet")

async def User(url, config, conn):
+    #logging.info("[<] " + str(datetime.now()) + ':: get+User')
    try:
        response = await Request(url)
        soup = BeautifulSoup(response, "html.parser")
        await Users(soup, config, conn)
-    except:
-        pass
+    except Exception as e:
+        print(str(e) + " [x] get.User")

def Limit(Limit, count):
+    #logging.info("[<] " + str(datetime.now()) + ':: get+Limit')
    if Limit is not None and count >= int(Limit):
        return True

async def Multi(feed, config, conn):
+    #logging.info("[<] " + str(datetime.now()) + ':: get+Multi')
    count = 0
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
@@ -103,7 +155,11 @@ async def Multi(feed, config, conn):
                                                   config, conn)))
            await asyncio.gather(*futures)
-    except:
+    except Exception as e:
+        # TODO: fix error not error
+        # print(str(e) + " [x] get.Multi")
+        # will return "'NoneType' object is not callable"
+        # but still works
        pass
    return count
@@ -4,9 +4,22 @@ from .user import User
from datetime import datetime
from .storage import db, elasticsearch, write, panda
+#import logging

+follow_object = {}
tweets_object = []
+user_object = []

+_follow_list = []

+def clean_follow_list():
+    #logging.info("[<] " + str(datetime.now()) + ':: output+clean_follow_list')
+    global _follow_list
+    _follow_list = []

def datecheck(datestamp, config):
+    #logging.info("[<] " + str(datetime.now()) + ':: output+datecheck')
    if config.Since and config.Until:
        d = int(datestamp.replace("-", ""))
        s = int(config.Since.replace("-", ""))
@@ -15,31 +28,46 @@ def datecheck(datestamp, config):
    return True

def is_tweet(tw):
+    #logging.info("[<] " + str(datetime.now()) + ':: output+is_tweet')
    try:
        tw.find("div")["data-item-id"]
        return True
    except:
        return False

-def _output(obj, output, config):
+def _output(obj, output, config, **extra):
+    #logging.info("[<] " + str(datetime.now()) + ':: output+_output')
+    if config.Lowercase:
+        if isinstance(obj, str):
+            obj = obj.lower()
+        elif str(type(obj)) == "<class 'twint.user.user'>":
+            pass
+        else:
+            obj.username = obj.username.lower()
+            for i in range(len(obj.mentions)):
+                obj.mentions[i] = obj.mentions[i].lower()
+            for i in range(len(obj.hashtags)):
+                obj.hashtags[i] = obj.hashtags[i].lower()
    if config.Output != None:
        if config.Store_csv:
            try:
                write.Csv(obj, config)
            except Exception as e:
-                print("Error: " + str(e))
+                print(str(e) + " [x] output._output")
        elif config.Store_json:
            write.Json(obj, config)
        else:
            write.Text(output, config.Output)
-    if config.Pandas:
-        panda.update(obj, config.Essid)
+    if config.Pandas and config.User_full:
+        panda.update(obj, config)
+    if extra.get("follow_list"):
+        follow_object.username = config.Username
+        follow_object.action = config.Following*"following" + config.Followers*"followers"
+        follow_object.users = _follow_list
+        panda.update(follow_object, config.Essid)
    if config.Elasticsearch:
-        if config.Store_object:
-            tweets_object.append(obj)
-        else:
-            print(output, end=".", flush=True)
+        print("", end=".", flush=True)
    else:
        if config.Store_object:
            tweets_object.append(obj)
@@ -48,9 +76,10 @@ def _output(obj, output, config):
            print(output)
            pass
    except UnicodeEncodeError:
-        print("unicode error")
+        print("unicode error [x] output._output")

async def Tweets(tw, location, config, conn):
+    #logging.info("[<] " + str(datetime.now()) + ':: output+Tweets')
    copyright = tw.find("div", "StreamItemContent--withheld")
    if copyright is None and is_tweet(tw):
        tweet = Tweet(tw, location, config)
@@ -60,12 +89,21 @@ async def Tweets(tw, location, config, conn):
        if config.Database:
            db.tweets(conn, tweet, config)

+        if config.Pandas:
+            panda.update(tweet, config)

        if config.Elasticsearch:
            elasticsearch.Tweet(tweet, config)

+        if config.Store_object:
+            tweets_object.append(tweet) #twint.tweet.tweet

        _output(tweet, output, config)

async def Users(u, config, conn):
+    #logging.info("[<] " + str(datetime.now()) + ':: output+Users')
+    global user_object
    user = User(u)
    output = format.User(config.Format, user)
@@ -81,13 +119,28 @@ async def Users(u, config, conn):
        user.join_date = _save_date
        user.join_time = _save_time

+    if config.Store_object:
+        user_object.append(user) # twint.user.user

    _output(user, output, config)

async def Username(username, config, conn):
+    #logging.info("[<] " + str(datetime.now()) + ':: output+Username')
+    global follow_object
+    follow_var = config.Following*"following" + config.Followers*"followers"
    if config.Database:
        db.follow(conn, config.Username, config.Followers, username)

    if config.Elasticsearch:
        elasticsearch.Follow(username, config)

-    _output(username, username, config)
+    if config.Store_object or config.Pandas:
+        try:
+            _ = follow_object[config.Username][follow_var]
+        except KeyError:
+            follow_object.update({config.Username: {follow_var: []}})
+        follow_object[config.Username][follow_var].append(username)
+        if config.Pandas_au:
+            panda.update(follow_object[config.Username], config)
+    _output(username, username, config, follow_list=_follow_list)
from . import datelock, feed, get, output, verbose, storage
from asyncio import get_event_loop
-from datetime import timedelta
+from datetime import timedelta, datetime
from .storage import db
+#import logging

class Twint:
    def __init__(self, config):
+        #logging.info("[<] " + str(datetime.now()) + ':: run+Twint+__init__')
        if config.Resume is not None and config.TwitterSearch:
            self.init = f"TWEET-{config.Resume}-0"
        else:
@@ -16,6 +19,9 @@ class Twint:
        self.d = datelock.Set(self.config.Until, self.config.Since)
        verbose.Elastic(config.Elasticsearch)

+        if self.config.Store_object:
+            output.clean_follow_list()

        if self.config.Pandas_clean:
            storage.panda.clean()
@@ -26,6 +32,7 @@ class Twint:
            self.config.Timedelta = (self.d._until - self.d._since).days

    async def Feed(self):
+        #logging.info("[<] " + str(datetime.now()) + ':: run+Twint+Feed')
        response = await get.RequestUrl(self.config, self.init)
        if self.config.Debug:
            print(response, file=open("twint-last-request.log", "w", encoding="utf-8"))
@@ -43,10 +50,11 @@ class Twint:
                self.feed, self.init = feed.profile(response)
            elif self.config.TwitterSearch:
                self.feed, self.init = feed.Json(response)
-        except:
-            pass
+        except Exception as e:
+            print(str(e) + " [x] run.Feed")

    async def follow(self):
+        #logging.info("[<] " + str(datetime.now()) + ':: run+Twint+follow')
        await self.Feed()
        if self.config.User_full:
            self.count += await get.Multi(self.feed, self.config, self.conn)
@@ -57,10 +65,12 @@ class Twint:
                await output.Username(username, self.config, self.conn)

    async def favorite(self):
+        #logging.info("[<] " + str(datetime.now()) + ':: run+Twint+favorite')
        await self.Feed()
        self.count += await get.Multi(self.feed, self.config, self.conn)

    async def profile(self):
+        #logging.info("[<] " + str(datetime.now()) + ':: run+Twint+profile')
        await self.Feed()
        if self.config.Profile_full:
            self.count += await get.Multi(self.feed, self.config, self.conn)
@@ -70,6 +80,7 @@ class Twint:
                await output.Tweets(tweet, "", self.config, self.conn)

    async def tweets(self):
+        #logging.info("[<] " + str(datetime.now()) + ':: run+Twint+tweets')
        await self.Feed()
        if self.config.Location:
            self.count += await get.Multi(self.feed, self.config, self.conn)
@@ -79,6 +90,7 @@ class Twint:
                await output.Tweets(tweet, "", self.config, self.conn)

    async def main(self):
+        #logging.info("[<] " + str(datetime.now()) + ':: run+Twint+main')
        if self.config.User_id is not None:
            self.config.Username = await get.Username(self.config.User_id)
@@ -93,6 +105,7 @@ class Twint:
                    self.d._until = self.d._until - _days
                    self.feed = [-1]

+                #logging.info("[<] " + str(datetime.now()) + ':: run+Twint+main+CallingGetLimit1')
                if get.Limit(self.config.Limit, self.count):
                    self.d._until = self.d._until - _days
                    self.feed = [-1]
@@ -110,6 +123,7 @@ class Twint:
                else:
                    break

+            #logging.info("[<] " + str(datetime.now()) + ':: run+Twint+main+CallingGetLimit2')
            if get.Limit(self.config.Limit, self.count):
                break
@@ -117,24 +131,50 @@ class Twint:
        verbose.Count(self.count, self.config)

def run(config):
+    #logging.info("[<] " + str(datetime.now()) + ':: run+run')
    get_event_loop().run_until_complete(Twint(config).main())

def Favorites(config):
+    #logging.info("[<] " + str(datetime.now()) + ':: run+Favorites')
    config.Favorites = True
    run(config)

def Followers(config):
+    #logging.info("[<] " + str(datetime.now()) + ':: run+Followers')
+    output.clean_follow_list()
    config.Followers = True
+    config.Following = False
    run(config)
+    if config.Pandas_au:
+        storage.panda._autoget("followers")
+        if config.User_full:
+            storage.panda._autoget("user")
+    if config.Pandas:
+        storage.panda.clean()

def Following(config):
+    #logging.info("[<] " + str(datetime.now()) + ':: run+Following')
+    output.clean_follow_list()
    config.Following = True
+    config.Followers = False
    run(config)
+    if config.Pandas_au:
+        storage.panda._autoget("following")
+        if config.User_full:
+            storage.panda._autoget("user")
+    if config.Pandas:
+        storage.panda.clean()

def Profile(config):
    config.Profile = True
+    #logging.info("[<] " + str(datetime.now()) + ':: run+Profile')
    run(config)

def Search(config):
+    #logging.info("[<] " + str(datetime.now()) + ':: run+Search')
    config.TwitterSearch = True
+    config.Following = False
+    config.Followers = False
    run(config)
+    if config.Pandas_au:
+        storage.panda._autoget("tweet")
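A sketch of the new follow flow end to end, assuming the pieces above (the username is a placeholder): `Followers()` resets the shared follow object, scrapes, and then `_autoget("followers")` folds the collected block into `storage.panda.Follow_df`:

```python
import twint

c = twint.Config()
c.Username = "someuser"  # placeholder
c.Pandas = True          # collect results for the Pandas storage
twint.run.Followers(c)

# filled by storage.panda._autoget("followers") at the end of Followers()
print(twint.storage.panda.Follow_df)
```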
@@ -73,6 +73,7 @@ def init(db):
            name text,
            username text not null,
            bio text,
+            location,
            url text,
            join_date text not null,
            join_time text not null,
......
@@ -44,15 +44,15 @@ def Tweet(Tweet, config):
    day = weekdays[strftime("%A", localtime(Tweet.datetime))]
    actions = []

-    nLikes = 0
-    nReplies = 0
-    nRetweets = 0
+    nLikes = 1
+    nReplies = 1
+    nRetweets = 1
    dt = f"{Tweet.datestamp} {Tweet.timestamp}"

    j_data = {
        "_index": config.Index_tweets,
-        "_type": "items",
+        "_type": config.Index_type,
        "_id": Tweet.id + "_raw_" + config.Essid,
        "_source": {
            "id": Tweet.id,
@@ -68,15 +68,20 @@ def Tweet(Tweet, config):
            "link": Tweet.link,
            "retweet": Tweet.retweet,
            "user_rt": Tweet.user_rt,
-            "essid": config.Essid
+            "essid": config.Essid,
+            "nlikes": int(Tweet.likes),
+            "nreplies": int(Tweet.replies),
+            "nretweets": int(Tweet.retweets),
+            "search": str(config.Search)
        }
    }
    actions.append(j_data)

+    if config.ES_count["likes"]:
        for l in range(int(Tweet.likes)):
            j_data = {
                "_index": config.Index_tweets,
-                "_type": "items",
+                "_type": config.Index_type,
                "_id": Tweet.id + "_likes_" + str(nLikes) + "_" + config.Essid,
                "_source": {
                    "id": Tweet.id,
@@ -99,10 +104,11 @@ def Tweet(Tweet, config):
            actions.append(j_data)
            nLikes += 1

+    if config.ES_count["replies"]:
        for rep in range(int(Tweet.replies)):
            j_data = {
                "_index": config.Index_tweets,
-                "_type": "items",
+                "_type": config.Index_type,
                "_id": Tweet.id + "_replies_" + str(nReplies) + "_" + config.Essid,
                "_source": {
                    "id": Tweet.id,
@@ -125,10 +131,11 @@ def Tweet(Tweet, config):
            actions.append(j_data)
            nReplies += 1

+    if config.ES_count["retweets"]:
        for ret in range(int(Tweet.retweets)):
            j_data = {
                "_index": config.Index_tweets,
-                "_type": "items",
+                "_type": config.Index_type,
                "_id": Tweet.id + "_retweets_" + str(nRetweets) + "_" + config.Essid,
                "_source": {
                    "id": Tweet.id,
@@ -161,7 +168,7 @@ def Follow(user, config):
    j_data = {
        "_index": config.Index_follow,
-        "_type": "items",
+        "_type": config.Index_type,
        "_id": user + "_" + config.Username + "_" + config.Essid,
        "_source": {
            "user": user,
@@ -181,7 +188,7 @@ def UserProfile(user, config):
    j_data = {
        "_index": config.Index_users,
-        "_type": "items",
+        "_type": config.Index_type,
        "_id": user.id + "_" + user.join_date + "_" + user.join_time + "_" + config.Essid,
        "_source": {
            "id": user.id,
......
-from .elasticsearch import *
from time import strftime, localtime
import pandas as pd
import warnings
+from .elasticsearch import *

+Tweets_df = None
+Follow_df = None
+User_df = None

+_object_blocks = {
+    "tweet": [],
+    "user": [],
+    "following": [],
+    "followers": []
+}

+_type = ""

+def _concat(df, type):
+    if df is None:
+        df = pd.DataFrame(_object_blocks[type])
+    else:
+        _df = pd.DataFrame(_object_blocks[type])
+        df = pd.concat([df, _df], sort=True)
+    return df

+def _autoget(type):
+    global Tweets_df
+    global Follow_df
+    global User_df
+    if type == "tweet":
+        Tweets_df = _concat(Tweets_df, type)
+    if type == "followers" or type == "following":
+        Follow_df = _concat(Follow_df, type)
+    if type == "user":
+        User_df = _concat(User_df, type)

-_blocks = []

-def update(Tweet, session):
-    dt = f"{Tweet.datestamp} {Tweet.timestamp}"
+def update(object, config):
+    global _type
+    try:
+        _type = ((object.type == "tweet")*"tweet" +
+                 (object.type == "user")*"user")
+    except AttributeError:
+        _type = config.Following*"following" + config.Followers*"followers"
+    if _type == "tweet":
+        dt = f"{object.datestamp} {object.timestamp}"
        _data = {
-            "id": Tweet.id,
+            "id": object.id,
            "date": dt,
-            "timezone": Tweet.timezone,
-            "location": Tweet.location,
-            "tweet": Tweet.tweet,
-            "hashtags": Tweet.hashtags,
-            "user_id": Tweet.user_id,
-            "username": Tweet.username,
-            "link": Tweet.link,
-            "retweet": Tweet.retweet,
-            "user_rt": Tweet.user_rt,
-            "essid": str(session),
-            'mentions': Tweet.mentions
+            "timezone": object.timezone,
+            "location": object.location,
+            "tweet": object.tweet,
+            "hashtags": object.hashtags,
+            "user_id": object.user_id,
+            "username": object.username,
+            "link": object.link,
+            "retweet": object.retweet,
+            "user_rt": object.user_rt,
+            "essid": config.Essid,
+            'mentions': object.mentions
        }
+        _object_blocks[_type].append(_data)
+    elif _type == "user":
+        _data = {
+            "id": object.id,
+            "name": object.name,
+            "username": object.username,
+            "bio": object.bio,
+            "location": object.location,
+            "url": object.url,
+            "join_datetime": object.join_date + " " + object.join_time,
+            "join_date": object.join_date,
+            "join_time": object.join_time,
+            "tweets": object.tweets,
+            "following": object.following,
+            "followers": object.followers,
+            "likes": object.likes,
+            "media": object.media_count,
+            "private": object.is_private,
+            "verified": object.is_verified,
+            "avatar": object.avatar,
+            "session": str(config.Essid)
+        }
+        _object_blocks[_type].append(_data)
+    elif _type == "followers" or _type == "following":
+        _data = {
+            config.Following*"following" + config.Followers*"followers" :
+                {config.Username: object[_type]}
+        }
-    _blocks.append(_data)
+        _object_blocks[_type] = _data
+    else:
+        print("Wrong type of object passed!")

-def get():
-    df = pd.DataFrame(_blocks)
-    return df

def clean():
-    _blocks.clear()
+    _object_blocks["tweet"].clear()
+    _object_blocks["following"].clear()
+    _object_blocks["followers"].clear()
+    _object_blocks["user"].clear()

def save(_filename, _dataframe, **options):
    if options.get("dataname"):
@@ -41,15 +110,16 @@ def save(_filename, _dataframe, **options):
    if not options.get("type"):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
-            _store = pd.HDFStore(_filename)
+            _store = pd.HDFStore(_filename + ".h5")
            _store[_dataname] = _dataframe
            _store.close()
    elif options.get("type") == "Pickle":
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
-            _dataframe.to_pickle(_filename)
+            _dataframe.to_pickle(_filename + ".pkl")
    else:
-        print("Please specify: filename, DataFrame, DataFrame name and type (HDF5, default, or Pickle")
+        print("""Please specify: filename, DataFrame, DataFrame name and type
+              (HDF5, default, or Pickle)""")

def read(_filename, **options):
    if not options.get("dataname"):
@@ -58,11 +128,12 @@ def read(_filename, **options):
        _dataname = options.get("dataname")

    if not options.get("type"):
-        _store = pd.HDFStore(_filename)
-        df = _store[_dataname]
-        return df
+        _store = pd.HDFStore(_filename + ".h5")
+        _df = _store[_dataname]
+        return _df
    elif options.get("type") == "Pickle":
-        df = pd.read_pickle(_filename)
-        return df
+        _df = pd.read_pickle(_filename + ".pkl")
+        return _df
    else:
-        print("Please specify: DataFrame, DataFrame name (twint as default), filename and type (HDF5, default, or Pickle")
+        print("""Please specify: DataFrame, DataFrame name (twint as default),
+              filename and type (HDF5, default, or Pickle)""")
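A sketch of the `save()`/`read()` round trip with the new filename handling: both functions now append the extension themselves, so pass a bare name (HDF5 output additionally requires the PyTables package):

```python
import pandas as pd
from twint.storage import panda

df = pd.DataFrame({"username": ["a", "b"], "tweet": ["hi", "yo"]})

panda.save("mytweets", df)    # writes mytweets.h5 (HDF5 is the default type)
df2 = panda.read("mytweets")  # reads mytweets.h5 back under the default dataname

# for Pickle instead:
panda.save("mytweets", df, type="Pickle")    # writes mytweets.pkl
df3 = panda.read("mytweets", type="Pickle")  # reads mytweets.pkl
```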
@@ -44,5 +44,5 @@ def Json(obj, config):
    null, data = struct(obj, config.Custom, Type(config))
    with open(config.Output, "a", newline='', encoding="utf-8") as json_file:
-        json.dump(data, json_file)
+        json.dump(data, json_file, ensure_ascii=False)
        json_file.write("\n")
from time import strftime, localtime
import re
+#from datetime import datetime
+#import logging

class tweet:
+    """Define Tweet class
+    """
+    type = "tweet"
+    def __init__(self):
        pass

def getMentions(tw):
+    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getMentions')
+    """Extract mentions from tweet
+    """
    try:
        mentions = tw.find("div", "js-original-tweet")["data-mentions"].split(" ")
    except:
@@ -13,6 +23,9 @@ def getMentions(tw):
    return mentions

def getText(tw):
+    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getText')
+    """Replace some text
+    """
    text = tw.find("p", "tweet-text").text
    text = text.replace("\n", " ")
    text = text.replace("http", " http")
@@ -21,6 +34,7 @@ def getText(tw):
    return text

def getTweet(tw, mentions):
+    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getTweet')
    try:
        text = getText(tw)
        for i in range(len(mentions)):
@@ -33,17 +47,27 @@ def getTweet(tw, mentions):
    return text

def getHashtags(text):
+    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getHashtags')
+    """Get hashtags of tweet
+    """
    return re.findall(r'(?i)\#\w+', text, flags=re.UNICODE)

def getStat(tw, _type):
+    """Get stats about Tweet
+    """
+    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getStat')
    st = f"ProfileTweet-action--{_type} u-hiddenVisually"
    return tw.find("span", st).find("span")["data-tweet-stat-count"]

def getRetweet(profile, username, user):
+    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getRetweet')
    if profile and username.lower() != user:
        return True

def getUser_rt(profile, username, user):
+    #logging.info("[<] " + str(datetime.now()) + ':: tweet+getUser_rt')
+    """Get username that retweeted
+    """
    if getRetweet(profile, username, user):
        user_rt = user
    else:
@@ -52,6 +76,9 @@ def getUser_rt(profile, username, user):
    return user_rt

def Tweet(tw, location, config):
+    """Create Tweet object
+    """
+    ##logging.info("[<] " + str(datetime.now()) + ':: tweet+Tweet')
    t = tweet()
    t.id = tw.find("div")["data-item-id"]
    t.datetime = int(tw.find("span", "_timestamp")["data-time"])
......
+#from datetime import datetime
+#import logging

mobile = "https://mobile.twitter.com"
base = "https://twitter.com/i"

async def Favorites(username, init):
+    #logging.info("[<] " + str(datetime.now()) + ':: url+Favorites')
    url = f"{mobile}/{username}/favorites?lang=en"
    if init != -1:
@@ -10,6 +14,7 @@ async def Favorites(username, init):
    return url

async def Followers(username, init):
+    #logging.info("[<] " + str(datetime.now()) + ':: url+Followers')
    url = f"{mobile}/{username}/followers?lang=en"
    if init != -1:
@@ -18,6 +23,7 @@ async def Followers(username, init):
    return url

async def Following(username, init):
+    #logging.info("[<] " + str(datetime.now()) + ':: url+Following')
    url = f"{mobile}/{username}/following?lang=en"
    if init != -1:
@@ -26,6 +32,7 @@ async def Following(username, init):
    return url

async def MobileProfile(username, init):
+    #logging.info("[<] " + str(datetime.now()) + ':: url+MobileProfile')
    url = f"{mobile}/{username}?lang=en"
    if init != -1:
@@ -34,6 +41,7 @@ async def MobileProfile(username, init):
    return url

async def Profile(username, init):
+    #logging.info("[<] " + str(datetime.now()) + ':: url+Profile')
    url = f"{base}/profiles/show/{username}/timeline/tweets?include_"
    url += "available_features=1&lang=en&include_entities=1"
    url += "&include_new_items_bar=true"
@@ -44,6 +52,7 @@ async def Profile(username, init):
    return url

async def Search(config, init):
+    #logging.info("[<] " + str(datetime.now()) + ':: url+Search')
    url = f"{base}/search/timeline?f=tweets&vertical=default&lang=en"
    url += "&include_available_features=1&include_entities=1&"
    url += f"reset_error_state=false&src=typd&qf=off&max_position={init}&q="
......
class user:
+    type = "user"
+    def __init__(self):
        pass

def inf(ur, _type):
......