Commit 0e20e6e5 authored by Nestor75's avatar Nestor75 Committed by Francesco Poldi

adding mysql support (#136)

* test commit

* version 1.1.3.4 added mysql support

* Revert "(#126)"

This reverts commit ecafc09e784a097a358f78aa97398480ac30afb4.

* remove everything

* remove things

* delete all

* mysql init

to review

* renaming files

* some new addings

* remove unnecessary files

* twint.py

* adding files

* update text

* update file

* update

* delete all

* adding again all

* updating

* Update Twint.py

* Update Twint.py

* fix mysqlcheck

* fix db.py create table and sqlite3 schema

* placing the sqlite3 schema properly

* fix db.py char left

* Create .travis.yml

* Create .gitignore

* Delete test_mysql_v1.py
parent b28ddaef
......@@ -74,6 +74,7 @@ twint.run.Search(c)
- CSV
- JSON
- SQLite
- MySQL (DB collation utf8mb4)
- Elasticsearch
- MySQL (See MySQL Branch)
### Elasticsearch Setup
......
......@@ -31,6 +31,10 @@ def check(args):
error("Error", "Please specify an output file (Example: -o file.csv).")
elif args.json:
error("Error", "Please specify an output file (Example: -o file.json).")
# MySQL output needs the complete set of connection details. argparse stores
# "-db/--database" under the lowercase attribute ``database``, so that is the
# name that has to be checked here.
if args.hostname:
    if args.database is None or args.DB_user is None or args.DB_pwd is None:
        error("Error", "Please specify database name, user and password")
if not args.followers and not args.following:
if args.user_full:
......@@ -96,7 +100,10 @@ def initialize(args):
c.Limit = args.limit
c.Count = args.count
c.Stats = args.stats
c.hostname = args.hostname
c.Database = args.database
c.DB_user = args.DB_user
c.DB_pwd = args.DB_pwd
c.To = args.to
c.All = args.all
c.Essid = args.essid
......@@ -105,6 +112,7 @@ def initialize(args):
c.Profile_full = args.profile_full
c.Store_pandas = args.store_pandas
c.Pandas_type = args.pandas_type
# --search_name has no argparse default, so it is None when omitted. Fall back
# to "-" because the tweets tables declare search_name as NOT NULL.
c.search_name = args.search_name if args.search_name is not None else "-"
return c
def options():
......@@ -124,7 +132,7 @@ def options():
ap.add_argument("--since", help="Filter Tweets sent since date (Example: 2017-12-27).")
ap.add_argument("--until", help="Filter Tweets sent until date (Example: 2017-12-27).")
ap.add_argument("--fruit", help="Display 'low-hanging-fruit' Tweets.", action="store_true")
ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).",
ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).",
action="store_true")
ap.add_argument("--csv", help="Write as .csv file.", action="store_true")
ap.add_argument("--json", help="Write as .json file", action="store_true")
......@@ -134,7 +142,10 @@ def options():
ap.add_argument("--count", help="Display number of Tweets scraped at the end of session.",
action="store_true")
ap.add_argument("--stats", help="Show number of replies, retweets, and likes.", action="store_true")
ap.add_argument("-db", "--database", help="Store Tweets in a sqlite3 database.")
ap.add_argument("--hostname", help="Store the mysql database host")
ap.add_argument("-db", "--database", help="Store Tweets in a sqlite3 or mysql database.")
ap.add_argument("--DB_user", help="Store the mysql database user")
ap.add_argument("--DB_pwd", help="Store the mysql database pwd")
ap.add_argument("--to", help="Search Tweets to a user.")
ap.add_argument("--all", help="Search all Tweets associated with a user.")
ap.add_argument("--followers", help="Scrape a person's followers.", action="store_true")
......@@ -149,12 +160,14 @@ def options():
ap.add_argument("--format", help="Custom output format (See wiki for details).")
ap.add_argument("--user-full", help="Collect all user information (Use with followers or following only).",
action="store_true")
ap.add_argument("--profile-full",
ap.add_argument("--profile-full",
help="Slow, but effective method of collecting a user's Tweets (Including Retweets).",
action="store_true")
ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.")
ap.add_argument("--pandas-type", help="Specify HDF5 or Pickle (HDF5 as default)")
ap.add_argument("--search_name", help="name for identify the search like -3dprinter stuff- only for mysql")
args = ap.parse_args()
return args
def main():
......
CREATE DATABASE IF NOT EXISTS `twitterdata_v9` /*!40100 DEFAULT CHARACTER SET utf8mb4 */;
USE `twitterdata_v9`;
-- MySQL dump 10.13 Distrib 5.7.22, for Linux (x86_64)
--
-- Host: localhost Database: twitterdata_v9
-- ------------------------------------------------------
-- Server version 5.7.22-0ubuntu0.16.04.1
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8 */;
/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
/*!40103 SET TIME_ZONE='+00:00' */;
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
--
-- Table structure for table `followers`
--
DROP TABLE IF EXISTS `followers`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Profile rows written by dbmysql.user() when its Followers flag is truthy.
-- That function uses a positional INSERT, so the 18 columns below must stay
-- in the same order as its value tuple (User.id ... User.avatar, the insert
-- timestamp, then the Username argument).
CREATE TABLE `followers` (
`id` bigint(30) NOT NULL,
`name` mediumtext,
`username` text NOT NULL,
`bio` longtext,
`location` tinytext,
`url` longtext,
`join_date` tinytext NOT NULL,
`join_time` tinytext NOT NULL,
`tweets` int(11) DEFAULT NULL,
`following` int(11) DEFAULT NULL,
`followers` int(11) DEFAULT NULL,
`likes` int(11) DEFAULT NULL,
`media` int(11) DEFAULT NULL,
`private` tinytext NOT NULL,
`verified` tinytext NOT NULL,
`avatar` longtext NOT NULL,
-- Filled with str(datetime.now()) taken at insert time.
`date_update` datetime NOT NULL,
-- Receives the Username argument of dbmysql.user().
`follower` text NOT NULL,
-- TEXT key parts carry an explicit prefix length of 255. dbmysql.user()
-- swallows MySQLdb.IntegrityError, so rows rejected by this key are skipped.
PRIMARY KEY (`follower`(255),`username`(255),`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `followers_names`
--
DROP TABLE IF EXISTS `followers_names`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Name-only rows written by dbmysql.follow() when its Followers flag is
-- truthy. The INSERT is positional: (User, insert timestamp, Username), so
-- the column order below must not change.
CREATE TABLE `followers_names` (
`user` text NOT NULL,
`date_update` datetime NOT NULL,
`follower` text NOT NULL,
-- Both key parts are TEXT columns and therefore use a prefix length of 255.
PRIMARY KEY (`user`(255),`follower`(255))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `following`
--
DROP TABLE IF EXISTS `following`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Profile rows written by dbmysql.user() when its Followers flag is falsy.
-- Same 18-column layout as `followers` except for the last column name; the
-- order must match the positional value tuple built in dbmysql.user().
CREATE TABLE `following` (
`id` bigint(30) NOT NULL,
`name` mediumtext,
`username` text NOT NULL,
`bio` longtext,
`location` tinytext,
`url` longtext,
`join_date` tinytext NOT NULL,
`join_time` tinytext NOT NULL,
`tweets` int(11) DEFAULT NULL,
`following` int(11) DEFAULT NULL,
`followers` int(11) DEFAULT NULL,
`likes` int(11) DEFAULT NULL,
`media` int(11) DEFAULT NULL,
`private` tinytext NOT NULL,
`verified` tinytext NOT NULL,
`avatar` longtext NOT NULL,
-- Filled with str(datetime.now()) taken at insert time.
`date_update` datetime NOT NULL,
-- Receives the Username argument of dbmysql.user().
`follows` text NOT NULL,
-- TEXT key parts carry an explicit prefix length of 255. dbmysql.user()
-- swallows MySQLdb.IntegrityError, so rows rejected by this key are skipped.
PRIMARY KEY (`id`,`username`(255),`follows`(255))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `following_names`
--
DROP TABLE IF EXISTS `following_names`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Name-only rows written by dbmysql.follow() when its Followers flag is
-- falsy. The INSERT is positional: (User, insert timestamp, Username), so
-- the column order below must not change.
CREATE TABLE `following_names` (
`user` text NOT NULL,
`date_update` datetime NOT NULL,
`follows` text NOT NULL,
-- Both key parts are TEXT columns and therefore use a prefix length of 255.
PRIMARY KEY (`user`(255),`follows`(255))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `tweets`
--
DROP TABLE IF EXISTS `tweets`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Tweet rows written by dbmysql.tweets() with a positional 18-value INSERT:
-- the columns below must stay in the same order as that function's tuple
-- (Tweet.id ... joined mentions, insert timestamp, config.search_name).
CREATE TABLE `tweets` (
`id` bigint(30) NOT NULL,
`user_id` bigint(30) DEFAULT NULL,
`date` date NOT NULL,
`time` time NOT NULL,
`timezone` tinytext NOT NULL,
`location` tinytext NOT NULL,
`user` text NOT NULL,
`tweet` longtext NOT NULL,
`replies` int(11) DEFAULT NULL,
`likes` int(11) DEFAULT NULL,
`retweets` int(11) DEFAULT NULL,
-- Stored as one comma-joined string (",".join(Tweet.hashtags)).
`hashtags` longtext,
`link` longtext,
`retweet` int(1) DEFAULT NULL,
`user_rt` text,
-- Stored as one comma-joined string (",".join(Tweet.mentions)).
`mentions` longtext,
-- Filled with str(datetime.now()) taken at insert time.
`date_update` datetime NOT NULL,
`search_name` mediumtext NOT NULL COMMENT 'user can use this field to know from which search the info comes. max 255 chars. if the user do not especify, it must be set to "-" ',
-- Keyed on (id, search_name) so the same tweet can be stored once per search;
-- search_name is a TEXT type and is keyed by a prefix length of 255.
PRIMARY KEY (`id`,`search_name`(255))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
/*!40101 SET character_set_client = @saved_cs_client */;
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
-- Dump completed on 2018-06-03 14:52:08
......@@ -5,4 +5,5 @@ cchardet
elasticsearch
pysocks
pandas
tables
\ No newline at end of file
tables
mysqlclient
VERSION = (1, 1, 3, 4)
VERSION = (1, 1, 3, 4) #mysql support
__version__ = '.'.join(map(str, VERSION))
......@@ -20,8 +20,11 @@ class Config:
Show_hashtags = False
Limit = None
Count = None
Stats = False
Stats = False
hostname = None #mysql
Database = None
DB_user = None #mysql
DB_pwd = None #mysql
To = None
All = None
Debug = False
......@@ -37,4 +40,5 @@ class Config:
Store_object = False
Store_pandas = False
Pandas_type = None
Pandas = False
\ No newline at end of file
Pandas = False
search_name = "-" #for identify a records in mysql with the search it provides from. it cannot be null for DB requirements. a tweet must be in several search so the PK are tweet ID and search_name
......@@ -21,7 +21,7 @@ def init(db):
table_tweets = """
CREATE TABLE IF NOT EXISTS
tweets (
id integer primary key,
id integer not null,
user_id integer,
date text not null,
time text not null,
......@@ -37,7 +37,9 @@ def init(db):
retweet bool,
user_rt text,
mentions text,
date_update text not null
date_update text not null,
search_name text not null,
PRIMARY KEY (id, search_name)
);
"""
cursor.execute(table_tweets)
......@@ -175,7 +177,7 @@ def user(conn, Username, Followers, User):
except sqlite3.IntegrityError:
pass
def tweets(conn, Tweet):
def tweets(conn, Tweet, config):
try:
date_time = str(datetime.now())
cursor = conn.cursor()
......@@ -195,8 +197,9 @@ def tweets(conn, Tweet):
Tweet.retweet,
Tweet.user_rt,
",".join(Tweet.mentions),
date_time,)
cursor.execute('INSERT INTO tweets VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
date_time,
config.search_name,)
cursor.execute('INSERT INTO tweets VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
conn.commit()
except sqlite3.IntegrityError:
pass
from datetime import datetime
import MySQLdb
import sys
def Conn(hostname, Database, db_user, db_pwd):
    """Return the MySQL connection to store results in, if one was requested.

    Args:
        hostname: MySQL server host, forwarded to ``init``.
        Database: Database name; a falsy value means "no database output".
        db_user: MySQL user name, forwarded to ``init``.
        db_pwd: MySQL password, forwarded to ``init``.

    Returns:
        The connection object produced by ``init``, or ``""`` when
        ``Database`` is falsy.

    Exits the process with status 1 when ``init`` reports a failure.
    """
    if not Database:
        return ""

    print("[+] Inserting into Database: " + str(Database))
    conn = init(hostname, Database, db_user, db_pwd)
    if isinstance(conn, str):
        # init() signals failure by returning the error text instead of a
        # connection; show that text (not the ``str`` type) before exiting.
        print(conn)
        sys.exit(1)
    return conn
def init(hostname, Database, db_user, db_pwd):
    """Open a utf8mb4 connection to the given MySQL database.

    Args:
        hostname: MySQL server host (usually ``localhost``).
        Database: Name of the database to use.
        db_user: MySQL user name.
        db_pwd: MySQL password.

    Returns:
        The open connection on success, or the error message as ``str`` when
        connecting fails; callers tell the two apart by type.
    """
    try:
        # TODO: create the tables here when they do not exist yet.
        return MySQLdb.connect(
            host=hostname,
            user=db_user,
            passwd=db_pwd,
            db=Database,
            charset='utf8mb4',
            use_unicode=True,
        )
    except Exception as e:
        # Deliberately broad: every failure is handed back as text so the
        # caller can print it and exit cleanly.
        return str(e)
def fTable(Followers):
    """Pick the name-only table: followers when truthy, otherwise following."""
    return "followers_names" if Followers else "following_names"
def uTable(Followers):
    """Pick the full-profile table: followers when truthy, otherwise following."""
    return "followers" if Followers else "following"
def follow(conn, Username, Followers, User):
    """Insert one row into the followers/following name table.

    Args:
        conn: Open MySQL connection.
        Username: Stored in the last column of the row.
        Followers: Truthy selects ``followers_names``, falsy
            ``following_names`` (see ``fTable``).
        User: Stored in the first column of the row.

    The INSERT is positional: (User, current timestamp, Username).
    Integrity errors (for example a duplicate primary key) are ignored, so
    the row is simply skipped.
    """
    entry = (User, str(datetime.now()), Username)
    query = 'INSERT INTO {} VALUES(%s,%s,%s)'.format(fTable(Followers))
    cursor = conn.cursor()
    try:
        cursor.execute(query, entry)
        conn.commit()
    except MySQLdb.IntegrityError:
        pass
    finally:
        # Release the cursor whether or not the insert succeeded.
        cursor.close()
def user(conn, Username, Followers, User):
    """Insert one full profile row into the followers/following table.

    Args:
        conn: Open MySQL connection.
        Username: Stored in the last column of the row.
        Followers: Truthy selects ``followers``, falsy ``following``
            (see ``uTable``).
        User: Object exposing the profile attributes read below.

    The INSERT is positional, so the tuple order must match the table's
    column order. Integrity errors (for example a duplicate primary key) are
    ignored, so the row is simply skipped.
    """
    entry = (
        User.id,
        User.name,
        User.username,
        User.bio,
        User.location,
        User.url,
        User.join_date,
        User.join_time,
        User.tweets,
        User.following,
        User.followers,
        User.likes,
        User.media_count,
        User.is_private,
        User.is_verified,
        User.avatar,
        str(datetime.now()),
        Username,
    )
    # One %s per value keeps the placeholder list in step with the tuple.
    placeholders = ",".join(["%s"] * len(entry))
    query = 'INSERT INTO {} VALUES({})'.format(uTable(Followers), placeholders)
    cursor = conn.cursor()
    try:
        cursor.execute(query, entry)
        conn.commit()
    except MySQLdb.IntegrityError:
        pass
    finally:
        # Release the cursor whether or not the insert succeeded.
        cursor.close()
def tweets(conn, Tweet, config):
    """Insert one tweet row into the ``tweets`` table.

    Args:
        conn: Open MySQL connection.
        Tweet: Object exposing the tweet attributes read below; ``hashtags``
            and ``mentions`` are iterables of strings and are stored
            comma-joined.
        config: Object whose ``search_name`` labels the search this row
            belongs to.

    The INSERT is positional, so the tuple order must match the table's
    column order. Integrity errors (for example a duplicate primary key) are
    ignored, so the row is simply skipped.
    """
    # search_name is NOT NULL and part of the primary key; the schema asks
    # for "-" when the user did not specify one.
    search_name = config.search_name if config.search_name is not None else "-"
    entry = (
        Tweet.id,
        Tweet.user_id,
        Tweet.datestamp,
        Tweet.timestamp,
        Tweet.timezone,
        Tweet.location,
        Tweet.username,
        Tweet.tweet,
        Tweet.replies,
        Tweet.likes,
        Tweet.retweets,
        ",".join(Tweet.hashtags),
        Tweet.link,
        Tweet.retweet,
        Tweet.user_rt,
        ",".join(Tweet.mentions),
        str(datetime.now()),
        search_name,
    )
    cursor = conn.cursor()
    try:
        cursor.execute('INSERT INTO tweets VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', entry)
        conn.commit()
    except MySQLdb.IntegrityError:
        pass
    finally:
        # Release the cursor whether or not the insert succeeded.
        cursor.close()
from . import feed, get, db, output, verbose
from . import feed, get, db, output, verbose, dbmysql
class Favorites:
def __init__(self, config):
......@@ -6,7 +6,10 @@ class Favorites:
self.feed = [-1]
self.count = 0
self.config = config
self.conn = db.Conn(config.Database)
if config.hostname:
self.conn = dbmysql.Conn(config.hostname, config.Database, config.DB_user, config.DB_pwd)
else:
self.conn = db.Conn(config.Database)
self.config.Favorites = True
verbose.Elastic(config)
......
from . import feed, get, db, output, verbose
from . import feed, get, db, output, verbose, dbmysql
class Follow:
def __init__(self, config):
......@@ -6,7 +6,10 @@ class Follow:
self.feed = [-1]
self.count = 0
self.config = config
self.conn = db.Conn(config.Database)
if config.hostname:
self.conn = dbmysql.Conn(config.hostname, config.Database, config.DB_user, config.DB_pwd)
else:
self.conn = db.Conn(config.Database)
verbose.Elastic(config)
async def Feed(self):
......
from datetime import datetime
from . import db, elasticsearch, format, write, Pandas
from . import db, elasticsearch, format, write, Pandas, dbmysql
from .tweet import Tweet
from .user import User
tweets_object = []
def datecheck(datestamp, config):
......@@ -29,10 +30,9 @@ def _output(obj, output, config):
write.Json(obj, config)
else:
write.Text(output, config.Output)
if config.Pandas:
Pandas.update(obj, config.Essid)
if config.Elasticsearch:
if config.Store_object:
tweets_object.append(obj)
......@@ -53,19 +53,22 @@ async def Tweets(tw, location, config, conn):
tweet = Tweet(tw, location, config)
if datecheck(tweet.datestamp, config):
output = format.Tweet(config, tweet)
if config.Database:
db.tweets(conn, tweet)
if config.hostname:
dbmysql.tweets(conn, tweet, config)
elif config.Database:
db.tweets(conn, tweet, config)
if config.Elasticsearch:
elasticsearch.Tweet(tweet, config.Elasticsearch, config.Essid)
_output(tweet, output, config)
async def Users(u, config, conn):
user = User(u)
output = format.User(config.Format, user)
if config.Database:
if config.hostname:
dbmysql.user(conn, config.Username, config.Followers, user)
elif config.Database:
db.user(conn, config.Username, config.Followers, user)
if config.Elasticsearch:
......@@ -77,11 +80,13 @@ async def Users(u, config, conn):
config.Username, config.Essid)
user.join_date = _save_date
user.join_time = _save_time
_output(user, output, config)
async def Username(username, config, conn):
if config.Database:
if config.hostname:
dbmysql.follow(conn, config.Username, config.Followers, username)
elif config.Database:
db.follow(conn, config.Username, config.Followers, username)
if config.Elasticsearch:
......
from . import db, get, feed, output, verbose
from . import db, get, feed, output, verbose, dbmysql
class Profile:
def __init__(self, config):
......@@ -6,7 +6,12 @@ class Profile:
self.feed = [-1]
self.count = 0
self.config = config
self.conn = db.Conn(config.Database)
if config.hostname:
self.conn = dbmysql.Conn(config.hostname, config.Database, config.DB_user, config.DB_pwd)
else:
self.conn = db.Conn(config.Database)
self.config.Profile = True
verbose.Elastic(config)
......
......@@ -6,6 +6,10 @@ def run(x):
def Favorites(config):
    """Run a favorites scrape, labelling stored rows with the target user."""
    config.Favorites = True
    # Label the search with the username when given, otherwise the user id,
    # so stored rows can be traced back to the account they relate to.
    owner = config.Username or config.User_id
    config.search_name = "Favourites" + str(owner)
    run(favorites.Favorites(config).main())
def Followers(config):
......
from . import datelock, db, get, feed, output, verbose
from . import datelock, db, get, feed, output, verbose, dbmysql
from datetime import timedelta
class Search:
......@@ -7,7 +7,10 @@ class Search:
self.feed = [-1]
self.count = 0
self.config = config
self.conn = db.Conn(config.Database)
if config.hostname:
self.conn = dbmysql.Conn(config.hostname, config.Database, config.DB_user, config.DB_pwd)
else:
self.conn = db.Conn(config.Database)
self.d = datelock.Set(self.config.Until, self.config.Since)
self.config.TwitterSearch = True
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment