Commit 65239911 authored by Francesco Poldi

New features and new index

parent 1fe95cca
...@@ -15,6 +15,7 @@ Copyright (c) 2018 Cody Zacharias ...@@ -15,6 +15,7 @@ Copyright (c) 2018 Cody Zacharias
import argparse import argparse
import twint import twint
import sys import sys
import os
def error(error, message): def error(error, message):
print("[-] {}: {}".format(error, message)) print("[-] {}: {}".format(error, message))
...@@ -45,7 +46,7 @@ def check(args): ...@@ -45,7 +46,7 @@ def check(args):
elif args.proxy_type.lower() == "http": elif args.proxy_type.lower() == "http":
_type = socks.HTTP _type = socks.HTTP
else: else:
error("Error", "Proxy type allower are: socks5, socks4 and http.") error("Error", "Proxy type allowed are: socks5, socks4 and http.")
import socks, socket import socks, socket
socks.set_default_proxy(_type, args.proxy_host, int(args.proxy_port)) socks.set_default_proxy(_type, args.proxy_host, int(args.proxy_port))
socket.socket = socks.socksocket socket.socket = socks.socksocket
...@@ -55,7 +56,15 @@ def check(args): ...@@ -55,7 +56,15 @@ def check(args):
if args.proxy_port or args.proxy_type: if args.proxy_port or args.proxy_type:
error("Error", "Please specify --proxy-host, --proxy-port and --proxy-type") error("Error", "Please specify --proxy-host, --proxy-port and --proxy-type")
def loadUserList(ul):
    """Build the URL-encoded multi-user search clause from a user list.

    ``ul`` is either a comma-separated string of usernames or the path to
    a text file holding one username per line.  A string that names an
    existing file is read as a file; any other string is split on commas.
    A non-string argument is handed to ``open()`` as a path.

    Usernames are joined with ``%20OR%20from%3A`` (URL-encoded
    `` OR from:``).  The first username carries no prefix, so
    ``"a,b"`` becomes ``"a%20OR%20from%3Ab"``.
    NOTE(review): presumably the query builder prepends the leading
    ``from:`` itself -- confirm against the caller.

    Surrounding whitespace is stripped and empty entries are ignored, so
    trailing commas or blank lines do not produce empty ``from:`` terms.

    Returns:
        The joined clause, or ``""`` when no usernames were supplied.
    """
    separator = "%20OR%20from%3A"

    # argparse always supplies a str, so a file has to be detected by
    # checking the filesystem rather than by the argument's type.
    if isinstance(ul, str) and not os.path.isfile(ul):
        candidates = ul.split(",")
    else:
        # Read every line (not just the first) and close the handle.
        with open(ul, "r") as handle:
            candidates = handle.read().splitlines()

    names = (entry.strip() for entry in candidates)
    return separator.join(name for name in names if name)
def initialize(args): def initialize(args):
...@@ -90,6 +99,8 @@ def initialize(args): ...@@ -90,6 +99,8 @@ def initialize(args):
c.Proxy_type = args.proxy_type c.Proxy_type = args.proxy_type
c.Proxy_host = args.proxy_host c.Proxy_host = args.proxy_host
c.Proxy_port = args.proxy_port c.Proxy_port = args.proxy_port
c.Essid = args.essid
c.Userlist = args.userlist
return c return c
def options(): def options():
...@@ -125,14 +136,21 @@ def options(): ...@@ -125,14 +136,21 @@ def options():
ap.add_argument("--favorites", help="Scrape Tweets a user has liked.", action="store_true") ap.add_argument("--favorites", help="Scrape Tweets a user has liked.", action="store_true")
ap.add_argument("--debug", help="Debug mode", action="store_true") ap.add_argument("--debug", help="Debug mode", action="store_true")
ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.") ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.")
ap.add_argument("--proxy-host", help="Proxy hostname or IP") ap.add_argument("--proxy-host", help="Proxy hostname or IP.")
ap.add_argument("--proxy-port", help="The port of the proxy server") ap.add_argument("--proxy-port", help="The port of the proxy server.")
ap.add_argument("--essid", help="Elasticsearch Session ID, use this to differentiate scraping sessions.")
ap.add_argument("--userlist", help="Userlist from list or file.")
args = ap.parse_args() args = ap.parse_args()
return args return args
def main(): def main():
args = options() args = options()
check(args) check(args)
if args.userlist:
args.username = loadUserList(args.userlist)
print(args.username)
c = initialize(args) c = initialize(args)
if args.favorites: if args.favorites:
......
PUT twint PUT twint2
{ {
"mappings" : { "mappings" : {
"items": { "items": {
...@@ -16,7 +16,8 @@ PUT twint ...@@ -16,7 +16,8 @@ PUT twint
"username": {"type": "keyword"}, "username": {"type": "keyword"},
"day": {"type": "keyword"}, "day": {"type": "keyword"},
"hour": {"type": "keyword"}, "hour": {"type": "keyword"},
"link": {"type": "text"} "link": {"type": "text"},
"essid": {"type": "keyword"}
} }
} }
} }
......
...@@ -31,3 +31,5 @@ class Config: ...@@ -31,3 +31,5 @@ class Config:
Proxy_type = None Proxy_type = None
Proxy_host = None Proxy_host = None
Proxy_port = None Proxy_port = None
Essid = None
Userlist = None
...@@ -2,9 +2,11 @@ from elasticsearch import Elasticsearch, helpers ...@@ -2,9 +2,11 @@ from elasticsearch import Elasticsearch, helpers
import contextlib import contextlib
import datetime import datetime
import time import time
import sys
class RecycleObject(object): class RecycleObject(object):
def write(self, junk): pass def write(self, junk): pass
def flush(self): pass
@contextlib.contextmanager @contextlib.contextmanager
def nostdout(): def nostdout():
...@@ -26,7 +28,7 @@ def weekday(day): ...@@ -26,7 +28,7 @@ def weekday(day):
return weekdays[day] return weekdays[day]
def Elastic(Tweet, config): def Tweet(Tweet, es, session):
# Todo play around with this some more # Todo play around with this some more
day = weekday(Tweet.date.strftime("%A")) day = weekday(Tweet.date.strftime("%A"))
...@@ -40,7 +42,7 @@ def Elastic(Tweet, config): ...@@ -40,7 +42,7 @@ def Elastic(Tweet, config):
j_data = { j_data = {
"_index": "twint", "_index": "twint",
"_type": "items", "_type": "items",
"_id": Tweet.id + "_raw", "_id": Tweet.id + "_raw_" + session,
"_source": { "_source": {
"id": Tweet.id, "id": Tweet.id,
"date": dt, "date": dt,
...@@ -52,7 +54,8 @@ def Elastic(Tweet, config): ...@@ -52,7 +54,8 @@ def Elastic(Tweet, config):
"username": Tweet.username, "username": Tweet.username,
"day": day, "day": day,
"hour": Tweet.time.strftime("%H"), "hour": Tweet.time.strftime("%H"),
"link": Tweet.link "link": Tweet.link,
"essid": session
} }
} }
...@@ -62,7 +65,7 @@ def Elastic(Tweet, config): ...@@ -62,7 +65,7 @@ def Elastic(Tweet, config):
j_data = { j_data = {
"_index": "twint", "_index": "twint",
"_type": "items", "_type": "items",
"_id": Tweet.id + "_likes_" + str(nLikes), "_id": Tweet.id + "_likes_" + str(nLikes) + "_" + session,
"_source": { "_source": {
"id": Tweet.id, "id": Tweet.id,
"date": dt, "date": dt,
...@@ -75,7 +78,8 @@ def Elastic(Tweet, config): ...@@ -75,7 +78,8 @@ def Elastic(Tweet, config):
"username": Tweet.username, "username": Tweet.username,
"day": day, "day": day,
"hour": Tweet.time.strftime("%H"), "hour": Tweet.time.strftime("%H"),
"link": Tweet.link "link": Tweet.link,
"essid": session
} }
} }
...@@ -86,7 +90,7 @@ def Elastic(Tweet, config): ...@@ -86,7 +90,7 @@ def Elastic(Tweet, config):
j_data = { j_data = {
"_index": "twint", "_index": "twint",
"_type": "items", "_type": "items",
"_id": Tweet.id + "_replies_" + str(nReplies), "_id": Tweet.id + "_replies_" + str(nReplies) + "_" + session,
"_source": { "_source": {
"id": Tweet.id, "id": Tweet.id,
"date": dt, "date": dt,
...@@ -99,7 +103,8 @@ def Elastic(Tweet, config): ...@@ -99,7 +103,8 @@ def Elastic(Tweet, config):
"username": Tweet.username, "username": Tweet.username,
"day": day, "day": day,
"hour": Tweet.time.strftime("%H"), "hour": Tweet.time.strftime("%H"),
"link": Tweet.link "link": Tweet.link,
"essid": session
} }
} }
...@@ -110,7 +115,7 @@ def Elastic(Tweet, config): ...@@ -110,7 +115,7 @@ def Elastic(Tweet, config):
j_data = { j_data = {
"_index": "twint", "_index": "twint",
"_type": "items", "_type": "items",
"_id": Tweet.id + "_retweets_" + str(nRetweets), "_id": Tweet.id + "_retweets_" + str(nRetweets) + "_" + session,
"_source": { "_source": {
"id": Tweet.id, "id": Tweet.id,
"date": dt, "date": dt,
...@@ -123,14 +128,36 @@ def Elastic(Tweet, config): ...@@ -123,14 +128,36 @@ def Elastic(Tweet, config):
"username": Tweet.username, "username": Tweet.username,
"day": day, "day": day,
"hour": Tweet.time.strftime("%H"), "hour": Tweet.time.strftime("%H"),
"link": Tweet.link "link": Tweet.link,
"essid": session
} }
} }
actions.append(j_data) actions.append(j_data)
nRetweets += 1 nRetweets += 1
es = Elasticsearch(config.Elasticsearch) es = Elasticsearch(es)
with nostdout(): with nostdout():
helpers.bulk(es, actions, chunk_size=2000, request_timeout=200) helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
actions = [] actions = []
def Follow(es, user, follow, session):
    """Index a single follow relationship into Elasticsearch.

    Args:
        es: Value passed straight to ``Elasticsearch(...)`` to create the
            client (the callers pass ``config.Elasticsearch``).
        user: Username stored in the ``user`` field.
        follow: Username stored in the ``follow`` field.
        session: Scraping-session identifier (``config.Essid``).  It is
            stored in the ``essid`` field and appended to the document
            id.  May be ``None`` when no session id was configured.

    The document is written to the ``twintgraph2`` index with type
    ``items`` through ``helpers.bulk``.
    """
    # config.Essid defaults to None; concatenating None onto the id would
    # raise TypeError, so an unset session contributes an empty suffix.
    session_tag = "" if session is None else str(session)

    action = {
        "_index": "twintgraph2",
        "_type": "items",
        "_id": user + "_" + follow + "_" + session_tag,
        "_source": {
            "user": user,
            "follow": follow,
            "essid": session
        }
    }

    client = Elasticsearch(es)
    # nostdout() is defined elsewhere in this module; presumably it
    # silences output produced during the bulk call.
    with nostdout():
        helpers.bulk(client, [action], chunk_size=2000, request_timeout=200)
\ No newline at end of file
from . import feed, get, db, output from . import feed, get, db, output, elasticsearch
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import aiohttp import aiohttp
import asyncio import asyncio
...@@ -45,6 +45,9 @@ class Followers: ...@@ -45,6 +45,9 @@ class Followers:
if self.config.Output != None: if self.config.Output != None:
output.write(User.name, self.config.Output) output.write(User.name, self.config.Output)
if self.config.Elasticsearch:
elasticsearch.Follow(self.config.Elasticsearch, User.name, self.config.Username, self.config.Essid)
self.count += 1 self.count += 1
print(User.name) print(User.name)
......
from . import feed, get, db, output from . import feed, get, db, output, elasticsearch
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import aiohttp import aiohttp
import asyncio import asyncio
...@@ -45,6 +45,9 @@ class Following: ...@@ -45,6 +45,9 @@ class Following:
if self.config.Output != None: if self.config.Output != None:
output.write(User.name, self.config.Output) output.write(User.name, self.config.Output)
if self.config.Elasticsearch:
elasticsearch.Follow(self.config.Elasticsearch, self.config.Username, User.name, self.config.Essid)
self.count += 1 self.count += 1
print(User.name) print(User.name)
......
...@@ -165,7 +165,7 @@ async def Tweets(tw, location, config, conn): ...@@ -165,7 +165,7 @@ async def Tweets(tw, location, config, conn):
if config.Database: if config.Database:
db.tweets(conn, Tweet) db.tweets(conn, Tweet)
if config.Elasticsearch: if config.Elasticsearch:
elasticsearch.Elastic(Tweet, config) elasticsearch.Tweet(Tweet, config.Elasticsearch, config.Essid)
if config.Users_only: if config.Users_only:
output = Tweet.username output = Tweet.username
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment