nanahira / Twint · Commits

Commit 7c1b04f8

Added ability to scrape followers & more

Authored Apr 10, 2018 by Cody Zacharias, committed by GitHub on Apr 10, 2018
Parent: f6edadd1
Showing 1 changed file with 239 additions and 84 deletions.

tweep.py (+239, -84) @ 7c1b04f8
 #!/usr/bin/python3
+'''
+Twint.py - Twitter Intelligence (formerly known as Tweep).
+Written by Cody Zacharias (@now)
+Special thanks to @hpiedcoq & @pielco11 for contributing
+several search and storing options.
+See wiki on Github for in-depth details.
+https://github.com/haccer/twint/wiki
+Licensed under MIT License
+Copyright (c) 2018 Cody Zacharias
+'''
 from bs4 import BeautifulSoup
 from elasticsearch import Elasticsearch, helpers
 from time import gmtime, strftime
@@ -6,6 +19,7 @@ import argparse
 import aiohttp
 import asyncio
 import async_timeout
+import concurrent.futures
 import contextlib
 import csv
 import datetime
@@ -30,7 +44,7 @@ def nostdout():
 def initdb(db):
     '''
-    Creates a new SQLite database or connects to
-    it if exists
+    Creates a new SQLite database or connects to
+    an existing one.
     '''
     try:
         conn = sqlite3.connect(db)
@@ -76,74 +90,102 @@ def initdb(db):
     except Exception as e:
         return str(e)

+def getAction():
+    if arg.following:
+        action = "following"
+    elif arg.followers:
+        action = "followers"
+    elif arg.favorites:
+        action = "favorites"
+    else:
+        action = ""
+    return action
+
 async def getUrl(init):
     '''
     URL Descision:
-    Tweep utilizes positions of Tweet's from Twitter's search feature to
+    Twint utilizes positions of Tweet's from Twitter's search feature to
     iterate through a user's Twitter feed. This section decides whether
-    this is the first URL request or not and develops the URL based on the
+    this is the first URL request or not and forms the URL based on the
     args given.
+    Mobile Twitter URLs are used to collect a Twitter user's Followers,
+    Followings, and Favorites.
     Returns complete URL.
     '''
+    action = getAction()
     if init == -1:
-        url = "https://twitter.com/search?f=tweets&vertical=default&lang=en&q="
+        if action != "":
+            url = "https://mobile.twitter.com/{0.u}/{1}?".format(arg, action)
+        else:
+            url = "https://twitter.com/search?f=tweets&vertical=default&lang=en&q="
     else:
-        url = "https://twitter.com/i/search/timeline?f=tweets&vertical=default"
-        url += "&lang=en&include_available_features=1&include_entities=1&reset_"
-        url += "error_state=false&src=typd&max_position={}&q=".format(init)
-    if arg.l != None:
-        url = url.replace("lang=en", "l={0.l}&lang=en".format(arg))
-    if arg.u != None:
-        url += "from%3A{0.u}".format(arg)
-    if arg.g != None:
-        arg.g = arg.g.replace(" ", "")
-        url += "geocode%3A{0.g}".format(arg)
-    if arg.s != None:
-        arg.s = arg.s.replace(" ", "%20").replace("#", "%23")
-        url += "%20{0.s}".format(arg)
-    if arg.year != None:
-        url += "%20until%3A{0.year}-1-1".format(arg)
-    if arg.since != None:
-        url += "%20since%3A{0.since}".format(arg)
-    if arg.until != None:
-        url += "%20until%3A{0.until}".format(arg)
-    if arg.fruit:
-        url += "%20myspace.com%20OR%20last.fm%20OR"
-        url += "%20mail%20OR%20email%20OR%20gmail%20OR%20e-mail"
-        url += "%20OR%20phone%20OR%20call%20me%20OR%20text%20me"
-        url += "%20OR%20keybase"
-    if arg.verified:
-        url += "%20filter%3Averified"
-    if arg.to:
-        url += "%20to%3A{0.to}".format(arg)
-    if arg.all:
-        url += "%20to%3A{0.all}%20OR%20from%3A{0.all}%20OR%20@{0.all}".format(arg)
+        if action != "":
+            if arg.favorites:
+                id = "max_id"
+            else:
+                id = "cursor"
+            url = "https://mobile.twitter.com/{0.u}/{1}?{2}={3}".format(arg, action, id, init)
+        else:
+            url = "https://twitter.com/i/search/timeline?f=tweets&vertical=default"
+            url += "&lang=en&include_available_features=1&include_entities=1&reset_"
+            url += "error_state=false&src=typd&max_position={}&q=".format(init)
+    if action == "":
+        if arg.l != None:
+            url = url.replace("lang=en", "l={0.l}&lang=en".format(arg))
+        if arg.u != None:
+            url += "from%3A{0.u}".format(arg)
+        if arg.g != None:
+            arg.g = arg.g.replace(" ", "")
+            url += "geocode%3A{0.g}".format(arg)
+        if arg.s != None:
+            arg.s = arg.s.replace(" ", "%20").replace("#", "%23")
+            url += "%20{0.s}".format(arg)
+        if arg.year != None:
+            url += "%20until%3A{0.year}-1-1".format(arg)
+        if arg.since != None:
+            url += "%20since%3A{0.since}".format(arg)
+        if arg.until != None:
+            url += "%20until%3A{0.until}".format(arg)
+        if arg.fruit:
+            url += "%20myspace.com%20OR%20last.fm%20OR"
+            url += "%20mail%20OR%20email%20OR%20gmail%20OR%20e-mail"
+            url += "%20OR%20phone%20OR%20call%20me%20OR%20text%20me"
+            url += "%20OR%20keybase"
+        if arg.verified:
+            url += "%20filter%3Averified"
+        if arg.to:
+            url += "%20to%3A{0.to}".format(arg)
+        if arg.all:
+            url += "%20to%3A{0.all}%20OR%20from%3A{0.all}%20OR%20@{0.all}".format(arg)
     return url
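For orientation, here is a rough sketch, separate from the commit itself, of the two URL shapes getUrl() assembles; the username, action and cursor values below are made up for illustration.

# Illustrative only -- not part of tweep.py; hypothetical values throughout.
user = "jack"                # a made-up username (-u)
action = "followers"         # set when --followers is passed
cursor = "1234567890"        # pagination cursor scraped from the previous page

first_page = "https://mobile.twitter.com/{}/{}?".format(user, action)
next_page = "https://mobile.twitter.com/{}/{}?cursor={}".format(user, action, cursor)
search = "https://twitter.com/search?f=tweets&vertical=default&lang=en&q=from%3A{}".format(user)
print(first_page)
print(next_page)
print(search)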
 async def fetch(session, url):
     '''
-    Basic aiohttp request with a 30 second timeout.
+    Standard aiohttp request with a 30 second timeout.
     '''
     with async_timeout.timeout(30):
         async with session.get(url) as response:
             return await response.text()
-async def initial(response):
+def initial(response):
     '''
     Initial response parsing and collecting the position ID
     '''
     soup = BeautifulSoup(response, "html.parser")
     feed = soup.find_all("li", "js-stream-item")
     init = "TWEET-{}-{}".format(feed[-1]["data-item-id"], feed[0]["data-item-id"])
     return feed, init
-async def cont(response):
+def cont(response):
     '''
-    Regular json response parsing and collecting Position ID
+    Regular JSON response parsing and collecting position ID
     '''
     json_response = json.loads(response)
     html = json_response["items_html"]
@@ -151,30 +193,78 @@ async def cont(response):
     feed = soup.find_all("li", "js-stream-item")
     split = json_response["min_position"].split("-")
     split[1] = feed[-1]["data-item-id"]
-    init = "-".join(split)
+    return feed, "-".join(split)
+
+def follow(response):
+    '''
+    Response and parsing of a user's followers or following list.
+    '''
+    soup = BeautifulSoup(response, "html.parser")
+    followers = soup.find_all("td", "info fifty screenname")
+    cursor = soup.find_all("div", "w-button-more")
+    # Try & Except neccessary for collecting the last feed.
+    try:
+        cursor = re.findall(r'cursor=(.*?)">', str(cursor))[0]
+    except:
+        pass
+    return followers, cursor
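A minimal sketch of how the cursor regex in follow() behaves, assuming a hypothetical snippet of the mobile "w-button-more" markup (the snippet below is an assumption, not captured output):

import re

# Hypothetical example markup; only the cursor=...> part matters to the regex.
sample = '<div class="w-button-more"><a href="/jack/followers?cursor=987654321">Show more</a></div>'
match = re.findall(r'cursor=(.*?)">', sample)
cursor = match[0] if match else None
print(cursor)  # 987654321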
+def favorite(response):
+    '''
+    Response and parsing of a user's favorites/likes list.
+    '''
+    soup = BeautifulSoup(response, "html.parser")
+    tweets = soup.find_all("span", "metadata")
+    max_id = soup.find_all("div", "w-button-more")
+    # Try & Except neccessary for collecting the last feed.
+    try:
+        max_id = re.findall(r'max_id=(.*?)">', str(max_id))[0]
+    except:
+        pass
+    return tweets, max_id
+
+async def getfeed(init):
+    '''
+    The magic user-agent was Lynx (but could be any old one).
+    If we want to collect a person's favorites, we're signalling
+    that function; if not, we're signalling the follow() function.
+    '''
+    ua = {'User-Agent': 'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/0.8.12'}
+    connect = aiohttp.TCPConnector(verify_ssl=False)
+    async with aiohttp.ClientSession(headers=ua, connector=connect) as session:
+        response = await fetch(session, await getUrl(init))
+    feed = []
+    try:
+        if arg.favorites:
+            feed, init = favorite(response)
+        else:
+            feed, init = follow(response)
+    except:
+        pass
+    return feed, init
 async def getFeed(init):
     '''
     Parsing Descision:
-    Responses from requests with the position id's are JSON,
+    Responses from requests with the position ID's are JSON,
     so this section decides whether this is an initial request
-    or not to use the approriate response reading for parsing
-    with BeautifulSoup4.
+    or not to use the appropriate function for parsing with
+    BeautifulSoup4.
     Returns html for Tweets and position id.
     '''
-    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(verify_ssl=False)) as session:
+    connect = aiohttp.TCPConnector(verify_ssl=False)
+    async with aiohttp.ClientSession(connector=connect) as session:
         response = await fetch(session, await getUrl(init))
     feed = []
     try:
         if init == -1:
-            feed, init = await initial(response)
+            feed, init = initial(response)
         else:
-            feed, init = await cont(response)
+            feed, init = cont(response)
     except:
-        # Tweep will realize that it's done scraping.
+        # Realize that it's done scraping.
         pass
     return feed, init
@@ -182,12 +272,12 @@ async def getFeed(init):
 async def outTweet(tweet):
     '''
     Parsing Section:
-    This function will create the desired output string and
-    write it to a file or csv if specified.
-    Returns output.
+    This function will create the desired output string
+    and store it if specified.
+    Returns output
     '''
-    tweetid = tweet["data-item-id"]
+    tweetid = tweet.find("div")["data-item-id"]
     # Formatting the date & time stamps just how I like it.
     datestamp = tweet.find("a", "tweet-timestamp")["title"].rpartition(" - ")[-1]
     d = datetime.datetime.strptime(datestamp, "%d %b %Y")
@@ -198,9 +288,9 @@ async def outTweet(tweet):
     # The @ in the username annoys me.
     username = tweet.find("span", "username").text.replace("@", "")
     timezone = strftime("%Z", gmtime())
-    # Replace all emoticons with their title, to be included in the tweet text
+    # Replace all emoticons with their title, to be included in the Tweet text
     for img in tweet.findAll("img", "Emoji Emoji--forText"):
-        img.replaceWith("<%s>" % img['aria-label'])
+        img.replaceWith("<{}>".format(img['aria-label']))
     # The context of the Tweet compressed into a single line.
     text = tweet.find("p", "tweet-text").text.replace("\n", "").replace("http", " http").replace("pic.twitter", " pic.twitter")
     # Regex for gathering hashtags
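A small sketch of the emoji substitution and text flattening shown above, run against a hypothetical Tweet fragment (the markup is an assumption, not captured Twitter output):

from bs4 import BeautifulSoup

html = '<p class="tweet-text">good morning <img class="Emoji Emoji--forText" aria-label="Emoji: Sun"> everyone\n</p>'
tweet = BeautifulSoup(html, "html.parser")
for img in tweet.findAll("img", "Emoji Emoji--forText"):
    img.replaceWith("<{}>".format(img['aria-label']))
text = tweet.find("p", "tweet-text").text.replace("\n", "")
print(text)  # good morning <Emoji: Sun> everyone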
@@ -210,7 +300,7 @@ async def outTweet(tweet):
     likes = tweet.find("span", "ProfileTweet-action--favorite u-hiddenVisually").find("span")["data-tweet-stat-count"]
     '''
     This part tries to get a list of mentions.
-    It sometimes gets slow with Tweets that contain
+    It sometimes gets slow with Tweets that contains
     40+ mentioned people.. rather than just appending
     the whole list to the Tweet, it goes through each
     one to make sure there arn't any duplicates.
@@ -223,15 +313,9 @@ async def outTweet(tweet):
             text = "{} {}".format(mention, text)
         except:
             pass
-    # Preparing to output
-    '''
-    There were certain cases where I used Tweep
-    to gather a list of users and then fed that
-    generated list into Tweep. That's why these
-    modes exist.
-    '''
+    # Preparing storage
     if arg.database:
         try:
             cursor = conn.cursor()
@@ -287,7 +371,7 @@ async def outTweet(tweet):
                 "hour": time.split(":")[0]
             }
             j_data = {
-                "_index": "tweep",
+                "_index": "twint",
                 "_type": "items",
                 "_id": tweetid + "_likes_" + str(nLikes),
                 "_source": jObject

@@ -307,7 +391,7 @@ async def outTweet(tweet):
                 "hour": time.split(":")[0]
             }
             j_data = {
-                "_index": "tweep",
+                "_index": "twint",
                 "_type": "items",
                 "_id": tweetid + "_replies_" + str(nReplies),
                 "_source": jObject

@@ -327,7 +411,7 @@ async def outTweet(tweet):
                 "hour": time.split(":")[0]
             }
             j_data = {
-                "_index": "tweep",
+                "_index": "twint",
                 "_type": "items",
                 "_id": tweetid + "_retweets_" + str(nRetweets),
                 "_source": jObject
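For context, a hedged sketch of how documents shaped like j_data above are pushed with the elasticsearch helpers imported at the top of the file; the host, index mapping and field values here are assumptions for illustration only:

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(["http://localhost:9200"])   # assumed local cluster
actions = [{
    "_index": "twint",
    "_type": "items",
    "_id": "987654321_likes_42",                # hypothetical tweetid + "_likes_" + count
    "_source": {"likes": 42, "hour": "13"},
}]
helpers.bulk(es, actions)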
@@ -347,9 +431,9 @@ async def outTweet(tweet):
     else:
         '''
         The standard output is how I like it, although
-        this can be modified to your desire. Uncomment
-        the bottom line and add in the variables in the
-        order you want them or how you want it to look.
+        This can be modified to your desire. Uncomment
+        the line bellow and add the variables in the
+        order/format you want them to look.
         '''
         # output = ""
         output = "{} {} {} {} <{}> {}".format(tweetid, date, time, timezone, username, text)
@@ -358,7 +442,7 @@ async def outTweet(tweet):
     if arg.stats:
         output += " | {} replies {} retweets {} likes".format(replies, retweets, likes)
     # Output section
     if arg.o != None:
         if arg.csv:
@@ -381,9 +465,8 @@ async def outTweet(tweet):
 async def getTweets(init):
     '''
-    This function uses the html responses from getFeed()
-    and sends that info to the Tweet parser outTweet() and
-    outputs it.
+    This function uses the HTML responses from getFeed()
+    and sends that info to outTweet() to output it.
     Returns response feed, if it's first-run, and Tweet count.
     '''
@@ -398,18 +481,79 @@ async def getTweets(init):
         if copyright is None:
             count += 1
             if arg.elasticsearch:
                 print(await outTweet(tweet), end=".", flush=True)
             else:
                 print(await outTweet(tweet))
     return tweets, init, count

+async def getTweet(url):
+    '''
+    This function is used in a concurrent loop
+    to fetch individual Tweets and send them
+    for formatting/parsing, very similar to
+    getTweets().
+    '''
+    try:
+        connect = aiohttp.TCPConnector(verify_ssl=False)
+        async with aiohttp.ClientSession(connector=connect) as session:
+            response = await fetch(session, url)
+        soup = BeautifulSoup(response, "html.parser")
+        tweet = soup.find("div", "permalink-inner permalink-tweet-container")
+        copyright = tweet.find("div", "StreamItemContent--withheld")
+        print(url)
+        if copyright is None:
+            if arg.elasticsearch:
+                print(await outTweet(tweet), end=".", flush=True)
+            else:
+                print(await outTweet(tweet))
+    except:
+        pass
+
+async def getFavorites(init):
+    '''
+    This will get the URL for the Tweet that was
+    liked by the user and schedules it to be
+    requested. Also similar to getTweets().
+    '''
+    tweets, init = await getfeed(init)
+    count = 0
+    try:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
+            loop = asyncio.get_event_loop()
+            futures = []
+            for tweet in tweets:
+                count += 1
+                link = tweet.find("a")["href"]
+                url = "https://twitter.com{}".format(link)
+                futures.append(loop.run_in_executor(executor, await getTweet(url)))
+            await asyncio.gather(*futures)
+    except:
+        pass
+    return tweets, init, count
+
+async def getFollow(init):
+    '''
+    For now, just printing the Twitter username
+    of a follower/user followed. Will include more
+    data on the user upon request.
+    '''
+    follow, init = await getfeed(init)
+    for f in follow:
+        user = f.find("a")["name"]
+        print(user)
+    return follow, init
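As a side note, the fan-out idea behind getFavorites() -- fetch each liked Tweet's permalink concurrently and wait for the whole batch -- can be sketched on its own. The following is an illustration with a placeholder coroutine and a hypothetical permalink, not the commit's code:

import asyncio

async def fetch_one(url):
    await asyncio.sleep(0)      # placeholder for the real fetch/parse work
    return url

async def fetch_all(urls):
    return await asyncio.gather(*(fetch_one(u) for u in urls))

urls = ["https://twitter.com/jack/status/20"]   # hypothetical permalink
print(asyncio.get_event_loop().run_until_complete(fetch_all(urls)))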
 async def getUsername():
     '''
-    This function uses a Twitter ID search to resolve a Twitter User
+    This function uses a Twitter ID search to resolve a Twitter user
     ID and return it's corresponding username.
     '''
-    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(verify_ssl=False)) as session:
+    connect = aiohttp.TCPConnector(verify_ssl=False)
+    async with aiohttp.ClientSession(connector=connect) as session:
         r = await fetch(session, "https://twitter.com/intent/user?user_id={0.userid}".format(arg))
     soup = BeautifulSoup(r, "html.parser")
     return soup.find("a", "fn url alternate-context")["href"].replace("/", "")
@@ -449,20 +593,28 @@ async def main():
     feed = [-1]
     init = -1
     num = 0
+    action = getAction()
     while _since < _until:
         arg.since = str(_until - datetime.timedelta(days=int(arg.timedelta)))
         arg.until = str(_until)
         '''
         If our response from getFeed() has an exception,
         it signifies there are no position IDs to continue
-        with, telling Tweep it's finished scraping.
+        with, telling Twint it's finished scraping.
         '''
         if len(feed) > 0:
-            feed, init, count = await getTweets(init)
-            num += count
+            if action != "":
+                if arg.favorites:
+                    feed, init, count = await getFavorites(init)
+                else:
+                    feed, init = await getFollow(init)
+            else:
+                feed, init, count = await getTweets(init)
+            num += count
         else:
             _until = _until - datetime.timedelta(days=int(arg.timedelta))
             feed = [-1]
+            break
         # Control when we want to stop scraping.
         if arg.limit is not None and num >= int(arg.limit):
             break
@@ -505,7 +657,7 @@ def check():
         Error("Error", "Please specify an output file (Example: -o file.csv")

 if __name__ == "__main__":
-    ap = argparse.ArgumentParser(prog="tweep.py", usage="python3 %(prog)s [options]", description="tweep.py - An Advanced Twitter Scraping Tool")
+    ap = argparse.ArgumentParser(prog="twint.py", usage="python3 %(prog)s [options]", description="twint.py - An Advanced Twitter Scraping Tool")
     ap.add_argument("-u", help="User's Tweets you want to scrape.")
     ap.add_argument("-s", help="Search for Tweets containing this word or phrase.")
     ap.add_argument("-g", help="Search for geocoded tweets.")
@@ -529,7 +681,10 @@ if __name__ == "__main__":
     ap.add_argument("--stats", help="Show number of replies, retweets, and likes", action="store_true")
     ap.add_argument("--database", help="Store tweets in the database")
     ap.add_argument("--to", help="Search Tweets to a user")
     ap.add_argument("--all", help="Search all Tweets associated with a user")
+    ap.add_argument("--followers", help="Scrape a person's followers", action="store_true")
+    ap.add_argument("--following", help="Scrape who a person follows.", action="store_true")
+    ap.add_argument("--favorites", help="Scrape Tweets a user has liked.", action="store_true")
     arg = ap.parse_args()
     check()
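With the three new flags in place, typical invocations would look something like the following (the username is hypothetical):

python3 tweep.py -u jack --followers
python3 tweep.py -u jack --following
python3 tweep.py -u jack --favorites --limit 100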