nanahira / Twint

Commit 34fbb5fe
Authored Feb 24, 2018 by Cody Zacharias; committed by GitHub on Feb 24, 2018
Parent: 53183006
Showing 1 changed file with 114 additions and 13 deletions

tweep.py  +114 -13
@@ -12,6 +12,17 @@ import re
import sys

async def getUrl(init):
    '''
    URL Decision:
    Tweep utilizes positions of Tweets from Twitter's search feature to
    iterate through a user's Twitter feed. This section decides whether
    this is the first URL request or not and develops the URL based on the
    args given.
    Returns complete URL.
    Todo: Make everything URL encoded at the end.
    '''
    if init == -1:
        url = "https://twitter.com/search?f=tweets&vertical=default&lang=en&q="
    else:
        ...
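The else branch that builds the continuation URL is elided in this hunk. As a side note on the docstring's "URL encoded" Todo, a minimal sketch of how the query portion could be percent-encoded with the standard library; this is an illustration only, not code from this commit, and the search string is made up:

    # Hypothetical query encoding for the search URL; not part of this diff.
    from urllib.parse import quote

    search = "from:someuser since:2018-01-01"   # made-up search args
    base = "https://twitter.com/search?f=tweets&vertical=default&lang=en&q="
    url = base + quote(search, safe="")
    print(url)
    # https://twitter.com/search?f=tweets&vertical=default&lang=en&q=from%3Asomeuser%20since%3A2018-01-01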
@@ -42,48 +53,94 @@ async def getUrl(init):
    return url

async def fetch(session, url):
    '''
    Basic aiohttp request with a 30 second timeout.
    '''
    with async_timeout.timeout(30):
        async with session.get(url) as response:
            return await response.text()
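For reference, a minimal driver showing how fetch() can be exercised on its own. It assumes aiohttp and async_timeout are installed and an event loop is available; the demo() wrapper is hypothetical and the URL is just the first-request search URL from getUrl above:

    # Standalone driver for fetch(); illustrative only.
    import asyncio
    import aiohttp

    async def demo():
        async with aiohttp.ClientSession() as session:
            html = await fetch(session,
                "https://twitter.com/search?f=tweets&vertical=default&lang=en&q=")
            print(len(html))

    asyncio.get_event_loop().run_until_complete(demo())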
async def initial(response):
    '''
    Initial response parsing and collecting the position ID
    '''
    soup = BeautifulSoup(response, "html.parser")
    feed = soup.find_all("li", "js-stream-item")
    init = "TWEET-{}-{}".format(feed[-1]["data-item-id"],
                                feed[0]["data-item-id"])
    return feed, init
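To make the position-ID format concrete, a self-contained sketch with fabricated data-item-id values (they are not taken from the commit):

    # Fabricated feed to show the "TWEET-<oldest>-<newest>" position string.
    from bs4 import BeautifulSoup

    sample = (
        '<li class="js-stream-item" data-item-id="967300000000000002"></li>'
        '<li class="js-stream-item" data-item-id="967300000000000001"></li>'
    )
    feed = BeautifulSoup(sample, "html.parser").find_all("li", "js-stream-item")
    print("TWEET-{}-{}".format(feed[-1]["data-item-id"], feed[0]["data-item-id"]))
    # TWEET-967300000000000001-967300000000000002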
async def cont(response):
    '''
    Regular json response parsing and collecting Position ID
    '''
    json_response = json.loads(response)
    html = json_response["items_html"]
    soup = BeautifulSoup(html, "html.parser")
    feed = soup.find_all("li", "js-stream-item")
    split = json_response["min_position"].split("-")
    split[1] = feed[-1]["data-item-id"]
    init = "-".join(split)
    return feed, init
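The min_position bookkeeping above can be traced with placeholder IDs (the values below are made up, not from the commit):

    # How cont() advances the position ID; IDs are placeholders.
    min_position = "TWEET-967300000000000001-967300000000000002"
    oldest_on_page = "967200000000000009"   # plays the role of feed[-1]["data-item-id"]
    split = min_position.split("-")
    split[1] = oldest_on_page
    print("-".join(split))                  # TWEET-967200000000000009-967300000000000002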
In getFeed(), the commit replaces the inline parsing (lines marked -) with calls to the new initial() and cont() helpers (lines marked +):

 async def getFeed(init):
     '''
     Parsing Decision:
     Responses from requests with the position IDs are JSON,
     so this section decides whether this is an initial request
     or not to use the appropriate response reading for parsing
     with BeautifulSoup4.
     Returns html for Tweets and position id.
     '''
     async with aiohttp.ClientSession() as session:
-        r = await fetch(session, await getUrl(init))
+        response = await fetch(session, await getUrl(init))
     feed = []
     try:
         if init == -1:
-            html = r
-        else:
-            json_response = json.loads(r)
-            html = json_response["items_html"]
-        soup = BeautifulSoup(html, "html.parser")
-        feed = soup.find_all("li", "js-stream-item")
-        if init == -1:
-            init = "TWEET-{}-{}".format(feed[-1]["data-item-id"],
-                                        feed[0]["data-item-id"])
+            feed, init = await initial(response)
         else:
-            split = json_response["min_position"].split("-")
-            split[1] = feed[-1]["data-item-id"]
-            init = "-".join(split)
+            feed, init = await cont(response)
     except:
         # Tweep will realize that it's done scraping.
         pass
     return feed, init
async def outTweet(tweet):
    '''
    Parsing Section:
    This function will create the desired output string and
    write it to a file or csv if specified.
    Returns output.
    '''
    tweetid = tweet["data-item-id"]
    # Formatting the date & time stamps just how I like it.
    datestamp = tweet.find("a", "tweet-timestamp")["title"].rpartition(" - ")[-1]
    d = datetime.datetime.strptime(datestamp, "%d %b %Y")
    date = d.strftime("%Y-%m-%d")
    timestamp = str(datetime.timedelta(seconds=int(tweet.find("span", "_timestamp")["data-time"]))).rpartition(", ")[-1]
    t = datetime.datetime.strptime(timestamp, "%H:%M:%S")
    time = t.strftime("%H:%M:%S")
    # The @ in the username annoys me.
    username = tweet.find("span", "username").text.replace("@", "")
    timezone = strftime("%Z", gmtime())
    # The content of the Tweet compressed into a single line.
    text = tweet.find("p", "tweet-text").text.replace("\n", "").replace("http", " http").replace("pic.twitter", " pic.twitter")
    # Regex for gathering hashtags
    hashtags = ",".join(re.findall(r'(?i)\#\w+', text, flags=re.UNICODE))
    replies = tweet.find("span", "ProfileTweet-action--reply u-hiddenVisually").find("span")["data-tweet-stat-count"]
    retweets = tweet.find("span", "ProfileTweet-action--retweet u-hiddenVisually").find("span")["data-tweet-stat-count"]
    likes = tweet.find("span", "ProfileTweet-action--favorite u-hiddenVisually").find("span")["data-tweet-stat-count"]
    '''
    This part tries to get a list of mentions.
    It sometimes gets slow with Tweets that contain
    40+ mentioned people, so rather than just appending
    the whole list to the Tweet, it goes through each
    one to make sure there aren't any duplicates.
    '''
    try:
        mentions = tweet.find("div", "js-original-tweet")["data-mentions"].split(" ")
        for i in range(len(mentions)):
            ...
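The date, time, and text handling above can be traced with fabricated attribute values; the title, epoch, and tweet body below are made up for illustration and are not taken from the commit:

    # Worked example of outTweet()'s date/time and text handling; all values fabricated.
    import datetime
    import re

    title = "6:02 PM - 24 Feb 2018"                  # a.tweet-timestamp["title"]
    datestamp = title.rpartition(" - ")[-1]           # "24 Feb 2018"
    print(datetime.datetime.strptime(datestamp, "%d %b %Y").strftime("%Y-%m-%d"))   # 2018-02-24

    epoch = 1519495320                                # span._timestamp["data-time"]
    timestamp = str(datetime.timedelta(seconds=epoch)).rpartition(", ")[-1]
    print(datetime.datetime.strptime(timestamp, "%H:%M:%S").strftime("%H:%M:%S"))   # 18:02:00

    raw = "Scraping with #Tweep \nand #Python3https://t.co/abc123"
    text = raw.replace("\n", "").replace("http", " http").replace("pic.twitter", " pic.twitter")
    print(text)                                       # Scraping with #Tweep and #Python3 https://t.co/abc123
    print(",".join(re.findall(r'(?i)\#\w+', text, flags=re.UNICODE)))               # #Tweep,#Python3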
@@ -93,32 +150,62 @@ async def outTweet(tweet):
    except:
        pass
    # Preparing to output
    '''
    There were certain cases where I used Tweep
    to gather a list of users and then fed that
    generated list into Tweep. That's why these
    modes exist.
    '''
    if arg.users:
        output = username
    elif arg.tweets:
        output = tweets
    else:
        '''
        The standard output is how I like it, although
        this can be modified to your desire. Uncomment
        the bottom line and add in the variables in the
        order you want them or how you want it to look.
        '''
        # output = ""
        output = "{} {} {} {} <{}> {}".format(tweetid, date, time, timezone, username, text)
        if arg.hashtags:
            output += " {}".format(hashtags)
        if arg.stats:
            output += " | {} replies {} retweets {} likes".format(replies, retweets, likes)
    # Output section
    if arg.o != None:
        if arg.csv:
            # Write all variables scraped to CSV
            dat = [tweetid, date, time, timezone, username, text, replies, retweets, likes, hashtags]
            with open(arg.o, "a", newline='') as csv_file:
                writer = csv.writer(csv_file, delimiter="|")
                writer.writerow(dat)
        else:
            # Writes or appends to a file.
            print(output, file=open(arg.o, "a"))
    return output
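For reference, the CSV branch appends pipe-delimited rows; a self-contained sketch with placeholder values (the file name and field values are made up):

    # Illustrative CSV append with the same delimiter; values are placeholders.
    import csv

    dat = ["967354395868140032", "2018-02-24", "18:02:00", "UTC", "someuser",
           "Scraping with #Tweep", "2", "5", "11", "#Tweep"]
    with open("tweets.csv", "a", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter="|")
        writer.writerow(dat)
    # Row written: 967354395868140032|2018-02-24|18:02:00|UTC|someuser|Scraping with #Tweep|2|5|11|#Tweep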
async def getTweets(init):
    '''
    This function uses the html responses from getFeed()
    and sends that info to the Tweet parser outTweet() and
    outputs it.
    Returns the response feed, the position ID, and the Tweet count.
    '''
    tweets, init = await getFeed(init)
    count = 0
    for tweet in tweets:
        '''
        Certain Tweets get taken down for copyright but are still
        visible in the search. We want to avoid those.
        '''
        copyright = tweet.find("div", "StreamItemContent--withheld")
        if copyright is None:
            count += 1
            ...
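A quick way to see the withheld filter in action on fabricated markup (the HTML below is made up for illustration):

    # Fabricated stream item carrying the withheld marker; such Tweets are skipped.
    from bs4 import BeautifulSoup

    li = BeautifulSoup(
        '<li class="js-stream-item">'
        '<div class="StreamItemContent--withheld"></div>'
        '</li>', "html.parser").find("li", "js-stream-item")
    print(li.find("div", "StreamItemContent--withheld") is None)  # False, so not counted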
@@ -127,12 +214,19 @@ async def getTweets(init):
    return tweets, init, count

async def getUsername():
    '''
    This function uses a Twitter ID search to resolve a Twitter User
    ID and return its corresponding username.
    '''
    async with aiohttp.ClientSession() as session:
        r = await fetch(session, "https://twitter.com/intent/user?user_id={0.userid}".format(arg))
        soup = BeautifulSoup(r, "html.parser")
        return soup.find("a", "fn url alternate-context")["href"].replace("/", "")
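The "{0.userid}" placeholder pulls the attribute off the global arg namespace; a small stand-in shows the resulting intent URL (the user ID below is made up):

    # Stand-in for the argparse namespace, just to show the URL formatting.
    from types import SimpleNamespace

    arg = SimpleNamespace(userid="12345")
    print("https://twitter.com/intent/user?user_id={0.userid}".format(arg))
    # https://twitter.com/intent/user?user_id=12345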
async def main():
    '''
    Putting it all together.
    '''
    if arg.userid is not None:
        arg.u = await getUsername()
    ...

@@ -140,22 +234,29 @@ async def main():
    init = -1
    num = 0
    while True:
        '''
        If our response from getFeed() has an exception,
        it signifies there are no position IDs to continue
        with, telling Tweep it's finished scraping.
        '''
        if len(feed) > 0:
            feed, init, count = await getTweets(init)
            num += count
        else:
            break
        # Control when we want to stop scraping.
        if arg.limit is not None and num <= int(arg.limit):
            break
    if arg.count:
        print("Finished: Successfully collected {} Tweets.".format(num))
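The diff does not show how main() is scheduled or where arg comes from; a typical entry point for this style of asyncio code would look roughly like the following. This is an assumption for illustration, not the commit's code:

    # Hypothetical entry point; argument parsing is outside this diff.
    import asyncio

    if __name__ == "__main__":
        # arg = ...  (argparse setup not shown in this diff)
        # check()    (argument validation, defined below)
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main())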
def Error(error, message):
    # Error formatting
    print("[-] {}: {}".format(error, message))
    sys.exit(0)

def check():
    # Performs main argument checks so nothing unintended happens.
    if arg.u is not None:
        if arg.users:
            Error("Contradicting Args", "Please use --users in combination with -s.")
    ...
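For completeness, this is the message Error() prints (before calling sys.exit) for the check() case above:

    # What the Error() call above prints before exiting.
    print("[-] {}: {}".format("Contradicting Args",
                              "Please use --users in combination with -s."))
    # [-] Contradicting Args: Please use --users in combination with -s.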