Commit e5f1c566 authored by Francesco Poldi

Merge remote-tracking branch 'haccer/master'

parents d2d15c85 f6a0306b
@@ -16,36 +16,27 @@ Some of the benefits of using Tweep vs Twitter API:
- Python 3.5/3.6
- `pip3 install -r requirements.txt`
## Usage
- `-u` The user's Tweets you want to scrape.
- `-s` Search for Tweets containing this word or phrase.
- `-g` Retrieve Tweets by geolocation. The argument format is lat,lon,range (km or mi).
- `-o` Save output to a file.
- `-es` Output to Elasticsearch
- `--year` Filter Tweets before the specified year.
- `--fruit` Display Tweets with "low-hanging-fruit".
- `--tweets` Display Tweets only.
- `--verified` Display Tweets only from verified users (Use with `-s`).
- `--users` Display users only (Use with `-s`).
- `--csv` Write as a .csv file.
- `--hashtags` Extract hashtags.
- `--userid` Search using a Twitter user's ID.
- `--limit` Number of Tweets to pull (Increments of 20).
- `--count` Display the number of Tweets scraped at the end of the session.
- `--stats` Show number of replies, retweets, and likes.
## Elasticsearch Setup
1. Go [here](https://www.elastic.co/downloads), download `Elasticsearch` and `Kibana`, and install both; (do this once)
2. Run `Elasticsearch` and then `Kibana`; in the Kibana output you should see "[info][status][plugin:elasticsearch@6.2.2] Status changed from yellow to green - Ready";
3. Go to `http://localhost:5601`, open `Dev Tools`, copy & paste the contents of `index.json`, and click the **green arrow**; (do this once)
4. Index some data: `python3.6 tweep.py --elasticsearch localhost:9200 -u whatsoever`;
5. Back in Kibana's interface, go to `Management`, `Index Pattern`, `Create Index Pattern`, type `tweep`, and choose `datestamp` as the Time filter; (do this once)
6. Have fun.
Unfortunately, you cannot import visualizations and/or dashboards if you do not have the same index id, so under the `elasticsearch` directory I wrote a how-to for an initial setup.
If you have problems, don't hesitate to write to the maintainer [@pielco11](https://github.com/pielco11) or open an issue.
Feel free to edit the dashboard and don't hesitate to share it if you want.
## Options
Command|Usage
-------|-----------
`-u`|The user's Tweets you want to scrape.
`-s`|Search for Tweets containing this word or phrase.
`-g`|Retrieve Tweets by geolocation. The argument format is lat,lon,range (km or mi).
`-o`|Save output to a file.
`-es`|Output to Elasticsearch
`--year`|Filter Tweets before the specified year.
`--fruit`|Display Tweets with "low-hanging-fruit".
`--tweets`|Display Tweets only.
`--verified`|Display Tweets only from verified users (Use with `-s`).
`--users`|Display users only (Use with `-s`).
`--csv`|Write as a .csv file.
`--json`|Write as a .json file.
`--hashtags`|Extract hashtags.
`--userid`|Search using a Twitter user's ID.
`--limit`|Number of Tweets to pull (Increments of 20).
`--count`|Display the number of Tweets scraped at the end of the session.
`--stats`|Show number of replies, retweets, and likes.
`--database`|Store Tweets in a SQLite database
## Low-Hanging Fruit
The `--fruit` feature will display Tweets that *might* contain sensitive info such as:
@@ -68,6 +59,8 @@ A few simple examples to help you understand the basics:
- `python3 tweep.py -s "Donald Trump" --verified --users` - List verified users that Tweet about Donald Trump.
- `python3 tweep.py -g="48.880048,2.385939,1km" -o file.csv --csv` - Scrape Tweets from a radius of 1km around a place in Paris and export them to a csv file.
- `python3 tweep.py -u username -es localhost:9200` - Output Tweets to Elasticsearch.
- `python3 tweep.py -u username -o file.json --json` - Scrape Tweets and save them as a json file.
- `python3 tweep.py -u username --database tweets.db` - Save Tweets to a SQLite database (a sketch for reading these output files back follows this list).
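For reference, here is a minimal Python sketch of reading those output files back. It assumes the file names used in the examples above (`file.csv`, `file.json`, `tweets.db`); the CSV is pipe-delimited and the JSON file holds one object per line, matching how `tweep.py` writes them, while the SQLite schema is created by tweep itself, so the sketch only lists whatever tables it finds.

```python
import csv, json, sqlite3

# --csv output: pipe-delimited rows (tweep writes with csv.writer(..., delimiter="|"))
with open("file.csv", newline='', encoding="utf-8") as f:
    rows = list(csv.reader(f, delimiter="|"))

# --json output: one JSON object per line, with keys such as "username" and "content"
with open("file.json", encoding="utf-8") as f:
    tweets = [json.loads(line) for line in f if line.strip()]

# --database output: the schema is defined by tweep, so just discover the tables
conn = sqlite3.connect("tweets.db")
print(len(rows), len(tweets),
      conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall())
```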
## Example String
`955511208597184512 2018-01-22 18:43:19 GMT <now> pineapples are the best fruit`
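The fields in this default output line appear to be, in order: Tweet id, date, time, timezone, username (in angle brackets), and the Tweet text. Assuming that format, a rough way to split such a line:

```python
# Split the default output line into its parts; the text is everything after the username
line = "955511208597184512 2018-01-22 18:43:19 GMT <now> pineapples are the best fruit"
tweetid, date, time_, timezone, username, text = line.split(" ", 5)
print(username, text)  # <now> pineapples are the best fruit
```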
@@ -75,6 +68,18 @@ A few simple examples to help you understand the basics:
## Screenshot
<img src="https://i.imgur.com/RKdBrHr.png" />
## Elasticsearch Setup
1. Go [here](https://www.elastic.co/downloads), download `Elasticsearch` and `Kibana`, and install both; (do this once)
2. Run `Elasticsearch` and then `Kibana`; in the Kibana output you should see "[info][status][plugin:elasticsearch@6.2.2] Status changed from yellow to green - Ready";
3. Go to `http://localhost:5601`, open `Dev Tools`, copy & paste the contents of `index.json`, and click the **green arrow**; (do this once)
4. Index some data: `python3.6 tweep.py --elasticsearch localhost:9200 -u whatsoever` (a quick way to check that documents actually arrived is sketched at the end of this section);
5. Back in Kibana's interface, go to `Management`, `Index Pattern`, `Create Index Pattern`, type `tweep`, and choose `datestamp` as the Time filter; (do this once)
6. Go back to `Management`, `Saved Objects`, import `dashboard.json` and then `visualization.json`; (do this once)
7. Have fun.
If you have problems, don't hesitate to open an issue.
Feel free to edit the dashboard and don't hesitate to share it if you want.
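To check that step 4 actually indexed documents, here is a minimal sketch using the official `elasticsearch` Python client; it assumes the index is named `tweep` (the same name used for the index pattern in step 5) and that Elasticsearch is listening on the default `localhost:9200`.

```python
from elasticsearch import Elasticsearch

# Count the documents tweep has indexed (assumes the index is called "tweep")
es = Elasticsearch(["http://localhost:9200"])
print(es.count(index="tweep"))
```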
## Thanks
Thanks to [@hpiedcoq](https://github.com/hpiedcoq) & [@pielco11](https://github.com/pielco11) for contributing several features!
@@ -91,6 +91,8 @@ async def getUrl(init):
url+= "%20until%3A{0.year}-1-1".format(arg)
if arg.since != None:
url+= "%20since%3A{0.since}".format(arg)
if arg.until != None:
url+= "%20until%3A{0.until}".format(arg)
if arg.fruit:
url+= "%20myspace.com%20OR%20last.fm%20OR"
url+= "%20mail%20OR%20email%20OR%20gmail%20OR%20e-mail"
@@ -143,7 +145,7 @@ async def getFeed(init):
Returns html for Tweets and position id.
'''
async with aiohttp.ClientSession() as session:
async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(verify_ssl=False)) as session:
response = await fetch(session, await getUrl(init))
feed = []
try:
@@ -176,6 +178,9 @@ async def outTweet(tweet):
# The @ in the username annoys me.
username = tweet.find("span", "username").text.replace("@", "")
timezone = strftime("%Z", gmtime())
# Replace all emoticons with their title, to be included in the tweet text
for img in tweet.findAll("img", "Emoji Emoji--forText"):
img.replaceWith("<%s>" % img['aria-label'])
# The context of the Tweet compressed into a single line.
text = tweet.find("p", "tweet-text").text.replace("\n", "").replace("http", " http").replace("pic.twitter", " pic.twitter")
# Regex for gathering hashtags
@@ -313,7 +318,7 @@ async def outTweet(tweet):
elif arg.users:
output = username
elif arg.tweets:
output = tweets
output = text
else:
'''
The standard output is how I like it, although
@@ -334,12 +339,18 @@ async def outTweet(tweet):
if arg.csv:
# Write all variables scraped to CSV
dat = [tweetid, date, time, timezone, username, text, replies, retweets, likes, hashtags]
with open(arg.o, "a", newline='') as csv_file:
with open(arg.o, "a", newline='', encoding="utf-8") as csv_file:
writer = csv.writer(csv_file, delimiter="|")
writer.writerow(dat)
elif arg.json:
# Write all variables scraped to JSON
dat = {"id":tweetid, "date":date, "time":time, "timezone":timezone, "username":username, "content":text, "replies":replies, "retweets":retweets, "likes":likes, "hashtags":hashtags}
with open(arg.o, "a", newline='', encoding="utf-8") as json_file:
json.dump(dat,json_file)
json_file.write('\n')
else:
# Writes or appends to a file.
print(output, file=open(arg.o, "a"))
print(output, file=open(arg.o, "a", encoding="utf-8"))
return output
@@ -373,7 +384,7 @@ async def getUsername():
This function uses a Twitter ID search to resolve a Twitter User
ID and return its corresponding username.
'''
async with aiohttp.ClientSession() as session:
async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(verify_ssl=False)) as session:
r = await fetch(session, "https://twitter.com/intent/user?user_id={0.userid}".format(arg))
soup = BeautifulSoup(r, "html.parser")
return soup.find("a", "fn url alternate-context")["href"].replace("/", "")
@@ -412,7 +423,7 @@ async def main():
else:
break
# Control when we want to stop scraping.
if arg.limit is not None and num <= int(arg.limit):
if arg.limit is not None and num >= int(arg.limit):
break
if arg.database:
@@ -453,11 +464,13 @@ if __name__ == "__main__":
ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch")
ap.add_argument("--year", help="Filter Tweets before specified year.")
ap.add_argument("--since", help="Filter Tweets sent since date (Example: 2017-12-27).")
ap.add_argument("--until", help="Filter Tweets sent until date (Example: 2017-12-27).")
ap.add_argument("--fruit", help="Display 'low-hanging-fruit' Tweets.", action="store_true")
ap.add_argument("--tweets", help="Display Tweets only.", action="store_true")
ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).", action="store_true")
ap.add_argument("--users", help="Display users only (Use with -s).", action="store_true")
ap.add_argument("--csv", help="Write as .csv file.", action="store_true")
ap.add_argument("--json", help="Write as .json file.", action="store_true")
ap.add_argument("--hashtags", help="Output hashtags in seperate column.", action="store_true")
ap.add_argument("--userid", help="Twitter user id")
ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")