Commit 65709919, authored by Aaron Gonzales, committed by GitHub

Dual linear + polynomial timer for #604 (#726)

* [FIX]: fix request error bug limiting requests

* [FEAT]: add dual poly. backoff + linear timer if request limit hit

Add a dual polynomial + linear timer to allow fine-grained searching
for Twitter's timeout value. A new arg `min_wait_time` allows users to
do one of three things. First, if they already know a value that is
optimal, they can input it directly, allowing the timer to be strictly
a linear timer. Second, if Twitter suddenly changes their timeout
limit and the value no longer satisfies the required amount, the timer
will choose the higher of the polynomial timer's and the linear
timer's next values so the application still functions. Third, as the
linear timer's `min_wait_time` arg goes to zero, the timer becomes
strictly a polynomial backoff timer, which gives the user versatility
and is less likely to explode in the case of a change in Twitter's
request-limit timeout.

* [FIX]: add args to config.py

* [REFACT]: minor changes to run.py

* [REFACT]: small change to arg case
parent 6d980f1e
...@@ -46,6 +46,10 @@ def check(args): ...@@ -46,6 +46,10 @@ def check(args):
error("Error", "Please specify an output file (Example: -o file.csv).") error("Error", "Please specify an output file (Example: -o file.csv).")
elif args.json: elif args.json:
error("Error", "Please specify an output file (Example: -o file.json).") error("Error", "Please specify an output file (Example: -o file.json).")
if args.backoff_exponent <= 0:
error("Error", "Please specifiy a positive value for backoff_exponent")
if args.min_wait_time < 0:
error("Error", "Please specifiy a non negative value for min_wait_time")
def loadUserList(ul, _type): def loadUserList(ul, _type):
""" Concatenate users """ Concatenate users
...@@ -61,7 +65,6 @@ def loadUserList(ul, _type): ...@@ -61,7 +65,6 @@ def loadUserList(ul, _type):
return un[15:] return un[15:]
return userlist return userlist
def initialize(args): def initialize(args):
""" Set default values for config from args """ Set default values for config from args
""" """
...@@ -124,6 +127,8 @@ def initialize(args): ...@@ -124,6 +127,8 @@ def initialize(args):
c.Filter_retweets = args.filter_retweets c.Filter_retweets = args.filter_retweets
c.Translate = args.translate c.Translate = args.translate
c.TranslateDest = args.translate_dest c.TranslateDest = args.translate_dest
c.Backoff_exponent = args.backoff_exponent
c.Min_wait_time = args.min_wait_time
return c return c
def options(): def options():
...@@ -220,8 +225,10 @@ def options(): ...@@ -220,8 +225,10 @@ def options():
ap.add_argument("--source", help="Filter the tweets for specific source client.") ap.add_argument("--source", help="Filter the tweets for specific source client.")
ap.add_argument("--members-list", help="Filter the tweets sent by users in a given list.") ap.add_argument("--members-list", help="Filter the tweets sent by users in a given list.")
ap.add_argument("-fr", "--filter-retweets", help="Exclude retweets from the results.", action="store_true") ap.add_argument("-fr", "--filter-retweets", help="Exclude retweets from the results.", action="store_true")
ap.add_argument("--backoff-exponent", help="Specify a exponent for the polynomial backoff in case of errors.", type=float, default=3.0)
ap.add_argument("--min-wait-time", type=float, default=15, help="specifiy a minimum wait time in case of scraping limit error. This value will be adjusted by twint if the value provided does not satisfy the limits constraints")
args = ap.parse_args() args = ap.parse_args()
return args return args
def main(): def main():
......
...@@ -75,3 +75,5 @@ class Config: ...@@ -75,3 +75,5 @@ class Config:
Translate = False Translate = False
TranslateSrc = "en" TranslateSrc = "en"
TranslateDest = "en" TranslateDest = "en"
Backoff_exponent = 3.0
Min_wait_time = 0
import sys, os, time import sys, os, time
from asyncio import get_event_loop, TimeoutError, ensure_future, new_event_loop, set_event_loop from asyncio import get_event_loop, TimeoutError, ensure_future, new_event_loop, set_event_loop
from datetime import datetime
from . import datelock, feed, get, output, verbose, storage from . import datelock, feed, get, output, verbose, storage
from .storage import db from .storage import db
#from . import _logme
#
#logme = _logme._logger(__name__)
import logging as logme import logging as logme
...@@ -54,11 +50,11 @@ class Twint: ...@@ -54,11 +50,11 @@ class Twint:
try: try:
if self.config.Favorites: if self.config.Favorites:
self.feed, self.init = feed.Mobile(response) self.feed, self.init = feed.Mobile(response)
if not self.count%40: if not self.count % 40:
time.sleep(5) time.sleep(5)
elif self.config.Followers or self.config.Following: elif self.config.Followers or self.config.Following:
self.feed, self.init = feed.Follow(response) self.feed, self.init = feed.Follow(response)
if not self.count%40: if not self.count % 40:
time.sleep(5) time.sleep(5)
elif self.config.Profile: elif self.config.Profile:
if self.config.Profile_full: if self.config.Profile_full:
...@@ -91,11 +87,20 @@ class Twint: ...@@ -91,11 +87,20 @@ class Twint:
# Sometimes Twitter says there is no data. But it's a lie. # Sometimes Twitter says there is no data. But it's a lie.
consecutive_errors_count += 1 consecutive_errors_count += 1
if consecutive_errors_count < self.config.Retries_count: if consecutive_errors_count < self.config.Retries_count:
self.user_agent = await get.RandomUserAgent() # skip to the next iteration if wait time does not satisfy limit constraints
delay = round(consecutive_errors_count ** self.config.Backoff_exponent, 1)
# if the delay is less than users set min wait time then replace delay
if self.config.Min_wait_time > delay:
delay = self.config.Min_wait_time
sys.stderr.write('sleeping for {} secs\n'.format(delay))
time.sleep(delay)
self.user_agent = await get.RandomUserAgent(wa=True)
continue continue
logme.critical(__name__+':Twint:Feed:Tweets_known_error:' + str(e)) logme.critical(__name__+':Twint:Feed:Tweets_known_error:' + str(e))
print(str(e) + " [x] run.Feed") sys.stderr.write(str(e) + " [x] run.Feed")
print("[!] if get this error but you know for sure that more tweets exist, please open an issue and we will investigate it!") sys.stderr.write("[!] if get this error but you know for sure that more tweets exist, please open an issue and we will investigate it!")
break break
if self.config.Resume: if self.config.Resume:
print(self.init, file=open(self.config.Resume, "a", encoding="utf-8")) print(self.init, file=open(self.config.Resume, "a", encoding="utf-8"))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment