author     Michal Čihař <michal@cihar.com>   2017-11-08 13:07:09 +0100
committer  Michal Čihař <michal@cihar.com>   2017-11-08 13:07:09 +0100
commit     b4d7fae788ad346f755339f98d809ce00a249dd5
tree       377ad4349db846bc48947d12447cb56ad86cba72
parent     b287268cb34fa36ab3acd9b19f41953bbd418498
New upstream version 3.10.2
-rw-r--r--   ChangeLog               |  9
-rw-r--r--   PKG-INFO                |  2
-rw-r--r--   setup.py                | 16
-rw-r--r--   test/test_mirror.py     | 52
-rw-r--r--   urlgrabber/__init__.py  |  4
-rw-r--r--   urlgrabber/grabber.py   | 61

6 files changed, 125 insertions, 19 deletions
diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2017-02-02 Valentina Mukhamedzhanova <vmukhame@redhat.com>
+
+	* Add no_cache and retry_no_cache options.
+	* Work around pycurl dependency in setup.py.
+	* Don't set speed=0 on a new mirror that 404'd.
+	* Add a comprehensive error message to pycurl error 77.
+	* Don't crash on timedhosts parsing error.
+	* bump version to 3.10.2
+
 2013-10-09 Zdenek Pavlas <zpavlas@redhat.com>
 
 	* lots of enhancements and bugfixes
diff --git a/PKG-INFO b/PKG-INFO
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: urlgrabber
-Version: 3.10.1
+Version: 3.10.2
 Summary: A high-level cross-protocol url-grabber
 Home-page: http://urlgrabber.baseurl.org/
 Author: Michael D. Stenner, Ryan Tomayko
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,23 @@
 # urlgrabber distutils setup
 import re as _re
+import sys as _sys
+
+class _pycurlFake(object):
+    Curl = staticmethod(lambda: None)
+
+# Unfortunately __init__.py imports urlgrabber.grabber, which then imports
+# the pycurl package.  And finally pycurl.Curl() is called at the top level
+# of the grabber module.  We don't need pycurl nor pycurl.Curl() during
+# setup.  Fake this module as already loaded so we don't need to have
+# pycurl installed at all.  Maybe the developer wants to install it in a
+# later phase.
+_sys.modules["pycurl"] = _pycurlFake
+
+# We need the urlgrabber package for some constants.
 import urlgrabber as _urlgrabber
+del _sys.modules["pycurl"]
+
 
 name = "urlgrabber"
 description = "A high-level cross-protocol url-grabber"
 long_description = _urlgrabber.__doc__
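The setup.py hunk above works around a chicken-and-egg problem: importing
urlgrabber pulls in urlgrabber.grabber, which calls pycurl.Curl() at module
level, so setup.py would fail on machines without pycurl. Pre-seeding
sys.modules satisfies the import without the real library being present.
A minimal self-contained sketch of the same technique, using a proper module
object where the commit uses a bare class (the import machinery accepts
either, since it only consults the sys.modules cache):

    import sys
    import types

    # Stand-in exposing just the surface touched at import time.
    fake = types.ModuleType("pycurl")
    fake.Curl = lambda: None

    sys.modules["pycurl"] = fake   # future "import pycurl" resolves here first
    import pycurl                  # returns the stub; no real pycurl needed
    assert pycurl.Curl() is None

    del sys.modules["pycurl"]      # later imports find the real module, if installed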
diff --git a/test/test_mirror.py b/test/test_mirror.py
index 7f493d0..a6bb6cb 100644
--- a/test/test_mirror.py
+++ b/test/test_mirror.py
@@ -268,13 +268,14 @@
         self.assertEquals(self.g.calls, expected_calls)
         self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs)
 
-import thread, socket
+import threading, socket
 LOCALPORT = 'localhost', 2000
 
 class HttpReplyCode(TestCase):
     def setUp(self):
         # start the server
         self.exit = False
+        self.process = lambda data: None
         def server():
             s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
             s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
@@ -282,7 +283,10 @@
             while 1:
                 c, a = s.accept()
                 if self.exit: c.close(); break
-                while not c.recv(4096).endswith('\r\n\r\n'): pass
+                data = ''
+                while not data.endswith('\r\n\r\n'):
+                    data = c.recv(4096)
+                self.process(data)
                 c.sendall('HTTP/1.1 %d %s\r\n' % self.reply)
                 if self.content is not None:
                     c.sendall('Content-Length: %d\r\n\r\n' % len(self.content))
@@ -290,7 +294,8 @@
                 c.close()
             s.close()
             self.exit = False
-        thread.start_new_thread(server, ())
+        self.thread = threading.Thread(target=server)
+        self.thread.start()
 
         # create grabber and mirror group objects
         def failure(obj):
@@ -305,7 +310,7 @@
         self.exit = True
         s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
         s.connect(LOCALPORT); s.close() # wake it up
-        while self.exit: pass # poor man's join
+        self.thread.join()
 
     def test_grab(self):
         'tests the propagation of HTTP reply code'
@@ -336,6 +341,45 @@
         data = self.mg.urlread('foo', range = (3, 5))
         self.assertEquals(data, 'DE')
 
+    def test_retry_no_cache(self):
+        'test bypassing proxy cache on failure'
+        def process(data):
+            if 'Pragma:no-cache' in data:
+                self.content = 'version2'
+            else:
+                self.content = 'version1'
+
+        def checkfunc_read(obj):
+            if obj.data == 'version1':
+                raise URLGrabError(-1, 'Outdated version of foo')
+
+        def checkfunc_grab(obj):
+            with open('foo') as f:
+                if f.read() == 'version1':
+                    raise URLGrabError(-1, 'Outdated version of foo')
+
+        self.process = process
+        self.reply = 200, "OK"
+
+        opts = self.g.opts
+        opts.retry = 3
+        opts.retry_no_cache = True
+
+        # single
+        opts.checkfunc = checkfunc_read
+        try:
+            self.mg.urlread('foo')
+        except URLGrabError as e:
+            self.fail(str(e))
+
+        # multi
+        opts.checkfunc = checkfunc_grab
+        self.mg.urlgrab('foo', async=True)
+        try:
+            urlgrabber.grabber.parallel_wait()
+        except URLGrabError as e:
+            self.fail(str(e))
+
 def suite():
     tl = TestLoader()
     return tl.loadTestsFromModule(sys.modules[__name__])
diff --git a/urlgrabber/__init__.py b/urlgrabber/__init__.py
index b3047b0..e1028c6 100644
--- a/urlgrabber/__init__.py
+++ b/urlgrabber/__init__.py
@@ -44,8 +44,8 @@
   automatically switching mirrors if there is a failure.
 """
 
-__version__ = '3.10.1'
-__date__ = '2013/12/18'
+__version__ = '3.10.2'
+__date__ = '2017/02/02'
 __author__ = 'Michael D. Stenner <mstenner@linux.duke.edu>, ' \
              'Ryan Tomayko <rtomayko@naeblis.cx>' \
              'Seth Vidal <skvidal@fedoraproject.org>' \
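The new test exercises the whole feature end to end: the first fetch is served
'version1', checkfunc rejects it with a negative (user-defined) error code,
and the retry goes out with a Pragma: no-cache header so the stale proxy copy
is bypassed. From application code the same behaviour is reachable through the
ordinary keyword options; a hedged usage sketch (the URL, filename, and marker
check are illustrative, not part of the commit):

    import urlgrabber
    from urlgrabber.grabber import URLGrabError

    def checkfunc(obj):
        # For urlgrab() the payload is already on disk; obj.filename names it.
        with open(obj.filename) as f:
            if 'expected-marker' not in f.read():
                # A negative errno marks a user-defined, retryable failure;
                # with retry_no_cache=True the retry adds Pragma: no-cache.
                raise URLGrabError(-1, 'stale or incomplete download')

    urlgrabber.urlgrab('http://example.com/some/file', 'file.local',
                       checkfunc=checkfunc, retry=3, retry_no_cache=True)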
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index ef18d6a..074a82f 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -171,6 +171,12 @@
     The libproxy code is only used if the proxies dictionary
     does not provide any proxies.
 
+  no_cache = False
+
+    When True, server-side cache will be disabled for http and https
+    requests.  This is equivalent to setting
+    http_headers = (('Pragma', 'no-cache'),)
+
   prefix = None
 
     a url prefix that will be prepended to all requested urls.  For
@@ -383,10 +389,11 @@
     identical to checkfunc, except for the attributes defined in the
     CallbackObject instance.  The attributes for failure_callback are:
 
-      exception = the raised exception
-      url       = the url we're trying to fetch
-      tries     = the number of tries so far (including this one)
-      retry     = the value of the retry option
+      exception      = the raised exception
+      url            = the url we're trying to fetch
+      tries          = the number of tries so far (including this one)
+      retry          = the value of the retry option
+      retry_no_cache = the value of the retry_no_cache option
 
     The callback is present primarily to inform the calling program of
     the failure, but if it raises an exception (including the one it's
@@ -431,6 +438,19 @@
     passed the same arguments, so you could use the same function for
     both.
 
+  retry_no_cache = False
+
+    When True, automatically enable no_cache for future retries if
+    checkfunc performs an unsuccessful check.
+
+    This option is useful if your application expects a set of files
+    from the same server to form an atomic unit and you write your
+    checkfunc to ensure each file being downloaded belongs to such a
+    unit.  If transparent proxy caching is in effect, the files can
+    become out-of-sync, disrupting the atomicity.  Enabling this option
+    will prevent that, while ensuring that you still enjoy the benefits
+    of caching when possible.
+
 BANDWIDTH THROTTLING
 
   urlgrabber supports throttling via two values: throttle and
@@ -1001,6 +1021,8 @@
         self.half_life = 30*24*60*60 # 30 days
         self.default_speed = 500e3 # 500 kBps
         self.ftp_disable_epsv = False
+        self.no_cache = False
+        self.retry_no_cache = False
 
     def __repr__(self):
         return self.format()
@@ -1077,7 +1099,8 @@
             if callback:
                 if DEBUG: DEBUG.info('calling callback: %s', callback)
                 obj = CallbackObject(exception=exception, url=args[0],
-                                     tries=tries, retry=opts.retry)
+                                     tries=tries, retry=opts.retry,
+                                     retry_no_cache=opts.retry_no_cache)
                 _run_callback(callback, obj)
 
             if (opts.retry is None) or (tries == opts.retry):
@@ -1089,6 +1112,8 @@
                 if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
                                      retrycode, opts.retrycodes)
                 raise
+            if retrycode is not None and retrycode < 0 and opts.retry_no_cache:
+                opts.no_cache = True
 
     def urlopen(self, url, opts=None, **kwargs):
         """open the url and return a file object
@@ -1439,11 +1464,15 @@
             self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)
 
         #headers:
-        if opts.http_headers and self.scheme in ('http', 'https'):
+        if self.scheme in ('http', 'https'):
             headers = []
-            for (tag, content) in opts.http_headers:
-                headers.append('%s:%s' % (tag, content))
-            self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
+            if opts.http_headers is not None:
+                for (tag, content) in opts.http_headers:
+                    headers.append('%s:%s' % (tag, content))
+            if opts.no_cache:
+                headers.append('Pragma:no-cache')
+            if headers:
+                self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
 
         # ranges:
         if opts.range or opts.reget:
@@ -1569,6 +1598,7 @@
                 67 : _("Authentication failure"),
                 70 : _("Out of disk space on server"),
                 73 : _("Remove file exists"),
+                77 : _("Problem with the SSL CA cert (path? access rights?)"),
                 }
             errstr = str(e.args[1]) or pyerr2str.get(errcode, '<Unknown>')
             if code and not 200 <= code <= 299:
@@ -2064,7 +2094,8 @@
         'ssl_key_pass',
         'ssl_verify_peer', 'ssl_verify_host',
         'size', 'max_header_size', 'ip_resolve',
-        'ftp_disable_epsv'
+        'ftp_disable_epsv',
+        'no_cache',
     )
 
     def start(self, opts):
@@ -2246,6 +2277,8 @@
             except URLGrabError, ug_err:
                 retry = 0 # no retries
             if opts.tries < retry and ug_err.errno in opts.retrycodes:
+                if ug_err.errno < 0 and opts.retry_no_cache:
+                    opts.no_cache = True
                 start(opts, opts.tries + 1) # simple retry
                 continue
 
@@ -2376,8 +2409,11 @@
         try:
             now = int(time.time())
             for line in open(filename):
-                host, speed, fail, ts = line.rsplit(' ', 3)
-                _TH.hosts[host] = int(speed), int(fail), min(int(ts), now)
+                try:
+                    host, speed, fail, ts = line.rsplit(' ', 3)
+                    _TH.hosts[host] = int(speed), int(fail), min(int(ts), now)
+                except ValueError:
+                    if DEBUG: DEBUG.info('Error parsing timedhosts: line "%s"', line)
         except IOError: pass
         _TH.dirty = False
 
@@ -2418,6 +2454,7 @@
             speed = (k1 * speed + k2 * dl_size / dl_time) / (k1 + k2)
             fail = 0
         elif getattr(ug_err, 'code', None) == 404:
+            if not ts: return # 1st update, avoid speed=0
             fail = 0 # alive, at least
         else:
             fail += 1 # seems dead
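One consequence of the failure_callback change is visible to existing callers:
the CallbackObject handed to the callback now carries retry_no_cache as a
fifth attribute. A small hedged sketch of a callback that logs it (the message
format and URL are illustrative):

    from urlgrabber.grabber import urlgrab

    def failure_cb(obj):
        # obj now exposes retry_no_cache alongside exception/url/tries/retry.
        print('attempt %s/%s for %s failed: %s (no-cache on retry: %s)' % (
            obj.tries, obj.retry, obj.url, obj.exception, obj.retry_no_cache))

    urlgrab('http://example.com/some/file', 'file.local',
            retry=3, retry_no_cache=True, failure_callback=failure_cb)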