author    Michal Čihař <michal@cihar.com>    2017-11-08 13:07:09 +0100
committer Michal Čihař <michal@cihar.com>    2017-11-08 13:07:09 +0100
commit    b4d7fae788ad346f755339f98d809ce00a249dd5 (patch)
tree      377ad4349db846bc48947d12447cb56ad86cba72
parent    b287268cb34fa36ab3acd9b19f41953bbd418498 (diff)
New upstream version 3.10.2
-rw-r--r--  ChangeLog                9
-rw-r--r--  PKG-INFO                 2
-rw-r--r--  setup.py                16
-rw-r--r--  test/test_mirror.py     52
-rw-r--r--  urlgrabber/__init__.py   4
-rw-r--r--  urlgrabber/grabber.py   61
6 files changed, 125 insertions, 19 deletions
diff --git a/ChangeLog b/ChangeLog
index fdbe63d..3269701 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2017-02-02 Valentina Mukhamedzhanova <vmukhame@redhat.com>
+
+ * Add no_cache and retry_no_cache options.
+ * Work around pycurl dependency in setup.py.
+ * Don't set speed=0 on a new mirror that 404'd.
+ * Add a comprehensive error message to pycurl error 77.
+ * Don't crash on timedhosts parsing error.
+ * bump version to 3.10.2
+
2013-10-09 Zdenek Pavlas <zpavlas@redhat.com>
 * lots of enhancements and bugfixes
diff --git a/PKG-INFO b/PKG-INFO
index ef83002..b99ca86 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: urlgrabber
-Version: 3.10.1
+Version: 3.10.2
Summary: A high-level cross-protocol url-grabber
Home-page: http://urlgrabber.baseurl.org/
Author: Michael D. Stenner, Ryan Tomayko
diff --git a/setup.py b/setup.py
index bfa4a18..72859af 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,23 @@
# urlgrabber distutils setup
import re as _re
+import sys as _sys
+
+class _pycurlFake(object):
+ Curl = staticmethod(lambda: None)
+
+# Unfortunately __init__.py imports urlgrabber.grabber, which then imports
+# the pycurl package, and finally pycurl.Curl() is called at the top level
+# of the grabber module. We don't need pycurl nor pycurl.Curl() during
+# setup. Fake this module as already loaded so we don't need to have
+# pycurl installed at all. The developer may want to install it in a
+# later phase.
+_sys.modules["pycurl"] = _pycurlFake
+
+# We need urlgrabber package for some constants.
import urlgrabber as _urlgrabber
+del _sys.modules["pycurl"]
+
name = "urlgrabber"
description = "A high-level cross-protocol url-grabber"
long_description = _urlgrabber.__doc__
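The comment above describes a stub-module pattern: pre-register a placeholder
under the dependency's name in sys.modules so that importing urlgrabber (whose
grabber module calls pycurl.Curl() at import time) succeeds without pycurl
installed. A minimal standalone sketch of the same pattern, assuming (as the
comment states) that pycurl.Curl() is the only attribute touched at import
time:

    import sys

    class _FakePycurl(object):
        # Satisfy the single pycurl.Curl() call made while urlgrabber is imported.
        Curl = staticmethod(lambda: None)

    sys.modules["pycurl"] = _FakePycurl
    try:
        import urlgrabber            # only its docstring and constants are needed
    finally:
        del sys.modules["pycurl"]    # a later, real "import pycurl" loads the genuine module

    print(urlgrabber.__version__)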
diff --git a/test/test_mirror.py b/test/test_mirror.py
index 7f493d0..a6bb6cb 100644
--- a/test/test_mirror.py
+++ b/test/test_mirror.py
@@ -268,13 +268,14 @@ class ActionTests(TestCase):
self.assertEquals(self.g.calls, expected_calls)
self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs)
-import thread, socket
+import threading, socket
LOCALPORT = 'localhost', 2000
class HttpReplyCode(TestCase):
def setUp(self):
# start the server
self.exit = False
+ self.process = lambda data: None
def server():
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
@@ -282,7 +283,10 @@ class HttpReplyCode(TestCase):
while 1:
c, a = s.accept()
if self.exit: c.close(); break
- while not c.recv(4096).endswith('\r\n\r\n'): pass
+ data = ''
+ while not data.endswith('\r\n\r\n'):
+ data = c.recv(4096)
+ self.process(data)
c.sendall('HTTP/1.1 %d %s\r\n' % self.reply)
if self.content is not None:
c.sendall('Content-Length: %d\r\n\r\n' % len(self.content))
@@ -290,7 +294,8 @@ class HttpReplyCode(TestCase):
c.close()
s.close()
self.exit = False
- thread.start_new_thread(server, ())
+ self.thread = threading.Thread(target=server)
+ self.thread.start()
# create grabber and mirror group objects
def failure(obj):
@@ -305,7 +310,7 @@ class HttpReplyCode(TestCase):
self.exit = True
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect(LOCALPORT); s.close() # wake it up
- while self.exit: pass # poor man's join
+ self.thread.join()
def test_grab(self):
'tests the propagation of HTTP reply code'
@@ -336,6 +341,45 @@ class HttpReplyCode(TestCase):
data = self.mg.urlread('foo', range = (3, 5))
self.assertEquals(data, 'DE')
+ def test_retry_no_cache(self):
+ 'test bypassing proxy cache on failure'
+ def process(data):
+ if 'Pragma:no-cache' in data:
+ self.content = 'version2'
+ else:
+ self.content = 'version1'
+
+ def checkfunc_read(obj):
+ if obj.data == 'version1':
+ raise URLGrabError(-1, 'Outdated version of foo')
+
+ def checkfunc_grab(obj):
+ with open('foo') as f:
+ if f.read() == 'version1':
+ raise URLGrabError(-1, 'Outdated version of foo')
+
+ self.process = process
+ self.reply = 200, "OK"
+
+ opts = self.g.opts
+ opts.retry = 3
+ opts.retry_no_cache = True
+
+ # single
+ opts.checkfunc = checkfunc_read
+ try:
+ self.mg.urlread('foo')
+ except URLGrabError as e:
+ self.fail(str(e))
+
+ # multi
+ opts.checkfunc = checkfunc_grab
+ self.mg.urlgrab('foo', async=True)
+ try:
+ urlgrabber.grabber.parallel_wait()
+ except URLGrabError as e:
+ self.fail(str(e))
+
def suite():
tl = TestLoader()
return tl.loadTestsFromModule(sys.modules[__name__])
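The new test exercises retry_no_cache end to end; from an application's point
of view the same behaviour looks roughly like the following sketch (the URL
and the staleness check are illustrative, not taken from the patch):

    from urlgrabber.grabber import URLGrabber, URLGrabError

    def checkfunc(obj):
        # A failed check raises URLGrabError with a negative errno; with
        # retry_no_cache=True the retried request carries "Pragma: no-cache".
        if obj.data == 'version1':
            raise URLGrabError(-1, 'Outdated version of foo')

    g = URLGrabber(retry=3, retry_no_cache=True, checkfunc=checkfunc)
    data = g.urlread('http://example.com/foo')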
diff --git a/urlgrabber/__init__.py b/urlgrabber/__init__.py
index b3047b0..e1028c6 100644
--- a/urlgrabber/__init__.py
+++ b/urlgrabber/__init__.py
@@ -44,8 +44,8 @@ following features:
automatically switching mirrors if there is a failure.
"""
-__version__ = '3.10.1'
-__date__ = '2013/12/18'
+__version__ = '3.10.2'
+__date__ = '2017/02/02'
__author__ = 'Michael D. Stenner <mstenner@linux.duke.edu>, ' \
'Ryan Tomayko <rtomayko@naeblis.cx>' \
'Seth Vidal <skvidal@fedoraproject.org>' \
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index ef18d6a..074a82f 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -171,6 +171,12 @@ GENERAL ARGUMENTS (kwargs)
The libproxy code is only used if the proxies dictionary
does not provide any proxies.
+ no_cache = False
+
+ When True, server-side cache will be disabled for http and https
+ requests. This is equivalent to setting
+ http_headers = (('Pragma', 'no-cache'),)
+
prefix = None
a url prefix that will be prepended to all requested urls. For
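A hedged sketch of the equivalence stated for no_cache above (the URL and
filename are illustrative):

    from urlgrabber.grabber import URLGrabber

    g1 = URLGrabber(no_cache=True)
    g2 = URLGrabber(http_headers=(('Pragma', 'no-cache'),))
    # Both grabbers send "Pragma: no-cache" with http/https requests.
    g1.urlgrab('http://example.com/repomd.xml', 'repomd.xml')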
@@ -383,10 +389,11 @@ RETRY RELATED ARGUMENTS
identical to checkfunc, except for the attributes defined in the
CallbackObject instance. The attributes for failure_callback are:
- exception = the raised exception
- url = the url we're trying to fetch
- tries = the number of tries so far (including this one)
- retry = the value of the retry option
+ exception = the raised exception
+ url = the url we're trying to fetch
+ tries = the number of tries so far (including this one)
+ retry = the value of the retry option
+ retry_no_cache = the value of the retry_no_cache option
The callback is present primarily to inform the calling program of
the failure, but if it raises an exception (including the one it's
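A hedged sketch of a failure_callback that reads the attributes listed above,
including the new retry_no_cache flag (the logging format is illustrative):

    from urlgrabber.grabber import URLGrabber

    def on_failure(obj):
        print('try %s/%s for %s failed: %s (retry_no_cache=%s)' % (
            obj.tries, obj.retry, obj.url, obj.exception, obj.retry_no_cache))

    g = URLGrabber(retry=3, retry_no_cache=True, failure_callback=on_failure)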
@@ -431,6 +438,19 @@ RETRY RELATED ARGUMENTS
passed the same arguments, so you could use the same function for
both.
+ retry_no_cache = False
+
+ When True, automatically enable no_cache for future retries if
+ checkfunc performs an unsuccessful check.
+
+ This option is useful if your application expects a set of files
+ from the same server to form an atomic unit and you write your
+ checkfunc to ensure each file being downloaded belongs to such a
+ unit. If transparent proxy caching is in effect, the files can
+ become out-of-sync, disrupting the atomicity. Enabling this option
+ will prevent that, while ensuring that you still enjoy the benefits
+ of caching when possible.
+
BANDWIDTH THROTTLING
urlgrabber supports throttling via two values: throttle and
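A hedged sketch of the scenario described for retry_no_cache above: a
checkfunc validates each downloaded file against repository metadata fetched
earlier, so a stale copy served by a transparent proxy fails the check and is
re-fetched with "Pragma: no-cache" (the checksum, filenames and URL are
illustrative):

    import hashlib
    from urlgrabber.grabber import URLGrabber, URLGrabError

    expected_sha256 = 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'

    def checkfunc(obj):
        with open('foo.rpm', 'rb') as f:
            if hashlib.sha256(f.read()).hexdigest() != expected_sha256:
                raise URLGrabError(-1, 'foo.rpm does not match repo metadata')

    g = URLGrabber(retry=3, retry_no_cache=True, checkfunc=checkfunc)
    g.urlgrab('http://example.com/foo.rpm', 'foo.rpm')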
@@ -1001,6 +1021,8 @@ class URLGrabberOptions:
self.half_life = 30*24*60*60 # 30 days
self.default_speed = 500e3 # 500 kBps
self.ftp_disable_epsv = False
+ self.no_cache = False
+ self.retry_no_cache = False
def __repr__(self):
return self.format()
@@ -1077,7 +1099,8 @@ class URLGrabber(object):
if callback:
if DEBUG: DEBUG.info('calling callback: %s', callback)
obj = CallbackObject(exception=exception, url=args[0],
- tries=tries, retry=opts.retry)
+ tries=tries, retry=opts.retry,
+ retry_no_cache=opts.retry_no_cache)
_run_callback(callback, obj)
if (opts.retry is None) or (tries == opts.retry):
@@ -1089,6 +1112,8 @@ class URLGrabber(object):
if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
retrycode, opts.retrycodes)
raise
+ if retrycode is not None and retrycode < 0 and opts.retry_no_cache:
+ opts.no_cache = True
def urlopen(self, url, opts=None, **kwargs):
"""open the url and return a file object
@@ -1439,11 +1464,15 @@ class PyCurlFileObject(object):
self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)
#headers:
- if opts.http_headers and self.scheme in ('http', 'https'):
+ if self.scheme in ('http', 'https'):
headers = []
- for (tag, content) in opts.http_headers:
- headers.append('%s:%s' % (tag, content))
- self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
+ if opts.http_headers is not None:
+ for (tag, content) in opts.http_headers:
+ headers.append('%s:%s' % (tag, content))
+ if opts.no_cache:
+ headers.append('Pragma:no-cache')
+ if headers:
+ self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
# ranges:
if opts.range or opts.reget:
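A hedged sketch of the header list the block above now assembles for
pycurl.HTTPHEADER (the option values are illustrative):

    http_headers = (('Accept-Encoding', 'gzip'),)   # what opts.http_headers might hold
    no_cache = True                                 # opts.no_cache

    headers = []
    for (tag, content) in http_headers or ():
        headers.append('%s:%s' % (tag, content))
    if no_cache:
        headers.append('Pragma:no-cache')
    # headers == ['Accept-Encoding:gzip', 'Pragma:no-cache'] and is only passed
    # to curl_obj.setopt(pycurl.HTTPHEADER, headers) when non-empty.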
@@ -1569,6 +1598,7 @@ class PyCurlFileObject(object):
67 : _("Authentication failure"),
70 : _("Out of disk space on server"),
73 : _("Remove file exists"),
+ 77 : _("Problem with the SSL CA cert (path? access rights?)"),
}
errstr = str(e.args[1]) or pyerr2str.get(errcode, '<Unknown>')
if code and not 200 <= code <= 299:
@@ -2064,7 +2094,8 @@ class _ExternalDownloader:
'ssl_key_pass',
'ssl_verify_peer', 'ssl_verify_host',
'size', 'max_header_size', 'ip_resolve',
- 'ftp_disable_epsv'
+ 'ftp_disable_epsv',
+ 'no_cache',
)
def start(self, opts):
@@ -2246,6 +2277,8 @@ def parallel_wait(meter=None):
except URLGrabError, ug_err:
retry = 0 # no retries
if opts.tries < retry and ug_err.errno in opts.retrycodes:
+ if ug_err.errno < 0 and opts.retry_no_cache:
+ opts.no_cache = True
start(opts, opts.tries + 1) # simple retry
continue
@@ -2376,8 +2409,11 @@ class _TH:
try:
now = int(time.time())
for line in open(filename):
- host, speed, fail, ts = line.rsplit(' ', 3)
- _TH.hosts[host] = int(speed), int(fail), min(int(ts), now)
+ try:
+ host, speed, fail, ts = line.rsplit(' ', 3)
+ _TH.hosts[host] = int(speed), int(fail), min(int(ts), now)
+ except ValueError:
+ if DEBUG: DEBUG.info('Error parsing timedhosts: line "%s"', line)
except IOError: pass
_TH.dirty = False
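A hedged sketch of what the guarded parsing above tolerates; the timedhosts
file keeps one "host speed fail timestamp" line per host, and a malformed line
is now skipped instead of aborting the whole load (the values are
illustrative):

    hosts = {}
    now = 1486036800
    lines = ['example.com 512000 0 1485992000\n',   # well-formed
             'garbage without numeric fields\n']    # previously raised ValueError
    for line in lines:
        try:
            host, speed, fail, ts = line.rsplit(' ', 3)
            hosts[host] = int(speed), int(fail), min(int(ts), now)
        except ValueError:
            pass  # skip the bad line, keep the rest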
@@ -2418,6 +2454,7 @@ class _TH:
speed = (k1 * speed + k2 * dl_size / dl_time) / (k1 + k2)
fail = 0
elif getattr(ug_err, 'code', None) == 404:
+ if not ts: return # 1st update, avoid speed=0
fail = 0 # alive, at least
else:
fail += 1 # seems dead