mirror of https://github.com/python/cpython.git
Issue10050 - urlretrieve uses newer urlopen. The reporthook of urlretrieve takes the block number, the block read size, and the file size.
This commit is contained in:
parent
a2251aadaa
commit
e24f96a059
|
@ -56,6 +56,13 @@ The simplest way to use urllib.request is as follows::
|
||||||
response = urllib.request.urlopen('http://python.org/')
|
response = urllib.request.urlopen('http://python.org/')
|
||||||
html = response.read()
|
html = response.read()
|
||||||
|
|
||||||
|
If you wish to retrieve a resource via URL and store it in a temporary location,
|
||||||
|
you can do so via the :func:`urlretrieve` function::
|
||||||
|
|
||||||
|
import urllib.request
|
||||||
|
local_filename, headers = urllib.request.urlretrieve('http://python.org/')
|
||||||
|
html = open(local_filename)
|
||||||
|
|
||||||
Many uses of urllib will be that simple (note that instead of an 'http:' URL we
|
Many uses of urllib will be that simple (note that instead of an 'http:' URL we
|
||||||
could have used a URL starting with 'ftp:', 'file:', etc.). However, it's the
|
could have used a URL starting with 'ftp:', 'file:', etc.). However, it's the
|
||||||
purpose of this tutorial to explain the more complicated cases, concentrating on
|
purpose of this tutorial to explain the more complicated cases, concentrating on
|
||||||
|
|
|
@ -1124,16 +1124,14 @@ The following functions and classes are ported from the Python 2 module
|
||||||
``urllib`` (as opposed to ``urllib2``). They might become deprecated at
|
``urllib`` (as opposed to ``urllib2``). They might become deprecated at
|
||||||
some point in the future.
|
some point in the future.
|
||||||
|
|
||||||
|
|
||||||
.. function:: urlretrieve(url, filename=None, reporthook=None, data=None)
|
.. function:: urlretrieve(url, filename=None, reporthook=None, data=None)
|
||||||
|
|
||||||
Copy a network object denoted by a URL to a local file, if necessary. If the URL
|
Copy a network object denoted by a URL to a local file. If the URL
|
||||||
points to a local file, or a valid cached copy of the object exists, the object
|
points to a local file, the object will not be copied unless filename is supplied.
|
||||||
is not copied. Return a tuple ``(filename, headers)`` where *filename* is the
|
Return a tuple ``(filename, headers)`` where *filename* is the
|
||||||
local file name under which the object can be found, and *headers* is whatever
|
local file name under which the object can be found, and *headers* is whatever
|
||||||
the :meth:`info` method of the object returned by :func:`urlopen` returned (for
|
the :meth:`info` method of the object returned by :func:`urlopen` returned (for
|
||||||
a remote object, possibly cached). Exceptions are the same as for
|
a remote object). Exceptions are the same as for :func:`urlopen`.
|
||||||
:func:`urlopen`.
|
|
||||||
|
|
||||||
The second argument, if present, specifies the file location to copy to (if
|
The second argument, if present, specifies the file location to copy to (if
|
||||||
absent, the location will be a tempfile with a generated name). The third
|
absent, the location will be a tempfile with a generated name). The third
|
||||||
|
@ -1144,11 +1142,18 @@ some point in the future.
|
||||||
third argument may be ``-1`` on older FTP servers which do not return a file
|
third argument may be ``-1`` on older FTP servers which do not return a file
|
||||||
size in response to a retrieval request.
|
size in response to a retrieval request.
|
||||||
|
|
||||||
|
The following example illustrates the most common usage scenario::
|
||||||
|
|
||||||
|
>>> import urllib.request
|
||||||
|
>>> local_filename, headers = urllib.request.urlretrieve('http://python.org/')
|
||||||
|
>>> html = open(local_filename)
|
||||||
|
>>> html.close()
|
||||||
|
|
||||||
If the *url* uses the :file:`http:` scheme identifier, the optional *data*
|
If the *url* uses the :file:`http:` scheme identifier, the optional *data*
|
||||||
argument may be given to specify a ``POST`` request (normally the request type
|
argument may be given to specify a ``POST`` request (normally the request
|
||||||
is ``GET``). The *data* argument must be in standard
|
type is ``GET``). The *data* argument must be in standard
|
||||||
:mimetype:`application/x-www-form-urlencoded` format; see the :func:`urlencode`
|
:mimetype:`application/x-www-form-urlencoded` format; see the
|
||||||
function below.
|
:func:`urlencode` function below.
|
||||||
|
|
||||||
:func:`urlretrieve` will raise :exc:`ContentTooShortError` when it detects that
|
:func:`urlretrieve` will raise :exc:`ContentTooShortError` when it detects that
|
||||||
the amount of data available was less than the expected amount (which is the
|
the amount of data available was less than the expected amount (which is the
|
||||||
|
@ -1156,20 +1161,20 @@ some point in the future.
|
||||||
the download is interrupted.
|
the download is interrupted.
|
||||||
|
|
||||||
The *Content-Length* is treated as a lower bound: if there's more data to read,
|
The *Content-Length* is treated as a lower bound: if there's more data to read,
|
||||||
:func:`urlretrieve` reads more data, but if less data is available, it raises
|
urlretrieve reads more data, but if less data is available, it raises the
|
||||||
the exception.
|
exception.
|
||||||
|
|
||||||
You can still retrieve the downloaded data in this case; it is stored in the
|
You can still retrieve the downloaded data in this case; it is stored in the
|
||||||
:attr:`content` attribute of the exception instance.
|
:attr:`content` attribute of the exception instance.
|
||||||
|
|
||||||
If no *Content-Length* header was supplied, :func:`urlretrieve` can not check
|
If no *Content-Length* header was supplied, urlretrieve can not check the size
|
||||||
the size of the data it has downloaded, and just returns it. In this case
|
of the data it has downloaded, and just returns it. In this case you just have
|
||||||
you just have to assume that the download was successful.
|
to assume that the download was successful.
|
||||||
|
|
||||||
.. function:: urlcleanup()
|
.. function:: urlcleanup()
|
||||||
|
|
||||||
Clear the cache that may have been built up by previous calls to
|
Cleans up temporary files that may have been left behind by previous
|
||||||
:func:`urlretrieve`.
|
calls to :func:`urlretrieve`.
|
||||||
|
|
||||||
.. class:: URLopener(proxies=None, **x509)
|
.. class:: URLopener(proxies=None, **x509)
|
||||||
|
|
||||||
|
|
|
@ -384,11 +384,11 @@ def test_copy(self):
|
||||||
|
|
||||||
def test_reporthook(self):
|
def test_reporthook(self):
|
||||||
# Make sure that the reporthook works.
|
# Make sure that the reporthook works.
|
||||||
def hooktester(count, block_size, total_size, count_holder=[0]):
|
def hooktester(block_count, block_read_size, file_size, count_holder=[0]):
|
||||||
self.assertIsInstance(count, int)
|
self.assertIsInstance(block_count, int)
|
||||||
self.assertIsInstance(block_size, int)
|
self.assertIsInstance(block_read_size, int)
|
||||||
self.assertIsInstance(total_size, int)
|
self.assertIsInstance(file_size, int)
|
||||||
self.assertEqual(count, count_holder[0])
|
self.assertEqual(block_count, count_holder[0])
|
||||||
count_holder[0] = count_holder[0] + 1
|
count_holder[0] = count_holder[0] + 1
|
||||||
second_temp = "%s.2" % support.TESTFN
|
second_temp = "%s.2" % support.TESTFN
|
||||||
self.registerFileForCleanUp(second_temp)
|
self.registerFileForCleanUp(second_temp)
|
||||||
|
@ -399,8 +399,8 @@ def hooktester(count, block_size, total_size, count_holder=[0]):
|
||||||
def test_reporthook_0_bytes(self):
|
def test_reporthook_0_bytes(self):
|
||||||
# Test on zero length file. Should call reporthook only 1 time.
|
# Test on zero length file. Should call reporthook only 1 time.
|
||||||
report = []
|
report = []
|
||||||
def hooktester(count, block_size, total_size, _report=report):
|
def hooktester(block_count, block_read_size, file_size, _report=report):
|
||||||
_report.append((count, block_size, total_size))
|
_report.append((block_count, block_read_size, file_size))
|
||||||
srcFileName = self.createNewTempFile()
|
srcFileName = self.createNewTempFile()
|
||||||
urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName),
|
urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName),
|
||||||
support.TESTFN, hooktester)
|
support.TESTFN, hooktester)
|
||||||
|
@ -410,31 +410,31 @@ def hooktester(count, block_size, total_size, _report=report):
|
||||||
def test_reporthook_5_bytes(self):
|
def test_reporthook_5_bytes(self):
|
||||||
# Test on 5 byte file. Should call reporthook only 2 times (once when
|
# Test on 5 byte file. Should call reporthook only 2 times (once when
|
||||||
# the "network connection" is established and once when the block is
|
# the "network connection" is established and once when the block is
|
||||||
# read). Since the block size is 8192 bytes, only one block read is
|
# read).
|
||||||
# required to read the entire file.
|
|
||||||
report = []
|
report = []
|
||||||
def hooktester(count, block_size, total_size, _report=report):
|
def hooktester(block_count, block_read_size, file_size, _report=report):
|
||||||
_report.append((count, block_size, total_size))
|
_report.append((block_count, block_read_size, file_size))
|
||||||
srcFileName = self.createNewTempFile(b"x" * 5)
|
srcFileName = self.createNewTempFile(b"x" * 5)
|
||||||
urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName),
|
urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName),
|
||||||
support.TESTFN, hooktester)
|
support.TESTFN, hooktester)
|
||||||
self.assertEqual(len(report), 2)
|
self.assertEqual(len(report), 2)
|
||||||
self.assertEqual(report[0][1], 8192)
|
self.assertEqual(report[0][1], 0)
|
||||||
self.assertEqual(report[0][2], 5)
|
self.assertEqual(report[1][1], 5)
|
||||||
|
|
||||||
def test_reporthook_8193_bytes(self):
|
def test_reporthook_8193_bytes(self):
|
||||||
# Test on 8193 byte file. Should call reporthook only 3 times (once
|
# Test on 8193 byte file. Should call reporthook only 3 times (once
|
||||||
# when the "network connection" is established, once for the next 8192
|
# when the "network connection" is established, once for the next 8192
|
||||||
# bytes, and once for the last byte).
|
# bytes, and once for the last byte).
|
||||||
report = []
|
report = []
|
||||||
def hooktester(count, block_size, total_size, _report=report):
|
def hooktester(block_count, block_read_size, file_size, _report=report):
|
||||||
_report.append((count, block_size, total_size))
|
_report.append((block_count, block_read_size, file_size))
|
||||||
srcFileName = self.createNewTempFile(b"x" * 8193)
|
srcFileName = self.createNewTempFile(b"x" * 8193)
|
||||||
urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName),
|
urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName),
|
||||||
support.TESTFN, hooktester)
|
support.TESTFN, hooktester)
|
||||||
self.assertEqual(len(report), 3)
|
self.assertEqual(len(report), 3)
|
||||||
self.assertEqual(report[0][1], 8192)
|
self.assertEqual(report[0][1], 0)
|
||||||
self.assertEqual(report[0][2], 8193)
|
self.assertEqual(report[1][1], 8192)
|
||||||
|
self.assertEqual(report[2][1], 1)
|
||||||
|
|
||||||
|
|
||||||
class urlretrieve_HttpTests(unittest.TestCase, FakeHTTPMixin):
|
class urlretrieve_HttpTests(unittest.TestCase, FakeHTTPMixin):
|
||||||
|
|
|
@ -94,6 +94,9 @@
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import collections
|
import collections
|
||||||
|
import tempfile
|
||||||
|
import contextlib
|
||||||
|
|
||||||
|
|
||||||
from urllib.error import URLError, HTTPError, ContentTooShortError
|
from urllib.error import URLError, HTTPError, ContentTooShortError
|
||||||
from urllib.parse import (
|
from urllib.parse import (
|
||||||
|
@ -156,17 +159,78 @@ def install_opener(opener):
|
||||||
global _opener
|
global _opener
|
||||||
_opener = opener
|
_opener = opener
|
||||||
|
|
||||||
# TODO(jhylton): Make this work with the same global opener.
|
_url_tempfiles = []
|
||||||
_urlopener = None
|
|
||||||
def urlretrieve(url, filename=None, reporthook=None, data=None):
|
def urlretrieve(url, filename=None, reporthook=None, data=None):
|
||||||
global _urlopener
|
"""
|
||||||
if not _urlopener:
|
Retrieve a URL into a temporary location on disk.
|
||||||
_urlopener = FancyURLopener()
|
|
||||||
return _urlopener.retrieve(url, filename, reporthook, data)
|
Requires a URL argument. If a filename is passed, it is used as
|
||||||
|
the temporary file location. The reporthook argument should be
|
||||||
|
a callable that accepts a block number, a read size, and the
|
||||||
|
total file size of the URL target. The data argument should be
|
||||||
|
valid URL encoded data.
|
||||||
|
|
||||||
|
If a filename is passed and the URL points to a local resource,
|
||||||
|
the result is a copy from local file to new file.
|
||||||
|
|
||||||
|
Returns a tuple containing the path to the newly created
|
||||||
|
data file as well as the resulting HTTPMessage object.
|
||||||
|
"""
|
||||||
|
url_type, path = splittype(url)
|
||||||
|
|
||||||
|
with contextlib.closing(urlopen(url, data)) as fp:
|
||||||
|
headers = fp.info()
|
||||||
|
|
||||||
|
# Just return the local path and the "headers" for file://
|
||||||
|
# URLs. No sense in performing a copy unless requested.
|
||||||
|
if url_type == "file" and not filename:
|
||||||
|
return os.path.normpath(path), headers
|
||||||
|
|
||||||
|
# Handle temporary file setup.
|
||||||
|
if filename:
|
||||||
|
tfp = open(filename, 'wb')
|
||||||
|
else:
|
||||||
|
tfp = tempfile.NamedTemporaryFile(delete=False)
|
||||||
|
filename = tfp.name
|
||||||
|
_url_tempfiles.append(filename)
|
||||||
|
|
||||||
|
with tfp:
|
||||||
|
result = filename, headers
|
||||||
|
bs = 1024*8
|
||||||
|
size = -1
|
||||||
|
read = 0
|
||||||
|
blocknum = 0
|
||||||
|
if "content-length" in headers:
|
||||||
|
size = int(headers["Content-Length"])
|
||||||
|
|
||||||
|
if reporthook:
|
||||||
|
reporthook(blocknum, 0, size)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
block = fp.read(bs)
|
||||||
|
if not block:
|
||||||
|
break
|
||||||
|
read += len(block)
|
||||||
|
tfp.write(block)
|
||||||
|
blocknum += 1
|
||||||
|
if reporthook:
|
||||||
|
reporthook(blocknum, len(block), size)
|
||||||
|
|
||||||
|
if size >= 0 and read < size:
|
||||||
|
raise ContentTooShortError(
|
||||||
|
"retrieval incomplete: got only %i out of %i bytes"
|
||||||
|
% (read, size), result)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
def urlcleanup():
|
def urlcleanup():
|
||||||
if _urlopener:
|
for temp_file in _url_tempfiles:
|
||||||
_urlopener.cleanup()
|
try:
|
||||||
|
os.unlink(temp_file)
|
||||||
|
except EnvironmentError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
del _url_tempfiles[:]
|
||||||
global _opener
|
global _opener
|
||||||
if _opener:
|
if _opener:
|
||||||
_opener = None
|
_opener = None
|
||||||
|
|
Loading…
Reference in New Issue