[extractor/common] Introduce expected_status for conveniently accepting failed HTTP requests

Useful when some non-success (non-2xx) HTTP status codes should be considered normal. Previously this required manually catching the corresponding exceptions and reading the response.
This commit is contained in:
Sergey M․ 2018-06-18 04:01:48 +07:00
parent 075a13d3e9
commit d391b7e23d
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D

View File

@ -19,6 +19,7 @@ from ..compat import (
compat_cookies, compat_cookies,
compat_etree_fromstring, compat_etree_fromstring,
compat_getpass, compat_getpass,
compat_integer_types,
compat_http_client, compat_http_client,
compat_os_name, compat_os_name,
compat_str, compat_str,
@ -548,8 +549,26 @@ class InfoExtractor(object):
def IE_NAME(self): def IE_NAME(self):
return compat_str(type(self).__name__[:-2]) return compat_str(type(self).__name__[:-2])
@staticmethod
def __can_accept_status_code(err, expected_status):
    """
    Tell whether a failed HTTP response should be treated as acceptable.

    Arguments:
    err             -- the compat_urllib_error.HTTPError raised for the
                       request
    expected_status -- None (accept nothing), an integer status code,
                       a list/tuple/set/frozenset of status codes, or
                       a callable taking the status code and returning
                       True when it should be accepted

    Return True when err.code matches expected_status, False otherwise.
    Raise TypeError when expected_status is of an unsupported type.
    """
    # Internal invariant: only HTTP errors are expected to reach this helper.
    assert isinstance(err, compat_urllib_error.HTTPError)
    if expected_status is None:
        return False
    if isinstance(expected_status, compat_integer_types):
        return err.code == expected_status
    if isinstance(expected_status, (list, tuple, set, frozenset)):
        return err.code in expected_status
    if callable(expected_status):
        # Only an explicit True accepts the code; other truthy results do not.
        return expected_status(err.code) is True
    # Raise explicitly instead of `assert False`: assert statements are
    # stripped under `python -O`, which would silently turn an invalid
    # argument into "status not accepted".
    raise TypeError(
        'expected_status must be None, an integer, a list/tuple/set of '
        'integers or a callable, not %s' % type(expected_status).__name__)
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
"""
Return the response handle.
See _download_webpage docstring for arguments specification.
"""
if note is None: if note is None:
self.report_download_webpage(video_id) self.report_download_webpage(video_id)
elif note is not False: elif note is not False:
@ -578,6 +597,10 @@ class InfoExtractor(object):
try: try:
return self._downloader.urlopen(url_or_request) return self._downloader.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
if isinstance(err, compat_urllib_error.HTTPError):
if self.__can_accept_status_code(err, expected_status):
return err.fp
if errnote is False: if errnote is False:
return False return False
if errnote is None: if errnote is None:
@ -590,13 +613,17 @@ class InfoExtractor(object):
self._downloader.report_warning(errmsg) self._downloader.report_warning(errmsg)
return False return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}): def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
""" Returns a tuple (page content as string, URL handle) """ """
Return a tuple (page content as string, URL handle).
See _download_webpage docstring for arguments specification.
"""
# Strip hashes from the URL (#1038) # Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)): if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0] url_or_request = url_or_request.partition('#')[0]
urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
if urlh is False: if urlh is False:
assert not fatal assert not fatal
return False return False
@ -685,13 +712,52 @@ class InfoExtractor(object):
return content return content
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}): def _download_webpage(
""" Returns the data of the page as a string """ self, url_or_request, video_id, note=None, errnote=None,
fatal=True, tries=1, timeout=5, encoding=None, data=None,
headers={}, query={}, expected_status=None):
"""
Return the data of the page as a string.
Arguments:
url_or_request -- plain text URL as a string or
a compat_urllib_request.Request object
video_id -- Video/playlist/item identifier (string)
Keyword arguments:
note -- note printed before downloading (string)
errnote -- note printed in case of an error (string)
fatal -- flag denoting whether error should be considered fatal,
i.e. whether it should cause ExtractionError to be raised,
otherwise a warning will be reported and extraction continued
tries -- number of tries
timeout -- sleep interval between tries
encoding -- encoding for a page content decoding, guessed automatically
when not explicitly specified
data -- POST data (bytes)
headers -- HTTP headers (dict)
query -- URL query (dict)
expected_status -- allows to accept failed HTTP requests (non 2xx
status code) by explicitly specifying a set of accepted status
codes. Can be any of the following entities:
- an integer type specifying an exact failed status code to
accept
- a list or a tuple of integer types specifying a list of
failed status codes to accept
- a callable accepting an actual failed status code and
returning True if it should be accepted
Note that this argument does not affect success status codes (2xx)
which are always accepted.
"""
success = False success = False
try_count = 0 try_count = 0
while success is False: while success is False:
try: try:
res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query) res = self._download_webpage_handle(
url_or_request, video_id, note, errnote, fatal,
encoding=encoding, data=data, headers=headers, query=query,
expected_status=expected_status)
success = True success = True
except compat_http_client.IncompleteRead as e: except compat_http_client.IncompleteRead as e:
try_count += 1 try_count += 1
@ -707,11 +773,17 @@ class InfoExtractor(object):
def _download_xml_handle( def _download_xml_handle(
self, url_or_request, video_id, note='Downloading XML', self, url_or_request, video_id, note='Downloading XML',
errnote='Unable to download XML', transform_source=None, errnote='Unable to download XML', transform_source=None,
fatal=True, encoding=None, data=None, headers={}, query={}): fatal=True, encoding=None, data=None, headers={}, query={},
"""Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)""" expected_status=None):
"""
Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
See _download_webpage docstring for arguments specification.
"""
res = self._download_webpage_handle( res = self._download_webpage_handle(
url_or_request, video_id, note, errnote, fatal=fatal, url_or_request, video_id, note, errnote, fatal=fatal,
encoding=encoding, data=data, headers=headers, query=query) encoding=encoding, data=data, headers=headers, query=query,
expected_status=expected_status)
if res is False: if res is False:
return res return res
xml_string, urlh = res xml_string, urlh = res
@ -719,15 +791,21 @@ class InfoExtractor(object):
xml_string, video_id, transform_source=transform_source, xml_string, video_id, transform_source=transform_source,
fatal=fatal), urlh fatal=fatal), urlh
def _download_xml(
        self, url_or_request, video_id,
        note='Downloading XML', errnote='Unable to download XML',
        transform_source=None, fatal=True, encoding=None,
        data=None, headers={}, query={}, expected_status=None):
    """
    Return the xml as an xml.etree.ElementTree.Element.

    Thin wrapper around _download_xml_handle that drops the URL handle
    from the result. False is passed through unchanged when
    _download_xml_handle returns False.

    See _download_webpage docstring for arguments specification.
    """
    xml_and_handle = self._download_xml_handle(
        url_or_request, video_id,
        note=note, errnote=errnote,
        transform_source=transform_source,
        fatal=fatal, encoding=encoding,
        data=data, headers=headers, query=query,
        expected_status=expected_status)
    if xml_and_handle is False:
        return xml_and_handle
    # The handle variant yields (xml, urlh); only the xml part is wanted.
    return xml_and_handle[0]
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
@ -745,11 +823,17 @@ class InfoExtractor(object):
def _download_json_handle( def _download_json_handle(
self, url_or_request, video_id, note='Downloading JSON metadata', self, url_or_request, video_id, note='Downloading JSON metadata',
errnote='Unable to download JSON metadata', transform_source=None, errnote='Unable to download JSON metadata', transform_source=None,
fatal=True, encoding=None, data=None, headers={}, query={}): fatal=True, encoding=None, data=None, headers={}, query={},
"""Return a tuple (JSON object, URL handle)""" expected_status=None):
"""
Return a tuple (JSON object, URL handle).
See _download_webpage docstring for arguments specification.
"""
res = self._download_webpage_handle( res = self._download_webpage_handle(
url_or_request, video_id, note, errnote, fatal=fatal, url_or_request, video_id, note, errnote, fatal=fatal,
encoding=encoding, data=data, headers=headers, query=query) encoding=encoding, data=data, headers=headers, query=query,
expected_status=expected_status)
if res is False: if res is False:
return res return res
json_string, urlh = res json_string, urlh = res
@ -760,11 +844,18 @@ class InfoExtractor(object):
def _download_json(
        self, url_or_request, video_id, note='Downloading JSON metadata',
        errnote='Unable to download JSON metadata', transform_source=None,
        fatal=True, encoding=None, data=None, headers={}, query={},
        expected_status=None):
    """
    Return the JSON object as a dict.

    Thin wrapper around _download_json_handle that drops the URL handle
    from the result. False is passed through unchanged when
    _download_json_handle returns False.

    See _download_webpage docstring for arguments specification.
    """
    json_and_handle = self._download_json_handle(
        url_or_request, video_id,
        note=note, errnote=errnote,
        transform_source=transform_source,
        fatal=fatal, encoding=encoding,
        data=data, headers=headers, query=query,
        expected_status=expected_status)
    if json_and_handle is False:
        return json_and_handle
    # The handle variant yields (json, urlh); only the JSON part is wanted.
    return json_and_handle[0]
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):