[spiegel] Test format video URLs for 404 (Closes #4579)

This commit is contained in:
Sergey M․ 2015-01-14 20:27:14 +06:00
parent f2cbc96c3e
commit e92d4a11f5

View File

@@ -4,7 +4,14 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_urlparse from ..compat import (
compat_urlparse,
compat_HTTPError,
)
from ..utils import (
HEADRequest,
ExtractorError,
)
from .spiegeltv import SpiegeltvIE from .spiegeltv import SpiegeltvIE
@@ -60,21 +67,31 @@ def _real_extract(self, url):
xml_url = base_url + video_id + '.xml' xml_url = base_url + video_id + '.xml'
idoc = self._download_xml(xml_url, video_id) idoc = self._download_xml(xml_url, video_id)
formats = [ formats = []
{ for n in list(idoc):
'format_id': n.tag.rpartition('type')[2], if n.tag.startswith('type') and n.tag != 'type6':
'url': base_url + n.find('./filename').text, format_id = n.tag.rpartition('type')[2]
'width': int(n.find('./width').text), video_url = base_url + n.find('./filename').text
'height': int(n.find('./height').text), # Test video URLs beforehand as some of them are invalid
'abr': int(n.find('./audiobitrate').text), try:
'vbr': int(n.find('./videobitrate').text), self._request_webpage(
'vcodec': n.find('./codec').text, HEADRequest(video_url), video_id,
'acodec': 'MP4A', 'Checking %s video URL' % format_id)
} except ExtractorError as e:
for n in list(idoc) if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
# Blacklist type 6, it's extremely LQ and not available on the same server self.report_warning(
if n.tag.startswith('type') and n.tag != 'type6' '%s video URL is invalid, skipping' % format_id, video_id)
] continue
formats.append({
'format_id': format_id,
'url': video_url,
'width': int(n.find('./width').text),
'height': int(n.find('./height').text),
'abr': int(n.find('./audiobitrate').text),
'vbr': int(n.find('./videobitrate').text),
'vcodec': n.find('./codec').text,
'acodec': 'MP4A',
})
duration = float(idoc[0].findall('./duration')[0].text) duration = float(idoc[0].findall('./duration')[0].text)
self._sort_formats(formats) self._sort_formats(formats)