[npo] Generalize playlist extractors

This commit is contained in:
Sergey M․ 2016-09-25 21:58:17 +07:00
parent e590b7ff9e
commit a3d8b38168
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D

View File

@ -438,9 +438,29 @@ def _real_extract(self, url):
} }
class VPROIE(NPOIE): class NPOPlaylistBaseIE(NPOIE):
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
entries = [
self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id)
for video_id in re.findall(self._PLAYLIST_ENTRY_RE, webpage)
]
playlist_title = self._html_search_regex(
self._PLAYLIST_TITLE_RE, webpage, 'playlist title',
default=None) or self._og_search_title(webpage)
return self.playlist_result(entries, playlist_id, playlist_title)
class VPROIE(NPOPlaylistBaseIE):
IE_NAME = 'vpro' IE_NAME = 'vpro'
_VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P<id>[^/]+)\.html' _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P<id>[^/]+)\.html'
_PLAYLIST_TITLE_RE = r'<title>\s*([^>]+?)\s*-\s*Teledoc\s*-\s*VPRO\s*</title>'
_PLAYLIST_ENTRY_RE = r'data-media-id="([^"]+)"'
_TESTS = [ _TESTS = [
{ {
@ -473,48 +493,17 @@ class VPROIE(NPOIE):
} }
] ]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id) class WNLIE(NPOPlaylistBaseIE):
entries = [
self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id)
for video_id in re.findall(r'data-media-id="([^"]+)"', webpage)
]
playlist_title = self._search_regex(
r'<title>\s*([^>]+?)\s*-\s*Teledoc\s*-\s*VPRO\s*</title>',
webpage, 'playlist title', default=None) or self._og_search_title(webpage)
return self.playlist_result(entries, playlist_id, playlist_title)
class WNLIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P<id>[^/]+)__\d+' _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P<id>[^/]+)__\d+'
_PLAYLIST_TITLE_RE = r'(?s)<h1[^>]+class="subject"[^>]*>(.+?)</h1>'
_PLAYLIST_ENTRY_RE = r'<a[^>]+href="([^"]+)"[^>]+class="js-mid"[^>]*>Deel \d+'
_TEST = { _TESTS = [{
'url': 'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515', 'url': 'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515',
'info_dict': { 'info_dict': {
'id': 'vandaag-de-dag-6-mei', 'id': 'vandaag-de-dag-6-mei',
'title': 'Vandaag de Dag 6 mei', 'title': 'Vandaag de Dag 6 mei',
}, },
'playlist_count': 4, 'playlist_count': 4,
} }]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
entries = [
self.url_result('npo:%s' % video_id, 'NPO')
for video_id, part in re.findall(
r'<a[^>]+href="([^"]+)"[^>]+class="js-mid"[^>]*>(Deel \d+)', webpage)
]
playlist_title = self._html_search_regex(
r'(?s)<h1[^>]+class="subject"[^>]*>(.+?)</h1>',
webpage, 'playlist title')
return self.playlist_result(entries, playlist_id, playlist_title)