[ie/ARD] Overhaul extractors (#8878)

Closes #8731, Closes #6784, Closes #2366, Closes #2975, Closes #8760
Authored by: seproDev
This commit is contained in:
sepro 2023-12-30 21:44:32 +01:00 committed by GitHub
parent 225cf2b830
commit 5f009a094f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 250 additions and 375 deletions

View File

@ -134,8 +134,8 @@
from .arkena import ArkenaIE
from .ard import (
ARDBetaMediathekIE,
ARDMediathekCollectionIE,
ARDIE,
ARDMediathekIE,
)
from .arte import (
ArteTVIE,

View File

@ -1,24 +1,23 @@
import json
import re
from functools import partial
from .common import InfoExtractor
from .generic import GenericIE
from ..utils import (
OnDemandPagedList,
determine_ext,
ExtractorError,
int_or_none,
join_nonempty,
make_archive_id,
parse_duration,
qualities,
parse_iso8601,
remove_start,
str_or_none,
try_get,
unified_strdate,
unified_timestamp,
update_url,
update_url_query,
url_or_none,
xpath_text,
)
from ..compat import compat_etree_fromstring
from ..utils.traversal import traverse_obj
class ARDMediathekBaseIE(InfoExtractor):
@ -61,45 +60,6 @@ def _parse_media_info(self, media_info, video_id, fsk):
'subtitles': subtitles,
}
def _ARD_extract_episode_info(self, title):
"""Try to extract season/episode data from the title."""
res = {}
if not title:
return res
for pattern in [
# Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
# from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
# E.g.: title="Fritjof aus Norwegen (2) (AD)"
# from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
# E.g.: title="Folge 25/42: Symmetrie"
# from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
# E.g.: title="Folge 1063 - Vertrauen"
# from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
]:
m = re.match(pattern, title)
if m:
groupdict = m.groupdict()
res['season_number'] = int_or_none(groupdict.get('season_number'))
res['episode_number'] = int_or_none(groupdict.get('episode_number'))
res['episode'] = str_or_none(groupdict.get('episode'))
# Build the episode title by removing numeric episode information:
if groupdict.get('ep_info') and not res['episode']:
res['episode'] = str_or_none(
title.replace(groupdict.get('ep_info'), ''))
if res['episode']:
res['episode'] = res['episode'].strip()
break
# As a fallback use the whole title as the episode name:
if not res.get('episode'):
res['episode'] = title.strip()
return res
def _extract_formats(self, media_info, video_id):
type_ = media_info.get('_type')
media_array = media_info.get('_mediaArray', [])
@ -155,138 +115,6 @@ def _extract_formats(self, media_info, video_id):
return formats
class ARDMediathekIE(ARDMediathekBaseIE):
IE_NAME = 'ARD:mediathek'
_VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
_TESTS = [{
# available till 26.07.2022
'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
'info_dict': {
'id': '44726822',
'ext': 'mp4',
'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
'duration': 1740,
},
'params': {
# m3u8 download
'skip_download': True,
}
}, {
'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
'only_matching': True,
}, {
# audio
'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
'only_matching': True,
}, {
'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
'only_matching': True,
}, {
# audio
'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
'only_matching': True,
}, {
'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url)
def _real_extract(self, url):
# determine video id from url
m = self._match_valid_url(url)
document_id = None
numid = re.search(r'documentId=([0-9]+)', url)
if numid:
document_id = video_id = numid.group(1)
else:
video_id = m.group('video_id')
webpage = self._download_webpage(url, video_id)
ERRORS = (
('>Leider liegt eine Störung vor.', 'Video %s is unavailable'),
('>Der gewünschte Beitrag ist nicht mehr verfügbar.<',
'Video %s is no longer available'),
)
for pattern, message in ERRORS:
if pattern in webpage:
raise ExtractorError(message % video_id, expected=True)
if re.search(r'[\?&]rss($|[=&])', url):
doc = compat_etree_fromstring(webpage.encode('utf-8'))
if doc.tag == 'rss':
return GenericIE()._extract_rss(url, video_id, doc)
title = self._og_search_title(webpage, default=None) or self._html_search_regex(
[r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
r'<meta name="dcterms\.title" content="(.*?)"/>',
r'<h4 class="headline">(.*?)</h4>',
r'<title[^>]*>(.*?)</title>'],
webpage, 'title')
description = self._og_search_description(webpage, default=None) or self._html_search_meta(
'dcterms.abstract', webpage, 'description', default=None)
if description is None:
description = self._html_search_meta(
'description', webpage, 'meta description', default=None)
if description is None:
description = self._html_search_regex(
r'<p\s+class="teasertext">(.+?)</p>',
webpage, 'teaser text', default=None)
# Thumbnail is sometimes not present.
# It is in the mobile version, but that seems to use a different URL
# structure altogether.
thumbnail = self._og_search_thumbnail(webpage, default=None)
media_streams = re.findall(r'''(?x)
mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
"([^"]+)"''', webpage)
if media_streams:
QUALITIES = qualities(['lo', 'hi', 'hq'])
formats = []
for furl in set(media_streams):
if furl.endswith('.f4m'):
fid = 'f4m'
else:
fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
fid = fid_m.group(1) if fid_m else None
formats.append({
'quality': QUALITIES(fid),
'format_id': fid,
'url': furl,
})
info = {
'formats': formats,
}
else: # request JSON file
if not document_id:
video_id = self._search_regex(
(r'/play/(?:config|media|sola)/(\d+)', r'contentId["\']\s*:\s*(\d+)'),
webpage, 'media id', default=None)
info = self._extract_media_info(
'http://www.ardmediathek.de/play/media/%s' % video_id,
webpage, video_id)
info.update({
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
})
info.update(self._ARD_extract_episode_info(info['title']))
return info
class ARDIE(InfoExtractor):
_VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
_TESTS = [{
@ -399,21 +227,23 @@ def _real_extract(self, url):
}
class ARDBetaMediathekIE(ARDMediathekBaseIE):
class ARDBetaMediathekIE(InfoExtractor):
IE_NAME = 'ARDMediathek'
_VALID_URL = r'''(?x)https://
(?:(?:beta|www)\.)?ardmediathek\.de/
(?:(?P<client>[^/]+)/)?
(?:player|live|video|(?P<playlist>sendung|serie|sammlung))/
(?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
(?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
(?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
(?:[^/]+/)?
(?:player|live|video)/
(?:(?P<display_id>[^?#]+)/)?
(?P<id>[a-zA-Z0-9]+)
/?(?:[?#]|$)'''
_GEO_COUNTRIES = ['DE']
_TESTS = [{
'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
'info_dict': {
'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
'id': '12939099',
'id': 'Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
'title': 'Liebe auf vier Pfoten',
'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
'duration': 5222,
@ -422,7 +252,10 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'upload_date': '20231130',
'ext': 'mp4',
'episode': 'Liebe auf vier Pfoten',
'series': 'Filme im MDR'
'series': 'Filme im MDR',
'age_limit': 0,
'channel': 'MDR',
'_old_archive_ids': ['ardbetamediathek 12939099'],
},
}, {
'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
@ -443,7 +276,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
'md5': '1e73ded21cb79bac065117e80c81dc88',
'info_dict': {
'id': '10049223',
'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
'ext': 'mp4',
'title': 'tagesschau, 20:00 Uhr',
'timestamp': 1636398000,
@ -454,6 +287,26 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'episode': 'tagesschau, 20:00 Uhr',
'series': 'tagesschau',
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
'channel': 'ARD-Aktuell',
'_old_archive_ids': ['ardbetamediathek 10049223'],
},
}, {
'url': 'https://www.ardmediathek.de/video/7-tage/7-tage-unter-harten-jungs/hr-fernsehen/N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
'md5': 'c428b9effff18ff624d4f903bda26315',
'info_dict': {
'id': 'N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
'ext': 'mp4',
'duration': 2700,
'episode': '7 Tage ... unter harten Jungs',
'description': 'md5:0f215470dcd2b02f59f4bd10c963f072',
'upload_date': '20231005',
'timestamp': 1696491171,
'display_id': '7-tage/7-tage-unter-harten-jungs/hr-fernsehen',
'series': '7 Tage ...',
'channel': 'HR',
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:f6e6d5ffac41925c?w=960&ch=fa32ba69bc87989a',
'title': '7 Tage ... unter harten Jungs',
'_old_archive_ids': ['ardbetamediathek 94834686'],
},
}, {
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
@ -470,6 +323,176 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
}, {
'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
'only_matching': True,
}, {
'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
'only_matching': True,
}]
def _extract_episode_info(self, title):
patterns = [
# Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
# from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
# E.g.: title="Fritjof aus Norwegen (2) (AD)"
# from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
# E.g.: title="Folge 25/42: Symmetrie"
# from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
# E.g.: title="Folge 1063 - Vertrauen"
# from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
# As a fallback use the full title
r'(?P<title>.*)',
]
return traverse_obj(patterns, (..., {partial(re.match, string=title)}, {
'season_number': ('season_number', {int_or_none}),
'episode_number': ('episode_number', {int_or_none}),
'episode': ((
('episode', {str_or_none}),
('ep_info', {lambda x: title.replace(x, '')}),
('title', {str}),
), {str.strip}),
}), get_all=False)
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
page_data = self._download_json(
f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{video_id}', video_id, query={
'embedded': 'false',
'mcV6': 'true',
})
player_data = traverse_obj(
page_data, ('widgets', lambda _, v: v['type'] in ('player_ondemand', 'player_live'), {dict}), get_all=False)
is_live = player_data.get('type') == 'player_live'
media_data = traverse_obj(player_data, ('mediaCollection', 'embedded', {dict}))
if player_data.get('blockedByFsk'):
self.raise_no_formats('This video is only available after 22:00', expected=True)
formats = []
subtitles = {}
for stream in traverse_obj(media_data, ('streams', ..., {dict})):
kind = stream.get('kind')
# Prioritize main stream over sign language and others
preference = 1 if kind == 'main' else None
for media in traverse_obj(stream, ('media', lambda _, v: url_or_none(v['url']))):
media_url = media['url']
audio_kind = traverse_obj(media, (
'audios', 0, 'kind', {str}), default='').replace('standard', '')
lang_code = traverse_obj(media, ('audios', 0, 'languageCode', {str})) or 'deu'
lang = join_nonempty(lang_code, audio_kind)
language_preference = 10 if lang == 'deu' else -10
if determine_ext(media_url) == 'm3u8':
fmts, subs = self._extract_m3u8_formats_and_subtitles(
media_url, video_id, m3u8_id=f'hls-{kind}', preference=preference, fatal=False, live=is_live)
for f in fmts:
f['language'] = lang
f['language_preference'] = language_preference
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
else:
formats.append({
'url': media_url,
'format_id': f'http-{kind}',
'preference': preference,
'language': lang,
'language_preference': language_preference,
**traverse_obj(media, {
'format_note': ('forcedLabel', {str}),
'width': ('maxHResolutionPx', {int_or_none}),
'height': ('maxVResolutionPx', {int_or_none}),
'vcodec': ('videoCodec', {str}),
}),
})
for sub in traverse_obj(media_data, ('subtitles', ..., {dict})):
for sources in traverse_obj(sub, ('sources', lambda _, v: url_or_none(v['url']))):
subtitles.setdefault(sub.get('languageCode') or 'deu', []).append({
'url': sources['url'],
'ext': {'webvtt': 'vtt', 'ebutt': 'ttml'}.get(sources.get('kind')),
})
age_limit = traverse_obj(page_data, ('fskRating', {lambda x: remove_start(x, 'FSK')}, {int_or_none}))
old_id = traverse_obj(page_data, ('tracking', 'atiCustomVars', 'contentId'))
return {
'id': video_id,
'display_id': display_id,
'formats': formats,
'subtitles': subtitles,
'is_live': is_live,
'age_limit': age_limit,
**traverse_obj(media_data, ('meta', {
'title': 'title',
'description': 'synopsis',
'timestamp': ('broadcastedOnDateTime', {parse_iso8601}),
'series': 'seriesTitle',
'thumbnail': ('images', 0, 'url', {url_or_none}),
'duration': ('durationSeconds', {int_or_none}),
'channel': 'clipSourceName',
})),
**self._extract_episode_info(page_data.get('title')),
'_old_archive_ids': [make_archive_id(ARDBetaMediathekIE, old_id)],
}
class ARDMediathekCollectionIE(InfoExtractor):
_VALID_URL = r'''(?x)https://
(?:(?:beta|www)\.)?ardmediathek\.de/
(?:[^/?#]+/)?
(?P<playlist>sendung|serie|sammlung)/
(?:(?P<display_id>[^?#]+?)/)?
(?P<id>[a-zA-Z0-9]+)
(?:/(?P<season>\d+)(?:/(?P<version>OV|AD))?)?/?(?:[?#]|$)'''
_GEO_COUNTRIES = ['DE']
_TESTS = [{
'url': 'https://www.ardmediathek.de/serie/quiz/staffel-1-originalversion/Y3JpZDovL3dkci5kZS9vbmUvcXVpeg/1/OV',
'info_dict': {
'id': 'Y3JpZDovL3dkci5kZS9vbmUvcXVpeg_1_OV',
'display_id': 'quiz/staffel-1-originalversion',
'title': 'Staffel 1 Originalversion',
},
'playlist_count': 3,
}, {
'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-4-mit-audiodeskription/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/4/AD',
'info_dict': {
'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_4_AD',
'display_id': 'babylon-berlin/staffel-4-mit-audiodeskription',
'title': 'Staffel 4 mit Audiodeskription',
},
'playlist_count': 12,
}, {
'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/1/',
'info_dict': {
'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_1',
'display_id': 'babylon-berlin/staffel-1',
'title': 'Staffel 1',
},
'playlist_count': 8,
}, {
'url': 'https://www.ardmediathek.de/sendung/tatort/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA',
'info_dict': {
'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA',
'display_id': 'tatort',
'title': 'Tatort',
},
'playlist_mincount': 500,
}, {
'url': 'https://www.ardmediathek.de/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2',
'info_dict': {
'id': '5eOHzt8XB2sqeFXbIoJlg2',
'display_id': 'die-kirche-bleibt-im-dorf',
'title': 'Die Kirche bleibt im Dorf',
'description': 'Die Kirche bleibt im Dorf',
},
'playlist_count': 4,
}, {
# playlist of type 'sendung'
'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
@ -482,196 +505,48 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
# playlist of type 'sammlung'
'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
'only_matching': True,
}, {
'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
'only_matching': True,
}, {
'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
'only_matching': True,
}]
def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number):
""" Query the ARD server for playlist information
and returns the data in "raw" format """
assert mode in ('sendung', 'serie', 'sammlung')
if mode in ('sendung', 'serie'):
graphQL = json.dumps({
'query': '''{
showPage(
client: "%s"
showId: "%s"
pageNumber: %d
) {
pagination {
pageSize
totalElements
}
teasers { # Array
mediumTitle
links { target { id href title } }
type
}
}}''' % (client, playlist_id, page_number),
}).encode()
else: # mode == 'sammlung'
graphQL = json.dumps({
'query': '''{
morePage(
client: "%s"
compilationId: "%s"
pageNumber: %d
) {
widget {
pagination {
pageSize
totalElements
}
teasers { # Array
mediumTitle
links { target { id href title } }
type
}
}
}}''' % (client, playlist_id, page_number),
}).encode()
# Ressources for ARD graphQL debugging:
# https://api-test.ardmediathek.de/public-gateway
show_page = self._download_json(
'https://api.ardmediathek.de/public-gateway',
'[Playlist] %s' % display_id,
data=graphQL,
headers={'Content-Type': 'application/json'})['data']
# align the structure of the returned data:
if mode in ('sendung', 'serie'):
show_page = show_page['showPage']
else: # mode == 'sammlung'
show_page = show_page['morePage']['widget']
return show_page
def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
""" Collects all playlist entries and returns them as info dict.
Supports playlists of mode 'sendung', 'serie', and 'sammlung',
as well as nested playlists. """
entries = []
pageNumber = 0
while True: # iterate by pageNumber
show_page = self._ARD_load_playlist_snippet(
playlist_id, display_id, client, mode, pageNumber)
for teaser in show_page['teasers']: # process playlist items
if '/compilation/' in teaser['links']['target']['href']:
# alternativ cond.: teaser['type'] == "compilation"
# => This is an nested compilation, e.g. like:
# https://www.ardmediathek.de/ard/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2/
link_mode = 'sammlung'
else:
link_mode = 'video'
item_url = 'https://www.ardmediathek.de/%s/%s/%s/%s/%s' % (
client, link_mode, display_id,
# perform HTLM quoting of episode title similar to ARD:
re.sub('^-|-$', '', # remove '-' from begin/end
re.sub('[^a-zA-Z0-9]+', '-', # replace special chars by -
teaser['links']['target']['title'].lower()
.replace('ä', 'ae').replace('ö', 'oe')
.replace('ü', 'ue').replace('ß', 'ss'))),
teaser['links']['target']['id'])
entries.append(self.url_result(
item_url,
ie=ARDBetaMediathekIE.ie_key()))
if (show_page['pagination']['pageSize'] * (pageNumber + 1)
>= show_page['pagination']['totalElements']):
# we've processed enough pages to get all playlist entries
break
pageNumber = pageNumber + 1
return self.playlist_result(entries, playlist_id, playlist_title=display_id)
_PAGE_SIZE = 100
def _real_extract(self, url):
video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group(
'id', 'display_id', 'playlist', 'client', 'season')
display_id, client = display_id or video_id, client or 'ard'
playlist_id, display_id, playlist_type, season_number, version = self._match_valid_url(url).group(
'id', 'display_id', 'playlist', 'season', 'version')
if playlist_type:
# TODO: Extract only specified season
return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type)
def call_api(page_num):
api_path = 'compilations/ard' if playlist_type == 'sammlung' else 'widgets/ard/asset'
return self._download_json(
f'https://api.ardmediathek.de/page-gateway/{api_path}/{playlist_id}', playlist_id,
f'Downloading playlist page {page_num}', query={
'pageNumber': page_num,
'pageSize': self._PAGE_SIZE,
**({
'seasoned': 'true',
'seasonNumber': season_number,
'withOriginalversion': 'true' if version == 'OV' else 'false',
'withAudiodescription': 'true' if version == 'AD' else 'false',
} if season_number else {}),
})
player_page = self._download_json(
'https://api.ardmediathek.de/public-gateway',
display_id, data=json.dumps({
'query': '''{
playerPage(client:"%s", clipId: "%s") {
blockedByFsk
broadcastedOn
maturityContentRating
mediaCollection {
_duration
_geoblocked
_isLive
_mediaArray {
_mediaStreamArray {
_quality
_server
_stream
}
}
_previewImage
_subtitleUrl
_type
}
show {
title
}
image {
src
}
synopsis
title
tracking {
atiCustomVars {
contentId
}
}
}
}''' % (client, video_id),
}).encode(), headers={
'Content-Type': 'application/json'
})['data']['playerPage']
title = player_page['title']
content_id = str_or_none(try_get(
player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))
media_collection = player_page.get('mediaCollection') or {}
if not media_collection and content_id:
media_collection = self._download_json(
'https://www.ardmediathek.de/play/media/' + content_id,
content_id, fatal=False) or {}
info = self._parse_media_info(
media_collection, content_id or video_id,
player_page.get('blockedByFsk'))
age_limit = None
description = player_page.get('synopsis')
maturity_content_rating = player_page.get('maturityContentRating')
if maturity_content_rating:
age_limit = int_or_none(maturity_content_rating.lstrip('FSK'))
if not age_limit and description:
age_limit = int_or_none(self._search_regex(
r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
info.update({
'age_limit': age_limit,
'display_id': display_id,
'title': title,
'description': description,
'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
'series': try_get(player_page, lambda x: x['show']['title']),
'thumbnail': (media_collection.get('_previewImage')
or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None))
or self.get_thumbnail_from_html(display_id, url)),
})
info.update(self._ARD_extract_episode_info(info['title']))
return info
def fetch_page(page_num):
for item in traverse_obj(call_api(page_num), ('teasers', ..., {dict})):
item_id = traverse_obj(item, ('links', 'target', ('urlId', 'id')), 'id', get_all=False)
if not item_id or item_id == playlist_id:
continue
item_mode = 'sammlung' if item.get('type') == 'compilation' else 'video'
yield self.url_result(
f'https://www.ardmediathek.de/{item_mode}/{item_id}',
ie=(ARDMediathekCollectionIE if item_mode == 'sammlung' else ARDBetaMediathekIE),
**traverse_obj(item, {
'id': ('id', {str}),
'title': ('longTitle', {str}),
'duration': ('duration', {int_or_none}),
'timestamp': ('broadcastedOn', {parse_iso8601}),
}))
def get_thumbnail_from_html(self, display_id, url):
webpage = self._download_webpage(url, display_id, fatal=False) or ''
return (
self._og_search_thumbnail(webpage, default=None)
or self._html_search_meta('thumbnailUrl', webpage, default=None))
page_data = call_api(0)
full_id = join_nonempty(playlist_id, season_number, version, delim='_')
return self.playlist_result(
OnDemandPagedList(fetch_page, self._PAGE_SIZE), full_id, display_id=display_id,
title=page_data.get('title'), description=page_data.get('synopsis'))