[ie/rai] Fix m3u8 formats extraction (#9291)

Closes #887
Authored by: nixxo
This commit is contained in:
nixxo 2024-02-29 23:49:25 +01:00 committed by GitHub
parent 804f236611
commit 8f423cf805
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -28,6 +28,29 @@ class RaiBaseIE(InfoExtractor):
_GEO_COUNTRIES = ['IT'] _GEO_COUNTRIES = ['IT']
_GEO_BYPASS = False _GEO_BYPASS = False
def _fix_m3u8_formats(self, media_url, video_id):
fmts = self._extract_m3u8_formats(
media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
# Fix malformed m3u8 manifests by setting audio-only/video-only formats
for f in fmts:
if not f.get('acodec'):
f['acodec'] = 'mp4a'
if not f.get('vcodec'):
f['vcodec'] = 'avc1'
man_url = f['url']
if re.search(r'chunklist(?:_b\d+)*_ao[_.]', man_url): # audio only
f['vcodec'] = 'none'
elif re.search(r'chunklist(?:_b\d+)*_vo[_.]', man_url): # video only
f['acodec'] = 'none'
else: # video+audio
if f['acodec'] == 'none':
f['acodec'] = 'mp4a'
if f['vcodec'] == 'none':
f['vcodec'] = 'avc1'
return fmts
def _extract_relinker_info(self, relinker_url, video_id, audio_only=False): def _extract_relinker_info(self, relinker_url, video_id, audio_only=False):
def fix_cdata(s): def fix_cdata(s):
# remove \r\n\t before and after <![CDATA[ ]]> to avoid # remove \r\n\t before and after <![CDATA[ ]]> to avoid
@ -69,8 +92,7 @@ def fix_cdata(s):
'format_id': 'https-mp3', 'format_id': 'https-mp3',
}) })
elif ext == 'm3u8' or 'format=m3u8' in media_url: elif ext == 'm3u8' or 'format=m3u8' in media_url:
formats.extend(self._extract_m3u8_formats( formats.extend(self._fix_m3u8_formats(media_url, video_id))
media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
elif ext == 'f4m': elif ext == 'f4m':
# very likely no longer needed. Cannot find any url that uses it. # very likely no longer needed. Cannot find any url that uses it.
manifest_url = update_url_query( manifest_url = update_url_query(
@ -153,10 +175,10 @@ def get_format_info(tbr):
'format_id': f'https-{tbr}', 'format_id': f'https-{tbr}',
'width': format_copy.get('width'), 'width': format_copy.get('width'),
'height': format_copy.get('height'), 'height': format_copy.get('height'),
'tbr': format_copy.get('tbr'), 'tbr': format_copy.get('tbr') or tbr,
'vcodec': format_copy.get('vcodec'), 'vcodec': format_copy.get('vcodec') or 'avc1',
'acodec': format_copy.get('acodec'), 'acodec': format_copy.get('acodec') or 'mp4a',
'fps': format_copy.get('fps'), 'fps': format_copy.get('fps') or 25,
} if format_copy else { } if format_copy else {
'format_id': f'https-{tbr}', 'format_id': f'https-{tbr}',
'width': _QUALITY[tbr][0], 'width': _QUALITY[tbr][0],
@ -245,7 +267,7 @@ class RaiPlayIE(RaiBaseIE):
'series': 'Report', 'series': 'Report',
'season': '2013/14', 'season': '2013/14',
'subtitles': {'it': 'count:4'}, 'subtitles': {'it': 'count:4'},
'release_year': 2022, 'release_year': 2024,
'episode': 'Espresso nel caffè - 07/04/2014', 'episode': 'Espresso nel caffè - 07/04/2014',
'timestamp': 1396919880, 'timestamp': 1396919880,
'upload_date': '20140408', 'upload_date': '20140408',
@ -253,7 +275,7 @@ class RaiPlayIE(RaiBaseIE):
}, },
'params': {'skip_download': True}, 'params': {'skip_download': True},
}, { }, {
# 1080p direct mp4 url # 1080p
'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html', 'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html',
'md5': 'aeda7243115380b2dd5e881fd42d949a', 'md5': 'aeda7243115380b2dd5e881fd42d949a',
'info_dict': { 'info_dict': {
@ -274,7 +296,7 @@ class RaiPlayIE(RaiBaseIE):
'episode': 'Senza occhi', 'episode': 'Senza occhi',
'timestamp': 1637318940, 'timestamp': 1637318940,
'upload_date': '20211119', 'upload_date': '20211119',
'formats': 'count:12', 'formats': 'count:7',
}, },
'params': {'skip_download': True}, 'params': {'skip_download': True},
'expected_warnings': ['Video not available. Likely due to geo-restriction.'] 'expected_warnings': ['Video not available. Likely due to geo-restriction.']
@ -527,7 +549,7 @@ class RaiPlaySoundPlaylistIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'ilruggitodelconiglio', 'id': 'ilruggitodelconiglio',
'title': 'Il Ruggito del Coniglio', 'title': 'Il Ruggito del Coniglio',
'description': 'md5:48cff6972435964284614d70474132e6', 'description': 'md5:62a627b3a2d0635d08fa8b6e0a04f27e',
}, },
'playlist_mincount': 65, 'playlist_mincount': 65,
}, { }, {
@ -634,19 +656,20 @@ def _real_extract(self, url):
} }
class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE class RaiNewsIE(RaiBaseIE):
_VALID_URL = rf'https?://(www\.)?rainews\.it/(?!articoli)[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html' _VALID_URL = rf'https?://(www\.)?rainews\.it/(?!articoli)[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html'
_EMBED_REGEX = [rf'<iframe[^>]+data-src="(?P<url>/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)'] _EMBED_REGEX = [rf'<iframe[^>]+data-src="(?P<url>/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)']
_TESTS = [{ _TESTS = [{
# new rainews player (#3911) # new rainews player (#3911)
'url': 'https://www.rainews.it/rubriche/24mm/video/2022/05/24mm-del-29052022-12cf645d-1ffd-4220-b27c-07c226dbdecf.html', 'url': 'https://www.rainews.it/video/2024/02/membri-della-croce-rossa-evacuano-gli-abitanti-di-un-villaggio-nella-regione-ucraina-di-kharkiv-il-filmato-dallucraina--31e8017c-845c-43f5-9c48-245b43c3a079.html',
'info_dict': { 'info_dict': {
'id': '12cf645d-1ffd-4220-b27c-07c226dbdecf', 'id': '31e8017c-845c-43f5-9c48-245b43c3a079',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Puntata del 29/05/2022', 'title': 'md5:1e81364b09de4a149042bac3c7d36f0b',
'duration': 1589, 'duration': 196,
'upload_date': '20220529', 'upload_date': '20240225',
'uploader': 'rainews', 'uploader': 'rainews',
'formats': 'count:2',
}, },
'params': {'skip_download': True}, 'params': {'skip_download': True},
}, { }, {
@ -659,7 +682,8 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE
'description': 'I film in uscita questa settimana.', 'description': 'I film in uscita questa settimana.',
'thumbnail': r're:^https?://.*\.png$', 'thumbnail': r're:^https?://.*\.png$',
'duration': 833, 'duration': 833,
'upload_date': '20161103' 'upload_date': '20161103',
'formats': 'count:8',
}, },
'params': {'skip_download': True}, 'params': {'skip_download': True},
'expected_warnings': ['unable to extract player_data'], 'expected_warnings': ['unable to extract player_data'],
@ -684,7 +708,7 @@ def _real_extract(self, url):
if not relinker_url: if not relinker_url:
# fallback on old implementation for some old content # fallback on old implementation for some old content
try: try:
return self._extract_from_content_id(video_id, url) return RaiIE._real_extract(self, url)
except GeoRestrictedError: except GeoRestrictedError:
raise raise
except ExtractorError as e: except ExtractorError as e: