[ie/rai] Fix m3u8 formats extraction (#9291)

Closes #887
Authored by: nixxo
This commit is contained in:
nixxo 2024-02-29 23:49:25 +01:00 committed by GitHub
parent 804f236611
commit 8f423cf805
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -28,6 +28,29 @@ class RaiBaseIE(InfoExtractor):
_GEO_COUNTRIES = ['IT']
_GEO_BYPASS = False
def _fix_m3u8_formats(self, media_url, video_id):
fmts = self._extract_m3u8_formats(
media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
# Fix malformed m3u8 manifests by setting audio-only/video-only formats
for f in fmts:
if not f.get('acodec'):
f['acodec'] = 'mp4a'
if not f.get('vcodec'):
f['vcodec'] = 'avc1'
man_url = f['url']
if re.search(r'chunklist(?:_b\d+)*_ao[_.]', man_url): # audio only
f['vcodec'] = 'none'
elif re.search(r'chunklist(?:_b\d+)*_vo[_.]', man_url): # video only
f['acodec'] = 'none'
else: # video+audio
if f['acodec'] == 'none':
f['acodec'] = 'mp4a'
if f['vcodec'] == 'none':
f['vcodec'] = 'avc1'
return fmts
def _extract_relinker_info(self, relinker_url, video_id, audio_only=False):
def fix_cdata(s):
# remove \r\n\t before and after <![CDATA[ ]]> to avoid
@ -69,8 +92,7 @@ def fix_cdata(s):
'format_id': 'https-mp3',
})
elif ext == 'm3u8' or 'format=m3u8' in media_url:
formats.extend(self._extract_m3u8_formats(
media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
formats.extend(self._fix_m3u8_formats(media_url, video_id))
elif ext == 'f4m':
# very likely no longer needed. Cannot find any url that uses it.
manifest_url = update_url_query(
@ -153,10 +175,10 @@ def get_format_info(tbr):
'format_id': f'https-{tbr}',
'width': format_copy.get('width'),
'height': format_copy.get('height'),
'tbr': format_copy.get('tbr'),
'vcodec': format_copy.get('vcodec'),
'acodec': format_copy.get('acodec'),
'fps': format_copy.get('fps'),
'tbr': format_copy.get('tbr') or tbr,
'vcodec': format_copy.get('vcodec') or 'avc1',
'acodec': format_copy.get('acodec') or 'mp4a',
'fps': format_copy.get('fps') or 25,
} if format_copy else {
'format_id': f'https-{tbr}',
'width': _QUALITY[tbr][0],
@ -245,7 +267,7 @@ class RaiPlayIE(RaiBaseIE):
'series': 'Report',
'season': '2013/14',
'subtitles': {'it': 'count:4'},
'release_year': 2022,
'release_year': 2024,
'episode': 'Espresso nel caffè - 07/04/2014',
'timestamp': 1396919880,
'upload_date': '20140408',
@ -253,7 +275,7 @@ class RaiPlayIE(RaiBaseIE):
},
'params': {'skip_download': True},
}, {
# 1080p direct mp4 url
# 1080p
'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html',
'md5': 'aeda7243115380b2dd5e881fd42d949a',
'info_dict': {
@ -274,7 +296,7 @@ class RaiPlayIE(RaiBaseIE):
'episode': 'Senza occhi',
'timestamp': 1637318940,
'upload_date': '20211119',
'formats': 'count:12',
'formats': 'count:7',
},
'params': {'skip_download': True},
'expected_warnings': ['Video not available. Likely due to geo-restriction.']
@ -527,7 +549,7 @@ class RaiPlaySoundPlaylistIE(InfoExtractor):
'info_dict': {
'id': 'ilruggitodelconiglio',
'title': 'Il Ruggito del Coniglio',
'description': 'md5:48cff6972435964284614d70474132e6',
'description': 'md5:62a627b3a2d0635d08fa8b6e0a04f27e',
},
'playlist_mincount': 65,
}, {
@ -634,19 +656,20 @@ def _real_extract(self, url):
}
class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE
class RaiNewsIE(RaiBaseIE):
_VALID_URL = rf'https?://(www\.)?rainews\.it/(?!articoli)[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html'
_EMBED_REGEX = [rf'<iframe[^>]+data-src="(?P<url>/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)']
_TESTS = [{
# new rainews player (#3911)
'url': 'https://www.rainews.it/rubriche/24mm/video/2022/05/24mm-del-29052022-12cf645d-1ffd-4220-b27c-07c226dbdecf.html',
'url': 'https://www.rainews.it/video/2024/02/membri-della-croce-rossa-evacuano-gli-abitanti-di-un-villaggio-nella-regione-ucraina-di-kharkiv-il-filmato-dallucraina--31e8017c-845c-43f5-9c48-245b43c3a079.html',
'info_dict': {
'id': '12cf645d-1ffd-4220-b27c-07c226dbdecf',
'id': '31e8017c-845c-43f5-9c48-245b43c3a079',
'ext': 'mp4',
'title': 'Puntata del 29/05/2022',
'duration': 1589,
'upload_date': '20220529',
'title': 'md5:1e81364b09de4a149042bac3c7d36f0b',
'duration': 196,
'upload_date': '20240225',
'uploader': 'rainews',
'formats': 'count:2',
},
'params': {'skip_download': True},
}, {
@ -659,7 +682,8 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE
'description': 'I film in uscita questa settimana.',
'thumbnail': r're:^https?://.*\.png$',
'duration': 833,
'upload_date': '20161103'
'upload_date': '20161103',
'formats': 'count:8',
},
'params': {'skip_download': True},
'expected_warnings': ['unable to extract player_data'],
@ -684,7 +708,7 @@ def _real_extract(self, url):
if not relinker_url:
# fallback on old implementation for some old content
try:
return self._extract_from_content_id(video_id, url)
return RaiIE._real_extract(self, url)
except GeoRestrictedError:
raise
except ExtractorError as e: