From da1c94ee459bf8ae9e5fae486071e0c2d111f5d9 Mon Sep 17 00:00:00 2001 From: Felix S Date: Fri, 16 Jul 2021 16:22:56 +0200 Subject: [PATCH] [generic] Extract previously missed subtitles (#515) * [generic] Extract subtitles in cases missed previously * [common] Detect discarded subtitles in SMIL manifests * [generic] Extract everything in the SMIL manifest Authored by: fstirlitz --- yt_dlp/extractor/common.py | 17 +++++++++++++++-- yt_dlp/extractor/generic.py | 24 +++++++++++++++--------- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 81b88e4fa9..0ee7ee3b12 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2206,7 +2206,7 @@ def _xpath_ns(path, namespace=None): out.append('{%s}%s' % (namespace, c)) return '/'.join(out) - def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): + def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) if smil is False: @@ -2215,8 +2215,21 @@ def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, namespace = self._parse_smil_namespace(smil) - return self._parse_smil_formats( + fmts = self._parse_smil_formats( smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) + subs = self._parse_smil_subtitles( + smil, namespace=namespace) + + return fmts, subs + + def _extract_smil_formats(self, *args, **kwargs): + fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs) + if subs: + self.report_warning(bug_reports_message( + "Ignoring subtitle tracks found in the SMIL manifest; " + "if any subtitle tracks are missing," + )) + return fmts def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): smil = self._download_smil(smil_url, video_id, fatal=fatal) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index e53a35008a..7e0598e58e 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2462,7 +2462,7 @@ def _real_extract(self, url): # Is it an M3U playlist? if first_bytes.startswith(b'#EXTM3U'): - info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4') + info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') self._sort_formats(info_dict['formats']) return info_dict @@ -3410,6 +3410,7 @@ def _real_extract(self, url): if not isinstance(sources, list): sources = [sources] formats = [] + subtitles = {} for source in sources: src = source.get('src') if not src or not isinstance(src, compat_str): @@ -3422,12 +3423,16 @@ def _real_extract(self, url): if src_type == 'video/youtube': return self.url_result(src, YoutubeIE.ie_key()) if src_type == 'application/dash+xml' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src, video_id, mpd_id='dash', fatal=False)) + fmts, subs = self._extract_mpd_formats_and_subtitles( + src, video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) elif src_type == 'application/x-mpegurl' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( src, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: formats.append({ 'url': src, @@ -3437,9 +3442,10 @@ def _real_extract(self, url): 'Referer': full_response.geturl(), }, }) - if formats: + if formats or subtitles: self._sort_formats(formats) info_dict['formats'] = formats + info_dict['subtitles'] = subtitles return info_dict # Looking for http://schema.org/VideoObject @@ -3574,13 +3580,13 @@ def filter_video(urls): ext = determine_ext(video_url) if ext == 'smil': - entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id) + entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict} elif ext == 'xspf': return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id) elif ext == 'm3u8': - entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4') + entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4') elif ext == 'mpd': - entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id) + entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id) elif ext == 'f4m': entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id) elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: