From ae61d108dd83a951b6e8a27e1fb969682416150d Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 28 Jun 2022 10:40:54 +0530 Subject: [PATCH] [cleanup] Misc cleanup --- README.md | 21 +++++++++---------- requirements.txt | 2 +- test/test_download.py | 6 +++++- yt_dlp/YoutubeDL.py | 2 +- yt_dlp/compat/_legacy.py | 38 +++++++++++++++++------------------ yt_dlp/downloader/__init__.py | 8 +++++--- yt_dlp/downloader/hls.py | 2 +- yt_dlp/extractor/generic.py | 32 ++++++++++++++--------------- yt_dlp/extractor/youtube.py | 9 +++++---- yt_dlp/utils.py | 9 ++++----- 10 files changed, 65 insertions(+), 64 deletions(-) diff --git a/README.md b/README.md index 0040a0d13..e2e789d0c 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ # NEW FEATURES -* Based on **youtube-dl 2021.12.17 [commit/8a158a9](https://github.com/ytdl-org/youtube-dl/commit/8a158a936c8b002ef536e9e2b778ded02c09c0fa)** and **youtube-dlc 2020.11.11-3 [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) +* Merged with **youtube-dl v2021.12.17 [commit/8a158a9](https://github.com/ytdl-org/youtube-dl/commit/8a158a936c8b002ef536e9e2b778ded02c09c0fa)** and **youtube-dlc v2020.11.11-3 [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) * **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in youtube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API @@ -79,18 +79,13 @@ # NEW FEATURES * **Merged with animelover1984/youtube-dl**: You get most of the features and improvements from 
[animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl) including `--write-comments`, `BiliBiliSearch`, `BilibiliChannel`, Embedding thumbnail in mp4/ogg/opus, playlist infojson etc. Note that the NicoNico livestreams are not available. See [#31](https://github.com/yt-dlp/yt-dlp/pull/31) for details. -* **Youtube improvements**: - * All Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`) and private playlists supports downloading multiple pages of content - * Search (`ytsearch:`, `ytsearchdate:`), search URLs and in-channel search works - * Mixes supports downloading multiple pages of content - * Some (but not all) age-gated content can be downloaded without cookies - * Fix for [n-sig based throttling](https://github.com/ytdl-org/youtube-dl/issues/29326) +* **YouTube improvements**: + * Supports Clips, Stories (`ytstories:`), Search (including filters)**\***, YouTube Music Search, Channel-specific search, Search prefixes (`ytsearch:`, `ytsearchdate:`)**\***, Mixes, YouTube Music Albums/Channels ([except self-uploaded music](https://github.com/yt-dlp/yt-dlp/issues/723)), and Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`) + * Fix for [n-sig based throttling](https://github.com/ytdl-org/youtube-dl/issues/29326) **\*** + * Supports some (but not all) age-gated content without cookies + * Download livestreams from the start using `--live-from-start` (*experimental*) + * `255kbps` audio is extracted (if available) from YouTube Music when premium cookies are given * Redirect channel's home URL automatically to `/video` to preserve the old behaviour - * `255kbps` audio is extracted (if available) from youtube music when premium cookies are given - * Youtube music Albums, channels etc can be downloaded ([except self-uploaded music](https://github.com/yt-dlp/yt-dlp/issues/723)) - * Download livestreams from the start using `--live-from-start` (experimental) - * Support for downloading stories 
(`ytstories:`) - * Support for downloading clips * **Cookies from browser**: Cookies can be automatically extracted from all major web browsers using `--cookies-from-browser BROWSER[+KEYRING][:PROFILE]` @@ -124,6 +119,8 @@ # NEW FEATURES See [changelog](Changelog.md) or [commits](https://github.com/yt-dlp/yt-dlp/commits) for the full list of changes +Features marked with a **\*** have been back-ported to youtube-dl + ### Differences in default behavior Some of yt-dlp's default options are different from that of youtube-dl and youtube-dlc: diff --git a/requirements.txt b/requirements.txt index a48b78d7a..dde37120f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ pycryptodomex websockets brotli; platform_python_implementation=='CPython' brotlicffi; platform_python_implementation!='CPython' -certifi \ No newline at end of file +certifi diff --git a/test/test_download.py b/test/test_download.py index b397b3ecf..c9f5e735c 100755 --- a/test/test_download.py +++ b/test/test_download.py @@ -273,7 +273,11 @@ def batch_generator(name, num_tests): def test_template(self): for i in range(num_tests): - getattr(self, f'test_{name}_{i}' if i else f'test_{name}')() + test_name = f'test_{name}_{i}' if i else f'test_{name}' + try: + getattr(self, test_name)() + except unittest.SkipTest: + print(f'Skipped {test_name}') return test_template diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 9ebb0b82a..0711f38c7 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3531,7 +3531,7 @@ def render_formats_table(self, info_dict): 'none', '' if f.get('vcodec') == 'none' else self._format_out('video only', self.Styles.SUPPRESS)), format_field(f, 'abr', '\t%dk'), - format_field(f, 'asr', '\t%dHz'), + format_field(f, 'asr', '\t%s', func=format_decimal_suffix), join_nonempty( self._format_out('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None, format_field(f, 'language', '[%s]'), diff --git a/yt_dlp/compat/_legacy.py 
b/yt_dlp/compat/_legacy.py index 49bb13a3c..e75f79bbf 100644 --- a/yt_dlp/compat/_legacy.py +++ b/yt_dlp/compat/_legacy.py @@ -44,14 +44,26 @@ def compat_setenv(key, value, env=os.environ): compat_basestring = str +compat_chr = chr compat_collections_abc = collections.abc +compat_cookiejar = http.cookiejar +compat_cookiejar_Cookie = http.cookiejar.Cookie compat_cookies = http.cookies +compat_cookies_SimpleCookie = http.cookies.SimpleCookie compat_etree_Element = etree.Element compat_etree_register_namespace = etree.register_namespace compat_filter = filter +compat_get_terminal_size = shutil.get_terminal_size compat_getenv = os.getenv +compat_getpass = getpass.getpass +compat_html_entities = html.entities +compat_html_entities_html5 = html.entities.html5 +compat_HTMLParser = html.parser.HTMLParser +compat_http_client = http.client +compat_http_server = http.server compat_input = input compat_integer_types = (int, ) +compat_itertools_count = itertools.count compat_kwargs = lambda kwargs: kwargs compat_map = map compat_numeric_types = (int, float, complex) @@ -59,34 +71,22 @@ def compat_setenv(key, value, env=os.environ): compat_shlex_split = shlex.split compat_socket_create_connection = socket.create_connection compat_Struct = struct.Struct +compat_struct_pack = struct.pack +compat_struct_unpack = struct.unpack compat_subprocess_get_DEVNULL = lambda: DEVNULL +compat_tokenize_tokenize = tokenize.tokenize +compat_urllib_error = urllib.error +compat_urllib_parse = urllib.parse compat_urllib_parse_quote = urllib.parse.quote compat_urllib_parse_quote_plus = urllib.parse.quote_plus +compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus compat_urllib_parse_unquote_to_bytes = urllib.parse.unquote_to_bytes compat_urllib_parse_urlunparse = urllib.parse.urlunparse -compat_urllib_request_DataHandler = urllib.request.DataHandler compat_urllib_request = urllib.request +compat_urllib_request_DataHandler = urllib.request.DataHandler compat_urllib_response = urllib.response 
compat_urlretrieve = urllib.request.urlretrieve compat_xml_parse_error = etree.ParseError compat_xpath = lambda xpath: xpath compat_zip = zip workaround_optparse_bug9161 = lambda: None -compat_getpass = getpass.getpass -compat_chr = chr -compat_urllib_parse = urllib.parse -compat_itertools_count = itertools.count -compat_cookiejar = http.cookiejar -compat_cookiejar_Cookie = http.cookiejar.Cookie -compat_cookies_SimpleCookie = http.cookies.SimpleCookie -compat_get_terminal_size = shutil.get_terminal_size -compat_html_entities = html.entities -compat_html_entities_html5 = html.entities.html5 -compat_tokenize_tokenize = tokenize.tokenize -compat_HTMLParser = html.parser.HTMLParser -compat_http_client = http.client -compat_http_server = http.server -compat_struct_pack = struct.pack -compat_struct_unpack = struct.unpack -compat_urllib_error = urllib.error -compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py index a7dc6c9d0..c34dbcea9 100644 --- a/yt_dlp/downloader/__init__.py +++ b/yt_dlp/downloader/__init__.py @@ -59,10 +59,11 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N def shorten_protocol_name(proto, simplify=False): short_protocol_names = { - 'm3u8_native': 'm3u8_n', - 'rtmp_ffmpeg': 'rtmp_f', + 'm3u8_native': 'm3u8', + 'm3u8': 'm3u8F', + 'rtmp_ffmpeg': 'rtmpF', 'http_dash_segments': 'dash', - 'http_dash_segments_generator': 'dash_g', + 'http_dash_segments_generator': 'dashG', 'niconico_dmc': 'dmc', 'websocket_frag': 'WSfrag', } @@ -70,6 +71,7 @@ def shorten_protocol_name(proto, simplify=False): short_protocol_names.update({ 'https': 'http', 'ftps': 'ftp', + 'm3u8': 'm3u8', # Reverse above m3u8 mapping 'm3u8_native': 'm3u8', 'http_dash_segments_generator': 'dash', 'rtmp_ffmpeg': 'rtmp', diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py index 1e75c5e9c..2010f3dc9 100644 --- a/yt_dlp/downloader/hls.py +++ b/yt_dlp/downloader/hls.py 
@@ -69,7 +69,7 @@ def real_download(self, filename, info_dict): elif no_crypto: message = ('The stream has AES-128 encryption and neither ffmpeg nor pycryptodomex are available; ' 'Decryption will be performed natively, but will be extremely slow') - elif re.search(r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', s): + elif info_dict.get('extractor_key') == 'Generic' and re.search(r'(?m)#EXT-X-MEDIA-SEQUENCE:(?!0$)', s): install_ffmpeg = '' if has_ffmpeg else 'install ffmpeg and ' message = ('Live HLS streams are not supported by the native downloader. If this is a livestream, ' f'please {install_ffmpeg}add "--downloader ffmpeg --hls-use-mpegts" to your command') diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 49f81e562..b63271c1f 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2825,12 +2825,22 @@ def _real_extract(self, url): new_url, {'force_videoid': force_videoid}) return self.url_result(new_url) + def request_webpage(): + request = sanitized_Request(url) + # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) + # making it impossible to download only chunk of the file (yet we need only 512kB to + # test whether it's HTML or not). According to yt-dlp default Accept-Encoding + # that will always result in downloading the whole file that is not desirable. + # Therefore for extraction pass we have to override Accept-Encoding to any in order + # to accept raw bytes and being able to download only a chunk. + # It may probably better to solve this by checking Content-Type for application/octet-stream + # after HEAD request finishes, but not sure if we can rely on this. 
+ request.add_header('Accept-Encoding', '*') + return self._request_webpage(request, video_id) + full_response = None if head_response is False: - request = sanitized_Request(url) - request.add_header('Accept-Encoding', '*') - full_response = self._request_webpage(request, video_id) - head_response = full_response + head_response = full_response = request_webpage() info_dict = { 'id': video_id, @@ -2868,19 +2878,7 @@ def _real_extract(self, url): self.report_warning( '%s on generic information extractor.' % ('Forcing' if force else 'Falling back')) - if not full_response: - request = sanitized_Request(url) - # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) - # making it impossible to download only chunk of the file (yet we need only 512kB to - # test whether it's HTML or not). According to yt-dlp default Accept-Encoding - # that will always result in downloading the whole file that is not desirable. - # Therefore for extraction pass we have to override Accept-Encoding to any in order - # to accept raw bytes and being able to download only a chunk. - # It may probably better to solve this by checking Content-Type for application/octet-stream - # after HEAD request finishes, but not sure if we can rely on this. - request.add_header('Accept-Encoding', '*') - full_response = self._request_webpage(request, video_id) - + full_response = full_response or request_webpage() first_bytes = full_response.read(512) # Is it an M3U playlist? 
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 1a9c88f35..3e2ac030e 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2467,6 +2467,7 @@ def _extract_signature_function(self, video_id, player_url, example_sig): func_id = f'js_{player_id}_{self._signature_cache_id(example_sig)}' assert os.path.basename(func_id) == func_id + self.write_debug(f'Extracting signature function {func_id}') cache_spec = self.cache.load('youtube-sigfuncs', func_id) if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) @@ -2714,10 +2715,10 @@ def _extract_url(webpage): @classmethod def extract_id(cls, url): - mobj = re.match(cls._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - return mobj.group('id') + video_id = cls.get_temp_id(url) + if not video_id: + raise ExtractorError(f'Invalid URL: {url}') + return video_id def _extract_chapters_from_json(self, data, duration): chapter_list = traverse_obj( diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 9c9be5fe5..32c41a169 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -234,7 +234,7 @@ def random_user_agent(): ]) PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" -JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>' +JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>' NUMBER_RE = r'\d+(?:\.\d+)?' 
@@ -673,8 +673,8 @@ def replace_insane(char): s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps result = ''.join(map(replace_insane, s)) if is_id is NO_DEFAULT: - result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result) # Remove repeated substitute chars - STRIP_RE = '(?:\0.|[ _-])*' + result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars + STRIP_RE = r'(?:\0.|[ _-])*' result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end result = result.replace('\0', '') or '_' @@ -2400,8 +2400,7 @@ def remove_quotes(s): def get_domain(url): - domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url) - return domain.group('domain') if domain else None + return '.'.join(urllib.parse.urlparse(url).netloc.rsplit('.', 2)[-2:]) def url_basename(url):