[YouTube] Use new browse API for continuation page extraction. (#131)

Known issues (these issues existed in previous API as well) * Mix playlists only give 1 page (25 vids) * Trending only gives 1 video * History gives 5 pages (200 vids) Co-authored-by: colethedj, pukkandan
2024-12-11 14:18:18 +01:00 · 2021-03-03 10:32:40 +00:00 · 2021-03-03 10:32:40 +00:00 · a5c5623470
commit a5c5623470
parent c705177da2
1 changed files with 38 additions and 34 deletions
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@ -2,6 +2,7 @@

 from __future__ import unicode_literals

+import hashlib
 import itertools
 import json
 import os.path
@ -274,7 +275,7 @@ def _real_initialize(self):
        'context': {
            'client': {
                'clientName': 'WEB',
-                'clientVersion': '2.20201021.03.00',
+                'clientVersion': '2.20210301.08.00',
            }
        },
    }
@ -283,15 +284,28 @@ def _real_initialize(self):
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'

-    def _call_api(self, ep, query, video_id, fatal=True):
+    def _generate_sapisidhash_header(self):
+        sapisid_cookie = self._get_cookies('https://www.youtube.com').get('SAPISID')
+        if sapisid_cookie is None:
+            return
+        time_now = round(time.time())
+        sapisidhash = hashlib.sha1((str(time_now) + " " + sapisid_cookie.value + " " + "https://www.youtube.com").encode("utf-8")).hexdigest()
+        return "SAPISIDHASH %s_%s" % (time_now, sapisidhash)
+
+    def _call_api(self, ep, query, video_id, fatal=True, headers=None,
+                  note='Downloading API JSON', errnote='Unable to download API page'):
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)
+        headers = headers or {}
+        headers.update({'content-type': 'application/json'})
+        auth = self._generate_sapisidhash_header()
+        if auth is not None:
+            headers.update({'Authorization': auth, 'X-Origin': 'https://www.youtube.com'})

        return self._download_json(
-            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
-            note='Downloading API JSON', errnote='Unable to download API page',
-            data=json.dumps(data).encode('utf8'), fatal=fatal,
-            headers={'content-type': 'application/json'},
+            'https://www.youtube.com/youtubei/v1/%s' % ep,
+            video_id=video_id, fatal=fatal, note=note, errnote=errnote,
+            data=json.dumps(data).encode('utf8'), headers=headers,
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

    def _extract_yt_initial_data(self, video_id, webpage):
@ -2699,7 +2713,7 @@ def _extract_continuation(cls, renderer):
            ctp = continuation_ep.get('clickTrackingParams')
            return YoutubeTabIE._build_continuation_query(continuation, ctp)

-    def _entries(self, tab, identity_token):
+    def _entries(self, tab, identity_token, item_id):

        def extract_entries(parent_renderer):  # this needs to called again for continuation to work with feeds
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
@ -2770,11 +2784,14 @@ def extract_entries(parent_renderer):  # this needs to called again for continua
                if last_error:
                    self.report_warning('%s. Retrying ...' % last_error)
                try:
-                    browse = self._download_json(
-                        'https://www.youtube.com/browse_ajax', None,
-                        'Downloading page %d%s'
-                        % (page_num, ' (retry #%d)' % count if count else ''),
-                        headers=headers, query=continuation)
+                    response = self._call_api(
+                        ep="browse", fatal=True, headers=headers,
+                        video_id='%s page %s' % (item_id, page_num),
+                        query={
+                            'continuation': continuation['continuation'],
+                            'clickTracking': {'clickTrackingParams': continuation['itct']},
+                        },
+                        note='Downloading API JSON%s' % (' (retry #%d)' % count if count else ''))
                except ExtractorError as e:
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
                        # Downloading page may result in intermittent 5xx HTTP error
@ -2784,8 +2801,6 @@ def extract_entries(parent_renderer):  # this needs to called again for continua
                            continue
                    raise
                else:
-                    response = try_get(browse, lambda x: x[1]['response'], dict)
-
                    # Youtube sometimes sends incomplete data
                    # See: https://github.com/ytdl-org/youtube-dl/issues/28194
                    if response.get('continuationContents') or response.get('onResponseReceivedActions'):
@ -2793,7 +2808,8 @@ def extract_entries(parent_renderer):  # this needs to called again for continua
                    last_error = 'Incomplete data recieved'
                    if count >= retries:
                        self._downloader.report_error(last_error)
-            if not browse or not response:
+
+            if not response:
                break

            known_continuation_renderers = {
@ -2936,7 +2952,7 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
            'channel_id': metadata['uploader_id'],
            'channel_url': metadata['uploader_url']})
        return self.playlist_result(
-            self._entries(selected_tab, identity_token),
+            self._entries(selected_tab, identity_token, playlist_id),
            **metadata)

    def _extract_from_playlist(self, item_id, url, data, playlist):
@ -3223,26 +3239,14 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    _TESTS = []

    def _entries(self, query, n):
-        data = {
-            'context': {
-                'client': {
-                    'clientName': 'WEB',
-                    'clientVersion': '2.20201021.03.00',
-                }
-            },
-            'query': query,
-        }
+        data = {'query': query}
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
-            search = self._download_json(
-                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
-                video_id='query "%s"' % query,
-                note='Downloading page %s' % page_num,
-                errnote='Unable to download API page', fatal=False,
-                data=json.dumps(data).encode('utf8'),
-                headers={'content-type': 'application/json'})
+            search = self._call_api(
+                ep='search', video_id='query "%s"' % query, fatal=False,
+                note='Downloading page %s' % page_num, query=data)
            if not search:
                break
            slr_contents = try_get(
@ -3394,8 +3398,8 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):


 class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
-    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
-    _VALID_URL = r':ythistory'
+    IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
+    _VALID_URL = r':ythis(?:tory)?'
    _FEED_NAME = 'history'
    _TESTS = [{
        'url': ':ythistory',