[ie/instagram] Add app_id extractor-arg (#12359)

Authored by: chrisellsworth
2025-02-27 02:51:30 +01:00 · 2025-02-23 00:16:04 -08:00 · 2025-02-23 00:16:04 -08:00 · a90641c836
commit a90641c836
parent 65c3c58c0a
2 changed files with 21 additions and 14 deletions
--- a/README.md
+++ b/README.md
@ -1812,6 +1812,9 @@ #### hotstar
 * `vcodec`: vcodec to ignore - one or more of `h264`, `h265`, `dvh265`
 * `dr`: dynamic range to ignore - one or more of `sdr`, `hdr10`, `dv`

+#### instagram
+* `app_id`: The value of the `X-IG-App-ID` header used for API requests. Default is the web app ID, `936619743392459`
+
 #### niconicochannelplus
 * `max_comments`: Maximum number of comments to extract - default is `120`

--- a/yt_dlp/extractor/instagram.py
+++ b/yt_dlp/extractor/instagram.py
@ -28,7 +28,8 @@

 def _pk_to_id(media_id):
    """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id"""
-    return encode_base_n(int(media_id.split('_')[0]), table=_ENCODING_CHARS)
+    pk = int(str(media_id).split('_')[0])  # str() first so a non-string id can be split; only the part before the first '_' is used
+    return encode_base_n(pk, table=_ENCODING_CHARS)


 def _id_to_pk(shortcode):
@ -42,13 +43,16 @@ class InstagramBaseIE(InfoExtractor):

    _API_BASE_URL = 'https://i.instagram.com/api/v1'
    _LOGIN_URL = 'https://www.instagram.com/accounts/login'
-    _API_HEADERS = {
-        'X-IG-App-ID': '936619743392459',
-        'X-ASBD-ID': '198387',
-        'X-IG-WWW-Claim': '0',
-        'Origin': 'https://www.instagram.com',
-        'Accept': '*/*',
-    }
+
+    @property
+    def _api_headers(self):  # property, so the header dict is rebuilt on every access
+        return {  # X-IG-App-ID comes from the `app_id` extractor-arg, defaulting to the web app ID
+            'X-IG-App-ID': self._configuration_arg('app_id', ['936619743392459'], ie_key=InstagramIE)[0],
+            'X-ASBD-ID': '198387',
+            'X-IG-WWW-Claim': '0',
+            'Origin': 'https://www.instagram.com',
+            'Accept': '*/*',
+        }

    def _perform_login(self, username, password):
        if self._IS_LOGGED_IN:
@ -62,7 +66,7 @@ def _perform_login(self, username, password):

        login = self._download_json(
            f'{self._LOGIN_URL}/ajax/', None, note='Logging in', headers={
-                **self._API_HEADERS,
+                **self._api_headers,
                'X-Requested-With': 'XMLHttpRequest',
                'X-CSRFToken': shared_data['config']['csrf_token'],
                'X-Instagram-AJAX': shared_data['rollout_hash'],
@ -209,7 +213,7 @@ def _extract_product(self, product_info):
    def _get_comments(self, video_id):
        comments_info = self._download_json(
            f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/comments/?can_support_threading=true&permalink_enabled=false', video_id,
-            fatal=False, errnote='Comments extraction failed', note='Downloading comments info', headers=self._API_HEADERS) or {}
+            fatal=False, errnote='Comments extraction failed', note='Downloading comments info', headers=self._api_headers) or {}

        comment_data = traverse_obj(comments_info, ('edge_media_to_parent_comment', 'edges'), 'comments')
        for comment_dict in comment_data or []:
@ -402,14 +406,14 @@ def _real_extract(self, url):
            info = traverse_obj(self._download_json(
                f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/info/', video_id,
                fatal=False, errnote='Video info extraction failed',
-                note='Downloading video info', headers=self._API_HEADERS), ('items', 0))
+                note='Downloading video info', headers=self._api_headers), ('items', 0))
            if info:
                media.update(info)
                return self._extract_product(media)

        api_check = self._download_json(
            f'{self._API_BASE_URL}/web/get_ruling_for_content/?content_type=MEDIA&target_id={_id_to_pk(video_id)}',
-            video_id, headers=self._API_HEADERS, fatal=False, note='Setting up session', errnote=False) or {}
+            video_id, headers=self._api_headers, fatal=False, note='Setting up session', errnote=False) or {}
        csrf_token = self._get_cookies('https://www.instagram.com').get('csrftoken')

        if not csrf_token:
@ -429,7 +433,7 @@ def _real_extract(self, url):
        general_info = self._download_json(
            'https://www.instagram.com/graphql/query/', video_id, fatal=False, errnote=False,
            headers={
-                **self._API_HEADERS,
+                **self._api_headers,
                'X-CSRFToken': csrf_token or '',
                'X-Requested-With': 'XMLHttpRequest',
                'Referer': url,
@ -727,7 +731,7 @@ def _real_extract(self, url):

        videos = traverse_obj(self._download_json(
            f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}',
-            display_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels')
+            display_id, errnote=False, fatal=False, headers=self._api_headers), 'reels')
        if not videos:
            self.raise_login_required('You need to log in to access this content')
        user_info = traverse_obj(videos, (user_id, 'user', {dict})) or {}