From 45b2ee6f4fae139892a1a4335c269dcbb6671497 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 17 Feb 2023 16:51:34 +0530 Subject: [PATCH] Update to ytdl-commit-2dd6c6e [YouTube] Avoid crash if uploader_id extraction fails https://github.com/ytdl-org/youtube-dl/commit/2dd6c6edd8e0fc5e45865b8e6d865e35147de772 Except: * 295736c9cba714fb5de7d1c3dd31d86e50091cf8 [jsinterp] Improve parsing * 384f632e8a9b61e864a26678d85b2b39933b9bae [ITV] Overhaul ITV extractor * 33db85c571304bbd6863e3407ad8d08764c9e53b [feat]: Add support to external downloader aria2p --- README.md | 2 +- test/test_InfoExtractor.py | 2 + test/test_age_restriction.py | 19 +- yt_dlp/compat/_legacy.py | 30 ++- yt_dlp/extractor/_extractors.py | 7 + yt_dlp/extractor/americastestkitchen.py | 78 +++++- yt_dlp/extractor/blerp.py | 167 ++++++++++++ yt_dlp/extractor/callin.py | 55 +++- yt_dlp/extractor/cammodels.py | 39 +-- yt_dlp/extractor/common.py | 2 +- yt_dlp/extractor/ign.py | 337 +++++++++++++++++------- yt_dlp/extractor/kommunetv.py | 31 +++ yt_dlp/extractor/myvideoge.py | 68 +++-- yt_dlp/extractor/pr0gramm.py | 97 +++++++ yt_dlp/extractor/rbgtum.py | 93 +++++++ yt_dlp/extractor/unsupported.py | 3 + yt_dlp/extractor/vimeo.py | 55 ++-- yt_dlp/extractor/xhamster.py | 8 +- yt_dlp/utils.py | 28 +- 19 files changed, 911 insertions(+), 210 deletions(-) create mode 100644 yt_dlp/extractor/blerp.py create mode 100644 yt_dlp/extractor/kommunetv.py create mode 100644 yt_dlp/extractor/pr0gramm.py create mode 100644 yt_dlp/extractor/rbgtum.py diff --git a/README.md b/README.md index 29a6c06fdd..9b91775bc7 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ # NEW FEATURES -* Merged with **youtube-dl v2021.12.17+ [commit/195f22f](https://github.com/ytdl-org/youtube-dl/commit/195f22f)** and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of 
[youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) +* Merged with **youtube-dl v2021.12.17+ [commit/2dd6c6e](https://github.com/ytdl-org/youtube-dl/commit/2dd6c6e)** ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21)) and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) * **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 683ead315d..e8d94a6ac2 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -69,6 +69,7 @@ def test_opengraph(self): + ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') @@ -81,6 +82,7 @@ def test_opengraph(self): self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar') self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True) + self.assertEqual(ie._og_search_property('test4', html), 'unquoted-value') def test_html_search_meta(self): ie = self.ie diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index ff248432b9..68107590e9 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -10,6 +10,7 @@ from test.helper import is_download_test, try_rm from yt_dlp import YoutubeDL +from yt_dlp.utils import DownloadError def _download_restricted(url, filename, age): @@ -25,10 +26,14 @@ def 
_download_restricted(url, filename, age): ydl.add_default_info_extractors() json_filename = os.path.splitext(filename)[0] + '.info.json' try_rm(json_filename) - ydl.download([url]) - res = os.path.exists(json_filename) - try_rm(json_filename) - return res + try: + ydl.download([url]) + except DownloadError: + pass + else: + return os.path.exists(json_filename) + finally: + try_rm(json_filename) @is_download_test @@ -38,12 +43,12 @@ def _assert_restricted(self, url, filename, age, old_age=None): self.assertFalse(_download_restricted(url, filename, age)) def test_youtube(self): - self._assert_restricted('07FYdnEawAQ', '07FYdnEawAQ.mp4', 10) + self._assert_restricted('HtVdAasjOgU', 'HtVdAasjOgU.mp4', 10) def test_youporn(self): self._assert_restricted( - 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', - '505835.mp4', 2, old_age=25) + 'https://www.youporn.com/watch/16715086/sex-ed-in-detention-18-asmr/', + '16715086.mp4', 2, old_age=25) if __name__ == '__main__': diff --git a/yt_dlp/compat/_legacy.py b/yt_dlp/compat/_legacy.py index d19333d314..84d749209e 100644 --- a/yt_dlp/compat/_legacy.py +++ b/yt_dlp/compat/_legacy.py @@ -1,5 +1,6 @@ """ Do not use! """ +import base64 import collections import ctypes import getpass @@ -29,6 +30,7 @@ from re import Pattern as compat_Pattern # noqa: F401 from re import match as compat_Match # noqa: F401 +from . 
import compat_expanduser, compat_HTMLParseError, compat_realpath from .compat_utils import passthrough_module from ..dependencies import Cryptodome_AES as compat_pycrypto_AES # noqa: F401 from ..dependencies import brotli as compat_brotli # noqa: F401 @@ -47,23 +49,25 @@ def compat_setenv(key, value, env=os.environ): env[key] = value +compat_base64_b64decode = base64.b64decode compat_basestring = str compat_casefold = str.casefold compat_chr = chr compat_collections_abc = collections.abc -compat_cookiejar = http.cookiejar -compat_cookiejar_Cookie = http.cookiejar.Cookie -compat_cookies = http.cookies -compat_cookies_SimpleCookie = http.cookies.SimpleCookie -compat_etree_Element = etree.Element -compat_etree_register_namespace = etree.register_namespace +compat_cookiejar = compat_http_cookiejar = http.cookiejar +compat_cookiejar_Cookie = compat_http_cookiejar_Cookie = http.cookiejar.Cookie +compat_cookies = compat_http_cookies = http.cookies +compat_cookies_SimpleCookie = compat_http_cookies_SimpleCookie = http.cookies.SimpleCookie +compat_etree_Element = compat_xml_etree_ElementTree_Element = etree.Element +compat_etree_register_namespace = compat_xml_etree_register_namespace = etree.register_namespace compat_filter = filter compat_get_terminal_size = shutil.get_terminal_size compat_getenv = os.getenv -compat_getpass = getpass.getpass +compat_getpass = compat_getpass_getpass = getpass.getpass compat_html_entities = html.entities compat_html_entities_html5 = html.entities.html5 -compat_HTMLParser = html.parser.HTMLParser +compat_html_parser_HTMLParseError = compat_HTMLParseError +compat_HTMLParser = compat_html_parser_HTMLParser = html.parser.HTMLParser compat_http_client = http.client compat_http_server = http.server compat_input = input @@ -72,6 +76,8 @@ def compat_setenv(key, value, env=os.environ): compat_kwargs = lambda kwargs: kwargs compat_map = map compat_numeric_types = (int, float, complex) +compat_os_path_expanduser = compat_expanduser 
+compat_os_path_realpath = compat_realpath compat_print = print compat_shlex_split = shlex.split compat_socket_create_connection = socket.create_connection @@ -81,7 +87,9 @@ def compat_setenv(key, value, env=os.environ): compat_subprocess_get_DEVNULL = lambda: DEVNULL compat_tokenize_tokenize = tokenize.tokenize compat_urllib_error = urllib.error +compat_urllib_HTTPError = urllib.error.HTTPError compat_urllib_parse = urllib.parse +compat_urllib_parse_parse_qs = urllib.parse.parse_qs compat_urllib_parse_quote = urllib.parse.quote compat_urllib_parse_quote_plus = urllib.parse.quote_plus compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus @@ -90,8 +98,10 @@ def compat_setenv(key, value, env=os.environ): compat_urllib_request = urllib.request compat_urllib_request_DataHandler = urllib.request.DataHandler compat_urllib_response = urllib.response -compat_urlretrieve = urllib.request.urlretrieve -compat_xml_parse_error = etree.ParseError +compat_urlretrieve = compat_urllib_request_urlretrieve = urllib.request.urlretrieve +compat_xml_parse_error = compat_xml_etree_ElementTree_ParseError = etree.ParseError compat_xpath = lambda xpath: xpath compat_zip = zip workaround_optparse_bug9161 = lambda: None + +legacy = [] diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 6dab2636b8..a7bcafb4c5 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -239,6 +239,7 @@ BleacherReportIE, BleacherReportCMSIE, ) +from .blerp import BlerpIE from .blogger import BloggerIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE @@ -861,6 +862,7 @@ from .kickstarter import KickStarterIE from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE +from .kommunetv import KommunetvIE from .kompas import KompasVideoIE from .konserthusetplay import KonserthusetPlayIE from .koo import KooIE @@ -1460,6 +1462,7 @@ PuhuTVIE, PuhuTVSerieIE, ) +from .pr0gramm import Pr0grammStaticIE, Pr0grammIE from .prankcast 
import PrankCastIE from .premiershiprugby import PremiershipRugbyIE from .presstv import PressTVIE @@ -1521,6 +1524,10 @@ RayWenderlichCourseIE, ) from .rbmaradio import RBMARadioIE +from .rbgtum import ( + RbgTumIE, + RbgTumCourseIE, +) from .rcs import ( RCSIE, RCSEmbedsIE, diff --git a/yt_dlp/extractor/americastestkitchen.py b/yt_dlp/extractor/americastestkitchen.py index abda55dcf3..e889458a28 100644 --- a/yt_dlp/extractor/americastestkitchen.py +++ b/yt_dlp/extractor/americastestkitchen.py @@ -11,7 +11,7 @@ class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:cooks(?:country|illustrated)/)?(?Pepisode|videos)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?:cooks(?:country|illustrated)/)?(?Pepisode|videos)/(?P\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', @@ -72,6 +72,12 @@ class AmericasTestKitchenIE(InfoExtractor): }, { 'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington', 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington', + 'only_matching': True, }] def _real_extract(self, url): @@ -100,7 +106,7 @@ def _real_extract(self, url): class AmericasTestKitchenSeasonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com(?P/cookscountry)?/episodes/browse/season_(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|(?Pcooks(?:country|illustrated)))\.com(?:(?:/(?Pcooks(?:country|illustrated)))?(?:/?$|(?\d+)))' _TESTS = [{ # ATK Season 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', @@ -117,29 +123,73 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): 'title': 'Season 12', }, 'playlist_count': 13, + }, 
{ + # America's Test Kitchen Series + 'url': 'https://www.americastestkitchen.com/', + 'info_dict': { + 'id': 'americastestkitchen', + 'title': 'America\'s Test Kitchen', + }, + 'playlist_count': 558, + }, { + # Cooks Country Series + 'url': 'https://www.americastestkitchen.com/cookscountry', + 'info_dict': { + 'id': 'cookscountry', + 'title': 'Cook\'s Country', + }, + 'playlist_count': 199, + }, { + 'url': 'https://www.americastestkitchen.com/cookscountry/', + 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com', + 'only_matching': True, + }, { + 'url': 'https://www.americastestkitchen.com/cooksillustrated/', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com', + 'only_matching': True, }] def _real_extract(self, url): - show_path, season_number = self._match_valid_url(url).group('show', 'id') - season_number = int(season_number) + season_number, show1, show = self._match_valid_url(url).group('season', 'show', 'show2') + show_path = ('/' + show) if show else '' + show = show or show1 + season_number = int_or_none(season_number) - slug = 'cco' if show_path == '/cookscountry' else 'atk' + slug, title = { + 'americastestkitchen': ('atk', 'America\'s Test Kitchen'), + 'cookscountry': ('cco', 'Cook\'s Country'), + 'cooksillustrated': ('cio', 'Cook\'s Illustrated'), + }[show] - season = 'Season %d' % season_number + facet_filters = [ + 'search_document_klass:episode', + 'search_show_slug:' + slug, + ] + + if season_number: + playlist_id = 'season_%d' % season_number + playlist_title = 'Season %d' % season_number + facet_filters.append('search_season_list:' + playlist_title) + else: + playlist_id = show + playlist_title = title season_search = self._download_json( 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, - season, headers={ + playlist_id, headers={ 'Origin': 
'https://www.americastestkitchen.com', 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', 'X-Algolia-Application-Id': 'Y1FNZXUI30', }, query={ - 'facetFilters': json.dumps([ - 'search_season_list:' + season, - 'search_document_klass:episode', - 'search_show_slug:' + slug, - ]), - 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, + 'facetFilters': json.dumps(facet_filters), + 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title,search_atk_episode_season' % slug, 'attributesToHighlight': '', 'hitsPerPage': 1000, }) @@ -162,4 +212,4 @@ def entries(): } return self.playlist_result( - entries(), 'season_%d' % season_number, season) + entries(), playlist_id, playlist_title) diff --git a/yt_dlp/extractor/blerp.py b/yt_dlp/extractor/blerp.py new file mode 100644 index 0000000000..4631ad2e97 --- /dev/null +++ b/yt_dlp/extractor/blerp.py @@ -0,0 +1,167 @@ +import json + +from .common import InfoExtractor +from ..utils import strip_or_none, traverse_obj + + +class BlerpIE(InfoExtractor): + IE_NAME = 'blerp' + _VALID_URL = r'https?://(?:www\.)?blerp\.com/soundbites/(?P[0-9a-zA-Z]+)' + _TESTS = [{ + 'url': 'https://blerp.com/soundbites/6320fe8745636cb4dd677a5a', + 'info_dict': { + 'id': '6320fe8745636cb4dd677a5a', + 'title': 'Samsung Galaxy S8 Over the Horizon Ringtone 2016', + 'uploader': 'luminousaj', + 'uploader_id': '5fb81e51aa66ae000c395478', + 'ext': 'mp3', + 'tags': ['samsung', 'galaxy', 's8', 'over the horizon', '2016', 'ringtone'], + } + }, { + 'url': 'https://blerp.com/soundbites/5bc94ef4796001000498429f', + 'info_dict': { + 'id': '5bc94ef4796001000498429f', + 'title': 'Yee', + 'uploader': '179617322678353920', + 'uploader_id': '5ba99cf71386730004552c42', + 'ext': 'mp3', + 'tags': ['YEE', 'YEET', 'wo ha haah catchy tune yee', 'yee'] + } + }] + + _GRAPHQL_OPERATIONNAME = "webBitePageGetBite" + _GRAPHQL_QUERY = ( + '''query webBitePageGetBite($_id: MongoID!) 
{ + web { + biteById(_id: $_id) { + ...bitePageFrag + __typename + } + __typename + } + } + + fragment bitePageFrag on Bite { + _id + title + userKeywords + keywords + color + visibility + isPremium + owned + price + extraReview + isAudioExists + image { + filename + original { + url + __typename + } + __typename + } + userReactions { + _id + reactions + createdAt + __typename + } + topReactions + totalSaveCount + saved + blerpLibraryType + license + licenseMetaData + playCount + totalShareCount + totalFavoriteCount + totalAddedToBoardCount + userCategory + userAudioQuality + audioCreationState + transcription + userTranscription + description + createdAt + updatedAt + author + listingType + ownerObject { + _id + username + profileImage { + filename + original { + url + __typename + } + __typename + } + __typename + } + transcription + favorited + visibility + isCurated + sourceUrl + audienceRating + strictAudienceRating + ownerId + reportObject { + reportedContentStatus + __typename + } + giphy { + mp4 + gif + __typename + } + audio { + filename + original { + url + __typename + } + mp3 { + url + __typename + } + __typename + } + __typename + } + + ''') + + def _real_extract(self, url): + audio_id = self._match_id(url) + + data = { + 'operationName': self._GRAPHQL_OPERATIONNAME, + 'query': self._GRAPHQL_QUERY, + 'variables': { + '_id': audio_id + } + } + + headers = { + 'Content-Type': 'application/json' + } + + json_result = self._download_json('https://api.blerp.com/graphql', + audio_id, data=json.dumps(data).encode('utf-8'), headers=headers) + + bite_json = json_result['data']['web']['biteById'] + + info_dict = { + 'id': bite_json['_id'], + 'url': bite_json['audio']['mp3']['url'], + 'title': bite_json['title'], + 'uploader': traverse_obj(bite_json, ('ownerObject', 'username'), expected_type=strip_or_none), + 'uploader_id': traverse_obj(bite_json, ('ownerObject', '_id'), expected_type=strip_or_none), + 'ext': 'mp3', + 'tags': list(filter(None, map(strip_or_none, 
(traverse_obj(bite_json, 'userKeywords', expected_type=list) or []))) or None) + } + + return info_dict diff --git a/yt_dlp/extractor/callin.py b/yt_dlp/extractor/callin.py index e9668763ef..c77179c7bb 100644 --- a/yt_dlp/extractor/callin.py +++ b/yt_dlp/extractor/callin.py @@ -1,9 +1,5 @@ from .common import InfoExtractor -from ..utils import ( - traverse_obj, - float_or_none, - int_or_none -) +from ..utils import float_or_none, int_or_none, make_archive_id, traverse_obj class CallinIE(InfoExtractor): @@ -35,6 +31,54 @@ class CallinIE(InfoExtractor): 'episode_number': 1, 'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd' } + }, { + 'url': 'https://www.callin.com/episode/fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW', + 'md5': '14ede27ee2c957b7e4db93140fc0745c', + 'info_dict': { + 'id': 'c3dab47f237bf953d180d3f243477a84302798be0e0b29bc9ade6d60a69f04f5', + 'ext': 'ts', + 'title': 'FCC Commissioner Brendan Carr on Elon’s Starlink', + 'description': 'Or, why the government doesn’t like SpaceX', + 'channel': 'The Pull Request', + 'channel_url': 'https://callin.com/show/the-pull-request-ucnDJmEKAa', + 'duration': 3182.472, + 'series_id': '7e9c23156e4aecfdcaef46bfb2ed7ca268509622ec006c0f0f25d90e34496638', + 'uploader_url': 'http://thepullrequest.com', + 'upload_date': '20220902', + 'episode': 'FCC Commissioner Brendan Carr on Elon’s Starlink', + 'display_id': 'fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW', + 'series': 'The Pull Request', + 'channel_id': '7e9c23156e4aecfdcaef46bfb2ed7ca268509622ec006c0f0f25d90e34496638', + 'view_count': int, + 'uploader': 'Antonio García Martínez', + 'thumbnail': 'https://d1z76fhpoqkd01.cloudfront.net/shows/legacy/1ade9142625344045dc17cf523469ced1d93610762f4c886d06aa190a2f979e8.png', + 'episode_id': 'c3dab47f237bf953d180d3f243477a84302798be0e0b29bc9ade6d60a69f04f5', + 'timestamp': 1662100688.005, + } + }, { + 'url': 
'https://www.callin.com/episode/episode-81-elites-melt-down-over-student-debt-lzxMidUnjA', + 'md5': '16f704ddbf82a27e3930533b12062f07', + 'info_dict': { + 'id': '8d06f869798f93a7814e380bceabea72d501417e620180416ff6bd510596e83c', + 'ext': 'ts', + 'title': 'Episode 81- Elites MELT DOWN over Student Debt Victory? Rumble in NYC?', + 'description': 'Let’s talk todays episode about the primary election shake up in NYC and the elites melting down over student debt cancelation.', + 'channel': 'The DEBRIEF With Briahna Joy Gray', + 'channel_url': 'https://callin.com/show/the-debrief-with-briahna-joy-gray-siiFDzGegm', + 'duration': 10043.16, + 'series_id': '61cea58444465fd26674069703bd8322993bc9e5b4f1a6d0872690554a046ff7', + 'uploader_url': 'http://patreon.com/badfaithpodcast', + 'upload_date': '20220826', + 'episode': 'Episode 81- Elites MELT DOWN over Student Debt Victory? Rumble in NYC?', + 'display_id': 'episode-', + 'series': 'The DEBRIEF With Briahna Joy Gray', + 'channel_id': '61cea58444465fd26674069703bd8322993bc9e5b4f1a6d0872690554a046ff7', + 'view_count': int, + 'uploader': 'Briahna Gray', + 'thumbnail': 'https://d1z76fhpoqkd01.cloudfront.net/shows/legacy/461ea0d86172cb6aff7d6c80fd49259cf5e64bdf737a4650f8bc24cf392ca218.png', + 'episode_id': '8d06f869798f93a7814e380bceabea72d501417e620180416ff6bd510596e83c', + 'timestamp': 1661476708.282, + } }] def try_get_user_name(self, d): @@ -86,6 +130,7 @@ def _real_extract(self, url): return { 'id': id, + '_old_archive_ids': [make_archive_id(self, display_id.rsplit('-', 1)[-1])], 'display_id': display_id, 'title': title, 'formats': formats, diff --git a/yt_dlp/extractor/cammodels.py b/yt_dlp/extractor/cammodels.py index 0509057fc6..135b31529f 100644 --- a/yt_dlp/extractor/cammodels.py +++ b/yt_dlp/extractor/cammodels.py @@ -1,9 +1,5 @@ from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - url_or_none, -) +from ..utils import int_or_none, url_or_none class CamModelsIE(InfoExtractor): @@ 
-17,32 +13,11 @@ class CamModelsIE(InfoExtractor): def _real_extract(self, url): user_id = self._match_id(url) - webpage = self._download_webpage( - url, user_id, headers=self.geo_verification_headers()) - - manifest_root = self._html_search_regex( - r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) - - if not manifest_root: - ERRORS = ( - ("I'm offline, but let's stay connected", 'This user is currently offline'), - ('in a private show', 'This user is in a private show'), - ('is currently performing LIVE', 'This model is currently performing live'), - ) - for pattern, message in ERRORS: - if pattern in webpage: - error = message - expected = True - break - else: - error = 'Unable to find manifest URL root' - expected = False - raise ExtractorError(error, expected=expected) - manifest = self._download_json( - '%s%s.json' % (manifest_root, user_id), user_id) + 'https://manifest-server.naiadsystems.com/live/s:%s.json' % user_id, user_id) formats = [] + thumbnails = [] for format_id, format_dict in manifest['formats'].items(): if not isinstance(format_dict, dict): continue @@ -82,12 +57,20 @@ def _real_extract(self, url): 'quality': -10, }) else: + if format_id == 'jpeg': + thumbnails.append({ + 'url': f['url'], + 'width': f['width'], + 'height': f['height'], + 'format_id': f['format_id'], + }) continue formats.append(f) return { 'id': user_id, 'title': user_id, + 'thumbnails': thumbnails, 'is_live': True, 'formats': formats, 'age_limit': 18 diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index b7c687bc32..ebacc87bc0 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1338,7 +1338,7 @@ def _get_tfa_info(self, note='two-factor verification code'): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))' property_re = 
(r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)' % {'prop': re.escape(prop), 'sep': '(?::|[:-])'}) template = r']+?%s[^>]+?%s' diff --git a/yt_dlp/extractor/ign.py b/yt_dlp/extractor/ign.py index d4797d35e0..e4db7f9fa9 100644 --- a/yt_dlp/extractor/ign.py +++ b/yt_dlp/extractor/ign.py @@ -1,17 +1,20 @@ import re +import urllib.error from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) +from ..compat import compat_parse_qs from ..utils import ( - HEADRequest, + ExtractorError, determine_ext, + error_to_compat_str, + extract_attributes, int_or_none, + merge_dicts, parse_iso8601, strip_or_none, - try_get, + traverse_obj, + url_or_none, + urljoin, ) @@ -20,14 +23,90 @@ def _call_api(self, slug): return self._download_json( 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug) + def _checked_call_api(self, slug): + try: + return self._call_api(slug) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 404: + e.cause.args = e.cause.args or [ + e.cause.geturl(), e.cause.getcode(), e.cause.reason] + raise ExtractorError( + 'Content not found: expired?', cause=e.cause, + expected=True) + raise + + def _extract_video_info(self, video, fatal=True): + video_id = video['videoId'] + + formats = [] + refs = traverse_obj(video, 'refs', expected_type=dict) or {} + + m3u8_url = url_or_none(refs.get('m3uUrl')) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + f4m_url = url_or_none(refs.get('f4mUrl')) + if f4m_url: + formats.extend(self._extract_f4m_formats( + f4m_url, video_id, f4m_id='hds', fatal=False)) + + for asset in (video.get('assets') or []): + asset_url = url_or_none(asset.get('url')) + if not asset_url: + continue + formats.append({ + 'url': asset_url, + 'tbr': int_or_none(asset.get('bitrate'), 1000), + 'fps': 
int_or_none(asset.get('frame_rate')), + 'height': int_or_none(asset.get('height')), + 'width': int_or_none(asset.get('width')), + }) + + mezzanine_url = traverse_obj( + video, ('system', 'mezzanineUrl'), expected_type=url_or_none) + if mezzanine_url: + formats.append({ + 'ext': determine_ext(mezzanine_url, 'mp4'), + 'format_id': 'mezzanine', + 'quality': 1, + 'url': mezzanine_url, + }) + + thumbnails = traverse_obj( + video, ('thumbnails', ..., {'url': 'url'}), expected_type=url_or_none) + tags = traverse_obj( + video, ('tags', ..., 'displayName'), + expected_type=lambda x: x.strip() or None) + + metadata = traverse_obj(video, 'metadata', expected_type=dict) or {} + title = traverse_obj( + metadata, 'longTitle', 'title', 'name', + expected_type=lambda x: x.strip() or None) + + return { + 'id': video_id, + 'title': title, + 'description': strip_or_none(metadata.get('description')), + 'timestamp': parse_iso8601(metadata.get('publishDate')), + 'duration': int_or_none(metadata.get('duration')), + 'thumbnails': thumbnails, + 'formats': formats, + 'tags': tags, + } + class IGNIE(IGNBaseIE): """ Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. Some videos of it.ign.com are also supported """ - - _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P[^/?&#]+)' + _VIDEO_PATH_RE = r'/(?:\d{4}/\d{2}/\d{2}/)?(?P.+?)' + _PLAYLIST_PATH_RE = r'(?:/?\?(?P[^&#]+))?' 
+ _VALID_URL = ( + r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos(?:%s)' + % '|'.join((_VIDEO_PATH_RE + r'(?:[/?&#]|$)', _PLAYLIST_PATH_RE))) IE_NAME = 'ign.com' _PAGE_TYPE = 'video' @@ -42,7 +121,13 @@ class IGNIE(IGNBaseIE): 'timestamp': 1370440800, 'upload_date': '20130605', 'tags': 'count:9', - } + 'display_id': 'the-last-of-us-review', + 'thumbnail': 'https://assets1.ignimgs.com/vid/thumbnails/user/2014/03/26/lastofusreviewmimig2.jpg', + 'duration': 440, + }, + 'params': { + 'nocheckcertificate': True, + }, }, { 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', 'md5': 'f1581a6fe8c5121be5b807684aeac3f6', @@ -54,84 +139,48 @@ class IGNIE(IGNBaseIE): 'timestamp': 1420571160, 'upload_date': '20150106', 'tags': 'count:4', - } + }, + 'skip': '404 Not Found', }, { 'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix', 'only_matching': True, }] + @classmethod + def _extract_embed_urls(cls, url, webpage): + grids = re.findall( + r'''(?s)]+\bclass\s*=\s*['"](?:[\w-]+\s+)*?content-feed-grid(?!\B|-)[^>]+>(.+?)]*>''', + webpage) + return filter(None, + (urljoin(url, m.group('path')) for m in re.finditer( + r''']+\bhref\s*=\s*('|")(?P/videos%s)\1''' + % cls._VIDEO_PATH_RE, grids[0] if grids else ''))) + def _real_extract(self, url): - display_id = self._match_id(url) - video = self._call_api(display_id) - video_id = video['videoId'] - metadata = video['metadata'] - title = metadata.get('longTitle') or metadata.get('title') or metadata['name'] + display_id, filt = self._match_valid_url(url).group('id', 'filt') + if display_id: + return self._extract_video(url, display_id) + return self._extract_playlist(url, filt or 'all') - formats = [] - refs = video.get('refs') or {} + def _extract_playlist(self, url, display_id): + webpage = self._download_webpage(url, display_id) - m3u8_url = refs.get('m3uUrl') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 
'm3u8_native', - m3u8_id='hls', fatal=False)) + return self.playlist_result( + (self.url_result(u, self.ie_key()) + for u in self._extract_embed_urls(url, webpage)), + playlist_id=display_id) - f4m_url = refs.get('f4mUrl') - if f4m_url: - formats.extend(self._extract_f4m_formats( - f4m_url, video_id, f4m_id='hds', fatal=False)) + def _extract_video(self, url, display_id): + video = self._checked_call_api(display_id) - for asset in (video.get('assets') or []): - asset_url = asset.get('url') - if not asset_url: - continue - formats.append({ - 'url': asset_url, - 'tbr': int_or_none(asset.get('bitrate'), 1000), - 'fps': int_or_none(asset.get('frame_rate')), - 'height': int_or_none(asset.get('height')), - 'width': int_or_none(asset.get('width')), - }) + info = self._extract_video_info(video) - mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl']) - if mezzanine_url: - formats.append({ - 'ext': determine_ext(mezzanine_url, 'mp4'), - 'format_id': 'mezzanine', - 'quality': 1, - 'url': mezzanine_url, - }) - - thumbnails = [] - for thumbnail in (video.get('thumbnails') or []): - thumbnail_url = thumbnail.get('url') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - }) - - tags = [] - for tag in (video.get('tags') or []): - display_name = tag.get('displayName') - if not display_name: - continue - tags.append(display_name) - - return { - 'id': video_id, - 'title': title, - 'description': strip_or_none(metadata.get('description')), - 'timestamp': parse_iso8601(metadata.get('publishDate')), - 'duration': int_or_none(metadata.get('duration')), + return merge_dicts({ 'display_id': display_id, - 'thumbnails': thumbnails, - 'formats': formats, - 'tags': tags, - } + }, info) -class IGNVideoIE(InfoExtractor): +class IGNVideoIE(IGNBaseIE): _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P\d+)/(?:video|trailer)/' _TESTS = [{ 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', @@ 
-143,7 +192,16 @@ class IGNVideoIE(InfoExtractor): 'description': 'Taking out assassination targets in Hitman has never been more stylish.', 'timestamp': 1444665600, 'upload_date': '20151012', - } + 'display_id': '112203', + 'thumbnail': 'https://sm.ign.com/ign_me/video/h/how-hitman/how-hitman-aims-to-be-different-than-every-other-s_8z14.jpg', + 'duration': 298, + 'tags': 'count:13', + 'display_id': '112203', + 'thumbnail': 'https://sm.ign.com/ign_me/video/h/how-hitman/how-hitman-aims-to-be-different-than-every-other-s_8z14.jpg', + 'duration': 298, + 'tags': 'count:13', + }, + 'expected_warnings': ['HTTP Error 400: Bad Request'], }, { 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', 'only_matching': True, @@ -163,22 +221,38 @@ class IGNVideoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') - url = self._request_webpage(req, video_id).geturl() + parsed_url = urllib.parse.urlparse(url) + embed_url = urllib.parse.urlunparse( + parsed_url._replace(path=parsed_url.path.rsplit('/', 1)[0] + '/embed')) + + webpage, urlh = self._download_webpage_handle(embed_url, video_id) + new_url = urlh.geturl() ign_url = compat_parse_qs( - compat_urllib_parse_urlparse(url).query).get('url', [None])[0] + urllib.parse.urlparse(new_url).query).get('url', [None])[-1] if ign_url: return self.url_result(ign_url, IGNIE.ie_key()) - return self.url_result(url) + video = self._search_regex(r'(]+\bdata-video-id\s*=\s*[^>]+>)', webpage, 'video element', fatal=False) + if not video: + if new_url == url: + raise ExtractorError('Redirect loop: ' + url) + return self.url_result(new_url) + video = extract_attributes(video) + video_data = video.get('data-settings') or '{}' + video_data = self._parse_json(video_data, video_id)['video'] + info = self._extract_video_info(video_data) + + return merge_dicts({ + 'display_id': video_id, + }, info) class IGNArticleIE(IGNBaseIE): - 
_VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P[^/?&#]+)' + _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?(?:[\w-]+/)*?feature/\d+)/(?P[^/?&#]+)' _PAGE_TYPE = 'article' _TESTS = [{ 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'info_dict': { - 'id': '524497489e4e8ff5848ece34', + 'id': '72113', 'title': '100 Little Things in GTA 5 That Will Blow Your Mind', }, 'playlist': [ @@ -186,34 +260,43 @@ class IGNArticleIE(IGNBaseIE): 'info_dict': { 'id': '5ebbd138523268b93c9141af17bec937', 'ext': 'mp4', - 'title': 'GTA 5 Video Review', + 'title': 'Grand Theft Auto V Video Review', 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', 'timestamp': 1379339880, 'upload_date': '20130916', + 'tags': 'count:12', + 'thumbnail': 'https://assets1.ignimgs.com/thumbs/userUploaded/2021/8/16/gta-v-heistsjpg-e94705-1629138553533.jpeg', + 'display_id': 'grand-theft-auto-v-video-review', + 'duration': 501, }, }, { 'info_dict': { 'id': '638672ee848ae4ff108df2a296418ee2', 'ext': 'mp4', - 'title': '26 Twisted Moments from GTA 5 in Slow Motion', + 'title': 'GTA 5 In Slow Motion', 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', 'timestamp': 1386878820, 'upload_date': '20131212', + 'duration': 202, + 'tags': 'count:25', + 'display_id': 'gta-5-in-slow-motion', + 'thumbnail': 'https://assets1.ignimgs.com/vid/thumbnails/user/2013/11/03/GTA-SLO-MO-1.jpg', }, }, ], 'params': { - 'playlist_items': '2-3', 'skip_download': True, }, + 'expected_warnings': ['Backend fetch failed'], }, { 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', 'info_dict': { 'id': '53ee806780a81ec46e0790f8', 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', }, - 'playlist_count': 2, + 'playlist_count': 1, + 'expected_warnings': ['Backend 
fetch failed'], }, { # videoId pattern 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', @@ -236,18 +319,84 @@ class IGNArticleIE(IGNBaseIE): 'only_matching': True, }] + def _checked_call_api(self, slug): + try: + return self._call_api(slug) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError): + e.cause.args = e.cause.args or [ + e.cause.geturl(), e.cause.getcode(), e.cause.reason] + if e.cause.code == 404: + raise ExtractorError( + 'Content not found: expired?', cause=e.cause, + expected=True) + elif e.cause.code == 503: + self.report_warning(error_to_compat_str(e.cause)) + return + raise + def _real_extract(self, url): display_id = self._match_id(url) - article = self._call_api(display_id) + article = self._checked_call_api(display_id) - def entries(): - media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) - if media_url: - yield self.url_result(media_url, IGNIE.ie_key()) - for content in (article.get('content') or []): - for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|]+src)="([^"]+)"', content): - yield self.url_result(video_url) + if article: + # obsolete ? 
+ def entries(): + media_url = traverse_obj( + article, ('mediaRelations', 0, 'media', 'metadata', 'url'), + expected_type=url_or_none) + if media_url: + yield self.url_result(media_url, IGNIE.ie_key()) + for content in (article.get('content') or []): + for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|]+src)="([^"]+)"', content): + if url_or_none(video_url): + yield self.url_result(video_url) + + return self.playlist_result( + entries(), article.get('articleId'), + traverse_obj( + article, ('metadata', 'headline'), + expected_type=lambda x: x.strip() or None)) + + webpage = self._download_webpage(url, display_id) + + playlist_id = self._html_search_meta('dable:item_id', webpage, default=None) + if playlist_id: + + def entries(): + for m in re.finditer( + r'''(?s)]+\bclass\s*=\s*("|')ign-videoplayer\1[^>]*>(?P.+?)]+\bname\s*=\s*("|')flashvars\2[^>]*>)''', + m.group('params'), 'flashvars', default='') + flashvars = compat_parse_qs(extract_attributes(flashvars).get('value') or '') + v_url = url_or_none((flashvars.get('url') or [None])[-1]) + if v_url: + yield self.url_result(v_url) + else: + playlist_id = self._search_regex( + r'''\bdata-post-id\s*=\s*("|')(?P[\da-f]+)\1''', + webpage, 'id', group='id', default=None) + + nextjs_data = self._search_nextjs_data(webpage, display_id) + + def entries(): + for player in traverse_obj( + nextjs_data, + ('props', 'apolloState', 'ROOT_QUERY', lambda k, _: k.startswith('videoPlayerProps('), '__ref')): + # skip promo links (which may not always be served, eg GH CI servers) + if traverse_obj(nextjs_data, + ('props', 'apolloState', player.replace('PlayerProps', 'ModernContent')), + expected_type=dict): + continue + video = traverse_obj(nextjs_data, ('props', 'apolloState', player), expected_type=dict) or {} + info = self._extract_video_info(video, fatal=False) + if info: + yield merge_dicts({ + 'display_id': display_id, + }, info) return self.playlist_result( - entries(), article.get('articleId'), - 
strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) + entries(), playlist_id or display_id, + re.sub(r'\s+-\s+IGN\s*$', '', self._og_search_title(webpage, default='')) or None) diff --git a/yt_dlp/extractor/kommunetv.py b/yt_dlp/extractor/kommunetv.py new file mode 100644 index 0000000000..e21e556be3 --- /dev/null +++ b/yt_dlp/extractor/kommunetv.py @@ -0,0 +1,31 @@ +from .common import InfoExtractor +from ..utils import update_url + + +class KommunetvIE(InfoExtractor): + _VALID_URL = r'https://(\w+).kommunetv.no/archive/(?P\w+)' + _TEST = { + 'url': 'https://oslo.kommunetv.no/archive/921', + 'md5': '5f102be308ee759be1e12b63d5da4bbc', + 'info_dict': { + 'id': '921', + 'title': 'Bystyremøte', + 'ext': 'mp4' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + headers = { + 'Accept': 'application/json' + } + data = self._download_json('https://oslo.kommunetv.no/api/streams?streamType=1&id=%s' % video_id, video_id, headers=headers) + title = data['stream']['title'] + file = data['playlist'][0]['playlist'][0]['file'] + url = update_url(file, query=None, fragment=None) + formats = self._extract_m3u8_formats(url, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + return { + 'id': video_id, + 'formats': formats, + 'title': title + } diff --git a/yt_dlp/extractor/myvideoge.py b/yt_dlp/extractor/myvideoge.py index 513d4cb773..64cee48e7f 100644 --- a/yt_dlp/extractor/myvideoge.py +++ b/yt_dlp/extractor/myvideoge.py @@ -1,5 +1,16 @@ +import re + from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( + MONTH_NAMES, + clean_html, + get_element_by_class, + get_element_by_id, + int_or_none, + js_to_json, + qualities, + unified_strdate, +) class MyVideoGeIE(InfoExtractor): @@ -11,37 +22,50 @@ class MyVideoGeIE(InfoExtractor): 'id': '3941048', 'ext': 'mp4', 'title': 'The best prikol', + 'upload_date': '20200611', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 
'md5:d72addd357b0dd914e704781f7f777d8', - 'description': 'md5:5c0371f540f5888d603ebfedd46b6df3' - } + 'uploader': 'chixa33', + 'description': 'md5:5b067801318e33c2e6eea4ab90b1fdd3', + }, } + _MONTH_NAMES_KA = ['იანვარი', 'თებერვალი', 'მარტი', 'აპრილი', 'მაისი', 'ივნისი', 'ივლისი', 'აგვისტო', 'სექტემბერი', 'ოქტომბერი', 'ნოემბერი', 'დეკემბერი'] + + _quality = staticmethod(qualities(('SD', 'HD'))) def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r']*>([^<]+)', webpage, 'title') - description = self._og_search_description(webpage) - thumbnail = self._html_search_meta(['og:image'], webpage) - uploader = self._search_regex(r']+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False) + title = ( + self._og_search_title(webpage, default=None) + or clean_html(get_element_by_class('my_video_title', webpage)) + or self._html_search_regex(r']*>([^<]+)]+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False) + + upload_date = get_element_by_class('mv_vid_upl_date', webpage) + # as ka locale may not be present roll a local date conversion + upload_date = (unified_strdate( + # translate any ka month to an en one + re.sub('|'.join(self._MONTH_NAMES_KA), + lambda m: MONTH_NAMES['en'][self._MONTH_NAMES_KA.index(m.group(0))], + upload_date, re.I)) + if upload_date else None) return { 'id': video_id, @@ -49,5 +73,9 @@ def _formats_key(f): 'description': description, 'uploader': uploader, 'formats': formats, - 'thumbnail': thumbnail + 'thumbnail': self._og_search_thumbnail(webpage), + 'upload_date': upload_date, + 'view_count': int_or_none(get_element_by_class('mv_vid_views', webpage)), + 'like_count': int_or_none(get_element_by_id('likes_count', webpage)), + 'dislike_count': int_or_none(get_element_by_id('dislikes_count', webpage)), } diff --git a/yt_dlp/extractor/pr0gramm.py b/yt_dlp/extractor/pr0gramm.py new file mode 100644 index 0000000000..2eb327fba1 --- 
/dev/null +++ b/yt_dlp/extractor/pr0gramm.py @@ -0,0 +1,97 @@ +import re + +from .common import InfoExtractor +from ..utils import merge_dicts + + +class Pr0grammStaticIE(InfoExtractor): + # Possible urls: + # https://pr0gramm.com/static/5466437 + _VALID_URL = r'https?://pr0gramm\.com/static/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://pr0gramm.com/static/5466437', + 'md5': '52fa540d70d3edc286846f8ca85938aa', + 'info_dict': { + 'id': '5466437', + 'ext': 'mp4', + 'title': 'pr0gramm-5466437 by g11st', + 'uploader': 'g11st', + 'upload_date': '20221221', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # Fetch media sources + entries = self._parse_html5_media_entries(url, webpage, video_id) + media_info = entries[0] + + # Fetch author + uploader = self._html_search_regex(r'by\W+([\w-]+)\W+', webpage, 'uploader') + + # Fetch approx upload timestamp from filename + # Have None-defaults in case the extraction fails + uploadDay = None + uploadMon = None + uploadYear = None + uploadTimestr = None + # (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4) + m = re.search(r'//img\.pr0gramm\.com/(?P<year>[\d]+)/(?P<mon>[\d]+)/(?P<day>[\d]+)/\w+\.\w{,4}', webpage) + + if (m): + # Up to a day of accuracy should suffice... + uploadDay = m.groupdict().get('day') + uploadMon = m.groupdict().get('mon') + uploadYear = m.groupdict().get('year') + uploadTimestr = uploadYear + uploadMon + uploadDay + + return merge_dicts({ + 'id': video_id, + 'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''), + 'uploader': uploader, + 'upload_date': uploadTimestr + }, media_info) + + +# This extractor is for the primary url (used for sharing, and appears in the +# location bar) Since this page loads the DOM via JS, yt-dl can't find any +# video information here. So let's redirect to a compatibility version of +# the site, which does contain the