diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py
index 6fcc4ac932..554a570059 100644
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@@ -7,6 +7,7 @@
 
 from .common import InfoExtractor, SearchInfoExtractor
 from ..compat import (
+    compat_str,
     compat_parse_qs,
     compat_urlparse,
 )
@@ -15,6 +16,7 @@
     int_or_none,
     float_or_none,
     parse_iso8601,
+    try_get,
     smuggle_url,
     str_or_none,
     strip_jsonp,
@@ -113,6 +115,13 @@ class BiliBiliIE(InfoExtractor):
         # new BV video id format
         'url': 'https://www.bilibili.com/video/BV1JE411F741',
         'only_matching': True,
+    }, {
+        # Anthology
+        'url': 'https://www.bilibili.com/video/BV1bK411W797',
+        'info_dict': {
+            'id': 'BV1bK411W797',
+        },
+        'playlist_count': 17,
     }]
 
     _APP_KEY = 'iVGUTjsxvpLeuDCf'
@@ -139,9 +148,19 @@ def _real_extract(self, url):
         page_id = mobj.group('page')
         webpage = self._download_webpage(url, video_id)
 
+        # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself.
+        # If the video has no page argument, check to see if it's an anthology
+        if page_id is None:
+            if not self._downloader.params.get('noplaylist'):
+                r = self._extract_anthology_entries(bv_id, video_id, webpage)
+                if r is not None:
+                    self.to_screen('Downloading anthology %s - add --no-playlist to just download video' % video_id)
+                    return r
+            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
         if 'anime/' not in url:
             cid = self._search_regex(
-                r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + str(page_id), webpage, 'cid',
+                r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + compat_str(page_id), webpage, 'cid',
                 default=None
             ) or self._search_regex(
                 r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid',
@@ -224,7 +243,18 @@ def _real_extract(self, url):
         title = self._html_search_regex(
             (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
              r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
-            group='title') + ('_p' + str(page_id) if page_id is not None else '')
+            group='title')
+
+        # Get part title for anthologies
+        if page_id is not None:
+            # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video
+            part_title = try_get(
+                self._download_json(
+                    "https://api.bilibili.com/x/player/pagelist?bvid=%s&jsonp=jsonp" % bv_id,
+                    video_id, note='Extracting videos in anthology'),
+                lambda x: x['data'][int(page_id) - 1]['part'])
+            title = part_title or title
+
         description = self._html_search_meta('description', webpage)
         timestamp = unified_timestamp(self._html_search_regex(
             r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
@@ -234,7 +264,7 @@
         # TODO 'view_count' requires deobfuscating Javascript
 
         info = {
-            'id': str(video_id) if page_id is None else '%s_p%s' % (video_id, page_id),
+            'id': compat_str(video_id) if page_id is None else '%s_p%s' % (video_id, page_id),
             'cid': cid,
             'title': title,
             'description': description,
@@ -300,7 +330,7 @@ def get_comments():
 
         global_info = {
             '_type': 'multi_video',
-            'id': video_id,
+            'id': compat_str(video_id),
             'bv_id': bv_id,
             'title': title,
             'description': description,
@@ -312,6 +342,20 @@ def get_comments():
 
         return global_info
 
+    def _extract_anthology_entries(self, bv_id, video_id, webpage):
+        title = self._html_search_regex(
+            (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
+            group='title')
+        json_data = self._download_json(
+            "https://api.bilibili.com/x/player/pagelist?bvid=%s&jsonp=jsonp" % bv_id,
+            video_id, note='Extracting videos in anthology')
+
+        if len(json_data['data']) > 1:
+            return self.playlist_from_matches(
+                json_data['data'], bv_id, title, ie=BiliBiliIE.ie_key(),
+                getter=lambda entry: 'https://www.bilibili.com/video/%s?p=%d' % (bv_id, entry['page']))
+
     def _get_video_id_set(self, id, is_bv):
         query = {'bvid': id} if is_bv else {'aid': id}
         response = self._download_json(
@@ -506,7 +550,7 @@ def _get_n_results(self, query, n):
 
             videos = data['result']
             for video in videos:
-                e = self.url_result(video['arcurl'], 'BiliBili', str(video['aid']))
+                e = self.url_result(video['arcurl'], 'BiliBili', compat_str(video['aid']))
                 entries.append(e)
 
             if(len(entries) >= n or len(videos) >= BiliBiliSearchIE.MAX_NUMBER_OF_RESULTS):
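
Note on the anthology flow above: _extract_anthology_entries only returns a playlist when the pagelist has more than one entry, so plain single-part videos fall through to normal extraction. Below is a minimal standalone sketch of the mapping the patch relies on. It assumes only the response shape the extractor actually reads (a 'data' list whose items carry 'page' and 'part'); the sample values and helper names are illustrative and not part of the patch.

    # Sketch only: 'sample_pagelist' mimics the shape the extractor relies on
    # (data -> list of {'page', 'part'}); it is not a verbatim API capture.
    sample_pagelist = {
        'data': [
            {'page': 1, 'part': 'Intro'},
            {'page': 2, 'part': 'Setup'},
        ],
    }


    def anthology_entry_urls(bv_id, pagelist):
        # Same URL construction as the getter= lambda in _extract_anthology_entries
        return ['https://www.bilibili.com/video/%s?p=%d' % (bv_id, entry['page'])
                for entry in pagelist['data']]


    def part_title(pagelist, page_id, fallback):
        # Same lookup as the try_get call in _real_extract: pages are 1-indexed,
        # and a missing or empty 'part' falls back to the page <h1> title
        try:
            return pagelist['data'][int(page_id) - 1]['part'] or fallback
        except (IndexError, KeyError, TypeError, ValueError):
            return fallback


    print(anthology_entry_urls('BV1bK411W797', sample_pagelist))
    # ['https://www.bilibili.com/video/BV1bK411W797?p=1',
    #  'https://www.bilibili.com/video/BV1bK411W797?p=2']
    print(part_title(sample_pagelist, '2', 'Anthology title'))
    # Setup

The URLs match what playlist_from_matches feeds back into BiliBiliIE, and part_title mirrors the try_get lookup. The sketch also shows where the TODO could be addressed: the pagelist fetched once in _extract_anthology_entries already holds every 'part' title, so _real_extract would not need to re-download the JSON for each page.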