class BilibiliCategoryIE(InfoExtractor):
    """Playlist extractor for Bilibili category pages (``/v/<category>/<subcategory>``).

    The category/subcategory pair from the URL is translated to a numeric
    ``rid`` through ``_RID_MAP`` and the ``newlist`` web-interface endpoint is
    paged lazily; every listed video is delegated to the ``BiliBili`` extractor.
    """
    IE_NAME = 'Bilibili category extractor'
    _MAX_RESULTS = 1000000
    _VALID_URL = r'https?://www\.bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+'
    _TESTS = [{
        'url': 'https://www.bilibili.com/v/kichiku/mad',
        'info_dict': {
            'id': 'kichiku: mad',
            'title': 'kichiku: mad'
        },
        'playlist_mincount': 45,
        'params': {
            'playlistend': 45
        }
    }]

    # map of categories : subcategories : RIDs
    # Kept at class level so the table is built once, not on every _entries() call.
    _RID_MAP = {
        'kichiku': {
            'mad': 26,
            'manual_vocaloid': 126,
            'guide': 22,
            'theatre': 216,
            'course': 127,
        },
    }

    def _fetch_page(self, api_url, num_pages, query, page_num):
        """Yield a ``url_result`` for every video on one listing page.

        Used as the page function of ``OnDemandPagedList``.

        api_url   -- listing endpoint with the ``rid`` already filled in
        num_pages -- total number of pages, as computed by ``_entries``
        query     -- "<category>: <subcategory>"; used as the download id and
                     sent as ``Search_key``
        page_num  -- zero-based page index supplied by ``OnDemandPagedList``

        Raises ExtractorError if a page that should exist has no video list.
        """
        # The paged list counts pages from 0, whereas the endpoint is queried
        # from pn=1 (see the first-page request in _entries).
        page_num += 1
        if page_num > num_pages:
            # A completely full last page makes the paged list probe one page
            # further; that is the normal end of the listing, not a failure.
            return

        parsed_json = self._download_json(
            api_url, query, query={'Search_key': query, 'pn': page_num},
            note='Extracting results from page %s of %s' % (page_num, num_pages))

        video_list = try_get(parsed_json, lambda x: x['data']['archives'], list)
        if not video_list:
            raise ExtractorError('Failed to retrieve video list for page %d' % page_num)

        for video in video_list:
            yield self.url_result(
                'https://www.bilibili.com/video/%s' % video['bvid'], 'BiliBili', video['bvid'])

    def _entries(self, category, subcategory, query):
        """Return a lazily evaluated paged list of the category's videos.

        Raises ExtractorError if the category or subcategory is not present in
        ``_RID_MAP``, or if the first response lacks usable paging information.
        """
        rid_map = self._RID_MAP

        if category not in rid_map:
            raise ExtractorError('The supplied category, %s, is not supported. List of supported categories: %s' % (category, list(rid_map.keys())))

        if subcategory not in rid_map[category]:
            raise ExtractorError('The subcategory, %s, isn\'t supported for this category. Supported subcategories: %s' % (subcategory, list(rid_map[category].keys())))

        rid_value = rid_map[category][subcategory]

        api_url = 'https://api.bilibili.com/x/web-interface/newlist?rid=%d&type=1&ps=20&jsonp=jsonp' % rid_value
        # The first page is requested up front only to learn the paging data.
        page_json = self._download_json(api_url, query, query={'Search_key': query, 'pn': '1'})
        # try_get() returns None when data.page is missing or not a dict; fall
        # back to {} so the check below raises the intended ExtractorError
        # rather than an AttributeError on None.
        page_data = try_get(page_json, lambda x: x['data']['page'], dict) or {}
        count, size = int_or_none(page_data.get('count')), int_or_none(page_data.get('size'))
        if count is None or not size:
            raise ExtractorError('Failed to calculate either page count or size')

        num_pages = math.ceil(count / size)

        return OnDemandPagedList(functools.partial(
            self._fetch_page, api_url, num_pages, query), size)

    def _real_extract(self, url):
        """Build the playlist for a ``/v/<category>/<subcategory>`` URL."""
        # The path is '/v/<category>/<subcategory>...', so after splitting on
        # '/' the two names sit at indices 2 and 3.
        path_parts = compat_urllib_parse_urlparse(url).path.split('/')
        category, subcategory = path_parts[2:4]
        query = '%s: %s' % (category, subcategory)

        return self.playlist_result(self._entries(category, subcategory, query), query, query)
BilibiliAudioAlbumIE,