From 7e09c147fdccb44806bbf601573adc4b77210a89 Mon Sep 17 00:00:00 2001
From: trainman261
Date: Tue, 12 Dec 2023 01:00:35 +0100
Subject: [PATCH] [ie/theplatform] Extract more metadata (#8635)

Authored by: trainman261
---
Reviewer note (commentary only; text between the three-dash separator and
the first file diff is not part of the commit message):

* theplatform.py: the returned metadata dict gains 'creator', 'categories',
  'tags', 'location', 'series', 'season_number' and 'media_type'.
* The new nested helper extract_site_specific_field(field) looks through
  `info` for a truthy value whose key ends in '$<field>' (for example
  'cbc$seasonNumber') and returns a single match (get_all=False).
* 'categories' keeps the 'name' of entries whose 'label' is 'category' or
  missing; 'tags' splits 'keywords' on ';' or ',' plus one optional
  whitespace character.
* The other six files only update test expectations for the new fields
  (cbc.py also swaps the expired "The National" test URL for a newer one).

 yt_dlp/extractor/aenetworks.py      | 23 ++++++++++++++++--
 yt_dlp/extractor/cbc.py             | 37 ++++++++++++++++++++++-------
 yt_dlp/extractor/cwtv.py            |  4 ++++
 yt_dlp/extractor/mediaset.py        |  2 ++
 yt_dlp/extractor/nbc.py             |  4 ++++
 yt_dlp/extractor/scrippsnetworks.py |  2 ++
 yt_dlp/extractor/theplatform.py     | 12 ++++++++++
 7 files changed, 73 insertions(+), 11 deletions(-)

diff --git a/yt_dlp/extractor/aenetworks.py b/yt_dlp/extractor/aenetworks.py
index cc26653c1d..63a0532ef1 100644
--- a/yt_dlp/extractor/aenetworks.py
+++ b/yt_dlp/extractor/aenetworks.py
@@ -121,11 +121,21 @@ class AENetworksIE(AENetworksBaseIE):
         'info_dict': {
             'id': '22253814',
             'ext': 'mp4',
-            'title': 'Winter is Coming',
-            'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
+            'title': 'Winter Is Coming',
+            'description': 'md5:a40e370925074260b1c8a633c632c63a',
             'timestamp': 1338306241,
             'upload_date': '20120529',
             'uploader': 'AENE-NEW',
+            'duration': 2592.0,
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'chapters': 'count:5',
+            'tags': 'count:14',
+            'categories': ['Mountain Men'],
+            'episode_number': 1,
+            'episode': 'Episode 1',
+            'season': 'Season 1',
+            'season_number': 1,
+            'series': 'Mountain Men',
         },
         'params': {
             # m3u8 download
@@ -143,6 +153,15 @@ class AENetworksIE(AENetworksBaseIE):
             'timestamp': 1452634428,
             'upload_date': '20160112',
             'uploader': 'AENE-NEW',
+            'duration': 1277.695,
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'chapters': 'count:4',
+            'tags': 'count:23',
+            'episode': 'Episode 1',
+            'episode_number': 1,
+            'season': 'Season 9',
+            'season_number': 9,
+            'series': 'Duck Dynasty',
         },
         'params': {
             # m3u8 download
diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py
index 29f0e307d1..b5beb1ec8c 100644
--- a/yt_dlp/extractor/cbc.py
+++ b/yt_dlp/extractor/cbc.py
@@ -180,6 +180,13 @@ class CBCPlayerIE(InfoExtractor):
             'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg',
             'chapters': [],
             'duration': 494.811,
+            'categories': ['AudioMobile/All in a Weekend Montreal'],
+            'tags': 'count:8',
+            'location': 'Quebec',
+            'series': 'All in a Weekend Montreal',
+            'season': 'Season 2015',
+            'season_number': 2015,
+            'media_type': 'Excerpt',
         },
     }, {
         'url': 'http://www.cbc.ca/player/play/2164402062',
@@ -195,25 +202,37 @@ class CBCPlayerIE(InfoExtractor):
             'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg',
             'chapters': [],
             'duration': 186.867,
+            'series': 'CBC News: Windsor at 6:00',
+            'categories': ['News/Canada/Windsor'],
+            'location': 'Windsor',
+            'tags': ['cancer'],
+            'creator': 'Allison Johnson',
+            'media_type': 'Excerpt',
         },
     }, {
         # Has subtitles
         # These broadcasts expire after ~1 month, can find new test URL here:
         # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast
-        'url': 'http://www.cbc.ca/player/play/2249992771553',
-        'md5': '2f2fb675dd4f0f8a5bb7588d1b13bacd',
+        'url': 'http://www.cbc.ca/player/play/2284799043667',
+        'md5': '9b49f0839e88b6ec0b01d840cf3d42b5',
         'info_dict': {
-            'id': '2249992771553',
+            'id': '2284799043667',
             'ext': 'mp4',
-            'title': 'The National | Women’s soccer pay, Florida seawater, Swift quake',
-            'description': 'md5:adba28011a56cfa47a080ff198dad27a',
-            'timestamp': 1690596000,
-            'duration': 2716.333,
+            'title': 'The National | Hockey coach charged, Green grants, Safer drugs',
+            'description': 'md5:84ef46321c94bcf7d0159bb565d26bfa',
+            'timestamp': 1700272800,
+            'duration': 2718.833,
             'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
-            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/481/326/thumbnail.jpeg',
+            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/907/171/thumbnail.jpeg',
             'uploader': 'CBCC-NEW',
             'chapters': 'count:5',
-            'upload_date': '20230729',
+            'upload_date': '20231118',
+            'categories': 'count:4',
+            'series': 'The National - Full Show',
+            'tags': 'count:1',
+            'creator': 'News',
+            'location': 'Canada',
+            'media_type': 'Full Program',
         },
     }]
 
diff --git a/yt_dlp/extractor/cwtv.py b/yt_dlp/extractor/cwtv.py
index 9b83264ee1..69d50daf6c 100644
--- a/yt_dlp/extractor/cwtv.py
+++ b/yt_dlp/extractor/cwtv.py
@@ -46,6 +46,10 @@ class CWTVIE(InfoExtractor):
             'timestamp': 1444107300,
             'age_limit': 14,
             'uploader': 'CWTV',
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'chapters': 'count:4',
+            'episode': 'Episode 20',
+            'season': 'Season 11',
         },
         'params': {
             # m3u8 download
diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py
index 2d62042982..e04a1ce901 100644
--- a/yt_dlp/extractor/mediaset.py
+++ b/yt_dlp/extractor/mediaset.py
@@ -73,6 +73,7 @@ class MediasetIE(ThePlatformBaseIE):
             'season_number': 5,
             'episode_number': 5,
             'chapters': [{'start_time': 0.0, 'end_time': 3409.08}, {'start_time': 3409.08, 'end_time': 6565.008}],
+            'categories': ['Informazione'],
         },
     }, {
         # DRM
@@ -149,6 +150,7 @@ class MediasetIE(ThePlatformBaseIE):
             'season_number': 12,
             'episode': 'Episode 8',
             'episode_number': 8,
+            'categories': ['Intrattenimento'],
         },
         'params': {
             'skip_download': True,
diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py
index 2d3aa26ec9..267fa83532 100644
--- a/yt_dlp/extractor/nbc.py
+++ b/yt_dlp/extractor/nbc.py
@@ -53,6 +53,8 @@ class NBCIE(ThePlatformIE):  # XXX: Do not subclass from concrete IE
                 'chapters': 'count:1',
                 'tags': 'count:4',
                 'thumbnail': r're:https?://.+\.jpg',
+                'categories': ['Series/The Tonight Show Starring Jimmy Fallon'],
+                'media_type': 'Full Episode',
             },
             'params': {
                 'skip_download': 'm3u8',
@@ -131,6 +133,8 @@ class NBCIE(ThePlatformIE):  # XXX: Do not subclass from concrete IE
                 'tags': 'count:10',
                 'age_limit': 0,
                 'thumbnail': r're:https?://.+\.jpg',
+                'categories': ['Series/Quantum Leap 2022'],
+                'media_type': 'Highlight',
             },
             'params': {
                 'skip_download': 'm3u8',
diff --git a/yt_dlp/extractor/scrippsnetworks.py b/yt_dlp/extractor/scrippsnetworks.py
index 7f0bc96456..3912f77865 100644
--- a/yt_dlp/extractor/scrippsnetworks.py
+++ b/yt_dlp/extractor/scrippsnetworks.py
@@ -114,6 +114,8 @@ class ScrippsNetworksIE(InfoExtractor):
             'timestamp': 1475678834,
             'upload_date': '20161005',
             'uploader': 'SCNI-SCND',
+            'tags': 'count:10',
+            'creator': 'Cooking Channel',
             'duration': 29.995,
             'chapters': [{'start_time': 0.0, 'end_time': 29.995, 'title': ''}],
             'thumbnail': 'https://images.dds.discovery.com/up/tp/Scripps_-_Food_Category_Prod/122/987/0260338_630x355.jpg',
diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py
index 433ce8427c..9160f5ec6b 100644
--- a/yt_dlp/extractor/theplatform.py
+++ b/yt_dlp/extractor/theplatform.py
@@ -104,6 +104,10 @@ def _add_chapter(start_time, end_time):
                 _add_chapter(chapter.get('startTime'), chapter.get('endTime'))
             _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
 
+        def extract_site_specific_field(field):
+            # A number of sites have custom-prefixed keys, e.g. 'cbc$seasonNumber'
+            return traverse_obj(info, lambda k, v: v and k.endswith(f'${field}'), get_all=False)
+
         return {
             'title': info['title'],
             'subtitles': subtitles,
@@ -113,6 +117,14 @@ def _add_chapter(start_time, end_time):
             'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
             'uploader': info.get('billingCode'),
             'chapters': chapters,
+            'creator': traverse_obj(info, ('author', {str})) or None,
+            'categories': traverse_obj(info, (
+                'categories', lambda _, v: v.get('label') in ('category', None), 'name', {str})) or None,
+            'tags': traverse_obj(info, ('keywords', {lambda x: re.split(r'[;,]\s?', x) if x else None})),
+            'location': extract_site_specific_field('region'),
+            'series': extract_site_specific_field('show'),
+            'season_number': int_or_none(extract_site_specific_field('seasonNumber')),
+            'media_type': extract_site_specific_field('programmingType') or extract_site_specific_field('type'),
         }
 
     def _extract_theplatform_metadata(self, path, video_id):