Update to ytdl-commit-6508688

Make default upload_/release_date a compat_str
6508688e88

Except:
* "[NDR] Overhaul NDR and NJoy extractors" https://github.com/ytdl-org/youtube-dl/pull/30531
    - 01824d275b
    - 39a98b09a2
    - f0a05a55c2
    - 4186e81777
This commit is contained in:
pukkandan 2022-03-04 22:31:04 +05:30
parent 72e995f122
commit 50e93e03a7
No known key found for this signature in database
GPG Key ID: 7EEE9E1E817D0A39
21 changed files with 621 additions and 140 deletions

View File

@ -18,7 +18,7 @@ class AliExpressLiveIE(InfoExtractor):
'id': '2800002704436634',
'ext': 'mp4',
'title': 'CASIMA7.22',
'thumbnail': r're:http://.*\.jpg',
'thumbnail': r're:https?://.*\.jpg',
'uploader': 'CASIMA Official Store',
'timestamp': 1500717600,
'upload_date': '20170722',

View File

@ -0,0 +1,87 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
clean_html,
dict_get,
get_element_by_class,
int_or_none,
unified_strdate,
url_or_none,
)
class Alsace20TVBaseIE(InfoExtractor):
def _extract_video(self, video_id, url=None):
info = self._download_json(
'https://www.alsace20.tv/visionneuse/visio_v9_js.php?key=%s&habillage=0&mode=html' % (video_id, ),
video_id) or {}
title = info.get('titre')
formats = []
for res, fmt_url in (info.get('files') or {}).items():
formats.extend(
self._extract_smil_formats(fmt_url, video_id, fatal=False)
if '/smil:_' in fmt_url
else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False))
self._sort_formats(formats)
webpage = (url and self._download_webpage(url, video_id, fatal=False)) or ''
thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage))
upload_date = self._search_regex(r'/(\d{6})_', thumbnail, 'upload_date', default=None)
upload_date = unified_strdate('20%s-%s-%s' % (upload_date[:2], upload_date[2:4], upload_date[4:])) if upload_date else None
return {
'id': video_id,
'title': title,
'formats': formats,
'description': clean_html(get_element_by_class('wysiwyg', webpage)),
'upload_date': upload_date,
'thumbnail': thumbnail,
'duration': int_or_none(self._og_search_property('video:duration', webpage) if webpage else None),
'view_count': int_or_none(info.get('nb_vues')),
}
class Alsace20TVIE(Alsace20TVBaseIE):
_VALID_URL = r'https?://(?:www\.)?alsace20\.tv/(?:[\w-]+/)+[\w-]+-(?P<id>[\w]+)'
_TESTS = [{
'url': 'https://www.alsace20.tv/VOD/Actu/JT/Votre-JT-jeudi-3-fevrier-lyNHCXpYJh.html',
'info_dict': {
'id': 'lyNHCXpYJh',
'ext': 'mp4',
'description': 'md5:fc0bc4a0692d3d2dba4524053de4c7b7',
'title': 'Votre JT du jeudi 3 février',
'upload_date': '20220203',
'thumbnail': r're:https?://.+\.jpg',
'duration': 1073,
'view_count': int,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
return self._extract_video(video_id, url)
class Alsace20TVEmbedIE(Alsace20TVBaseIE):
_VALID_URL = r'https?://(?:www\.)?alsace20\.tv/emb/(?P<id>[\w]+)'
_TESTS = [{
'url': 'https://www.alsace20.tv/emb/lyNHCXpYJh',
# 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb',
'info_dict': {
'id': 'lyNHCXpYJh',
'ext': 'mp4',
'title': 'Votre JT du jeudi 3 février',
'upload_date': '20220203',
'thumbnail': r're:https?://.+\.jpg',
'view_count': int,
},
'params': {
'format': 'bestvideo',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
return self._extract_video(video_id)

View File

@ -3,7 +3,9 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
clean_podcast_url,
get_element_by_class,
int_or_none,
parse_iso8601,
try_get,
@ -14,16 +16,17 @@ class ApplePodcastsIE(InfoExtractor):
_VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
_TESTS = [{
'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
'md5': 'df02e6acb11c10e844946a39e7222b08',
'md5': '41dc31cd650143e530d9423b6b5a344f',
'info_dict': {
'id': '1000482637777',
'ext': 'mp3',
'title': '207 - Whitney Webb Returns',
'description': 'md5:13a73bade02d2e43737751e3987e1399',
'description': 'md5:75ef4316031df7b41ced4e7b987f79c6',
'upload_date': '20200705',
'timestamp': 1593921600,
'duration': 6425,
'timestamp': 1593932400,
'duration': 6454,
'series': 'The Tim Dillon Show',
'thumbnail': 're:.+[.](png|jpe?g|webp)',
}
}, {
'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
@ -39,24 +42,47 @@ class ApplePodcastsIE(InfoExtractor):
def _real_extract(self, url):
episode_id = self._match_id(url)
webpage = self._download_webpage(url, episode_id)
episode_data = {}
ember_data = {}
# new page type 2021-11
amp_data = self._parse_json(self._search_regex(
r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<',
webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {}
amp_data = try_get(amp_data,
lambda a: self._parse_json(
next(a[x] for x in iter(a) if episode_id in x),
episode_id),
dict) or {}
amp_data = amp_data.get('d') or []
episode_data = try_get(
amp_data,
lambda a: next(x for x in a
if x['type'] == 'podcast-episodes' and x['id'] == episode_id),
dict)
if not episode_data:
# try pre 2021-11 page type: TODO: consider deleting if no longer used
ember_data = self._parse_json(self._search_regex(
r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
webpage, 'ember data'), episode_id)
r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
webpage, 'ember data'), episode_id) or {}
ember_data = ember_data.get(episode_id) or ember_data
episode = ember_data['data']['attributes']
episode_data = try_get(ember_data, lambda x: x['data'], dict)
episode = episode_data['attributes']
description = episode.get('description') or {}
series = None
for inc in (ember_data.get('included') or []):
for inc in (amp_data or ember_data.get('included') or []):
if inc.get('type') == 'media/podcast':
series = try_get(inc, lambda x: x['attributes']['name'])
series = series or clean_html(get_element_by_class('podcast-header__identity', webpage))
return {
'id': episode_id,
'title': episode['name'],
'title': episode.get('name'),
'url': clean_podcast_url(episode['assetUrl']),
'description': description.get('standard') or description.get('short'),
'timestamp': parse_iso8601(episode.get('releaseDateTime')),
'duration': int_or_none(episode.get('durationInMilliseconds'), 1000),
'series': series,
'thumbnail': self._og_search_thumbnail(webpage),
'vcodec': 'none',
}

View File

@ -12,6 +12,7 @@
int_or_none,
parse_qs,
qualities,
strip_or_none,
try_get,
unified_strdate,
url_or_none,
@ -253,3 +254,44 @@ def _real_extract(self, url):
title = collection.get('title')
description = collection.get('shortDescription') or collection.get('teaserText')
return self.playlist_result(entries, playlist_id, title, description)
class ArteTVCategoryIE(ArteTVBaseIE):
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/politics-and-society/',
'info_dict': {
'id': 'politics-and-society',
'title': 'Politics and society',
'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
},
'playlist_mincount': 13,
},
]
@classmethod
def suitable(cls, url):
return (
not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
and super(ArteTVCategoryIE, cls).suitable(url))
def _real_extract(self, url):
lang, playlist_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, playlist_id)
items = []
for video in re.finditer(
r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
webpage):
video = video.group('url')
if video == url:
continue
if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
items.append(video)
title = (self._og_search_title(webpage, default=None)
or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title>', default=None))
title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url)
return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
description=self._og_search_description(webpage, default=None))

View File

@ -29,6 +29,7 @@ class AudiomackIE(InfoExtractor):
}
},
# audiomack wrapper around soundcloud song
# Needs new test URL.
{
'add_ie': ['Soundcloud'],
'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle',

View File

@ -11,6 +11,7 @@
compat_etree_Element,
compat_HTTPError,
compat_str,
compat_urllib_error,
compat_urlparse,
)
from ..utils import (
@ -38,7 +39,7 @@
class BBCCoUkIE(InfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
_ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
_ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
_VALID_URL = r'''(?x)
https?://
(?:www\.)?bbc\.co\.uk/
@ -394,9 +395,17 @@ def _process_media_selector(self, media_selection, programme_id):
formats.extend(self._extract_mpd_formats(
href, programme_id, mpd_id=format_id, fatal=False))
elif transfer_format == 'hls':
formats.extend(self._extract_m3u8_formats(
# TODO: let expected_status be passed into _extract_xxx_formats() instead
try:
fmts = self._extract_m3u8_formats(
href, programme_id, ext='mp4', entry_protocol='m3u8_native',
m3u8_id=format_id, fatal=False))
m3u8_id=format_id, fatal=False)
except ExtractorError as e:
if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError)
and e.exc_info[1].code in (403, 404)):
raise
fmts = []
formats.extend(fmts)
elif transfer_format == 'hds':
formats.extend(self._extract_f4m_formats(
href, programme_id, f4m_id=format_id, fatal=False))
@ -784,21 +793,33 @@ class BBCIE(BBCCoUkIE):
'timestamp': 1437785037,
'upload_date': '20150725',
},
}, {
# video with window.__INITIAL_DATA__ and value as JSON string
'url': 'https://www.bbc.com/news/av/world-europe-59468682',
'info_dict': {
'id': 'p0b71qth',
'ext': 'mp4',
'title': 'Why France is making this woman a national hero',
'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1638230731,
'upload_date': '20211130',
},
}, {
# single video article embedded with data-media-vpid
'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
'only_matching': True,
}, {
# bbcthreeConfig
'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
'info_dict': {
'id': 'p06556y7',
'ext': 'mp4',
'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
'title': 'Things Not To Say to people that live on council estates',
'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
'duration': 360,
'thumbnail': r're:https?://.+/.+\.jpg',
},
'params': {
'skip_download': True,
}
}, {
# window.__PRELOADED_STATE__
'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
@ -1171,9 +1192,16 @@ def _real_extract(self, url):
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
initial_data = self._parse_json(self._parse_json(self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*("{.+?}");', webpage,
'preload state', default='"{}"'), playlist_id, fatal=False), playlist_id, fatal=False)
initial_data = self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
'quoted preload state', default=None)
if initial_data is None:
initial_data = self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
'preload state', default={})
else:
initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
if initial_data:
def parse_media(media):
if not media:
@ -1214,7 +1242,10 @@ def parse_media(media):
if name == 'media-experience':
parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
elif name == 'article':
for block in (try_get(resp, lambda x: x['data']['content']['model']['blocks'], list) or []):
for block in (try_get(resp,
(lambda x: x['data']['blocks'],
lambda x: x['data']['content']['model']['blocks'],),
list) or []):
if block.get('type') != 'media':
continue
parse_media(block.get('model'))

View File

@ -34,9 +34,11 @@ def _real_extract(self, url):
'https://bigo.tv/studio/getInternalStudioInfo',
user_id, data=urlencode_postdata({'siteId': user_id}))
if not isinstance(info_raw, dict):
raise ExtractorError('Received invalid JSON data')
if info_raw.get('code'):
raise ExtractorError(
f'{info_raw["msg"]} (code {info_raw["code"]})', expected=True)
'Bigo says: %s (code %s)' % (info_raw.get('msg'), info_raw.get('code')), expected=True)
info = info_raw.get('data') or {}
if not info.get('alive'):
@ -44,7 +46,7 @@ def _real_extract(self, url):
return {
'id': info.get('roomId') or user_id,
'title': info.get('roomTopic'),
'title': info.get('roomTopic') or info.get('nick_name') or user_id,
'formats': [{
'url': info.get('hls_src'),
'ext': 'mp4',

148
yt_dlp/extractor/cpac.py Normal file
View File

@ -0,0 +1,148 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
int_or_none,
str_or_none,
try_get,
unified_timestamp,
update_url_query,
urljoin,
)
# compat_range
try:
if callable(xrange):
range = xrange
except (NameError, TypeError):
pass
class CPACIE(InfoExtractor):
IE_NAME = 'cpac'
_VALID_URL = r'https?://(?:www\.)?cpac\.ca/(?P<fr>l-)?episode\?id=(?P<id>[\da-f]{8}(?:-[\da-f]{4}){3}-[\da-f]{12})'
_TEST = {
# 'url': 'http://www.cpac.ca/en/programs/primetime-politics/episodes/65490909',
'url': 'https://www.cpac.ca/episode?id=fc7edcae-4660-47e1-ba61-5b7f29a9db0f',
'md5': 'e46ad699caafd7aa6024279f2614e8fa',
'info_dict': {
'id': 'fc7edcae-4660-47e1-ba61-5b7f29a9db0f',
'ext': 'mp4',
'upload_date': '20220215',
'title': 'News Conference to Celebrate National Kindness Week February 15, 2022',
'description': 'md5:466a206abd21f3a6f776cdef290c23fb',
'timestamp': 1644901200,
},
'params': {
'format': 'bestvideo',
'hls_prefer_native': True,
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
url_lang = 'fr' if '/l-episode?' in url else 'en'
content = self._download_json(
'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/episode/index.xml&crafterSite=cpacca&id=' + video_id,
video_id)
video_url = try_get(content, lambda x: x['page']['details']['videoUrl'], compat_str)
formats = []
if video_url:
content = content['page']
title = str_or_none(content['details']['title_%s_t' % (url_lang, )])
formats = self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', ext='mp4')
for fmt in formats:
# prefer language to match URL
fmt_lang = fmt.get('language')
if fmt_lang == url_lang:
fmt['language_preference'] = 10
elif not fmt_lang:
fmt['language_preference'] = -1
else:
fmt['language_preference'] = -10
self._sort_formats(formats)
category = str_or_none(content['details']['category_%s_t' % (url_lang, )])
def is_live(v_type):
return (v_type == 'live') if v_type is not None else None
return {
'id': video_id,
'formats': formats,
'title': title,
'description': str_or_none(content['details'].get('description_%s_t' % (url_lang, ))),
'timestamp': unified_timestamp(content['details'].get('liveDateTime')),
'category': [category] if category else None,
'thumbnail': urljoin(url, str_or_none(content['details'].get('image_%s_s' % (url_lang, )))),
'is_live': is_live(content['details'].get('type')),
}
class CPACPlaylistIE(InfoExtractor):
IE_NAME = 'cpac:playlist'
_VALID_URL = r'(?i)https?://(?:www\.)?cpac\.ca/(?:program|search|(?P<fr>emission|rechercher))\?(?:[^&]+&)*?(?P<id>(?:id=\d+|programId=\d+|key=[^&]+))'
_TESTS = [{
'url': 'https://www.cpac.ca/program?id=6',
'info_dict': {
'id': 'id=6',
'title': 'Headline Politics',
'description': 'Watch CPACs signature long-form coverage of the days pressing political events as they unfold.',
},
'playlist_count': 10,
}, {
'url': 'https://www.cpac.ca/search?key=hudson&type=all&order=desc',
'info_dict': {
'id': 'key=hudson',
'title': 'hudson',
},
'playlist_count': 22,
}, {
'url': 'https://www.cpac.ca/search?programId=50',
'info_dict': {
'id': 'programId=50',
'title': '50',
},
'playlist_count': 9,
}, {
'url': 'https://www.cpac.ca/emission?id=6',
'only_matching': True,
}, {
'url': 'https://www.cpac.ca/rechercher?key=hudson&type=all&order=desc',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
url_lang = 'fr' if any(x in url for x in ('/emission?', '/rechercher?')) else 'en'
pl_type, list_type = ('program', 'itemList') if any(x in url for x in ('/program?', '/emission?')) else ('search', 'searchResult')
api_url = (
'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/%s/index.xml&crafterSite=cpacca&%s'
% (pl_type, video_id, ))
content = self._download_json(api_url, video_id)
entries = []
total_pages = int_or_none(try_get(content, lambda x: x['page'][list_type]['totalPages']), default=1)
for page in range(1, total_pages + 1):
if page > 1:
api_url = update_url_query(api_url, {'page': '%d' % (page, ), })
content = self._download_json(
api_url, video_id,
note='Downloading continuation - %d' % (page, ),
fatal=False)
for item in try_get(content, lambda x: x['page'][list_type]['item'], list) or []:
episode_url = urljoin(url, try_get(item, lambda x: x['url_%s_s' % (url_lang, )]))
if episode_url:
entries.append(episode_url)
return self.playlist_result(
(self.url_result(entry) for entry in entries),
playlist_id=video_id,
playlist_title=try_get(content, lambda x: x['page']['program']['title_%s_t' % (url_lang, )]) or video_id.split('=')[-1],
playlist_description=try_get(content, lambda x: x['page']['program']['description_%s_t' % (url_lang, )]),
)

View File

@ -68,6 +68,10 @@
from .aol import AolIE
from .allocine import AllocineIE
from .aliexpress import AliExpressLiveIE
from .alsace20tv import (
Alsace20TVIE,
Alsace20TVEmbedIE,
)
from .apa import APAIE
from .aparat import AparatIE
from .appleconnect import AppleConnectIE
@ -91,6 +95,7 @@
ArteTVIE,
ArteTVEmbedIE,
ArteTVPlaylistIE,
ArteTVCategoryIE,
)
from .arnes import ArnesIE
from .asiancrush import (
@ -306,6 +311,10 @@
from .condenast import CondeNastIE
from .contv import CONtvIE
from .corus import CorusIE
from .cpac import (
CPACIE,
CPACPlaylistIE,
)
from .cozytv import CozyTVIE
from .cracked import CrackedIE
from .crackle import CrackleIE

View File

@ -1,11 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
parse_duration,
int_or_none,
try_get,
strip_or_none,
traverse_obj,
url_or_none,
)
@ -20,14 +23,30 @@ class NuvidIE(InfoExtractor):
'title': 'italian babe',
'duration': 321.0,
'age_limit': 18,
'thumbnail': r're:https?://.+\.jpg',
}
}, {
'url': 'https://m.nuvid.com/video/6523263',
'md5': 'ebd22ce8e47e1d9a4d0756a15c67da52',
'info_dict': {
'id': '6523263',
'ext': 'mp4',
'age_limit': 18,
'title': 'Slut brunette college student anal dorm',
'duration': 421.0,
'age_limit': 18,
'thumbnail': r're:https?://.+\.jpg',
'thumbnails': list,
}
}, {
'url': 'http://m.nuvid.com/video/6415801/',
'md5': '638d5ececb138d5753593f751ae3f697',
'info_dict': {
'id': '6415801',
'ext': 'mp4',
'title': 'My best friend wanted to fuck my wife for a long time',
'duration': 1882,
'age_limit': 18,
'thumbnail': r're:https?://.+\.jpg',
}
}]
@ -46,6 +65,16 @@ def _real_extract(self, url):
'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
})
webpage = self._download_webpage(
'http://m.nuvid.com/video/%s' % (video_id, ),
video_id, 'Downloading video page', fatal=False) or ''
title = strip_or_none(video_data.get('title') or self._html_search_regex(
(r'''<span\s[^>]*?\btitle\s*=\s*(?P<q>"|'|\b)(?P<title>[^"]+)(?P=q)\s*>''',
r'''<div\s[^>]*?\bclass\s*=\s*(?P<q>"|'|\b)thumb-holder video(?P=q)>\s*<h5\b[^>]*>(?P<title>[^<]+)</h5''',
r'''<span\s[^>]*?\bclass\s*=\s*(?P<q>"|'|\b)title_thumb(?P=q)>(?P<title>[^<]+)</span'''),
webpage, 'title', group='title'))
formats = [{
'url': source,
'format_id': qualities.get(quality),
@ -55,19 +84,19 @@ def _real_extract(self, url):
self._check_formats(formats, video_id)
self._sort_formats(formats)
title = video_data.get('title')
thumbnail_base_url = try_get(video_data, lambda x: x['thumbs']['url'])
thumbnail_extension = try_get(video_data, lambda x: x['thumbs']['extension'])
thumbnail_id = self._search_regex(
r'/media/videos/tmb/6523263/preview/(/d+)' + thumbnail_extension, video_data.get('poster', ''), 'thumbnail id', default=19)
thumbnail = f'{thumbnail_base_url}player/{thumbnail_id}{thumbnail_extension}'
duration = parse_duration(video_data.get('duration') or video_data.get('duration_format'))
duration = parse_duration(traverse_obj(video_data, 'duration', 'duration_format'))
thumbnails = [
{'url': thumb_url} for thumb_url in re.findall(
r'<div\s+class\s*=\s*"video-tmb-wrap"\s*>\s*<img\s+src\s*=\s*"([^"]+)"\s*/>', webpage)
if url_or_none(thumb_url)]
if url_or_none(video_data.get('poster')):
thumbnails.append({'url': video_data['poster'], 'preference': 1})
return {
'id': video_id,
'formats': formats,
'title': title,
'thumbnail': thumbnail,
'thumbnails': thumbnails,
'duration': duration,
'age_limit': 18,
}

View File

@ -6,7 +6,8 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none
int_or_none,
str_to_int
)
@ -179,7 +180,7 @@ def _real_extract(self, url):
'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22',
'rtmp_live': True,
'ext': 'flv',
'vbr': int(quality),
'vbr': str_to_int(quality),
'quality': preference,
}
elif transport == 'm3u8':

View File

@ -22,6 +22,20 @@ class StreamCZIE(InfoExtractor):
'title': 'Bůh',
'display_id': 'buh',
'description': 'md5:8f5f09b9b7bc67df910486cdd88f7165',
'duration': 1369.6,
'view_count': int,
}
}, {
'url': 'https://www.stream.cz/kdo-to-mluvi/kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna-64087937',
'md5': '41fd358000086a1ccdb068c77809b158',
'info_dict': {
'id': '64087937',
'ext': 'mp4',
'title': 'Kdo to mluví? Velké odhalení přináší nový pořad už od 25. srpna',
'display_id': 'kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna',
'description': 'md5:97a811000a6460266029d6c1c2ebcd59',
'duration': 50.2,
'view_count': int,
}
}, {
'url': 'https://www.stream.cz/tajemno/znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili-64147267',
@ -31,7 +45,9 @@ class StreamCZIE(InfoExtractor):
'ext': 'mp4',
'title': 'Zničehonic jim skrz střechu prolítnul záhadný předmět. Badatelé vše objasnili',
'display_id': 'znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili',
'description': 'md5:1dcb5e010eb697dedc5942f76c5b3744',
'description': 'md5:4b8ada6718d34bb011c4e04ca4bc19bf',
'duration': 442.84,
'view_count': int,
}
}]

View File

@ -1,19 +1,15 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .jwplatform import JWPlatformIE
from .nexx import NexxIE
from .dplay import DPlayIE
from ..compat import compat_urlparse
from ..utils import (
NO_DEFAULT,
parse_qs,
smuggle_url,
ExtractorError,
extract_attributes,
)
class Tele5IE(InfoExtractor):
class Tele5IE(DPlayIE):
_VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_GEO_COUNTRIES = ['DE']
_TESTS = [{
@ -28,6 +24,7 @@ class Tele5IE(InfoExtractor):
'params': {
'skip_download': True,
},
'skip': 'No longer available: "404 Seite nicht gefunden"',
}, {
# jwplatform, nexx unavailable
'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/',
@ -42,7 +39,20 @@ class Tele5IE(InfoExtractor):
'params': {
'skip_download': True,
},
'add_ie': [JWPlatformIE.ie_key()],
'skip': 'No longer available, redirects to Filme page',
}, {
'url': 'https://tele5.de/mediathek/angel-of-mine/',
'info_dict': {
'id': '1252360',
'ext': 'mp4',
'upload_date': '20220109',
'timestamp': 1641762000,
'title': 'Angel of Mine',
'description': 'md5:a72546a175e1286eb3251843a52d1ad7',
},
'params': {
'format': 'bestvideo',
},
}, {
'url': 'https://www.tele5.de/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191',
'only_matching': True,
@ -64,45 +74,18 @@ class Tele5IE(InfoExtractor):
}]
def _real_extract(self, url):
qs = parse_qs(url)
video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0]
NEXX_ID_RE = r'\d{6,}'
JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}'
def nexx_result(nexx_id):
return self.url_result(
'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id,
ie=NexxIE.ie_key(), video_id=nexx_id)
nexx_id = jwplatform_id = None
if video_id:
if re.match(NEXX_ID_RE, video_id):
return nexx_result(video_id)
elif re.match(JWPLATFORM_ID_RE, video_id):
jwplatform_id = video_id
if not nexx_id:
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
def extract_id(pattern, name, default=NO_DEFAULT):
return self._html_search_regex(
(r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern,
r'\s+id\s*=\s*["\']player_(%s)' % pattern,
r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name,
default=default)
nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None)
if nexx_id:
return nexx_result(nexx_id)
if not jwplatform_id:
jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id')
return self.url_result(
smuggle_url(
'jwplatform:%s' % jwplatform_id,
{'geo_countries': self._GEO_COUNTRIES}),
ie=JWPlatformIE.ie_key(), video_id=jwplatform_id)
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
player_element = self._search_regex(r'(<hyoga-player\b[^>]+?>)', webpage, 'video player')
player_info = extract_attributes(player_element)
asset_id, country, realm = (player_info[x] for x in ('assetid', 'locale', 'realm', ))
endpoint = compat_urlparse.urlparse(player_info['endpoint']).hostname
source_type = player_info.get('sourcetype')
if source_type:
endpoint = '%s-%s' % (source_type, endpoint)
try:
return self._get_disco_api_info(url, asset_id, endpoint, realm, country)
except ExtractorError as e:
if getattr(e, 'message', '') == 'Missing deviceId in context':
self.report_drm(video_id)
raise

View File

@ -41,8 +41,16 @@ class TV2DKIE(InfoExtractor):
'duration': 1347,
'view_count': int,
},
'params': {
'skip_download': True,
'add_ie': ['Kaltura'],
}, {
'url': 'https://www.tv2lorry.dk/gadekamp/gadekamp-6-hoejhuse-i-koebenhavn',
'info_dict': {
'id': '1_7iwll9n0',
'ext': 'mp4',
'upload_date': '20211027',
'title': 'Gadekamp #6 - Højhuse i København',
'uploader_id': 'tv2lorry',
'timestamp': 1635345229,
},
'add_ie': ['Kaltura'],
}, {
@ -91,11 +99,14 @@ def add_entry(partner_id, kaltura_id):
add_entry(partner_id, kaltura_id)
if not entries:
kaltura_id = self._search_regex(
r'entry_id\s*:\s*["\']([0-9a-z_]+)', webpage, 'kaltura id')
(r'entry_id\s*:\s*["\']([0-9a-z_]+)',
r'\\u002FentryId\\u002F(\w+)\\u002F'), webpage, 'kaltura id')
partner_id = self._search_regex(
(r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage,
'partner id')
add_entry(partner_id, kaltura_id)
if len(entries) == 1:
return entries[0]
return self.playlist_result(entries)

View File

@ -95,7 +95,6 @@ def _real_extract(self, url):
if v:
query[k] = v
f_url = update_url_query(f_url, query)
format_id = format_id
if format_id == 'HLS':
m3u8_formats = self._extract_m3u8_formats(
f_url, media_id, 'mp4', 'm3u8_native',

View File

@ -4,7 +4,11 @@
from .common import InfoExtractor
from ..utils import (
dict_get,
ExtractorError,
int_or_none,
ISO639Utils,
parse_age_limit,
try_get,
unified_timestamp,
)
@ -23,9 +27,10 @@ class URPlayIE(InfoExtractor):
'upload_date': '20171214',
'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik',
'duration': 2269,
'categories': ['Kultur & historia'],
'categories': ['Vetenskap & teknik'],
'tags': ['Kritiskt tänkande', 'Vetenskap', 'Vetenskaplig verksamhet'],
'episode': 'Om vetenskap, kritiskt tänkande och motstånd',
'age_limit': 15,
},
}, {
'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde',
@ -50,11 +55,16 @@ def _real_extract(self, url):
video_id = self._match_id(url)
url = url.replace('skola.se/Produkter', 'play.se/program')
webpage = self._download_webpage(url, video_id)
vid = int(video_id)
urplayer_data = self._search_nextjs_data(webpage, video_id, fatal=False) or {}
if urplayer_data:
urplayer_data = try_get(urplayer_data, lambda x: x['props']['pageProps']['program'], dict)
if not urplayer_data:
raise ExtractorError('Unable to parse __NEXT_DATA__')
else:
accessible_episodes = self._parse_json(self._html_search_regex(
r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"',
webpage, 'urplayer data'), video_id)['accessibleEpisodes']
urplayer_data = next(e for e in accessible_episodes if e.get('id') == vid)
urplayer_data = next(e for e in accessible_episodes if e.get('id') == int_or_none(video_id))
episode = urplayer_data['title']
host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']
@ -72,11 +82,28 @@ def _real_extract(self, url):
self._sort_formats(formats)
subtitles = {}
subs = urplayer_streams.get("sweComplete", {}).get("tt", {}).get("location")
if subs:
subtitles.setdefault('Svenska', []).append({
'url': subs,
})
def parse_lang_code(code):
"3-character language code or None (utils candidate)"
if code is None:
return
lang = code.lower()
if not ISO639Utils.long2short(lang):
lang = ISO639Utils.short2long(lang)
return lang or None
for k, v in (urplayer_data['streamingInfo'].get('sweComplete') or {}).items():
if (k in ('sd', 'hd') or not isinstance(v, dict)):
continue
lang, sttl_url = (v.get(kk) for kk in ('language', 'location', ))
if not sttl_url:
continue
lang = parse_lang_code(lang)
if not lang:
continue
sttl = subtitles.get(lang) or []
sttl.append({'ext': k, 'url': sttl_url, })
subtitles[lang] = sttl
image = urplayer_data.get('image') or {}
thumbnails = []
@ -98,7 +125,6 @@ def _real_extract(self, url):
return {
'id': video_id,
'subtitles': subtitles,
'title': '%s : %s' % (series_title, episode) if series_title else episode,
'description': urplayer_data.get('description'),
'thumbnails': thumbnails,
@ -111,4 +137,7 @@ def _real_extract(self, url):
'season': series.get('label'),
'episode': episode,
'episode_number': int_or_none(urplayer_data.get('episodeNumber')),
'age_limit': parse_age_limit(min(try_get(a, lambda x: x['from'], int) or 0
for a in urplayer_data.get('ageRanges', []))),
'subtitles': subtitles,
}

View File

@ -111,7 +111,6 @@ def rc4(cipher_text, key):
def _real_extract(self, url):
video_id = self._match_id(url)
video_page = self._download_webpage(url, video_id)
if 'videa.hu/player' in url:
@ -146,7 +145,7 @@ def _real_extract(self, url):
compat_b64decode(b64_info), key), video_id)
video = xpath_element(info, './video', 'video')
if not video:
if video is None:
raise ExtractorError(xpath_element(
info, './error', fatal=True), expected=True)
sources = xpath_element(
@ -163,9 +162,9 @@ def _real_extract(self, url):
source_exp = source.get('exp')
if not (source_url and source_name):
continue
hash_value = None
if hash_values:
hash_value = xpath_text(hash_values, 'hash_value_' + source_name)
hash_value = (
xpath_text(hash_values, 'hash_value_' + source_name)
if hash_values is not None else None)
if hash_value and source_exp:
source_url = update_url_query(source_url, {
'md5': hash_value,

View File

@ -636,6 +636,24 @@ class VimeoIE(VimeoBaseInfoExtractor):
'url': 'https://vimeo.com/392479337/a52724358e',
'only_matching': True,
},
{
# similar, but all numeric: ID must be 581039021, not 9603038895
# issue #29690
'url': 'https://vimeo.com/581039021/9603038895',
'info_dict': {
'id': '581039021',
# these have to be provided but we don't care
'ext': 'mp4',
'timestamp': 1627621014,
'title': 're:.+',
'uploader_id': 're:.+',
'uploader': 're:.+',
'upload_date': r're:\d+',
},
'params': {
'skip_download': True,
},
}
# https://gettingthingsdone.com/workflowmap/
# vimeo embed with check-password page protected by Referer header
]

View File

@ -10,6 +10,7 @@
)
from ..utils import (
determine_ext,
dict_get,
ExtractorError,
js_to_json,
strip_jsonp,
@ -22,13 +23,14 @@
class WDRIE(InfoExtractor):
__API_URL_TPL = '//deviceids-medp.wdr.de/ondemand/%s/%s'
_VALID_URL = r'''(?x)https?://
(?:deviceids-medp\.wdr\.de/ondemand/\d+/|
kinder\.wdr\.de/(?!mediathek/)[^#?]+-)
(?P<id>\d+)\.(?:js|assetjsonp)
'''
_GEO_COUNTRIES = ['DE']
_TEST = {
_TESTS = [{
'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js',
'info_dict': {
'id': 'mdb-1557833',
@ -36,11 +38,19 @@ class WDRIE(InfoExtractor):
'title': 'Biathlon-Staffel verpasst Podest bei Olympia-Generalprobe',
'upload_date': '20180112',
},
}
}]
def _asset_url(self, wdr_id):
id_len = max(len(wdr_id), 5)
return ''.join(('https:', self.__API_URL_TPL % (wdr_id[:id_len - 4], wdr_id, ), '.js'))
def _real_extract(self, url):
video_id = self._match_id(url)
if url.startswith('wdr:'):
video_id = url[4:]
url = self._asset_url(video_id)
metadata = self._download_json(
url, video_id, transform_source=strip_jsonp)
@ -126,10 +136,10 @@ def _real_extract(self, url):
}
class WDRPageIE(InfoExtractor):
_CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5'
class WDRPageIE(WDRIE):
_MAUS_REGEX = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/)*?(?P<maus_id>[^/?#.]+)(?:/?|/index\.php5|\.php5)$'
_PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html'
_VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
_VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _MAUS_REGEX
_TESTS = [
{
@ -170,11 +180,11 @@ class WDRPageIE(InfoExtractor):
{
'url': 'http://www1.wdr.de/mediathek/video/live/index.html',
'info_dict': {
'id': 'mdb-1406149',
'id': 'mdb-2296252',
'ext': 'mp4',
'title': r're:^WDR Fernsehen im Livestream \(nur in Deutschland erreichbar\) [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'title': r're:^WDR Fernsehen im Livestream (?:\(nur in Deutschland erreichbar\) )?[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'alt_title': 'WDR Fernsehen Live',
'upload_date': '20150101',
'upload_date': '20201112',
'is_live': True,
},
'params': {
@ -183,7 +193,7 @@ class WDRPageIE(InfoExtractor):
},
{
'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html',
'playlist_mincount': 7,
'playlist_mincount': 6,
'info_dict': {
'id': 'aktuelle-stunde-120',
},
@ -191,10 +201,10 @@ class WDRPageIE(InfoExtractor):
{
'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
'info_dict': {
'id': 'mdb-1552552',
'id': 'mdb-2627637',
'ext': 'mp4',
'upload_date': 're:^[0-9]{8}$',
'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$',
'title': 're:^Die Sendung (?:mit der Maus )?vom [0-9.]{10}$',
},
'skip': 'The id changes from week to week because of the new episode'
},
@ -207,6 +217,7 @@ class WDRPageIE(InfoExtractor):
'upload_date': '20130919',
'title': 'Sachgeschichte - Achterbahn ',
},
'skip': 'HTTP Error 404: Not Found',
},
{
'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html',
@ -232,6 +243,7 @@ class WDRPageIE(InfoExtractor):
'params': {
'skip_download': True,
},
'skip': 'HTTP Error 404: Not Found',
},
{
'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html',
@ -245,7 +257,7 @@ class WDRPageIE(InfoExtractor):
def _real_extract(self, url):
mobj = self._match_valid_url(url)
display_id = mobj.group('display_id')
display_id = dict_get(mobj.groupdict(), ('display_id', 'maus_id'), 'wdrmaus')
webpage = self._download_webpage(url, display_id)
entries = []
@ -271,6 +283,14 @@ def _real_extract(self, url):
jsonp_url = try_get(
media_link_obj, lambda x: x['mediaObj']['url'], compat_str)
if jsonp_url:
# metadata, or player JS with ['ref'] giving WDR id, or just media, perhaps
clip_id = media_link_obj['mediaObj'].get('ref')
if jsonp_url.endswith('.assetjsonp'):
asset = self._download_json(
jsonp_url, display_id, fatal=False, transform_source=strip_jsonp)
clip_id = try_get(asset, lambda x: x['trackerData']['trackerClipId'], compat_str)
if clip_id:
jsonp_url = self._asset_url(clip_id[4:])
entries.append(self.url_result(jsonp_url, ie=WDRIE.ie_key()))
# Playlist (e.g. https://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html)
@ -290,16 +310,14 @@ def _real_extract(self, url):
class WDRElefantIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)wdrmaus\.de/elefantenseite/#(?P<id>.+)'
_TEST = {
'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015',
'url': 'http://www.wdrmaus.de/elefantenseite/#elefantenkino_wippe',
# adaptive stream: unstable file MD5
'info_dict': {
'title': 'Folge Oster-Spezial 2015',
'id': 'mdb-1088195',
'title': 'Wippe',
'id': 'mdb-1198320',
'ext': 'mp4',
'age_limit': None,
'upload_date': '20150406'
},
'params': {
'skip_download': True,
'upload_date': '20071003'
},
}
@ -334,6 +352,7 @@ class WDRMobileIE(InfoExtractor):
/[0-9]+/[0-9]+/
(?P<id>[0-9]+)_(?P<title>[0-9]+)'''
IE_NAME = 'wdr:mobile'
_WORKING = False # no such domain
_TEST = {
'url': 'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4',
'info_dict': {

View File

@ -136,6 +136,34 @@ def _extract_player(self, webpage, video_id, fatal=True):
class ZDFIE(ZDFBaseIE):
_VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html'
_TESTS = [{
# Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html
'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html',
'md5': '34ec321e7eb34231fd88616c65c92db0',
'info_dict': {
'id': '210222_phx_nachgehakt_corona_protest',
'ext': 'mp4',
'title': 'Wohin führt der Protest in der Pandemie?',
'description': 'md5:7d643fe7f565e53a24aac036b2122fbd',
'duration': 1691,
'timestamp': 1613948400,
'upload_date': '20210221',
},
'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"',
}, {
# Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html
'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html',
'md5': '0aff3e7bc72c8813f5e0fae333316a1d',
'info_dict': {
'id': '141007_ab18_10wochensommer_film',
'ext': 'mp4',
'title': 'Ab 18! - 10 Wochen Sommer',
'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26',
'duration': 2660,
'timestamp': 1608604200,
'upload_date': '20201222',
},
'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"',
}, {
'url': 'https://www.zdf.de/nachrichten/heute-journal/heute-journal-vom-30-12-2021-100.html',
'info_dict': {
'id': '211230_sendung_hjo',
@ -195,13 +223,16 @@ class ZDFIE(ZDFBaseIE):
'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html',
'only_matching': True,
}, {
# Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html
'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html',
'only_matching': True
}, {
# Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html
'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html',
'only_matching': True
'url': 'https://www.zdf.de/arte/todliche-flucht/page-video-artede-toedliche-flucht-16-100.html',
'info_dict': {
'id': 'video_artede_083871-001-A',
'ext': 'mp4',
'title': 'Tödliche Flucht (1/6)',
'description': 'md5:e34f96a9a5f8abd839ccfcebad3d5315',
'duration': 3193.0,
'timestamp': 1641355200,
'upload_date': '20220105',
},
}]
def _extract_entry(self, url, player, content, video_id):