[rcs] Improved extraction (See desc) (#170)

* improved `video_data` extraction
* added an extra fallback value for `description`
* improved regex in `RCSVariousIE`

Authored by: nixxo
This commit is contained in:
nixxo 2021-03-15 19:56:29 +01:00 committed by GitHub
parent e4beae703d
commit 18c1f04362
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -15,6 +15,9 @@
class RCSBaseIE(InfoExtractor): class RCSBaseIE(InfoExtractor):
# based on VideoPlayerLoader.prototype.getVideoSrc
# and VideoPlayerLoader.prototype.transformSrc from
# https://js2.corriereobjects.it/includes2013/LIBS/js/corriere_video.sjs
_ALL_REPLACE = { _ALL_REPLACE = {
'media2vam.corriere.it.edgesuite.net': 'media2vam.corriere.it.edgesuite.net':
'media2vam-corriere-it.akamaized.net', 'media2vam-corriere-it.akamaized.net',
@ -191,10 +194,10 @@ def _create_formats(self, urls, video_id):
urls.get('m3u8'), video_id, 'mp4', entry_protocol='m3u8_native', urls.get('m3u8'), video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False) m3u8_id='hls', fatal=False)
if not formats: if urls.get('mp4'):
formats.append({ formats.append({
'format_id': 'http-mp4', 'format_id': 'http-mp4',
'url': urls.get('mp4') 'url': urls['mp4']
}) })
self._sort_formats(formats) self._sort_formats(formats)
return formats return formats
@ -216,10 +219,12 @@ def _real_extract(self, url):
video_data = None video_data = None
# look for json video data url # look for json video data url
json = self._search_regex( json = self._search_regex(
r'''(?x)var url\s*=\s*["']((?:https?:)? r'''(?x)url\s*=\s*(["'])
//video\.rcs\.it (?P<url>
/fragment-includes/video-includes/.+?\.json)["'];''', (?:https?:)?//video\.rcs\.it
page, video_id, default=None) /fragment-includes/video-includes/.+?\.json
)\1;''',
page, video_id, group='url', default=None)
if json: if json:
if json.startswith('//'): if json.startswith('//'):
json = 'https:%s' % json json = 'https:%s' % json
@ -227,13 +232,16 @@ def _real_extract(self, url):
# if json url not found, look for json video data directly in the page # if json url not found, look for json video data directly in the page
else: else:
# RCS normal pages and most of the embeds
json = self._search_regex( json = self._search_regex(
r'[\s;]video\s*=\s*({[\s\S]+?})(?:;|,playlist=)', r'[\s;]video\s*=\s*({[\s\S]+?})(?:;|,playlist=)',
page, video_id, default=None) page, video_id, default=None)
if json: if not json and 'video-embed' in url:
video_data = self._parse_json( page = self._download_webpage(url.replace('video-embed', 'video-json'), video_id)
json, video_id, transform_source=js_to_json) json = self._search_regex(
else: r'##start-video##({[\s\S]+?})##end-video##',
page, video_id, default=None)
if not json:
# if no video data found try search for iframes # if no video data found try search for iframes
emb = RCSEmbedsIE._extract_url(page) emb = RCSEmbedsIE._extract_url(page)
if emb: if emb:
@ -242,6 +250,9 @@ def _real_extract(self, url):
'url': emb, 'url': emb,
'ie_key': RCSEmbedsIE.ie_key() 'ie_key': RCSEmbedsIE.ie_key()
} }
if json:
video_data = self._parse_json(
json, video_id, transform_source=js_to_json)
if not video_data: if not video_data:
raise ExtractorError('Video data not found in the page') raise ExtractorError('Video data not found in the page')
@ -250,7 +261,8 @@ def _real_extract(self, url):
self._get_video_src(video_data), video_id) self._get_video_src(video_data), video_id)
description = (video_data.get('description') description = (video_data.get('description')
or clean_html(video_data.get('htmlDescription'))) or clean_html(video_data.get('htmlDescription'))
or self._html_search_meta('description', page))
uploader = video_data.get('provider') or mobj.group('cdn') uploader = video_data.get('provider') or mobj.group('cdn')
return { return {
@ -283,6 +295,7 @@ class RCSEmbedsIE(RCSBaseIE):
'uploader': 'rcs.it', 'uploader': 'rcs.it',
} }
}, { }, {
# re-download the page, replacing 'video-embed' with 'video-json' in the URL
'url': 'https://video.gazzanet.gazzetta.it/video-embed/gazzanet-mo05-0000260789', 'url': 'https://video.gazzanet.gazzetta.it/video-embed/gazzanet-mo05-0000260789',
'md5': 'a043e3fecbe4d9ed7fc5d888652a5440', 'md5': 'a043e3fecbe4d9ed7fc5d888652a5440',
'info_dict': { 'info_dict': {
@ -359,6 +372,7 @@ class RCSIE(RCSBaseIE):
'uploader': 'Corriere Tv', 'uploader': 'Corriere Tv',
} }
}, { }, {
# video data inside iframe
'url': 'https://viaggi.corriere.it/video/norvegia-il-nuovo-ponte-spettacolare-sopra-la-cascata-di-voringsfossen/', 'url': 'https://viaggi.corriere.it/video/norvegia-il-nuovo-ponte-spettacolare-sopra-la-cascata-di-voringsfossen/',
'md5': 'da378e4918d2afbf7d61c35abb948d4c', 'md5': 'da378e4918d2afbf7d61c35abb948d4c',
'info_dict': { 'info_dict': {
@ -389,15 +403,15 @@ class RCSVariousIE(RCSBaseIE):
(?P<cdn> (?P<cdn>
leitv\.it| leitv\.it|
youreporter\.it youreporter\.it
)/(?:video/)?(?P<id>[^/]+?)(?:$|\?|/)''' )/(?:[^/]+/)?(?P<id>[^/]+?)(?:$|\?|/)'''
_TESTS = [{ _TESTS = [{
'url': 'https://www.leitv.it/video/marmellata-di-ciliegie-fatta-in-casa/', 'url': 'https://www.leitv.it/benessere/mal-di-testa-come-combatterlo-ed-evitarne-la-comparsa/',
'md5': '618aaabac32152199c1af86784d4d554', 'md5': '92b4e63667b8f95acb0a04da25ae28a1',
'info_dict': { 'info_dict': {
'id': 'marmellata-di-ciliegie-fatta-in-casa', 'id': 'mal-di-testa-come-combatterlo-ed-evitarne-la-comparsa',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Marmellata di ciliegie fatta in casa', 'title': 'Cervicalgia e mal di testa, il video con i suggerimenti dell\'esperto',
'description': 'md5:89133864d6aad456dbcf6e7a29f86263', 'description': 'md5:ae21418f34cee0b8d02a487f55bcabb5',
'uploader': 'leitv.it', 'uploader': 'leitv.it',
} }
}, { }, {