mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-03 08:19:57 +01:00
[web.archive:youtube] Add `ytarchive:` prefix and misc cleanup
This commit is contained in:
parent
e612f66c7c
commit
1f13021eca
@ -19,6 +19,7 @@
|
|||||||
get_element_by_id,
|
get_element_by_id,
|
||||||
HEADRequest,
|
HEADRequest,
|
||||||
int_or_none,
|
int_or_none,
|
||||||
|
join_nonempty,
|
||||||
KNOWN_EXTENSIONS,
|
KNOWN_EXTENSIONS,
|
||||||
merge_dicts,
|
merge_dicts,
|
||||||
mimetype2ext,
|
mimetype2ext,
|
||||||
@ -64,7 +65,7 @@ class ArchiveOrgIE(InfoExtractor):
|
|||||||
'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c',
|
'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c',
|
||||||
'uploader': 'yorkmba99@hotmail.com',
|
'uploader': 'yorkmba99@hotmail.com',
|
||||||
'timestamp': 1387699629,
|
'timestamp': 1387699629,
|
||||||
'upload_date': "20131222",
|
'upload_date': '20131222',
|
||||||
},
|
},
|
||||||
}, {
|
}, {
|
||||||
'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
|
'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
|
||||||
@ -150,8 +151,7 @@ def _real_extract(self, url):
|
|||||||
|
|
||||||
# Archive.org metadata API doesn't clearly demarcate playlist entries
|
# Archive.org metadata API doesn't clearly demarcate playlist entries
|
||||||
# or subtitle tracks, so we get them from the embeddable player.
|
# or subtitle tracks, so we get them from the embeddable player.
|
||||||
embed_page = self._download_webpage(
|
embed_page = self._download_webpage(f'https://archive.org/embed/{identifier}', identifier)
|
||||||
'https://archive.org/embed/' + identifier, identifier)
|
|
||||||
playlist = self._playlist_data(embed_page)
|
playlist = self._playlist_data(embed_page)
|
||||||
|
|
||||||
entries = {}
|
entries = {}
|
||||||
@ -166,17 +166,17 @@ def _real_extract(self, url):
|
|||||||
'thumbnails': [],
|
'thumbnails': [],
|
||||||
'artist': p.get('artist'),
|
'artist': p.get('artist'),
|
||||||
'track': p.get('title'),
|
'track': p.get('title'),
|
||||||
'subtitles': {}}
|
'subtitles': {},
|
||||||
|
}
|
||||||
|
|
||||||
for track in p.get('tracks', []):
|
for track in p.get('tracks', []):
|
||||||
if track['kind'] != 'subtitles':
|
if track['kind'] != 'subtitles':
|
||||||
continue
|
continue
|
||||||
|
|
||||||
entries[p['orig']][track['label']] = {
|
entries[p['orig']][track['label']] = {
|
||||||
'url': 'https://archive.org/' + track['file'].lstrip('/')}
|
'url': 'https://archive.org/' + track['file'].lstrip('/')
|
||||||
|
}
|
||||||
|
|
||||||
metadata = self._download_json(
|
metadata = self._download_json('http://archive.org/metadata/' + identifier, identifier)
|
||||||
'http://archive.org/metadata/' + identifier, identifier)
|
|
||||||
m = metadata['metadata']
|
m = metadata['metadata']
|
||||||
identifier = m['identifier']
|
identifier = m['identifier']
|
||||||
|
|
||||||
@ -189,7 +189,7 @@ def _real_extract(self, url):
|
|||||||
'license': m.get('licenseurl'),
|
'license': m.get('licenseurl'),
|
||||||
'release_date': unified_strdate(m.get('date')),
|
'release_date': unified_strdate(m.get('date')),
|
||||||
'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
|
'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
|
||||||
'webpage_url': 'https://archive.org/details/' + identifier,
|
'webpage_url': f'https://archive.org/details/{identifier}',
|
||||||
'location': m.get('venue'),
|
'location': m.get('venue'),
|
||||||
'release_year': int_or_none(m.get('year'))}
|
'release_year': int_or_none(m.get('year'))}
|
||||||
|
|
||||||
@ -230,13 +230,12 @@ def _real_extract(self, url):
|
|||||||
'filesize': int_or_none(f.get('size')),
|
'filesize': int_or_none(f.get('size')),
|
||||||
'protocol': 'https'})
|
'protocol': 'https'})
|
||||||
|
|
||||||
# Sort available formats by filesize
|
|
||||||
for entry in entries.values():
|
for entry in entries.values():
|
||||||
entry['formats'] = list(sorted(entry['formats'], key=lambda x: x.get('filesize', -1)))
|
self._sort_formats(entry['formats'])
|
||||||
|
|
||||||
if len(entries) == 1:
|
if len(entries) == 1:
|
||||||
# If there's only one item, use it as the main info dict
|
# If there's only one item, use it as the main info dict
|
||||||
only_video = entries[list(entries.keys())[0]]
|
only_video = next(iter(entries.values()))
|
||||||
if entry_id:
|
if entry_id:
|
||||||
info = merge_dicts(only_video, info)
|
info = merge_dicts(only_video, info)
|
||||||
else:
|
else:
|
||||||
@ -261,19 +260,19 @@ def _real_extract(self, url):
|
|||||||
|
|
||||||
class YoutubeWebArchiveIE(InfoExtractor):
|
class YoutubeWebArchiveIE(InfoExtractor):
|
||||||
IE_NAME = 'web.archive:youtube'
|
IE_NAME = 'web.archive:youtube'
|
||||||
IE_DESC = 'web.archive.org saved youtube videos'
|
IE_DESC = 'web.archive.org saved youtube videos, "ytarchive:" prefix'
|
||||||
_VALID_URL = r"""(?x)^
|
_VALID_URL = r'''(?x)(?:(?P<prefix>ytarchive:)|
|
||||||
(?:https?://)?web\.archive\.org/
|
(?:https?://)?web\.archive\.org/
|
||||||
(?:web/)?
|
(?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional
|
||||||
(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional
|
(?:https?(?::|%3[Aa])//)?(?:
|
||||||
|
(?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL
|
||||||
(?:https?(?::|%3[Aa])//)?
|
|(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
|
||||||
(?:
|
)
|
||||||
(?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL
|
)(?P<id>[0-9A-Za-z_-]{11})
|
||||||
|(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
|
(?(prefix)
|
||||||
)
|
(?::(?P<date2>[0-9]{14}))?$|
|
||||||
(?P<id>[0-9A-Za-z_-]{11})(?:%26|\#|&|$)
|
(?:%26|[#&]|$)
|
||||||
"""
|
)'''
|
||||||
|
|
||||||
_TESTS = [
|
_TESTS = [
|
||||||
{
|
{
|
||||||
@ -438,7 +437,13 @@ class YoutubeWebArchiveIE(InfoExtractor):
|
|||||||
}, {
|
}, {
|
||||||
'url': 'https://web.archive.org/http://www.youtube.com:80/watch?v=-05VVye-ffg',
|
'url': 'https://web.archive.org/http://www.youtube.com:80/watch?v=-05VVye-ffg',
|
||||||
'only_matching': True
|
'only_matching': True
|
||||||
}
|
}, {
|
||||||
|
'url': 'ytarchive:BaW_jenozKc:20050214000000',
|
||||||
|
'only_matching': True
|
||||||
|
},{
|
||||||
|
'url': 'ytarchive:BaW_jenozKc',
|
||||||
|
'only_matching': True
|
||||||
|
},
|
||||||
]
|
]
|
||||||
_YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
|
_YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
|
||||||
_YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE
|
_YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE
|
||||||
@ -484,7 +489,6 @@ def _extract_webpage_title(self, webpage):
|
|||||||
page_title, 'title', default='')
|
page_title, 'title', default='')
|
||||||
|
|
||||||
def _extract_metadata(self, video_id, webpage):
|
def _extract_metadata(self, video_id, webpage):
|
||||||
|
|
||||||
search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None))
|
search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None))
|
||||||
player_response = self._extract_yt_initial_variable(
|
player_response = self._extract_yt_initial_variable(
|
||||||
webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {}
|
webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {}
|
||||||
@ -596,7 +600,7 @@ def _get_capture_dates(self, video_id, url_date):
|
|||||||
|
|
||||||
# Prefer the new polymer UI captures as we support extracting more metadata from them
|
# Prefer the new polymer UI captures as we support extracting more metadata from them
|
||||||
# WBM captures seem to all switch to this layout ~July 2020
|
# WBM captures seem to all switch to this layout ~July 2020
|
||||||
modern_captures = list(filter(lambda x: x >= 20200701000000, all_captures))
|
modern_captures = [x for x in all_captures if x >= 20200701000000]
|
||||||
if modern_captures:
|
if modern_captures:
|
||||||
capture_dates.append(modern_captures[0])
|
capture_dates.append(modern_captures[0])
|
||||||
capture_dates.append(url_date)
|
capture_dates.append(url_date)
|
||||||
@ -608,11 +612,11 @@ def _get_capture_dates(self, video_id, url_date):
|
|||||||
|
|
||||||
# Fallbacks if any of the above fail
|
# Fallbacks if any of the above fail
|
||||||
capture_dates.extend([self._OLDEST_CAPTURE_DATE, self._NEWEST_CAPTURE_DATE])
|
capture_dates.extend([self._OLDEST_CAPTURE_DATE, self._NEWEST_CAPTURE_DATE])
|
||||||
return orderedSet(capture_dates)
|
return orderedSet(filter(None, capture_dates))
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
|
video_id, url_date, url_date_2 = self._match_valid_url(url).group('id', 'date', 'date2')
|
||||||
url_date, video_id = self._match_valid_url(url).groups()
|
url_date = url_date or url_date_2
|
||||||
|
|
||||||
urlh = None
|
urlh = None
|
||||||
try:
|
try:
|
||||||
@ -629,11 +633,9 @@ def _real_extract(self, url):
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
capture_dates = self._get_capture_dates(video_id, int_or_none(url_date))
|
capture_dates = self._get_capture_dates(video_id, int_or_none(url_date))
|
||||||
self.write_debug('Captures to try: ' + ', '.join(str(i) for i in capture_dates if i is not None))
|
self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', '))
|
||||||
info = {'id': video_id}
|
info = {'id': video_id}
|
||||||
for capture in capture_dates:
|
for capture in capture_dates:
|
||||||
if not capture:
|
|
||||||
continue
|
|
||||||
webpage = self._download_webpage(
|
webpage = self._download_webpage(
|
||||||
(self._WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id),
|
(self._WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id),
|
||||||
video_id=video_id, fatal=False, errnote='unable to download capture webpage (it may not be archived)',
|
video_id=video_id, fatal=False, errnote='unable to download capture webpage (it may not be archived)',
|
||||||
@ -648,7 +650,7 @@ def _real_extract(self, url):
|
|||||||
info['thumbnails'] = self._extract_thumbnails(video_id)
|
info['thumbnails'] = self._extract_thumbnails(video_id)
|
||||||
|
|
||||||
if urlh:
|
if urlh:
|
||||||
url = compat_urllib_parse_unquote(urlh.url)
|
url = compat_urllib_parse_unquote(urlh.geturl())
|
||||||
video_file_url_qs = parse_qs(url)
|
video_file_url_qs = parse_qs(url)
|
||||||
# Attempt to recover any ext & format info from playback url & response headers
|
# Attempt to recover any ext & format info from playback url & response headers
|
||||||
format = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))}
|
format = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))}
|
||||||
|
Loading…
Reference in New Issue
Block a user