[extractor/wikimedia] Add extractor (#4314)

Based on https://github.com/ytdl-org/youtube-dl/pull/30796
Authored by: EhtishamSabir, pukkandan
This commit is contained in:
Ehtisham Sabir 2022-07-18 01:22:24 +05:00 committed by GitHub
parent 306770819e
commit 2e2c60c4ba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 56 additions and 0 deletions

View File

@ -2088,6 +2088,7 @@
WeiboMobileIE WeiboMobileIE
) )
from .weiqitv import WeiqiTVIE from .weiqitv import WeiqiTVIE
from .wikimedia import WikimediaIE
from .willow import WillowIE from .willow import WillowIE
from .wimtv import WimTVIE from .wimtv import WimTVIE
from .whowatch import WhoWatchIE from .whowatch import WhoWatchIE

View File

@ -0,0 +1,55 @@
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
get_element_by_class,
parse_qs,
remove_start,
unescapeHTML,
urljoin,
)
class WikimediaIE(InfoExtractor):
    """Extractor for media on Wikimedia Commons ``File:`` pages."""

    IE_NAME = 'wikimedia.org'
    # The captured id is the file name without its trailing ".<ext>" part.
    _VALID_URL = r'https?://commons\.wikimedia\.org/wiki/File:(?P<id>[^/#?]+)\.\w+'
    _TESTS = [{
        'url': 'https://commons.wikimedia.org/wiki/File:Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm',
        'info_dict': {
            'url': 're:https?://upload.wikimedia.org/wikipedia',
            'ext': 'webm',
            'id': 'Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS',
            'title': 'Die Temperaturkurve der Erde (ZDF, Terra X) 720p HD 50FPS.webm - Wikimedia Commons',
            'description': 'md5:7cd84f76e7081f1be033d0b155b4a460',
            'license': 'Creative Commons Attribution 4.0 International',
            'uploader': 'ZDF/Terra X/Gruppe 5/Luise Wagner, Jonas Sichert, Andreas Hougardy',
            'subtitles': 'count:4'
        }
    }]

    def _real_extract(self, url):
        """Download the file page at *url* and build its info dict.

        The result carries the id matched from the URL, the media URL read
        from a ``<source ... src="...">`` tag, description, title, license,
        uploader and a ``{lang: [{'ext': ..., 'url': ...}]}`` subtitles map.
        License and uploader fall back to ``None`` when their patterns do not
        match.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # set() drops repeated track references so each one is added once.
        track_paths = set(re.findall(r'\bsrc\s*=\s*["\'](/w/api[^"]+)["\']', webpage))

        subtitles = {}
        for track_path in track_paths:
            # The matched paths are site-relative and HTML-escaped in the page.
            track_url = urljoin('https://commons.wikimedia.org', unescapeHTML(track_path))
            query = parse_qs(track_url)
            # Take the last listed value of each parameter, or None if absent.
            lang = query.get('lang', [None])[-1]
            sub_ext = query.get('trackformat', [None])[-1]
            if not lang or not sub_ext:
                # Both the language and the track format are required.
                continue
            subtitles.setdefault(lang, []).append({'ext': sub_ext, 'url': track_url})

        # Fields are resolved in the same order as the keys of the result.
        media_url = self._html_search_regex(
            r'<source\s[^>]*\bsrc="([^"]+)"', webpage, 'video URL')
        description = clean_html(get_element_by_class('description', webpage))
        title = remove_start(self._og_search_title(webpage), 'File:')
        license_markup = get_element_by_class('licensetpl', webpage)
        license_name = self._html_search_regex(
            r'licensed under(?: the)? (.+?) license',
            license_markup, 'license', default=None)
        uploader = self._html_search_regex(
            r'>\s*Author\s*</td>\s*<td\b[^>]*>\s*([^<]+)\s*</td>',
            webpage, 'video author', default=None)

        return {
            'id': video_id,
            'url': media_url,
            'description': description,
            'title': title,
            'license': license_name,
            'uploader': uploader,
            'subtitles': subtitles,
        }