From ced659bb4d066c782a009d370daff0fedb7b1006 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 9 Oct 2014 19:26:23 +0700 Subject: [PATCH] [generic] Ignore some non-video file extensions during generic extraction (Closes #3900) --- youtube_dl/extractor/generic.py | 34 ++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c16da70f1d..dfc2ef4e72 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -847,47 +847,51 @@ def _playlist_from_matches(matches, getter, ie=None): if mobj is not None: return self.url_result(mobj.group('url'), 'MLB') + def check_video(vurl): + vpath = compat_urlparse.urlparse(vurl).path + vext = determine_ext(vpath) + return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml') + + def filter_video(urls): + return list(filter(check_video, urls)) + # Start with something easy: JW Player in SWFObject - found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) + found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)) if not found: # Look for gorilla-vid style embedding - found = re.findall(r'''(?sx) + found = filter_video(re.findall(r'''(?sx) (?: jw_plugins| JWPlayerOptions| jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup ) - .*?file\s*:\s*["\'](.*?)["\']''', webpage) + .*?file\s*:\s*["\'](.*?)["\']''', webpage)) if not found: # Broaden the search a little bit - found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) + found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) if not found: # Broaden the findall a little bit: JWPlayer JS loader - found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage) + found = filter_video(re.findall( + r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) if not found: # Flow player - found = re.findall(r'''(?xs) + found = filter_video(re.findall(r'''(?xs) flowplayer\("[^"]+",\s* \{[^}]+?\}\s*, \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s* ["']?url["']?\s*:\s*["']([^"']+)["'] - ''', webpage) + ''', webpage)) if not found: # Try to find twitter cards info - found = re.findall(r'.*?]+)? src="([^"]+)"', webpage)