Improve the OpenGraph regex

* Do not accept '>' between the property and content attributes.
* Use two regexes so that properties are also recognized when the content attribute comes before the property attribute (fixes the extraction of the description for SlideshareIE).
This commit is contained in:
Jaime Marquínez Ferrándiz 2013-11-15 12:24:54 +01:00
parent 85d61685f1
commit ab2d524780

View File

@ -315,13 +315,17 @@ class InfoExtractor(object):
# Helper functions for extracting OpenGraph info # Helper functions for extracting OpenGraph info
@staticmethod @staticmethod
def _og_regex(prop): def _og_regexes(prop):
return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop) esc_prop = re.escape(prop)
return [
r'<meta[^>]+?property=[\'"]og:%s[\'"][^>]+?content=(?:"(.+?)"|\'(.+?)\')' % esc_prop,
r'<meta[^>]+?content=(?:"(.+?)"|\'(.+?)\')[^>]+?property=[\'"]og:%s[\'"]' % esc_prop,
]
def _og_search_property(self, prop, html, name=None, **kargs): def _og_search_property(self, prop, html, name=None, **kargs):
if name is None: if name is None:
name = 'OpenGraph %s' % prop name = 'OpenGraph %s' % prop
escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
if escaped is None: if escaped is None:
return None return None
return unescapeHTML(escaped) return unescapeHTML(escaped)
@ -336,8 +340,8 @@ class InfoExtractor(object):
return self._og_search_property('title', html, **kargs) return self._og_search_property('title', html, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs): def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
regexes = [self._og_regex('video')] regexes = self._og_regexes('video')
if secure: regexes.insert(0, self._og_regex('video:secure_url')) if secure: regexes = self._og_regexes('video:secure_url') + regexes
return self._html_search_regex(regexes, html, name, **kargs) return self._html_search_regex(regexes, html, name, **kargs)
def _rta_search(self, html): def _rta_search(self, html):