[utils] Add get_element_by_class

For #9950
This commit is contained in:
Yen Chi Hsuan 2016-07-06 20:02:52 +08:00
parent ab49d7a9fa
commit 84c237fb8a
No known key found for this signature in database
GPG Key ID: 3FDDD575826C5C30
2 changed files with 19 additions and 2 deletions

View File

@ -33,6 +33,7 @@
ExtractorError, ExtractorError,
find_xpath_attr, find_xpath_attr,
fix_xml_ampersands, fix_xml_ampersands,
get_element_by_class,
InAdvancePagedList, InAdvancePagedList,
intlist_to_bytes, intlist_to_bytes,
is_html, is_html,
@ -991,5 +992,13 @@ def test_urshift(self):
self.assertEqual(urshift(3, 1), 1) self.assertEqual(urshift(3, 1), 1)
self.assertEqual(urshift(-3, 1), 2147483646) self.assertEqual(urshift(-3, 1), 2147483646)
def test_get_element_by_class(self):
    # Fixture: a single element carrying two space-separated classes
    markup = '''
<span class="foo bar">nice</span>
'''
    # Naming one of the element's classes is enough to get its content
    found = get_element_by_class('foo', markup)
    self.assertEqual(found, 'nice')
    # A class that occurs nowhere in the document yields None
    missing = get_element_by_class('no-such-class', markup)
    self.assertEqual(missing, None)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -310,9 +310,17 @@ def get_element_by_id(id, html):
return get_element_by_attribute('id', id, html) return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the tag with the specified class in the passed HTML document"""
    # The value handed to get_element_by_attribute is already a regular
    # expression, hence escape_value=False; only the class name itself is
    # escaped here.
    # The runs of non-quote characters on both sides let the class name sit
    # anywhere inside a multi-class attribute value.
    # Negative lookarounds are used instead of \b because \b also counts the
    # position next to '-' as a boundary, which made 'foo' match
    # class="foo-bar". The name must not be directly preceded or followed by
    # a word character or a hyphen.
    return get_element_by_attribute(
        'class',
        r'[^\'"]*(?<![\w-])%s(?![\w-])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
def get_element_by_attribute(attribute, value, html, escape_value=True):
"""Return the content of the tag with the specified attribute in the passed HTML document""" """Return the content of the tag with the specified attribute in the passed HTML document"""
value = re.escape(value) if escape_value else value
m = re.search(r'''(?xs) m = re.search(r'''(?xs)
<([a-zA-Z0-9:._-]+) <([a-zA-Z0-9:._-]+)
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
@ -321,7 +329,7 @@ def get_element_by_attribute(attribute, value, html):
\s*> \s*>
(?P<content>.*?) (?P<content>.*?)
</\1> </\1>
''' % (re.escape(attribute), re.escape(value)), html) ''' % (re.escape(attribute), value), html)
if not m: if not m:
return None return None