[ustream] Simplify channel extraction

The ChannelParser has been moved to a new function in utils, get_meta_content. Instead of the SocialstreamParser, it now uses a regex.
2024-12-12 14:26:49 +01:00 · 2013-09-13 22:05:29 +02:00 · 2013-09-13 22:05:29 +02:00 · a921f40799
commit a921f40799
parent 74ac9bdd82
3 changed files with 70 additions and 59 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -11,13 +11,16 @@
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 #from youtube_dl.utils import htmlentity_transform
-from youtube_dl.utils import timeconvert
+from youtube_dl.utils import (
-from youtube_dl.utils import sanitize_filename
+    timeconvert,
-from youtube_dl.utils import unescapeHTML
+    sanitize_filename,
-from youtube_dl.utils import orderedSet
+    unescapeHTML,
-from youtube_dl.utils import DateRange
+    orderedSet,
-from youtube_dl.utils import unified_strdate
+    DateRange,
-from youtube_dl.utils import find_xpath_attr
+    unified_strdate,
    find_xpath_attr,
    get_meta_content,
 )
 if sys.version_info < (3, 0):
    _compat_str = lambda b: b.decode('unicode-escape')
@ -127,5 +130,16 @@ def test_find_xpath_attr(self):
        self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
        self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
    def test_meta_parser(self):
        # Fixture with two meta tags: one with double-quoted attributes and an
        # HTML entity in its content, one self-closing with single-quoted
        # attributes listed in the opposite order.
        testhtml = u'''
        <head>
            <meta name="description" content="foo &amp; bar">
            <meta content='Plato' name='author'/>
        </head>
        '''
        # (meta name, content expected from get_meta_content), checked in order.
        expected_contents = (
            ('description', u'foo & bar'),
            ('author', 'Plato'),
        )
        for meta_name, meta_content in expected_contents:
            self.assertEqual(get_meta_content(meta_name, testhtml), meta_content)
 # Run the unittest runner only when this file is executed as a script.
 if __name__ == '__main__':
    unittest.main()
--- a/youtube_dl/extractor/ustream.py
+++ b/youtube_dl/extractor/ustream.py
@ -4,7 +4,7 @@
 from .common import InfoExtractor
 from ..utils import (
    compat_urlparse,
-    compat_html_parser,
+    get_meta_content,
 )
@ -49,40 +49,6 @@ def _real_extract(self, url):
               }
        return info
 # More robust than regular expressions
 class ChannelParser(compat_html_parser.HTMLParser):
    """
    Record the channel id published in a tag of the form
    <meta name="ustream:channel_id" content="1234">

    After feeding a page, ``channel_id`` holds the digits of the last
    matching tag as a string, or None when no tag carried a numeric id.
    """
    # Class-level default; shadowed on the instance once an id is found.
    channel_id = None
    def handle_starttag(self, tag, attrs):
        """Store the content of a ustream:channel_id meta tag if it is numeric."""
        if tag != 'meta':
            return
        values = dict(attrs)
        if values.get('name') != 'ustream:channel_id':
            return
        # The parser reports a valueless attribute (``<meta ... content>``)
        # as None, so a plain dict default is not enough to guarantee a string.
        value = values.get('content') or ''
        if value.isdigit():
            self.channel_id = value
 class SocialstreamParser(compat_html_parser.HTMLParser):
    """
    Collect the numeric data-content-id values of <li> tags such as
    <li class="content123 video" data-content-id="123" data-length="1452"
        data-href="/recorded/123" data-og-url="/recorded/123">

    The ids are appended to ``content_ids`` as strings, in document order.
    """
    def __init__(self):
        compat_html_parser.HTMLParser.__init__(self)
        self.content_ids = []
    def handle_starttag(self, tag, attrs):
        """Append every all-digit data-content-id attribute of an <li> tag."""
        if tag != 'li':
            return
        # ``value`` is None for a valueless attribute (``<li data-content-id>``),
        # so test it for truthiness before calling a string method on it.
        self.content_ids.extend(
            value for (attr, value) in attrs
            if attr == 'data-content-id' and value and value.isdigit())
 class UstreamChannelIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P<slug>.+)'
    IE_NAME = u'ustream:channel'
@ -90,21 +56,16 @@ class UstreamChannelIE(InfoExtractor):
    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        slug = m.group('slug')
        webpage = self._download_webpage(url, slug)
        channel_id = get_meta_content('ustream:channel_id', webpage)
        p = ChannelParser()
        p.feed(self._download_webpage(url, slug))
        p.close()
        channel_id = p.channel_id
        p = SocialstreamParser()
        BASE = 'http://www.ustream.tv'
        next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id
        video_ids = []
        while next_url:
            reply = json.loads(self._download_webpage(compat_urlparse.urljoin(BASE, next_url), channel_id))
-            p.feed(reply['data'])
+            video_ids.extend(re.findall(r'data-content-id="(\d.*)"', reply['data']))
            next_url = reply['nextUrl']
        p.close()
        video_ids = p.content_ids
        urls = ['http://www.ustream.tv/recorded/' + vid for vid in video_ids]
        url_entries = [self.url_result(eurl, 'Ustream') for eurl in urls]
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -249,7 +249,17 @@ def htmlentity_transform(matchobj):
    return (u'&%s;' % entity)
 # Replace the parser module's ``locatestarttagend`` pattern with the regex
 # below; the trailing note marks this as a backported bug fix.
 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
-class AttrParser(compat_html_parser.HTMLParser):
+class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser base class that keeps a reference to the document it parses."""
    # Spelled ``__init__`` (not ``__init``) so that it is a real constructor:
    # a name with only leading double underscores is mangled inside a class
    # body and would never be reached by ``BaseHTMLParser.__init__(self)``.
    def __init__(self):
        compat_html_parser.HTMLParser.__init__(self)
        # Last document passed to loads(); None until loads() is called.
        self.html = None
    def loads(self, html):
        """Remember ``html`` on ``self.html``, feed it to the parser and close it."""
        self.html = html
        self.feed(html)
        self.close()
 class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
@ -257,10 +267,9 @@ def __init__(self, attribute, value):
        self.result = None
        self.started = False
        self.depth = {}
        self.html = None
        self.watch_startpos = False
        self.error_count = 0
-        compat_html_parser.HTMLParser.__init__(self)
+        BaseHTMLParser.__init__(self)
    def error(self, message):
        if self.error_count > 10 or self.started:
@ -269,11 +278,6 @@ def error(self, message):
        self.error_count += 1
        self.goahead(1)
    def loads(self, html):
        self.html = html
        self.feed(html)
        self.close()
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
@ -334,6 +338,38 @@ def get_element_by_attribute(attribute, value, html):
        pass
    return parser.get_result()
 class MetaParser(BaseHTMLParser):
    """
    HTMLParser variant that looks for the meta tag whose name attribute
    equals the requested name and remembers that tag's content attribute.
    When several tags match, the one seen last wins.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        # Value the meta tag's name attribute has to equal.
        self.name = name
        self.content = None
        # Content attribute of the matching tag; None until one is seen.
        self.result = None
    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attributes = dict(attrs)
            if attributes.get('name') == self.name:
                self.result = attributes.get('content')
    def get_result(self):
        """Return the content recorded so far (None if nothing matched)."""
        return self.result
 def get_meta_content(name, html):
    """
    Parse html with a MetaParser looking for the meta tag whose name
    attribute is name, and return what the parser recorded as that tag's
    content attribute.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: a parse error is ignored and whatever the parser
        # recorded before it is still returned.
        pass
    return meta_parser.get_result()
 def clean_html(html):
    """Clean an HTML snippet into a readable string"""