[youtube:channel] Fix the extraction

The page doesn't include the 'load more' button anymore, so we now fetch the 'c4_browse_ajax' pages directly.
This commit is contained in:
Jaime Marquínez Ferrándiz 2013-11-06 21:42:33 +01:00
parent 5d7b253ea0
commit 4f045eef8f

View File

@ -1572,7 +1572,6 @@ def _real_extract(self, url):
class YoutubeChannelIE(InfoExtractor): class YoutubeChannelIE(InfoExtractor):
IE_DESC = u'YouTube.com channels' IE_DESC = u'YouTube.com channels'
_VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
_TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
_MORE_PAGES_INDICATOR = 'yt-uix-load-more' _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
_MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
IE_NAME = u'youtube:channel' IE_NAME = u'youtube:channel'
@ -1593,30 +1592,20 @@ def _real_extract(self, url):
# Download channel page # Download channel page
channel_id = mobj.group(1) channel_id = mobj.group(1)
video_ids = [] video_ids = []
pagenum = 1
url = self._TEMPLATE_URL % (channel_id, pagenum) # Download all channel pages using the json-based channel_ajax query
page = self._download_webpage(url, channel_id, for pagenum in itertools.count(1):
u'Downloading page #%s' % pagenum) url = self._MORE_PAGES_URL % (pagenum, channel_id)
page = self._download_webpage(url, channel_id,
u'Downloading page #%s' % pagenum)
# Extract video identifiers page = json.loads(page)
ids_in_page = self.extract_videos_from_page(page)
video_ids.extend(ids_in_page)
# Download any subsequent channel pages using the json-based channel_ajax query ids_in_page = self.extract_videos_from_page(page['content_html'])
if self._MORE_PAGES_INDICATOR in page: video_ids.extend(ids_in_page)
for pagenum in itertools.count(1):
url = self._MORE_PAGES_URL % (pagenum, channel_id)
page = self._download_webpage(url, channel_id,
u'Downloading page #%s' % pagenum)
page = json.loads(page) if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
break
ids_in_page = self.extract_videos_from_page(page['content_html'])
video_ids.extend(ids_in_page)
if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
break
self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))