[huffpost] Add support

This commit is contained in:
Philipp Hagemeister 2014-01-27 05:47:30 +01:00
parent 0f2999fe2b
commit db1f388878
5 changed files with 83 additions and 3 deletions

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
from .common import FileDownloader from .common import FileDownloader
from .hls import HlsFD from .hls import HlsFD
from .http import HttpFD from .http import HttpFD
@ -12,10 +14,11 @@
def get_suitable_downloader(info_dict): def get_suitable_downloader(info_dict):
"""Get the downloader class that can handle the info dict.""" """Get the downloader class that can handle the info dict."""
url = info_dict['url'] url = info_dict['url']
protocol = info_dict.get('protocol')
if url.startswith('rtmp'): if url.startswith('rtmp'):
return RtmpFD return RtmpFD
if determine_ext(url) == u'm3u8': if (protocol == 'm3u8') or (protocol is None and determine_ext(url) == 'm3u8'):
return HlsFD return HlsFD
if url.startswith('mms') or url.startswith('rtsp'): if url.startswith('mms') or url.startswith('rtsp'):
return MplayerFD return MplayerFD

View File

@ -83,6 +83,7 @@
from .hark import HarkIE from .hark import HarkIE
from .hotnewhiphop import HotNewHipHopIE from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE from .howcast import HowcastIE
from .huffpost import HuffPostIE
from .hypem import HypemIE from .hypem import HypemIE
from .ign import IGNIE, OneUPIE from .ign import IGNIE, OneUPIE
from .imdb import ( from .imdb import (

View File

@ -71,7 +71,7 @@ class InfoExtractor(object):
* player_url SWF Player URL (used for rtmpdump). * player_url SWF Player URL (used for rtmpdump).
* protocol The protocol that will be used for the actual * protocol The protocol that will be used for the actual
download, lower-case. download, lower-case.
"http", "https", "rtsp", "rtmp" or so. "http", "https", "rtsp", "rtmp", "m3u8" or so.
* preference Order number of this format. If this field is * preference Order number of this format. If this field is
present and not None, the formats get sorted present and not None, the formats get sorted
by this field. by this field.

View File

@ -332,10 +332,16 @@ def _real_extract(self, url):
# Look for embedded Facebook player # Look for embedded Facebook player
mobj = re.search( mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https://www.facebook.com/video/embed.+?)\1', webpage) r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
if mobj is not None: if mobj is not None:
return self.url_result(mobj.group('url'), 'Facebook') return self.url_result(mobj.group('url'), 'Facebook')
# Look for embedded Huffington Post player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live.huffingtonpost\.com/.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'HuffPost')
# Start with something easy: JW Player in SWFObject # Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None: if mobj is None:

View File

@ -0,0 +1,70 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
parse_duration,
unified_strdate,
)
class HuffPostIE(InfoExtractor):
    """Information extractor for Huffington Post Live segments.

    Handles both the regular segment pages (``/r/segment/<slug>/<id>``) and
    the embeddable player (``/HPLEmbedPlayer/?segmentId=<id>``); in both
    cases the hexadecimal segment id is the video id.
    """

    IE_DESC = 'Huffington Post'
    _VALID_URL = r'''(?x)
        https?://(embed\.)?live\.huffingtonpost\.com/
        (?:
            r/segment/[^/]+/|
            HPLEmbedPlayer/\?segmentId=
        )
        (?P<id>[0-9a-f]+)'''

    # NOTE(review): 'md5', 'title' and 'description' are still 'TODO'
    # placeholders and must be replaced with real values before this test
    # can pass.
    _TEST = {
        'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',
        'file': '52dd3e4b02a7602131000677.mp4',
        'md5': 'TODO',
        'info_dict': {
            'title': 'TODO',
            'description': 'TODO',
            'duration': 1549,
        }
    }

    def _real_extract(self, url):
        """Build the info dict for the segment referenced by *url*.

        The segment id is taken from *url*, the segment's JSON document is
        downloaded from the embed API, and title, description, duration,
        upload date, thumbnails and formats are read from its 'data' object.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        api_url = 'http://embed.live.huffingtonpost.com/api/segments/%s.json' % video_id
        data = self._download_json(api_url, video_id)['data']

        video_title = data['title']
        # _TEST expects a description in the result; presumably the API
        # exposes it under 'description' (verify against a real response).
        # .get() keeps extraction working if the key is absent.
        description = data.get('description')
        duration = parse_duration(data['running_time'])
        upload_date = unified_strdate(data['schedule']['started_at'])

        thumbnails = []
        # Use a dedicated loop variable so the `url` argument is not clobbered.
        for image_url in data['images'].values():
            if not image_url:
                # Guard against empty/None entries, which re.match() would
                # reject with a TypeError.
                continue
            # Only images whose file name carries a WIDTHxHEIGHT suffix
            # (e.g. "...-640x360.jpg") are usable: that suffix is the only
            # source for the resolution field.
            m = re.match(r'.*-([0-9]+x[0-9]+)\.', image_url)
            if not m:
                continue
            thumbnails.append({
                'url': image_url,
                'resolution': m.group(1),
            })

        # Each key of the 'live' sources mapping is used as the format
        # label; '/' is replaced by '.' to form the format id. Keys starting
        # with 'audio/' are flagged with vcodec 'none'.
        formats = [{
            'format': key,
            'format_id': key.replace('/', '.'),
            'ext': 'mp4',
            'url': format_url,
            'vcodec': 'none' if key.startswith('audio/') else None,
        } for key, format_url in data['sources']['live'].items()]
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': video_title,
            'description': description,
            'formats': formats,
            'duration': duration,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
        }