diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index a527f10de2..ef5644aa54 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -1,5 +1,4 @@
import re
-import xml.etree.ElementTree
import json
from .common import InfoExtractor
@@ -65,18 +64,18 @@ def _real_extract(self, url):
uploader_id = mobj.group('company')
playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
- playlist_snippet = self._download_webpage(playlist_url, movie)
- playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet)
- playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned)
- # The ' in the onClick attributes are not escaped, it couldn't be parsed
- # with xml.etree.ElementTree.fromstring
- # like: http://trailers.apple.com/trailers/wb/gravity/
- def _clean_json(m):
- return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
- playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned)
- playlist_html = u'<html>' + playlist_cleaned + u'</html>'
+        def fix_html(s):
+            s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s)
+            s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
+            # The ' in the onClick attributes are not escaped, so the snippet can't be
+            # parsed as XML, e.g. http://trailers.apple.com/trailers/wb/gravity/
+            def _clean_json(m):
+                return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
+            s = re.sub(self._JSON_RE, _clean_json, s)
+            s = u'<html>' + s + u'</html>'
+            return s
+ doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
- doc = xml.etree.ElementTree.fromstring(playlist_html)
playlist = []
for li in doc.findall('./div/ul/li'):
on_click = li.find('.//a').attrib['onClick']
diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py
index d4fc869732..c60089ad35 100644
--- a/youtube_dl/extractor/clipsyndicate.py
+++ b/youtube_dl/extractor/clipsyndicate.py
@@ -1,9 +1,9 @@
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
find_xpath_attr,
+ fix_xml_all_ampersand,
)
@@ -30,12 +30,10 @@ def _real_extract(self, url):
# it includes a required token
flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
- playlist_page = self._download_webpage(
+ pdoc = self._download_xml(
'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
- video_id, u'Downloading video info')
- # Fix broken xml
- playlist_page = re.sub('&', '&amp;', playlist_page)
- pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8'))
+ video_id, u'Downloading video info',
+ transform_source=fix_xml_all_ampersand)
track_doc = pdoc.find('trackList/track')
def find_param(name):
diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py
index 6b95b49988..e560c1d354 100644
--- a/youtube_dl/extractor/metacritic.py
+++ b/youtube_dl/extractor/metacritic.py
@@ -1,8 +1,10 @@
import re
-import xml.etree.ElementTree
import operator
from .common import InfoExtractor
+from ..utils import (
+ fix_xml_all_ampersand,
+)
class MetacriticIE(InfoExtractor):
@@ -23,9 +25,8 @@ def _real_extract(self, url):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
# The xml is not well formatted, there are raw '&'
- info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id,
- video_id, u'Downloading info xml').replace('&', '&amp;')
- info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+ info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
+ video_id, u'Downloading info xml', transform_source=fix_xml_all_ampersand)
clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
formats = []
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 0dab9fcc5d..4593488ce5 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1057,3 +1057,8 @@ def month_by_name(name):
return ENGLISH_NAMES.index(name) + 1
except ValueError:
return None
+
+
+def fix_xml_all_ampersand(xml_str):
+    """Replace every '&' in xml_str by '&amp;' (including ones that already start an entity)."""
+    return xml_str.replace(u'&', u'&amp;')