[core] Support decoding multiple content encodings (#7142)

Authored by: coletdjnz
2024-11-03 08:19:57 +01:00 · 2023-05-27 22:40:05 +12:00 · 2023-05-27 22:40:05 +12:00 · daafbf49b3
commit daafbf49b3
parent 3f66b6fe50
2 changed files with 108 additions and 29 deletions
--- a/test/test_http.py
+++ b/test/test_http.py
@ -17,9 +17,11 @@
 import threading
 import urllib.error
 import urllib.request
 import zlib
 from test.helper import http_server_port
 from yt_dlp import YoutubeDL
 from yt_dlp.dependencies import brotli
 from yt_dlp.utils import sanitized_Request, urlencode_postdata
 from .helper import FakeYDL
@ -148,6 +150,31 @@ def do_GET(self):
            self.send_header('Location', new_url)
            self.send_header('Content-Length', '0')
            self.end_headers()
        elif self.path == '/content-encoding':
            encodings = self.headers.get('ytdl-encoding', '')
            payload = b'<html><video src="/vid.mp4" /></html>'
            for encoding in filter(None, (e.strip() for e in encodings.split(','))):
                if encoding == 'br' and brotli:
                    payload = brotli.compress(payload)
                elif encoding == 'gzip':
                    buf = io.BytesIO()
                    with gzip.GzipFile(fileobj=buf, mode='wb') as f:
                        f.write(payload)
                    payload = buf.getvalue()
                elif encoding == 'deflate':
                    payload = zlib.compress(payload)
                elif encoding == 'unsupported':
                    payload = b'raw'
                    break
                else:
                    self._status(415)
                    return
            self.send_response(200)
            self.send_header('Content-Encoding', encodings)
            self.send_header('Content-Length', str(len(payload)))
            self.end_headers()
            self.wfile.write(payload)
        else:
            self._status(404)
@ -302,6 +329,55 @@ def test_gzip_trailing_garbage(self):
            data = ydl.urlopen(sanitized_Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode('utf-8')
            self.assertEqual(data, '<html><video src="/vid.mp4" /></html>')
    @unittest.skipUnless(brotli, 'brotli support is not installed')
    def test_brotli(self):
        # A brotli-encoded body must come back decoded, with the header left untouched
        with FakeYDL() as ydl:
            url = f'http://127.0.0.1:{self.http_port}/content-encoding'
            request = sanitized_Request(url, headers={'ytdl-encoding': 'br'})
            response = ydl.urlopen(request)
            self.assertEqual(response.headers.get('Content-Encoding'), 'br')
            self.assertEqual(response.read(), b'<html><video src="/vid.mp4" /></html>')
    def test_deflate(self):
        # A deflate-encoded body must come back decoded, with the header left untouched
        with FakeYDL() as ydl:
            url = f'http://127.0.0.1:{self.http_port}/content-encoding'
            request = sanitized_Request(url, headers={'ytdl-encoding': 'deflate'})
            response = ydl.urlopen(request)
            self.assertEqual(response.headers.get('Content-Encoding'), 'deflate')
            self.assertEqual(response.read(), b'<html><video src="/vid.mp4" /></html>')
    def test_gzip(self):
        # A gzip-encoded body must come back decoded, with the header left untouched
        with FakeYDL() as ydl:
            url = f'http://127.0.0.1:{self.http_port}/content-encoding'
            request = sanitized_Request(url, headers={'ytdl-encoding': 'gzip'})
            response = ydl.urlopen(request)
            self.assertEqual(response.headers.get('Content-Encoding'), 'gzip')
            self.assertEqual(response.read(), b'<html><video src="/vid.mp4" /></html>')
    def test_multiple_encodings(self):
        # Several encodings may be stacked in a single Content-Encoding header, see
        # https://www.rfc-editor.org/rfc/rfc9110.html#section-8.4
        combinations = ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate')
        with FakeYDL() as ydl:
            for encodings in combinations:
                url = f'http://127.0.0.1:{self.http_port}/content-encoding'
                request = sanitized_Request(url, headers={'ytdl-encoding': encodings})
                response = ydl.urlopen(request)
                # The header is passed through verbatim while the body is fully decoded
                self.assertEqual(response.headers.get('Content-Encoding'), encodings)
                self.assertEqual(response.read(), b'<html><video src="/vid.mp4" /></html>')
    def test_unsupported_encoding(self):
        # An encoding we cannot decode must leave the body as the server sent it
        with FakeYDL() as ydl:
            url = f'http://127.0.0.1:{self.http_port}/content-encoding'
            request = sanitized_Request(url, headers={'ytdl-encoding': 'unsupported'})
            response = ydl.urlopen(request)
            self.assertEqual(response.headers.get('Content-Encoding'), 'unsupported')
            self.assertEqual(response.read(), b'raw')
 class TestClientCert(unittest.TestCase):
    def setUp(self):
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@ -1361,6 +1361,23 @@ def brotli(data):
            return data
        return brotli.decompress(data)
    @staticmethod
    def gz(data):
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk add the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                raise original_oserror
    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
@ -1394,35 +1411,21 @@ def http_request(self, req):
    def http_response(self, req, resp):
        old_resp = resp
-        # gzip
+
-        if resp.headers.get('Content-encoding', '') == 'gzip':
+        # Content-Encoding header lists the encodings in order that they were applied [1].
-            content = resp.read()
+        # To decompress, we simply do the reverse.
-            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
+        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
-            try:
+        decoded_response = None
-                uncompressed = io.BytesIO(gz.read())
+        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
-            except OSError as original_ioerror:
+            if encoding == 'gzip':
-                # There may be junk add the end of the file
+                decoded_response = self.gz(decoded_response or resp.read())
-                # See http://stackoverflow.com/q/4928560/35070 for details
+            elif encoding == 'deflate':
-                for i in range(1, 1024):
+                decoded_response = self.deflate(decoded_response or resp.read())
-                    try:
+            elif encoding == 'br' and brotli:
-                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
+                decoded_response = self.brotli(decoded_response or resp.read())
-                        uncompressed = io.BytesIO(gz.read())
+
-                    except OSError:
+        if decoded_response is not None:
-                        continue
+            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).