Merge 4cf46e29c1 into 351dc0bc33

2024-05-14 05:05:08 +03:00 · 2024-05-14 05:05:08 +03:00 · c6cd36ec91
parent 351dc0bc33 4cf46e29c1
commit c6cd36ec91
3 changed files with 92 additions and 39 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -5,6 +5,7 @@ import os
 import sys
 import unittest
 import warnings
+import datetime as dt

 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

@ -27,6 +28,7 @@ from yt_dlp.utils import (
    ExtractorError,
    InAdvancePagedList,
    LazyList,
+    NO_DEFAULT,
    OnDemandPagedList,
    Popen,
    age_restricted,
@ -768,6 +770,11 @@ class TestUtil(unittest.TestCase):

    def test_parse_iso8601(self):
        self.assertEqual(parse_iso8601('2014-03-23T23:04:26+0100'), 1395612266)
+        self.assertEqual(parse_iso8601('2014-03-23T23:04:26-07:00'), 1395641066)
+        self.assertEqual(parse_iso8601('2014-03-23T23:04:26', timezone=dt.timedelta(hours=-7)), 1395641066)
+        self.assertEqual(parse_iso8601('2014-03-23T23:04:26', timezone=NO_DEFAULT), None)
+        # a timezone given in date_str takes precedence over the timezone argument
+        self.assertEqual(parse_iso8601('2014-03-23T23:04:26-07:00', timezone=dt.timedelta(hours=-10)), 1395641066)
        self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266)
        self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266)
        self.assertEqual(parse_iso8601('2014-03-23T22:04:26.1234Z'), 1395612266)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@ -1325,6 +1325,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader_url': 'https://www.youtube.com/@PhilippHagemeister',
                'uploader_id': '@PhilippHagemeister',
                'heatmap': 'count:100',
+                'timestamp': 1349198244,
            }
        },
        {
@ -1368,6 +1369,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader_url': 'https://www.youtube.com/@PhilippHagemeister',
                'uploader_id': '@PhilippHagemeister',
                'heatmap': 'count:100',
+                'timestamp': 1349198244,
            },
            'params': {
                'skip_download': True,
@ -1454,6 +1456,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'comment_count': int,
                'channel_is_verified': True,
                'heatmap': 'count:100',
+                'timestamp': 1401991663,
            },
        },
        {
@ -1513,6 +1516,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader': 'Projekt Melody',
                'uploader_url': 'https://www.youtube.com/@ProjektMelody',
                'uploader_id': '@ProjektMelody',
+                'timestamp': 1577508724,
            },
        },
        {
@ -1618,6 +1622,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader_url': 'https://www.youtube.com/@Olympics',
                'uploader_id': '@Olympics',
                'channel_is_verified': True,
+                'timestamp': 1440707674,
            },
            'params': {
                'skip_download': 'requires avconv',
@ -1651,6 +1656,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader': '孫ᄋᄅ',
                'uploader_url': 'https://www.youtube.com/@AllenMeow',
                'uploader_id': '@AllenMeow',
+                'timestamp': 1299776999,
            },
        },
        # url_encoded_fmt_stream_map is empty string
@ -1794,6 +1800,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                },
            }],
            'params': {'skip_download': True},
+            'skip': 'Not multifeed anymore',
        },
        {
            # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
@ -1902,6 +1909,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader': 'The Berkman Klein Center for Internet & Society',
                'uploader_id': '@BKCHarvard',
                'uploader_url': 'https://www.youtube.com/@BKCHarvard',
+                'timestamp': 1422422076,
            },
            'params': {
                'skip_download': True,
@ -1937,6 +1945,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader_id': '@BernieSanders',
                'channel_is_verified': True,
                'heatmap': 'count:100',
+                'timestamp': 1447987198,
            },
            'params': {
                'skip_download': True,
@ -2000,6 +2009,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader_id': '@Vsauce',
                'comment_count': int,
                'channel_is_verified': True,
+                'timestamp': 1484761047,
            },
            'params': {
                'skip_download': True,
@ -2155,6 +2165,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader': 'l\'Or Vert asbl',
                'uploader_url': 'https://www.youtube.com/@ElevageOrVert',
                'uploader_id': '@ElevageOrVert',
+                'timestamp': 1497343210,
            },
            'params': {
                'skip_download': True,
@ -2193,6 +2204,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader_id': '@Csharp-video-tutorialsBlogspot',
                'channel_is_verified': True,
                'heatmap': 'count:100',
+                'timestamp': 1377976349,
            },
            'params': {
                'skip_download': True,
@ -2275,6 +2287,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader_id': '@CBSMornings',
                'comment_count': int,
                'channel_is_verified': True,
+                'timestamp': 1405513526,
            }
        },
        {
@ -2292,7 +2305,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'view_count': int,
                'channel': 'Walk around Japan',
                'tags': ['Ueno Tokyo', 'Okachimachi Tokyo', 'Ameyoko Street', 'Tokyo attraction', 'Travel in Tokyo'],
-                'thumbnail': 'https://i.ytimg.com/vi_webp/cBvYw8_A0vQ/hqdefault.webp',
+                'thumbnail': 'https://i.ytimg.com/vi/cBvYw8_A0vQ/hqdefault.jpg',
                'age_limit': 0,
                'availability': 'public',
                'channel_url': 'https://www.youtube.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
@ -2302,6 +2315,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader': 'Walk around Japan',
                'uploader_url': 'https://www.youtube.com/@walkaroundjapan7124',
                'uploader_id': '@walkaroundjapan7124',
+                'timestamp': 1605884416,
            },
            'params': {
                'skip_download': True,
@ -2396,6 +2410,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'comment_count': int,
                'channel_is_verified': True,
                'heatmap': 'count:100',
+                'timestamp': 1395685455,
            }, 'params': {'format': 'mhtml', 'skip_download': True}
        }, {
            # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939)
@ -2425,9 +2440,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader_url': 'https://www.youtube.com/@LeonNguyen',
                'uploader_id': '@LeonNguyen',
                'heatmap': 'count:100',
+                'timestamp': 1641170939,
            }
        }, {
            # Same video as above, but with --compat-opt no-youtube-prefer-utc-upload-date
+            # TODO: remove this compat opt? It has no effect whenever a timestamp is extracted
            'url': 'https://www.youtube.com/watch?v=2NUZ8W2llS4',
            'info_dict': {
                'id': '2NUZ8W2llS4',
@ -2487,38 +2504,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'comment_count': int,
                'channel_is_verified': True,
                'heatmap': 'count:100',
+                'timestamp': 1641172509,
            }
        },
-        {   # continuous livestream. Microformat upload date should be preferred.
-            # Upload date was 2021-06-19 (not UTC), while stream start is 2021-11-27
-            'url': 'https://www.youtube.com/watch?v=kgx4WGK0oNU',
+        {   # continuous livestream.
+            # Upload date was 2022-07-12T05:12:29-07:00, while stream start is 2022-07-12T15:59:30+00:00
+            'url': 'https://www.youtube.com/watch?v=jfKfPfyJRdk',
            'info_dict': {
-                'id': 'kgx4WGK0oNU',
-                'title': r're:jazz\/lofi hip hop radio🌱chill beats to relax\/study to \[LIVE 24\/7\] \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
+                'id': 'jfKfPfyJRdk',
                'ext': 'mp4',
-                'channel_id': 'UC84whx2xxsiA1gXHXXqKGOA',
-                'availability': 'public',
-                'age_limit': 0,
-                'release_timestamp': 1637975704,
-                'upload_date': '20210619',
-                'channel_url': 'https://www.youtube.com/channel/UC84whx2xxsiA1gXHXXqKGOA',
-                'live_status': 'is_live',
-                'thumbnail': 'https://i.ytimg.com/vi/kgx4WGK0oNU/maxresdefault.jpg',
-                'channel': 'Abao in Tokyo',
-                'channel_follower_count': int,
-                'release_date': '20211127',
-                'tags': 'count:39',
-                'categories': ['People & Blogs'],
+                'channel_id': 'UCSJ4gkVC6NrvII8umztf0Ow',
                'like_count': int,
-                'view_count': int,
-                'playable_in_embed': True,
-                'description': 'md5:2ef1d002cad520f65825346e2084e49d',
+                'uploader': 'Lofi Girl',
+                'categories': ['Music'],
                'concurrent_view_count': int,
-                'uploader': 'Abao in Tokyo',
-                'uploader_url': 'https://www.youtube.com/@abaointokyo',
-                'uploader_id': '@abaointokyo',
+                'playable_in_embed': True,
+                'timestamp': 1657627949,
+                'release_date': '20220712',
+                'channel_url': 'https://www.youtube.com/channel/UCSJ4gkVC6NrvII8umztf0Ow',
+                'description': 'md5:13a6f76df898f5674f9127139f3df6f7',
+                'age_limit': 0,
+                'thumbnail': 'https://i.ytimg.com/vi/jfKfPfyJRdk/maxresdefault.jpg',
+                'release_timestamp': 1657641570,
+                'uploader_url': 'https://www.youtube.com/@LofiGirl',
+                'channel_follower_count': int,
+                'channel_is_verified': True,
+                'title': r're:^lofi hip hop radio 📚 - beats to relax/study to',
+                'view_count': int,
+                'live_status': 'is_live',
+                'tags': 'count:32',
+                'channel': 'Lofi Girl',
+                'availability': 'public',
+                'upload_date': '20220712',
+                'uploader_id': '@LofiGirl',
            },
-            'params': {'skip_download': True}
+            'params': {'skip_download': True},
        }, {
            'url': 'https://www.youtube.com/watch?v=tjjjtzRLHvA',
            'info_dict': {
@ -2544,6 +2564,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader_id': '@lesmiscore',
                'uploader': 'Lesmiscore',
                'uploader_url': 'https://www.youtube.com/@lesmiscore',
+                'timestamp': 1648005313,
            }
        }, {
            # Prefer primary title+description language metadata by default
@ -2571,6 +2592,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader_url': 'https://www.youtube.com/@coletdjnz',
                'uploader_id': '@coletdjnz',
                'uploader': 'cole-dlp-test-acc',
+                'timestamp': 1662677394,
            },
            'params': {'skip_download': True}
        }, {
@ -2584,7 +2606,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'duration': 5,
                'live_status': 'not_live',
                'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
-                'upload_date': '20220728',
+                'upload_date': '20220729',
                'view_count': int,
                'categories': ['People & Blogs'],
                'thumbnail': r're:^https?://.*\.jpg',
@ -2597,6 +2619,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader_url': 'https://www.youtube.com/@coletdjnz',
                'uploader_id': '@coletdjnz',
                'uploader': 'cole-dlp-test-acc',
+                'timestamp': 1659073275,
+                'like_count': int,
            },
            'params': {'skip_download': True, 'extractor_args': {'youtube': {'lang': ['fr']}}},
            'expected_warnings': [r'Preferring "fr" translated fields'],
@ -2662,6 +2686,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader': 'Projekt Melody',
                'uploader_id': '@ProjektMelody',
                'uploader_url': 'https://www.youtube.com/@ProjektMelody',
+                'timestamp': 1577508724,
            },
            'params': {'extractor_args': {'youtube': {'player_client': ['tv_embedded']}}, 'format': '251-drc'},
        },
@ -2696,6 +2721,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader_id': '@sana_natori',
                'channel_is_verified': True,
                'heatmap': 'count:100',
+                'timestamp': 1671798112,
            },
        },
        {
@ -2765,6 +2791,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader_url': 'https://www.youtube.com/@ChristopherSykesDocumentaries',
                'uploader_id': '@ChristopherSykesDocumentaries',
                'heatmap': 'count:100',
+                'timestamp': 1211825920,
            },
            'params': {
                'skip_download': True,
@ -4562,19 +4589,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            'uploader_id': channel_handle,
            'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None),
        })
+
+        # Only set a timestamp if uploadDate has both time precision and a timezone.
+        # Currently, the uploadDate in microformats appears to use the US/Pacific timezone.
+        timestamp = (
+            parse_iso8601(get_first(microformats, 'uploadDate'), timezone=NO_DEFAULT)
+            or parse_iso8601(search_meta('uploadDate'), timezone=NO_DEFAULT)
+        )
+        upload_date = (
+            dt.datetime.fromtimestamp(timestamp, dt.timezone.utc).strftime('%Y%m%d') if timestamp else
+            (
+                unified_strdate(get_first(microformats, 'uploadDate'))
+                or unified_strdate(search_meta('uploadDate'))
+            ))
+
+        # If no timestamp could be extracted:
        # The upload date for scheduled, live and past live streams / premieres in microformats
        # may be different from the stream date. Although not in UTC, we will prefer it in this case.
        # See: https://github.com/yt-dlp/yt-dlp/pull/2223#issuecomment-1008485139
-        upload_date = (
-            unified_strdate(get_first(microformats, 'uploadDate'))
-            or unified_strdate(search_meta('uploadDate')))
        if not upload_date or (
-            live_status in ('not_live', None)
+            not timestamp
+            and live_status in ('not_live', None)
            and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', [])
        ):
+            # dateText should be in UTC, as configured in the cookie/client context
            upload_date = strftime_or_none(
                self._parse_time_text(self._get_text(vpir, 'dateText'))) or upload_date
+
        info['upload_date'] = upload_date
+        info['timestamp'] = timestamp

        if upload_date and live_status not in ('is_live', 'post_live', 'is_upcoming'):
            # Newly uploaded videos' HLS formats are potentially problematic and need to be checked
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@ -1134,7 +1134,7 @@ def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))


-def extract_timezone(date_str):
+def extract_timezone(date_str, default=None):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
@ -1146,21 +1146,25 @@ def extract_timezone(date_str):
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
+    timezone = None
+
    if not m:
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
-        timezone = dt.timedelta(hours=timezone or 0)
+            timezone = dt.timedelta(hours=timezone)
    else:
        date_str = date_str[:-len(m.group('tz'))]
-        if not m.group('sign'):
-            timezone = dt.timedelta()
-        else:
+        if m.group('sign'):
            sign = 1 if m.group('sign') == '+' else -1
            timezone = dt.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
+
+    if timezone is None and default is not NO_DEFAULT:
+        timezone = default or dt.timedelta()
+
    return timezone, date_str


@ -1172,10 +1176,9 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):

    date_str = re.sub(r'\.[0-9]+', '', date_str)

-    if timezone is None:
-        timezone, date_str = extract_timezone(date_str)
+    timezone, date_str = extract_timezone(date_str, timezone)

-    with contextlib.suppress(ValueError):
+    with contextlib.suppress(ValueError, TypeError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt_ = dt.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt_.timetuple())