2015-07-22 14:03:05 +02:00
import base64
2016-02-16 23:01:44 +01:00
import binascii
2014-03-24 01:40:09 +01:00
import calendar
2014-04-04 23:00:51 +02:00
import codecs
2020-05-04 23:19:33 +02:00
import collections
2022-09-25 23:03:19 +02:00
import collections . abc
2014-02-25 01:43:17 +01:00
import contextlib
2024-02-25 01:16:34 +01:00
import datetime as dt
2017-05-01 17:09:18 +02:00
import email . header
2022-04-12 00:32:57 +02:00
import email . utils
2013-05-13 09:20:08 +02:00
import errno
2024-10-23 08:33:50 +02:00
import functools
2021-09-23 19:40:51 +02:00
import hashlib
import hmac
2022-06-24 10:10:17 +02:00
import html . entities
import html . parser
2022-06-29 03:13:24 +02:00
import inspect
2012-11-28 00:09:17 +01:00
import io
2016-05-02 05:21:39 +02:00
import itertools
2012-12-20 13:13:24 +01:00
import json
2012-03-25 03:07:37 +02:00
import locale
2013-11-25 03:12:26 +01:00
import math
2022-04-12 00:32:57 +02:00
import mimetypes
2023-06-21 05:07:42 +02:00
import netrc
2015-02-10 03:32:21 +01:00
import operator
2012-03-25 03:07:37 +02:00
import os
2013-08-28 12:57:10 +02:00
import platform
2017-02-04 12:49:58 +01:00
import random
2012-03-25 03:07:37 +02:00
import re
2022-04-12 00:32:57 +02:00
import shlex
2013-08-28 12:57:10 +02:00
import socket
2016-05-02 05:21:39 +02:00
import ssl
2022-06-24 10:10:17 +02:00
import struct
2013-12-09 18:29:07 +01:00
import subprocess
2012-03-25 03:07:37 +02:00
import sys
2014-08-21 13:01:13 +02:00
import tempfile
2020-05-04 23:19:33 +02:00
import time
2013-01-03 15:39:55 +01:00
import traceback
2022-05-25 14:23:46 +02:00
import types
2022-08-04 16:49:32 +02:00
import unicodedata
2022-06-24 12:54:43 +02:00
import urllib . error
2022-04-12 00:32:57 +02:00
import urllib . parse
2022-06-24 10:10:17 +02:00
import urllib . request
2014-03-10 17:31:32 +01:00
import xml . etree . ElementTree
2012-03-25 03:07:37 +02:00
2023-05-20 23:56:23 +02:00
from . import traversal
from . . compat import (
2015-10-25 20:04:55 +01:00
compat_etree_fromstring ,
2017-03-25 20:30:10 +01:00
compat_expanduser ,
2022-04-12 00:32:57 +02:00
compat_HTMLParseError ,
2016-09-29 18:28:32 +02:00
compat_os_name ,
2014-11-02 11:23:40 +01:00
)
2023-11-20 09:04:04 +01:00
from . . dependencies import xattr
2016-05-03 09:15:32 +02:00
2024-06-12 01:09:58 +02:00
__name__ = __name__.rsplit('.', 1)[0]  # noqa: A001: Pretend to be the parent module

# Class of compiled pattern objects, i.e. what `re.compile()` returns
compiled_regex_type = re.Pattern
2019-06-28 19:32:43 +02:00
2023-05-24 20:00:43 +02:00
class NO_DEFAULT:
    """Sentinel used as a parameter default to signal that no default was supplied"""
def IDENTITY(x):
    """Return the argument unchanged"""
    return x
2015-06-28 18:56:07 +02:00
2015-02-13 08:14:23 +01:00
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March',
    'April', 'May', 'June',
    'July', 'August', 'September',
    'October', 'November', 'December',
]

# Month names in calendar order, keyed by language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars',
        'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre',
        'octobre', 'novembre', 'décembre',
    ],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': [
        'stycznia', 'lutego', 'marca',
        'kwietnia', 'maja', 'czerwca',
        'lipca', 'sierpnia', 'września',
        'października', 'listopada', 'grudnia',
    ],
}
2016-09-02 18:31:52 +02:00
2022-08-14 01:21:54 +02:00
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# NOTE(review): values look like whole-hour offsets from UTC - confirm against the users of this table
TIMEZONE_NAMES = {
    'UT': 0,
    'UTC': 0,
    'GMT': 0,
    'Z': 0,
    # Atlantic (used in Canada)
    'AST': -4,
    'ADT': -3,
    # Eastern
    'EST': -5,
    'EDT': -4,
    # Central
    'CST': -6,
    'CDT': -5,
    # Mountain
    'MST': -7,
    'MDT': -6,
    # Pacific
    'PST': -8,
    'PDT': -7,
}
2016-05-03 02:40:30 +02:00
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement; the replacements are
# chained in the same order as the characters of the first string
ACCENT_CHARS = dict(zip(
    'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
    itertools.chain(
        'AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y'),
))
2016-05-03 02:40:30 +02:00
2016-06-25 17:30:35 +02:00
# strptime-style patterns that do not depend on day/month ordering
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# The common formats followed by the numeric ones that put the day first
DATE_FORMATS_DAY_FIRST = [
    *DATE_FORMATS,
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
]

# The common formats followed by the numeric ones that put the month first
DATE_FORMATS_MONTH_FIRST = [
    *DATE_FORMATS,
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
]
2016-10-19 18:28:49 +02:00
# Captures the four arguments of a `}('...',N,N,'...'.split('|')` call tail
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"

# Captures the object/array body of a <script type="application/ld+json"> element as `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# Unsigned integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
2015-02-13 08:14:23 +01:00
2022-05-19 16:06:31 +02:00
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by locale.getpreferredencoding(), or
    'UTF-8' when the lookup fails or the reported codec cannot encode text.
    The result is cached after the first call.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Make sure the codec actually exists and works
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
2012-03-25 03:07:37 +02:00
2012-12-20 13:13:24 +01:00
2014-08-21 13:01:13 +02:00
def write_json_file(obj, fn):
    """Encode obj as JSON and write it to fn, atomically if possible

    The data is first written to a temporary file next to fn, which then
    replaces fn in a single step. If anything fails, the temporary file is
    removed and the exception is re-raised, leaving any existing fn untouched.

    @param obj  JSON-serializable object
    @param fn   Path of the file to (over)write
    """
    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)

        # Best effort: give the file the permissions the current umask allows
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)

        # os.replace overwrites an existing destination on every platform, so the
        # old file never has to be deleted first (which could lose it on failure)
        os.replace(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
2021-12-30 13:23:36 +01:00
def find_xpath_attr(node, xpath, key, val=None):
    """Find the first element matching xpath[@key] or, when val is given, xpath[@key='val']"""
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = f'[@{key}]'
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
2013-07-11 16:12:08 +02:00
2013-10-12 21:34:04 +02:00
def xpath_with_ns(path, ns_map):
    """Expand `prefix:tag` steps of path to `{namespace}tag` using ns_map

    Steps without a colon are kept as they are.
    """
    def qualify(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return f'{{{ns_map[prefix]}}}{tag}'

    return '/'.join(map(qualify, path.split('/')))
2012-03-25 03:07:37 +02:00
2015-09-04 19:56:45 +02:00
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element below node matching xpath

    @param xpath    A single xpath string, or an iterable of xpaths that are
                    tried in order until one matches
    @param name     Label used in the error message (defaults to xpath)
    @param fatal    Raise ExtractorError when nothing matches and no default is given
    @param default  Value returned when nothing matches; takes precedence over fatal
    @returns        The matching element, else default, else None
    """
    candidates = [xpath] if isinstance(xpath, str) else xpath

    # Start from None so that an empty iterable of xpaths counts as "not found"
    n = None
    for xp in candidates:
        n = node.find(xp)
        if n is not None:
            break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError(f'Could not find XML element {name}')
        else:
            return None
    return n
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the first element matching xpath

    A missing element is handled by xpath_element. An element without text
    yields default if one was given, raises ExtractorError if fatal is set,
    and returns None otherwise.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element

    text = element.text
    if text is not None:
        return text
    if default is not NO_DEFAULT:
        return default
    if not fatal:
        return None
    label = xpath if name is None else name
    raise ExtractorError(f'Could not find XML element\'s text {label}')
2015-09-04 19:56:45 +02:00
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it

    When no such element exists, default is returned if one was given,
    ExtractorError is raised if fatal is set, and None is returned otherwise.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if not fatal:
        return None
    label = f'{xpath}[@{key}]' if name is None else name
    raise ExtractorError(f'Could not find XML attribute {label}')
2014-09-13 09:09:55 +02:00
2022-04-17 19:18:50 +02:00
def get_element_by_id(id, html, **kwargs):
    """Return the content of the first tag with the specified ID in the passed HTML document, or None"""
    return get_element_by_attribute(attribute='id', value=id, html=html, **kwargs)
2012-12-19 15:21:14 +01:00
2014-11-04 23:20:39 +01:00
2022-04-17 19:18:50 +02:00
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the first tag with the specified ID in the passed HTML document, or None"""
    return get_element_html_by_attribute(attribute='id', value=id, html=html, **kwargs)
2022-01-05 19:37:49 +01:00
2016-07-06 14:02:52 +02:00
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document, or None"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
2022-01-05 19:37:49 +01:00
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document, or None"""
    matches = get_elements_html_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
2022-04-17 19:18:50 +02:00
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag whose attribute has the given value, or None"""
    matches = get_elements_by_attribute(attribute, value, html, **kwargs)
    if not matches:
        return None
    return matches[0]
2022-04-17 19:18:50 +02:00
def get_element_html_by_attribute(attribute, value, html, **kwargs):
    """Return the html of the first tag whose attribute has the given value, or None"""
    matches = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    if not matches:
        return None
    return matches[0]
2022-04-17 19:18:50 +02:00
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list

    Extra keyword arguments are forwarded to get_elements_by_attribute.
    """
    # The value passed on is already a regex, so it must never be escaped again;
    # all other options given by the caller are passed through untouched
    kargs['escape_value'] = False
    return get_elements_by_attribute(
        'class', rf'[^\'"]*(?<=[\'"\s]){re.escape(class_name)}(?=[\'"\s])[^\'"]*',
        html, **kargs)
2022-01-05 19:37:49 +01:00
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # Match the class name as a whole word anywhere inside the attribute value
    class_value_re = rf'[^\'"]*(?<=[\'"\s]){re.escape(class_name)}(?=[\'"\s])[^\'"]*'
    return get_elements_html_by_attribute('class', class_value_re, html, escape_value=False)
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list"""
    return [pair[0] for pair in get_elements_text_and_html_by_attribute(*args, **kwargs)]
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list"""
    return [pair[1] for pair in get_elements_text_and_html_by_attribute(*args, **kwargs)]
2022-10-09 07:55:26 +02:00
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Yield a (text, html) tuple - the content and the whole element - for every
    tag with the specified attribute in the passed HTML document

    @param tag           Regex the tag name has to match
    @param escape_value  Whether value is a literal string (True) or already a regex (False)

    Nothing is yielded when value is empty.
    """
    if not value:
        return

    # The quotes around the attribute value are optional unless the value
    # starts with whitespace or one of "'`=<>
    quote = '?' if not re.match(r'''[\s"'`=<>]''', value) else ''

    if escape_value:
        value = re.escape(value)

    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for match in re.finditer(partial_element_re, html):
        inner, outer = get_element_text_and_html_by_tag(match.group('tag'), html[match.start():])
        # Drop one pair of matching quotes wrapping the whole content, if any
        unquoted = re.sub(
            r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', inner, flags=re.DOTALL)
        yield unescapeHTML(unquoted), outer
2013-09-13 22:05:29 +02:00
2016-03-16 16:50:04 +01:00
2022-06-24 10:10:17 +02:00
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        """Signals that the outermost opened tag has just been closed"""

    def __init__(self):
        # Names of the currently open tags, outermost first
        self.tagstack = collections.deque()
        super().__init__()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        stack = self.tagstack
        if not stack:
            raise compat_HTMLParseError('no tags in the stack')

        # Unwind until the matching opening tag is found, dropping unclosed inner tags
        while stack:
            if stack.pop() == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')

        if not stack:
            raise self.HTMLBreakOnClosingTagException
2022-01-05 19:37:49 +01:00
2022-11-06 21:59:58 +01:00
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    Raises compat_HTMLParseError if the opening tag is missing or malformed,
    or if no closing tag completes the element.
    """
    def locate(text, fragment, error):
        try:
            return text.index(fragment)
        except ValueError:
            raise error

    closing_tag = f'</{tag}>'
    whole_start = locate(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = whole_start + 1 + locate(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))

    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')

        # Feed the document up to each successive closing tag until the
        # parser reports that the outermost element has been closed
        cursor = content_start
        while cursor < len(html):
            close_start = cursor + locate(
                html[cursor:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            close_end = close_start + len(closing_tag)
            try:
                parser.feed(html[cursor:close_end])
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:close_start], html[whole_start:close_end]
            cursor = close_end
        raise compat_HTMLParseError('unexpected end of html')
2022-06-24 10:10:17 +02:00
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Attributes of the first start tag seen; empty until then
        self.attrs = {}
        super().__init__()

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        # Abort parsing: only the first element is of interest
        raise compat_HTMLParseError('done')
2016-01-02 20:49:59 +01:00
2016-03-16 16:50:04 +01:00
2022-06-24 10:10:17 +02:00
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list

    After feeding, `items` holds one attribute dict per top-level <li> element;
    <li> elements nested inside other elements are ignored.
    """

    # Elements that never have a closing tag. They must not take part in the
    # nesting count, or a single `<br>` would hide every following <li>
    _VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr'))

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        # Number of currently open (non-void) elements
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag in self._VOID_ELEMENTS:
            return
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        # Also reached for self-closing tags like `<br/>`, hence the same check
        if tag in self._VOID_ELEMENTS:
            return
        self._level -= 1
2016-01-02 20:49:59 +01:00
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    An empty dictionary is returned if no start tag is found.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    except compat_HTMLParseError:
        # Raised by the parser itself once the first start tag has been handled
        pass
    return parser.attrs
2012-04-11 00:22:51 +02:00
2016-03-16 16:50:04 +01:00
2021-11-05 17:54:56 +01:00
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list with one dictionary of attributes per element"""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
2012-04-11 00:22:51 +02:00
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Applied in order: (pattern, replacement)
    substitutions = (
        (r'\s+', ' '),  # Collapse whitespace
        (r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n'),  # <br> becomes a newline
        (r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n'),  # So does a paragraph boundary
        ('<.*?>', ''),  # Strip html tags
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)

    # Replace html entities
    return unescapeHTML(html).strip()
2012-04-11 00:22:51 +02:00
2022-06-03 17:32:31 +02:00
class LenientJSONDecoder(json.JSONDecoder):
    """JSON decoder that can preprocess its input and tolerate some malformed documents

    @param transform_source  Callable applied to the string before decoding
    @param ignore_extra      Decode the first JSON value and ignore anything after it
    @param close_objects     Number of unterminated objects/arrays the decoder may
                             try to close (each one can take two decoding attempts)

    Other arguments are passed on to json.JSONDecoder.
    """
    # TODO: Write tests

    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        """Return the document truncated at the error and repaired, or None if that is not possible"""
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        """Decode s, raising json.JSONDecodeError with a snippet of the offending text on failure"""
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                if attempt < self._close_attempts:
                    repaired = self._close_object(e)
                    if repaired is not None:
                        s = repaired
                        continue
                # e.doc is the exact string e.pos refers to (`s` may have been
                # stripped); clamp the start so that it cannot wrap around
                start = max(0, e.pos - 10)
                raise type(e)(f'{e.msg} in {e.doc[start:e.pos + 10]!r}', e.doc, e.pos)
        raise AssertionError('Too many attempts to decode JSON')
2022-06-03 17:32:31 +02:00
2012-03-25 03:07:37 +02:00
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once with
    the name returned by sanitize_path(); if that name is no different, or the
    retry fails as well, the exception is raised like the standard open()
    function would.

    A filename of '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (getattr(sys.stdout, 'buffer', sys.stdout), filename)

    for is_retry in (False, True):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking is not possible; fall back to a plain open
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if is_retry or err.errno == errno.EACCES:
                raise
            previous, filename = filename, sanitize_path(filename)
            if previous == filename:
                raise
2012-03-25 03:07:37 +02:00
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None if the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    return None if parsed is None else email.utils.mktime_tz(parsed)
2012-11-26 23:58:46 +01:00
2014-11-23 20:41:03 +01:00
2022-03-27 06:34:04 +02:00
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    @returns            The sanitized string; '' for an empty input, and never
                        empty otherwise ('_' is used as a last resort)
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Map a single character to its replacement. Substitute characters are
        # prefixed with '\0' so that they can be collapsed and stripped below
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Question marks and control characters are dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            # Characters in unicode categories C* and M* are dropped, the rest substituted
            return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the markers, keeping the substitute characters themselves
    result = result.replace('\0', '') or '_'

    # Also entered when is_id is NO_DEFAULT, since the class object is truthy only
    # for `if is_id` - `not NO_DEFAULT` is False, so this runs only for a falsy is_id
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
2012-03-25 03:07:37 +02:00
2014-11-23 20:41:03 +01:00
2024-10-13 04:10:12 +02:00
def _sanitize_path_parts ( parts ) :
sanitized_parts = [ ]
for part in parts :
if not part or part == ' . ' :
continue
elif part == ' .. ' :
if sanitized_parts and sanitized_parts [ - 1 ] != ' .. ' :
sanitized_parts . pop ( )
sanitized_parts . append ( ' .. ' )
continue
# Replace invalid segments with `#`
# - trailing dots and spaces (`asdf...` => `asdf..#`)
# - invalid chars (`<>` => `##`)
sanitized_part = re . sub ( r ' [/<>: " \ | \\ ? \ *]|[ \ s.]$ ' , ' # ' , part )
sanitized_parts . append ( sanitized_part )
return sanitized_parts
2021-02-17 20:09:38 +01:00
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    On other platforms the path is returned unchanged unless force is set,
    in which case it is sanitized as a `/`-separated path.
    """
    if sys.platform != 'win32':
        if force:
            prefix = '/' if s.startswith('/') else ''
            return prefix + '/'.join(_sanitize_path_parts(s.split('/')))
        return s

    sep = '\\'
    normed = s.replace('/', sep)

    if normed.startswith(sep * 2):
        # UNC path (`\\SERVER\SHARE`) or device path (`\\.`, `\\?`)
        pieces = normed.split(sep)
        root, parts = sep.join(pieces[:4]) + sep, pieces[4:]
    elif normed[1:2] == ':':
        # absolute path or drive relative path
        offset = 3 if normed[2:3] == sep else 2
        root, parts = normed[:offset], normed[offset:].split(sep)
    else:
        # relative/drive root relative path
        root = sep if normed[:1] == sep else ''
        parts = normed.split(sep)

    return root + sep.join(_sanitize_path_parts(parts))
2015-03-08 15:55:22 +01:00
2022-08-01 03:22:03 +02:00
def sanitize_url(url, *, scheme='http'):
    """Fix up commonly seen defects of a URL

    @param url     The URL; None is passed through as None
    @param scheme  Scheme to prepend to protocol-less (`//host/...`) URLs
    """
    if url is None:
        return None

    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return f'{scheme}:{url}'

    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        # The patterns are anchored, so a replacement happened iff the URL matched
        fixed, replaced = re.subn(mistake, fixup, url)
        if replaced:
            return fixed
    return url
2016-03-26 14:33:57 +01:00
2021-04-19 14:07:45 +02:00
def extract_basic_auth(url):
    """Split userinfo credentials out of a URL.

    @returns  (url, auth) where `url` has the `user:password@` part removed
              and `auth` is the matching `Basic ...` Authorization header
              value, or None when the URL carries no username.
    """
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None

    host = parts.hostname or ''
    if ':' in host:
        # `hostname` strips the brackets of IPv6 literals; they must be
        # restored or the rebuilt netloc (e.g. `::1:8080`) is ambiguous
        host = f'[{host}]'
    netloc = host if parts.port is None else f'{host}:{parts.port}'
    stripped_url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))

    credentials = '{}:{}'.format(parts.username, parts.password or '')
    token = base64.b64encode(credentials.encode()).decode()
    return stripped_url, f'Basic {token}'
2021-04-19 14:07:45 +02:00
2017-03-25 20:30:10 +01:00
def expand_path(s):
    """Expand shell variables and ~"""
    home_expanded = compat_expanduser(s)
    return os.path.expandvars(home_expanded)
2022-06-17 10:05:04 +02:00
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable

    First occurrences are kept in their original order. Returns a list, or
    a generator when `lazy` is set.
    """
    def _unique():
        emitted = []  # Do not use set since the items can be unhashable
        for item in iterable:
            if item in emitted:
                continue
            emitted.append(item)
            yield item

    unique = _unique()
    return unique if lazy else list(unique)
2012-03-25 03:07:37 +02:00
2014-03-24 01:40:09 +01:00
2016-06-10 09:11:55 +02:00
def _htmlentity_transform ( entity_with_semicolon ) :
2014-08-27 19:11:45 +02:00
""" Transforms an HTML entity to a character. """
2016-06-10 09:11:55 +02:00
entity = entity_with_semicolon [ : - 1 ]
2014-08-27 19:11:45 +02:00
# Known non-numeric HTML entity
2022-06-24 10:10:17 +02:00
if entity in html . entities . name2codepoint :
return chr ( html . entities . name2codepoint [ entity ] )
2014-08-27 19:11:45 +02:00
2022-08-14 14:04:13 +02:00
# TODO: HTML5 allows entities without a semicolon.
# E.g. 'Éric' should be decoded as 'Éric'.
2022-06-24 10:10:17 +02:00
if entity_with_semicolon in html . entities . html5 :
return html . entities . html5 [ entity_with_semicolon ]
2016-06-10 09:11:55 +02:00
2015-03-26 16:15:27 +01:00
mobj = re . match ( r ' #(x[0-9a-fA-F]+|[0-9]+) ' , entity )
2014-08-27 19:11:45 +02:00
if mobj is not None :
numstr = mobj . group ( 1 )
2014-11-17 07:16:12 +01:00
if numstr . startswith ( ' x ' ) :
2014-08-27 19:11:45 +02:00
base = 16
2024-06-12 01:09:58 +02:00
numstr = f ' 0 { numstr } '
2014-08-27 19:11:45 +02:00
else :
base = 10
2019-03-09 13:14:41 +01:00
# See https://github.com/ytdl-org/youtube-dl/issues/7518
2022-04-17 22:58:28 +02:00
with contextlib . suppress ( ValueError ) :
2022-06-24 10:10:17 +02:00
return chr ( int ( numstr , base ) )
2014-08-27 19:11:45 +02:00
# Unknown entity in name, return its literal representation
2024-06-12 01:09:58 +02:00
return f ' & { entity } ; '
2014-08-27 19:11:45 +02:00
2012-03-25 03:07:37 +02:00
def unescapeHTML(s):
    """Decode the `&...;` entities found in `s`; None is passed through."""
    if s is None:
        return None
    assert isinstance(s, str)

    def _decode(match):
        # group(1) is the entity text including its trailing semicolon
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^&;]+;)', _decode, s)
2012-03-25 03:07:37 +02:00
2014-01-05 03:07:55 +01:00
2021-05-23 18:34:49 +02:00
def escapeHTML(text):
    """Escape the characters `&`, `<`, `>`, `"` and `'` as HTML entities."""
    replacements = (
        ('&', '&amp;'),  # must come first so later entities are not re-escaped
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )
    for char, entity in replacements:
        text = text.replace(char, entity)
    return text
2023-06-21 05:07:42 +02:00
class netrc_from_content(netrc.netrc):
    """`netrc.netrc` populated from a string instead of a file on disk"""

    def __init__(self, content):
        # The parent initializer is deliberately not called; the state it
        # would create is set up here and the parser is fed from memory
        self.hosts = {}
        self.macros = {}
        with io.StringIO(content) as buffer:
            self._parse('-', buffer, False)
2021-10-20 18:19:40 +02:00
class Popen(subprocess.Popen):
    """`subprocess.Popen` with environment fix-ups and convenience helpers"""

    # On Windows every process is started with STARTF_USESHOWWINDOW set
    _startupinfo = None
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW

    @staticmethod
    def _fix_pyinstaller_issues(env):
        """Adjust `env` in place when running from a PyInstaller bundle"""
        if not hasattr(sys, '_MEIPASS'):
            return

        # Force spawning independent subprocesses for exes bundled with PyInstaller>=6.10
        # Ref: https://pyinstaller.org/en/v6.10.0/CHANGES.html#incompatible-changes
        #      https://github.com/yt-dlp/yt-dlp/issues/11259
        env['PYINSTALLER_RESET_ENVIRONMENT'] = '1'

        # Restore LD_LIBRARY_PATH when using PyInstaller
        # Ref: https://pyinstaller.org/en/v6.10.0/runtime-information.html#ld-library-path-libpath-considerations
        #      https://github.com/yt-dlp/yt-dlp/issues/4573
        for key in ('LD_LIBRARY_PATH', 'DYLD_LIBRARY_PATH'):  # Linux, macOS
            original = env.get(f'{key}_ORIG')
            if original is None:
                env.pop(key, None)
            else:
                env[key] = original

    def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_issues(env)

        # Remembered so that `run` knows whether to default to `str` or `bytes`
        self.__text_mode = (
            kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines'))
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')

        if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
            command = args if isinstance(args, str) else shell_quote(args, shell=True)
            shell = False
            # Set variable for `cmd.exe` newline escaping (see `utils.shell_quote`)
            env['='] = '"^\n\n"'
            args = f'{self.__comspec()} /Q /S /D /V:OFF /E:ON /C "{command}"'

        super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)

    def __comspec(self):
        """Return the absolute path of the command interpreter"""
        comspec = os.environ.get('ComSpec')
        if not comspec:
            comspec = os.path.join(os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
        if not os.path.isabs(comspec):
            raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')
        return comspec

    def communicate_or_kill(self, *args, **kwargs):
        """Like `communicate`, but kill the process if anything is raised"""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process; unless `timeout` is 0, also wait for it to exit"""
        super().kill()
        if timeout == 0:
            return
        self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run a process to completion; returns (stdout, stderr, returncode)

        Streams that produced nothing (or were not captured) are returned as
        an empty `str` or `bytes`, depending on the text mode.
        """
        with cls(*args, **kwargs) as proc:
            empty = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or empty, stderr or empty, proc.returncode
2022-06-15 22:55:43 +02:00
2021-10-20 18:19:40 +02:00
2014-05-16 15:47:54 +02:00
def encodeArgument(s):
    """Return `s` as `str`, decoding it as ASCII when it is not one already"""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
2014-05-16 15:47:54 +02:00
2021-10-19 19:28:14 +02:00
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Break a millisecond count down into hours, minutes, seconds and milliseconds"""
    total_seconds, milliseconds = divmod(msec, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return _timetuple(hours=hours, minutes=minutes, seconds=seconds, milliseconds=milliseconds)
2021-05-23 18:34:49 +02:00
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as `[H:]MM:SS`-style text

    Leading zero-valued units are dropped; `delim` separates the units and
    `msec` appends the milliseconds as a `.mmm` suffix.
    """
    parts = timetuple_from_msec(secs * 1000)
    if parts.hours:
        text = '%d%s%02d%s%02d' % (parts.hours, delim, parts.minutes, delim, parts.seconds)
    elif parts.minutes:
        text = '%d%s%02d' % (parts.minutes, delim, parts.seconds)
    else:
        text = '%d' % parts.seconds
    if msec:
        return '%s.%03d' % (text, parts.milliseconds)
    return text
2013-05-04 12:02:18 +02:00
2013-12-29 15:28:32 +01:00
2021-04-22 21:16:29 +02:00
def bug_reports_message(before=';'):
    """Build the "please report this issue" notice

    @param before  Text the notice is appended to; when it is blank or ends a
                   sentence, the notice starts with a capital letter
    """
    from ..update import REPOSITORY

    report = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
              'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    prefix = before.rstrip()
    starts_sentence = not prefix or prefix.endswith(('.', '!', '?'))
    if starts_sentence:
        report = report[0].title() + report[1:]
    if prefix:
        return prefix + ' ' + report
    return report
2015-04-17 14:55:24 +02:00
2016-10-17 13:38:37 +02:00
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Subclasses may preset a default message here
    msg = None

    def __init__(self, msg=None):
        if msg is None:
            # Fall back to the class-level message, then to the class name
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
2016-10-17 13:38:37 +02:00
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        from ..networking.exceptions import network_exceptions

        # An error raised while a network exception is being handled is expected
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie

        current = sys.exc_info()  # preserve original exception
        self.exc_info = current
        if isinstance(current[1], ExtractorError):
            self.exc_info = current[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message, assembled from the current attribute values
        pieces = (
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message(),
        )
        return ''.join(pieces)

    def format_traceback(self):
        """Return the stored traceback and the cause's traceback as text, or None"""
        own_tb = self.traceback and ''.join(traceback.format_tb(self.traceback))
        cause_tb = self.cause and ''.join(
            traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:])
        return join_nonempty(own_tb, cause_tb, delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Once a message exists, rebuild it whenever another attribute changes
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
2013-01-01 20:27:53 +01:00
2014-12-30 19:35:35 +01:00
class UnsupportedError(ExtractorError):
    """Expected error raised for a URL that is not supported"""

    def __init__(self, url):
        message = f'Unsupported URL: {url}'
        super().__init__(message, expected=True)
        self.url = url
2013-10-23 14:38:03 +02:00
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
2017-02-04 12:49:58 +01:00
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Always an expected error, whatever the caller passed
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
2022-07-26 05:53:10 +02:00
class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        # Always an expected error, whatever the caller passed
        message = msg or 'The channel is not currently live'
        super().__init__(message, **{**kwargs, 'expected': True})
2016-10-17 13:38:37 +02:00
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        # Kept so that callers can inspect the underlying exception
        self.exc_info = exc_info
2012-03-25 03:07:37 +02:00
2021-03-23 20:45:53 +01:00
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Default message, used when none is passed to the constructor
    msg = 'Entry not found in info'
2021-03-23 20:45:53 +01:00
2016-10-17 13:38:37 +02:00
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Sets an instance attribute; the class-level default is untouched
            self.msg = self.msg + f': {filename}'
        super().__init__(self.msg)
2012-03-25 03:07:37 +02:00
2016-10-17 13:38:37 +02:00
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
2014-11-23 20:41:03 +01:00
2021-10-26 16:45:12 +02:00
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Default message, used when none is passed to the constructor
    msg = 'The download was cancelled'
2021-01-13 02:01:01 +01:00
2021-10-26 16:45:12 +02:00
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Default message, used when none is passed to the constructor
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
2021-01-13 02:01:01 +01:00
2021-10-26 16:45:12 +02:00
class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    # Default message, used when none is passed to the constructor
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
2021-06-23 01:11:09 +02:00
2021-10-26 16:45:12 +02:00
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Default message, used when none is passed to the constructor
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
2021-11-28 19:57:44 +01:00
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        # Stored as given; the base class does not use it
        self.expected = expected
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Takes no arguments: the message is fixed and the error is never "expected"
        super().__init__(self.msg, expected=False)
2021-11-28 19:57:44 +01:00
2012-03-25 03:07:37 +02:00
2016-10-17 13:38:37 +02:00
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            # Sets an instance attribute; the class-level default is untouched
            self.msg = self.msg + f': {err}'
        super().__init__(self.msg)
2012-03-25 03:07:37 +02:00
2016-10-17 13:38:37 +02:00
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = f'Downloaded {downloaded} bytes, expected {expected} bytes'
        super().__init__(message)
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
2012-03-25 03:07:37 +02:00
2014-11-23 20:41:03 +01:00
2016-10-17 13:38:37 +02:00
class XAttrMetadataError(YoutubeDLError):
    """Extended-attribute failure, classified into a coarse `reason`

    `reason` is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        out_of_space = (
            self.code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg)
        if out_of_space:
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
2016-10-17 13:38:37 +02:00
class XAttrUnavailableError(YoutubeDLError):
    """Extended-attribute error distinct from `XAttrMetadataError`; adds no behavior"""
2022-09-09 19:44:20 +02:00
def is_path_like(f):
    """Whether `f` is a `str`, `bytes` or `os.PathLike` object"""
    return isinstance(f, str) or isinstance(f, bytes) or isinstance(f, os.PathLike)
2024-05-26 23:13:12 +02:00
def extract_timezone(date_str, default=None):
    """Split a trailing timezone off a date string

    @param default  Offset to use when none is found; a falsy value means a
                    zero offset, `NO_DEFAULT` means return None instead
    @returns        (timezone, date_str): the offset as `timedelta` (or None)
                    and the date string with the timezone part removed
    """
    offset_match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)

    timezone = None
    if offset_match:
        date_str = date_str[:-len(offset_match.group('tz'))]
        # A bare `Z` has no sign group and falls through to the default
        if offset_match.group('sign'):
            sign = 1 if offset_match.group('sign') == '+' else -1
            timezone = dt.timedelta(
                hours=sign * int(offset_match.group('hours')),
                minutes=sign * int(offset_match.group('minutes')))
    else:
        # No numeric offset; try a timezone abbreviation after the time
        name_match = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(name_match and name_match.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(name_match.group('tz'))]
            timezone = dt.timedelta(hours=timezone)

    if timezone is None and default is not NO_DEFAULT:
        timezone = default or dt.timedelta()

    return timezone, date_str
2015-02-12 08:55:06 +01:00
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # Fractional seconds are dropped
    date_str = re.sub(r'\.[0-9]+', '', date_str)
    timezone, date_str = extract_timezone(date_str, timezone)

    try:
        parsed = dt.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
        return calendar.timegm(parsed.timetuple())
    except (ValueError, TypeError):
        # Unparsable input yields None
        return None
2014-03-24 01:40:09 +01:00
2016-06-25 17:30:35 +02:00
def date_formats(day_first=True):
    """Return the day-first or the month-first collection of date formats"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
2014-12-12 02:57:36 +01:00
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    upload_date = None
    # Every format is tried; the last one that parses wins
    for expression in date_formats(day_first):
        try:
            upload_date = dt.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass

    if upload_date is None:
        # Fall back to email-style (RFC 2822) date parsing
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = dt.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return str(upload_date)
2013-04-27 15:14:20 +02:00
2014-11-23 20:41:03 +01:00
2016-06-25 17:30:35 +02:00
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string, or None"""
    if not isinstance(date_str, str):
        return None

    # Drop commas, pipes and weekday names, then collapse whitespace
    without_weekday = re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?|sun)(day)?', '', date_str)
    date_str = re.sub(r'\s+', ' ', without_weekday)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    tz_match = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if tz_match:
        date_str = date_str[:-len(tz_match.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    nano_match = re.search(
        r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if nano_match:
        date_str = nano_match.group(1)

    # The first format that parses wins
    for expression in date_formats(day_first):
        try:
            parsed = dt.datetime.strptime(date_str, expression) - timezone + dt.timedelta(hours=pm_delta)
            return calendar.timegm(parsed.timetuple())
        except ValueError:
            continue

    # Fall back to email-style (RFC 2822) date parsing
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
    return None
2016-06-25 17:30:35 +02:00
2014-11-17 07:16:12 +01:00
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to `default_ext`"""
    if url is None or '.' not in url:
        return default_ext

    # Text after the last dot of the part preceding the query string
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = candidate.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
2013-07-08 01:13:55 +02:00
2014-11-23 20:41:03 +01:00
2019-10-17 23:03:53 +02:00
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Derive a subtitle file name by swapping the extension for `<sub_lang>.<sub_format>`"""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
2013-07-20 12:48:57 +02:00
2014-11-23 20:41:03 +01:00
2021-04-06 08:45:15 +02:00
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(dt.datetime.now(dt.timezone.utc), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - dt.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is None:
        # Plain DATE without an offset
        return datetime_round(dt.datetime.strptime(date_str, format), precision)

    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
    unit = match.group('unit')
    if unit in ('month', 'year'):
        # Calendar-aware arithmetic; the result is then treated as day precision
        new_date = datetime_add_months(start_time, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit = 'day'
            amount *= 7
        new_date = start_time + dt.timedelta(**{unit + 's': amount})

    return datetime_round(new_date, unit) if auto_precision else new_date
2021-04-06 08:45:15 +02:00
2022-02-11 22:10:49 +01:00
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
2024-02-25 01:16:34 +01:00
def datetime_add_months(dt_, months):
    """Increment/Decrement a datetime object by months."""
    # Work with a zero-based month index so that divmod handles year rollover
    year_shift, month_index = divmod(dt_.month + months - 1, 12)
    year = dt_.year + year_shift
    month = month_index + 1
    # Clamp the day to the length of the target month (e.g. Jan 31 -> Feb 29)
    last_day = calendar.monthrange(year, month)[1]
    return dt_.replace(year, month, min(dt_.day, last_day))
2021-04-06 08:45:15 +02:00
2024-02-25 01:16:34 +01:00
def datetime_round(dt_, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt_

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    epoch = calendar.timegm(dt_.timetuple())
    step = seconds_per_unit[precision]
    # Round half up to the nearest multiple of `step`
    rounded = ((epoch + step / 2) // step) * step
    return dt.datetime.fromtimestamp(rounded, dt.timezone.utc)
2014-11-23 20:41:03 +01:00
2014-01-02 13:47:28 +01:00
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        # Anything not in the expected format is returned unchanged
        return date_str
    return '-'.join(match.groups())
2014-11-23 20:41:03 +01:00
2022-04-11 17:10:28 +02:00
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # A missing bound defaults to the earliest/latest representable date
        self.start = dt.datetime.min.date() if start is None else date_from_str(start, strict=True)
        self.end = dt.datetime.max.date() if end is None else date_from_str(end, strict=True)
        if self.start > self.end:
            raise ValueError(f'Date range: "{self}" , the start date must be before the end date')

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        day = date if isinstance(date, dt.date) else date_from_str(date)
        return self.start <= day <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __str__(self):
        return f'{self.start} to {self.end}'

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return self.start == other.start and self.end == other.end
2013-08-28 12:57:10 +02:00
2022-06-29 03:13:24 +02:00
@functools.cache
def system_identifier():
    """Return a one-line description of the Python runtime and platform (cached)"""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_ver = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_ver = []

    return 'Python {} ({} {} {}) - {} ({}{})'.format(
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
2013-08-28 18:22:28 +02:00
2022-05-19 16:06:31 +02:00
@functools.cache
def get_windows_version():
    """Return the Windows version parsed by version_tuple, or () when not running on Windows"""
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
2021-09-23 19:40:51 +02:00
2014-04-07 19:57:42 +02:00
def write_string(s, out=None, encoding=None):
    """
    Write the string s to a stream and flush it

    @param out          Target stream; sys.stderr is used when falsy
    @param encoding     Encoding to use when the stream takes bytes; if not given,
                        the stream's own encoding or preferredencoding() is used
    """
    assert isinstance(s, str)
    if not out:
        out = sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    # `mode` might be `None` (Ref: https://github.com/yt-dlp/yt-dlp/issues/8816)
    if 'b' in (getattr(out, 'mode', None) or ''):
        # Binary stream: encode and write directly
        target, enc = out, encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: bypass the text layer
        target = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
    else:
        # Plain text-only stream: write the string as is
        target, enc = out, None

    payload = s.encode(enc, 'ignore') if enc else s
    target.write(payload)
    out.flush()
2023-07-09 09:53:02 +02:00
# TODO: Use global logger
2022-08-30 17:28:28 +02:00
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """
    Report use of a deprecated feature

    When running as CLI, each distinct msg is reported only once, through printer
    if given and write_string otherwise (kwargs are forwarded to either).
    Otherwise a DeprecationWarning is issued via the warnings module.
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    seen = deprecation_warning._cache
    if msg in seen:
        return
    seen.add(msg)
    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# Messages that have already been reported in CLI mode
deprecation_warning._cache = set()
2013-08-28 14:28:55 +02:00
def bytes_to_intlist(bs):
    """Convert a byte string (or a string of characters) into a list of integers"""
    if not bs:
        return []
    # Indexing bytes already yields ints; anything else is converted item by item
    return list(bs) if isinstance(bs[0], int) else list(map(ord, bs))
2013-08-28 18:22:28 +02:00
2013-08-28 15:59:07 +02:00
def intlist_to_bytes(xs):
    """Pack a sequence of integers (each one unsigned byte) into a bytes object"""
    if not xs:
        return b''
    layout = f'{len(xs)}B'  # one unsigned char per item
    return struct.pack(layout, *xs)
2013-10-02 08:41:03 +02:00
2022-05-27 01:06:23 +02:00
class LockingUnsupportedError(OSError):
    """OSError carrying a fixed message, raised when file locking is unavailable"""

    # Message passed to OSError; kept as a class attribute so it can be inspected or overridden
    msg = 'File locking is not supported'

    def __init__(self):
        message = self.msg
        super().__init__(message)
2013-10-06 04:27:09 +02:00
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        """ctypes layout of the structure passed as last argument to LockFileEx/UnlockFileEx"""
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED),  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED),  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range (low/high DWORD) used for both locking and unlocking
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """
        Lock the open file f using LockFileEx

        @param exclusive    Take an exclusive lock instead of a shared one
        @param block        Wait for the lock instead of failing immediately
        Raises BlockingIOError if the lock could not be acquired
        """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object since the same structure is needed for unlocking
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock taken by _lock_file on f. Raises OSError on failure"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError(f'Unlocking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """
            Lock the open file f using flock(), falling back to lockf()

            @param exclusive    Take an exclusive lock instead of a shared one
            @param block        Wait for the lock instead of failing immediately
            Raises BlockingIOError if a non-blocking lock could not be acquired
            """
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB

            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # The lock is held by someone else; falling back would not help
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Release the lock on f, trying each known unlocking variant in turn"""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            """Locking is unavailable without fcntl; always raises LockingUnsupportedError"""
            raise LockingUnsupportedError

        def _unlock_file(f):
            """Locking is unavailable without fcntl; always raises LockingUnsupportedError"""
            raise LockingUnsupportedError
2013-10-06 04:27:09 +02:00
2022-04-11 17:10:28 +02:00
class locked_file:
    """
    File wrapper that holds a lock on the file while it is in use

    The file is opened on construction; the lock is taken by __enter__() / open()
    and released by __exit__() / close(). Modes containing 'r' take a shared lock,
    all other modes an exclusive one. Unknown attributes are forwarded to the
    underlying file object.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        """
        @param mode         One of 'r', 'rb', 'a', 'ab', 'w', 'wb'
        @param block        Whether acquiring the lock should wait for other holders
        @param encoding     Passed on to os.fdopen
        Raises NotImplementedError for any other mode
        """
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # The file was deliberately opened without O_TRUNC; truncate now that we hold the lock
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    # __exit__ is not invoked when __enter__ raises, so release
                    # the lock and the file here instead of leaving them held
                    with contextlib.suppress(OSError):
                        self.unlock()
                    self.f.close()
                    raise
        return self

    def unlock(self):
        """Release the lock if it is held; the file itself stays open"""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Only reached when normal lookup fails. `f` itself is missing when
        # __init__ did not complete; delegating to it would recurse forever
        if attr == 'f':
            raise AttributeError(attr)
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
2022-02-05 11:45:51 +01:00
2013-10-12 13:49:27 +02:00
2022-05-19 16:06:31 +02:00
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), substituting 'utf-8' if it reports None"""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
2024-04-27 10:37:26 +02:00
# Translation used by `shell_quote` on Windows when not quoting for a shell:
# embedded double quotes are escaped with a backslash
_WINDOWS_QUOTE_TRANS = str.maketrans({'"': R'\"'})
# Translation used by `shell_quote` on Windows when `shell=True`
_CMD_QUOTE_TRANS = str.maketrans({
    # Keep quotes balanced by replacing them with `""` instead of `\\"`
    '"': '""',
    # These require an env-variable `=` containing `"^\n\n"` (set in `utils.Popen`)
    # `=` should be unique since variables containing `=` cannot be set using cmd
    '\n': '%=%',
    '\r': '%=%',
    # Use zero length variable replacement so `%` doesn't get expanded
    # `cd` is always set as long as extensions are enabled (`/E:ON` in `utils.Popen`)
    '%': '%%cd:~,%',
})
def shell_quote(args, *, shell=False):
    """
    Join one or more arguments into a single, quoted command line string

    @param args     A single argument or an iterable of arguments
    @param shell    On Windows, use the `cmd` translation table instead of the plain one
    """
    args = list(variadic(args))
    if compat_os_name != 'nt':
        return shlex.join(args)

    trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS

    def quote(arg):
        if re.fullmatch(r'[\w#$*\-+./:?@\\]+', arg, re.ASCII):
            return arg  # Only safe characters; no quoting needed
        # Double any run of backslashes that precedes a quote or the end of the argument
        doubled = re.sub(r'(\\+)("|$)', r'\1\1\2', arg)
        return f'"{doubled.translate(trans)}"'

    return ' '.join(map(quote, args))
2013-10-15 12:05:13 +02:00
def smuggle_url(url, data):
    """
    Pass additional data in a URL for internal use.

    @param url      The URL; data that was already smuggled into it is preserved
    @param data     JSON-serializable dict to attach. It is not modified
    @returns        url with the merged data appended as a fragment
    """
    url, idata = unsmuggle_url(url, {})
    # Merge into a new dict so that the caller's `data` is left untouched.
    # Values already present in the URL take precedence, as they always did
    merged = {**data, **idata}
    sdata = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    """
    Split a URL produced by smuggle_url into the plain URL and the smuggled data

    @returns    (url, data); (smug_url, default) if nothing was smuggled
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
2013-11-25 03:12:26 +01:00
2021-12-23 02:14:42 +01:00
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """
    Formats numbers with decimal suffixes like K, M, etc

    @param num      The number to format; anything float_or_none can convert
    @param fmt      %-format string receiving (scaled number, suffix)
    @param factor   Step between two suffixes. 1024 selects the "Ki", "Mi", ... suffixes
    @returns        The formatted string, or None if num is not a number or is negative
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to the available suffixes. The lower bound matters for values below
        # 1/factor: their negative exponent would otherwise index the suffix
        # list from the end (e.g. 0.0005 was rendered as "0Y")
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2021-12-23 02:14:42 +01:00
2013-11-25 03:12:26 +01:00
def format_bytes(bytes):
    """Format a byte count with 1024-based suffixes; 'N/A' if format_decimal_suffix gives no result"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2013-12-06 13:36:36 +01:00
2013-12-09 18:29:07 +01:00
2022-11-17 04:10:34 +01:00
def lookup_unit_table(unit_table, s, strict=False):
    """
    Parse "<number><unit>" from s and scale the number by the unit's multiplier

    @param unit_table   Mapping of unit name to multiplier
    @param strict       Require the whole string to match and only accept the
                        unmodified NUMBER_RE; otherwise match at the start of s
                        and also accept "," as decimal separator
    @returns            The rounded product, or None if s does not match
    """
    if strict:
        num_re, matcher = NUMBER_RE, re.fullmatch
    else:
        num_re, matcher = NUMBER_RE.replace(R'\.', '[,.]'), re.match
    units_re = '|'.join(map(re.escape, unit_table))

    m = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None

    num = float(m.group('num').replace(',', '.'))
    return round(num * unit_table[m.group('unit')])
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    # '' -> 1, 'K' -> 1024, 'M' -> 1024**2, ...
    multipliers = {}
    for exponent, unit in enumerate(('', *'KMGTPEZY')):
        multipliers[unit] = 1024 ** exponent
    return lookup_unit_table(multipliers, s.upper(), strict=True)
2016-03-13 11:27:20 +01:00
2014-11-25 09:54:54 +01:00
def parse_filesize(s):
    """Parse a human-readable file size such as "5 MiB" or "1.2GB" into a number of bytes"""
    if s is None:
        return None

    _UNIT_TABLE = {'B': 1, 'b': 1, 'bytes': 1}
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        lower = letter.lower()
        decimal, binary = 1000 ** power, 1024 ** power
        # The lower-case forms are of course incorrect and unofficial,
        # but we support those too
        _UNIT_TABLE.update({
            f'{letter}iB': binary,
            f'{letter}B': decimal,
            f'{lower}B': binary,
            f'{letter}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(_UNIT_TABLE, s)
def parse_count(s):
    """Parse a count such as "1,234", "1.5M" or "Views 2k" into an integer; None if not possible"""
    if s is None:
        return None

    # Drop a leading non-numeric label together with the whitespace following it
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {}
    for unit, power in (('k', 1), ('m', 2), ('kk', 2), ('b', 3)):
        _UNIT_TABLE[unit] = _UNIT_TABLE[unit.upper()] = 1000 ** power

    ret = lookup_unit_table(_UNIT_TABLE, s)
    if ret is not None:
        return ret

    # Fall back to a bare number followed by whitespace or the end of the string
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if mobj:
        return str_to_int(mobj.group(1))
2014-11-25 09:54:54 +01:00
2016-03-13 12:23:08 +01:00
2022-04-01 12:31:58 +02:00
def parse_resolution(s, *, lenient=False):
    """
    Extract video dimensions from a string

    Recognizes "<width>x<height>", "<height>p"/"<height>i" and "4k"/"8k", in that order.

    @param lenient  Also accept "<width>x<height>" when directly surrounded by letters/digits
    @returns        Dict with "width" and/or "height"; empty if nothing was recognized
    """
    if s is None:
        return {}

    if lenient:
        dimensions_re = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
    else:
        dimensions_re = r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])'
    found = re.search(dimensions_re, s)
    if found:
        width, height = map(int, found.group('w', 'h'))
        return {'width': width, 'height': height}

    found = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if found:
        return {'height': int(found.group(1))}

    found = re.search(r'\b([48])[kK]\b', s)
    if found:
        return {'height': 540 * int(found.group(1))}

    return {}
2019-03-17 03:07:47 +01:00
def parse_bitrate(s):
    """Return the number preceding "kbps" in s as an int; None if s is not a string or has none"""
    if isinstance(s, str):
        found = re.search(r'\b(\d+)\s*kbps', s)
        if found is not None:
            return int(found.group(1))
    return None
2016-09-02 18:31:52 +02:00
def month_by_name(name, lang='en'):
    """
    Return the number (1-12) of the month with the given full name, or None if unknown

    The names are looked up in MONTH_NAMES[lang], independently of the system locale;
    the English names are used for languages missing from MONTH_NAMES.
    """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    for number, month in enumerate(month_names, 1):
        if month == name:
            return number
    return None
def month_by_abbreviation(abbrev):
    """
    Return the number (1-12) of the month whose English name starts with the
    given three-letter abbreviation (locale-independently), or None if unknown
    """
    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month[:3] == abbrev:
            return number
    return None
2013-12-10 21:03:53 +01:00
2014-01-20 22:11:34 +01:00
def fix_xml_ampersands(xml_str):
    """
    Replace all the bare '&' by '&amp;' in XML

    Ampersands that already start one of the predefined entities (amp, lt, gt,
    apos, quot) or a numeric character reference are left alone.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
2013-12-16 05:04:12 +01:00
def setproctitle(title):
    """
    Set the name of the current process using prctl(PR_SET_NAME) from libc.so.6

    Does nothing if ctypes, the library or the prctl function is not available.
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initializing the buffer from the bytes makes it one item larger than the
    # title, so what prctl reads is always NUL-terminated. A buffer sized
    # exactly to the title has no room for the terminator
    buf = ctypes.create_string_buffer(title.encode())
    try:
        # PR_SET_NAME = 15      Ref: /usr/include/linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
2013-12-16 13:56:13 +01:00
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged if it is None or lacks the prefix"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
2013-12-17 04:13:36 +01:00
2014-08-22 18:40:26 +02:00
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged if it is None or lacks the suffix"""
    # Every string ends with the empty string, and `s[:-0]` is empty rather
    # than all of s, so an empty suffix has to be excluded explicitly
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
2014-08-22 18:40:26 +02:00
2015-12-14 16:30:58 +01:00
def remove_quotes(s):
    """Strip one pair of matching single or double quotes enclosing s, if there is one"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2020-10-09 07:06:49 +02:00
def get_domain(url):
    """
    Return the network location of url without a leading "www.", or None if it is empty

    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
2020-10-09 07:06:49 +02:00
2013-12-17 04:13:36 +01:00
def url_basename(url):
    """Return the last component of the URL's path, ignoring leading and trailing slashes"""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
2013-12-20 17:05:28 +01:00
2016-11-01 20:14:01 +01:00
def base_url(url):
    """
    Return the http(s) URL up to and including the last "/" before any query or fragment

    Raises AttributeError if url does not match
    """
    matched = re.match(r'https?://[^?#]+/', url)
    return matched.group()
2016-11-01 20:14:01 +01:00
2016-12-12 20:23:49 +01:00
def urljoin(base, path):
    """
    Join path onto base; bytes arguments are decoded first

    @returns    path itself if it already is an absolute or protocol-relative URL,
                the joined URL if base is an http(s) or protocol-relative URL,
                and None otherwise (including an empty or non-string path)
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not (isinstance(path, str) and path):
        return None
    if re.match(r'(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str):
        return None
    if not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
2016-12-12 20:23:49 +01:00
2024-10-13 05:14:32 +02:00
def partial_application(func):
    """
    Decorator that lets func be called with only part of its arguments

    If the given arguments cannot be bound to func's signature, a
    functools.partial holding them is returned instead of calling func.
    """
    signature = inspect.signature(func)

    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        try:
            signature.bind(*args, **kwargs)
        except TypeError:
            # The arguments do not (yet) form a valid call; defer it
            return functools.partial(func, *args, **kwargs)
        return func(*args, **kwargs)

    return wrapped
@partial_application
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1, base=None):
    """
    Convert v to an int, returning default if that fails

    @param scale        The result is floor-divided by this
    @param get_attr     If given, convert this attribute of v instead of v itself
    @param invscale     The result is multiplied by this
    @param base         If given, passed to int() as the number base
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if invscale == 1 and scale < 1:
        # Express a fractional scale as a multiplication instead
        invscale, scale = int(1 / scale), 1
    try:
        number = int(v) if base is None else int(v, base=base)
        return number * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
2014-07-21 12:02:44 +02:00
2014-08-10 13:04:45 +02:00
2014-08-10 11:00:14 +02:00
def str_or_none(v, default=None):
    """Return str(v), or default if v is None"""
    if v is None:
        return default
    return str(v)
2014-08-10 11:00:14 +02:00
2014-07-21 12:02:44 +02:00
def str_to_int(int_str):
    """A more relaxed version of int_or_none: "," "." and "+" in strings are ignored"""
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
2013-12-26 13:49:44 +01:00
2024-10-13 05:14:32 +02:00
@partial_application
def float_or_none(v, scale=1, invscale=1, default=None):
    """
    Convert v to a float, returning default if v is None or cannot be converted

    @param scale        The result is divided by this
    @param invscale     The result is multiplied by this
    """
    if v is None:
        return default
    if invscale == 1 and scale < 1:
        # Express a fractional scale as a multiplication instead
        invscale, scale = int(1 / scale), 1
    try:
        number = float(v)
        return number * invscale / scale
    except (ValueError, TypeError):
        return default
2014-03-28 23:06:34 +01:00
2017-09-10 14:08:39 +02:00
def bool_or_none(v, default=None):
    """Return v if it is a bool, else default"""
    if isinstance(v, bool):
        return v
    return default
2019-05-23 18:58:35 +02:00
def strip_or_none(v, default=None):
    """Return v stripped of surrounding whitespace if it is a string, else default"""
    if isinstance(v, str):
        return v.strip()
    return default
2016-06-25 17:32:02 +02:00
2018-07-21 13:01:06 +02:00
def url_or_none(url):
    """
    Return url stripped of surrounding whitespace if it is a string starting with
    "//" or with an http(s), rtmp*, rtsp*, mms or ftp(s) scheme; None otherwise
    """
    if not (url and isinstance(url, str)):
        return None
    candidate = url.strip()
    if re.match(r'(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
2018-07-21 13:01:06 +02:00
2023-06-21 05:51:20 +02:00
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """
    Format a point in time with strftime, returning default if that fails

    @param timestamp    A unix timestamp in seconds (int/float) or a "YYYYMMDD" string
    @param date_format  strftime format; "%s" is substituted by the unix timestamp
    @param default      Returned for unsupported types, unparsable strings and
                        timestamps outside of the range datetime can represent
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, dt.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            datetime_object = (dt.datetime.fromtimestamp(0, dt.timezone.utc)
                               + dt.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = dt.datetime.strptime(timestamp, '%Y%m%d')
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError):
        # OverflowError: the timedelta or the resulting date is out of range,
        # e.g. for infinity or for milliseconds passed as seconds
        return default
2013-12-26 13:49:44 +01:00
def parse_duration(s):
    """
    Parse a duration string into a number of seconds

    Three notations are tried in order: clock-like ("1:02:03.5"), unit-based
    including ISO 8601 ("1h 2m 3s", "PT1H2M3S") and a single fractional
    hours/minutes value ("2.5 hours"). Years, months and weeks are accepted by
    the unit-based notation but not counted.

    @returns    The duration as a float, or None if s is not a string or is not understood
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None
    if m := re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s):
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    elif m := re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s):
        days, hours, mins, secs, ms = m.groups()
    elif m := re.match(
            r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s):
        hours, mins = m.groups()
    else:
        return None

    if ms:
        # The clock-like notation also allows ":" in front of the fraction
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2014-01-03 12:52:27 +01:00
2024-07-02 00:52:50 +02:00
def _change_extension(prepend, filename, ext, expected_real_ext=None):
    """
    Replace the extension of filename with ext, or insert ext in front of it

    @param prepend              Insert ext before the existing extension instead of replacing it
    @param expected_real_ext    If given and the existing extension differs from it,
                                ext is appended to the unmodified filename instead
    ext is passed through _UnsafeExtensionError.sanitize_extension in every case
    """
    stem, real_ext = os.path.splitext(filename)

    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{_UnsafeExtensionError.sanitize_extension(ext)}'

    if prepend and real_ext:
        _UnsafeExtensionError.sanitize_extension(ext, prepend=True)
        return f'{stem}.{ext}{real_ext}'

    return f'{stem}.{_UnsafeExtensionError.sanitize_extension(ext)}'


prepend_extension = functools.partial(_change_extension, True)
replace_extension = functools.partial(_change_extension, False)
2015-05-02 19:23:06 +02:00
2014-01-07 06:23:41 +01:00
def check_executable(exe, args=[]):
    """
    Check that the given binary can be run

    @param args     Arguments that make the binary exit quickly (like -version)
    @returns        exe if running it did not raise OSError, else False
    """
    try:
        Popen.run([exe, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
2014-01-20 11:36:47 +01:00
2022-11-11 04:13:08 +01:00
def _get_exe_version_output(exe, args):
    """
    Run exe with args and capture its combined stdout/stderr

    @returns    The output; None if the exit code was non-zero; False if running raised OSError
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, ret = Popen.run(
            [encodeArgument(exe), *args], text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if ret else stdout
2014-12-14 21:59:59 +01:00
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """
    Extract the version from the output of an executable

    @param version_re       Regex whose first group is the version.
                            By default, the word following "version" is used
    @param unrecognized     Returned when the regex does not match
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2021-11-03 19:53:48 +01:00
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """Returns the version of the specified executable,
    or False if the executable is not present

    @param args          arguments passed to the executable (only read, never mutated)
    @param version_re    forwarded to detect_exe_version
    @param unrecognized  one or two values: the first is used when the output does not
                         match version_re, the last when _get_exe_version_output gives None
    """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    out = _get_exe_version_output(exe, args)
    if out is None:
        return unrecognized[-1]
    # A falsy `out` (False for a missing executable, or empty output) is returned as-is
    return out and detect_exe_version(out, version_re, unrecognized[0])
2021-11-03 19:53:48 +01:00
2022-06-17 06:48:21 +02:00
def frange(start=0, stop=None, step=1):
    """Float range

    Like range(), but the bounds and the step may be floats (including infinity).
    With a single argument, that argument is the stop value and counting starts at 0.
    A step of 0 yields nothing.
    """
    if stop is None:
        start, stop = 0, start
    # Multiplying both sides by the sign lets a single comparison serve both directions
    sign = [-1, 1][step > 0] if step else 0
    while sign * start < sign * stop:
        yield start
        start += step
2021-07-23 17:02:48 +02:00
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):  # noqa: A001
        """Raised by __getitem__ when the requested index does not exist"""

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        # Items already pulled from the iterable, always in forward order
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Walk the cache by position and only pull from the iterable when the cache
        # runs out. This keeps several iterators over the same LazyList consistent:
        # items cached by one consumer are still seen by the others
        pos = 0
        while True:
            if pos >= len(self._cache):
                self._cache.extend(itertools.islice(self._iterable, 1))
                if pos >= len(self._cache):
                    return
            yield self._cache[pos]
            pos += 1

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # ~x maps index i counted from one end to the same item counted from the other end
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Only evaluate as many items as the highest requested index needs
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Checking for a first item avoids evaluating the whole iterable
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # The new object shares both the underlying iterable and the cache
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2021-05-28 18:49:13 +02:00
2021-08-10 00:10:40 +02:00
class PagedList:
    """Base class for lists whose items are fetched one page at a time

    Subclasses must implement _getslice(start, end).
    """

    class IndexError(IndexError):  # noqa: A001
        """Raised by __getitem__ when the requested index has no entry"""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc  # called with a page number, returns an iterable of items
        self._pagesize = pagesize
        self._pagecount = float('inf')  # pages beyond this number are treated as empty
        self._use_cache = use_cache
        self._cache = {}  # page number -> list of that page's items

    def getpage(self, pagenum):
        """Return the items of page `pagenum` as a list, using the cache when possible"""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the items from index `start` up to (excluding) `end` as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError
        return entries[0]

    def __bool__(self):
        # Only the first item needs to be looked up to know whether the list is empty
        return bool(self.getslice(0, 1))
2014-09-29 00:36:06 +02:00
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        # An empty (or inverted) range must not fetch or yield anything
        if end is not None and end <= start:
            return

        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested range inside the current page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the last page that could be fetched, then let the error propagate
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2014-02-09 17:56:10 +01:00
2014-09-29 00:36:06 +02:00
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        # Caching is always enabled for this variant
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        # Items to drop from the beginning of the first page
        skip_elems = start - start_page * self._pagesize
        # Number of items still to be yielded; None means "until the last page"
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None

            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break

            yield from page_results
2014-09-29 00:36:06 +02:00
2022-06-17 06:48:21 +02:00
class PlaylistEntries:
    """Access the entries of a playlist info dict using 1-based playlist indices"""

    MissingEntry = object()  # placeholder for positions that were not requested
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Rebuild a full-length list, leaving MissingEntry where nothing was requested
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or a slice for every comma-separated segment of `string`

        @raises ValueError  on empty segments, invalid segments or a zero step
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (playlist_index, entry) for the items selected by the ydl params"""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total number of entries if it can be told, otherwise None"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Build, once, a function that maps a 0-based index to an entry and
        # raises self.IndexError when the index is past the end
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError
        return get_entry

    def __getitem__(self, idx):
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # The stop index is inclusive, so move it one step further in the direction of travel
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):  # noqa: A001
        """Raised internally when an index is past the end of the entries"""
2014-02-09 17:56:10 +01:00
def uppercase_escape(s):
    r"""Replace every \UXXXXXXXX escape (8 hex digits) in `s` with the character it denotes"""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
2015-05-04 15:53:05 +02:00
def lowercase_escape(s):
    r"""Replace every \uXXXX escape (4 hex digits) in `s` with the character it denotes"""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
2014-02-15 16:24:43 +01:00
2014-09-13 15:59:16 +02:00
2022-11-06 20:05:09 +01:00
def parse_qs(url, **kwargs):
    """Parse the query string of `url` into a dict of lists

    Extra keyword arguments are passed on to urllib.parse.parse_qs
    """
    return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
2021-08-22 21:02:00 +02:00
2014-02-25 01:43:17 +01:00
def read_batch_urls(batch_fd):
    """Read the URLs listed in `batch_fd` (one per line) and close it afterwards

    Blank lines and lines starting with "#", ";" or "]" are skipped.
    A leading BOM, surrounding whitespace and a trailing comment introduced by
    whitespace followed by "#" are removed from each URL.
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # The BOM shows up either as its raw UTF-8 bytes or as the decoded character
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2014-03-07 15:25:33 +01:00
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments with urllib.parse.urlencode and return the result as ASCII bytes"""
    return urllib.parse.urlencode(*args, **kargs).encode('ascii')
2014-03-10 17:31:32 +01:00
2023-02-17 12:21:34 +01:00
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  update query
       @returns             str
    """
    if isinstance(url, str):
        if not kwargs and not query_update:
            # Nothing to change; hand the string back untouched
            return url
        else:
            url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        # Existing parameters are kept unless overridden by query_update
        kwargs['query'] = urllib.parse.urlencode({
            **urllib.parse.parse_qs(url.query),
            **query_update,
        }, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
2016-03-03 18:34:52 +01:00
def update_url_query(url, query):
    """Return `url` with its query updated by `query` (shorthand for update_url)"""
    return update_url(url, query_update=query)
2015-09-06 03:22:20 +02:00
2015-12-20 01:26:26 +01:00
2017-05-06 13:06:18 +02:00
def _multipart_encode_impl ( data , boundary ) :
2024-06-12 01:09:58 +02:00
content_type = f ' multipart/form-data; boundary= { boundary } '
2017-05-01 17:09:18 +02:00
out = b ' '
for k , v in data . items ( ) :
out + = b ' -- ' + boundary . encode ( ' ascii ' ) + b ' \r \n '
2022-06-24 12:54:43 +02:00
if isinstance ( k , str ) :
2022-05-09 13:54:28 +02:00
k = k . encode ( )
2022-06-24 12:54:43 +02:00
if isinstance ( v , str ) :
2022-05-09 13:54:28 +02:00
v = v . encode ( )
2017-05-01 17:09:18 +02:00
# RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
# suggests sending UTF-8 directly. Firefox sends UTF-8, too
2017-05-05 14:51:59 +02:00
content = b ' Content-Disposition: form-data; name= " ' + k + b ' " \r \n \r \n ' + v + b ' \r \n '
2017-05-01 17:09:18 +02:00
if boundary . encode ( ' ascii ' ) in content :
raise ValueError ( ' Boundary overlaps with data ' )
out + = content
out + = b ' -- ' + boundary . encode ( ' ascii ' ) + b ' -- \r \n '
return out , content_type
def multipart_encode(data, boundary=None):
    """
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    """
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            out, content_type = _multipart_encode_impl(data, boundary)
            break
        except ValueError:
            # A caller-supplied boundary that clashes with the data is an error;
            # a random one is simply regenerated
            if has_specified_boundary:
                raise
            boundary = None

    return out, content_type
2023-04-30 19:50:22 +02:00
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Whether `x` is an instance of `allowed_types` but not of `blocked_types`

    When `blocked_types` is not given, str, bytes and mappings are blocked.
    """
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
def variadic(x, allowed_types=NO_DEFAULT):
    """Return `x` unchanged if it is iterable-like, otherwise wrap it in a 1-tuple

    `allowed_types` are the iterable types that should still be wrapped;
    it is passed to is_iterable_like as `blocked_types`.
    """
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
2022-10-04 06:23:11 +02:00
2022-03-31 09:49:16 +02:00
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn and return the first acceptable result

    Every function is called as f(*args, **kwargs). A function that raises
    AttributeError, KeyError, TypeError, IndexError, ValueError or
    ZeroDivisionError is skipped, as is a result that is not an instance of
    `expected_type` (when given). Returns None if no function succeeds.
    The default `args`/`kwargs` are only unpacked, never mutated.
    """
    for f in funcs:
        try:
            val = f(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            pass
        else:
            if expected_type is None or isinstance(val, expected_type):
                return val
def try_get(src, getter, expected_type=None):
    """Apply one getter, or several in turn, to `src` via try_call and return the first acceptable result"""
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2016-06-12 01:05:34 +02:00
2022-03-28 04:51:45 +02:00
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of `dct` for which cndn(key, value) is truthy

    By default, items whose value is None are dropped.
    """
    return {k: v for k, v in dct.items() if cndn(k, v)}
2018-04-27 21:47:17 +02:00
def merge_dicts(*dicts):
    """Merge dicts, giving priority to the earlier ones

    A value is taken only if it is not None and the key is not set yet,
    or if it is a str and the value stored so far is the empty string.
    """
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if (v is not None and k not in merged
                    or isinstance(v, str) and merged[k] == ''):
                merged[k] = v
    return merged
2015-12-20 01:26:26 +01:00
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as str, decoding it with `encoding` if it is not a str already

    NOTE: the default encoding is evaluated once, when the function is defined.
    """
    return string if isinstance(string, str) else str(string, encoding, errors)
2015-12-20 01:26:26 +01:00
2015-09-06 03:22:20 +02:00
2014-03-21 00:59:51 +01:00
# Minimum age associated with each US film rating
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Minimum age associated with each TV Parental Guidelines rating
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    """Convert an age rating to a numeric age limit

    Accepts an int between 0 and 21, a string such as "18" or "18+",
    a key of US_RATINGS, or a TV rating written with "-", "_" or no separator
    (case-insensitive). Returns None for anything else.
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    elif not isinstance(s, str):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    # k[3:] strips the "TV-" prefix so that the separator can be matched loosely
    m = re.match(r'^TV[_-]?({})$'.format('|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)), s)
    if m:
        return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
    return None
2014-10-03 14:37:25 +02:00
2014-03-24 23:21:20 +01:00
def strip_jsonp(code):
    """Strip a JSONP wrapper and return only the callback's argument text

    Handles an optional "window." prefix, the "name && name(...)" form,
    a trailing semicolon and trailing "//" comments.
    """
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
2014-04-21 07:12:02 +02:00
2022-08-14 01:21:54 +02:00
def js_to_json(code, vars={}, *, strict=False):
    """Convert a JavaScript object/array literal into JSON text

    @param code    the JavaScript source
    @param vars    dict of var, val pairs to substitute (only read, never mutated)
    @param strict  if True, raise ValueError for identifiers that cannot be
                   resolved instead of quoting them as strings
    @returns       str
    """
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (regex, base) pairs for integer literals that JSON cannot express directly
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Group 1 is a bare double quote, group 2 the character after a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Evaluate the expression inside ${...} of a template string
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith(('/*', '//', '!')) or v == ',':
            # Comments, negations and trailing commas are dropped
            return ''

        if v[0] in STRING_QUOTES:
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # An integer used as an object key has to become a string
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2014-08-22 02:33:29 +02:00
2014-04-21 07:12:02 +02:00
def qualities(quality_ids):
    """Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its position in `quality_ids`,
    or to -1 if the id is not in the list.
    """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
2014-04-30 10:02:03 +02:00
2022-12-30 06:45:41 +01:00
POSTPROCESS_WHEN = ( ' pre_process ' , ' after_filter ' , ' video ' , ' before_dl ' , ' post_process ' , ' after_move ' , ' after_video ' , ' playlist ' )
2022-01-03 12:13:54 +01:00
2021-02-03 14:36:09 +01:00
# Default output templates (printf-style with named fields), keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types. The value is either None or a fixed filename suffix
# (e.g. 'info.json') associated with that type.
# NOTE(review): how None is interpreted is decided by the consumers of this table - confirm there
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
2014-05-16 12:03:59 +02:00
2021-03-24 23:02:15 +01:00
# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# Template for a verbose regex matching a single %-style conversion specifier.
# It has two positional placeholders: {0} is the pattern for the mapping key
# and {1} is the pattern for the conversion type.
# The `prefix` group captures any run of escaped percent signs ("%%") directly
# before the specifier, and the negative lookbehind keeps the match from
# starting in the middle of such a run.
# NOTE: the space inside the `conversion` character class is significant
# (it is one of the conversion flags) even though the regex is verbose.
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''

# All conversion type characters of %-style formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2014-09-15 15:10:24 +02:00
2021-07-29 04:56:17 +02:00
2014-09-15 15:10:24 +02:00
def limit_length(s, length):
    """Add ellipses to overly long strings.

    @param s        The string to shorten, or None
    @param length   Maximum length of the result, ellipsis included
    @returns        None if `s` is None; `s` unchanged if it already fits;
                    otherwise the start of `s` followed by '...'.
                    The ellipsis itself is never truncated, so for a `length`
                    shorter than the ellipsis the result is just '...'
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    # Clamp at 0: a negative slice end would count from the end of the string
    # and keep almost all of `s` instead of shortening it
    keep = max(length - len(ELLIPSES), 0)
    return s[:keep] + ELLIPSES
2014-10-26 16:46:34 +01:00
def version_tuple(v):
    """Split a version string on '.' and '-' and return its components as a tuple of ints.

    Raises ValueError if any component is not an integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2014-10-26 16:46:34 +01:00
def is_outdated_version(version, limit, assume_new=True):
    """Return whether `version` is older than `limit`.

    If `version` is empty or either version cannot be parsed, the answer is
    decided by `assume_new`: the version counts as outdated only when
    `assume_new` is false.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
2014-11-20 12:14:28 +01:00
def ytdl_is_updateable():
    """Returns if yt-dlp can be updated with -U"""
    # Imported here rather than at module level
    from ..update import is_non_updateable

    updateable = not is_non_updateable()
    return updateable
2014-11-23 10:49:19 +01:00
def args_to_str(args):
    """Get a short string representation for a subprocess command"""
    quoted_command = shell_quote(args)
    return quoted_command
2015-01-04 02:20:45 +01:00
2022-03-27 04:20:43 +02:00
def error_to_str(err):
    """Format an exception as '<ExceptionClassName>: <message>'"""
    return '{}: {}'.format(type(err).__name__, err)
2022-12-29 17:32:54 +01:00
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type to a file extension.

    Parameters (anything after ';') are ignored and matching is case-insensitive.
    The lookup tries the full type, then the subtype, then the part of the
    subtype after the last '+'.

    @param mt       The MIME type, e.g. 'video/mp4; codecs="avc1"'
    @param default  Returned when `mt` is not a string or is not a known type.
                    Without it, None is returned for non-strings and the
                    subtype (with '+' replaced by '.') for unknown types
    """
    has_default = default is not NO_DEFAULT
    if not isinstance(mt, str):
        return default if has_default else None

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'video/ogg': 'ogv',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'aacp': 'aac',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop parameters such as '; charset=utf-8' and normalise the case
    mimetype = mt.split(';', 1)[0].strip().lower()
    subtype = mimetype.rsplit('/', 1)[-1]
    suffix = subtype.rsplit('+')[-1]

    # Most specific key first: full type, then subtype, then structured-syntax suffix
    ext = traversal.traverse_obj(MAP, mimetype, subtype, suffix)
    if ext:
        return ext
    if has_default:
        return default
    return subtype.replace('+', '.')
2015-02-19 00:31:01 +01:00
2021-12-09 12:40:52 +01:00
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension or for a filename/URL.

    A value without any '.' is treated as a bare extension.
    Returns None for empty input or when the type cannot be guessed.
    """
    if not ext_or_url:
        return None
    # guess_type() needs a filename, so turn a bare extension into one
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    guessed_type, _ = mimetypes.guess_type(filename)
    return guessed_type
2016-03-16 18:48:06 +01:00
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into video/audio/subtitle codecs.

    Only the first codec of each kind is kept. A warning is written for every
    codec that is not recognised.

    @returns    {} for empty input. Otherwise a dict with 'vcodec', 'acodec'
                ('none' when absent), 'dynamic_range' and, only when a subtitle
                codec was found, 'scodec'. If nothing was recognised but exactly
                two codecs were given, they are assumed to be video and audio,
                in that order
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = (
        'avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = (
        'flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
        'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    split_codecs = [
        codec for codec in map(str.strip, codecs_str.strip().strip(',').split(','))
        if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Only the part before the first '.' is case-insensitive
        full_codec = re.sub(r'^([^.]+)', lambda m: m.group(1).lower(), full_codec)
        # Leading zeros are dropped for matching only; the stored codec keeps them
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if vcodec:
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if not (vcodec or acodec or scodec):
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
        return {}

    parsed = {
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
        'dynamic_range': hdr,
    }
    if scodec is not None:
        parsed['scodec'] = scodec
    return parsed
2022-08-04 02:42:12 +02:00
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension able to hold the given video and audio streams.

    @param vcodecs, vexts   Codecs and extensions of the video streams (same length)
    @param acodecs, aexts   Codecs and extensions of the audio streams (same length)
    @param preferences      Extensions to try, in order. If not given, any is allowed
    @returns                The chosen extension. Falls back to 'mkv' if allowed,
                            otherwise to the last preference
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: Not all codecs supported by parse_codecs are handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def sanitize_codec(codecs):
        # Codec family of the first stream: text before the first '.', zeros removed, lowercased
        return try_get(codecs, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())

    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    # First choice: a container known to support both codecs
    for ext in preferences or COMPATIBLE_CODECS:
        supported = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or {vcodec, acodec} <= supported:
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    # Second choice: a container from the same family as all the source extensions
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext}:
            return ext
        if any(current_exts <= family for family in COMPATIBLE_EXTS):
            return ext

    return 'mkv' if allow_mkv else preferences[-1]
2022-12-29 17:32:54 +01:00
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Detect the file extension from the headers of a response.

    Checked in order: the filename in Content-Disposition, the filename in
    x-amz-meta-name, and finally the Content-Type.

    @param default  Passed on to mimetype2ext for the Content-Type lookup
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        ext = determine_ext(mobj.group('filename'), default_ext=None) if mobj else None
        if ext:
            return ext

    meta_name = headers.get('x-amz-meta-name')
    if meta_name:
        # Everything after the last '.'; the whole name if there is no '.'
        ext = meta_name.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'), default=default)
2015-01-07 07:20:20 +01:00
2015-07-22 14:03:05 +02:00
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI of the given MIME type from bytes-like `data`"""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
2015-07-22 14:03:05 +02:00
2015-01-07 07:20:20 +01:00
def age_restricted(content_limit, age_limit):
    """Returns True iff the content should be blocked

    @param content_limit    Age rating of the content; None means suitable for everyone
    @param age_limit        Age limit set by the user; None means no limit is set
    """
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2015-01-23 01:21:30 +01:00
2022-07-15 18:14:07 +02:00
# List of known byte-order-marks (BOM), as (BOM bytes, codec name) pairs.
# The order matters for code that checks them in sequence: the UTF-32-LE BOM
# starts with the same two bytes as the UTF-16-LE BOM, so the UTF-32 entries
# have to come before the UTF-16 ones
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
2015-01-23 01:21:30 +01:00
def is_html(first_bytes):
    """Detect whether a file contains HTML by examining its first bytes.

    Leading byte-order-marks are stripped and decide the encoding used to
    decode the data (UTF-8 when there is none).

    @returns    A match object (truthy) if the text starts with optional
                whitespace followed by '<', else None
    """
    detected_encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Repeated BOMs are stripped as well
        while first_bytes.startswith(bom):
            first_bytes = first_bytes[len(bom):]
            detected_encoding = bom_encoding

    text = first_bytes.decode(detected_encoding, 'replace')
    return re.match(r'\s*<', text)
2015-01-23 23:50:31 +01:00
def determine_protocol(info_dict):
    """Determine the download protocol of a format.

    An explicit 'protocol' in `info_dict` always wins. Otherwise the protocol
    is derived from the start of the URL, then from its extension, and as a
    last resort from the URL scheme.
    """
    explicit_protocol = info_dict.get('protocol')
    if explicit_protocol is not None:
        return explicit_protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        if info_dict.get('is_live'):
            return 'm3u8'
        return 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
2015-01-25 02:38:47 +01:00
2021-11-20 04:03:51 +01:00
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned.

    @param header_row   Values of the first row
    @param data         The remaining rows
    @param delim        If given, a line made of this string is inserted below the header
    @param extra_gap    Additional spaces between columns (there is always one)
    @param hide_empty   Omit the columns in which every data cell is empty
    @returns            The table as a single string, without trailing
                        whitespace on any line
    """
    def visible_width(string):
        # Terminal sequences and the alignment tab take up no space on screen
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_width(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, keep_flags):
        # Positions beyond the end of either list are padded with True
        return [cell for keep, cell in itertools.zip_longest(keep_flags, row, fillvalue=True) if keep]

    # A width of 0 is falsy, so the widths of the data double as keep/drop flags
    keep_flags = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, keep_flags)
    data = [keep_columns(row, keep_flags) for row in data]

    widths = column_widths([header_row, *data])
    gap = extra_gap + 1

    if delim:
        rule = [delim * (column_width + gap) for column_width in widths]
        rule[-1] = rule[-1][:-gap * len(delim)]  # Remove the gap from end of delimiter
        table = [header_row, rule, *data]
    else:
        table = [header_row, *data]

    for row in table:
        for pos, text in enumerate(map(str, row)):
            padding = widths[pos] - visible_width(text)
            if '\t' in text:
                # The padding goes where the tab is, pushing the rest of the text right
                row[pos] = text.replace('\t', ' ' * padding) + ' ' * gap
            else:
                row[pos] = text + ' ' * (padding + gap)

    return '\n'.join(''.join(row).rstrip() for row in table)
2015-02-10 03:32:21 +01:00
2021-08-15 10:12:23 +02:00
def _match_one ( filter_part , dct , incomplete ) :
2021-06-13 16:25:19 +02:00
# TODO: Generalize code with YoutubeDL._build_format_filter
2021-08-04 23:31:23 +02:00
STRING_OPERATORS = {
' *= ' : operator . contains ,
' ^= ' : lambda attr , value : attr . startswith ( value ) ,
' $= ' : lambda attr , value : attr . endswith ( value ) ,
' ~= ' : lambda attr , value : re . search ( value , attr ) ,
}
2015-02-10 03:32:21 +01:00
COMPARISON_OPERATORS = {
2021-08-04 23:31:23 +02:00
* * STRING_OPERATORS ,
' <= ' : operator . le , # "<=" must be defined above "<"
2015-02-10 03:32:21 +01:00
' < ' : operator . lt ,
' >= ' : operator . ge ,
2021-08-04 23:31:23 +02:00
' > ' : operator . gt ,
2015-02-10 03:32:21 +01:00
' = ' : operator . eq ,
}
2021-08-04 23:31:23 +02:00
2022-03-25 09:36:46 +01:00
if isinstance ( incomplete , bool ) :
is_incomplete = lambda _ : incomplete
else :
is_incomplete = lambda k : k in incomplete
2022-05-25 14:23:46 +02:00
operator_rex = re . compile ( r ''' (?x)
2015-02-10 03:32:21 +01:00
( ? P < key > [ a - z_ ] + )
2024-06-12 01:09:58 +02:00
\s * ( ? P < negation > ! \s * ) ? ( ? P < op > { } ) ( ? P < none_inclusive > \s * \? ) ? \s *
2015-02-10 03:32:21 +01:00
( ? :
2021-08-04 23:31:23 +02:00
( ? P < quote > [ " \' ])(?P<quotedstrval>.+?)(?P=quote)|
( ? P < strval > . + ? )
2015-02-10 03:32:21 +01:00
)
2024-06-12 01:09:58 +02:00
''' .format( ' | ' .join(map(re.escape, COMPARISON_OPERATORS.keys()))))
2022-05-25 14:23:46 +02:00
m = operator_rex . fullmatch ( filter_part . strip ( ) )
2015-02-10 03:32:21 +01:00
if m :
2021-10-16 21:34:00 +02:00
m = m . groupdict ( )
unnegated_op = COMPARISON_OPERATORS [ m [ ' op ' ] ]
if m [ ' negation ' ] :
2021-06-13 16:25:19 +02:00
op = lambda attr , value : not unnegated_op ( attr , value )
else :
op = unnegated_op
2021-10-16 21:34:00 +02:00
comparison_value = m [ ' quotedstrval ' ] or m [ ' strval ' ] or m [ ' intval ' ]
if m [ ' quote ' ] :
2024-06-12 01:09:58 +02:00
comparison_value = comparison_value . replace ( r ' \ {} ' . format ( m [ ' quote ' ] ) , m [ ' quote ' ] )
2021-10-16 21:34:00 +02:00
actual_value = dct . get ( m [ ' key ' ] )
numeric_comparison = None
2022-04-11 22:09:26 +02:00
if isinstance ( actual_value , ( int , float ) ) :
2016-10-31 17:32:08 +01:00
# If the original field is a string and matching comparisonvalue is
# a number we should respect the origin of the original field
# and process comparison value as a string (see
2021-10-16 21:34:00 +02:00
# https://github.com/ytdl-org/youtube-dl/issues/11082)
2015-02-10 03:32:21 +01:00
try :
2021-10-16 21:34:00 +02:00
numeric_comparison = int ( comparison_value )
2015-02-10 03:32:21 +01:00
except ValueError :
2021-10-16 21:34:00 +02:00
numeric_comparison = parse_filesize ( comparison_value )
if numeric_comparison is None :
numeric_comparison = parse_filesize ( f ' { comparison_value } B ' )
if numeric_comparison is None :
numeric_comparison = parse_duration ( comparison_value )
if numeric_comparison is not None and m [ ' op ' ] in STRING_OPERATORS :
2024-06-12 01:09:58 +02:00
raise ValueError ( ' Operator {} only supports string values! ' . format ( m [ ' op ' ] ) )
2015-02-10 03:32:21 +01:00
if actual_value is None :
2022-03-25 09:36:46 +01:00
return is_incomplete ( m [ ' key ' ] ) or m [ ' none_inclusive ' ]
2021-10-16 21:34:00 +02:00
return op ( actual_value , comparison_value if numeric_comparison is None else numeric_comparison )
2015-02-10 03:32:21 +01:00
UNARY_OPERATORS = {
2018-04-24 18:49:30 +02:00
' ' : lambda v : ( v is True ) if isinstance ( v , bool ) else ( v is not None ) ,
' ! ' : lambda v : ( v is False ) if isinstance ( v , bool ) else ( v is None ) ,
2015-02-10 03:32:21 +01:00
}
2022-05-25 14:23:46 +02:00
operator_rex = re . compile ( r ''' (?x)
2024-06-12 01:09:58 +02:00
( ? P < op > { } ) \s * ( ? P < key > [ a - z_ ] + )
''' .format( ' | ' .join(map(re.escape, UNARY_OPERATORS.keys()))))
2022-05-25 14:23:46 +02:00
m = operator_rex . fullmatch ( filter_part . strip ( ) )
2015-02-10 03:32:21 +01:00
if m :
op = UNARY_OPERATORS [ m . group ( ' op ' ) ]
actual_value = dct . get ( m . group ( ' key ' ) )
2022-03-25 09:36:46 +01:00
if is_incomplete ( m . group ( ' key ' ) ) and actual_value is None :
2021-08-15 10:12:23 +02:00
return True
2015-02-10 03:32:21 +01:00
return op ( actual_value )
2024-06-12 01:09:58 +02:00
raise ValueError ( f ' Invalid filter part { filter_part !r} ' )
2015-02-10 03:32:21 +01:00
2021-08-15 10:12:23 +02:00
def match_str(filter_str, dct, incomplete=False):
    """Filter a dictionary with a simple string syntax.

    The filter is a list of conditions separated by "&", all of which must
    pass. Use a backslash-escaped "&" for a literal ampersand in a condition.

    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    # Split only on the "&" that are not escaped
    for condition in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(condition.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
2015-02-10 03:32:21 +01:00
2023-03-03 20:43:05 +01:00
def match_filter_func(filters, breaking_filters=None):
    """Build a match-filter function from filter strings.

    @param filters              Filter string(s); an entry passes if any of them
                                match. The special filter '-' makes the function
                                return NO_DEFAULT instead of None for entries
                                that pass with complete information
    @param breaking_filters     Filter string(s). An entry that does not pass
                                these raises RejectedVideoReached
    @returns                    None if no filter is given. Otherwise a function
                                f(info_dict, incomplete=False) that returns None
                                (or NO_DEFAULT) if the entry passes, else a
                                message stating why it is skipped
    """
    if not (filters or breaking_filters):
        return None

    # Must be computed before the arguments are replaced below
    repr_ = f'{match_filter_func.__module__}.{match_filter_func.__qualname__}({filters}, {breaking_filters})'

    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    interactive = '-' in filters
    filters.discard('-')

    @function_with_repr.set_repr(repr_)
    def _match_func(info_dict, incomplete=False):
        rejection = breaking_filters(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        passes = not filters or any(match_str(f, info_dict, incomplete) for f in filters)
        if not passes:
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'

        if interactive and not incomplete:
            return NO_DEFAULT
        return None

    return _match_func
2015-03-03 00:03:06 +01:00
2022-07-08 21:37:47 +02:00
class download_range_func:
    """Callable that selects the sections of a video to download.

    @param chapters     Regexes; every chapter whose title matches one is selected
    @param ranges       (start, end) pairs of timestamps. Negative timestamps
                        are counted from the end of the video
    @param from_info    Whether to also select the section given by
                        'start_time'/'end_time' of the info dict
    """

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

    def __call__(self, info_dict, ydl):
        """Yield the sections to download as dicts.

        Matched chapters are yielded with their 'index' added, the ranges as
        'start_time'/'end_time'. An empty dict is yielded when there is
        nothing to select from.
        """
        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in self.chapters or []:
            for i, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': i}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # A negative timestamp is relative to the end, but never before the start.
        # It is left untouched when the duration is unknown
        return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time

    def __eq__(self, other):
        # `from_info` changes what __call__ yields, so it must take part in the comparison
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges
                and self.from_info == other.from_info)

    def __repr__(self):
        # Lists all constructor arguments so that the configuration can be told apart
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges}, {self.from_info})'
2022-11-30 07:04:51 +01:00
2022-06-06 22:13:50 +02:00
2015-04-25 17:15:05 +02:00
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds.

    Supported are a number with an optional "s" suffix, and clock times of
    the form H:MM:SS with an optional fraction separated by "." or ":".

    @returns    The time in seconds as a float, or None if the expression
                is empty or not in a supported form
    """
    if not time_expr:
        return None

    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if not clock_match:
        return None
    hours, minutes, seconds = clock_match.groups()
    # The fraction may be separated by ":" instead of "."
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2015-04-25 17:15:05 +02:00
2015-05-12 07:04:54 +02:00
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode (HH:MM:SS,mmm)"""
    timecode = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timecode
def ass_subtitles_timecode(seconds):
    """Format a time in seconds as an ASS timecode (H:MM:SS.cc)"""
    timecode = timetuple_from_msec(seconds * 1000)
    leading_fields = timecode[:-1]
    # ASS uses hundredths of a second; "%d" drops the fractional part
    centiseconds = timecode.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*leading_fields, centiseconds)
2015-04-25 17:15:05 +02:00
def dfxp2srt(dfxp_data):
    """
    Convert DFXP/TTML subtitles to SRT. The supported styling
    is converted to <font>, <b>, <i> and <u> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError If the data does not contain any paragraph
    """
    # Current namespace -> legacy namespaces that are rewritten to it
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration',
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> supported properties, including the inherited ones
    default_style = {}  # properties of the styles set on body/div

    class TTMLPElementParser:
        """XMLParser target that renders a single paragraph as SRT text"""
        _out = ''
        # NOTE(review): These lists are class attributes that are only ever mutated in-place,
        # so they are shared by all the parsers (one per paragraph) of a dfxp2srt call.
        # An applied style is pushed whenever an element has any style, but only popped
        # if the element opened a tag; so entries may leak into the following paragraphs.
        # The output relies on this, so it is deliberately left as-is
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Precedence: default style < referenced style < inline attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Already in effect from an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += f' color="{v}"'
                        elif k == 'fontSize':
                            font += f' size="{v}"'
                        elif k == 'fontFamily':
                            font += f' face="{v}"'
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += f'</{element}>'
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Serialize the paragraph and feed it through the SRT-rendering target
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # A style can only be resolved after its parent, which may be defined later in the document.
    # So repeat until no style is waiting for its parent
    while True:
        resolved_before = len(styles)
        repeat = False
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        # If nothing new was resolved in this pass, nothing will be in the next one either.
        # Stop instead of looping forever on styles whose parent can never be resolved
        # (missing, referencing itself, or without any supported property)
        if not repeat or len(styles) == resolved_before:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Without a usable end time, derive it from the duration
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2022-04-17 19:18:50 +02:00
def cli_option(params, command_option, param, separator=None):
    """Build the command line arguments for an option that takes a value.

    @param params           Dictionary to read the value from
    @param command_option   The command line option
    @param param            Key of the value in `params`
    @param separator        If given, option and value are joined with it
                            into a single argument
    @returns                List of arguments; empty if the value is None
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
2015-09-04 23:05:11 +02:00
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the argv fragment for a boolean option rendered as a textual value.

    The value stored under `param` must be True, False or None (asserted).
    True/False are translated to `true_value`/`false_value` and handed to
    cli_option(); None yields an empty list.
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered = {True: true_value, False: false_value}
    # cli_option() performs the lookup of `flag` in the translation table
    return cli_option(rendered, command_option, flag, separator)
2015-09-04 23:05:11 +02:00
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return `[command_option]` if `params[param]` equals `expected_value`, else `[]`."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2015-09-04 23:05:11 +02:00
2021-03-09 03:17:21 +01:00
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Select the argument list configured for the first matching key group.

    @param argdict     dict mapping lowercase keys to argument lists; a plain
                       list/tuple is accepted for backward compatibility
    @param keys        list/tuple whose items are either a key or a group of keys
    @param default     value returned when nothing matches
    @param use_compat  whether a list/tuple `argdict` is returned as-is
    @returns           the concatenated argument lists of the first group that
                       has at least one key present in `argdict`, else `default`

    NOTE: `default` is returned as the very same object (including the shared
    `[]` default), so callers should not mutate the result in place.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default

    assert isinstance(argdict, dict)
    assert isinstance(keys, (list, tuple))

    for key_group in keys:
        candidates = [argdict.get(key.lower()) for key in variadic(key_group)]
        matches = [args for args in candidates if args is not None]
        if matches:
            return list(itertools.chain.from_iterable(matches))
    return default
2015-09-04 23:05:11 +02:00
2021-08-24 02:12:45 +02:00
2021-08-23 23:45:44 +02:00
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve configured arguments for `exe` used on behalf of `main_key`.

    Builds the lookup keys `<root><suffix>` for every suffix in `keys`
    (just `<root>` when `keys` is empty), where root is `exe` if it equals
    `main_key` and `main_key+exe` otherwise (both lowercased).  When the bare
    root key is among them, the `(main_key, exe)` group (only if the two
    differ) and `'default'` are appended as fallbacks; otherwise the
    list/tuple compatibility mode is switched off.  The actual lookup is
    delegated to cli_configuration_args().
    """
    main_key, exe = main_key.lower(), exe.lower()
    if main_key == exe:
        root_key = exe
    else:
        root_key = f'{main_key}+{exe}'

    suffixes = keys or ['']
    keys = [f'{root_key}{suffix}' for suffix in suffixes]

    if root_key not in keys:
        use_compat = False
    else:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
2015-09-04 23:05:11 +02:00
2022-04-11 17:10:28 +02:00
class ISO639Utils:
    """Conversions between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of `code` are looked up; None is
        returned when they are not in the table.
        """
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns the first two-letter code (in table order) mapped to `code`,
        or None when there is none.
        """
        return next(
            (short_name for short_name, long_name in cls._lang_map.items() if long_name == code),
            None)
2022-04-11 17:10:28 +02:00
class ISO3166Utils:
    """Lookup of full country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code to the corresponding full name

        The lookup is case-insensitive (the code is uppercased first);
        None is returned for codes missing from the table.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
2022-04-11 17:10:28 +02:00
class GeoUtils :
2017-02-04 12:49:58 +01:00
# Major IPv4 address blocks per country
_country_ip_map = {
2019-10-29 00:10:20 +01:00
' AD ' : ' 46.172.224.0/19 ' ,
2017-02-04 12:49:58 +01:00
' AE ' : ' 94.200.0.0/13 ' ,
' AF ' : ' 149.54.0.0/17 ' ,
' AG ' : ' 209.59.64.0/18 ' ,
' AI ' : ' 204.14.248.0/21 ' ,
' AL ' : ' 46.99.0.0/16 ' ,
' AM ' : ' 46.70.0.0/15 ' ,
' AO ' : ' 105.168.0.0/13 ' ,
2019-10-29 00:10:20 +01:00
' AP ' : ' 182.50.184.0/21 ' ,
' AQ ' : ' 23.154.160.0/24 ' ,
2017-02-04 12:49:58 +01:00
' AR ' : ' 181.0.0.0/12 ' ,
' AS ' : ' 202.70.112.0/20 ' ,
2019-10-29 00:10:20 +01:00
' AT ' : ' 77.116.0.0/14 ' ,
2017-02-04 12:49:58 +01:00
' AU ' : ' 1.128.0.0/11 ' ,
' AW ' : ' 181.41.0.0/18 ' ,
2019-10-29 00:10:20 +01:00
' AX ' : ' 185.217.4.0/22 ' ,
' AZ ' : ' 5.197.0.0/16 ' ,
2017-02-04 12:49:58 +01:00
' BA ' : ' 31.176.128.0/17 ' ,
' BB ' : ' 65.48.128.0/17 ' ,
' BD ' : ' 114.130.0.0/16 ' ,
' BE ' : ' 57.0.0.0/8 ' ,
2019-10-29 00:10:20 +01:00
' BF ' : ' 102.178.0.0/15 ' ,
2017-02-04 12:49:58 +01:00
' BG ' : ' 95.42.0.0/15 ' ,
' BH ' : ' 37.131.0.0/17 ' ,
' BI ' : ' 154.117.192.0/18 ' ,
' BJ ' : ' 137.255.0.0/16 ' ,
2019-10-29 00:10:20 +01:00
' BL ' : ' 185.212.72.0/23 ' ,
2017-02-04 12:49:58 +01:00
' BM ' : ' 196.12.64.0/18 ' ,
' BN ' : ' 156.31.0.0/16 ' ,
' BO ' : ' 161.56.0.0/16 ' ,
' BQ ' : ' 161.0.80.0/20 ' ,
2019-10-29 00:10:20 +01:00
' BR ' : ' 191.128.0.0/12 ' ,
2017-02-04 12:49:58 +01:00
' BS ' : ' 24.51.64.0/18 ' ,
' BT ' : ' 119.2.96.0/19 ' ,
' BW ' : ' 168.167.0.0/16 ' ,
' BY ' : ' 178.120.0.0/13 ' ,
' BZ ' : ' 179.42.192.0/18 ' ,
' CA ' : ' 99.224.0.0/11 ' ,
' CD ' : ' 41.243.0.0/16 ' ,
2019-10-29 00:10:20 +01:00
' CF ' : ' 197.242.176.0/21 ' ,
' CG ' : ' 160.113.0.0/16 ' ,
2017-02-04 12:49:58 +01:00
' CH ' : ' 85.0.0.0/13 ' ,
2019-10-29 00:10:20 +01:00
' CI ' : ' 102.136.0.0/14 ' ,
2017-02-04 12:49:58 +01:00
' CK ' : ' 202.65.32.0/19 ' ,
' CL ' : ' 152.172.0.0/14 ' ,
2019-10-29 00:10:20 +01:00
' CM ' : ' 102.244.0.0/14 ' ,
2017-02-04 12:49:58 +01:00
' CN ' : ' 36.128.0.0/10 ' ,
' CO ' : ' 181.240.0.0/12 ' ,
' CR ' : ' 201.192.0.0/12 ' ,
' CU ' : ' 152.206.0.0/15 ' ,
' CV ' : ' 165.90.96.0/19 ' ,
' CW ' : ' 190.88.128.0/17 ' ,
2019-10-29 00:10:20 +01:00
' CY ' : ' 31.153.0.0/16 ' ,
2017-02-04 12:49:58 +01:00
' CZ ' : ' 88.100.0.0/14 ' ,
' DE ' : ' 53.0.0.0/8 ' ,
' DJ ' : ' 197.241.0.0/17 ' ,
' DK ' : ' 87.48.0.0/12 ' ,
' DM ' : ' 192.243.48.0/20 ' ,
' DO ' : ' 152.166.0.0/15 ' ,
' DZ ' : ' 41.96.0.0/12 ' ,
' EC ' : ' 186.68.0.0/15 ' ,
' EE ' : ' 90.190.0.0/15 ' ,
' EG ' : ' 156.160.0.0/11 ' ,
' ER ' : ' 196.200.96.0/20 ' ,
' ES ' : ' 88.0.0.0/11 ' ,
' ET ' : ' 196.188.0.0/14 ' ,
' EU ' : ' 2.16.0.0/13 ' ,
' FI ' : ' 91.152.0.0/13 ' ,
' FJ ' : ' 144.120.0.0/16 ' ,
2019-10-29 00:10:20 +01:00
' FK ' : ' 80.73.208.0/21 ' ,
2017-02-04 12:49:58 +01:00
' FM ' : ' 119.252.112.0/20 ' ,
' FO ' : ' 88.85.32.0/19 ' ,
' FR ' : ' 90.0.0.0/9 ' ,
' GA ' : ' 41.158.0.0/15 ' ,
' GB ' : ' 25.0.0.0/8 ' ,
' GD ' : ' 74.122.88.0/21 ' ,
' GE ' : ' 31.146.0.0/16 ' ,
' GF ' : ' 161.22.64.0/18 ' ,
' GG ' : ' 62.68.160.0/19 ' ,
2019-10-29 00:10:20 +01:00
' GH ' : ' 154.160.0.0/12 ' ,
' GI ' : ' 95.164.0.0/16 ' ,
2017-02-04 12:49:58 +01:00
' GL ' : ' 88.83.0.0/19 ' ,
' GM ' : ' 160.182.0.0/15 ' ,
' GN ' : ' 197.149.192.0/18 ' ,
' GP ' : ' 104.250.0.0/19 ' ,
' GQ ' : ' 105.235.224.0/20 ' ,
' GR ' : ' 94.64.0.0/13 ' ,
' GT ' : ' 168.234.0.0/16 ' ,
' GU ' : ' 168.123.0.0/16 ' ,
' GW ' : ' 197.214.80.0/20 ' ,
' GY ' : ' 181.41.64.0/18 ' ,
' HK ' : ' 113.252.0.0/14 ' ,
' HN ' : ' 181.210.0.0/16 ' ,
' HR ' : ' 93.136.0.0/13 ' ,
' HT ' : ' 148.102.128.0/17 ' ,
' HU ' : ' 84.0.0.0/14 ' ,
' ID ' : ' 39.192.0.0/10 ' ,
' IE ' : ' 87.32.0.0/12 ' ,
' IL ' : ' 79.176.0.0/13 ' ,
' IM ' : ' 5.62.80.0/20 ' ,
' IN ' : ' 117.192.0.0/10 ' ,
' IO ' : ' 203.83.48.0/21 ' ,
' IQ ' : ' 37.236.0.0/14 ' ,
' IR ' : ' 2.176.0.0/12 ' ,
' IS ' : ' 82.221.0.0/16 ' ,
' IT ' : ' 79.0.0.0/10 ' ,
' JE ' : ' 87.244.64.0/18 ' ,
' JM ' : ' 72.27.0.0/17 ' ,
' JO ' : ' 176.29.0.0/16 ' ,
2019-10-29 00:10:20 +01:00
' JP ' : ' 133.0.0.0/8 ' ,
2017-02-04 12:49:58 +01:00
' KE ' : ' 105.48.0.0/12 ' ,
' KG ' : ' 158.181.128.0/17 ' ,
' KH ' : ' 36.37.128.0/17 ' ,
' KI ' : ' 103.25.140.0/22 ' ,
' KM ' : ' 197.255.224.0/20 ' ,
2019-10-29 00:10:20 +01:00
' KN ' : ' 198.167.192.0/19 ' ,
2017-02-04 12:49:58 +01:00
' KP ' : ' 175.45.176.0/22 ' ,
' KR ' : ' 175.192.0.0/10 ' ,
' KW ' : ' 37.36.0.0/14 ' ,
' KY ' : ' 64.96.0.0/15 ' ,
' KZ ' : ' 2.72.0.0/13 ' ,
' LA ' : ' 115.84.64.0/18 ' ,
' LB ' : ' 178.135.0.0/16 ' ,
2019-10-29 00:10:20 +01:00
' LC ' : ' 24.92.144.0/20 ' ,
2017-02-04 12:49:58 +01:00
' LI ' : ' 82.117.0.0/19 ' ,
' LK ' : ' 112.134.0.0/15 ' ,
2019-10-29 00:10:20 +01:00
' LR ' : ' 102.183.0.0/16 ' ,
2017-02-04 12:49:58 +01:00
' LS ' : ' 129.232.0.0/17 ' ,
' LT ' : ' 78.56.0.0/13 ' ,
' LU ' : ' 188.42.0.0/16 ' ,
' LV ' : ' 46.109.0.0/16 ' ,
' LY ' : ' 41.252.0.0/14 ' ,
' MA ' : ' 105.128.0.0/11 ' ,
' MC ' : ' 88.209.64.0/18 ' ,
' MD ' : ' 37.246.0.0/16 ' ,
' ME ' : ' 178.175.0.0/17 ' ,
' MF ' : ' 74.112.232.0/21 ' ,
' MG ' : ' 154.126.0.0/17 ' ,
' MH ' : ' 117.103.88.0/21 ' ,
' MK ' : ' 77.28.0.0/15 ' ,
' ML ' : ' 154.118.128.0/18 ' ,
' MM ' : ' 37.111.0.0/17 ' ,
' MN ' : ' 49.0.128.0/17 ' ,
' MO ' : ' 60.246.0.0/16 ' ,
' MP ' : ' 202.88.64.0/20 ' ,
' MQ ' : ' 109.203.224.0/19 ' ,
' MR ' : ' 41.188.64.0/18 ' ,
' MS ' : ' 208.90.112.0/22 ' ,
' MT ' : ' 46.11.0.0/16 ' ,
' MU ' : ' 105.16.0.0/12 ' ,
' MV ' : ' 27.114.128.0/18 ' ,
2019-10-29 00:10:20 +01:00
' MW ' : ' 102.70.0.0/15 ' ,
2017-02-04 12:49:58 +01:00
' MX ' : ' 187.192.0.0/11 ' ,
' MY ' : ' 175.136.0.0/13 ' ,
' MZ ' : ' 197.218.0.0/15 ' ,
' NA ' : ' 41.182.0.0/16 ' ,
' NC ' : ' 101.101.0.0/18 ' ,
' NE ' : ' 197.214.0.0/18 ' ,
' NF ' : ' 203.17.240.0/22 ' ,
' NG ' : ' 105.112.0.0/12 ' ,
' NI ' : ' 186.76.0.0/15 ' ,
' NL ' : ' 145.96.0.0/11 ' ,
' NO ' : ' 84.208.0.0/13 ' ,
' NP ' : ' 36.252.0.0/15 ' ,
' NR ' : ' 203.98.224.0/19 ' ,
' NU ' : ' 49.156.48.0/22 ' ,
' NZ ' : ' 49.224.0.0/14 ' ,
' OM ' : ' 5.36.0.0/15 ' ,
' PA ' : ' 186.72.0.0/15 ' ,
' PE ' : ' 186.160.0.0/14 ' ,
' PF ' : ' 123.50.64.0/18 ' ,
' PG ' : ' 124.240.192.0/19 ' ,
' PH ' : ' 49.144.0.0/13 ' ,
' PK ' : ' 39.32.0.0/11 ' ,
' PL ' : ' 83.0.0.0/11 ' ,
' PM ' : ' 70.36.0.0/20 ' ,
' PR ' : ' 66.50.0.0/16 ' ,
' PS ' : ' 188.161.0.0/16 ' ,
' PT ' : ' 85.240.0.0/13 ' ,
' PW ' : ' 202.124.224.0/20 ' ,
' PY ' : ' 181.120.0.0/14 ' ,
' QA ' : ' 37.210.0.0/15 ' ,
2019-10-29 00:10:20 +01:00
' RE ' : ' 102.35.0.0/16 ' ,
2017-02-04 12:49:58 +01:00
' RO ' : ' 79.112.0.0/13 ' ,
2019-10-29 00:10:20 +01:00
' RS ' : ' 93.86.0.0/15 ' ,
2017-02-04 12:49:58 +01:00
' RU ' : ' 5.136.0.0/13 ' ,
2019-10-29 00:10:20 +01:00
' RW ' : ' 41.186.0.0/16 ' ,
2017-02-04 12:49:58 +01:00
' SA ' : ' 188.48.0.0/13 ' ,
' SB ' : ' 202.1.160.0/19 ' ,
' SC ' : ' 154.192.0.0/11 ' ,
2019-10-29 00:10:20 +01:00
' SD ' : ' 102.120.0.0/13 ' ,
2017-02-04 12:49:58 +01:00
' SE ' : ' 78.64.0.0/12 ' ,
2019-10-29 00:10:20 +01:00
' SG ' : ' 8.128.0.0/10 ' ,
2017-02-04 12:49:58 +01:00
' SI ' : ' 188.196.0.0/14 ' ,
' SK ' : ' 78.98.0.0/15 ' ,
2019-10-29 00:10:20 +01:00
' SL ' : ' 102.143.0.0/17 ' ,
2017-02-04 12:49:58 +01:00
' SM ' : ' 89.186.32.0/19 ' ,
' SN ' : ' 41.82.0.0/15 ' ,
2019-10-29 00:10:20 +01:00
' SO ' : ' 154.115.192.0/18 ' ,
2017-02-04 12:49:58 +01:00
' SR ' : ' 186.179.128.0/17 ' ,
' SS ' : ' 105.235.208.0/21 ' ,
' ST ' : ' 197.159.160.0/19 ' ,
' SV ' : ' 168.243.0.0/16 ' ,
' SX ' : ' 190.102.0.0/20 ' ,
' SY ' : ' 5.0.0.0/16 ' ,
' SZ ' : ' 41.84.224.0/19 ' ,
' TC ' : ' 65.255.48.0/20 ' ,
' TD ' : ' 154.68.128.0/19 ' ,
' TG ' : ' 196.168.0.0/14 ' ,
' TH ' : ' 171.96.0.0/13 ' ,
' TJ ' : ' 85.9.128.0/18 ' ,
' TK ' : ' 27.96.24.0/21 ' ,
' TL ' : ' 180.189.160.0/20 ' ,
' TM ' : ' 95.85.96.0/19 ' ,
' TN ' : ' 197.0.0.0/11 ' ,
' TO ' : ' 175.176.144.0/21 ' ,
' TR ' : ' 78.160.0.0/11 ' ,
' TT ' : ' 186.44.0.0/15 ' ,
' TV ' : ' 202.2.96.0/19 ' ,
' TW ' : ' 120.96.0.0/11 ' ,
' TZ ' : ' 156.156.0.0/14 ' ,
2019-10-29 00:10:20 +01:00
' UA ' : ' 37.52.0.0/14 ' ,
' UG ' : ' 102.80.0.0/13 ' ,
' US ' : ' 6.0.0.0/8 ' ,
2017-02-04 12:49:58 +01:00
' UY ' : ' 167.56.0.0/13 ' ,
2019-10-29 00:10:20 +01:00
' UZ ' : ' 84.54.64.0/18 ' ,
2017-02-04 12:49:58 +01:00
' VA ' : ' 212.77.0.0/19 ' ,
2019-10-29 00:10:20 +01:00
' VC ' : ' 207.191.240.0/21 ' ,
2017-02-04 12:49:58 +01:00
' VE ' : ' 186.88.0.0/13 ' ,
2019-10-29 00:10:20 +01:00
' VG ' : ' 66.81.192.0/20 ' ,
2017-02-04 12:49:58 +01:00
' VI ' : ' 146.226.0.0/16 ' ,
' VN ' : ' 14.160.0.0/11 ' ,
' VU ' : ' 202.80.32.0/20 ' ,
' WF ' : ' 117.20.32.0/21 ' ,
' WS ' : ' 202.4.32.0/19 ' ,
' YE ' : ' 134.35.0.0/16 ' ,
' YT ' : ' 41.242.116.0/22 ' ,
' ZA ' : ' 41.0.0.0/11 ' ,
2019-10-29 00:10:20 +01:00
' ZM ' : ' 102.144.0.0/13 ' ,
' ZW ' : ' 102.177.192.0/18 ' ,
2017-02-04 12:49:58 +01:00
}
@classmethod
2018-05-02 02:18:01 +02:00
def random_ipv4 ( cls , code_or_block ) :
if len ( code_or_block ) == 2 :
block = cls . _country_ip_map . get ( code_or_block . upper ( ) )
if not block :
return None
else :
block = code_or_block
2017-02-04 12:49:58 +01:00
addr , preflen = block . split ( ' / ' )
2022-06-24 10:10:17 +02:00
addr_min = struct . unpack ( ' !L ' , socket . inet_aton ( addr ) ) [ 0 ]
2017-02-04 12:49:58 +01:00
addr_max = addr_min | ( 0xffffffff >> int ( preflen ) )
2022-06-24 12:54:43 +02:00
return str ( socket . inet_ntoa (
2022-06-24 10:10:17 +02:00
struct . pack ( ' !L ' , random . randint ( addr_min , addr_max ) ) ) )
2017-02-04 12:49:58 +01:00
2017-02-28 12:16:55 +01:00
# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """Convert an integer to a big-endian byte string.

    @param n          value to convert (passed through int()); zero and
                      negative values both produce a single zero byte
    @param blocksize  if greater than zero, the result is left-padded with
                      zero bytes so that its length is a multiple of blocksize
    @returns          bytes holding the minimal big-endian encoding of n,
                      padded as described above
    """
    n = int(n)
    if n > 0:
        # Minimal-length big-endian encoding, i.e. without leading zero bytes
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    # Pad at the front up to the next multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """Convert a big-endian byte string to an integer.

    This is (essentially) the inverse of long_to_bytes(): leading zero bytes
    do not affect the result and an empty input yields 0.
    """
    return int.from_bytes(s, 'big')
2016-02-16 23:01:44 +01:00
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # The payload is the byte-reversed data read as one hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return format(pow(payload, exponent, modulus), 'x')
2016-02-24 15:08:40 +01:00
2017-02-27 11:50:19 +01:00
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout `0x00 0x02 <padding> 0x00 <data>`.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data is longer than length - 11
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding bytes must be non-zero: the first zero byte after the 0x00 0x02
    # header marks the start of the data when the block is unpadded
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2, *pseudo_random, 0, *data]
2017-02-27 11:50:19 +01:00
2022-06-20 08:14:55 +02:00
def _base_n_table ( n , table ) :
if not table and not n :
raise ValueError ( ' Either table or n must be specified ' )
2022-06-20 08:25:54 +02:00
table = ( table or ' 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ' ) [ : n ]
2022-06-30 14:59:39 +02:00
if n and n != len ( table ) :
2022-06-20 08:25:54 +02:00
raise ValueError ( f ' base { n } exceeds table length { len ( table ) } ' )
return table
2016-02-26 07:37:20 +01:00
2016-02-26 20:19:50 +01:00
2022-06-20 08:14:55 +02:00
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string

    @param num    non-negative integer to encode; any falsy value encodes as
                  the first symbol of the table
    @param n      base; selects the first n symbols of the table
    @param table  digit symbols to use (see _base_n_table for the default)
    @raises ValueError  if num is negative or the table has fewer than two
                        symbols (neither can be encoded by repeated division)
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    # Both conditions would make the division loop below run forever
    if num < 0:
        raise ValueError('Cannot encode a negative number')
    if base < 2:
        raise ValueError('Encoding a non-zero number requires at least two symbols')

    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    # Digits were produced least significant first
    return ''.join(reversed(digits))
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int

    Every character of `string` must be a symbol of the digit table
    (see _base_n_table); an unknown character raises KeyError.
    """
    digit_values = {char: index for index, char in enumerate(_base_n_table(n, table))}
    base = len(digit_values)
    return functools.reduce(
        lambda value, char: value * base + digit_values[char], string, 0)
2016-02-26 07:58:29 +01:00
def decode_packed_codes(code):
    """Expand obfuscated code whose words were replaced by base-n indices.

    PACKED_CODES_RE (defined elsewhere in this module) must match `code` and
    capture, in order: the obfuscated text, the numeric base, the symbol count
    and the `|`-separated symbol list.  Each word of the obfuscated text is
    then replaced by the symbol at its base-n index; an empty symbol means the
    word stands for itself.
    """
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = match.groups()
    base, remaining = int(base), int(count)
    words = symbols.split('|')

    symbol_table = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, base)
        symbol_table[token] = words[remaining] or token

    def substitute(word_match):
        return symbol_table[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', substitute, obfuscated_code)
2016-01-10 20:09:53 +01:00
2019-11-26 20:26:42 +01:00
def caesar(s, alphabet, shift):
    """Shift every character of `s` that occurs in `alphabet` by `shift` positions.

    The shift wraps around the alphabet; characters outside the alphabet are
    kept unchanged.  A shift of 0 returns `s` itself.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
def rot47(s):
    """Apply a caesar() shift of 47 over the 94 printable ASCII characters `!` to `~`."""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
2016-01-10 20:09:53 +01:00
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated `KEY=value` attribute list into a dict.

    Values may be bare or enclosed in double quotes; the surrounding quotes
    are dropped.  For repeated keys the last value wins.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in pairs
    }
2016-06-26 09:16:49 +02:00
def urshift(val, n):
    """Shift `val` right by `n` bits, treating a negative `val` as unsigned 32-bit."""
    if val < 0:
        # Map the negative value onto its unsigned 32-bit representation
        val += 0x100000000
    return val >> n
2016-08-06 20:42:58 +02:00
2016-09-29 18:28:32 +02:00
def write_xattr(path, key, value):
    """Store `value` as the extended attribute `key` of the file at `path`.

    `value` is written in binary mode / decoded before being passed to an
    external tool, so it has to be a bytes object.

    Strategies, in order:
      * Windows: write to the NTFS alternate data stream `path:key`
      * os.setxattr, or the `xattr` module when it provides a usable setter
      * the `setfattr` or `xattr` command line tools

    Raises XAttrMetadataError when writing fails and XAttrUnavailableError
    when no module or executable for setting attributes can be found.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules

    setxattr = None
    if callable(getattr(os, 'setxattr', None)):
        setxattr = os.setxattr
    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        # (with an older pyxattr, setxattr stays None and Method 2 is tried)
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the "xattr" or "pyxattr" Python modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The value is passed as a command line argument, so it has to be text
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    # A non-zero exit status is reported together with the tool's stderr output
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
2017-05-01 17:09:18 +02:00
def random_birthday(year_field, month_field, day_field):
    """Return a random date between 1950-01-01 and 1995-12-31 (inclusive).

    The result is a dict mapping the three given field names to the year,
    month and day of that date, each as a decimal string.
    """
    earliest, latest = dt.date(1950, 1, 1), dt.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + dt.timedelta(days=random.randint(0, span_days))

    field_names = (year_field, month_field, day_field)
    components = (birthday.year, birthday.month, birthday.day)
    return {name: str(component) for name, component in zip(field_names, components)}
2020-10-27 11:37:21 +01:00
2021-01-07 07:41:05 +01:00
2023-01-01 18:16:25 +01:00
def find_available_port(interface=''):
    """Return a port number the OS reports as free on `interface`.

    Binds a throwaway socket to port 0 so the OS picks the port.
    Returns None if creating or binding the socket fails with OSError.
    """
    with contextlib.suppress(OSError):
        with socket.socket() as probe:
            probe.bind((interface, 0))
            _, port, *_ = probe.getsockname()
            return port
    return None
2020-10-27 11:37:21 +01:00
# Templates for internet shortcut files, which are plain text files.
# Each template is a %-format string with named placeholders; the leading
# backslash after the opening quotes suppresses the initial newline.

# Windows ".url" shortcut; expects the `url` key.
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# Apple ".webloc" property list; expects the `url` key.
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# Freedesktop ".desktop" link entry; expects the `filename` and `url` keys.
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps a shortcut kind to its template.
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
2020-10-27 11:37:21 +01:00
def iri_to_uri(iri):
    """
    Convert an IRI (Internationalized Resource Identifier, may contain Unicode)
    into an ASCII-only URI (Uniform Resource Identifier).

    Existing percent-escapes are kept as they are (`%3C` does not become
    `%253C`); only characters that are not yet escaped get percent-encoded,
    using their UTF-8 bytes. The hostname is converted with the 'idna' codec.

    Raises ValueError for bracketed (IPv6) network locations.
    """
    parts = urllib.parse.urlparse(iri)

    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
    if '[' in parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # Characters that must NOT be percent-encoded in each component. Everything
    # else but letters, digits and '_.-' is percent-encoded as UTF-8.
    # Source for these sets: https://url.spec.whatwg.org/#percent-encoded-bytes.
    userinfo_safe = r"!$%&'()*+,~"
    # The source does not explicitly cover legacy path parameters; they share the path set.
    path_safe = r"!$%&'()*+,/:;=@|~"
    # Not totally sure about this set, since the source does not explicitly mention the query component.
    query_safe = r"!$%&'()*+,/:;=?@{|}~"
    fragment_safe = r"!#$%&'()*+,/:;=?@{|}~"

    netloc_pieces = []
    if parts.username:
        netloc_pieces.append(urllib.parse.quote(parts.username, safe=userinfo_safe))
        if parts.password is not None:
            netloc_pieces.append(':' + urllib.parse.quote(parts.password, safe=userinfo_safe))
        netloc_pieces.append('@')

    # Punycode for Unicode hostnames; the 'idna' encoding produces ASCII text.
    netloc_pieces.append(parts.hostname.encode('idna').decode())

    port = parts.port
    if port is not None and port != 80:
        netloc_pieces.append(':' + str(port))

    encoded_path = urllib.parse.quote_plus(parts.path, safe=path_safe)
    encoded_params = urllib.parse.quote_plus(parts.params, safe=path_safe)
    encoded_query = urllib.parse.quote_plus(parts.query, safe=query_safe)
    encoded_fragment = urllib.parse.quote_plus(parts.fragment, safe=fragment_safe)

    return urllib.parse.urlunparse((
        parts.scheme,
        ''.join(netloc_pieces),
        encoded_path,
        encoded_params,
        encoded_query,
        encoded_fragment,
    ))
def to_high_limit_path(path):
    """Return `path` in a form that is not subject to the Windows MAX_PATH limit.

    On 'win32' and 'cygwin' the absolute path is prefixed with the extended-length
    marker; on every other platform `path` is returned unchanged.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # Work around MAX_PATH limitation on Windows. The maximum allowed length
    # for the individual path segments may still be quite limited.
    extended_prefix = '\\\\?\\'
    return extended_prefix + os.path.abspath(path)
2020-12-13 15:29:09 +01:00
2021-01-07 07:41:05 +01:00
2022-06-20 08:14:55 +02:00
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Look up `field` in `obj` and render it through `template`.

    @param field     Path (or paths) handed to `traverse_obj` via `variadic`
    @param template  %-format string applied to `func(value)`
    @param ignore    Value(s) that count as "missing". When left at NO_DEFAULT,
                     any falsy value counts as missing
    @param default   Returned instead of the formatted string when the value is missing
    @param func      Transformation applied to the value before formatting
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        is_missing = not val
    else:
        is_missing = val in variadic(ignore)
    if is_missing:
        return default
    return template % func(val)
2021-01-08 17:14:50 +01:00
def clean_podcast_url(url):
    """Remove known podcast tracking/analytics redirect prefixes from `url`."""
    tracker_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/'''
    without_trackers = re.sub(tracker_prefix_re, '', url)
    # Stripping a prefix can leave two schemes back to back; keep only the inner one
    return re.sub(r'^\w+://(\w+://)', r'\1', without_trackers)
2021-01-22 14:43:30 +01:00
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version 4 UUID as a lowercase hyphenated string.

    In the template, every `x` becomes a random hex digit. The `y` position
    carries the RFC 4122 variant bits (binary 10xx), so it is limited to
    8, 9, a or b; an arbitrary hex digit there would give a malformed UUID.

    NOTE: uses the `random` module, so the result is not suitable where an
    unpredictable identifier is required.
    """
    def _pick(mobj):
        return random.choice(_HEX_TABLE if mobj.group() == 'x' else '89ab')

    return re.sub(r'[xy]', _pick, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
2021-01-23 13:18:12 +01:00
def make_dir(path, to_screen=None):
    """Ensure that the parent directory of `path` exists.

    @param path       File path whose directory part should be created
    @param to_screen  Optional callable that receives an error message when
                      the directory cannot be created
    @returns          True on success (or when `path` has no directory part),
                      False if creating the directory raised OSError
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # `callable(...)` returns a bool, so comparing it against None was always
        # true and the default `to_screen=None` got called, raising TypeError
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
2021-01-24 14:40:02 +01:00
def get_executable_path():
    """Return the absolute directory containing the running executable."""
    from ..update import _get_variant_and_executable_path

    # Index 1 of the returned pair is the executable path
    executable = _get_variant_and_executable_path()[1]
    absolute = os.path.abspath(executable)
    return os.path.dirname(absolute)
2021-01-24 14:40:02 +01:00
2023-01-01 05:29:22 +01:00
def get_user_config_dirs(package_name):
    """Yield candidate per-user configuration directories for `package_name`.

    Order: the XDG config home, then %APPDATA% (only if that variable is set),
    then a dot-directory in the user's home.
    """
    # .config (e.g. ~/.config/package_name)
    config_home = os.getenv('XDG_CONFIG_HOME')
    if not config_home:
        config_home = compat_expanduser('~/.config')
    yield os.path.join(config_home, package_name)

    # appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # home (~/.package_name)
    home = compat_expanduser('~')
    yield os.path.join(home, f'.{package_name}')
2023-01-01 05:29:22 +01:00
def get_system_config_dirs(package_name):
    """Yield candidate system-wide configuration directories for `package_name`."""
    # /etc/package_name
    system_dir = os.path.join('/etc', package_name)
    yield system_dir
2021-01-27 16:02:51 +01:00
2022-02-25 03:14:04 +01:00
def time_seconds(**kwargs):
    """
    Return the current time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by the offset described by `kwargs`.

    `kwargs` are passed straight to `datetime.timedelta` (e.g. `hours=9`).
    """
    now = time.time()
    shift = dt.timedelta(**kwargs)
    return now + shift.total_seconds()
2022-02-25 03:14:04 +01:00
2021-09-23 19:40:51 +02:00
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """Build an HS256-signed token in JWS compact form.

    @param payload_data  JSON-serializable claims
    @param key           Signing secret (str)
    @param headers       Extra header fields; they override the default
                         `alg`/`typ` entries on conflict
    @returns             bytes: header, payload and signature joined by b'.'

    NOTE(review): the segments use standard padded base64, not the unpadded
    base64url alphabet from RFC 7515 -- confirm against consumers before changing.
    """
    def _encode_segment(obj):
        return base64.b64encode(json.dumps(obj).encode())

    header_data = {'alg': 'HS256', 'typ': 'JWT'}
    if headers:
        header_data.update(headers)

    signing_input = b'.'.join((_encode_segment(header_data), _encode_segment(payload_data)))
    mac = hmac.new(key.encode(), signing_input, hashlib.sha256)
    return signing_input + b'.' + base64.b64encode(mac.digest())
2021-10-08 21:11:59 +02:00
2021-10-27 22:37:15 +02:00
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded JSON payload of a three-part compact JWT.

    The header and signature segments are ignored; nothing is verified.
    Raises ValueError if `jwt` does not consist of exactly three dot-separated parts.
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    padded_payload = f'{payload_b64}==='
    raw_payload = base64.urlsafe_b64decode(padded_payload)
    return json.loads(raw_payload)
2021-10-27 22:37:15 +02:00
2022-05-19 23:32:25 +02:00
# False on Windows until VT processing has been switched on, None on other systems
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """Tell whether terminal escape sequences may be written to `stream`.

    On Windows this requires WINDOWS_VT_MODE to be truthy; elsewhere it requires
    a non-empty TERM environment variable. In both cases the stream must also
    report itself as a TTY. Results are cached per stream.
    """
    on_windows = compat_os_name == 'nt'
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False
    try:
        is_tty = stream.isatty()
    except BaseException:
        return False
    return is_tty
2022-12-04 20:36:37 +01:00
def windows_enable_vt_mode():
    """Turn on virtual terminal processing for the Windows console.

    Does nothing when `get_windows_version()` is below (10, 0, 10586).
    Raises Exception if reading or updating the console mode fails.
    On success, sets the module-level WINDOWS_VT_MODE to True and clears the
    `supports_terminal_sequences` cache so earlier negative answers are dropped.

    Ref: https://bugs.python.org/issue30075
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # Imported lazily: these modules are only needed on this code path
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly rather than relying on sys.stdout
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Keep the existing mode bits and add the VT processing flag
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        # The descriptor is closed whether or not the mode change succeeded
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    supports_terminal_sequences.cache_clear()
2022-05-19 23:32:25 +02:00
2021-10-20 18:37:32 +02:00
# ESC, "[", one or more non-"m" characters, then "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with every `ESC[...m` terminal sequence removed."""
    return re.sub(_terminal_sequences_re, '', string)
def number_of_digits(number):
    """Return the length of `number` rendered with `%d` (a minus sign counts)."""
    rendered = '%d' % number
    return len(rendered)
2021-11-06 02:05:24 +01:00
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy `values` (converted with `str`) using `delim`.

    When `from_dict` is given, each value is treated as a path and is first
    resolved against `from_dict` with `traverse_obj`.
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
2021-12-14 18:03:47 +01:00
2022-03-04 22:52:48 +01:00
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    Returns `thumbnails` unchanged when no format has a usable width.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    largest = max(
        (tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats),
        default=(0, 0))
    target_width = largest[0]
    if not target_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        rewritten_url = re.sub(url_width_re, str(target_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': rewritten_url},
            dict(zip(dimension_keys, largest)),
            thumbnail))
    return scaled
2022-02-28 05:10:54 +01:00
def parse_http_range(range):
    """Parse a "Range" or "Content-Range" HTTP header value.

    @returns  (start, end, total); `end` and `total` go through `int_or_none`.
              (None, None, None) when the value is empty or has no `bytes` range.
    """
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if not crg:
        return None, None, None
    start, end, total = crg.groups()
    return int(start), int_or_none(end), int_or_none(total)
2022-05-24 14:00:28 +02:00
def read_stdin(what):
    """Return `sys.stdin`, first printing a prompt when `what` is truthy.

    The prompt names what is being read and the key combination that sends EOF.
    """
    if not what:
        return sys.stdin
    eof = {'nt': 'Ctrl+Z'}.get(compat_os_name, 'Ctrl+D')
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
2022-07-15 13:52:14 +02:00
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @param data  Leading bytes of the file
    @returns     (encoding, bytes to skip); encoding is None when nothing was detected
    """
    # BOM marks are given priority over declarations
    bom_hit = next(((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_hit is not None:
        return bom_hit

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    without_nulls = data.replace(b'\0', b'')
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', without_nulls)
    if mobj is None:
        return None, 0
    return mobj.group(1).decode(), 0
2022-07-15 13:52:14 +02:00
2021-12-14 18:03:47 +01:00
class Config:
    """A set of command-line style arguments plus the config files they pull in.

    Each instance holds its own argument list (`own_args`) and a list of nested
    `Config` objects (`configs`) created for every config location found while
    parsing. Nested configs share the parent's `_loaded_paths` set, so a location
    is loaded at most once.
    """
    own_args = None  # arguments given to `init`
    parsed_args = None  # set to `own_args` once `load_configs` has parsed them
    filename = None  # file the arguments came from, if any
    __initialized = False

    # Internal only, do not use! Hack to enable --plugin-dirs
    # TODO(coletdjnz): remove when plugin globals system is implemented
    _plugin_dirs = None

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Store `args`/`filename` and load any referenced configs.

        May only be called while the instance is uninitialized.
        Returns the result of `load_configs`.
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse `own_args` and append a nested config per config location.

        Returns False (without parsing) if `filename` was already loaded,
        True otherwise.
        """
        directory = ''
        if self.filename:
            # Resolve symlinks so the same file is recognized under different names
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # "-" means options are read from stdin, at most once
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Locations are joined onto the directory of the current config file
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        # Own arguments (with login info hidden) first, then each nested config
        # with its lines prefixed by "| "
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read `filename` and split its contents into a list of arguments.

        Returns `default` if the file cannot be opened.
        Raises ValueError if the contents cannot be decoded or split.
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        # NOTE(review): an exception other than OSError raised in the next block
        # would propagate without closing `optionf` -- confirm whether that can happen
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` with the values of credential options replaced by PRIVATE."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        # Matches the "--option=value" spelling of any private option
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Handle the "--option value" spelling: blank out the following argument
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a nested config from `args` (passed to `init`) and keep it if it loaded."""
        config = type(self)(self.parser, label)
        # Share the set of loaded paths so nested configs cannot load a file twice
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Yield the arguments of all nested configs (last added first), then this config's own."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
2022-02-13 06:58:21 +01:00
2022-01-28 22:55:35 +01:00
def merge_headers(*dicts):
    """Merge dicts of HTTP headers case-insensitively; later dicts take priority.

    Header names in the result are normalized with `str.title()`.
    """
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
2022-03-25 08:08:33 +01:00
2022-06-29 03:13:24 +02:00
def cached_method(f):
    """Decorator that caches a method's results per instance and per argument set.

    Arguments are normalized through the method's signature (defaults applied),
    so positional and keyword spellings of the same call share one cache entry.
    Results are stored on the instance under `_cached_method__cache`.
    """
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound_args = signature.bind(self, *args, **kwargs)
        bound_args.apply_defaults()
        # Drop the first bound value (the instance); the rest forms the key
        _, *call_values = bound_args.arguments.values()
        key = tuple(call_values)

        per_instance = vars(self).setdefault('_cached_method__cache', {})
        cache = per_instance.setdefault(f.__name__, {})
        if key in cache:
            return cache[key]
        result = cache[key] = f(self, *args, **kwargs)
        return result
    return wrapper
2022-03-25 08:08:33 +01:00
class classproperty:
    """Descriptor giving property-style access to a function of the class.

    Usable bare (`@classproperty`) or with options (`@classproperty(cache=True)`).
    With `cache=True` the value is computed once per class and then reused.
    """

    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # Called with options only: return a factory awaiting the function
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        cache = self._cache
        if cache is None:
            return self.func(cls)
        if cls not in cache:
            cache[cls] = self.func(cls)
        return cache[cls]
2022-04-17 22:58:28 +02:00
2023-02-17 13:22:22 +01:00
class function_with_repr:
    """Callable wrapper around a function that provides a readable `repr`.

    The repr is the explicit `repr_` string when one was given, otherwise
    `<module>.<qualname>` of the wrapped function.
    """

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    @classmethod
    def set_repr(cls, repr_):
        """Return a decorator that wraps a function using the fixed repr `repr_`."""
        return functools.partial(cls, repr_=repr_)

    def __repr__(self):
        custom = self.__repr
        if custom:
            return custom
        wrapped = self.func
        return f'{wrapped.__module__}.{wrapped.__qualname__}'
2022-05-25 14:23:46 +02:00
class Namespace(types.SimpleNamespace):
    """SimpleNamespace whose iteration yields the stored values"""

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        """View of the (name, value) pairs held by the namespace"""
        return vars(self).items()
2022-04-20 21:05:57 +02:00
2022-07-30 22:45:22 +02:00
# File extensions grouped by media category.
# `video` and `audio` initially hold only the less common extensions; the
# `common_*` groups are appended to them right below the definition.
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions, in that order
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
2024-07-02 00:52:50 +02:00
class _UnsafeExtensionError(Exception):
    """
    Mitigation exception for uncommon/malicious file extensions
    This should be caught in YoutubeDL.py alongside a warning

    The offending extension is available as the `extension` attribute.

    Ref: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j
    """
    # Lowercase extensions that `sanitize_extension` accepts
    ALLOWED_EXTENSIONS = frozenset([
        # internal
        'description',
        'json',
        'meta',
        'orig',
        'part',
        'temp',
        'uncut',
        'unknown_video',
        'ytdl',

        # video
        *MEDIA_EXTENSIONS.video,
        'asx',
        'ismv',
        'm2t',
        'm2ts',
        'm2v',
        'm4s',
        'mng',
        'mp2v',
        'mp4v',
        'mpe',
        'mpeg',
        'mpeg1',
        'mpeg2',
        'mpeg4',
        'mxf',
        'ogm',
        'qt',
        'rm',
        'swf',
        'ts',
        'vob',
        'vp9',

        # audio
        *MEDIA_EXTENSIONS.audio,
        '3ga',
        'ac3',
        'adts',
        'aif',
        'au',
        'dts',
        'isma',
        'it',
        'mid',
        'mod',
        'mpga',
        'mp1',
        'mp2',
        'mp4a',
        'mpa',
        'ra',
        'shn',
        'xm',

        # image
        *MEDIA_EXTENSIONS.thumbnails,
        'avif',
        'bmp',
        'gif',
        'heic',
        'ico',
        'image',
        'jng',
        'jpeg',
        'jxl',
        'svg',
        'tif',
        'tiff',
        'wbmp',

        # subtitle
        *MEDIA_EXTENSIONS.subtitles,
        'dfxp',
        'fs',
        'ismt',
        'json3',
        'sami',
        'scc',
        'srv1',
        'srv2',
        'srv3',
        'ssa',
        'tt',
        'ttml',
        'xml',

        # others
        *MEDIA_EXTENSIONS.manifests,
        *MEDIA_EXTENSIONS.storyboards,
        'desktop',
        'ism',
        'm3u',
        'sbv',
        'url',
        'webloc',
    ])

    def __init__(self, extension, /):
        super().__init__(f'unsafe file extension: {extension!r}')
        self.extension = extension

    @classmethod
    def sanitize_extension(cls, extension, /, *, prepend=False):
        """Validate `extension` and return it (possibly replaced).

        @param extension  Extension to check; None is passed through unchanged
        @param prepend    When true, only the path separator check is applied
        @returns          The extension, or 'unknown_video' if its last
                          dot-separated component is 'bin'
        @raises           This exception class if the extension contains a path
                          separator or its last component is not allowed
        """
        if extension is None:
            return None

        # Path separators would let the extension escape the intended filename
        if '/' in extension or '\\' in extension:
            raise cls(extension)

        if not prepend:
            # Only the part after the final dot is checked against the allow-list
            _, _, last = extension.rpartition('.')
            if last == 'bin':
                extension = last = 'unknown_video'
            # The allow-list is lowercase, so the comparison ignores case
            if last.lower() not in cls.ALLOWED_EXTENSIONS:
                raise cls(extension)

        return extension
2022-08-01 22:13:18 +02:00
class RetryManager:
    """Iterable that drives a retry loop.

    Each iteration yields the manager itself. Setting `retry.error` inside the
    loop body marks the attempt as failed, which triggers the error callback
    and (while attempts remain) another iteration.

    Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    # `_error` is None before the first attempt, NO_DEFAULT while an attempt has
    # not reported an error, and the reported error otherwise
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        """
        @param _retries         Number of retries; a falsy value means 0
        @param _error_callback  Called as callback(error, attempt, retries, **kwargs)
                                after each failed attempt
        """
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # Stop once an attempt finished without an error (NO_DEFAULT still set)
        # or once the attempt counter exceeds the allowed retries
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        """The error reported for the current attempt, or None if there is none"""
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # Reset before handing control to the loop body
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            # Resumed after the loop body ran; report the error it set, if any
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries

        @param e           The error that caused the retry
        @param count       Number of the attempt that just failed
        @param retries     Maximum number of retries
        @param sleep_func  Delay in seconds, or a callable invoked as sleep_func(n=count - 1)
        @param info        Callable used for the "Sleeping" message
        @param warn        Callable used for the "Retrying" message
        @param error       Callable used once retries are exhausted; if not given, `e` is raised
        @param suffix      Optional text appended after "Retrying"
        """
        if count > retries:
            # Out of retries: report through `error` or re-raise
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
2022-08-02 00:10:47 +02:00
def make_archive_id(ie, video_id):
    """Build a download-archive entry: the lowercased extractor key, a space, then the video id.

    `ie` may be the extractor key itself (str) or an object providing `ie_key()`.
    """
    if isinstance(ie, str):
        ie_key = ie
    else:
        ie_key = ie.ie_key()
    return f'{ie_key.lower()} {video_id}'
2022-08-12 15:23:53 +02:00
def truncate_string(s, left, right=0):
    """Shorten `s` to `left + right` characters, marking the cut with "...".

    The first `left - 3` characters and the last `right` characters are kept.
    `s` is returned unchanged when it is None or already short enough.
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    # `s[-0:]` would be the whole string, hence the explicit empty tail
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
2022-08-12 15:23:53 +02:00
2022-08-24 04:08:55 +02:00
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Resolve a list of option values into an ordered set of items.

    @param options     Values to apply in order; a leading "-" removes instead of adds
    @param alias_dict  Maps alias names to lists of values; must contain "all",
                       which lists every valid item
    @param use_regex   Treat non-alias values as case-insensitive regexes that
                       must fully match items of alias_dict["all"]
    @param start       Items to begin with
    @returns           The result of `orderedSet` on the collected items
    @raises            ValueError for a value that is neither an alias nor
                       (without `use_regex`) an item of alias_dict["all"]
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for val in options:
        discard = val.startswith('-')
        if discard:
            val = val[1:]

        if val in alias_dict:
            # A discarded alias flips the sign of each of its members
            val = alias_dict[val] if not discard else [
                i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(val, alias_dict, start=requested)
            continue

        current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
                   else [val] if val in alias_dict['all'] else None)
        if current is None:
            raise ValueError(val)

        if discard:
            # Remove every occurrence, since `requested` may hold duplicates
            for item in current:
                while item in requested:
                    requested.remove(item)
        else:
            requested.extend(current)

    return orderedSet(requested)
2023-06-19 10:36:39 +02:00
# TODO: Rewrite
2022-11-17 06:33:20 +01:00
class FormatSorter:
    """Compute sort keys for format dicts from a list of sort-field specifications.

    A specification has the form `[+]field[(:|~)limit]` (see `regex`):
    a leading "+" sets the "reverse" flag, ":" attaches a limit and "~" attaches
    a limit with the "closest" flag.

    `settings` describes every known sort field. The properties read by this
    class are:
        type         'field' (default), 'ordered', 'boolean', 'extractor',
                     'multiple', 'combined' or 'alias'
        field        key(s) of the format dict / name(s) of other sort fields
        convert      how a limit/value is converted: 'order', 'string',
                     'float_none', 'bytes', 'ignore' or numeric-if-possible
        order, order_free, regex
                     ranking list(s) for 'ordered' fields
        function     reducer applied to the values of a 'multiple' field
        in_list, not_in_list
                     membership tests for 'boolean' fields
        max, default, visible, forced, priority, deprecated, same_limit

    The class-level `settings` is only a template: every instance works on its
    own copy, since parsing the specifications stores per-sorter state
    (limits, reverse flags, cached defaults) in that table.
    """

    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.0?2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: min(filter(None, it), default=0)},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        """
        @param ydl               Object providing `params`, `deprecated_feature()`
                                 and `write_debug()`
        @param field_preference  Sort specifications given by the extractor
        """
        self.ydl = ydl
        self._order = []
        # Work on a private copy of the table: evaluate_params() and the lookups
        # below write limits, flags and cached defaults into it, and that state
        # must neither leak into the class nor be clobbered by another sorter
        self.settings = {name: dict(props) for name, props in type(self).settings.items()}
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Return property `key` of sort field `field`, caching a default if unset.

        Unknown fields yield False for 'forced'/'priority'; for any other key
        they are registered as an (empty) field after a deprecation notice.
        """
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        prop_obj = self.settings[field]
        if key not in prop_obj:
            type_ = prop_obj.get('type')
            if key == 'field':
                default = 'preference' if type_ == 'extractor' else (field,) if type_ in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type_ == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key)
            prop_obj[key] = default
        return prop_obj[key]

    def _resolve_field_value(self, field, value, convert_none=False):
        """Convert a raw limit/value of `field` according to its 'convert' setting.

        @param value         String to convert, or None
        @param convert_none  When false, None is returned as-is without conversion
        @returns             The converted value; for 'order' conversion this is a
                             rank where entries earlier in the order list rank higher
        """
        if value is None:
            if not convert_none:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            # Values not found in the list are ranked at the position of the '' entry
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or  value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            if value.isnumeric():
                return float(value)
            else:
                # A non-numeric limit switches the field to string comparison
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Parse all sort specifications and populate the final field order.

        The specifications are processed in this order: forced defaults,
        priority defaults (unless params['format_sort_force']), the user's
        params['format_sort'], `sort_extractor`, then all of `default`.
        The first occurrence of a field wins.

        @raises ExtractorError  when a specification does not match `regex`
        """
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
                else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError(f'Invalid format sort string "{item}" given by extractor')
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # A 'combined' field expands to its member fields; its limit may be
            # a ":"-separated list with one entry per member
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Report the user's, the extractor's and the effective sort order via `write_debug`."""
        if self._sort_user:
            write_debug('Sort order given by user: {}'.format(', '.join(self._sort_user)))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: {}'.format(', '.join(self._sort_extractor)))
        write_debug('Formats sorted by: {}'.format(', '.join(['{}{}{}'.format(
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '{}{}({})'.format('~' if self._get_field_setting(field, 'closest') else ':',
                              self._get_field_setting(field, 'limit_text'),
                              self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')])))

    def _calculate_field_preference_from_value(self, format_, field, type_, value):
        """Turn the raw `value` of `field` into a tuple used as one sort key component.

        `format_` is accepted but not used here.
        """
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type_ == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type_ == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type_ == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format_, field):
        """Fetch the value of sort field `field` from `format_` and convert it to a sort key component."""
        type_ = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format_.get(self._get_field_setting(f, 'field'))
        if type_ == 'multiple':
            type_ = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format_, field, type_, value)

    @staticmethod
    def _fill_sorting_fields(format):
        """Fill in-place the keys of `format` that sorting relies on.

        Sets 'protocol' and 'ext' when missing, always sets 'video_ext' and
        'audio_ext', may set 'preference', and derives missing 'vbr', 'abr'
        and 'tbr' from one another.
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url']).lower()
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

    def calculate_preference(self, format):
        """Return the sort key of `format`: one component per field in the resolved order.

        Note that `format` is modified in-place by _fill_sorting_fields().
        """
        self._fill_sorting_fields(format)
        return tuple(self._calculate_field_preference(format, field) for field in self._order)
2023-07-15 08:11:08 +02:00
2024-04-01 01:17:24 +02:00
def filesize_from_tbr(tbr, duration):
    """
    Estimate a file size from its bitrate and duration

    @param tbr:      Total bitrate in kbps (1000 bits/sec)
    @param duration: Duration in seconds
    @returns         Filesize in bytes, or None if either input is None
    """
    if tbr is not None and duration is not None:
        # kbps -> bytes per second is a factor of 1000 / 8
        return int(duration * tbr * (1000 / 8))
    return None
2023-07-15 08:11:08 +02:00
# XXX: Temporary
class _YDLLogger :
def __init__ ( self , ydl = None ) :
self . _ydl = ydl
def debug ( self , message ) :
if self . _ydl :
self . _ydl . write_debug ( message )
def info ( self , message ) :
if self . _ydl :
self . _ydl . to_screen ( message )
def warning ( self , message , * , once = False ) :
if self . _ydl :
2023-07-09 09:53:02 +02:00
self . _ydl . report_warning ( message , once )
2023-07-15 08:11:08 +02:00
def error ( self , message , * , is_error = True ) :
if self . _ydl :
self . _ydl . report_error ( message , is_error = is_error )
def stdout ( self , message ) :
if self . _ydl :
self . _ydl . to_stdout ( message )
def stderr ( self , message ) :
if self . _ydl :
self . _ydl . to_stderr ( message )