Parse metadata from multiple fields

Closes #196
2024-12-12 14:26:49 +01:00 · 2021-03-25 03:32:15 +05:30 · 2021-03-25 03:32:15 +05:30 · 143db31d48
commit 143db31d48
parent 3700c7ef10
5 changed files with 143 additions and 115 deletions
--- a/README.md
+++ b/README.md
@ -670,18 +670,24 @@ ## Post-Processing Options:
    --add-metadata                   Write metadata to the video file
    --no-add-metadata                Do not write metadata (default)
    --parse-metadata FIELD:FORMAT    Parse additional metadata like title/artist
-                                     from other fields. Give field name to
-                                     extract data from, and format of the field
-                                     seperated by a ":". Either regular
-                                     expression with named capture groups or a
-                                     similar syntax to the output template can
-                                     also be used. The parsed parameters replace
-                                     any existing values and can be use in
-                                     output template. This option can be used
-                                     multiple times. Example: --parse-metadata
-                                     "title:%(artist)s - %(title)s" matches a
-                                     title like "Coldplay - Paradise". Example
-                                     (regex): --parse-metadata
+                                     from other fields. Give a template or field
+                                     name to extract data from and the format to
+                                     interpret it as, separated by a ":". Either
+                                     regular expression with named capture
+                                     groups or a similar syntax to the output
+                                     template can be used for the FORMAT.
+                                     Similarly, the syntax for output template
+                                     can be used for FIELD to parse the data
+                                     from multiple fields. The parsed parameters
+                                     replace any existing values and can be used
+                                     in output templates. This option can be
+                                     used multiple times. Example: --parse-
+                                     metadata "title:%(artist)s - %(title)s"
+                                     matches a title like "Coldplay - Paradise".
+                                     Example: --parse-metadata "%(series)s
+                                     %(episode_number)s:%(title)s" sets the
+                                     title using series and episode number.
+                                     Example (regex): --parse-metadata
                                     "description:Artist - (?P<artist>.+?)"
    --xattrs                         Write metadata to the video file's xattrs
                                     (using dublin core and xdg standards)
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@ -67,6 +67,7 @@
    float_or_none,
    format_bytes,
    format_field,
+    FORMAT_RE,
    formatSeconds,
    GeoRestrictedError,
    int_or_none,
@ -772,95 +773,93 @@ def parse_outtmpl(self):
                    'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')
        return outtmpl_dict

+    def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
+        """ Make the template and info_dict suitable for substitution (outtmpl % info_dict)"""
+        template_dict = dict(info_dict)
+
+        # duration_string
+        template_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
+            formatSeconds(info_dict['duration'], '-')
+            if info_dict.get('duration', None) is not None
+            else None)
+
+        # epoch
+        template_dict['epoch'] = int(time.time())
+
+        # autonumber
+        autonumber_size = self.params.get('autonumber_size')
+        if autonumber_size is None:
+            autonumber_size = 5
+        template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
+
+        # resolution if not defined
+        if template_dict.get('resolution') is None:
+            if template_dict.get('width') and template_dict.get('height'):
+                template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
+            elif template_dict.get('height'):
+                template_dict['resolution'] = '%sp' % template_dict['height']
+            elif template_dict.get('width'):
+                template_dict['resolution'] = '%dx?' % template_dict['width']
+
+        if sanitize is None:
+            sanitize = lambda k, v: v
+        template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
+                             for k, v in template_dict.items()
+                             if v is not None and not isinstance(v, (list, tuple, dict)))
+        na = self.params.get('outtmpl_na_placeholder', 'NA')
+        template_dict = collections.defaultdict(lambda: na, template_dict)
+
+        # For fields playlist_index and autonumber convert all occurrences
+        # of %(field)s to %(field)0Nd for backward compatibility
+        field_size_compat_map = {
+            'playlist_index': len(str(template_dict['n_entries'])),
+            'autonumber': autonumber_size,
+        }
+        FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
+        mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
+        if mobj:
+            outtmpl = re.sub(
+                FIELD_SIZE_COMPAT_RE,
+                r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
+                outtmpl)
+
+        numeric_fields = list(self._NUMERIC_FIELDS)
+
+        # Format date
+        FORMAT_DATE_RE = FORMAT_RE.format(r'(?P<key>(?P<field>\w+)>(?P<format>.+?))')
+        for mobj in re.finditer(FORMAT_DATE_RE, outtmpl):
+            conv_type, field, frmt, key = mobj.group('type', 'field', 'format', 'key')
+            if key in template_dict:
+                continue
+            value = strftime_or_none(template_dict.get(field), frmt, na)
+            if conv_type in 'crs':  # string
+                value = sanitize(field, value)
+            else:  # number
+                numeric_fields.append(key)
+                value = float_or_none(value, default=None)
+            if value is not None:
+                template_dict[key] = value
+
+        # Missing numeric fields used together with integer presentation types
+        # in format specification will break the argument substitution since
+        # string NA placeholder is returned for missing fields. We will patch
+        # output template for missing fields to meet string presentation type.
+        for numeric_field in numeric_fields:
+            if numeric_field not in template_dict:
+                outtmpl = re.sub(
+                    FORMAT_RE.format(re.escape(numeric_field)),
+                    r'%({0})s'.format(numeric_field), outtmpl)
+
+        return outtmpl, template_dict
+
    def _prepare_filename(self, info_dict, tmpl_type='default'):
        try:
-            template_dict = dict(info_dict)
-
-            template_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
-                formatSeconds(info_dict['duration'], '-')
-                if info_dict.get('duration', None) is not None
-                else None)
-
-            template_dict['epoch'] = int(time.time())
-            autonumber_size = self.params.get('autonumber_size')
-            if autonumber_size is None:
-                autonumber_size = 5
-            template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
-            if template_dict.get('resolution') is None:
-                if template_dict.get('width') and template_dict.get('height'):
-                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
-                elif template_dict.get('height'):
-                    template_dict['resolution'] = '%sp' % template_dict['height']
-                elif template_dict.get('width'):
-                    template_dict['resolution'] = '%dx?' % template_dict['width']
-
            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id' or k.endswith('_id')))
-            template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
-                                 for k, v in template_dict.items()
-                                 if v is not None and not isinstance(v, (list, tuple, dict)))
-            na = self.params.get('outtmpl_na_placeholder', 'NA')
-            template_dict = collections.defaultdict(lambda: na, template_dict)
-
            outtmpl = self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])
-            force_ext = OUTTMPL_TYPES.get(tmpl_type)
-
-            # For fields playlist_index and autonumber convert all occurrences
-            # of %(field)s to %(field)0Nd for backward compatibility
-            field_size_compat_map = {
-                'playlist_index': len(str(template_dict['n_entries'])),
-                'autonumber': autonumber_size,
-            }
-            FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
-            mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
-            if mobj:
-                outtmpl = re.sub(
-                    FIELD_SIZE_COMPAT_RE,
-                    r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
-                    outtmpl)
-
-            # As of [1] format syntax is:
-            #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
-            # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
-            FORMAT_RE = r'''(?x)
-                (?<!%)
-                %
-                \({0}\)  # mapping key
-                (?:[#0\-+ ]+)?  # conversion flags (optional)
-                (?:\d+)?  # minimum field width (optional)
-                (?:\.\d+)?  # precision (optional)
-                [hlL]?  # length modifier (optional)
-                (?P<type>[diouxXeEfFgGcrs%])  # conversion type
-            '''
-
-            numeric_fields = list(self._NUMERIC_FIELDS)
-
-            # Format date
-            FORMAT_DATE_RE = FORMAT_RE.format(r'(?P<key>(?P<field>\w+)>(?P<format>.+?))')
-            for mobj in re.finditer(FORMAT_DATE_RE, outtmpl):
-                conv_type, field, frmt, key = mobj.group('type', 'field', 'format', 'key')
-                if key in template_dict:
-                    continue
-                value = strftime_or_none(template_dict.get(field), frmt, na)
-                if conv_type in 'crs':  # string
-                    value = sanitize(field, value)
-                else:  # number
-                    numeric_fields.append(key)
-                    value = float_or_none(value, default=None)
-                if value is not None:
-                    template_dict[key] = value
-
-            # Missing numeric fields used together with integer presentation types
-            # in format specification will break the argument substitution since
-            # string NA placeholder is returned for missing fields. We will patch
-            # output template for missing fields to meet string presentation type.
-            for numeric_field in numeric_fields:
-                if numeric_field not in template_dict:
-                    outtmpl = re.sub(
-                        FORMAT_RE.format(re.escape(numeric_field)),
-                        r'%({0})s'.format(numeric_field), outtmpl)
+            outtmpl, template_dict = self.prepare_outtmpl(outtmpl, info_dict, sanitize)

            # expand_path translates '%%' into '%' and '$$' into '$'
            # correspondingly that is not what we want since we need to keep
@ -875,6 +874,7 @@ def _prepare_filename(self, info_dict, tmpl_type='default'):
            # title "Hello $PATH", we don't want `$PATH` to be expanded.
            filename = expand_path(outtmpl).replace(sep, '') % template_dict

+            force_ext = OUTTMPL_TYPES.get(tmpl_type)
            if force_ext is not None:
                filename = replace_extension(filename, force_ext, template_dict.get('ext'))

--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@ -1147,13 +1147,18 @@ def _dict_from_multiple_values_options_callback(
        metavar='FIELD:FORMAT', dest='metafromfield', action='append',
        help=(
            'Parse additional metadata like title/artist from other fields. '
-            'Give field name to extract data from, and format of the field seperated by a ":". '
+            'Give a template or field name to extract data from and the '
+            'format to interpret it as, seperated by a ":". '
            'Either regular expression with named capture groups or a '
-            'similar syntax to the output template can also be used. '
-            'The parsed parameters replace any existing values and can be use in output template. '
+            'similar syntax to the output template can be used for the FORMAT. '
+            'Similarly, the syntax for output template can be used for FIELD '
+            'to parse the data from multiple fields. '
+            'The parsed parameters replace any existing values and can be used in output templates. '
            'This option can be used multiple times. '
            'Example: --parse-metadata "title:%(artist)s - %(title)s" matches a title like '
            '"Coldplay - Paradise". '
+            'Example: --parse-metadata "%(series)s %(episode_number)s:%(title)s" '
+            'sets the title using series and episode number. '
            'Example (regex): --parse-metadata "description:Artist - (?P<artist>.+?)"'))
    postproc.add_option(
        '--xattrs',
--- a/yt_dlp/postprocessor/metadatafromfield.py
+++ b/yt_dlp/postprocessor/metadatafromfield.py
@ -8,7 +8,7 @@


 class MetadataFromFieldPP(PostProcessor):
-    regex = r'(?P<field>\w+):(?P<format>.+)$'
+    regex = r'(?P<in>.+):(?P<out>.+)$'

    def __init__(self, downloader, formats):
        PostProcessor.__init__(self, downloader)
@ -19,11 +19,20 @@ def __init__(self, downloader, formats):
            match = re.match(self.regex, f)
            assert match is not None
            self._data.append({
-                'field': match.group('field'),
-                'format': match.group('format'),
-                'regex': self.format_to_regex(match.group('format'))})
+                'in': match.group('in'),
+                'out': match.group('out'),
+                'tmpl': self.field_to_template(match.group('in')),
+                'regex': self.format_to_regex(match.group('out')),
+            })

-    def format_to_regex(self, fmt):
+    @staticmethod
+    def field_to_template(tmpl):
+        if re.match(r'\w+$', tmpl):
+            return '%%(%s)s' % tmpl
+        return tmpl
+
+    @staticmethod
+    def format_to_regex(fmt):
        r"""
        Converts a string like
           '%(title)s - %(artist)s'
@ -37,7 +46,7 @@ def format_to_regex(self, fmt):
        # replace %(..)s with regex group and escape other string parts
        for match in re.finditer(r'%\((\w+)\)s', fmt):
            regex += re.escape(fmt[lastpos:match.start()])
-            regex += r'(?P<' + match.group(1) + r'>[^\r\n]+)'
+            regex += r'(?P<%s>[^\r\n]+)' % match.group(1)
            lastpos = match.end()
        if lastpos < len(fmt):
            regex += re.escape(fmt[lastpos:])
@ -45,22 +54,16 @@ def format_to_regex(self, fmt):

    def run(self, info):
        for dictn in self._data:
-            field, regex = dictn['field'], dictn['regex']
-            if field not in info:
-                self.report_warning('Video doesnot have a %s' % field)
-                continue
-            data_to_parse = str_or_none(info[field])
-            if data_to_parse is None:
-                self.report_warning('Field %s cannot be parsed' % field)
-                continue
-            self.write_debug('Searching for r"%s" in %s' % (regex, field))
-            match = re.search(regex, data_to_parse)
+            tmpl, info_copy = self._downloader.prepare_outtmpl(dictn['tmpl'], info)
+            data_to_parse = tmpl % info_copy
+            self.write_debug('Searching for r"%s" in %s' % (dictn['regex'], tmpl))
+            match = re.search(dictn['regex'], data_to_parse)
            if match is None:
-                self.report_warning('Could not interpret video %s as "%s"' % (field, dictn['format']))
+                self.report_warning('Could not interpret video %s as "%s"' % (dictn['in'], dictn['out']))
                continue
            for attribute, value in match.groupdict().items():
                info[attribute] = value
-                self.to_screen('parsed %s from %s: %s' % (attribute, field, value if value is not None else 'NA'))
+                self.to_screen('parsed %s from "%s": %s' % (attribute, dictn['in'], value if value is not None else 'NA'))
        return [], info


--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@ -4205,6 +4205,20 @@ def q(qid):
    'pl_infojson': 'info.json',
 }

+# As of [1] format syntax is:
+#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
+# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
+FORMAT_RE = r'''(?x)
+    (?<!%)
+    %
+    \({0}\)  # mapping key
+    (?:[#0\-+ ]+)?  # conversion flags (optional)
+    (?:\d+)?  # minimum field width (optional)
+    (?:\.\d+)?  # precision (optional)
+    [hlL]?  # length modifier (optional)
+    (?P<type>[diouxXeEfFgGcrs%])  # conversion type
+'''
+

 def limit_length(s, length):
    """ Add ellipses to overly long strings """