From e9f4ccd19eb92621970b518fb5984b8aef52bdc8 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 10 Aug 2021 01:22:55 +0530 Subject: [PATCH] Add option `--replace-in-metadata` --- README.md | 13 ++- test/test_postprocessors.py | 31 +++--- yt_dlp/YoutubeDL.py | 2 +- yt_dlp/__init__.py | 41 ++++++-- yt_dlp/options.py | 6 +- yt_dlp/postprocessor/__init__.py | 8 +- yt_dlp/postprocessor/metadatafromfield.py | 74 -------------- yt_dlp/postprocessor/metadataparser.py | 117 ++++++++++++++++++++++ 8 files changed, 186 insertions(+), 106 deletions(-) delete mode 100644 yt_dlp/postprocessor/metadatafromfield.py create mode 100644 yt_dlp/postprocessor/metadataparser.py diff --git a/README.md b/README.md index be4323771b..493f437d1b 100644 --- a/README.md +++ b/README.md @@ -777,6 +777,10 @@ ## Post-Processing Options: --parse-metadata FROM:TO Parse additional metadata like title/artist from other fields; see "MODIFYING METADATA" for details + --replace-in-metadata FIELDS REGEX REPLACE + Replace text in a metadata field using the + given regex. This option can be used + multiple times --xattrs Write metadata to the video file's xattrs (using dublin core and xdg standards) --fixup POLICY Automatically correct known faults of the @@ -1333,7 +1337,11 @@ # preferring better codec and then larger total bitrate for the same resolution # MODIFYING METADATA -The metadata obtained the the extractors can be modified by using `--parse-metadata FROM:TO`. The general syntax is to give the name of a field or a template (with similar syntax to [output template](#output-template)) to extract data from, and the format to interpret it as, separated by a colon `:`. Either a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax) with named capture groups or a similar syntax to the [output template](#output-template) (only `%(field)s` formatting is supported) can be used for `TO`. The option can be used multiple times to parse and modify various fields. 
+The metadata obtained by the extractors can be modified by using `--parse-metadata` and `--replace-in-metadata` + +`--replace-in-metadata FIELDS REGEX REPLACE` is used to replace text in any metadata field using [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax). [Backreferences](https://docs.python.org/3/library/re.html?highlight=backreferences#re.sub) can be used in the replace string for advanced use. + +The general syntax of `--parse-metadata FROM:TO` is to give the name of a field or a template (with same syntax as [output template](#output-template)) to extract data from, and the format to interpret it as, separated by a colon `:`. Either a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax) with named capture groups or a similar syntax to the [output template](#output-template) (only `%(field)s` formatting is supported) can be used for `TO`. The option can be used multiple times to parse and modify various fields. Note that any field created by this can be used in the [output template](#output-template) and will also affect the media file's metadata added when using `--add-metadata`.
@@ -1380,6 +1388,9 @@ # Set title as "Series name S01E05" # Set "comment" field in video metadata using description instead of webpage_url $ yt-dlp --parse-metadata 'description:(?s)(?P<meta_comment>.+)' --add-metadata +# Replace all spaces and "_" in title and uploader with a `-` +$ yt-dlp --replace-in-metadata 'title,uploader' '[ _]' '-' + ``` # EXTRACTOR ARGUMENTS diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py index bdc2d93cb6..320e69e887 100644 --- a/test/test_postprocessors.py +++ b/test/test_postprocessors.py @@ -14,29 +14,28 @@ ExecAfterDownloadPP, FFmpegThumbnailsConvertorPP, MetadataFromFieldPP, - MetadataFromTitlePP, + MetadataParserPP, ) class TestMetadataFromField(unittest.TestCase): + def test_format_to_regex(self): - pp = MetadataFromFieldPP(None, ['title:%(title)s - %(artist)s']) - self.assertEqual(pp._data[0]['regex'], r'(?P<title>.+)\ \-\ (?P<artist>.+)') + self.assertEqual( + MetadataParserPP.format_to_regex('%(title)s - %(artist)s'), + r'(?P<title>.+)\ \-\ (?P<artist>.+)') + self.assertEqual(MetadataParserPP.format_to_regex(r'(?P<x>.+)'), r'(?P<x>.+)') - def test_field_to_outtmpl(self): - pp = MetadataFromFieldPP(None, ['title:%(title)s : %(artist)s']) - self.assertEqual(pp._data[0]['tmpl'], '%(title)s') + def test_field_to_template(self): + self.assertEqual(MetadataParserPP.field_to_template('title'), '%(title)s') + self.assertEqual(MetadataParserPP.field_to_template('1'), '1') + self.assertEqual(MetadataParserPP.field_to_template('foo bar'), 'foo bar') + self.assertEqual(MetadataParserPP.field_to_template(' literal'), ' literal') - def test_in_out_seperation(self): - pp = MetadataFromFieldPP(None, ['%(title)s \\: %(artist)s:%(title)s : %(artist)s']) - self.assertEqual(pp._data[0]['in'], '%(title)s : %(artist)s') - self.assertEqual(pp._data[0]['out'], '%(title)s : %(artist)s') - - -class TestMetadataFromTitle(unittest.TestCase): - def test_format_to_regex(self): - pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s') -
self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)') + def test_metadatafromfield(self): + self.assertEqual( + MetadataFromFieldPP.to_action('%(title)s \\: %(artist)s:%(title)s : %(artist)s'), + (MetadataParserPP.Actions.INTERPRET, '%(title)s : %(artist)s', '%(title)s : %(artist)s')) class TestConvertThumbnail(unittest.TestCase): diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 2791d180ac..72d9f2c336 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1281,7 +1281,7 @@ def process_ie_result(self, ie_result, download=True, extra_info={}): ie_result = self.process_video_result(ie_result, download=download) additional_urls = (ie_result or {}).get('additional_urls') if additional_urls: - # TODO: Improve MetadataFromFieldPP to allow setting a list + # TODO: Improve MetadataParserPP to allow setting a list if isinstance(additional_urls, compat_str): additional_urls = [additional_urls] self.to_screen( diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 7d08120790..73e3f9f786 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -7,6 +7,7 @@ import codecs import io +import itertools import os import random import re @@ -18,6 +19,7 @@ ) from .compat import ( compat_getpass, + compat_shlex_quote, workaround_optparse_bug9161, ) from .cookies import SUPPORTED_BROWSERS @@ -46,14 +48,15 @@ from .extractor import gen_extractors, list_extractors from .extractor.common import InfoExtractor from .extractor.adobepass import MSO_INFO -from .postprocessor.ffmpeg import ( +from .postprocessor import ( FFmpegExtractAudioPP, FFmpegSubtitlesConvertorPP, FFmpegThumbnailsConvertorPP, FFmpegVideoConvertorPP, FFmpegVideoRemuxerPP, + MetadataFromFieldPP, + MetadataParserPP, ) -from .postprocessor.metadatafromfield import MetadataFromFieldPP from .YoutubeDL import YoutubeDL @@ -344,13 +347,29 @@ def validate_outtmpl(tmpl, msg): if re.match(InfoExtractor.FormatSort.regex, f) is None: parser.error('invalid format sort string "%s" specified' % 
f) - if opts.metafromfield is None: - opts.metafromfield = [] + def metadataparser_actions(f): + if isinstance(f, str): + cmd = '--parse-metadata %s' % compat_shlex_quote(f) + try: + actions = [MetadataFromFieldPP.to_action(f)] + except Exception as err: + parser.error(f'{cmd} is invalid; {err}') + else: + cmd = '--replace-in-metadata %s' % ' '.join(map(compat_shlex_quote, f)) + actions = ((MetadataParserPP.Actions.REPLACE, x, *f[1:]) for x in f[0].split(',')) + + for action in actions: + try: + MetadataParserPP.validate_action(*action) + except Exception as err: + parser.error(f'{cmd} is invalid; {err}') + yield action + + if opts.parse_metadata is None: + opts.parse_metadata = [] if opts.metafromtitle is not None: - opts.metafromfield.append('title:%s' % opts.metafromtitle) - for f in opts.metafromfield: - if re.match(MetadataFromFieldPP.regex, f) is None: - parser.error('invalid format string "%s" specified for --parse-metadata' % f) + opts.parse_metadata.append('title:%s' % opts.metafromtitle) + opts.parse_metadata = list(itertools.chain(*map(metadataparser_actions, opts.parse_metadata))) any_getting = opts.forceprint or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json any_printing = opts.print_json @@ -402,10 +421,10 @@ def report_conflict(arg1, arg2): # PostProcessors postprocessors = [] - if opts.metafromfield: + if opts.parse_metadata: postprocessors.append({ - 'key': 'MetadataFromField', - 'formats': opts.metafromfield, + 'key': 'MetadataParser', + 'actions': opts.parse_metadata, # Run this immediately after extraction is complete 'when': 'pre_process' }) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index aef2f81434..f8cfdeb126 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1241,10 +1241,14 @@ def _dict_from_options_callback( help=optparse.SUPPRESS_HELP) postproc.add_option( '--parse-metadata', - 
metavar='FROM:TO', dest='metafromfield', action='append', + metavar='FROM:TO', dest='parse_metadata', action='append', help=( 'Parse additional metadata like title/artist from other fields; ' 'see "MODIFYING METADATA" for details')) + postproc.add_option( + '--replace-in-metadata', + dest='parse_metadata', metavar='FIELDS REGEX REPLACE', action='append', nargs=3, + help='Replace text in a metadata field using the given regex. This option can be used multiple times') postproc.add_option( '--xattrs', action='store_true', dest='xattrs', default=False, diff --git a/yt_dlp/postprocessor/__init__.py b/yt_dlp/postprocessor/__init__.py index 98cbe86657..b1a6917d70 100644 --- a/yt_dlp/postprocessor/__init__.py +++ b/yt_dlp/postprocessor/__init__.py @@ -20,8 +20,11 @@ ) from .xattrpp import XAttrMetadataPP from .execafterdownload import ExecAfterDownloadPP -from .metadatafromfield import MetadataFromFieldPP -from .metadatafromfield import MetadataFromTitlePP +from .metadataparser import ( + MetadataFromFieldPP, + MetadataFromTitlePP, + MetadataParserPP, +) from .movefilesafterdownload import MoveFilesAfterDownloadPP from .sponskrub import SponSkrubPP @@ -48,6 +51,7 @@ def get_postprocessor(key): 'FFmpegThumbnailsConvertorPP', 'FFmpegVideoConvertorPP', 'FFmpegVideoRemuxerPP', + 'MetadataParserPP', 'MetadataFromFieldPP', 'MetadataFromTitlePP', 'MoveFilesAfterDownloadPP', diff --git a/yt_dlp/postprocessor/metadatafromfield.py b/yt_dlp/postprocessor/metadatafromfield.py deleted file mode 100644 index 0027947650..0000000000 --- a/yt_dlp/postprocessor/metadatafromfield.py +++ /dev/null @@ -1,74 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import PostProcessor -from ..compat import compat_str - - -class MetadataFromFieldPP(PostProcessor): - regex = r'(?P<in>.*?)(?<!\\):(?P<out>.+)$' - - def __init__(self, downloader, formats): - PostProcessor.__init__(self, downloader) - assert isinstance(formats, (list, tuple)) - self._data = [] - for f in formats: 
- assert isinstance(f, compat_str) - match = re.match(self.regex, f) - assert match is not None - inp = match.group('in').replace('\\:', ':') - self._data.append({ - 'in': inp, - 'out': match.group('out'), - 'tmpl': self.field_to_template(inp), - 'regex': self.format_to_regex(match.group('out')), - }) - - @staticmethod - def field_to_template(tmpl): - if re.match(r'[a-zA-Z_]+$', tmpl): - return '%%(%s)s' % tmpl - return tmpl - - @staticmethod - def format_to_regex(fmt): - r""" - Converts a string like - '%(title)s - %(artist)s' - to a regex like - '(?P<title>.+)\ \-\ (?P<artist>.+)' - """ - if not re.search(r'%\(\w+\)s', fmt): - return fmt - lastpos = 0 - regex = '' - # replace %(..)s with regex group and escape other string parts - for match in re.finditer(r'%\((\w+)\)s', fmt): - regex += re.escape(fmt[lastpos:match.start()]) - regex += r'(?P<%s>.+)' % match.group(1) - lastpos = match.end() - if lastpos < len(fmt): - regex += re.escape(fmt[lastpos:]) - return regex - - def run(self, info): - for dictn in self._data: - tmpl, tmpl_dict = self._downloader.prepare_outtmpl(dictn['tmpl'], info) - data_to_parse = self._downloader.escape_outtmpl(tmpl) % tmpl_dict - self.write_debug('Searching for r"%s" in %s' % (dictn['regex'], dictn['tmpl'])) - match = re.search(dictn['regex'], data_to_parse) - if match is None: - self.report_warning('Could not interpret video %s as "%s"' % (dictn['in'], dictn['out'])) - continue - for attribute, value in match.groupdict().items(): - info[attribute] = value - self.to_screen('parsed %s from "%s": %s' % (attribute, dictn['tmpl'], value if value is not None else 'NA')) - return [], info - - -class MetadataFromTitlePP(MetadataFromFieldPP): # for backward compatibility - def __init__(self, downloader, titleformat): - super(MetadataFromTitlePP, self).__init__(downloader, ['%%(title)s:%s' % titleformat]) - self._titleformat = titleformat - self._titleregex = self._data[0]['regex'] diff --git a/yt_dlp/postprocessor/metadataparser.py 
b/yt_dlp/postprocessor/metadataparser.py new file mode 100644 index 0000000000..4d3c0e0edf --- /dev/null +++ b/yt_dlp/postprocessor/metadataparser.py @@ -0,0 +1,117 @@ +import re + +from enum import Enum + +from .common import PostProcessor + + +class MetadataParserPP(PostProcessor): + class Actions(Enum): + INTERPRET = 'interpretter' + REPLACE = 'replacer' + + def __init__(self, downloader, actions): + PostProcessor.__init__(self, downloader) + self._actions = [] + for f in actions: + action = f[0] + assert isinstance(action, self.Actions) + self._actions.append(getattr(self, action._value_)(*f[1:])) + + @classmethod + def validate_action(cls, action, *data): + ''' Each action can be: + (Actions.INTERPRET, from, to) OR + (Actions.REPLACE, field, search, replace) + ''' + if not isinstance(action, cls.Actions): + raise ValueError(f'{action!r} is not a valid action') + getattr(cls, action._value_)(cls, *data) + + @staticmethod + def field_to_template(tmpl): + if re.match(r'[a-zA-Z_]+$', tmpl): + return f'%({tmpl})s' + return tmpl + + @staticmethod + def format_to_regex(fmt): + r""" + Converts a string like + '%(title)s - %(artist)s' + to a regex like + '(?P<title>.+)\ \-\ (?P<artist>.+)' + """ + if not re.search(r'%\(\w+\)s', fmt): + return fmt + lastpos = 0 + regex = '' + # replace %(..)s with regex group and escape other string parts + for match in re.finditer(r'%\((\w+)\)s', fmt): + regex += re.escape(fmt[lastpos:match.start()]) + regex += rf'(?P<{match.group(1)}>.+)' + lastpos = match.end() + if lastpos < len(fmt): + regex += re.escape(fmt[lastpos:]) + return regex + + def run(self, info): + for f in self._actions: + f(info) + return [], info + + def interpretter(self, inp, out): + def f(info): + outtmpl, tmpl_dict = self._downloader.prepare_outtmpl(template, info) + data_to_parse = self._downloader.escape_outtmpl(outtmpl) % tmpl_dict + self.write_debug(f'Searching for r{out_re.pattern!r} in {template!r}') + match = out_re.search(data_to_parse) + if match is
None: + self.report_warning(f'Could not interpret {inp!r} as {out!r}') + return + for attribute, value in match.groupdict().items(): + info[attribute] = value + self.to_screen('Parsed %s from %r: %r' % (attribute, template, value if value is not None else 'NA')) + + template = self.field_to_template(inp) + out_re = re.compile(self.format_to_regex(out)) + return f + + def replacer(self, field, search, replace): + def f(info): + val = info.get(field) + if val is None: + self.report_warning(f'Video does not have a {field}') + return + elif not isinstance(val, str): + self.report_warning(f'Cannot replace in field {field} since it is a {type(val).__name__}') + return + self.write_debug(f'Replacing all r{search!r} in {field} with {replace!r}') + info[field], n = search_re.subn(replace, val) + if n: + self.to_screen(f'Changed {field} to: {info[field]}') + else: + self.to_screen(f'Did not find r{search!r} in {field}') + + search_re = re.compile(search) + return f + + +class MetadataFromFieldPP(MetadataParserPP): + @classmethod + def to_action(cls, f): + match = re.match(r'(?P<in>.*?)(?<!\\):(?P<out>.+)$', f) + if match is None: + raise ValueError(f'it should be FROM:TO, not {f!r}') + return ( + cls.Actions.INTERPRET, + match.group('in').replace('\\:', ':'), + match.group('out')) + + def __init__(self, downloader, formats): + MetadataParserPP.__init__(self, downloader, [self.to_action(f) for f in formats]) + + +class MetadataFromTitlePP(MetadataParserPP): # for backward compatibility + def __init__(self, downloader, titleformat): + MetadataParserPP.__init__(self, downloader, [(self.Actions.INTERPRET, 'title', titleformat)])