mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-01-21 20:41:30 +01:00
[webvtt] Merge daisy-chained duplicate cues (#638)
Fixes: https://github.com/yt-dlp/yt-dlp/issues/631#issuecomment-893338552

The previous deduplication algorithm only removed duplicate cues with identical text, styles and timestamps. This change also merges cues that come in ‘daisy chains’: sequences of cues with identical text and styles in which the ending timestamp of one cue equals the starting timestamp of the next.

This deduplication algorithm has the somewhat unfortunate side effect that NOTE blocks between cues, if found, will be emitted in a different order relative to their original cues. This may be unwanted if perfect fidelity is desired, but then so is daisy-chain deduplication itself. NOTE blocks ought to be ignored by WebVTT players in any case.

Authored by: fstirlitz
This commit is contained in:
parent
ad3dc496bb
commit
25a3f4f5d6
@ -329,7 +329,7 @@ def _prepare_external_frag_download(self, ctx):
|
|||||||
'fragment_index': 0,
|
'fragment_index': 0,
|
||||||
})
|
})
|
||||||
|
|
||||||
def download_and_append_fragments(self, ctx, fragments, info_dict, pack_func=None):
|
def download_and_append_fragments(self, ctx, fragments, info_dict, *, pack_func=None, finish_func=None):
|
||||||
fragment_retries = self.params.get('fragment_retries', 0)
|
fragment_retries = self.params.get('fragment_retries', 0)
|
||||||
is_fatal = (lambda idx: idx == 0) if self.params.get('skip_unavailable_fragments', True) else (lambda _: True)
|
is_fatal = (lambda idx: idx == 0) if self.params.get('skip_unavailable_fragments', True) else (lambda _: True)
|
||||||
if not pack_func:
|
if not pack_func:
|
||||||
@ -424,5 +424,8 @@ def _download_fragment(fragment):
|
|||||||
if not result:
|
if not result:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
if finish_func is not None:
|
||||||
|
ctx['dest_stream'].write(finish_func())
|
||||||
|
ctx['dest_stream'].flush()
|
||||||
self._finish_frag_download(ctx, info_dict)
|
self._finish_frag_download(ctx, info_dict)
|
||||||
return True
|
return True
|
||||||
|
@ -260,29 +260,35 @@ def pack_fragment(frag_content, frag_index):
|
|||||||
block.end += adjust
|
block.end += adjust
|
||||||
|
|
||||||
dedup_window = extra_state.setdefault('webvtt_dedup_window', [])
|
dedup_window = extra_state.setdefault('webvtt_dedup_window', [])
|
||||||
cue = block.as_json
|
|
||||||
|
|
||||||
# skip the cue if an identical one appears
|
ready = []
|
||||||
# in the window of potential duplicates
|
|
||||||
# and prune the window of unviable candidates
|
|
||||||
i = 0
|
i = 0
|
||||||
skip = True
|
is_new = True
|
||||||
while i < len(dedup_window):
|
while i < len(dedup_window):
|
||||||
window_cue = dedup_window[i]
|
wcue = dedup_window[i]
|
||||||
if window_cue == cue:
|
wblock = webvtt.CueBlock.from_json(wcue)
|
||||||
break
|
|
||||||
if window_cue['end'] >= cue['start']:
|
|
||||||
i += 1
|
i += 1
|
||||||
|
if wblock.hinges(block):
|
||||||
|
wcue['end'] = block.end
|
||||||
|
is_new = False
|
||||||
continue
|
continue
|
||||||
|
if wblock == block:
|
||||||
|
is_new = False
|
||||||
|
continue
|
||||||
|
if wblock.end > block.start:
|
||||||
|
continue
|
||||||
|
ready.append(wblock)
|
||||||
|
i -= 1
|
||||||
del dedup_window[i]
|
del dedup_window[i]
|
||||||
else:
|
|
||||||
skip = False
|
|
||||||
|
|
||||||
if skip:
|
if is_new:
|
||||||
|
dedup_window.append(block.as_json)
|
||||||
|
for block in ready:
|
||||||
|
block.write_into(output)
|
||||||
|
|
||||||
|
# we only emit cues once they fall out of the duplicate window
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# add the cue to the window
|
|
||||||
dedup_window.append(cue)
|
|
||||||
elif isinstance(block, webvtt.Magic):
|
elif isinstance(block, webvtt.Magic):
|
||||||
# take care of MPEG PES timestamp overflow
|
# take care of MPEG PES timestamp overflow
|
||||||
if block.mpegts is None:
|
if block.mpegts is None:
|
||||||
@ -317,6 +323,19 @@ def pack_fragment(frag_content, frag_index):
|
|||||||
block.write_into(output)
|
block.write_into(output)
|
||||||
|
|
||||||
return output.getvalue().encode('utf-8')
|
return output.getvalue().encode('utf-8')
|
||||||
|
|
||||||
|
def fin_fragments():
|
||||||
|
dedup_window = extra_state.get('webvtt_dedup_window')
|
||||||
|
if not dedup_window:
|
||||||
|
return b''
|
||||||
|
|
||||||
|
output = io.StringIO()
|
||||||
|
for cue in dedup_window:
|
||||||
|
webvtt.CueBlock.from_json(cue).write_into(output)
|
||||||
|
|
||||||
|
return output.getvalue().encode('utf-8')
|
||||||
|
|
||||||
|
self.download_and_append_fragments(
|
||||||
|
ctx, fragments, info_dict, pack_func=pack_fragment, finish_func=fin_fragments)
|
||||||
else:
|
else:
|
||||||
pack_fragment = None
|
return self.download_and_append_fragments(ctx, fragments, info_dict)
|
||||||
return self.download_and_append_fragments(ctx, fragments, info_dict, pack_fragment)
|
|
||||||
|
@ -331,6 +331,26 @@ def as_json(self):
|
|||||||
'settings': self.settings,
|
'settings': self.settings,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def __eq__(self, other):
|
||||||
|
return self.as_json == other.as_json
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_json(cls, json):
|
||||||
|
return cls(
|
||||||
|
id=json['id'],
|
||||||
|
start=json['start'],
|
||||||
|
end=json['end'],
|
||||||
|
text=json['text'],
|
||||||
|
settings=json['settings']
|
||||||
|
)
|
||||||
|
|
||||||
|
def hinges(self, other):
|
||||||
|
if self.text != other.text:
|
||||||
|
return False
|
||||||
|
if self.settings != other.settings:
|
||||||
|
return False
|
||||||
|
return self.start <= self.end == other.start <= other.end
|
||||||
|
|
||||||
|
|
||||||
def parse_fragment(frag_content):
|
def parse_fragment(frag_content):
|
||||||
"""
|
"""
|
||||||
|
Loading…
Reference in New Issue
Block a user