-
Notifications
You must be signed in to change notification settings - Fork 2
/
trim_caption_ends.py
executable file
·58 lines (46 loc) · 1.62 KB
/
trim_caption_ends.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import os
import sys
import yaml
from process_show import extract_video_captions
from levenshtein import weighted_levenshtein
from process_show import get_video_length_size
def trim_ends(
list_path: str,
videos_path: str,
offset_start: float,
offset_end: float,
):
with open(list_path, 'r') as f:
video_ids = [vid for vid in f.read().split('\n') if vid.strip() != '']
all_captions = []
for video_id in video_ids:
if video_id.startswith('#'):
continue
for fmt in ['mkv', 'webm', 'mp4']:
video_path = os.path.join(videos_path, video_id + '.' + fmt)
if os.path.exists(video_path):
break
else:
video_path = None
if video_path is None:
print(f'Found no video for id {video_id}')
continue
video_length, _ = get_video_length_size(video_path)
caption_id = f'youtube-{video_id}'
raw_captions_file = f'data/remote/private/caption_data/raw_captions/{caption_id}.yaml'
redo_sections = []
data = None
with open(raw_captions_file, 'r') as f:
data = yaml.loads(f.read())
lines = data['lines']
new_lines = []
for line in lines:
if line[1] < offset_start or line[2] > video_length - offset_end:
print(f'Trimming {line}')
continue
print(f'Keeping {line}')
new_lines.append(line)
data['lines'] = new_lines
with open(raw_captions_file, 'w') as f:
yaml.dump(data, f)
return all_captions