# preprocess.py (forked from bshall/UniversalVocoding)
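# Preprocesses a corpus of .wav files into mu-law-quantized audio and
# mel-spectrogram .npy pairs, then writes pipe-delimited train/test metadata.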
import argparse
import glob
import json
import os
import random
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from itertools import chain
from multiprocessing import cpu_count

import numpy as np
from tqdm import tqdm

from utils import load_wav, mulaw_encode, melspectrogram
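
# For reference, a minimal sketch of the "config.json" this script expects,
# inferred from the keys accessed below. The values shown are illustrative
# assumptions, not the repository's actual settings:
#
# {
#     "preprocessing": {
#         "sample_rate": 16000,
#         "preemph": 0.97,
#         "num_mels": 80,
#         "num_fft": 2048,
#         "min_level_db": -100,
#         "hop_length": 200,
#         "win_length": 800,
#         "fmin": 50,
#         "bits": 9,
#         "num_evaluation_utterances": 10
#     },
#     "vocoder": {
#         "sample_frames": 40,
#         "audio_slice_frames": 8
#     }
# }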

def process_wav(wav_path, audio_path, mel_path, params):
    wav = load_wav(wav_path, sample_rate=params["preprocessing"]["sample_rate"])
    # Peak-normalize to 0.999 of full scale. The explicit form avoids the
    # operator-precedence pitfall of `wav /= np.abs(wav).max() * 0.999`,
    # which divides by the scaled maximum and pushes the peak above 1.0.
    wav = wav / np.abs(wav).max() * 0.999

    mel = melspectrogram(wav, sample_rate=params["preprocessing"]["sample_rate"],
                         preemph=params["preprocessing"]["preemph"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         win_length=params["preprocessing"]["win_length"],
                         fmin=params["preprocessing"]["fmin"])

    # Pad the waveform so its length is an exact multiple of the hop length.
    length_diff = len(mel) * params["preprocessing"]["hop_length"] - len(wav)
    wav = np.pad(wav, (0, length_diff), "constant")

    # Pad both ends so training windows of sample_frames mel frames can be
    # sampled anywhere, with the audio_slice_frames target centred inside.
    pad = (params["vocoder"]["sample_frames"] - params["vocoder"]["audio_slice_frames"]) // 2
    mel = np.pad(mel, ((pad,), (0,)), "constant")
    wav = np.pad(wav, (pad * params["preprocessing"]["hop_length"],), "constant")

    # Quantize the waveform with mu-law companding (2**bits levels).
    wav = mulaw_encode(wav, mu=2 ** params["preprocessing"]["bits"])

    # Speaker id is the file-name prefix, e.g. "p225_001.wav" -> "p225".
    speaker = os.path.splitext(os.path.split(wav_path)[-1])[0].split("_")[0]

    np.save(audio_path, wav)
    np.save(mel_path, mel)
    return speaker, audio_path, mel_path, len(mel)
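
# Worked example of the padding above, using the illustrative values
# hop_length=200, sample_frames=40, audio_slice_frames=8:
#   pad = (40 - 8) // 2 = 16 mel frames on each side,
#   so the waveform gains 16 * 200 = 3200 samples of silence per side,
#   keeping mel frame i aligned with samples [i*200, i*200 + 200).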

def preprocess(wav_dirs, out_dir, num_workers, params):
    audio_out_dir = os.path.join(out_dir, "audio")
    mel_out_dir = os.path.join(out_dir, "mels")
    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(audio_out_dir, exist_ok=True)
    os.makedirs(mel_out_dir, exist_ok=True)

    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    # Use a "**" pattern so recursive=True actually descends into
    # subdirectories; a plain "*.wav" pattern would ignore the flag.
    wav_paths = chain.from_iterable(
        glob.iglob("{}/**/*.wav".format(wav_dir), recursive=True) for wav_dir in wav_dirs)
    for wav_path in wav_paths:
        fid = os.path.basename(wav_path).replace(".wav", ".npy")
        audio_path = os.path.join(audio_out_dir, fid)
        mel_path = os.path.join(mel_out_dir, fid)
        futures.append(executor.submit(partial(process_wav, wav_path, audio_path, mel_path, params)))
    metadata = [future.result() for future in tqdm(futures)]
    write_metadata(metadata, out_dir, params)

def write_metadata(metadata, out_dir, params):
    # Shuffle, then hold out the last num_evaluation_utterances for testing.
    random.shuffle(metadata)
    test = metadata[-params["preprocessing"]["num_evaluation_utterances"]:]
    train = metadata[:-params["preprocessing"]["num_evaluation_utterances"]]
    with open(os.path.join(out_dir, "test.txt"), "w", encoding="utf-8") as f:
        for m in test:
            f.write("|".join([str(x) for x in m]) + "\n")
    with open(os.path.join(out_dir, "train.txt"), "w", encoding="utf-8") as f:
        for m in train:
            f.write("|".join([str(x) for x in m]) + "\n")
    frames = sum(m[3] for m in metadata)
    # hop_length / sample_rate is the frame shift in seconds (not ms).
    frame_shift = params["preprocessing"]["hop_length"] / params["preprocessing"]["sample_rate"]
    hours = frames * frame_shift / 3600
    print("Wrote %d utterances, %d frames (%.2f hours)" % (len(metadata), frames, hours))
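
# Each metadata line is pipe-delimited as speaker|audio_path|mel_path|frames,
# e.g. (hypothetical values): p225|data/audio/p225_001.npy|data/mels/p225_001.npy|412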

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", default="data")
    parser.add_argument("--num-workers", type=int, default=cpu_count())
    parser.add_argument("--language", type=str, default="./english")
    args = parser.parse_args()

    with open("config.json") as f:
        params = json.load(f)

    wav_dirs = [os.path.join(args.language, "train", "unit"),
                os.path.join(args.language, "train", "voice")]
    preprocess(wav_dirs, args.output, args.num_workers, params)


if __name__ == "__main__":
    main()
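
# Example invocation, assuming config.json sits in the working directory and
# the corpus is laid out under ./english/train/{unit,voice} as above:
#
#   python preprocess.py --language ./english --output data --num-workers 4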