convert.py (forked from onejiin/CycleGAN-VC2)
import argparse
import os

import numpy as np
# librosa is used directly below for loading and writing audio; the wildcard import
# from preprocess may already expose it, but the explicit import keeps this file self-contained.
import librosa

from model import CycleGAN
from preprocess import *
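
# Overall flow of conversion(): WORLD analysis of each input wav (F0, spectral
# envelope, aperiodicity) -> 32-dim MCEP coding and z-normalization with the
# source-domain statistics -> CycleGAN generator applied to fixed 128-frame
# chunks -> de-normalization with the target-domain statistics -> WORLD synthesis
# with the (optionally converted) F0 and the original aperiodicity.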
def conversion(model_dir, model_name, data_dir, conversion_direction, output_dir, pc, generation_model):
    num_features = 32
    sampling_rate = 44000
    frame_period = 5.0

    model = CycleGAN(num_features=num_features, mode='test', gen_model=generation_model)
    model.load(filepath=os.path.join(model_dir, model_name))

    mcep_normalization_params = np.load(os.path.join(model_dir, 'mcep_normalization.npz'))
    mcep_mean_A = mcep_normalization_params['mean_A']
    mcep_std_A = mcep_normalization_params['std_A']
    mcep_mean_B = mcep_normalization_params['mean_B']
    mcep_std_B = mcep_normalization_params['std_B']

    logf0s_normalization_params = np.load(os.path.join(model_dir, 'logf0s_normalization.npz'))
    logf0s_mean_A = logf0s_normalization_params['mean_A']
    logf0s_std_A = logf0s_normalization_params['std_A']
    logf0s_mean_B = logf0s_normalization_params['mean_B']
    logf0s_std_B = logf0s_normalization_params['std_B']

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for file in os.listdir(data_dir):
        filepath = os.path.join(data_dir, file)
        wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
        # wav = wav_padding(wav = wav, sr = sampling_rate, frame_period = frame_period, multiple = 4)
        f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sampling_rate, frame_period=frame_period)
        coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate, dim=num_features)
        coded_sp_transposed = coded_sp.T
        frame_size = 128
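        # The generator only sees fixed 128-frame chunks, so each utterance is
        # zero-padded up to the next multiple of frame_size, converted chunk by
        # chunk, and the padding is trimmed again afterwards. For example, a
        # 300-frame utterance gets remain = 128 - 300 % 128 = 84 padding frames
        # (-> 384 = 3 * 128 frames, i.e. three generator passes).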
        if conversion_direction == 'A2B':
            print("AtoB")
            if pc:
                # Map the log-F0 statistics from domain A to domain B.
                print("pitch convert")
                f0_converted = pitch_conversion(f0=f0, mean_log_src=logf0s_mean_A, std_log_src=logf0s_std_A,
                                                mean_log_target=logf0s_mean_B, std_log_target=logf0s_std_B)
            else:
                print("pitch same")
                f0_converted = f0
            # Normalize with the domain A statistics.
            coded_sp_norm = (coded_sp_transposed - mcep_mean_A) / mcep_std_A
            # Zero-pad to a multiple of frame_size.
            remain, padd = frame_size - coded_sp_norm.shape[1] % frame_size, False
            if coded_sp_norm.shape[1] % frame_size != 0:
                coded_sp_norm = np.concatenate((coded_sp_norm, np.zeros((num_features, remain))), axis=1)
                padd = True
            # Run the generator on each 128-frame segment and stitch the outputs back together.
            coded_sp_converted_norm = model.test(inputs=np.array([coded_sp_norm[:, 0:frame_size]]), direction=conversion_direction)[0]
            for i in range(1, coded_sp_norm.shape[1] // frame_size):
                ccat = model.test(inputs=np.array([coded_sp_norm[:, i * frame_size:(i + 1) * frame_size]]),
                                  direction=conversion_direction)[0]
                coded_sp_converted_norm = np.concatenate((coded_sp_converted_norm, ccat), axis=1)
            if padd:
                coded_sp_converted_norm = coded_sp_converted_norm[:, :-remain]
            # De-normalize with the domain B statistics.
            coded_sp_converted = coded_sp_converted_norm * mcep_std_B + mcep_mean_B
        else:
            print("BtoA")
            if pc:
                # Map the log-F0 statistics from domain B to domain A (source/target
                # statistics are swapped relative to the A2B branch).
                print("pitch convert")
                f0_converted = pitch_conversion(f0=f0, mean_log_src=logf0s_mean_B, std_log_src=logf0s_std_B,
                                                mean_log_target=logf0s_mean_A, std_log_target=logf0s_std_A)
            else:
                print("pitch same")
                f0_converted = f0
            # Normalize with the domain B statistics.
            coded_sp_norm = (coded_sp_transposed - mcep_mean_B) / mcep_std_B
            # Zero-pad to a multiple of frame_size.
            remain, padd = frame_size - coded_sp_norm.shape[1] % frame_size, False
            if coded_sp_norm.shape[1] % frame_size != 0:
                coded_sp_norm = np.concatenate((coded_sp_norm, np.zeros((num_features, remain))), axis=1)
                padd = True
            # Run the generator on each 128-frame segment and stitch the outputs back together.
            coded_sp_converted_norm = model.test(inputs=np.array([coded_sp_norm[:, 0:frame_size]]), direction=conversion_direction)[0]
            for i in range(1, coded_sp_norm.shape[1] // frame_size):
                ccat = model.test(inputs=np.array([coded_sp_norm[:, i * frame_size:(i + 1) * frame_size]]),
                                  direction=conversion_direction)[0]
                coded_sp_converted_norm = np.concatenate((coded_sp_converted_norm, ccat), axis=1)
            if padd:
                coded_sp_converted_norm = coded_sp_converted_norm[:, :-remain]
            # De-normalize with the domain A statistics.
            coded_sp_converted = coded_sp_converted_norm * mcep_std_A + mcep_mean_A
        # Decode the converted MCEPs back to a spectral envelope and resynthesize with the WORLD vocoder.
        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = world_decode_spectral_envelop(coded_sp=coded_sp_converted, fs=sampling_rate)
        wav_transformed = world_speech_synthesis(f0=f0_converted, decoded_sp=decoded_sp_converted, ap=ap,
                                                 fs=sampling_rate, frame_period=frame_period)
        # Note: librosa.output.write_wav was removed in librosa 0.8; this call assumes an older
        # librosa version (soundfile.write is the usual replacement on newer versions).
        librosa.output.write_wav(os.path.join(output_dir, os.path.basename(file)), wav_transformed, sampling_rate)
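
# Directory layout assumed by conversion(): model_dir holds the checkpoint named by
# model_name plus the mcep_normalization.npz and logf0s_normalization.npz files
# (presumably saved by the training/preprocessing step), data_dir holds the
# source-domain wav files, and each converted file is written to output_dir under
# its original basename.
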
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Convert voices using a pre-trained CycleGAN model.')

    model_dir_default = './model/sf1_tm1'
    model_name_default = 'sf1_tm1.ckpt'
    data_dir_default = './data/evaluation_all/SF1'
    conversion_direction_default = 'A2B'
    output_dir_default = './converted_voices'
    pc_default = True
    generation_model_default = 'CycleGAN-VC2'

    parser.add_argument('--model_dir', type=str, help='Directory for the pre-trained model.', default=model_dir_default)
    parser.add_argument('--model_name', type=str, help='Filename for the pre-trained model.', default=model_name_default)
    parser.add_argument('--data_dir', type=str, help='Directory for the voices to convert.', default=data_dir_default)
    parser.add_argument('--conversion_direction', type=str,
                        help='Conversion direction for CycleGAN: A2B or B2A. The first speaker in the model file name is A, the second is B.',
                        default=conversion_direction_default)
    parser.add_argument('--output_dir', type=str, help='Directory for the converted voices.', default=output_dir_default)
    # argparse's type=bool treats any non-empty string as True, so parse the flag explicitly.
    parser.add_argument('--pc', type=lambda s: str(s).lower() in ('true', '1', 'yes'),
                        help='If True, convert the F0 contour to the target domain; otherwise keep the source F0.',
                        default=pc_default)
    parser.add_argument('--generation_model', type=str, help='generator_gatedcnn / generator_gatedcnn_SAGAN',
                        default=generation_model_default)
    argv = parser.parse_args()

    model_dir = argv.model_dir
    model_name = argv.model_name
    data_dir = argv.data_dir
    conversion_direction = argv.conversion_direction
    output_dir = argv.output_dir
    pc = argv.pc
    generation_model = argv.generation_model

    # Run the conversion over every file in data_dir.
    conversion(model_dir=model_dir, model_name=model_name, data_dir=data_dir,
               conversion_direction=conversion_direction, output_dir=output_dir,
               pc=pc, generation_model=generation_model)
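
# Example invocation (mirrors the defaults above; adjust the paths to your own
# checkpoint and evaluation data):
#
#   python convert.py --model_dir ./model/sf1_tm1 --model_name sf1_tm1.ckpt \
#       --data_dir ./data/evaluation_all/SF1 --conversion_direction A2B \
#       --output_dir ./converted_voices --pc True --generation_model CycleGAN-VC2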