-
Notifications
You must be signed in to change notification settings - Fork 30
/
hparams.py
119 lines (101 loc) · 3.82 KB
/
hparams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import tensorflow as tf
from text.symbols import kor_symbols as symbols
def create_hparams(hparams_string=None, verbose=False):
    """Build the default hyperparameters and apply optional overrides.

    Args:
        hparams_string: Optional override string handed to ``HParams.parse``.
            Nothing is parsed when it is ``None`` or empty.
        verbose: When true, the final hyperparameter values are logged.

    Returns:
        The ``tf.contrib.training.HParams`` object holding every setting.
    """
    # Resolve the container class first, before any default is evaluated.
    make_hparams = tf.contrib.training.HParams

    # ---- Experiment ----
    experiment = dict(
        epochs=30,
        iters_per_checkpoint=500,
        seed=1234,
        dynamic_loss_scaling=True,
        fp16_run=False,
        distributed_run=False,
        dist_backend="nccl",
        dist_url="tcp://localhost:54321",
        cudnn_enabled=True,
        cudnn_benchmark=False,
        ignore_layers=['embedding.weight'],
    )

    # ---- Data ----
    data = dict(
        load_mel_from_disk=False,
        training_files='filelists/nam-h_train_filelist.txt',
        validation_files='filelists/nam-h_val_filelist.txt',
        # Known choices: english_cleaners, korean_cleaners.
        text_cleaners=['korean_cleaners'],
        sort_by_length=False,
    )

    # ---- Audio ----
    audio = dict(
        max_wav_value=32768.0,
        sampling_rate=16000,
        filter_length=1024,
        # Step between adjacent STFT columns; equals win_length / 4 here.
        hop_length=256,
        # FFT window size; kept equal to filter_length.
        # NOTE(review): presumably filter_length plays the role of n_fft and
        # win_length must not exceed it - confirm against the STFT code.
        win_length=1024,
        n_mel_channels=80,
        mel_fmin=0.0,
        mel_fmax=8000.0,
    )

    # ---- Model ----
    model = dict(
        # Taken from the imported symbol table instead of a hard-coded count
        # (80 was noted for korean_cleaners, 65 for english_cleaners).
        n_symbols=len(symbols),
        symbols_embedding_dim=512,

        # Transcript encoder
        encoder_kernel_size=5,
        encoder_n_convolutions=3,
        encoder_embedding_dim=512,

        # Prosody encoder
        prosody_n_convolutions=6,
        prosody_conv_dim_in=[1, 32, 32, 64, 64, 128],
        prosody_conv_dim_out=[32, 32, 64, 64, 128, 128],
        prosody_conv_kernel=3,
        prosody_conv_stride=2,
        prosody_embedding_dim=128,

        # Decoder
        n_frames_per_step=1,  # currently only 1 is supported
        decoder_rnn_dim=1024,
        prenet_dim=256,
        max_decoder_steps=1000,
        gate_threshold=0.5,
        p_attention_dropout=0.1,
        p_decoder_dropout=0.1,

        # Attention
        attention_rnn_dim=1024,
        attention_dim=128,

        # Location layer
        attention_location_n_filters=32,
        attention_location_kernel_size=31,

        # Mel post-processing network
        postnet_embedding_dim=512,
        postnet_kernel_size=5,
        postnet_n_convolutions=5,
    )

    # ---- Optimization ----
    optimization = dict(
        use_saved_learning_rate=False,
        learning_rate=1e-3,
        weight_decay=1e-6,
        grad_clip_thresh=1.0,
        batch_size=64,
        mask_padding=True,  # set model's padded outputs to padded values
    )

    # ---- Reference encoder ----
    reference_encoder = dict(
        E=512,
        ref_enc_filters=[32, 32, 64, 64, 128, 128],
        ref_enc_size=[3, 3],
        ref_enc_strides=[2, 2],
        ref_enc_pad=[1, 1],
        # Half of 512, i.e. the same number as E above divided by two.
        ref_enc_gru_size=512 // 2,
    )

    # ---- Style token layer ----
    style_tokens = dict(
        token_num=10,
        num_heads=8,
        n_mels=80,
    )

    # Merge the groups in declaration order so the resulting keyword
    # arguments appear in the same sequence as a single flat call would.
    defaults = {}
    for section in (
        experiment,
        data,
        audio,
        model,
        optimization,
        reference_encoder,
        style_tokens,
    ):
        defaults.update(section)

    hparams = make_hparams(**defaults)

    # Apply caller-supplied overrides, if any were given.
    if hparams_string:
        tf.logging.info('Parsing command line hparams: %s', hparams_string)
        hparams.parse(hparams_string)

    if verbose:
        tf.logging.info('Final parsed hparams: %s', hparams.values())

    return hparams