import tensorflow as tf

from tensorflow.contrib.seq2seq import BahdanauAttention, LuongAttention

from tacotron.attention import AttentionMode, AttentionScore, LocalLuongAttention

# Default hyper-parameters:
model_params = tf.contrib.training.HParams(
    # Number of unique characters in the vocabulary.
    vocabulary_size=39,
    # Target sampling rate.
    sampling_rate=22050,
    # FFT window size.
    n_fft=2048,
    # Window length in ms.
    # win_len=25.0,
    win_len=50.0,
    # Window stride in ms.
    # win_hop=8.0,
    win_hop=12.5,
    # Number of Mel bands to generate.
    n_mels=80,
    # Mel spectrum lower cutoff frequency.
    mel_fmin=0,
    # Mel spectrum upper cutoff frequency.
    mel_fmax=8000,
    # Number of Mel-frequency cepstral coefficients to generate.
    n_mfcc=13,
    # Tacotron reduction factor r.
    reduction=5,
    # Flag that controls application of the post-processing network.
    apply_post_processing=True,
    # Linear scale magnitudes are raised to the power of `magnitude_power` before reconstruction.
    magnitude_power=1.3,
    # The number of Griffin-Lim reconstruction iterations.
    reconstruction_iterations=50,
    # Flag forcing the use of the accelerated RNN implementation from cuDNN.
    force_cudnn=True,
    # Encoder network parameters.
    encoder=tf.contrib.training.HParams(
        # Embedding size for each sentence character.
        embedding_size=256,
        pre_net_layers=(
            # (units, dropout, activation).
            (256, 0.5, tf.nn.relu),
            (128, 0.5, tf.nn.relu)
        ),
        # Number of filter banks.
        n_banks=16,
        # Number of filters in each bank.
        n_filters=128,
        projections=(
            # (filters, kernel_size, activation).
            (128, 3, tf.nn.relu),
            (128, 3, None)
        ),
        # Number of highway network layers.
        n_highway_layers=4,
        # Number of units in each highway layer.
        n_highway_units=128,
        # Number of units in the encoder RNN.
        n_gru_units=128
    ),
    # Decoder network parameters.
    decoder=tf.contrib.training.HParams(
        pre_net_layers=(
            # (units, dropout, activation).
            (256, 0.5, tf.nn.relu),
            (128, 0.5, tf.nn.relu)
        ),
        # Number of decoder RNN layers.
        n_gru_layers=2,
        # Number of units in the decoder RNN.
        n_decoder_gru_units=256,
        # Number of units in the attention RNN.
        n_attention_units=256,
        # Dimensionality of a single RNN target frame.
        target_size=80,
        # Maximum number of decoder iterations after which to stop for evaluation and inference.
        # This is equal to the number of mel-scale spectrogram frames generated.
        maximum_iterations=1000,
    ),
    # Attention parameters.
    attention=tf.contrib.training.HParams(
        # mechanism=BahdanauAttention,
        mechanism=LuongAttention,
        # mechanism=LocalLuongAttention,
        # Luong local style content based scoring function.
        luong_local_score=AttentionScore.DOT,
        # Luong local style attention mode.
        luong_local_mode=AttentionMode.MONOTONIC,
        # Luong local: Force a gaussian distribution onto the scores in the attention window.
        luong_force_gaussian=True,
        # Luong local style window D parameter. (Window size will be `2D+1`).
        luong_local_window_D=10
    ),
    # Post-processing network parameters.
    post=tf.contrib.training.HParams(
        # Number of filter banks.
        n_banks=8,
        # Number of filters in each bank.
        n_filters=128,
        projections=(
            # (filters, kernel_size, activation).
            (256, 3, tf.nn.relu),
            (80, 3, None)
        ),
        # Number of highway network layers.
        n_highway_layers=4,
        # Number of units in each highway layer.
        n_highway_units=128,
        # Number of units in the post-processing RNN.
        n_gru_units=128
    )
)
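

# --- Usage sketch (illustrative addition, not part of the original module) ---
# A minimal example of how these hyper-parameters might be consumed. The
# window-size derivation follows directly from `sampling_rate`, `win_len` and
# `win_hop` above; the override relies on the standard
# `tf.contrib.training.HParams.parse` API for scalar values.
if __name__ == '__main__':
    # Convert the millisecond-based window parameters into sample counts
    # for the STFT: samples = sampling_rate * milliseconds / 1000.
    win_len_samples = int(model_params.sampling_rate * model_params.win_len / 1000.0)
    win_hop_samples = int(model_params.sampling_rate * model_params.win_hop / 1000.0)

    # 22050 Hz * 50.0 ms -> 1102 samples, 22050 Hz * 12.5 ms -> 275 samples;
    # both fit within the n_fft=2048 FFT window.
    print('win_len={} samples, win_hop={} samples'.format(win_len_samples,
                                                          win_hop_samples))

    # Scalar hyper-parameters can be overridden from a comma-separated
    # string, e.g. when experimenting from the command line.
    model_params.parse('reduction=2,reconstruction_iterations=100')
    print('reduction={}, reconstruction_iterations={}'.format(
        model_params.reduction, model_params.reconstruction_iterations))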