-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1506 from yt605155624/fix_frontend
[TTS]update text frontend, test=tts
- Loading branch information
Showing
11 changed files
with
165 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
########################################################### | ||
# FEATURE EXTRACTION SETTING # | ||
########################################################### | ||
|
||
fs: 24000 # sr | ||
n_fft: 2048 # FFT size (samples). | ||
n_shift: 300 # Hop size (samples). 12.5ms | ||
win_length: 1200 # Window length (samples). 50ms | ||
# If set to null, it will be the same as n_fft. | ||
window: "hann" # Window function. | ||
|
||
# Only used for feats_type != raw | ||
|
||
fmin: 80 # Minimum frequency of Mel basis. | ||
fmax: 7600 # Maximum frequency of Mel basis. | ||
n_mels: 80 # The number of mel basis. | ||
|
||
# Only used for the model using pitch features (e.g. FastSpeech2) | ||
f0min: 80 # Minimum f0 for pitch extraction. | ||
f0max: 400 # Maximum f0 for pitch extraction. | ||
|
||
|
||
########################################################### | ||
# DATA SETTING # | ||
########################################################### | ||
batch_size: 32 | ||
num_workers: 4 | ||
|
||
|
||
########################################################### | ||
# MODEL SETTING # | ||
########################################################### | ||
model: | ||
adim: 384 # attention dimension | ||
aheads: 2 # number of attention heads | ||
elayers: 4 # number of encoder layers | ||
eunits: 1536 # number of encoder ff units | ||
dlayers: 4 # number of decoder layers | ||
dunits: 1536 # number of decoder ff units | ||
positionwise_layer_type: conv1d # type of position-wise layer | ||
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer | ||
duration_predictor_layers: 2 # number of layers of duration predictor | ||
duration_predictor_chans: 256 # number of channels of duration predictor | ||
duration_predictor_kernel_size: 3 # filter size of duration predictor | ||
postnet_layers: 5 # number of layers of postnet | ||
postnet_filts: 5 # filter size of conv layers in postnet | ||
postnet_chans: 256 # number of channels of conv layers in postnet | ||
encoder_normalize_before: True # whether to perform layer normalization before the input | ||
decoder_normalize_before: True # whether to perform layer normalization before the input | ||
reduction_factor: 1 # reduction factor | ||
encoder_type: conformer # encoder type | ||
decoder_type: conformer # decoder type | ||
conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type | ||
conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type | ||
conformer_activation_type: swish # conformer activation type | ||
use_macaron_style_in_conformer: true # whether to use macaron style in conformer | ||
use_cnn_in_conformer: true # whether to use CNN in conformer | ||
conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder | ||
conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder | ||
init_type: xavier_uniform # initialization type | ||
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer | ||
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding | ||
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer | ||
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer | ||
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding | ||
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer | ||
pitch_predictor_layers: 5 # number of conv layers in pitch predictor | ||
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor | ||
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor | ||
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor | ||
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch | ||
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch | ||
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder | ||
energy_predictor_layers: 2 # number of conv layers in energy predictor | ||
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor | ||
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor | ||
energy_predictor_dropout: 0.5 # dropout rate in energy predictor | ||
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy | ||
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy | ||
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder | ||
spk_embed_dim: 256 # speaker embedding dimension | ||
spk_embed_integration_type: concat # speaker embedding integration type | ||
|
||
|
||
########################################################### | ||
# UPDATER SETTING # | ||
########################################################### | ||
updater: | ||
use_masking: True # whether to apply masking for padded part in loss calculation | ||
|
||
|
||
|
||
########################################################### | ||
# OPTIMIZER SETTING # | ||
########################################################### | ||
optimizer: | ||
optim: adam # optimizer type | ||
learning_rate: 0.001 # learning rate | ||
|
||
########################################################### | ||
# TRAINING SETTING # | ||
########################################################### | ||
max_epoch: 1000 | ||
num_snapshots: 5 | ||
|
||
|
||
########################################################### | ||
# OTHER SETTING # | ||
########################################################### | ||
seed: 10086 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters