# aishell_asr_example_lstm4atthead1.yaml
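# Example recipe for attention-based end-to-end ASR on AISHELL-1 (OpenSLR SLR33):
# 40-dim fbank features (+delta/delta-delta, CMVN), character-level targets,
# a VGG front-end with a 4-layer bidirectional LSTM encoder, single-head
# location-aware attention, and a 2-layer LSTM decoder trained with Adadelta.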
data:
  corpus:
    name: 'aishell'                               # Specify dataset
    path: '/data/Speech/SLR33/data_aishell/wav/'  # Path to raw AISHELL-1 dataset
    train_split: ['train']                        # Name of data splits to be used as training set
    dev_split: ['dev']                            # Name of data splits to be used as validation set
    bucketing: True                               # Enable/Disable bucketing
    batch_size: 16
  audio:                                          # Attributes of audio feature
    feat_type: 'fbank'
    feat_dim: 40
    frame_length: 25                              # ms
    frame_shift: 10                               # ms
    dither: 0                                     # Random dither on audio, 0: no dither
    apply_cmvn: True                              # Apply cepstral mean and variance normalization
    delta_order: 2                                # 0: do nothing, 1: add delta, 2: add delta and acceleration
    delta_window_size: 2
  text:
    mode: 'character'                             # 'character'/'word'/'subword'
    vocab_file: 'aishell_vocab.txt'               # Vocabulary file for character-level targets

hparas:                                           # Experiment hyper-parameters
  valid_step: 5000                                # Run validation every 5000 training steps
  max_step: 1000001                               # Maximum number of training steps
  tf_start: 1.0                                   # Teacher forcing rate schedule: start value
  tf_end: 1.0                                     #   end value (kept constant at 1.0 here)
  tf_step: 500000                                 #   number of steps over which the rate is annealed
  optimizer: 'Adadelta'
  lr: 1.0
  eps: 0.00000001                                 # 1e-8
  lr_scheduler: 'fixed'                           # 'fixed'/'warmup'
  curriculum: 0

model:                                            # Model architecture
  ctc_weight: 0.0                                 # Weight for CTC loss
  encoder:
    vgg: True                                     # 4x reduction on time feature extraction
    module: 'LSTM'                                # 'LSTM'/'GRU'/'Transformer'
    bidirection: True
    dim: [1024,1024,1024,1024]
    dropout: [0,0,0,0]
    layer_norm: [False,False,False,False]
    proj: [True,True,True,True]                   # Linear projection + Tanh after each rnn layer
    sample_rate: [1,1,1,1]
    sample_style: 'drop'                          # 'drop'/'concat'
  attention:
    mode: 'loc'                                   # 'dot'/'loc'
    dim: 512
    num_head: 1
    v_proj: False                                 # if False and num_head>1, encoder state will be duplicated for each head
    temperature: 1                                # scaling factor for attention
    loc_kernel_size: 100                          # only used when mode=='loc'
    loc_kernel_num: 10                            # only used when mode=='loc'
  decoder:
    module: 'LSTM'                                # 'LSTM'/'GRU'/'Transformer'
    dim: 1024
    layer: 2
    dropout: 0