diff --git a/examples/aishell2/s0/README.md b/examples/aishell2/s0/README.md
index fbd9500b4..577133f1d 100644
--- a/examples/aishell2/s0/README.md
+++ b/examples/aishell2/s0/README.md
@@ -15,6 +15,21 @@
 | attention rescoring       | 5.39 | 5.78 |
 | LM + attention rescoring  | 5.35 | 5.73 |
 
+## U2++ Transformer Result
+
+* Feature info: using fbank feature, with cmvn, no speed perturb
+* Training info: lr 0.002, batch size 22, 8 GPUs, acc_grad 1, 240 epochs, dither 0.0
+* Decoding info: ctc_weight 0.1, reverse_weight 0.5, average_num 30
+* Git hash: 5a1342312668e7a5abb83aed1e53256819cebf95
+* Model link: http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/aishell2/20210621_u2pp_transformer_exp.tar.gz
+
+| decoding mode/chunk size  | full | 16   |
+|---------------------------|------|------|
+| ctc greedy search         | 7.35 | 8.23 |
+| ctc prefix beam search    | 7.36 | 8.23 |
+| attention rescoring       | 6.09 | 6.70 |
+| LM + attention rescoring  | 6.07 | 6.55 |
+
 ## Unified Conformer Result
 
 * Feature info: using fbank feature, with cmvn, no speed perturb.
diff --git a/examples/aishell2/s0/conf/train_u2++_transformer.yaml b/examples/aishell2/s0/conf/train_u2++_transformer.yaml
new file mode 100644
index 000000000..6ad02ccd8
--- /dev/null
+++ b/examples/aishell2/s0/conf/train_u2++_transformer.yaml
@@ -0,0 +1,88 @@
+# network architecture
+# encoder related
+encoder: transformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder architecture type
+    normalize_before: true
+    use_dynamic_chunk: true
+    use_dynamic_left_chunk: false
+
+# decoder related
+decoder: bitransformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 3
+    r_num_blocks: 3
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+    reverse_weight: 0.3
+
+# use raw_wav or kaldi feature
+raw_wav: true
+
+# feature extraction
+collate_conf:
+    # waveform level config
+    wav_distortion_conf:
+        wav_dither: 0.0
+        wav_distortion_rate: 0.0
+        distortion_methods: []
+    speed_perturb: false
+    feature_extraction_conf:
+        feature_type: 'fbank'
+        mel_bins: 80
+        frame_shift: 10
+        frame_length: 25
+        using_pitch: false
+    # spec level config
+    # spec_swap: false
+    feature_dither: 0.0 # add dither in [-feature_dither, feature_dither] on fbank feature
+    spec_aug: true
+    spec_aug_conf:
+        warp_for_time: False
+        num_t_mask: 2
+        num_f_mask: 2
+        max_t: 50
+        max_f: 10
+        max_w: 80
+    spec_sub: true
+    spec_sub_conf:
+        num_t_sub: 3
+        max_t: 20
+
+# dataset related
+dataset_conf:
+    max_length: 40960
+    min_length: 0
+    batch_type: 'static' # static or dynamic
+    # set batch_size according to your GPU memory size; we used a Titan Xp GPU with 12GB of memory
+    batch_size: 22
+    sort: true
+
+grad_clip: 5
+accum_grad: 1
+max_epoch: 240
+log_interval: 100
+
+optim: adam
+optim_conf:
+    lr: 0.002
+scheduler: warmuplr     # pytorch v1.1.0+ required
+scheduler_conf:
+    warmup_steps: 25000
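
For context on the `reverse_weight` values above (0.3 in `model_conf` at training time, 0.5 in the decoding info): with the `bitransformer` decoder, attention rescoring scores each CTC n-best candidate with both the left-to-right and the right-to-left decoder halves and interpolates the two scores. A minimal sketch of that score fusion, assuming per-candidate scores have already been computed; the helper below is illustrative, not WeNet's actual API:

```python
def attention_rescore(candidates, ctc_weight=0.1, reverse_weight=0.5):
    """Pick the best hypothesis from CTC prefix beam search output.

    candidates: list of (hyp, ctc_score, fwd_score, rev_score), where
    fwd_score / rev_score come from the left-to-right / right-to-left
    decoder halves of the bitransformer. Hypothetical helper for
    illustration only.
    """
    best_hyp, best_score = None, float("-inf")
    for hyp, ctc_score, fwd_score, rev_score in candidates:
        # Interpolate the two decoding directions by reverse_weight ...
        score = (1.0 - reverse_weight) * fwd_score + reverse_weight * rev_score
        # ... then add the weighted CTC score of the same candidate.
        score += ctc_weight * ctc_score
        if score > best_score:
            best_hyp, best_score = hyp, score
    return best_hyp, best_score
```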
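A note on `scheduler: warmuplr` with `lr: 0.002` and `warmup_steps: 25000`: this is a Noam-style schedule in which the learning rate ramps up linearly to the configured peak over the warmup steps and then decays with the inverse square root of the step count. A minimal sketch of that rule, assuming the ESPnet-style formula (check WeNet's `WarmupLR` for the exact implementation):

```python
def warmup_lr(step: int, base_lr: float = 0.002, warmup_steps: int = 25000) -> float:
    """Learning rate at optimizer step `step` (step >= 1).

    Linear ramp up to base_lr at step == warmup_steps,
    then roughly step**-0.5 decay afterwards.
    """
    return base_lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)

print(warmup_lr(1000))    # early in warmup:       8e-05
print(warmup_lr(25000))   # the peak, == base_lr:  0.002
print(warmup_lr(100000))  # well past warmup:      0.001
```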