-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
speedyspeech code adapt for mlu (#3828)
* speedyspeech code adapt for mlu * fix inference * fix help message
- Loading branch information
1 parent
4be0058
commit a9ece28
Showing
9 changed files
with
357 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#!/bin/bash
# Run inference.py for the speedyspeech_csmsc acoustic model with `--device mlu`,
# pairing it with one GAN vocoder per stage.
#
# Arguments:
#   $1  train_output_path: `<train_output_path>/inference` is passed as
#       --inference_dir and `<train_output_path>/pd_infer_out` as --output_dir.
# Environment:
#   BIN_DIR  base directory used to locate ../inference.py and
#            ../../assets/sentences.txt.

# Abort on the first failing command. Without this, a failed stage was masked
# by the later stage-gate `if`, and the script always exited 0.
set -e

train_output_path=${1:?missing argument: train_output_path}
: "${BIN_DIR:?BIN_DIR must be set}"

# Only stages within [stage, stop_stage] are executed.
stage=0
stop_stage=0

# for more GAN Vocoders
# multi band melgan
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  python3 "${BIN_DIR}/../inference.py" \
    --inference_dir="${train_output_path}/inference" \
    --am=speedyspeech_csmsc \
    --voc=mb_melgan_csmsc \
    --text="${BIN_DIR}/../../assets/sentences.txt" \
    --output_dir="${train_output_path}/pd_infer_out" \
    --phones_dict=dump/phone_id_map.txt \
    --tones_dict=dump/tone_id_map.txt \
    --device mlu
fi

# hifigan
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  python3 "${BIN_DIR}/../inference.py" \
    --inference_dir="${train_output_path}/inference" \
    --am=speedyspeech_csmsc \
    --voc=hifigan_csmsc \
    --text="${BIN_DIR}/../../assets/sentences.txt" \
    --output_dir="${train_output_path}/pd_infer_out" \
    --phones_dict=dump/phone_id_map.txt \
    --tones_dict=dump/tone_id_map.txt \
    --device mlu
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
#!/bin/bash
# Run synthesize_e2e.py (text -> waveform) for the speedyspeech_csmsc acoustic
# model with `--ngpu=0 --nmlu=1`, using one vocoder per stage.
#
# Arguments:
#   $1  config_path: passed as --am_config.
#   $2  train_output_path: the checkpoint is read from
#       `<train_output_path>/checkpoints/` and results are written to
#       `<train_output_path>/test_e2e`.
#   $3  ckpt_name: checkpoint file name under the checkpoints directory.
# Environment:
#   BIN_DIR  base directory used to locate ../synthesize_e2e.py and
#            ../../assets/sentences.txt.

# Abort on the first failing command. Without this, a failed stage was masked
# by the later stage-gate `if`s, and the script always exited 0.
set -e

config_path=${1:?missing argument: config_path}
train_output_path=${2:?missing argument: train_output_path}
ckpt_name=${3:?missing argument: ckpt_name}
: "${BIN_DIR:?BIN_DIR must be set}"

# Only stages within [stage, stop_stage] are executed.
stage=0
stop_stage=0

# for more GAN Vocoders
# multi band melgan
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  FLAGS_allocator_strategy=naive_best_fit \
  python3 "${BIN_DIR}/../synthesize_e2e.py" \
    --am=speedyspeech_csmsc \
    --am_config="${config_path}" \
    --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
    --am_stat=dump/train/feats_stats.npy \
    --voc=mb_melgan_csmsc \
    --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
    --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
    --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
    --lang=zh \
    --text="${BIN_DIR}/../../assets/sentences.txt" \
    --output_dir="${train_output_path}/test_e2e" \
    --phones_dict=dump/phone_id_map.txt \
    --tones_dict=dump/tone_id_map.txt \
    --ngpu=0 \
    --nmlu=1
fi

# the pretrained models have not been released yet
# style melgan
# style melgan's Dygraph to Static Graph conversion is not ready yet, so
# --inference_dir stays disabled for this stage (see the commented flag below)
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  FLAGS_allocator_strategy=naive_best_fit \
  python3 "${BIN_DIR}/../synthesize_e2e.py" \
    --am=speedyspeech_csmsc \
    --am_config="${config_path}" \
    --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
    --am_stat=dump/train/feats_stats.npy \
    --voc=style_melgan_csmsc \
    --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
    --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
    --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
    --lang=zh \
    --text="${BIN_DIR}/../../assets/sentences.txt" \
    --output_dir="${train_output_path}/test_e2e" \
    --phones_dict=dump/phone_id_map.txt \
    --tones_dict=dump/tone_id_map.txt \
    --ngpu=0 \
    --nmlu=1
    # --inference_dir=${train_output_path}/inference
fi

# hifigan
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  FLAGS_allocator_strategy=naive_best_fit \
  python3 "${BIN_DIR}/../synthesize_e2e.py" \
    --am=speedyspeech_csmsc \
    --am_config="${config_path}" \
    --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
    --am_stat=dump/train/feats_stats.npy \
    --voc=hifigan_csmsc \
    --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
    --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
    --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
    --lang=zh \
    --text="${BIN_DIR}/../../assets/sentences.txt" \
    --output_dir="${train_output_path}/test_e2e" \
    --phones_dict=dump/phone_id_map.txt \
    --tones_dict=dump/tone_id_map.txt \
    --inference_dir="${train_output_path}/inference" \
    --ngpu=0 \
    --nmlu=1
fi

# wavernn
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  echo "in wavernn syn_e2e"
  FLAGS_allocator_strategy=naive_best_fit \
  python3 "${BIN_DIR}/../synthesize_e2e.py" \
    --am=speedyspeech_csmsc \
    --am_config="${config_path}" \
    --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
    --am_stat=dump/train/feats_stats.npy \
    --voc=wavernn_csmsc \
    --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
    --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
    --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
    --lang=zh \
    --text="${BIN_DIR}/../../assets/sentences.txt" \
    --output_dir="${train_output_path}/test_e2e" \
    --phones_dict=dump/phone_id_map.txt \
    --tones_dict=dump/tone_id_map.txt \
    --inference_dir="${train_output_path}/inference" \
    --ngpu=0 \
    --nmlu=1
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
#!/bin/bash
# Run synthesize.py on the test-set metadata for the speedyspeech_csmsc
# acoustic model with `--ngpu=0 --nmlu=1`, using one vocoder per stage.
#
# Arguments:
#   $1  config_path: passed as --am_config.
#   $2  train_output_path: the checkpoint is read from
#       `<train_output_path>/checkpoints/` and results are written to
#       `<train_output_path>/test`.
#   $3  ckpt_name: checkpoint file name under the checkpoints directory.
# Environment:
#   BIN_DIR  base directory used to locate ../synthesize.py.

# Abort on the first failing command. Without this, a failed stage was masked
# by the later stage-gate `if`s, and the script always exited 0.
set -e

config_path=${1:?missing argument: config_path}
train_output_path=${2:?missing argument: train_output_path}
ckpt_name=${3:?missing argument: ckpt_name}
: "${BIN_DIR:?BIN_DIR must be set}"

# Only stages within [stage, stop_stage] are executed.
stage=0
stop_stage=0

# for more GAN Vocoders
# multi band melgan
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  FLAGS_allocator_strategy=naive_best_fit \
  python3 "${BIN_DIR}/../synthesize.py" \
    --am=speedyspeech_csmsc \
    --am_config="${config_path}" \
    --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
    --am_stat=dump/train/feats_stats.npy \
    --voc=mb_melgan_csmsc \
    --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
    --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
    --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
    --test_metadata=dump/test/norm/metadata.jsonl \
    --output_dir="${train_output_path}/test" \
    --phones_dict=dump/phone_id_map.txt \
    --tones_dict=dump/tone_id_map.txt \
    --ngpu=0 \
    --nmlu=1
fi

# style melgan
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  FLAGS_allocator_strategy=naive_best_fit \
  python3 "${BIN_DIR}/../synthesize.py" \
    --am=speedyspeech_csmsc \
    --am_config="${config_path}" \
    --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
    --am_stat=dump/train/feats_stats.npy \
    --voc=style_melgan_csmsc \
    --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
    --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
    --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
    --test_metadata=dump/test/norm/metadata.jsonl \
    --output_dir="${train_output_path}/test" \
    --phones_dict=dump/phone_id_map.txt \
    --tones_dict=dump/tone_id_map.txt \
    --ngpu=0 \
    --nmlu=1
fi

# hifigan
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  echo "in hifigan syn"
  FLAGS_allocator_strategy=naive_best_fit \
  python3 "${BIN_DIR}/../synthesize.py" \
    --am=speedyspeech_csmsc \
    --am_config="${config_path}" \
    --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
    --am_stat=dump/train/feats_stats.npy \
    --voc=hifigan_csmsc \
    --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
    --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
    --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
    --test_metadata=dump/test/norm/metadata.jsonl \
    --output_dir="${train_output_path}/test" \
    --phones_dict=dump/phone_id_map.txt \
    --tones_dict=dump/tone_id_map.txt \
    --ngpu=0 \
    --nmlu=1
fi

# wavernn
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  echo "in wavernn syn"
  FLAGS_allocator_strategy=naive_best_fit \
  python3 "${BIN_DIR}/../synthesize.py" \
    --am=speedyspeech_csmsc \
    --am_config="${config_path}" \
    --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
    --am_stat=dump/train/feats_stats.npy \
    --voc=wavernn_csmsc \
    --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
    --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
    --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
    --test_metadata=dump/test/norm/metadata.jsonl \
    --output_dir="${train_output_path}/test" \
    --tones_dict=dump/tone_id_map.txt \
    --phones_dict=dump/phone_id_map.txt \
    --ngpu=0 \
    --nmlu=1
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/bin/bash
# Launch train.py for SpeedySpeech with `--ngpu=0 --nmlu=2`.
#
# The shebang must be the very first line of the file to take effect; the
# original had a blank line in front of it.
#
# Arguments:
#   $1  config_path: passed as --config.
#   $2  train_output_path: passed as --output-dir.
# Environment:
#   BIN_DIR  directory containing train.py.
#
# NOTE(review): --nmlu=2 is hard-coded here; make sure the set of visible or
# selected MLU devices chosen by the caller provides two cards. Confirm how
# train.py reconciles the two settings.

config_path=${1:?missing argument: config_path}
train_output_path=${2:?missing argument: train_output_path}
: "${BIN_DIR:?BIN_DIR must be set}"

# export MLU_VISIBLE_DEVICES=8
# python3 (not python) for consistency with the sibling synthesis scripts.
python3 "${BIN_DIR}/train.py" \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config="${config_path}" \
    --output-dir="${train_output_path}" \
    --ngpu=0 \
    --nmlu=2 \
    --phones-dict=dump/phone_id_map.txt \
    --tones-dict=dump/tone_id_map.txt \
    --use-relative-path=True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#!/bin/bash
# Top-level SpeedySpeech pipeline for MLU devices: preprocess, train,
# synthesize, static inference, ONNX export/inference, Paddle-Lite
# export/inference and static PTQ. Stages within [stage, stop_stage] run in
# order, and any failing command aborts the script (`set -e`).

set -e
source path.sh
export CUSTOM_DEVICE_BLACK_LIST=elementwise_max
mlus=0
# `gpus` is used by stages 8 and 9 but was never defined in this script.
# Declare it explicitly, keeping the previous effective value: whatever the
# environment provides, otherwise empty.
# NOTE(review): declaring it before parse_options.sh presumably also allows a
# `--gpus` command-line override - confirm against parse_options.sh.
gpus=${gpus:-}
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_30600.pdz

# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with positional arguments `$1`, `$2` ...
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  # prepare data
  ./local/preprocess.sh "${conf_path}" || exit -1
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  # train model, all `ckpt` under `train_output_path/checkpoints/` dir
  FLAGS_selected_mlus="${mlus}" ./local/train_mlu.sh "${conf_path}" "${train_output_path}" || exit -1
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  # synthesize
  # NOTE(review): the original comment said "vocoder is pwgan by default", but
  # the MLU synthesize scripts in this commit appear to default to mb_melgan
  # (their stage 0) - confirm.
  FLAGS_selected_mlus="${mlus}" ./local/synthesize_mlu.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit -1
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  # synthesize_e2e
  # NOTE(review): same caveat as stage 2 about the default vocoder - confirm.
  FLAGS_selected_mlus="${mlus}" ./local/synthesize_e2e_mlu.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit -1
fi

if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
  # inference with static model
  FLAGS_selected_mlus="${mlus}" ./local/inference_mlu.sh "${train_output_path}" || exit -1
fi

# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first
# we have only tested the following models so far
if [ "${stage}" -le 5 ] && [ "${stop_stage}" -ge 5 ]; then
  # install paddle2onnx
  pip install paddle2onnx --upgrade
  ./local/paddle2onnx.sh "${train_output_path}" inference inference_onnx speedyspeech_csmsc
  # considering the balance between speed and quality, we recommend that you use hifigan as vocoder
  ./local/paddle2onnx.sh "${train_output_path}" inference inference_onnx pwgan_csmsc
  # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc
  # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
fi

# inference with onnxruntime
if [ "${stage}" -le 6 ] && [ "${stop_stage}" -ge 6 ]; then
  ./local/ort_predict.sh "${train_output_path}"
fi

# must run after stage 3 (the stage that generates the static models)
if [ "${stage}" -le 7 ] && [ "${stop_stage}" -ge 7 ]; then
  ./local/export2lite.sh "${train_output_path}" inference pdlite speedyspeech_csmsc x86
  ./local/export2lite.sh "${train_output_path}" inference pdlite pwgan_csmsc x86
  # ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86
  # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86
fi

if [ "${stage}" -le 8 ] && [ "${stop_stage}" -ge 8 ]; then
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/lite_predict.sh "${train_output_path}" || exit -1
fi

# PTQ_static
if [ "${stage}" -le 9 ] && [ "${stop_stage}" -ge 9 ]; then
  CUDA_VISIBLE_DEVICES="${gpus}" ./local/PTQ_static.sh "${train_output_path}" speedyspeech_csmsc || exit -1
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.