Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

speedyspeech code adapt for mlu #3828

Merged
merged 3 commits into from
Aug 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions examples/csmsc/tts2/local/inference_mlu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device mlu
fi

# hifigan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device mlu
fi
99 changes: 99 additions & 0 deletions examples/csmsc/tts2/local/synthesize_e2e_mlu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

stage=0
stop_stage=0

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi

# the pretrained models haven't release now
# style melgan
# style melgan's Dygraph to Static Graph is not ready now
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
# --inference_dir=${train_output_path}/inference
fi

# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nmlu=1
fi

# wavernn
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in wavernn syn_e2e"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nmlu=1
fi
90 changes: 90 additions & 0 deletions examples/csmsc/tts2/local/synthesize_mlu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi

# style melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi

# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "in hifigan syn"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi

# wavernn
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in wavernn syn"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--tones_dict=dump/tone_id_map.txt \
--phones_dict=dump/phone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi
16 changes: 16 additions & 0 deletions examples/csmsc/tts2/local/train_mlu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@

#!/bin/bash

config_path=$1
train_output_path=$2
# export MLU_VISIBLE_DEVICES=8
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=0 \
--nmlu=2 \
--phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt \
--use-relative-path=True
76 changes: 76 additions & 0 deletions examples/csmsc/tts2/run_mlu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/bin/bash

set -e
source path.sh
export CUSTOM_DEVICE_BLACK_LIST=elementwise_max
mlus=0
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_30600.pdz

# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
FLAGS_selected_mlus=${mlus} ./local/train_mlu.sh ${conf_path} ${train_output_path} || exit -1
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan by default
FLAGS_selected_mlus=${mlus} ./local/synthesize_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan by default
FLAGS_selected_mlus=${mlus} ./local/synthesize_e2e_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model
FLAGS_selected_mlus=${mlus} ./local/inference_mlu.sh ${train_output_path} || exit -1
fi

# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first
# we have only tested the following models so far
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
pip install paddle2onnx --upgrade
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
fi

# inference with onnxruntime
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
./local/ort_predict.sh ${train_output_path}
fi

# must run after stage 3 (which stage generated static models)
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
./local/export2lite.sh ${train_output_path} inference pdlite speedyspeech_csmsc x86
./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
# ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86
# ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86
fi

if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
fi

# PTQ_static
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} speedyspeech_csmsc || exit -1
fi
2 changes: 1 addition & 1 deletion paddlespeech/t2s/exps/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def parse_args():
parser.add_argument(
"--device",
default="gpu",
choices=["gpu", "cpu", "xpu", "npu"],
choices=["gpu", "cpu", "xpu", "npu", "mlu"],
help="Device selected for inference.", )
parser.add_argument('--cpu_threads', type=int, default=1)

Expand Down
12 changes: 10 additions & 2 deletions paddlespeech/t2s/exps/speedyspeech/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ def train_sp(args, config):
paddle.device.set_device("npu")
if world_size > 1:
paddle.distributed.init_parallel_env()
elif args.nmlu > 0:
paddle.device.set_device("mlu")
else:
paddle.set_device("cpu")

Expand Down Expand Up @@ -194,13 +196,19 @@ def main():
"--nxpu",
type=int,
default=0,
help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
)
parser.add_argument(
"--nnpu",
type=int,
default=0,
help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
)
parser.add_argument(
"--nmlu",
type=int,
default=1,
help="if wish to use npu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
)
parser.add_argument(
"--ngpu",
Expand Down
Loading