Skip to content

Commit

Permalink
[TTS] Fix some bugs in ERNIE-SAT (#2378)
Browse files: browse the repository at this point in the history.
* fix ernie_sat, test=tts

* fix for comments, test=tts
  • Loading branch information
yt605155624 authored Sep 14, 2022
1 parent ec571bb commit 80b1802
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 23 deletions.
6 changes: 3 additions & 3 deletions examples/aishell3/ernie_sat/local/synthesize_e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize_e2e.py \
--task_name=synthesize \
--wav_path=source/SSB03540307.wav\
--old_str='请播放歌曲小苹果' \
--new_str='歌曲真好听' \
--wav_path=source/SSB03540307.wav \
--old_str='请播放歌曲小苹果' \
--new_str='歌曲真好听' \
--source_lang=zh \
--target_lang=zh \
--erniesat_config=${config_path} \
Expand Down
6 changes: 3 additions & 3 deletions examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/synthesize_e2e.py \
--task_name=synthesize \
--wav_path=source/p243_313.wav \
--old_str='For that reason cover should not be given.' \
--old_str='For that reason cover should not be given' \
--new_str='今天天气很好' \
--source_lang=en \
--target_lang=zh \
Expand All @@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/synthesize_e2e.py \
--task_name=synthesize \
--wav_path=source/SSB03540307.wav \
--old_str='请播放歌曲小苹果' \
--new_str="Thank you!" \
--old_str='请播放歌曲小苹果' \
--new_str="Thank you" \
--source_lang=zh \
--target_lang=en \
--erniesat_config=${config_path} \
Expand Down
6 changes: 3 additions & 3 deletions examples/vctk/ernie_sat/local/synthesize_e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/synthesize_e2e.py \
--task_name=synthesize \
--wav_path=source/p243_313.wav \
--old_str='For that reason cover should not be given.' \
--old_str='For that reason cover should not be given' \
--new_str='I love you very much do you love me' \
--source_lang=en \
--target_lang=en \
Expand All @@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/synthesize_e2e.py \
--task_name=edit \
--wav_path=source/p243_313.wav \
--old_str='For that reason cover should not be given.' \
--new_str='For that reason cover is not impossible to be given.' \
--old_str='For that reason cover should not be given' \
--new_str='For that reason cover is not impossible to be given' \
--source_lang=en \
--target_lang=en \
--erniesat_config=${config_path} \
Expand Down
4 changes: 2 additions & 2 deletions paddlespeech/t2s/exps/ernie_sat/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def _readtg(tg_path: str, lang: str='en', fs: int=24000, n_shift: int=300):
durations[-2] += durations[-1]
durations = durations[:-1]

# replace ' and 'sil' with 'sp'
# replace '' and 'sil' with 'sp'
phones = ['sp' if (phn == '' or phn == 'sil') else phn for phn in phones]

if lang == 'en':
Expand Down Expand Up @@ -195,7 +195,7 @@ def words2phns(text: str, lang='en'):
wrd = wrd.upper()
if (wrd not in ds):
wrd2phns[str(index) + '_' + wrd] = 'spn'
phns.extend('spn')
phns.extend(['spn'])
else:
wrd2phns[str(index) + '_' + wrd] = word2phns_dict[wrd].split()
phns.extend(word2phns_dict[wrd].split())
Expand Down
28 changes: 16 additions & 12 deletions paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,6 @@ def prep_feats_with_dur(wav_path: str,
new_wav = np.concatenate(
[wav_org[:wav_left_idx], blank_wav, wav_org[wav_right_idx:]])

# 音频是正常遮住了
sf.write(str("mask_wav.wav"), new_wav, samplerate=fs)

# 4. get old and new mel span to be mask
old_span_bdy = get_span_bdy(
mfa_start=mfa_start, mfa_end=mfa_end, span_to_repl=span_to_repl)
Expand Down Expand Up @@ -274,7 +271,8 @@ def get_wav(wav_path: str,
new_str: str='',
duration_adjust: bool=True,
fs: int=24000,
n_shift: int=300):
n_shift: int=300,
task_name: str='synthesize'):

outs = get_mlm_output(
wav_path=wav_path,
Expand All @@ -298,9 +296,11 @@ def get_wav(wav_path: str,
alt_wav = np.squeeze(alt_wav)

old_time_bdy = [n_shift * x for x in old_span_bdy]
wav_replaced = np.concatenate(
[wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])

if task_name == 'edit':
wav_replaced = np.concatenate(
[wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])
else:
wav_replaced = alt_wav
wav_dict = {"origin": wav_org, "output": wav_replaced}
return wav_dict

Expand Down Expand Up @@ -356,7 +356,11 @@ def parse_args():
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")

# ernie sat related
parser.add_argument("--task_name", type=str, help="task name")
parser.add_argument(
"--task_name",
type=str,
choices=['edit', 'synthesize'],
help="task name.")
parser.add_argument("--wav_path", type=str, help="path of old wav")
parser.add_argument("--old_str", type=str, help="old string")
parser.add_argument("--new_str", type=str, help="new string")
Expand Down Expand Up @@ -410,10 +414,9 @@ def parse_args():
if args.task_name == 'edit':
new_str = new_str
elif args.task_name == 'synthesize':
new_str = old_str + new_str
new_str = old_str + ' ' + new_str
else:
new_str = old_str + new_str
print("new_str:", new_str)
new_str = old_str + ' ' + new_str

# Extractor
mel_extractor = LogMelFBank(
Expand Down Expand Up @@ -467,7 +470,8 @@ def parse_args():
new_str=new_str,
duration_adjust=args.duration_adjust,
fs=erniesat_config.fs,
n_shift=erniesat_config.n_shift)
n_shift=erniesat_config.n_shift,
task_name=args.task_name)

sf.write(
args.output_name, wav_dict['output'], samplerate=erniesat_config.fs)
Expand Down

0 comments on commit 80b1802

Please sign in to comment.