#!/bin/bash
transform_dir= # dir to find fMLLR transforms.
transform_dir_model= # dir to find fMLLR GMM model.
transform_dir_decode= # dir to find fMLLR transforms for decoding.
set -u # from train_am.sh and compile_lingware.sh and decode_nnet.sh
export LC_ALL=C # from compile_lingware.sh
## This script is a combination of compile_lingware.sh and decode_nnet.sh
#####################################
## For decoding TDNN-based models
#####################################
## First, we compile the lingware to run the decoder with certain input
## lexicon, language model, and acoustic models.
## Second, we extract features from test data.
## Finally, we decode the test data using the compiled graph.
# Get the general configuration variables
# (SPOKEN_NOISE_WORD, SIL_WORD, NOISE_WORD and GRAPHEMIC_CLUSTERS)
. uzh/configuration.sh
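# configuration.sh is expected to define at least the variables used below.
# Illustrative sketch only (the actual values live in uzh/configuration.sh):
#   SPOKEN_NOISE_WORD='<SPOKEN_NOISE>'
#   SIL_WORD='<SIL_WORD>'
#   NOISE_WORD='<NOISE>'
#   GRAPHEMIC_CLUSTERS=/path/to/graphemic_clusters.txt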
# This call selects the tool used for parallel computing: ($train_cmd, $decode_cmd)
. cmd.sh
# This includes in the path the kaldi binaries:
. path.sh
# This parses any input option, if supplied.
. utils/parse_options.sh
# . parse_options.sh || exit 1;
# Relevant for decoding
num_jobs=69 # Number of jobs for parallel processing
spn_word='<SPOKEN_NOISE>'
sil_word='<SIL_WORD>'
nsn_word='<NOISE>'
#####################################
# Flags to choose which stages to run:
#####################################
do_compile_graph=1
do_data_prep=1
do_feature_extraction=1
do_decoding=1
do_f1_scoring=0
do_wer_flex_scoring=0
echo "$0 $@" # Print the command line for logging
##################
# Input arguments:
##################
lm=$1
am_dir=$2 # am_out directory (output of train_AM.sh, usually am_out/)
model_dir=$3 # model directory (it's assumed that model_dir contains 'tree' and 'final.mdl')
dev_csv=$4 # dev csv for decoding
wav_dir=$5 # audio files for decoding
output_dir=$6 # can be shared between compile and decode
transcription=${7:-"orig"}
scoring_opts=${8:-"--min-lmwt 7 --max-lmwt 17"}
n2d_mapping=${9:-"/mnt/tannon/corpus_data/norm2dieth.json"}
vocabulary=${10:-''} # <-- lexicon
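# Example invocation (all paths are illustrative placeholders, not repo paths):
#   ./compile_decode_tdnn.sh \
#       /path/to/language_model.arpa \
#       am_out \
#       exp/chain/tdnn \
#       /path/to/dev.csv \
#       /path/to/chunked_wav_files \
#       out/tdnn_eval \
#       orig \
#       "--min-lmwt 7 --max-lmwt 17" \
#       /mnt/tannon/corpus_data/norm2dieth.json \
#       /path/to/vocabulary.txt
# Variables declared before utils/parse_options.sh is sourced (e.g. transform_dir)
# can also be overridden on the command line, e.g. --transform-dir <dir>.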
echo "$transcription"
#################
# Existing files: cf. am_out/initial_data/ling/ vs am_out/data/lang/
#################
am_ling_dir="$am_dir/initial_data/ling/" # equivalent to data/local/lang (input)
# vocabulary="$am_dir/initial_data/tmp/vocabulary.txt" # initial vocab created by train_am
# lexicon="$am_dir/initial_data/ling/lexicon.txt" # lexicon created by train_am
phone_table="$am_dir/data/lang/phones.txt" # created by train_am
# tree="$am_dir/initial_data/ling/tree" NOT USED
###########################
# Intermediate Directories:
###########################
# lexicon="$tmp_dir/lexicon.txt" # created in train_am, removed to reduce repetition!
tmp_dir="$output_dir/tmp"
lexicon_tmp="$tmp_dir/lexicon"
prepare_lang_tmp="$tmp_dir/prepare_lang_tmp"
tmp_lang="$tmp_dir/lang"
decode_dir="$output_dir/decode" # output dir for decoding
lang_dir="$output_dir/lang" # output dir for intermediate language data
feats_dir="$output_dir/feats"
feats_log_dir="$output_dir/feats/log"
# wav_scp="$lang_dir/wav.scp" NOT USED !!!
##########
## Output:
##########
wav_lst="$tmp_dir/wav.lst"
dev_transcriptions="$tmp_dir/text"
#####################################################################################
# check dependencies for compiling graph
for f in $lm $phone_table; do
[[ ! -e $f ]] && echo -e "\n\tERROR: missing input $f\n" && exit 1
done
mkdir -p $output_dir $tmp_dir $lexicon_tmp $prepare_lang_tmp $tmp_lang $decode_dir $lang_dir $feats_dir $feats_log_dir
# . ./path.sh
START_TIME=$(date +%s) # record time of operations
###################
## 1. Compile graph
###################
if [[ $do_compile_graph -ne 0 ]]; then
if [[ ! -z $vocabulary ]]; then
# A vocabulary was supplied: generate a fresh lexicon (text version) from it.
# Otherwise (see the else branch below) the lexicon created by train_AM.sh is reused.
lexicon="$lexicon_tmp/lexicon.txt"
echo ""
echo "###############################"
echo "### BEGIN: GENERATE LEXICON ###"
echo "###############################"
echo ""
# python3 archimob/create_simple_lexicon.py \
# -v $vocabulary \
# -c $GRAPHEMIC_CLUSTERS \
# -o $lexicon
archimob/create_simple_lexicon_2.py \
-v $vocabulary \
-c $GRAPHEMIC_CLUSTERS \
-o $lexicon
[[ $? -ne 0 ]] && echo -e "\n\tERROR: calling create_simple_lexicon_2.py\n" && exit 1
# Add to the lexicon the mapping for the silence word:
echo -e "$NOISE_WORD NSN\n$SIL_WORD SIL\n$SPOKEN_NOISE_WORD SPN\n" | cat - $lexicon | \
sort | uniq | sort -o $lexicon
sed -i '/^$/d' $lexicon
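# At this point $lexicon should follow the usual Kaldi lexicon.txt layout,
# "<word> <unit1> <unit2> ...", one entry per line. Illustrative sketch only
# (the real units depend on $GRAPHEMIC_CLUSTERS and the vocabulary):
#   <NOISE> NSN
#   <SIL_WORD> SIL
#   <SPOKEN_NOISE> SPN
#   aber a b e r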
# Copy the phone lists created by train_AM.sh; the lexicon itself was just
# generated above, so it is not overwritten here.
for f in nonsilence_phones.txt optional_silence.txt silence_phones.txt; do
[[ ! -e $am_ling_dir/$f ]] && echo -e "\n\tERROR: missing $f in $am_ling_dir\n" && exit 1
cp $am_ling_dir/$f $lexicon_tmp/
done
# cp $lexicon $lexicon_tmp/lexicon.txt
rm $lexicon_tmp/lexiconp.txt &> /dev/null
else
for f in lexicon.txt nonsilence_phones.txt optional_silence.txt silence_phones.txt; do
[[ ! -e $am_ling_dir/$f ]] && echo -e "\n\tERROR: missing $f in $am_ling_dir\n" && exit 1
cp $am_ling_dir/$f $lexicon_tmp/
done
fi
CUR_TIME=$(date +%s)
echo ""
echo "TIME ELAPSED: $(($CUR_TIME - $START_TIME)) seconds"
echo ""
echo ""
echo "######################################"
echo "### BEGIN: PREPARING LANGUAGE DATA ###"
echo "######################################"
echo ""
utils/prepare_lang.sh \
--phone-symbol-table $phone_table \
$lexicon_tmp \
"$SPOKEN_NOISE_WORD" \
$prepare_lang_tmp \
$tmp_lang
[[ $? -ne 0 ]] && echo -e "\n\tERROR: calling prepare_lang.sh\n" && exit 1
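# prepare_lang.sh should leave the compiled language data in $tmp_lang
# (L.fst, L_disambig.fst, words.txt, phones.txt, topo, oov.txt, ...).
# Optional sanity check, kept commented out (illustrative only):
# for f in L.fst words.txt phones.txt topo; do
#     [[ -e $tmp_lang/$f ]] || echo "WARNING: expected $tmp_lang/$f after prepare_lang.sh"
# done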
CUR_TIME=$(date +%s)
echo ""
echo "TIME ELAPSED: $(($CUR_TIME - $START_TIME)) seconds"
echo ""
echo ""
echo "##############################"
echo "### BEGIN: GENERATE LM FST $tmp_lang/G.fst ###"
echo "##############################"
echo ""
# Generate G.fst (grammar / language model):
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$tmp_lang/words.txt \
$lm \
$tmp_lang/G.fst
[[ $? -ne 0 ]] && echo -e "\n\tERROR: generating $tmp_lang/G.fst\n" && exit 1
set +e # don't exit on error
fstisstochastic $tmp_lang/G.fst
set -e # exit on error
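# fstisstochastic prints the minimum and maximum deviation from stochasticity
# and returns non-zero if the FST is not stochastic. Back-off language models
# usually fail this check, so the result is informational only, which is why
# errexit is disabled around the call.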
CUR_TIME=$(date +%s)
echo ""
echo "TIME ELAPSED: $(($CUR_TIME - $START_TIME)) seconds"
echo ""
echo ""
echo "#############################"
echo "### BEGIN: GENERATE GRAPH ###"
echo "#############################"
echo ""
# Generate the complete cascade:
utils/mkgraph.sh $tmp_lang $model_dir $output_dir
[[ $? -ne 0 ]] && echo -e "\n\tERROR: calling mkgraph.sh\n" && exit 1
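# mkgraph.sh composes the full HCLG decoding cascade; on success $output_dir
# should contain at least HCLG.fst and words.txt. Optional check (illustrative):
# [[ -e $output_dir/HCLG.fst ]] || echo "WARNING: $output_dir/HCLG.fst was not created"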
CUR_TIME=$(date +%s)
echo ""
echo "TIME ELAPSED: $(($CUR_TIME - $START_TIME)) seconds"
echo ""
echo ""
echo "#####################################"
echo "### COMPLETED: CONSTRUCTING GRAPH ###"
echo "#####################################"
echo ""
fi
###########################
## 2. Prep DEVELOPMENT Data
###########################
if [[ $do_data_prep -ne 0 ]]; then
# 1.- Create the transcriptions and wave list:
echo ""
echo "#######################################################"
echo "### BEGIN: EXTRACT DEV TRANSCRIPTIONS AND WAVE LIST ###"
echo "#######################################################"
echo ""
# Note the options -f and -p: utterances with no relevant speech or with
# overlapping speech are rejected; ArchiMob markers (hesitations, coughing, ...)
# are mapped to less specific classes (see process_archimob_csv.py).
# echo "Processing $dev_csv:"
archimob/process_archimob_csv.py \
-i $dev_csv \
-trans $transcription \
-f \
-p \
-t $dev_transcriptions \
--spn-word $spn_word \
--sil-word $sil_word \
--nsn-word $nsn_word \
-o $wav_lst
[[ $? -ne 0 ]] && echo -e "\n\tERROR: calling process_archimob_csv.py\n" && exit 1
# Sort them the way Kaldi likes it:
sort $dev_transcriptions -o $dev_transcriptions
sort $wav_lst -o $wav_lst
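# Kaldi requires these files to be sorted in C-locale order (hence the
# 'export LC_ALL=C' above). $dev_transcriptions should follow the standard
# Kaldi 'text' format, "<utt-id> <word1> <word2> ...", one utterance per line,
# and $wav_lst is expected to list the corresponding audio files, one per line.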
# 2. Create the secondary files needed by Kaldi (wav.scp, utt2spk, spk2utt):
echo ""
echo "###############################################"
echo "### BEGIN: CREATE SECONDARY FILES FOR KALDI ###"
echo "###############################################"
echo ""
archimob/create_secondary_files.py \
-w $wav_dir \
-o $lang_dir \
decode \
-t $dev_transcriptions
[[ $? -ne 0 ]] && echo -e "\n\tERROR: calling create_secondary_files.py\n" && exit 1
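# create_secondary_files.py should populate $lang_dir with the standard Kaldi
# data-dir files. Expected per-line formats:
#   wav.scp : <recording-id> <path-to-wav>
#   utt2spk : <utt-id> <speaker-id>
#   spk2utt : <speaker-id> <utt-id-1> <utt-id-2> ...
# Optional sanity check before feature extraction (commented out, illustrative):
# utils/validate_data_dir.sh --no-feats $lang_dir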
CUR_TIME=$(date +%s)
echo ""
echo "TIME ELAPSED: $(($CUR_TIME - $START_TIME)) seconds"
echo ""
fi
#######################
## 3. Extract features:
#######################
if [[ $do_feature_extraction -ne 0 ]]; then
echo ""
echo "#################################"
echo "### BEGIN: FEATURE EXTRACTION ###"
echo "#################################"
echo ""
# This extracts MFCC features. See conf/mfcc.conf for the configuration
# Note: the $train_cmd variable is defined in cmd.sh
steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs $lang_dir \
$feats_log_dir $feats_dir
[[ $? -ne 0 ]] && echo -e "\n\tERROR: during feature extraction\n" && exit 1
# This computes the cepstral mean and variance normalization (CMVN) statistics:
steps/compute_cmvn_stats.sh $lang_dir $feats_log_dir $feats_dir
[[ $? -ne 0 ]] && echo -e "\n\tERROR: during cmvn computation\n" && exit 1
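# make_mfcc.sh writes feats.scp into $lang_dir (the feature archives go to
# $feats_dir), and compute_cmvn_stats.sh adds cmvn.scp with per-speaker CMVN
# statistics. Optional sanity check (commented out, illustrative):
# utils/validate_data_dir.sh $lang_dir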
CUR_TIME=$(date +%s)
echo ""
echo "TIME ELAPSED: $(($CUR_TIME - $START_TIME)) seconds"
echo ""
fi
##############
## 4. Decoding
##############
# dependencies for decoding
# for f in $dev_transcriptions $wav_dir $model_dir ; do
# [[ ! -e $f ]] && echo "Error. Missing input $f" && exit 1
# done
# $output_dir --> graphdir=$1
# srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory
# $lang_dir --> data=$2
# $decode_dir --> dir=$3
# $model_dir --> srcdir=$4
if [[ $do_decoding -ne 0 ]]; then
echo ""
echo "#######################"
echo "### BEGIN: DECODING ###"
echo "#######################"
echo ""
rm -rf $decode_dir/*
uzh/decode_tdnn.sh --stage 0 --nj $num_jobs \
--test-set "$lang_dir" \
--test-affix "test_label" \
--lm "$lm" \
--lmtype "lm_name" \
--model-dir "$model_dir" \
--wav-dir "$wav_dir"
[[ $? -ne 0 ]] && echo -e "\n\tERROR: during decoding\n" && exit 1
# Copy the results to the output folder:
cp $decode_dir/scoring_kaldi/best_wer $output_dir
cp -r $decode_dir/scoring_kaldi/wer_details $output_dir/
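# scoring_kaldi/best_wer typically holds a single line of the form
#   %WER 35.20 [ 3520 / 10000, 300 ins, 500 del, 2720 sub ] .../wer_<lmwt>_<penalty>
# (numbers are illustrative), and wer_details/ contains per-utterance and
# per-speaker breakdowns.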
fi
CUR_TIME=$(date +%s)
echo ""
echo "TIME ELAPSED: $(($CUR_TIME - $START_TIME)) seconds"
echo ""
echo ""
echo "### DONE: $0 ###"
echo ""