forked from kaldi-asr/kaldi
Commit
[src,scripts,egs] Add support for Continual Learning using chain/LF-MMI models (kaldi-asr#4686)

* Add support for LWF and DenLWF continual learning using chain models
* Some minor changes based on comments
* Some more minor changes
* Some more minor changes

Co-authored-by: Hossein Hadian <hossein.hadian@behavox.com>
1 parent 0be6186 · commit e4940d0 · Showing 31 changed files with 2,031 additions and 30 deletions.
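For context: in Learning without Forgetting (LWF), the model is trained on the new-task data while a distillation term keeps its outputs close to those of the frozen previous model. A minimal sketch of such an objective for an LF-MMI-trained model, under the usual LWF formulation (the exact LWF and DenLWF objectives are the ones defined in the code this commit adds):

  \mathcal{L}(\theta) = \mathcal{L}_{\text{LF-MMI}}(\theta; \mathcal{D}_{\text{new}}) + \lambda \, \mathrm{KL}\big( p_{\theta_{\text{old}}}(\cdot \mid x) \,\big\|\, p_{\theta}(\cdot \mid x) \big)

where \theta_{\text{old}} is the frozen previous model, \mathcal{D}_{\text{new}} is the new-task data, and \lambda controls how strongly the old behavior is preserved.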
@@ -0,0 +1,4 @@
export train_cmd=run.pl
export decode_cmd=run.pl
export cuda_cmd=run.pl
export mkgraph_cmd=run.pl
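This cmd.sh runs all jobs locally with run.pl. On a grid one would typically point these at queue.pl with scheduler options instead; a minimal sketch (the memory/GPU values here are hypothetical and site-dependent):

  export train_cmd="queue.pl --mem 4G"
  export decode_cmd="queue.pl --mem 4G"
  export cuda_cmd="queue.pl --gpu 1"
  export mkgraph_cmd="queue.pl --mem 8G"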
@@ -0,0 +1,2 @@
--use-energy=false   # only non-default option.
--sample-frequency=8000  # Switchboard is sampled at 8kHz
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because the coefficients are less
# correlated), which is why we prefer this method.
--use-energy=false   # use average of log energy, not energy.
--sample-frequency=8000  # Switchboard is sampled at 8kHz
--num-mel-bins=40    # similar to Google's setup.
--num-ceps=40        # there is no dimensionality reduction.
--low-freq=40        # low cutoff frequency for mel bins
--high-freq=-200     # high cutoff frequency, relative to the Nyquist of 4000 (i.e. 3800)
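A config like this is normally passed to Kaldi's feature extraction script; a sketch of typical usage (directory names are hypothetical, following the defaults used later in this commit):

  steps/make_mfcc.sh --nj 10 --cmd run.pl \
    --mfcc-config conf/mfcc_hires.conf data/train_sp_hires
  steps/compute_cmvn_stats.sh data/train_sp_hires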
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
@@ -0,0 +1,139 @@
#!/bin/bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Copyright 2020  Hossein Hadian
# Apache 2.0

# This is similar to align_fmllr_lats.sh except it works with non-SAT
# systems, e.g. tri1 or tri2.

# Computes training alignments using a model with delta or
# LDA+MLLT features.

# If you supply the "--use-graphs true" option, it will use the training
# graphs from the source directory (where the model is). In this
# case the number of jobs must match with the source directory.

# Begin configuration section.
nj=4
cmd=run.pl
use_graphs=false
# Begin configuration.
#scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
acoustic_scale=0.1
beam=10
retry_beam=40
final_beam=20  # For the lattice-generation phase there is no retry-beam.  This
               # is a limitation of gmm-latgen-faster.  We just use an
               # intermediate beam.  We'll lose a little data and it will be
               # slightly slower.  (However, the min-active of 200 that
               # gmm-latgen-faster defaults to may help.)
careful=false
boost_silence=1.0 # Factor by which to boost silence during alignment.
stage=0
generate_ali_from_lats=false # If true, alignments are generated from the lattices.
max_active=7000
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
  echo "usage: local/align_si_lats.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
  echo "e.g.:  local/align_si_lats.sh data/train data/lang exp/tri1 exp/tri1_ali"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --nj <nj>                                        # number of parallel jobs"
  echo "  --use-graphs true                                # use graphs in src-dir"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  exit 1;
fi

data=$1
lang=$2
srcdir=$3
dir=$4

for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
cp $srcdir/delta_opts $dir 2>/dev/null

[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;

if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    cp $srcdir/final.mat $srcdir/full.mat $dir
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac
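
# As a quick sanity check of the assembled feature pipeline (a hypothetical
# example, not part of the original script), one could print the feature
# dimension for the first job with Kaldi's feat-to-dim:
#   feat-to-dim "${feats//JOB/1}" -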

if [ $stage -le 0 ]; then
  echo "$0: compiling training graphs"
  tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $scale_opts $dir/tree $dir/final.mdl $lang/L.fst "$tra" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir"

mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/final.mdl - |"

if [ $stage -le 2 ]; then
  # Warning: gmm-latgen-faster doesn't support a retry-beam so you may get more
  # alignment errors (however, it does have a default min-active=200 so this
  # will tend to reduce alignment errors).
  # --allow-partial=false makes sure we reach the end of the decoding graph.
  # --word-determinize=false makes sure we retain the alternative pronunciations of
  #   words (including alternatives regarding optional silences).
  # --lattice-beam=$final_beam keeps all the alternatives that were within the beam;
  #   it means we do no pruning of the lattice (lattices from a training transcription
  #   will be small anyway).
  echo "$0: generating lattices containing alternate pronunciations."
  $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \
    gmm-latgen-faster --max-active=$max_active --acoustic-scale=$acoustic_scale --beam=$final_beam \
      --lattice-beam=$final_beam --allow-partial=false --word-determinize=false \
      "$mdl" "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
      "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
fi
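
# To eyeball a few of the generated lattices (again a hypothetical check, not
# part of the original script), one could print job 1's output in text form:
#   lattice-copy "ark:gunzip -c $dir/lat.1.gz|" ark,t:- | head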

if [ $stage -le 3 ] && $generate_ali_from_lats; then
  # If generate_ali_from_lats is true, ali.*.gz is generated in the lats dir.
  $cmd JOB=1:$nj $dir/log/generate_alignments.JOB.log \
    lattice-best-path --acoustic-scale=$acoustic_scale "ark:gunzip -c $dir/lat.JOB.gz |" \
      ark:/dev/null "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi
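
# The resulting alignments can be inspected as phone-level start/end times (a
# hypothetical check, not part of the original script) with:
#   ali-to-phones --ctm-output $dir/final.mdl "ark:gunzip -c $dir/ali.1.gz|" - | head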

steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir

echo "$0: done aligning data."
@@ -0,0 +1,213 @@
#!/usr/bin/env bash

# Copyright 2021  Behavox (author: Hossein Hadian)
# Apache 2.0

# The only path modification this script does is to append _hires to test set
# directory names for extracting the ivectors.

set -euo pipefail

stage=0
lores_train_data_dir=data/train_sp
train_data_dir=data/train_sp_hires
gmm=exp/tri3b
ali_lats_dir=exp/tri3b_lats_train
lang=data/lang
lang_chain=data/lang
tree_dir=exp/chain/tree_sp
leaves=4500
test_sets="test_ldc test_sp_oc"
nj=10
tree_opts="--context-width=2 --central-position=1"
exp=exp

online_cmvn_iextractor=false
use_ivector=true
extractor=
ivector_dim=100
nnet3_affix=
echo "$0 $@"  # Print the command line for logging

. ./cmd.sh
. ./path.sh
. utils/parse_options.sh

if [ -z "$extractor" ]; then
  extractor=$exp/nnet3${nnet3_affix}/extractor
fi

train_set=$(basename $train_data_dir)
echo "$0: Highres train data dir: $train_data_dir"
echo "$0: Lowres train data dir: ${lores_train_data_dir}"

for f in ${lores_train_data_dir}/feats.scp $train_data_dir/feats.scp $gmm/final.mdl; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

if [[ $stage -le 5 ]] && [[ ! -z $gmm ]]; then
  if [[ -f $ali_lats_dir/lat.1.gz ]] && [[ $ali_lats_dir/lat.1.gz -nt $gmm/final.mdl ]]; then
    printf "\n$0: The lattices seem to be there and up to date w.r.t. the GMM model. Skipping.\n\n"
  else
    echo "$0: Generating alignments and lattices..."
    if [ -f $gmm/trans.1 ]; then  # It's an fMLLR (SAT) system.
      steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
        --generate-ali-from-lats true \
        $lores_train_data_dir \
        $lang $gmm $ali_lats_dir || exit 1;
    else
      local/align_si_lats.sh --nj $nj --cmd "$train_cmd" \
        --generate-ali-from-lats true \
        $lores_train_data_dir $lang $gmm $ali_lats_dir
    fi
    rm $ali_lats_dir/fsts.*.gz 2>/dev/null || true  # save space
  fi
  sleep 2
fi

if [ $stage -le 6 ]; then
  echo "$0: Creating lang directory $lang_chain with chain-type topology"
  # Create a version of the lang/ directory that has one state per phone in the
  # topo file. [Note: it really has two states... the first one is only repeated
  # once, the second one has zero or more repeats.]
  if [ -d $lang_chain ]; then
    if [ $lang_chain/L.fst -nt $lang/L.fst ]; then
      echo "$0: $lang_chain already exists, not overwriting it; continuing"
    else
      echo "$0: $lang_chain already exists and seems to be older than data/lang..."
    fi
  else
    cp -r $lang $lang_chain
    silphonelist=$(cat $lang_chain/phones/silence.csl) || exit 1;
    nonsilphonelist=$(cat $lang_chain/phones/nonsilence.csl) || exit 1;
    # Use our special topology... note that later on we may have to tune this
    # topology.
    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang_chain/topo
  fi
  sleep 2
fi

if [ $stage -le 7 ]; then
  echo "$0: Building the tree..."
  if [[ -f $tree_dir/final.mdl ]]; then
    printf "\n$0: $tree_dir/final.mdl already exists. Skipping.\n\n"
  elif [ -z "$tree_dir" ]; then
    printf "\n$0: Tree dir is empty. Skipping tree stage.\n\n"
  else
    steps/nnet3/chain/build_tree.sh \
      --frame-subsampling-factor 3 \
      --context-opts "$tree_opts" \
      --cmd "$train_cmd" $leaves ${lores_train_data_dir} \
      $lang_chain $ali_lats_dir $tree_dir
  fi
  sleep 2
fi
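
# Once built, the tree can be checked (a hypothetical sanity check, not part
# of the original script) with Kaldi's tree-info, which prints, among other
# things, the number of pdfs (leaves):
#   tree-info $tree_dir/tree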

if ! $use_ivector; then
  echo "$0: ## Not doing ivectors ##"
  sleep 2
  exit 0;
fi

if [[ -f $extractor/final.ie ]] && [[ $stage -le 9 ]]; then
  echo ""
  echo "$0: There is already an ivector extractor trained. Skipping..."
  echo ""
else
  if [ $stage -le 8 ]; then
    echo "$0: computing a subset of data to train the diagonal UBM."
    # We'll use about a quarter of the data.
    mkdir -p $exp/nnet3${nnet3_affix}/diag_ubm
    temp_data_root=$exp/nnet3${nnet3_affix}/diag_ubm

    num_utts_total=$(wc -l <data/${train_set}/utt2spk)
    num_utts=$[$num_utts_total/4]
    utils/data/subset_data_dir.sh data/${train_set} \
      $num_utts ${temp_data_root}/${train_set}_subset

    echo "$0: computing a PCA transform from the hires data."
    steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
      --splice-opts "--left-context=3 --right-context=3" \
      --max-utts 10000 --subsample 2 \
      ${temp_data_root}/${train_set}_subset \
      $exp/nnet3${nnet3_affix}/pca_transform

    echo "$0: training the diagonal UBM."
    # Use 512 Gaussians in the UBM.
    steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj $[$nj/2] \
      --num-frames 700000 \
      --num-threads 8 \
      ${temp_data_root}/${train_set}_subset 512 \
      $exp/nnet3${nnet3_affix}/pca_transform $exp/nnet3${nnet3_affix}/diag_ubm
  fi

  if [ $stage -le 9 ]; then
    # Train the iVector extractor. Use all of the speed-perturbed data since
    # iVector extractors can be sensitive to the amount of data. The script
    # defaults to an iVector dimension of 100.
    echo "$0: training the iVector extractor"
    steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
      --num-threads 4 --num-processes 2 --ivector-dim $ivector_dim \
      --online-cmvn-iextractor $online_cmvn_iextractor \
      data/${train_set} $exp/nnet3${nnet3_affix}/diag_ubm \
      $extractor || exit 1;
  fi
  sleep 2
fi

if [ $stage -le 10 ]; then
  # We extract iVectors on the speed-perturbed training data after combining
  # short segments, which will be what we train the system on. With
  # --utts-per-spk-max 2, the script pairs the utterances into twos, and treats
  # each of these pairs as one speaker; this gives more diversity in iVectors.
  # Note that these are extracted 'online'.

  # Note: we don't encode the 'max2' in the name of the ivectordir even though
  # that's the data we extract the ivectors from, as it's still going to be
  # valid for the non-'max2' data; the utterance list is the same.

  ivectordir=$exp/nnet3${nnet3_affix}/ivectors_${train_set}

  if [ -f $ivectordir/ivector_online.scp ]; then
    echo ""
    echo "iVectors already there for $train_set. Skipping. Check compatibility yourself!"
    echo ""
  else
    # Having a larger number of speakers is helpful for generalization, and to
    # handle per-utterance decoding well (the iVector starts at zero).
    temp_data_root=${ivectordir}
    utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
      data/${train_set} ${temp_data_root}/${train_set}_max2

    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
      ${temp_data_root}/${train_set}_max2 \
      $extractor $ivectordir
  fi
  sleep 2
  # Also extract iVectors for the test data, but in this case we don't need the
  # speed perturbation (sp).
  echo "--- $0: $test_sets"
  for data in $test_sets; do
    odir=$exp/nnet3${nnet3_affix}/ivectors_${data}_hires
    if [ -f $odir/ivector_online.scp ]; then
      echo ""
      echo "iVectors already there for $data. Skipping. Check compatibility yourself!"
      echo ""
    else
      nspk=$(wc -l <data/${data}_hires/spk2utt)
      if [ $nspk -gt $nj ]; then
        nspk=$nj
      fi
      steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nspk \
        data/${data}_hires $extractor $odir
    fi
  done
  sleep 2
fi
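
# Downstream, these iVectors would typically be consumed by the chain training
# script via its online-ivector option (a hypothetical example of later usage,
# not part of this script):
#   steps/nnet3/chain/train.py --feat.online-ivector-dir \
#     $exp/nnet3${nnet3_affix}/ivectors_${train_set} ...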

exit 0
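For reference, this preparation script would typically be invoked along these lines (the script name and all paths here are hypothetical, following the defaults above):

  local/chain/prepare_chain.sh --stage 0 --nj 10 \
    --lores-train-data-dir data/train_sp --train-data-dir data/train_sp_hires \
    --gmm exp/tri3b --tree-dir exp/chain/tree_sp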