[src,scripts,egs] Add support for Continual Learning using chain/LF-MMI models (kaldi-asr#4686)

* Add support for LWF and DenLWF continual learning using chain models

* Some minor changes based on comments

* Some more minor changes

* Some more minor changes

Co-authored-by: Hossein Hadian <hossein.hadian@behavox.com>
hhadian and hossein-hadian authored Feb 13, 2022
1 parent 0be6186 commit e4940d0
Showing 31 changed files with 2,031 additions and 30 deletions.
4 changes: 4 additions & 0 deletions egs/cl_english/v1/cmd.sh
@@ -0,0 +1,4 @@
export train_cmd=run.pl
export decode_cmd=run.pl
export cuda_cmd=run.pl
export mkgraph_cmd=run.pl
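# Illustrative note (hypothetical grid options, not part of this recipe): run.pl
# executes all jobs locally; to submit to a grid instead, these could point at
# queue.pl, e.g.:
#   export train_cmd="queue.pl --mem 4G"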
2 changes: 2 additions & 0 deletions egs/cl_english/v1/conf/mfcc.conf
@@ -0,0 +1,2 @@
--use-energy=false # only non-default option.
--sample-frequency=8000 # Switchboard is sampled at 8kHz
10 changes: 10 additions & 0 deletions egs/cl_english/v1/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false # use average of log energy, not energy.
--sample-frequency=8000 # Switchboard is sampled at 8kHz
--num-mel-bins=40 # similar to Google's setup.
--num-ceps=40 # there is no dimensionality reduction.
--low-freq=40 # low cutoff frequency for mel bins
--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800)
1 change: 1 addition & 0 deletions egs/cl_english/v1/conf/online_cmvn.conf
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
139 changes: 139 additions & 0 deletions egs/cl_english/v1/local/align_si_lats.sh
@@ -0,0 +1,139 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Copyright 2020 Hossein Hadian
# Apache 2.0

# This is similar to align_fmllr_lats.sh except it works with non-SAT
# systems e.g. tri1 or tri2

# Computes training alignments using a model with delta or
# LDA+MLLT features.

# If you supply the "--use-graphs true" option, it will use the training
# graphs from the source directory (where the model is). In this
# case the number of jobs must match with the source directory.
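#
# Example invocation (illustrative paths; any non-SAT GMM dir such as tri1 or
# tri2 should work):
#   local/align_si_lats.sh --nj 8 --cmd run.pl \
#     data/train data/lang exp/tri2 exp/tri2_lats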


# Begin configuration section.
nj=4
cmd=run.pl
use_graphs=false
# Begin configuration.
#scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
acoustic_scale=0.1
beam=10
retry_beam=40
final_beam=20 # For the lattice-generation phase there is no retry-beam. This
# is a limitation of gmm-latgen-faster. We just use an
# intermediate beam. We'll lose a little data and it will be
# slightly slower. (however, the min-active of 200 that
# gmm-latgen-faster defaults to may help.)
careful=false
boost_silence=1.0 # Factor by which to boost silence during alignment.
stage=0
generate_ali_from_lats=false # If true, alignments are generated from the lattices.
max_active=7000
# End configuration options.

echo "$0 $@" # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
echo "usage: steps/align_si.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: steps/align_si.sh data/train data/lang exp/tri1 exp/tri1_ali"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --use-graphs true # use graphs in src-dir"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi

data=$1
lang=$2
srcdir=$3
dir=$4


for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl; do
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
cp $srcdir/delta_opts $dir 2>/dev/null

[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;



if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

case $feat_type in
delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $srcdir/full.mat $dir
;;
*) echo "$0: invalid feature type $feat_type" && exit 1;
esac


if [ $stage -le 0 ]; then
echo "$0: compiling training graphs"
tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
$cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $scale_opts $dir/tree $dir/final.mdl $lang/L.fst "$tra" \
"ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi


echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir"

mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/final.mdl - |"

if [ $stage -le 2 ]; then
# Warning: gmm-latgen-faster doesn't support a retry-beam so you may get more
# alignment errors (however, it does have a default min-active=200 so this
# will tend to reduce alignment errors).
# --allow_partial=false makes sure we reach the end of the decoding graph.
# --word-determinize=false makes sure we retain the alternative pronunciations of
# words (including alternatives regarding optional silences).
# --lattice-beam=$beam keeps all the alternatives that were within the beam,
# it means we do no pruning of the lattice (lattices from a training transcription
# will be small anyway).
echo "$0: generating lattices containing alternate pronunciations."
$cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \
gmm-latgen-faster --max-active=$max_active --acoustic-scale=$acoustic_scale --beam=$final_beam \
--lattice-beam=$final_beam --allow-partial=false --word-determinize=false \
"$mdl" "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
"ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
fi

if [ $stage -le 3 ] && $generate_ali_from_lats; then
# If generate_ali_from_lats is true, ali.*.gz is generated in the lats dir.
$cmd JOB=1:$nj $dir/log/generate_alignments.JOB.log \
lattice-best-path --acoustic-scale=$acoustic_scale "ark:gunzip -c $dir/lat.JOB.gz |" \
ark:/dev/null "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir

echo "$0: done aligning data."
213 changes: 213 additions & 0 deletions egs/cl_english/v1/local/chain/run_chain_common.sh
@@ -0,0 +1,213 @@
#!/usr/bin/env bash

# Copyright 2021 Behavox (author: Hossein Hadian)
# Apache 2.0

# The only path modification this script does is to append _hires to test set directory names
# for extracting the ivectors.
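#
# Example invocation (illustrative values; every option shown has a default below):
#   local/chain/run_chain_common.sh --stage 0 --nj 10 --gmm exp/tri3b \
#     --lores-train-data-dir data/train_sp --train-data-dir data/train_sp_hires \
#     --tree-dir exp/chain/tree_sp --leaves 4500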

set -euo pipefail

stage=0
lores_train_data_dir=data/train_sp
train_data_dir=data/train_sp_hires
gmm=exp/tri3b
ali_lats_dir=exp/tri3b_lats_train
lang=data/lang
lang_chain=data/lang
tree_dir=exp/chain/tree_sp
leaves=4500
test_sets="test_ldc test_sp_oc"
nj=10
tree_opts="--context-width=2 --central-position=1"
exp=exp

online_cmvn_iextractor=false
use_ivector=true
extractor=
ivector_dim=100
nnet3_affix=
echo "$0 $@" # Print the command line for logging

. ./cmd.sh
. ./path.sh
. utils/parse_options.sh

if [ -z "$extractor" ]; then
extractor=$exp/nnet3${nnet3_affix}/extractor
fi

train_set=$(basename $train_data_dir)
echo "$0: Highres train data dir: $train_data_dir"
echo "$0: Lowres train data dir: ${lores_train_data_dir}"

for f in ${lores_train_data_dir}/feats.scp $train_data_dir/feats.scp $gmm/final.mdl; do
if [ ! -f $f ]; then
echo "$0: expected file $f to exist"
exit 1
fi
done

if [[ $stage -le 5 ]] && [[ ! -z $gmm ]]; then
if [[ -f $ali_lats_dir/lat.1.gz ]] && [[ $ali_lats_dir/lat.1.gz -nt $gmm/final.mdl ]]; then
printf "\n$0: The lattices seem to be there and up to date wrt to gmm model. Skipping\n\n"
else
echo "$0: Generating alignments and lattices for "
if [ -f $gmm/trans.1 ]; then # It's fmllr
steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
--generate-ali-from-lats true \
$lores_train_data_dir \
$lang $gmm $ali_lats_dir || exit 1;
else
local/align_si_lats.sh --nj $nj --cmd "$train_cmd" \
--generate-ali-from-lats true \
$lores_train_data_dir $lang $gmm $ali_lats_dir
fi
rm $ali_lats_dir/fsts.*.gz 2>/dev/null || true # save space
fi
sleep 2
fi


if [ $stage -le 6 ]; then
echo "$0: Creating lang directory $lang_chain with chain-type topology"
# Create a version of the lang/ directory that has one state per phone in the
# topo file. [note, it really has two states.. the first one is only repeated
# once, the second one has zero or more repeats.]
if [ -d $lang_chain ]; then
if [ $lang_chain/L.fst -nt $lang/L.fst ]; then
echo "$0: $lang_chain already exists, not overwriting it; continuing"
else
echo "$0: $lang_chain already exists and seems to be older than data/lang..."
fi
else
cp -r $lang $lang_chain
silphonelist=$(cat $lang_chain/phones/silence.csl) || exit 1;
nonsilphonelist=$(cat $lang_chain/phones/nonsilence.csl) || exit 1;
# Use our special topology... note that later on may have to tune this
# topology.
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang_chain/topo
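# Optional sanity check on the resulting lang dir (a suggestion only;
# validate_lang.pl is a standard Kaldi utility):
#   utils/validate_lang.pl $lang_chain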
fi
sleep 2
fi

if [ $stage -le 7 ]; then
echo "$0: Buidling the tree..."
if [[ -f $tree_dir/final.mdl ]]; then
printf "\n$0: $tree_dir/final.mdl already exists. Skipping.\n\n"
elif [ -z "$tree_dir" ]; then
printf "\n$0: Tree dir is empty. Skipping tree stage.\n\n"
else
steps/nnet3/chain/build_tree.sh \
--frame-subsampling-factor 3 \
--context-opts "$tree_opts" \
--cmd "$train_cmd" $leaves ${lores_train_data_dir} \
$lang_chain $ali_lats_dir $tree_dir
fi
sleep 2
fi

if ! $use_ivector; then
echo "$0: ## Not doing ivectors ##"
sleep 2
exit 0;
fi


if [[ -f $extractor/final.ie ]] && [[ $stage -le 9 ]]; then
echo ""
echo "$0: There is already an ivector extractor trained. Skipping..."
echo ""
else
if [ $stage -le 8 ]; then
echo "$0: computing a subset of data to train the diagonal UBM."
# We'll use about a quarter of the data.
mkdir -p $exp/nnet3${nnet3_affix}/diag_ubm
temp_data_root=$exp/nnet3${nnet3_affix}/diag_ubm

num_utts_total=$(wc -l <data/${train_set}/utt2spk)
num_utts=$[$num_utts_total/4]
utils/data/subset_data_dir.sh data/${train_set} \
$num_utts ${temp_data_root}/${train_set}_subset

echo "$0: computing a PCA transform from the hires data."
steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
--max-utts 10000 --subsample 2 \
${temp_data_root}/${train_set}_subset \
$exp/nnet3${nnet3_affix}/pca_transform

echo "$0: training the diagonal UBM."
# Use 512 Gaussians in the UBM.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj $[$nj/2] \
--num-frames 700000 \
--num-threads 8 \
${temp_data_root}/${train_set}_subset 512 \
$exp/nnet3${nnet3_affix}/pca_transform $exp/nnet3${nnet3_affix}/diag_ubm
fi

if [ $stage -le 9 ]; then
# Train the iVector extractor. Use all of the speed-perturbed data since iVector extractors
# can be sensitive to the amount of data. The script defaults to an iVector dimension of
# 100.
echo "$0: training the iVector extractor"
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
--num-threads 4 --num-processes 2 --ivector-dim $ivector_dim \
--online-cmvn-iextractor $online_cmvn_iextractor \
data/${train_set} $exp/nnet3${nnet3_affix}/diag_ubm \
$extractor || exit 1;
fi
sleep 2
fi

if [ $stage -le 10 ]; then
# We extract iVectors on the speed-perturbed training data after combining
# short segments, which will be what we train the system on. With
# --utts-per-spk-max 2, the script pairs the utterances into twos, and treats
# each of these pairs as one speaker; this gives more diversity in iVectors.
# Note that these are extracted 'online'.

# note, we don't encode the 'max2' in the name of the ivectordir even though
# that's the data we extract the ivectors from, as it's still going to be
# valid for the non-'max2' data, the utterance list is the same.

ivectordir=$exp/nnet3${nnet3_affix}/ivectors_${train_set}

if [ -f $ivectordir/ivector_online.scp ]; then
echo ""
echo "iVectors already there for $train_set. Skipping. Check compatibility yourself!"
echo ""
else
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
temp_data_root=${ivectordir}
utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
data/${train_set} ${temp_data_root}/${train_set}_max2

steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
${temp_data_root}/${train_set}_max2 \
$extractor $ivectordir
fi
sleep 2
# Also extract iVectors for the test data, but in this case we don't need the speed
# perturbation (sp).
echo "--- $0: $test_sets "
for data in $test_sets; do
odir=$exp/nnet3${nnet3_affix}/ivectors_${data}_hires
if [ -f $odir/ivector_online.scp ]; then
echo ""
echo "iVectors already there for $data. Skipping. Check compatibility yourself!"
echo ""
else
nspk=$(wc -l <data/${data}_hires/spk2utt)
if [ $nspk -gt $nj ]; then
nspk=$nj
fi
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nspk \
data/${data}_hires $extractor $odir
fi
done
sleep 2
fi

exit 0