forked from kaldi-asr/kaldi
Commit
[src,scripts,egs] Add support for Continual Learning using chain/LF-MMI models (kaldi-asr#4686)

* Add support for LWF and DenLWF continual learning using chain models
* Some minor changes based on comments
* Some more minor changes
* Some more minor changes

Co-authored-by: Hossein Hadian <hossein.hadian@behavox.com>
1 parent 0be6186 · commit e4940d0 · Showing 31 changed files with 2,031 additions and 30 deletions.
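For context: in Learning without Forgetting (LWF), the model is trained on the new-task data while a distillation term keeps its outputs close to those of the frozen previous model. A minimal sketch of such an objective for an LF-MMI-trained model, under the usual LWF formulation (the exact LWF and DenLWF objectives are the ones defined in the code this commit adds):

  \mathcal{L}(\theta) = \mathcal{L}_{\text{LF-MMI}}(\theta; \mathcal{D}_{\text{new}}) + \lambda \, \mathrm{KL}\big( p_{\theta_{\text{old}}}(\cdot \mid x) \,\big\|\, p_{\theta}(\cdot \mid x) \big)

where \theta_{\text{old}} is the frozen previous model, \mathcal{D}_{\text{new}} is the new-task data, and \lambda controls how strongly the old behavior is preserved.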
@@ -0,0 +1,4 @@
export train_cmd=run.pl
export decode_cmd=run.pl
export cuda_cmd=run.pl
export mkgraph_cmd=run.pl
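This cmd.sh runs all jobs locally with run.pl. On a grid one would typically point these at queue.pl with scheduler options instead; a minimal sketch (the memory/GPU values here are hypothetical and site-dependent):

  export train_cmd="queue.pl --mem 4G"
  export decode_cmd="queue.pl --mem 4G"
  export cuda_cmd="queue.pl --gpu 1"
  export mkgraph_cmd="queue.pl --mem 8G"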
@@ -0,0 +1,2 @@
--use-energy=false   # only non-default option.
--sample-frequency=8000  # Switchboard is sampled at 8kHz
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because the coefficients are less
# correlated), which is why we prefer this method.
--use-energy=false   # use average of log energy, not energy.
--sample-frequency=8000  # Switchboard is sampled at 8kHz
--num-mel-bins=40    # similar to Google's setup.
--num-ceps=40        # there is no dimensionality reduction.
--low-freq=40        # low cutoff frequency for mel bins
--high-freq=-200     # high cutoff frequency, relative to the Nyquist of 4000 (i.e. 3800)
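A config like this is normally passed to Kaldi's feature extraction script; a sketch of typical usage (directory names are hypothetical, following the defaults used later in this commit):

  steps/make_mfcc.sh --nj 10 --cmd run.pl \
    --mfcc-config conf/mfcc_hires.conf data/train_sp_hires
  steps/compute_cmvn_stats.sh data/train_sp_hires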
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
@@ -0,0 +1,139 @@
#!/bin/bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Copyright 2020  Hossein Hadian
# Apache 2.0

# This is similar to align_fmllr_lats.sh except it works with non-SAT
# systems, e.g. tri1 or tri2.

# Computes training alignments using a model with delta or
# LDA+MLLT features.

# If you supply the "--use-graphs true" option, it will use the training
# graphs from the source directory (where the model is). In this
# case the number of jobs must match with the source directory.

# Begin configuration section.
nj=4
cmd=run.pl
use_graphs=false
# Begin configuration.
#scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
acoustic_scale=0.1
beam=10
retry_beam=40
final_beam=20  # For the lattice-generation phase there is no retry-beam.  This
               # is a limitation of gmm-latgen-faster.  We just use an
               # intermediate beam.  We'll lose a little data and it will be
               # slightly slower.  (However, the min-active of 200 that
               # gmm-latgen-faster defaults to may help.)
careful=false
boost_silence=1.0 # Factor by which to boost silence during alignment.
stage=0
generate_ali_from_lats=false # If true, alignments are generated from the lattices.
max_active=7000
# End configuration options.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
  echo "usage: local/align_si_lats.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
  echo "e.g.:  local/align_si_lats.sh data/train data/lang exp/tri1 exp/tri1_ali"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --nj <nj>                                        # number of parallel jobs"
  echo "  --use-graphs true                                # use graphs in src-dir"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  exit 1;
fi

data=$1
lang=$2
srcdir=$3
dir=$4

for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

oov=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
cp $srcdir/delta_opts $dir 2>/dev/null

[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

cp $srcdir/{tree,final.mdl} $dir || exit 1;
cp $srcdir/final.occs $dir;

if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    cp $srcdir/final.mat $srcdir/full.mat $dir
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac
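
# As a quick sanity check of the assembled feature pipeline (a hypothetical
# example, not part of the original script), one could print the feature
# dimension for the first job with Kaldi's feat-to-dim:
#   feat-to-dim "${feats//JOB/1}" -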

if [ $stage -le 0 ]; then
  echo "$0: compiling training graphs"
  tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $scale_opts $dir/tree $dir/final.mdl $lang/L.fst "$tra" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir"

mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/final.mdl - |"

if [ $stage -le 2 ]; then
  # Warning: gmm-latgen-faster doesn't support a retry-beam so you may get more
  # alignment errors (however, it does have a default min-active=200 so this
  # will tend to reduce alignment errors).
  # --allow-partial=false makes sure we reach the end of the decoding graph.
  # --word-determinize=false makes sure we retain the alternative pronunciations of
  #   words (including alternatives regarding optional silences).
  # --lattice-beam=$final_beam keeps all the alternatives that were within the beam;
  #   it means we do no pruning of the lattice (lattices from a training transcription
  #   will be small anyway).
  echo "$0: generating lattices containing alternate pronunciations."
  $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \
    gmm-latgen-faster --max-active=$max_active --acoustic-scale=$acoustic_scale --beam=$final_beam \
      --lattice-beam=$final_beam --allow-partial=false --word-determinize=false \
      "$mdl" "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
      "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
fi
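
# To eyeball a few of the generated lattices (again a hypothetical check, not
# part of the original script), one could print job 1's output in text form:
#   lattice-copy "ark:gunzip -c $dir/lat.1.gz|" ark,t:- | head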

if [ $stage -le 3 ] && $generate_ali_from_lats; then
  # If generate_ali_from_lats is true, ali.*.gz is generated in the lats dir.
  $cmd JOB=1:$nj $dir/log/generate_alignments.JOB.log \
    lattice-best-path --acoustic-scale=$acoustic_scale "ark:gunzip -c $dir/lat.JOB.gz |" \
      ark:/dev/null "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi
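
# The resulting alignments can be inspected as phone-level start/end times (a
# hypothetical check, not part of the original script) with:
#   ali-to-phones --ctm-output $dir/final.mdl "ark:gunzip -c $dir/ali.1.gz|" - | head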

steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir

echo "$0: done aligning data."
@@ -0,0 +1,213 @@
#!/usr/bin/env bash

# Copyright 2021  Behavox (author: Hossein Hadian)
# Apache 2.0

# The only path modification this script does is to append _hires to test set
# directory names for extracting the ivectors.

set -euo pipefail

stage=0
lores_train_data_dir=data/train_sp
train_data_dir=data/train_sp_hires
gmm=exp/tri3b
ali_lats_dir=exp/tri3b_lats_train
lang=data/lang
lang_chain=data/lang
tree_dir=exp/chain/tree_sp
leaves=4500
test_sets="test_ldc test_sp_oc"
nj=10
tree_opts="--context-width=2 --central-position=1"
exp=exp

online_cmvn_iextractor=false
use_ivector=true
extractor=
ivector_dim=100
nnet3_affix=
echo "$0 $@"  # Print the command line for logging

. ./cmd.sh
. ./path.sh
. utils/parse_options.sh

if [ -z "$extractor" ]; then
  extractor=$exp/nnet3${nnet3_affix}/extractor
fi

train_set=$(basename $train_data_dir)
echo "$0: Highres train data dir: $train_data_dir"
echo "$0: Lowres train data dir: ${lores_train_data_dir}"

for f in ${lores_train_data_dir}/feats.scp $train_data_dir/feats.scp $gmm/final.mdl; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

if [[ $stage -le 5 ]] && [[ ! -z $gmm ]]; then
  if [[ -f $ali_lats_dir/lat.1.gz ]] && [[ $ali_lats_dir/lat.1.gz -nt $gmm/final.mdl ]]; then
    printf "\n$0: The lattices seem to be there and up to date w.r.t. the GMM model. Skipping.\n\n"
  else
    echo "$0: Generating alignments and lattices..."
    if [ -f $gmm/trans.1 ]; then  # It's an fMLLR (SAT) system.
      steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
        --generate-ali-from-lats true \
        $lores_train_data_dir \
        $lang $gmm $ali_lats_dir || exit 1;
    else
      local/align_si_lats.sh --nj $nj --cmd "$train_cmd" \
        --generate-ali-from-lats true \
        $lores_train_data_dir $lang $gmm $ali_lats_dir
    fi
    rm $ali_lats_dir/fsts.*.gz 2>/dev/null || true  # save space
  fi
  sleep 2
fi

if [ $stage -le 6 ]; then
  echo "$0: Creating lang directory $lang_chain with chain-type topology"
  # Create a version of the lang/ directory that has one state per phone in the
  # topo file. [Note: it really has two states... the first one is only repeated
  # once, the second one has zero or more repeats.]
  if [ -d $lang_chain ]; then
    if [ $lang_chain/L.fst -nt $lang/L.fst ]; then
      echo "$0: $lang_chain already exists, not overwriting it; continuing"
    else
      echo "$0: $lang_chain already exists and seems to be older than data/lang..."
    fi
  else
    cp -r $lang $lang_chain
    silphonelist=$(cat $lang_chain/phones/silence.csl) || exit 1;
    nonsilphonelist=$(cat $lang_chain/phones/nonsilence.csl) || exit 1;
    # Use our special topology... note that later on we may have to tune this
    # topology.
    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang_chain/topo
  fi
  sleep 2
fi

if [ $stage -le 7 ]; then
  echo "$0: Building the tree..."
  if [[ -f $tree_dir/final.mdl ]]; then
    printf "\n$0: $tree_dir/final.mdl already exists. Skipping.\n\n"
  elif [ -z "$tree_dir" ]; then
    printf "\n$0: Tree dir is empty. Skipping tree stage.\n\n"
  else
    steps/nnet3/chain/build_tree.sh \
      --frame-subsampling-factor 3 \
      --context-opts "$tree_opts" \
      --cmd "$train_cmd" $leaves ${lores_train_data_dir} \
      $lang_chain $ali_lats_dir $tree_dir
  fi
  sleep 2
fi
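
# Once built, the tree can be checked (a hypothetical sanity check, not part
# of the original script) with Kaldi's tree-info, which prints, among other
# things, the number of pdfs (leaves):
#   tree-info $tree_dir/tree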

if ! $use_ivector; then
  echo "$0: ## Not doing ivectors ##"
  sleep 2
  exit 0;
fi

if [[ -f $extractor/final.ie ]] && [[ $stage -le 9 ]]; then
  echo ""
  echo "$0: There is already an ivector extractor trained. Skipping..."
  echo ""
else
  if [ $stage -le 8 ]; then
    echo "$0: computing a subset of data to train the diagonal UBM."
    # We'll use about a quarter of the data.
    mkdir -p $exp/nnet3${nnet3_affix}/diag_ubm
    temp_data_root=$exp/nnet3${nnet3_affix}/diag_ubm

    num_utts_total=$(wc -l <data/${train_set}/utt2spk)
    num_utts=$[$num_utts_total/4]
    utils/data/subset_data_dir.sh data/${train_set} \
      $num_utts ${temp_data_root}/${train_set}_subset

    echo "$0: computing a PCA transform from the hires data."
    steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
      --splice-opts "--left-context=3 --right-context=3" \
      --max-utts 10000 --subsample 2 \
      ${temp_data_root}/${train_set}_subset \
      $exp/nnet3${nnet3_affix}/pca_transform

    echo "$0: training the diagonal UBM."
    # Use 512 Gaussians in the UBM.
    steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj $[$nj/2] \
      --num-frames 700000 \
      --num-threads 8 \
      ${temp_data_root}/${train_set}_subset 512 \
      $exp/nnet3${nnet3_affix}/pca_transform $exp/nnet3${nnet3_affix}/diag_ubm
  fi

  if [ $stage -le 9 ]; then
    # Train the iVector extractor. Use all of the speed-perturbed data since
    # iVector extractors can be sensitive to the amount of data. The script
    # defaults to an iVector dimension of 100.
    echo "$0: training the iVector extractor"
    steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
      --num-threads 4 --num-processes 2 --ivector-dim $ivector_dim \
      --online-cmvn-iextractor $online_cmvn_iextractor \
      data/${train_set} $exp/nnet3${nnet3_affix}/diag_ubm \
      $extractor || exit 1;
  fi
  sleep 2
fi

if [ $stage -le 10 ]; then
  # We extract iVectors on the speed-perturbed training data after combining
  # short segments, which will be what we train the system on. With
  # --utts-per-spk-max 2, the script pairs the utterances into twos, and treats
  # each of these pairs as one speaker; this gives more diversity in iVectors.
  # Note that these are extracted 'online'.

  # Note: we don't encode the 'max2' in the name of the ivectordir even though
  # that's the data we extract the ivectors from, as it's still going to be
  # valid for the non-'max2' data; the utterance list is the same.

  ivectordir=$exp/nnet3${nnet3_affix}/ivectors_${train_set}

  if [ -f $ivectordir/ivector_online.scp ]; then
    echo ""
    echo "iVectors already there for $train_set. Skipping. Check compatibility yourself!"
    echo ""
  else
    # Having a larger number of speakers is helpful for generalization, and to
    # handle per-utterance decoding well (the iVector starts at zero).
    temp_data_root=${ivectordir}
    utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
      data/${train_set} ${temp_data_root}/${train_set}_max2

    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
      ${temp_data_root}/${train_set}_max2 \
      $extractor $ivectordir
  fi
  sleep 2
  # Also extract iVectors for the test data, but in this case we don't need the
  # speed perturbation (sp).
  echo "--- $0: $test_sets"
  for data in $test_sets; do
    odir=$exp/nnet3${nnet3_affix}/ivectors_${data}_hires
    if [ -f $odir/ivector_online.scp ]; then
      echo ""
      echo "iVectors already there for $data. Skipping. Check compatibility yourself!"
      echo ""
    else
      nspk=$(wc -l <data/${data}_hires/spk2utt)
      if [ $nspk -gt $nj ]; then
        nspk=$nj
      fi
      steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nspk \
        data/${data}_hires $extractor $odir
    fi
  done
  sleep 2
fi
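
# Downstream, these iVectors would typically be consumed by the chain training
# script via its online-ivector option (a hypothetical example of later usage,
# not part of this script):
#   steps/nnet3/chain/train.py --feat.online-ivector-dir \
#     $exp/nnet3${nnet3_affix}/ivectors_${train_set} ...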

exit 0
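For reference, this preparation script would typically be invoked along these lines (the script name and all paths here are hypothetical, following the defaults above):

  local/chain/prepare_chain.sh --stage 0 --nj 10 \
    --lores-train-data-dir data/train_sp --train-data-dir data/train_sp_hires \
    --gmm exp/tri3b --tree-dir exp/chain/tree_sp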