Merge pull request #7 from qindazhu/haowen-fix

Fix some bugs/issues in the training scripts
k2-fsa · Nov 13, 2020 · 27931f1 · 27931f1
2 parents 811e533 + 691f679
commit 27931f1
Show file tree

Hide file tree

Showing 7 changed files with 88 additions and 601 deletions.
diff --git a/egs/librispeech/asr/simple_v1/README.md b/egs/librispeech/asr/simple_v1/README.md
@@ -1,132 +1,9 @@
 
+### k2_librispeech
 
-## k2_librispeech
-An example of how to build G and L FST for K2.
-
-Most scripts of this example are copied from Kaldi.
+An example of how to use k2 and lhotse to train a CTC acoustic model.
 
 ### Run scripts
-```bash
-$ ./run.sh
-
-$ ls data/lang_nosp
-
-G.fsa.txt
-L.fst.txt
-L_disambig.fst.txt
-oov.int
-oov.txt
-phones
-phones.txt
-words.txt
-```
-
-### Load L, G into K2
-```python
-import k2, _k2
-
-
-with open('data/lang_nosp/L.fst.txt') as f:
-    s = f.read()
-
-Lfst = k2.Fsa.from_openfst(s, acceptor=False)
-
-with open('data/lang_nosp/G.fsa.txt') as f:
-    s = f.read()
 
-Gfsa = k2.Fsa.from_openfst(s, acceptor=True)
-```
-
-### An example of G building
-The `toy.arpa` file:
-```plain
-\data\
-ngram 1=5
-ngram 2=6
-ngram 3=1
-
-\1-grams:
--2.348754 </s>
--99 <s> -1.070027
--4.214113 A -0.5964623
--4.255245 B -0.3214741
--4.20255  C -0.2937318
-
-\2-grams:
--4.284099 <s> A -0.1969815
--1.100091 A </s>
--2.839235 A B -0.1747991
--2.838903 A C -0.5100551
--1.104238 B </s>
--1.251644 C </s>
-
-\3-grams:
--0.1605104  A C B
-
-\end\
-```
-
-Build G fst:
-```bash
-$ local/arpa2fst.py toy.arpa
-
-0 1 </s> -5.408205947510138
-2 0 <eps> -1.070027
-0 3 A -9.703353773992418
-3 0 <eps> -0.5964623
-0 4 B -9.79806370403745
-4 0 <eps> -0.3214741
-0 5 C -9.676728982562127
-5 0 <eps> -0.2937318
-2 6 A -9.864502494310699
-6 3 <eps> -0.1969815
-3 1 </s> -2.5330531375369127
-3 7 B -6.53758018650695
-7 4 <eps> -0.1747991
-3 8 C -6.536815728256077
-8 5 <eps> -0.5100551
-4 1 </s> -2.5426019579175594
-5 1 </s> -2.8820168161354394
-8 9 B -0.36958885431051147
-1
-```
+$ ./run.sh
 
-Draw it by Graphviz:
-```
-digraph FST {
-rankdir = LR;
-size = "8.5,11";
-label = "";
-center = 1;
-ranksep = "0.4";
-nodesep = "0.25";
-0 [label = "0", shape = circle, style = bold, fontsize = 14]
-	0 -> 1 [label = "</s>/-5.4082", fontsize = 14];
-	0 -> 3 [label = "A/-9.7034", fontsize = 14];
-	0 -> 4 [label = "B/-9.7981", fontsize = 14];
-	0 -> 5 [label = "C/-9.6767", fontsize = 14];
-1 [label = "1", shape = doublecircle, style = solid, fontsize = 14]
-2 [label = "2", shape = circle, style = solid, fontsize = 14]
-	2 -> 0 [label = "<eps>/-1.07", fontsize = 14];
-	2 -> 6 [label = "A/-9.8645", fontsize = 14];
-3 [label = "3", shape = circle, style = solid, fontsize = 14]
-	3 -> 0 [label = "<eps>/-0.59646", fontsize = 14];
-	3 -> 1 [label = "</s>/-2.5331", fontsize = 14];
-	3 -> 7 [label = "B/-6.5376", fontsize = 14];
-	3 -> 8 [label = "C/-6.5368", fontsize = 14];
-4 [label = "4", shape = circle, style = solid, fontsize = 14]
-	4 -> 0 [label = "<eps>/-0.32147", fontsize = 14];
-	4 -> 1 [label = "</s>/-2.5426", fontsize = 14];
-5 [label = "5", shape = circle, style = solid, fontsize = 14]
-	5 -> 0 [label = "<eps>/-0.29373", fontsize = 14];
-	5 -> 1 [label = "</s>/-2.882", fontsize = 14];
-6 [label = "6", shape = circle, style = solid, fontsize = 14]
-	6 -> 3 [label = "<eps>/-0.19698", fontsize = 14];
-7 [label = "7", shape = circle, style = solid, fontsize = 14]
-	7 -> 4 [label = "<eps>/-0.1748", fontsize = 14];
-8 [label = "8", shape = circle, style = solid, fontsize = 14]
-	8 -> 5 [label = "<eps>/-0.51006", fontsize = 14];
-	8 -> 9 [label = "B/-0.36959", fontsize = 14];
-9 [label = "9", shape = circle, style = solid, fontsize = 14]
-}
-```
diff --git a/egs/librispeech/asr/simple_v1/model.py b/egs/librispeech/asr/simple_v1/model.py
@@ -1,6 +1,9 @@
 from torch import Tensor
 from torch import nn
 
+# Copyright (c)  2020  Xiaomi Corporation (authors: Daniel Povey, Haowen Qiu)
+# Apache 2.0
+
 
 class Model(nn.Module):
     """

diff --git a/egs/librispeech/asr/simple_v1/prepare.py b/egs/librispeech/asr/simple_v1/prepare.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 
+# Copyright (c)  2020  Xiaomi Corporation (authors: Junbo Zhang, Haowen Qiu)
+# Apache 2.0
+
 import os
 from concurrent.futures import ProcessPoolExecutor
 from pathlib import Path
@@ -17,7 +20,7 @@
 print("Parts we will prepare: ", dataset_parts)
 
 corpus_dir = '/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech'
-output_dir = 'exp/data1'
+output_dir = 'exp/data'
 librispeech_manifests = prepare_librispeech(corpus_dir, dataset_parts,
                                             output_dir)
 
@@ -34,7 +37,6 @@
     torch.set_num_threads(1)
     torch.set_num_interop_threads(1)
 
-num_jobs = 1
 for partition, manifests in librispeech_manifests.items():
     print(partition)
     with LilcomFilesWriter(f'{output_dir}/feats_{partition}'
@@ -44,17 +46,7 @@
             supervisions=manifests['supervisions']).compute_and_store_features(
                 extractor=Fbank(),
                 storage=storage,
-                augmenter=augmenter if 'train' in partition else None,
+                augment_fn=augmenter if 'train' in partition else None,
                 executor=ex)
     librispeech_manifests[partition]['cuts'] = cut_set
     cut_set.to_json(output_dir + f'/cuts_{partition}.json.gz')
-
-cuts_train = SpeechRecognitionDataset(
-    librispeech_manifests['train-clean-100']['cuts'])
-cuts_test = SpeechRecognitionDataset(
-    librispeech_manifests['test-clean']['cuts'])
-
-sample = cuts_train[0]
-print('Transcript:', sample['text'])
-print('Supervisions mask:', sample['supervisions_mask'])
-print('Feature matrix:', sample.load_features())
diff --git a/egs/librispeech/asr/simple_v1/run.sh b/egs/librispeech/asr/simple_v1/run.sh
@@ -38,5 +38,5 @@ if [ $stage -le 5 ]; then
 fi
 
 if [ $stage -le 6 ]; then
-  python3 ./train_fast.py
+  python3 ./train.py
 fi