Merge pull request k2-fsa#11 from pzelasko/master

Add non-augmented data in training
jimbozhang · Nov 15, 2020 · 655456d · 655456d
2 parents 7201fde + 6a5bce8
commit 655456d
Showing 1 changed file with 13 additions and 8 deletions.
diff --git a/egs/librispeech/asr/simple_v1/prepare.py b/egs/librispeech/asr/simple_v1/prepare.py
@@ -6,13 +6,10 @@
 import multiprocessing
 import os
 from concurrent.futures import ProcessPoolExecutor
-from pathlib import Path
 
 import torch
-
-from lhotse import CutSet, Fbank, Mfcc, LilcomFilesWriter, WavAugmenter
-from lhotse.dataset import SpeechRecognitionDataset
-from lhotse.recipes.librispeech import download_and_untar, prepare_librispeech, dataset_parts_full
+from lhotse import CutSet, Fbank, LilcomFilesWriter, WavAugmenter
+from lhotse.recipes.librispeech import dataset_parts_full, prepare_librispeech
 
 print("All dataset parts: ", dataset_parts_full)
 
@@ -41,13 +38,21 @@
 for partition, manifests in librispeech_manifests.items():
     print(partition)
     with LilcomFilesWriter(f'{output_dir}/feats_{partition}') as storage, \
-        ProcessPoolExecutor(num_jobs, mp_context=multiprocessing.get_context("spawn")) as ex:
+            ProcessPoolExecutor(num_jobs, mp_context=multiprocessing.get_context("spawn")) as ex:
         cut_set = CutSet.from_manifests(
             recordings=manifests['recordings'],
-            supervisions=manifests['supervisions']).compute_and_store_features(
+            supervisions=manifests['supervisions'])
+        cut_set = cut_set.compute_and_store_features(
+            extractor=Fbank(),
+            storage=storage,
+            executor=ex)
+        if 'train' in partition:
+            # Duplicate the training set with an augmented version
+            augmented_cs = cut_set.compute_and_store_features(
                 extractor=Fbank(),
                 storage=storage,
-                augment_fn=augmenter if 'train' in partition else None,
+                augment_fn=augmenter,
                 executor=ex)
+            cut_set = cut_set + CutSet.from_cuts(c.with_id(c.id + '_aug') for c in augmented_cs)
     librispeech_manifests[partition]['cuts'] = cut_set
     cut_set.to_json(output_dir + f'/cuts_{partition}.json.gz')