Support for importing Kaldi's feats.scp and reading features directly…

… from scp/ark (#318) * Support for importing Kaldi's feats.scp and reading features directly from scp/ark * Remove feature type specification
lhotse-speech · Jun 9, 2021 · ef7a037 · ef7a037
1 parent a837922
commit ef7a037
Show file tree

Hide file tree

Showing 5 changed files with 122 additions and 39 deletions.
diff --git a/docs/kaldi.rst b/docs/kaldi.rst
@@ -5,15 +5,16 @@ Data import/export
 ******************
 
 We support importing Kaldi data directories that contain at least the ``wav.scp`` file,
-required to create the :class:`~lhotse.audio.RecordingSet`. Other files, such as ``segments``, ``utt2spk``, etc.
-are used to create the :class:`~lhotse.supervision.SupervisionSet`.
+required to create the :class:`~lhotse.audio.RecordingSet`.
+Other files, such as ``segments``, ``utt2spk``, etc. are used to create the :class:`~lhotse.supervision.SupervisionSet`.
+We also support converting ``feats.scp`` to :class:`~lhotse.features.base.FeatureSet`, and reading features
+directly from Kaldi's scp/ark files via the `kaldiio`_ library (which is an optional dependency of Lhotse).
 
 We also allow to export a pair of :class:`~lhotse.audio.RecordingSet` and :class:`~lhotse.supervision.SupervisionSet`
 to a Kaldi data directory.
 
 We currently do not support the following (but may start doing so in the future):
 
-* Importing Kaldi's extracted features (``feats.scp`` is ignored)
 * Exporting Lhotse extracted features to Kaldi's ``feats.scp``
 * Export Lhotse's multi-channel recording sets to Kaldi
 
@@ -52,3 +53,6 @@ to a directory with Lhotse manifests called ``train_manifests``:
         train_manifests/recordings.json \
         train_manifests/supervisions.json \
         data/train
+
+
+.. _kaldiio: https://pypi.org/project/kaldiio/
diff --git a/lhotse/bin/modes/kaldi.py b/lhotse/bin/modes/kaldi.py
@@ -2,7 +2,7 @@
 
 import click
 
-from lhotse import load_manifest
+from lhotse import Seconds, load_manifest
 from lhotse.bin.modes.cli_base import cli
 from lhotse.kaldi import export_to_kaldi, load_kaldi_data_dir
 from lhotse.utils import Pathlike
@@ -18,17 +18,25 @@ def kaldi():
 @click.argument('data_dir', type=click.Path(exists=True, file_okay=False))
 @click.argument('sampling_rate', type=int)
 @click.argument('manifest_dir', type=click.Path())
-def import_(data_dir: Pathlike, sampling_rate: int, manifest_dir: Pathlike):
+@click.option('-f', '--frame-shift', type=float,
+              help='Frame shift (in seconds) is required to support reading feats.scp.')
+def import_(data_dir: Pathlike, sampling_rate: int, manifest_dir: Pathlike, frame_shift: Seconds):
     """
     Convert a Kaldi data dir DATA_DIR into a directory MANIFEST_DIR of lhotse manifests. Ignores feats.scp.
     The SAMPLING_RATE has to be explicitly specified as it is not available to read from DATA_DIR.
     """
-    recording_set, maybe_supervision_set = load_kaldi_data_dir(path=data_dir, sampling_rate=sampling_rate)
+    recording_set, maybe_supervision_set, maybe_feature_set = load_kaldi_data_dir(
+        path=data_dir,
+        sampling_rate=sampling_rate,
+        frame_shift=frame_shift
+    )
     manifest_dir = Path(manifest_dir)
     manifest_dir.mkdir(parents=True, exist_ok=True)
-    recording_set.to_file(manifest_dir / 'audio.jsonl')
+    recording_set.to_file(manifest_dir / 'recordings.jsonl.gz')
     if maybe_supervision_set is not None:
-        maybe_supervision_set.to_file(manifest_dir / 'supervision.jsonl')
+        maybe_supervision_set.to_file(manifest_dir / 'supervisions.jsonl.gz')
+    if maybe_feature_set is not None:
+        maybe_feature_set.to_file(manifest_dir / 'features.jsonl.gz')
 
 
 @kaldi.command()

diff --git a/lhotse/features/__init__.py b/lhotse/features/__init__.py
@@ -12,6 +12,7 @@
 from .io import (
     FeaturesReader,
     FeaturesWriter,
+    KaldiReader,
     LilcomFilesReader,
     LilcomFilesWriter,
     LilcomHdf5Reader,

diff --git a/lhotse/features/io.py b/lhotse/features/io.py
@@ -6,7 +6,7 @@
 import lilcom
 import numpy as np
 
-from lhotse.utils import Pathlike
+from lhotse.utils import Pathlike, is_module_available
 
 
 class FeaturesWriter(metaclass=ABCMeta):
@@ -553,3 +553,43 @@ def write(self, key: str, value: np.ndarray) -> str:
         with smart_open.open(output_features_url, 'wb', transport_params=self.transport_params) as f:
             f.write(serialized_feats)
         return key
+
+
+"""
+Kaldi-compatible feature reader
+"""
+
+
+@register_reader
+class KaldiReader(FeaturesReader):
+    """
+    Reads Kaldi's "feats.scp" file using kaldiio.
+    ``storage_path`` corresponds to the path to ``feats.scp``.
+    ``storage_key`` corresponds to the utterance-id in Kaldi.
+
+    .. caution::
+        Requires ``kaldiio`` to be installed (``pip install kaldiio``).
+    """
+    name = 'kaldiio'
+
+    def __init__(
+            self,
+            storage_path: Pathlike,
+            *args,
+            **kwargs
+    ):
+        if not is_module_available('kaldiio'):
+            raise ValueError("To read Kaldi feats.scp, please 'pip install kaldiio' first.")
+        import kaldiio
+        super().__init__()
+        self.storage_path = storage_path
+        self.storage = kaldiio.load_scp(str(self.storage_path))
+
+    def read(
+            self,
+            key: str,
+            left_offset_frames: int = 0,
+            right_offset_frames: Optional[int] = None
+    ) -> np.ndarray:
+        arr = self.storage[key]
+        return arr[left_offset_frames: right_offset_frames]
diff --git a/lhotse/kaldi.py b/lhotse/kaldi.py
@@ -1,14 +1,19 @@
+import warnings
 from collections import defaultdict
 from pathlib import Path
 from typing import Any, Dict, Optional, Tuple
 
-from lhotse import CutSet
+from lhotse import CutSet, FeatureSet, Features, Seconds
 from lhotse.audio import AudioSource, Recording, RecordingSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
-from lhotse.utils import Pathlike
+from lhotse.utils import Pathlike, is_module_available
 
 
-def load_kaldi_data_dir(path: Pathlike, sampling_rate: int) -> Tuple[RecordingSet, Optional[SupervisionSet]]:
+def load_kaldi_data_dir(
+        path: Pathlike,
+        sampling_rate: int,
+        frame_shift: Optional[Seconds] = None,
+) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
     """
     Load a Kaldi data directory and convert it to a Lhotse RecordingSet and SupervisionSet manifests.
     For this to work, at least the wav.scp file must exist.
@@ -31,7 +36,7 @@ def load_kaldi_data_dir(path: Pathlike, sampling_rate: int) -> Tuple[RecordingSe
             recording_id, dur = line.strip().split()
             durations[recording_id] = float(dur)
 
-    audio_set = RecordingSet.from_recordings(
+    recording_set = RecordingSet.from_recordings(
         Recording(
             id=recording_id,
             sources=[
@@ -48,35 +53,60 @@ def load_kaldi_data_dir(path: Pathlike, sampling_rate: int) -> Tuple[RecordingSe
         for recording_id, path_or_cmd in recordings.items()
     )
 
-    # must exist for SupervisionSet
+    supervision_set = None
     segments = path / 'segments'
-    if not segments.is_file():
-        return audio_set, None
-
-    with segments.open() as f:
-        supervision_segments = [l.strip().split() for l in f]
-
-    texts = load_kaldi_text_mapping(path / 'text')
-    speakers = load_kaldi_text_mapping(path / 'utt2spk')
-    genders = load_kaldi_text_mapping(path / 'spk2gender')
-    languages = load_kaldi_text_mapping(path / 'utt2lang')
-
-    supervision_set = SupervisionSet.from_segments(
-        SupervisionSegment(
-            id=segment_id,
-            recording_id=recording_id,
-            start=float(start),
-            duration=float(end) - float(start),
-            channel=0,
-            text=texts[segment_id],
-            language=languages[segment_id],
-            speaker=speakers[segment_id],
-            gender=genders[speakers[segment_id]]
+    if segments.is_file():
+        with segments.open() as f:
+            supervision_segments = [l.strip().split() for l in f]
+
+        texts = load_kaldi_text_mapping(path / 'text')
+        speakers = load_kaldi_text_mapping(path / 'utt2spk')
+        genders = load_kaldi_text_mapping(path / 'spk2gender')
+        languages = load_kaldi_text_mapping(path / 'utt2lang')
+
+        supervision_set = SupervisionSet.from_segments(
+            SupervisionSegment(
+                id=segment_id,
+                recording_id=recording_id,
+                start=float(start),
+                duration=float(end) - float(start),
+                channel=0,
+                text=texts[segment_id],
+                language=languages[segment_id],
+                speaker=speakers[segment_id],
+                gender=genders[speakers[segment_id]]
+            )
+            for segment_id, recording_id, start, end in supervision_segments
         )
-        for segment_id, recording_id, start, end in supervision_segments
-    )
 
-    return audio_set, supervision_set
+    feature_set = None
+    feats_scp = path / 'feats.scp'
+    if feats_scp.exists() and is_module_available('kaldiio'):
+        if frame_shift is not None:
+            import kaldiio
+            from lhotse.features.io import KaldiReader
+            feature_set = FeatureSet.from_features(
+                Features(
+                    type='kaldiio',
+                    num_frames=mat.shape[0],
+                    num_features=mat.shape[1],
+                    frame_shift=frame_shift,
+                    sampling_rate=sampling_rate,
+                    start=0,
+                    duration=mat.shape[0] * frame_shift,
+                    storage_type=KaldiReader.name,
+                    storage_path=str(feats_scp),
+                    storage_key=utt_id,
+                    recording_id=supervision_set[utt_id].recording_id if supervision_set is not None else utt_id,
+                    channels=0
+                ) for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp))
+            )
+        else:
+            warnings.warn(f"Failed to import Kaldi 'feats.scp' to Lhotse: "
+                          f"frame_shift must be not None. "
+                          f"Feature import omitted.")
+
+    return recording_set, supervision_set, feature_set
 
 
 def export_to_kaldi(recordings: RecordingSet, supervisions: SupervisionSet, output_dir: Pathlike):