Skip to content

Commit

Permalink
Support for importing Kaldi's feats.scp and reading features directly…
Browse files Browse the repository at this point in the history
… from scp/ark (#318)

* Support for importing Kaldi's feats.scp and reading features directly from scp/ark

* Remove feature type specification
  • Loading branch information
pzelasko authored Jun 9, 2021
1 parent a837922 commit ef7a037
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 39 deletions.
10 changes: 7 additions & 3 deletions docs/kaldi.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,16 @@ Data import/export
******************

We support importing Kaldi data directories that contain at least the ``wav.scp`` file,
required to create the :class:`~lhotse.audio.RecordingSet`. Other files, such as ``segments``, ``utt2spk``, etc.
are used to create the :class:`~lhotse.supervision.SupervisionSet`.
required to create the :class:`~lhotse.audio.RecordingSet`.
Other files, such as ``segments``, ``utt2spk``, etc. are used to create the :class:`~lhotse.supervision.SupervisionSet`.
We also support converting ``feats.scp`` to :class:`~lhotse.features.base.FeatureSet`, and reading features
directly from Kaldi's scp/ark files via the `kaldiio`_ library (which is an optional dependency of Lhotse).

We also allow exporting a pair of :class:`~lhotse.audio.RecordingSet` and :class:`~lhotse.supervision.SupervisionSet`
to a Kaldi data directory.

We currently do not support the following (but may start doing so in the future):

* Importing Kaldi's extracted features (``feats.scp`` is ignored)
* Exporting Lhotse extracted features to Kaldi's ``feats.scp``
* Exporting Lhotse's multi-channel recording sets to Kaldi

Expand Down Expand Up @@ -52,3 +53,6 @@ to a directory with Lhotse manifests called ``train_manifests``:
train_manifests/recordings.json \
train_manifests/supervisions.json \
data/train
.. _kaldiio: https://pypi.org/project/kaldiio/
18 changes: 13 additions & 5 deletions lhotse/bin/modes/kaldi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import click

from lhotse import load_manifest
from lhotse import Seconds, load_manifest
from lhotse.bin.modes.cli_base import cli
from lhotse.kaldi import export_to_kaldi, load_kaldi_data_dir
from lhotse.utils import Pathlike
Expand All @@ -18,17 +18,25 @@ def kaldi():
@click.argument('data_dir', type=click.Path(exists=True, file_okay=False))
@click.argument('sampling_rate', type=int)
@click.argument('manifest_dir', type=click.Path())
def import_(data_dir: Pathlike, sampling_rate: int, manifest_dir: Pathlike):
@click.option('-f', '--frame-shift', type=float,
help='Frame shift (in seconds) is required to support reading feats.scp.')
def import_(data_dir: Pathlike, sampling_rate: int, manifest_dir: Pathlike, frame_shift: Seconds):
"""
Convert a Kaldi data dir DATA_DIR into a directory MANIFEST_DIR of lhotse manifests. Ignores feats.scp.
The SAMPLING_RATE has to be explicitly specified as it is not available to read from DATA_DIR.
"""
recording_set, maybe_supervision_set = load_kaldi_data_dir(path=data_dir, sampling_rate=sampling_rate)
recording_set, maybe_supervision_set, maybe_feature_set = load_kaldi_data_dir(
path=data_dir,
sampling_rate=sampling_rate,
frame_shift=frame_shift
)
manifest_dir = Path(manifest_dir)
manifest_dir.mkdir(parents=True, exist_ok=True)
recording_set.to_file(manifest_dir / 'audio.jsonl')
recording_set.to_file(manifest_dir / 'recordings.jsonl.gz')
if maybe_supervision_set is not None:
maybe_supervision_set.to_file(manifest_dir / 'supervision.jsonl')
maybe_supervision_set.to_file(manifest_dir / 'supervisions.jsonl.gz')
if maybe_feature_set is not None:
maybe_feature_set.to_file(manifest_dir / 'features.jsonl.gz')


@kaldi.command()
Expand Down
1 change: 1 addition & 0 deletions lhotse/features/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from .io import (
FeaturesReader,
FeaturesWriter,
KaldiReader,
LilcomFilesReader,
LilcomFilesWriter,
LilcomHdf5Reader,
Expand Down
42 changes: 41 additions & 1 deletion lhotse/features/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import lilcom
import numpy as np

from lhotse.utils import Pathlike
from lhotse.utils import Pathlike, is_module_available


class FeaturesWriter(metaclass=ABCMeta):
Expand Down Expand Up @@ -553,3 +553,43 @@ def write(self, key: str, value: np.ndarray) -> str:
with smart_open.open(output_features_url, 'wb', transport_params=self.transport_params) as f:
f.write(serialized_feats)
return key


"""
Kaldi-compatible feature reader
"""


@register_reader
class KaldiReader(FeaturesReader):
    """
    Feature reader backed by a Kaldi ``feats.scp`` index, loaded through ``kaldiio``.

    - ``storage_path`` is the path to the ``feats.scp`` file.
    - ``storage_key`` is the Kaldi utterance-id used to look up an entry.

    .. caution::
        Requires ``kaldiio`` to be installed (``pip install kaldiio``).
    """
    name = 'kaldiio'

    def __init__(self, storage_path: Pathlike, *args, **kwargs):
        """
        Open the scp index found at ``storage_path``.

        Any additional positional or keyword arguments are accepted but not used.

        :raises ValueError: when the ``kaldiio`` module is not available.
        """
        kaldiio_installed = is_module_available('kaldiio')
        if not kaldiio_installed:
            raise ValueError("To read Kaldi feats.scp, please 'pip install kaldiio' first.")
        # Imported only after the availability check, so that a missing
        # optional dependency surfaces as the ValueError above.
        import kaldiio
        super().__init__()
        self.storage_path = storage_path
        scp_path = str(self.storage_path)
        self.storage = kaldiio.load_scp(scp_path)

    def read(
            self,
            key: str,
            left_offset_frames: int = 0,
            right_offset_frames: Optional[int] = None
    ) -> np.ndarray:
        """
        Look up ``key`` in the loaded scp index and return the entry sliced
        to ``[left_offset_frames:right_offset_frames]``.

        With the default arguments the whole entry is returned.
        """
        frame_range = slice(left_offset_frames, right_offset_frames)
        matrix = self.storage[key]
        return matrix[frame_range]
90 changes: 60 additions & 30 deletions lhotse/kaldi.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
import warnings
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

from lhotse import CutSet
from lhotse import CutSet, FeatureSet, Features, Seconds
from lhotse.audio import AudioSource, Recording, RecordingSet
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike
from lhotse.utils import Pathlike, is_module_available


def load_kaldi_data_dir(path: Pathlike, sampling_rate: int) -> Tuple[RecordingSet, Optional[SupervisionSet]]:
def load_kaldi_data_dir(
path: Pathlike,
sampling_rate: int,
frame_shift: Optional[Seconds] = None,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
"""
Load a Kaldi data directory and convert it to a Lhotse RecordingSet and SupervisionSet manifests.
For this to work, at least the wav.scp file must exist.
Expand All @@ -31,7 +36,7 @@ def load_kaldi_data_dir(path: Pathlike, sampling_rate: int) -> Tuple[RecordingSe
recording_id, dur = line.strip().split()
durations[recording_id] = float(dur)

audio_set = RecordingSet.from_recordings(
recording_set = RecordingSet.from_recordings(
Recording(
id=recording_id,
sources=[
Expand All @@ -48,35 +53,60 @@ def load_kaldi_data_dir(path: Pathlike, sampling_rate: int) -> Tuple[RecordingSe
for recording_id, path_or_cmd in recordings.items()
)

# must exist for SupervisionSet
supervision_set = None
segments = path / 'segments'
if not segments.is_file():
return audio_set, None

with segments.open() as f:
supervision_segments = [l.strip().split() for l in f]

texts = load_kaldi_text_mapping(path / 'text')
speakers = load_kaldi_text_mapping(path / 'utt2spk')
genders = load_kaldi_text_mapping(path / 'spk2gender')
languages = load_kaldi_text_mapping(path / 'utt2lang')

supervision_set = SupervisionSet.from_segments(
SupervisionSegment(
id=segment_id,
recording_id=recording_id,
start=float(start),
duration=float(end) - float(start),
channel=0,
text=texts[segment_id],
language=languages[segment_id],
speaker=speakers[segment_id],
gender=genders[speakers[segment_id]]
if segments.is_file():
with segments.open() as f:
supervision_segments = [l.strip().split() for l in f]

texts = load_kaldi_text_mapping(path / 'text')
speakers = load_kaldi_text_mapping(path / 'utt2spk')
genders = load_kaldi_text_mapping(path / 'spk2gender')
languages = load_kaldi_text_mapping(path / 'utt2lang')

supervision_set = SupervisionSet.from_segments(
SupervisionSegment(
id=segment_id,
recording_id=recording_id,
start=float(start),
duration=float(end) - float(start),
channel=0,
text=texts[segment_id],
language=languages[segment_id],
speaker=speakers[segment_id],
gender=genders[speakers[segment_id]]
)
for segment_id, recording_id, start, end in supervision_segments
)
for segment_id, recording_id, start, end in supervision_segments
)

return audio_set, supervision_set
feature_set = None
feats_scp = path / 'feats.scp'
if feats_scp.exists() and is_module_available('kaldiio'):
if frame_shift is not None:
import kaldiio
from lhotse.features.io import KaldiReader
feature_set = FeatureSet.from_features(
Features(
type='kaldiio',
num_frames=mat.shape[0],
num_features=mat.shape[1],
frame_shift=frame_shift,
sampling_rate=sampling_rate,
start=0,
duration=mat.shape[0] * frame_shift,
storage_type=KaldiReader.name,
storage_path=str(feats_scp),
storage_key=utt_id,
recording_id=supervision_set[utt_id].recording_id if supervision_set is not None else utt_id,
channels=0
) for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp))
)
else:
warnings.warn(f"Failed to import Kaldi 'feats.scp' to Lhotse: "
f"frame_shift must be not None. "
f"Feature import omitted.")

return recording_set, supervision_set, feature_set


def export_to_kaldi(recordings: RecordingSet, supervisions: SupervisionSet, output_dir: Pathlike):
Expand Down

0 comments on commit ef7a037

Please sign in to comment.