Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refine librispeech.py for DeepSpeech2. #78

Merged
merged 2 commits into from
Jun 9, 2017
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions deep_speech_2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ For some machines, we also need to install libsndfile1. Details to be added.
```
cd data
python librispeech.py
cat manifest.libri.train-* > manifest.libri.train-all
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we introduce the meaning of the manifest.libri.train-* files here?
I see that the detailed introduction is in the following section, but the manifest file feels abrupt at this point.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

cd ..
```

Expand All @@ -32,13 +33,13 @@ python librispeech.py --help
For GPU Training:

```
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 --train_manifest_path ./data/manifest.libri.train-all
```

For CPU Training:

```
python train.py --trainer_count 8 --use_gpu False
python train.py --trainer_count 8 --use_gpu False --train_manifest_path ./data/manifest.libri.train-all
```

More help for arguments:
Expand Down
90 changes: 77 additions & 13 deletions deep_speech_2/data/librispeech.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
"""
Download, unpack and create manifest for LibriSpeech dataset.
Download, unpack and create manifest file for the LibriSpeech dataset.

Manifest is a json file with each line containing one audio clip filepath,
its transcription text string, and its duration. It serves as a unified
interface to organize different data sets.
A manifest file is a dataset summarization, with each line a json format
string containing meta data for one audio clip, including its filepath,
transcription string, and duration. It serves as a unified interface for
different data sets.
"""

import paddle.v2 as paddle
from paddle.v2.dataset.common import md5file
import distutils.util
import os
import wget
import tarfile
Expand All @@ -27,11 +29,21 @@
URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"

MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"

NUM_LINES_TEST_CLEAN = 2620
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please explain why it's necessary to check the line number when MD5 has been checked.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also feel confused

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed.

NUM_LINES_TEST_OTHER = 2939
NUM_LINES_DEV_CLEAN = 2703
NUM_LINES_DEV_OTHER = 2864
NUM_LINES_TRAIN_CLEAN_100 = 28539
NUM_LINES_TRAIN_CLEAN_360 = 104014
NUM_LINES_TRAIN_OTHER_500 = 148688

parser = argparse.ArgumentParser(
description='Downloads and prepare LibriSpeech dataset.')
parser.add_argument(
Expand All @@ -44,6 +56,13 @@
default="manifest.libri",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
# Boolean-like switch: the string default is converted by `strtobool`, so
# `args.full_download` ends up as 1 (true) or 0 (false).
parser.add_argument(
    "--full_download",
    default="True",
    type=distutils.util.strtobool,
    help="Download all datasets for Librispeech."
    " If False, only download a minimal requirement (test-clean, dev-clean,"
    " train-clean-100). (default: %(default)s)")
args = parser.parse_args()


Expand All @@ -57,7 +76,10 @@ def download(url, md5sum, target_dir):
print("Downloading %s ..." % url)
wget.download(url, target_dir)
print("\nMD5 Chesksum %s ..." % filepath)
assert md5file(filepath) == md5sum, "MD5 checksum failed."
if not md5file(filepath) == md5sum:
raise RuntimeError("MD5 checksum failed.")
else:
print("File exists, skip downloading. (%s)" % filepath)
return filepath


Expand All @@ -69,7 +91,6 @@ def unpack(filepath, target_dir):
tar = tarfile.open(filepath)
tar.extractall(target_dir)
tar.close()
return target_dir


def create_manifest(data_dir, manifest_path):
Expand All @@ -83,7 +104,7 @@ def create_manifest(data_dir, manifest_path):
"""
print("Creating manifest %s ..." % manifest_path)
json_lines = []
for subfolder, _, filelist in os.walk(data_dir):
for subfolder, _, filelist in sorted(os.walk(data_dir)):
text_filelist = [
filename for filename in filelist if filename.endswith('trans.txt')
]
Expand All @@ -107,31 +128,74 @@ def create_manifest(data_dir, manifest_path):
out_file.write(line + '\n')


def prepare_dataset(url, md5sum, target_dir, manifest_path):
def verify_file_line_number(filepath, num_lines):
    """Check that a text file contains exactly the expected number of lines.

    :param filepath: Path of the file whose lines are counted.
    :param num_lines: Expected number of lines.
    :return: True if the file has exactly `num_lines` lines, False otherwise.
    """
    # Count lines lazily instead of materializing them with readlines(), so
    # large manifests are not loaded into memory just to be counted. The
    # handle is not named `file`, which would shadow a builtin.
    with open(filepath, 'r') as manifest_file:
        return sum(1 for _ in manifest_file) == num_lines


def prepare_dataset(url, md5sum, target_dir, manifest_path, num_lines):
"""
Download, unpack and create summary manifest file.
"""
# download
filepath = download(url, md5sum, target_dir)
unpacked_dir = unpack(filepath, target_dir)
create_manifest(unpacked_dir, manifest_path)
# unpack
if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
unpack(filepath, target_dir)
else:
print("Unpacked data exists, skip unpacking.")
# create manifest and verify line number
create_manifest(target_dir, manifest_path)
if not verify_file_line_number(manifest_path, num_lines):
raise RuntimeError("Manifest line number check failed. "
"Please remove directory and try running the script "
"again.")


def main():
prepare_dataset(
url=URL_TEST_CLEAN,
md5sum=MD5_TEST_CLEAN,
target_dir=os.path.join(args.target_dir, "test-clean"),
manifest_path=args.manifest_prefix + ".test-clean")
manifest_path=args.manifest_prefix + ".test-clean",
num_lines=NUM_LINES_TEST_CLEAN)
prepare_dataset(
url=URL_DEV_CLEAN,
md5sum=MD5_DEV_CLEAN,
target_dir=os.path.join(args.target_dir, "dev-clean"),
manifest_path=args.manifest_prefix + ".dev-clean")
manifest_path=args.manifest_prefix + ".dev-clean",
num_lines=NUM_LINES_DEV_CLEAN)
prepare_dataset(
url=URL_TRAIN_CLEAN_100,
md5sum=MD5_TRAIN_CLEAN_100,
target_dir=os.path.join(args.target_dir, "train-clean-100"),
manifest_path=args.manifest_prefix + ".train-clean-100")
manifest_path=args.manifest_prefix + ".train-clean-100",
num_lines=NUM_LINES_TRAIN_CLEAN_100)
if args.full_download:
prepare_dataset(
url=URL_TEST_OTHER,
md5sum=MD5_TEST_OTHER,
target_dir=os.path.join(args.target_dir, "test-other"),
manifest_path=args.manifest_prefix + ".test-other",
num_lines=NUM_LINES_TEST_OTHER)
prepare_dataset(
url=URL_DEV_OTHER,
md5sum=MD5_DEV_OTHER,
target_dir=os.path.join(args.target_dir, "dev-other"),
manifest_path=args.manifest_prefix + ".dev-other",
num_lines=NUM_LINES_DEV_OTHER)
prepare_dataset(
url=URL_TRAIN_CLEAN_360,
md5sum=MD5_TRAIN_CLEAN_360,
target_dir=os.path.join(args.target_dir, "train-clean-360"),
manifest_path=args.manifest_prefix + ".train-clean-360",
num_lines=NUM_LINES_TRAIN_CLEAN_360)
prepare_dataset(
url=URL_TRAIN_OTHER_500,
md5sum=MD5_TRAIN_OTHER_500,
target_dir=os.path.join(args.target_dir, "train-other-500"),
manifest_path=args.manifest_prefix + ".train-other-500",
num_lines=NUM_LINES_TRAIN_OTHER_500)


if __name__ == '__main__':
Expand Down