-
Notifications
You must be signed in to change notification settings - Fork 2.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Refine librispeech.py for DeepSpeech2. #78
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,15 @@ | ||
""" | ||
Download, unpack and create manifest for Librespeech dataset. | ||
Download, unpack and create manifest file for the LibriSpeech dataset. | ||
|
||
Manifest is a json file with each line containing one audio clip filepath, | ||
its transcription text string, and its duration. It servers as a unified | ||
interfance to organize different data sets. | ||
A manifest file summarizes a dataset: each line is a JSON string | ||
containing the metadata of one audio clip, including its filepath, | ||
transcription string, and duration. It serves as a unified interface for | ||
different data sets. | ||
""" | ||
|
||
import paddle.v2 as paddle | ||
from paddle.v2.dataset.common import md5file | ||
import distutils.util | ||
import os | ||
import wget | ||
import tarfile | ||
|
@@ -27,11 +29,21 @@ | |
URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz" | ||
|
||
MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9" | ||
MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135" | ||
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1" | ||
MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931" | ||
MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522" | ||
MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa" | ||
MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" | ||
|
||
NUM_LINES_TEST_CLEAN = 2620 | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please explain why it is necessary to check the number of manifest lines when the MD5 checksum has already been verified.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am also confused by this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed. |
||
NUM_LINES_TEST_OTHER = 2939 | ||
NUM_LINES_DEV_CLEAN = 2703 | ||
NUM_LINES_DEV_OTHER = 2864 | ||
NUM_LINES_TRAIN_CLEAN_100 = 28539 | ||
NUM_LINES_TRAIN_CLEAN_360 = 104014 | ||
NUM_LINES_TRAIN_OTHER_500 = 148688 | ||
|
||
parser = argparse.ArgumentParser( | ||
description='Downloads and prepare LibriSpeech dataset.') | ||
parser.add_argument( | ||
|
@@ -44,6 +56,13 @@ | |
default="manifest.libri", | ||
type=str, | ||
help="Filepath prefix for output manifests. (default: %(default)s)") | ||
# Boolean-like switch. argparse runs a string default through `type`, so
# args.full_download holds the value returned by strtobool.
parser.add_argument(
    "--full_download",
    default="True",
    type=distutils.util.strtobool,
    help="Download all datasets for Librispeech."
    " If False, only download a minimal requirement (test-clean,"
    " dev-clean, train-clean-100). (default: %(default)s)")
args = parser.parse_args() | ||
|
||
|
||
|
@@ -57,7 +76,10 @@ def download(url, md5sum, target_dir): | |
print("Downloading %s ..." % url) | ||
wget.download(url, target_dir) | ||
print("\nMD5 Chesksum %s ..." % filepath) | ||
assert md5file(filepath) == md5sum, "MD5 checksum failed." | ||
if not md5file(filepath) == md5sum: | ||
raise RuntimeError("MD5 checksum failed.") | ||
else: | ||
print("File exists, skip downloading. (%s)" % filepath) | ||
return filepath | ||
|
||
|
||
|
@@ -69,7 +91,6 @@ def unpack(filepath, target_dir): | |
tar = tarfile.open(filepath) | ||
tar.extractall(target_dir) | ||
tar.close() | ||
return target_dir | ||
|
||
|
||
def create_manifest(data_dir, manifest_path): | ||
|
@@ -83,7 +104,7 @@ def create_manifest(data_dir, manifest_path): | |
""" | ||
print("Creating manifest %s ..." % manifest_path) | ||
json_lines = [] | ||
for subfolder, _, filelist in os.walk(data_dir): | ||
for subfolder, _, filelist in sorted(os.walk(data_dir)): | ||
text_filelist = [ | ||
filename for filename in filelist if filename.endswith('trans.txt') | ||
] | ||
|
@@ -107,31 +128,74 @@ def create_manifest(data_dir, manifest_path): | |
out_file.write(line + '\n') | ||
|
||
|
||
def prepare_dataset(url, md5sum, target_dir, manifest_path): | ||
def verify_file_line_number(filepath, num_lines):
    """
    Check whether a text file contains exactly the expected number of lines.

    :param filepath: Path of the file to inspect.
    :param num_lines: Expected number of lines.
    :return: True if the file has exactly `num_lines` lines, False otherwise.
    """
    # Count lines lazily rather than loading them all with readlines().
    with open(filepath, 'r') as text_file:
        return sum(1 for _ in text_file) == num_lines
|
||
|
||
def prepare_dataset(url, md5sum, target_dir, manifest_path, num_lines):
    """
    Download, unpack and create summary manifest file.

    The archive is unpacked only when no "LibriSpeech" folder exists under
    `target_dir`. After the manifest is written, its line count is compared
    with `num_lines`.

    :param url: URL of the dataset archive.
    :param md5sum: Expected MD5 checksum of the archive.
    :param target_dir: Directory holding the archive and the unpacked data.
    :param manifest_path: Filepath of the manifest to create.
    :param num_lines: Expected number of lines in the manifest.
    :raises RuntimeError: If the manifest line count differs from `num_lines`.
    """
    # download
    filepath = download(url, md5sum, target_dir)
    # unpack
    if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
        unpack(filepath, target_dir)
    else:
        print("Unpacked data exists, skip unpacking.")
    # create manifest and verify line number
    create_manifest(target_dir, manifest_path)
    if not verify_file_line_number(manifest_path, num_lines):
        raise RuntimeError("Manifest line number check failed. "
                           "Please remove directory and try running the script "
                           "again.")
|
||
|
||
def main():
    """
    Prepare the LibriSpeech subsets, writing one manifest per subset.

    test-clean, dev-clean and train-clean-100 are always prepared. The
    remaining subsets are prepared only when --full_download is true.
    """
    # Each entry: (subset name, archive URL, archive MD5, manifest line count)
    subsets = [
        ("test-clean", URL_TEST_CLEAN, MD5_TEST_CLEAN, NUM_LINES_TEST_CLEAN),
        ("dev-clean", URL_DEV_CLEAN, MD5_DEV_CLEAN, NUM_LINES_DEV_CLEAN),
        ("train-clean-100", URL_TRAIN_CLEAN_100, MD5_TRAIN_CLEAN_100,
         NUM_LINES_TRAIN_CLEAN_100),
    ]
    if args.full_download:
        subsets += [
            ("test-other", URL_TEST_OTHER, MD5_TEST_OTHER,
             NUM_LINES_TEST_OTHER),
            ("dev-other", URL_DEV_OTHER, MD5_DEV_OTHER, NUM_LINES_DEV_OTHER),
            ("train-clean-360", URL_TRAIN_CLEAN_360, MD5_TRAIN_CLEAN_360,
             NUM_LINES_TRAIN_CLEAN_360),
            ("train-other-500", URL_TRAIN_OTHER_500, MD5_TRAIN_OTHER_500,
             NUM_LINES_TRAIN_OTHER_500),
        ]
    # The subset name doubles as the sub-directory under target_dir and as
    # the suffix appended to the manifest prefix.
    for name, url, md5sum, num_lines in subsets:
        prepare_dataset(
            url=url,
            md5sum=md5sum,
            target_dir=os.path.join(args.target_dir, name),
            manifest_path=args.manifest_prefix + "." + name,
            num_lines=num_lines)
|
||
|
||
if __name__ == '__main__': | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we explain what the manifest.libri.train-* files are?
I see that the details are introduced in the following section, but the manifest file is mentioned abruptly here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.