From d784c538c274b3afd8412b6606016d7c2160004a Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Wed, 7 Jun 2017 17:44:11 +0800 Subject: [PATCH 1/2] Refine librispeech.py for DeepSpeech2. Summary: 1. Add manifest line check. 2. Avoid re-unpacking if unpacked data already exists. 3. Add full_download (download all 7 sub-datasets of LibriSpeech). --- deep_speech_2/README.md | 5 +- deep_speech_2/data/librispeech.py | 90 ++++++++++++++++++++++++++----- 2 files changed, 80 insertions(+), 15 deletions(-) diff --git a/deep_speech_2/README.md b/deep_speech_2/README.md index bb1815c008..403511d586 100644 --- a/deep_speech_2/README.md +++ b/deep_speech_2/README.md @@ -18,6 +18,7 @@ For some machines, we also need to install libsndfile1. Details to be added. ``` cd data python librispeech.py +cat manifest.libri.train-* > manifest.libri.train-all cd .. ``` @@ -32,13 +33,13 @@ python librispeech.py --help For GPU Training: ``` -CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 +CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 --train_manifest_path ./data/manifest.libri.train-all ``` For CPU Training: ``` -python train.py --trainer_count 8 --use_gpu False +python train.py --trainer_count 8 --use_gpu False -- train_manifest_path ./data/manifest.libri.train-all ``` More help for arguments: diff --git a/deep_speech_2/data/librispeech.py b/deep_speech_2/data/librispeech.py index 838fee5978..8bc33575e2 100644 --- a/deep_speech_2/data/librispeech.py +++ b/deep_speech_2/data/librispeech.py @@ -1,13 +1,15 @@ """ - Download, unpack and create manifest for Librespeech dataset. + Download, unpack and create manifest file for the Librespeech dataset. - Manifest is a json file with each line containing one audio clip filepath, - its transcription text string, and its duration. It servers as a unified - interfance to organize different data sets. + A manifest file is a dataset summarization, with each line a json format + string containing meta data for one audio clip, including its filepath, + transcription string, and duration. It serves as a unified interface for + different data sets. """ import paddle.v2 as paddle from paddle.v2.dataset.common import md5file +import distutils.util import os import wget import tarfile @@ -27,11 +29,21 @@ URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz" MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9" +MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135" MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1" +MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931" MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522" MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa" MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" +NUM_LINES_TEST_CLEAN = 2620 +NUM_LINES_TEST_OTHER = 2939 +NUM_LINES_DEV_CLEAN = 2703 +NUM_LINES_DEV_OTHER = 2864 +NUM_LINES_TRAIN_CLEAN_100 = 28539 +NUM_LINES_TRAIN_CLEAN_360 = 104014 +NUM_LINES_TRAIN_OTHER_500 = 148688 + parser = argparse.ArgumentParser( description='Downloads and prepare LibriSpeech dataset.') parser.add_argument( @@ -44,6 +56,13 @@ default="manifest.libri", type=str, help="Filepath prefix for output manifests. (default: %(default)s)") +parser.add_argument( + "--full_download", + default="True", + type=distutils.util.strtobool, + help="Download all datasets for Librispeech." + " If False, only download a minimal requirement (test-clean, dev-clean" + " train-clean-100). (default: %(default)s)") args = parser.parse_args() @@ -57,7 +76,10 @@ def download(url, md5sum, target_dir): print("Downloading %s ..." 
% url) wget.download(url, target_dir) print("\nMD5 Chesksum %s ..." % filepath) - assert md5file(filepath) == md5sum, "MD5 checksum failed." + if not md5file(filepath) == md5sum: + raise RuntimeError("MD5 checksum failed.") + else: + print("File exists, skip downloading. (%s)" % filepath) return filepath @@ -69,7 +91,6 @@ def unpack(filepath, target_dir): tar = tarfile.open(filepath) tar.extractall(target_dir) tar.close() - return target_dir def create_manifest(data_dir, manifest_path): @@ -83,7 +104,7 @@ def create_manifest(data_dir, manifest_path): """ print("Creating manifest %s ..." % manifest_path) json_lines = [] - for subfolder, _, filelist in os.walk(data_dir): + for subfolder, _, filelist in sorted(os.walk(data_dir)): text_filelist = [ filename for filename in filelist if filename.endswith('trans.txt') ] @@ -107,13 +128,28 @@ def create_manifest(data_dir, manifest_path): out_file.write(line + '\n') -def prepare_dataset(url, md5sum, target_dir, manifest_path): +def verify_file_line_number(filepath, num_lines): + with open(filepath, 'r') as file: + return len(file.readlines()) == num_lines + + +def prepare_dataset(url, md5sum, target_dir, manifest_path, num_lines): """ Download, unpack and create summmary manifest file. """ + # download filepath = download(url, md5sum, target_dir) - unpacked_dir = unpack(filepath, target_dir) - create_manifest(unpacked_dir, manifest_path) + # unpack + if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): + unpack(filepath, target_dir) + else: + print("Unpacked data exists, skip unpacking.") + # create manifest and verify line number + create_manifest(target_dir, manifest_path) + if not verify_file_line_number(manifest_path, num_lines): + raise RuntimeError("Manifest line number check failed. " + "Please remove directory and try running the script " + "again.") def main(): @@ -121,17 +157,45 @@ def main(): url=URL_TEST_CLEAN, md5sum=MD5_TEST_CLEAN, target_dir=os.path.join(args.target_dir, "test-clean"), - manifest_path=args.manifest_prefix + ".test-clean") + manifest_path=args.manifest_prefix + ".test-clean", + num_lines=NUM_LINES_TEST_CLEAN) prepare_dataset( url=URL_DEV_CLEAN, md5sum=MD5_DEV_CLEAN, target_dir=os.path.join(args.target_dir, "dev-clean"), - manifest_path=args.manifest_prefix + ".dev-clean") + manifest_path=args.manifest_prefix + ".dev-clean", + num_lines=NUM_LINES_DEV_CLEAN) prepare_dataset( url=URL_TRAIN_CLEAN_100, md5sum=MD5_TRAIN_CLEAN_100, target_dir=os.path.join(args.target_dir, "train-clean-100"), - manifest_path=args.manifest_prefix + ".train-clean-100") + manifest_path=args.manifest_prefix + ".train-clean-100", + num_lines=NUM_LINES_TRAIN_CLEAN_100) + if args.full_download: + prepare_dataset( + url=URL_TEST_OTHER, + md5sum=MD5_TEST_OTHER, + target_dir=os.path.join(args.target_dir, "test-other"), + manifest_path=args.manifest_prefix + ".test-other", + num_lines=NUM_LINES_TEST_OTHER) + prepare_dataset( + url=URL_DEV_OTHER, + md5sum=MD5_DEV_OTHER, + target_dir=os.path.join(args.target_dir, "dev-other"), + manifest_path=args.manifest_prefix + ".dev-other", + num_lines=NUM_LINES_DEV_OTHER) + prepare_dataset( + url=URL_TRAIN_CLEAN_360, + md5sum=MD5_TRAIN_CLEAN_360, + target_dir=os.path.join(args.target_dir, "train-clean-360"), + manifest_path=args.manifest_prefix + ".train-clean-360", + num_lines=NUM_LINES_TRAIN_CLEAN_360) + prepare_dataset( + url=URL_TRAIN_OTHER_500, + md5sum=MD5_TRAIN_OTHER_500, + target_dir=os.path.join(args.target_dir, "train-other-500"), + manifest_path=args.manifest_prefix + ".train-other-500", + 
num_lines=NUM_LINES_TRAIN_OTHER_500) if __name__ == '__main__': From b05d5ea5c3b4040db2fd07fec47b96f47b0a35f9 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Thu, 8 Jun 2017 22:20:11 +0800 Subject: [PATCH 2/2] Remove manifest's line number check from librispeech.py and update README.md. --- deep_speech_2/README.md | 4 ++ deep_speech_2/data/librispeech.py | 69 ++++++++++--------------------- 2 files changed, 25 insertions(+), 48 deletions(-) diff --git a/deep_speech_2/README.md b/deep_speech_2/README.md index 403511d586..7a372e9bed 100644 --- a/deep_speech_2/README.md +++ b/deep_speech_2/README.md @@ -22,6 +22,10 @@ cat manifest.libri.train-* > manifest.libri.train-all cd .. ``` +After running librispeech.py, we have several "manifest" json files named with a prefix `manifest.libri.`. A manifest file summarizes a speech data set, with each line containing the meta data (i.e. audio filepath, transcription text, audio duration) of each audio file within the data set, in json format. + +By `cat manifest.libri.train-* > manifest.libri.train-all`, we simply merge the three seperate sample sets of LibriSpeech (train-clean-100, train-clean-360, train-other-500) into one training set. This is a simple way for merging different data sets. + More help for arguments: ``` diff --git a/deep_speech_2/data/librispeech.py b/deep_speech_2/data/librispeech.py index 8bc33575e2..653caa9267 100644 --- a/deep_speech_2/data/librispeech.py +++ b/deep_speech_2/data/librispeech.py @@ -1,10 +1,9 @@ """ - Download, unpack and create manifest file for the Librespeech dataset. + Download, unpack and create manifest json files for the Librespeech dataset. - A manifest file is a dataset summarization, with each line a json format - string containing meta data for one audio clip, including its filepath, - transcription string, and duration. It serves as a unified interface for - different data sets. + A manifest is a json file summarizing filelist in a data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file in the data set. """ import paddle.v2 as paddle @@ -36,14 +35,6 @@ MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa" MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" -NUM_LINES_TEST_CLEAN = 2620 -NUM_LINES_TEST_OTHER = 2939 -NUM_LINES_DEV_CLEAN = 2703 -NUM_LINES_DEV_OTHER = 2864 -NUM_LINES_TRAIN_CLEAN_100 = 28539 -NUM_LINES_TRAIN_CLEAN_360 = 104014 -NUM_LINES_TRAIN_OTHER_500 = 148688 - parser = argparse.ArgumentParser( description='Downloads and prepare LibriSpeech dataset.') parser.add_argument( @@ -95,12 +86,9 @@ def unpack(filepath, target_dir): def create_manifest(data_dir, manifest_path): """ - Create a manifest file summarizing the dataset (list of filepath and meta - data). - - Each line of the manifest contains one audio clip filepath, its - transcription text string, and its duration. Manifest file servers as a - unified interfance to organize data sets. + Create a manifest json file summarizing the data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file within the data set. """ print("Creating manifest %s ..." 
% manifest_path) json_lines = [] @@ -128,28 +116,20 @@ def create_manifest(data_dir, manifest_path): out_file.write(line + '\n') -def verify_file_line_number(filepath, num_lines): - with open(filepath, 'r') as file: - return len(file.readlines()) == num_lines - - -def prepare_dataset(url, md5sum, target_dir, manifest_path, num_lines): +def prepare_dataset(url, md5sum, target_dir, manifest_path): """ Download, unpack and create summmary manifest file. """ - # download - filepath = download(url, md5sum, target_dir) - # unpack if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): + # download + filepath = download(url, md5sum, target_dir) + # unpack unpack(filepath, target_dir) else: - print("Unpacked data exists, skip unpacking.") - # create manifest and verify line number + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + # create manifest json file create_manifest(target_dir, manifest_path) - if not verify_file_line_number(manifest_path, num_lines): - raise RuntimeError("Manifest line number check failed. " - "Please remove directory and try running the script " - "again.") def main(): @@ -157,45 +137,38 @@ def main(): url=URL_TEST_CLEAN, md5sum=MD5_TEST_CLEAN, target_dir=os.path.join(args.target_dir, "test-clean"), - manifest_path=args.manifest_prefix + ".test-clean", - num_lines=NUM_LINES_TEST_CLEAN) + manifest_path=args.manifest_prefix + ".test-clean") prepare_dataset( url=URL_DEV_CLEAN, md5sum=MD5_DEV_CLEAN, target_dir=os.path.join(args.target_dir, "dev-clean"), - manifest_path=args.manifest_prefix + ".dev-clean", - num_lines=NUM_LINES_DEV_CLEAN) + manifest_path=args.manifest_prefix + ".dev-clean") prepare_dataset( url=URL_TRAIN_CLEAN_100, md5sum=MD5_TRAIN_CLEAN_100, target_dir=os.path.join(args.target_dir, "train-clean-100"), - manifest_path=args.manifest_prefix + ".train-clean-100", - num_lines=NUM_LINES_TRAIN_CLEAN_100) + manifest_path=args.manifest_prefix + ".train-clean-100") if args.full_download: prepare_dataset( url=URL_TEST_OTHER, md5sum=MD5_TEST_OTHER, target_dir=os.path.join(args.target_dir, "test-other"), - manifest_path=args.manifest_prefix + ".test-other", - num_lines=NUM_LINES_TEST_OTHER) + manifest_path=args.manifest_prefix + ".test-other") prepare_dataset( url=URL_DEV_OTHER, md5sum=MD5_DEV_OTHER, target_dir=os.path.join(args.target_dir, "dev-other"), - manifest_path=args.manifest_prefix + ".dev-other", - num_lines=NUM_LINES_DEV_OTHER) + manifest_path=args.manifest_prefix + ".dev-other") prepare_dataset( url=URL_TRAIN_CLEAN_360, md5sum=MD5_TRAIN_CLEAN_360, target_dir=os.path.join(args.target_dir, "train-clean-360"), - manifest_path=args.manifest_prefix + ".train-clean-360", - num_lines=NUM_LINES_TRAIN_CLEAN_360) + manifest_path=args.manifest_prefix + ".train-clean-360") prepare_dataset( url=URL_TRAIN_OTHER_500, md5sum=MD5_TRAIN_OTHER_500, target_dir=os.path.join(args.target_dir, "train-other-500"), - manifest_path=args.manifest_prefix + ".train-other-500", - num_lines=NUM_LINES_TRAIN_OTHER_500) + manifest_path=args.manifest_prefix + ".train-other-500") if __name__ == '__main__':
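
Editor's note: the README and docstring changes above describe the manifest format only in prose. The sketch below shows what a single manifest line might look like and why a plain `cat` is enough to merge the per-subset manifests. The key names (`audio_filepath`, `duration`, `text`) and the sample values are assumptions for illustration only, since the body of `create_manifest()` that actually builds each entry is not part of the hunks shown.

```
import json

# A hypothetical manifest entry, assuming the fields described in the
# README and module docstring (audio filepath, transcription text, audio
# duration). The real key names are set inside create_manifest(), whose
# body is not shown in the hunks, so treat these as illustrative.
manifest_entry = {
    "audio_filepath": "LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac",
    "duration": 5.86,
    "text": "mister quilter is the apostle of the middle classes"
}

# One JSON object per line; because every line is self-contained,
# `cat manifest.libri.train-* > manifest.libri.train-all` is enough to
# merge the training subsets into a single manifest.
with open("manifest.libri.example", "w") as out_file:
    out_file.write(json.dumps(manifest_entry) + "\n")
```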
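The `--full_download` option introduced in the first patch parses its value with `distutils.util.strtobool` rather than `type=bool`, since `bool("False")` is `True` in Python and would silently force a full download. Below is a minimal, self-contained sketch of that pattern; the argument name mirrors the diff, while the explicit `parse_args` list is only for demonstration and is not in the original script.

```
import argparse
import distutils.util

# Minimal sketch of the --full_download flag pattern from the first patch.
# distutils.util.strtobool maps "true"/"1"/"yes" to 1 and "false"/"0"/"no"
# to 0, raising ValueError on anything else.
parser = argparse.ArgumentParser(description="strtobool flag sketch")
parser.add_argument(
    "--full_download",
    default="True",            # argparse applies `type` to string defaults
    type=distutils.util.strtobool,
    help="Download all LibriSpeech subsets if True; otherwise only the "
         "minimal set (test-clean, dev-clean, train-clean-100).")

args = parser.parse_args(["--full_download", "False"])
print(bool(args.full_download))  # -> False
```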