Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add script for VoxForge data preparation. #16

Merged
merged 3 commits into from
Nov 20, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,8 @@ VoxForge European | 31.21 | 20.47
VoxForge Indian | 56.79 | 28.15
Baidu Internal Testset  |   47.73 |   8.92

To reproduce the benchmark results on VoxForge data, we provide a script that downloads the data and generates the VoxForge dialect manifest files. Go to ```data/voxforge``` and run ```sh run_data.sh``` to produce them. Note that the VoxForge data is continually updated, so the generated manifest files may differ from those we evaluated on.

#### Benchmark Results for Mandarin Model (Character Error Rate)

Test Set | Aishell Model | BaiduCN1.2k Model
Expand Down
16 changes: 16 additions & 0 deletions data/voxforge/run_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#! /usr/bin/env bash

# Download the VoxForge data and generate the dialect manifests.
# The relative paths below assume the current directory is the one that
# contains voxforge.py. The quoted '~' is not expanded by the shell;
# voxforge.py expands it itself.
if ! PYTHONPATH=../../:$PYTHONPATH python voxforge.py \
    --manifest_prefix='./manifest' \
    --target_dir='~/.cache/paddle/dataset/speech/VoxForge' \
    --is_merge_dialect=True \
    --dialects 'american' 'british' 'australian' 'european' 'irish' 'canadian' 'indian'
then
    echo "Prepare VoxForge failed. Terminated."
    exit 1
fi

echo "VoxForge Data preparation done."
exit 0
221 changes: 221 additions & 0 deletions data/voxforge/voxforge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
"""Prepare VoxForge dataset

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import codecs
import soundfile
import json
import argparse
import shutil
import subprocess
from data_utils.utility import download_multi, unpack, getfile_insensitive

# Default parent directory for the dataset; the leading '~' is expanded
# in main() before the path is used.
DATA_HOME = '~/.cache/paddle/dataset/speech'

# Location of the VoxForge archives (the 16kHz_16bit collection).
DATA_URL = ('http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/'
            'Audio/Main/16kHz_16bit')

def _str2bool(value):
    """Convert a command-line string into a boolean.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, which is
    True for every non-empty value (including "False"), so the accepted
    spellings are parsed explicitly here.

    Raises:
        argparse.ArgumentTypeError: if `value` is not a recognised spelling.
    """
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays falsy, as it was with ``type=bool``.
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'Boolean value expected, got %r.' % value)


parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/VoxForge",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--dialects",
    default=[
        'american', 'british', 'australian', 'european', 'irish', 'canadian',
        'indian'
    ],
    nargs='+',
    type=str,
    help="Dialect types. (default: %(default)s)")
parser.add_argument(
    "--is_merge_dialect",
    default=True,
    type=_str2bool,
    help="If set True, manifests of american dialect and canadian dialect will "
    "be merged to american-canadian dialect; manifests of british "
    "dialect, irish dialect and australian dialect will be merged to "
    "commonwealth dialect. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
# Parsed at module level; main() reads and updates this `args` object.
args = parser.parse_args()


def download_and_unpack(target_dir, url):
    """Download the archives found at `url` and unpack them.

    Archives are downloaded into `<target_dir>/tgz` through
    download_multi(). If it reports a non-zero exit code, a message is
    printed and nothing is unpacked. Otherwise every `.tgz` file found
    under that directory is handed to unpack() together with
    `<target_dir>/audio`.
    """
    archive_dir = os.path.join(target_dir, 'tgz')
    status = download_multi(url, archive_dir,
                            '-q -l 1 -N -nd -c -e robots=off -A tgz -r -np')
    if status != 0:
        print('Download tgz audio files failed with exit code %d.' % status)
        return

    print('Download done, start unpacking ...')
    extract_dir = os.path.join(target_dir, 'audio')
    for parent, _, filenames in os.walk(archive_dir):
        for filename in filenames:
            print(filename)
            if filename.endswith('.tgz'):
                unpack(os.path.join(parent, filename), extract_dir)


def select_dialects(target_dir, dialect_list):
    """Classify audio files by dialect.

    Scans `<target_dir>/audio` for files whose path contains `etc/readme`
    (case-insensitive). A file belongs to a dialect when one of its lines
    matches `pronunciation dialect.*<dialect>` (case-insensitive; the
    dialect name is used as a regular-expression fragment). For every
    match, a symbolic link to the directory two levels above the file is
    created in `<target_dir>/dialect/<dialect>/`.

    `<target_dir>/dialect` is removed and rebuilt on every call.

    Args:
        target_dir: Directory that contains the unpacked `audio` tree.
        dialect_list: Dialect names to select.
    """
    import re  # local import: only this function needs it

    dialect_root_dir = os.path.join(target_dir, 'dialect')
    if os.path.exists(dialect_root_dir):
        shutil.rmtree(dialect_root_dir)
    os.makedirs(dialect_root_dir)
    audio_dir = os.path.abspath(os.path.join(target_dir, 'audio'))

    # Walk the tree and read each readme once, keeping only the lines that
    # can possibly match; every dialect is then tested against this cache.
    marker = re.compile(b'pronunciation dialect', re.IGNORECASE)
    candidates = []
    for root, _, filenames in os.walk(audio_dir):
        for filename in filenames:
            path = os.path.join(root, filename)
            if 'etc/readme' not in path.lower():
                continue
            try:
                # Binary mode: readme files come in arbitrary encodings.
                with open(path, 'rb') as readme:
                    dialect_lines = [
                        line for line in readme if marker.search(line)
                    ]
            except (IOError, OSError):
                # Unreadable readme: treat it as matching no dialect.
                continue
            if dialect_lines:
                candidates.append((path, dialect_lines))

    for dialect in dialect_list:
        pattern = 'pronunciation dialect.*%s' % dialect
        if not isinstance(pattern, bytes):
            pattern = pattern.encode('utf-8')
        dialect_regex = re.compile(pattern, re.IGNORECASE)

        dialect_dir = os.path.join(dialect_root_dir, dialect)
        # Only reachable when a dialect is listed more than once.
        if os.path.exists(dialect_dir):
            shutil.rmtree(dialect_dir)
        os.mkdir(dialect_dir)

        for path, dialect_lines in candidates:
            if not any(dialect_regex.search(line) for line in dialect_lines):
                continue
            src_dir = os.path.dirname(os.path.dirname(path))
            link = os.path.basename(os.path.normpath(src_dir))
            link_path = os.path.join(dialect_dir, link)
            # Several readme variants in one etc/ directory map to the
            # same link; create it only once.
            if not os.path.lexists(link_path):
                os.symlink(src_dir, link_path)


def generate_manifest(data_dir, manifest_path):
    """Write a manifest describing every usable utterance under `data_dir`.

    Every entry of `data_dir` must be a symbolic link to a speaker
    directory. A speaker directory is processed only if it contains a
    `wav` or `flac` sub-directory, an `etc/PROMPTS` file and an
    `etc/README` file (any letter case); otherwise it is reported and
    skipped.

    Each PROMPTS line has the form "<utterance id> <transcript>". Only the
    last '/'-separated component of the utterance id is used; the audio
    file is looked up as `<speaker dir>/<type>/<name>.<type>`. An
    utterance is skipped when its audio file is missing or empty, or when
    its transcript (after replacing '-' with a space) is not upper-case or
    contains anything other than letters, spaces and apostrophes.

    Args:
        data_dir: Directory of symbolic links to speaker directories.
        manifest_path: Output file; one JSON object per line with the keys
            `audio_filepath`, `duration` (seconds) and `text` (lower-cased
            transcript).
    """
    json_lines = []

    for path in os.listdir(data_dir):
        audio_link = os.path.join(data_dir, path)
        assert os.path.islink(
            audio_link), '%s should be symbolic link.' % audio_link
        actual_audio_dir = os.path.abspath(os.readlink(audio_link))

        if os.path.isdir(os.path.join(actual_audio_dir, 'wav')):
            audio_type = 'wav'
        elif os.path.isdir(os.path.join(actual_audio_dir, 'flac')):
            audio_type = 'flac'
        else:
            print('Unknown audio type, skipped processing %s.' %
                  actual_audio_dir)
            continue

        etc_dir = os.path.join(actual_audio_dir, 'etc')
        prompts_file = os.path.join(etc_dir, 'PROMPTS')
        if not os.path.isfile(prompts_file):
            print('PROMPTS file missing, skip processing %s.' %
                  actual_audio_dir)
            continue

        # The README is only checked for presence; its content is not used.
        readme_file = getfile_insensitive(os.path.join(etc_dir, 'README'))
        if readme_file is None:
            print('README file missing, skip processing %s.' % actual_audio_dir)
            continue

        with open(prompts_file) as prompts:
            for line in prompts:
                fields = line.strip().split(None, 1)
                if not fields:
                    # Blank line.
                    continue
                if len(fields) < 2:
                    print('Transcript missing, skip processing %s.' %
                          fields[0])
                    continue
                utterance_id, trans = fields

                # Only the final component of the id names the audio file.
                utterance_name = utterance_id.split('/')[-1]
                u = os.path.join(actual_audio_dir, audio_type,
                                 utterance_name + '.' + audio_type)

                if not os.path.isfile(u):
                    print('Audio file missing, skip processing %s.' % u)
                    continue

                if os.stat(u).st_size == 0:
                    print('Empty audio file, skip processing %s.' % u)
                    continue

                trans = trans.strip().replace('-', ' ')
                if not trans.isupper() or \
                        not trans.strip().replace(' ', '').replace("'", "").isalpha():
                    print("Transcript not normalized properly, skip processing %s."
                          % u)
                    continue

                audio_data, samplerate = soundfile.read(u)
                duration = float(len(audio_data)) / samplerate
                json_lines.append(
                    json.dumps({
                        'audio_filepath': u,
                        'duration': duration,
                        'text': trans.lower()
                    }))

    with codecs.open(manifest_path, 'w', 'utf-8') as fout:
        for line in json_lines:
            fout.write(line + '\n')


def merge_manifests(manifest_files, save_path):
    """Concatenate several manifest files into `save_path`.

    All inputs are read completely before the output is opened, so
    `save_path` may also appear in `manifest_files`.

    Args:
        manifest_files: Paths of UTF-8 manifest files, merged in order.
        save_path: Path of the merged manifest (overwritten).
    """
    lines = []
    for manifest_file in manifest_files:
        with codecs.open(manifest_file, 'r', 'utf-8') as fin:
            lines.extend(fin.readlines())

    with codecs.open(save_path, 'w', 'utf-8') as fout:
        for line in lines:
            # A file without a trailing newline must not fuse its last
            # record with the first record of the next file.
            if not line.endswith('\n'):
                line += '\n'
            fout.write(line)


def prepare_dataset(url, dialects, target_dir, manifest_prefix, is_merge):
    """Download the data, split it by dialect and write the manifests.

    One manifest named `<manifest_prefix>.<dialect>` is generated per
    entry of `dialects`. When `is_merge` is true, the american/canadian
    manifests are additionally merged into
    `<manifest_prefix>.american-canadian`, and the
    australian/british/irish manifests into
    `<manifest_prefix>.commonwealth`.
    """
    download_and_unpack(target_dir, url)
    select_dialects(target_dir, dialects)

    north_american = []
    commonwealth = []
    for name in dialects:
        manifest_path = '.'.join([manifest_prefix, name])
        if name in ('american', 'canadian'):
            north_american.append(manifest_path)
        if name in ('australian', 'british', 'irish'):
            commonwealth.append(manifest_path)
        generate_manifest(
            os.path.join(target_dir, 'dialect', name), manifest_path)

    if not is_merge:
        return

    if north_american:
        merge_manifests(north_american,
                        manifest_prefix + '.american-canadian')
    if commonwealth:
        merge_manifests(commonwealth, manifest_prefix + '.commonwealth')


def main():
    """Expand a leading '~' in the target directory, then prepare the data."""
    # expanduser() returns paths that do not start with '~' unchanged, so
    # it can be applied without checking the prefix first.
    args.target_dir = os.path.expanduser(args.target_dir)

    prepare_dataset(
        url=DATA_URL,
        dialects=args.dialects,
        target_dir=args.target_dir,
        manifest_prefix=args.manifest_prefix,
        is_merge=args.is_merge_dialect)


if __name__ == '__main__':
    main()
19 changes: 19 additions & 0 deletions data_utils/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,25 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
return manifest


def getfile_insensitive(path):
    """Find the real path of a file whose name is given case-insensitively.

    Only the final path component is matched without regard to case; the
    directory part is used as given (the current directory when empty).
    Returns the first matching regular file in directory-listing order,
    or None when there is no match.
    """
    parent, wanted = os.path.split(path)
    parent = parent or '.'
    wanted = wanted.lower()
    same_name = (os.path.join(parent, entry)
                 for entry in os.listdir(parent)
                 if entry.lower() == wanted)
    return next((found for found in same_name if os.path.isfile(found)), None)


def download_multi(url, target_dir, extra_args):
    """Download multiple files from url to target_dir.

    Runs `wget -c <url> <extra_args> -P <target_dir>` without going
    through a shell, so URLs and directories containing spaces or shell
    metacharacters are passed to wget verbatim.

    Args:
        url: Address handed to wget.
        target_dir: Download directory; created if it does not exist.
        extra_args: Additional wget options as one string, split with
            shell-like quoting rules.

    Returns:
        wget's exit code (0 on success), or 127 if wget could not be
        started.
    """
    import shlex
    import subprocess

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    print("Downloading %s ..." % url)
    command = ['wget', '-c', url] + shlex.split(extra_args) + [
        '-P', target_dir
    ]
    try:
        return subprocess.call(command)
    except OSError as err:
        # wget is not installed or not executable; report it the way a
        # shell would ("command not found").
        print("Failed to run wget: %s" % err)
        return 127


def download(url, md5sum, target_dir):
"""Download file from url to target_dir, and check md5sum."""
if not os.path.exists(target_dir): os.makedirs(target_dir)
Expand Down