Skip to content

Commit

Permalink
Merge branch 'develop' into refine_seq2seq
Browse files Browse the repository at this point in the history
  • Loading branch information
lcy-seso committed Jun 26, 2017
2 parents 555e089 + 1aaee80 commit 00eb42a
Show file tree
Hide file tree
Showing 15 changed files with 310 additions and 261 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
group: deprecated-2017Q2
language: cpp
cache: ccache
sudo: required
Expand Down
4 changes: 2 additions & 2 deletions .travis/unittest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ abort(){

unittest(){
cd $1 > /dev/null
if [ -f "requirements.txt" ]; then
pip install -r requirements.txt
if [ -f "setup.sh" ]; then
sh setup.sh
fi
if [ $? != 0 ]; then
exit 1
Expand Down
2 changes: 1 addition & 1 deletion deep_speech_2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory.

```
pip install -r requirements.txt
sh setup.sh
export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib:$LD_LIBRARY_PATH
```

Expand Down
15 changes: 12 additions & 3 deletions deep_speech_2/data_utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import random
import numpy as np
import multiprocessing
import paddle.v2 as paddle
from data_utils import utils
from data_utils.augmentor.augmentation import AugmentationPipeline
Expand Down Expand Up @@ -44,6 +45,8 @@ class DataGenerator(object):
:type max_freq: None|float
:param specgram_type: Specgram feature type. Options: 'linear'.
:type specgram_type: str
:param num_threads: Number of CPU threads for processing data.
:type num_threads: int
:param random_seed: Random seed.
:type random_seed: int
"""
Expand All @@ -58,6 +61,7 @@ def __init__(self,
window_ms=20.0,
max_freq=None,
specgram_type='linear',
num_threads=multiprocessing.cpu_count(),
random_seed=0):
self._max_duration = max_duration
self._min_duration = min_duration
Expand All @@ -70,6 +74,7 @@ def __init__(self,
stride_ms=stride_ms,
window_ms=window_ms,
max_freq=max_freq)
self._num_threads = num_threads
self._rng = random.Random(random_seed)
self._epoch = 0

Expand Down Expand Up @@ -207,10 +212,14 @@ def _instance_reader_creator(self, manifest):

def reader():
for instance in manifest:
yield self._process_utterance(instance["audio_filepath"],
instance["text"])
yield instance

return reader
def mapper(instance):
return self._process_utterance(instance["audio_filepath"],
instance["text"])

return paddle.reader.xmap_readers(
mapper, reader, self._num_threads, 1024, order=True)

def _padding_batch(self, batch, padding_to=-1, flatten=False):
"""
Expand Down
2 changes: 1 addition & 1 deletion deep_speech_2/data_utils/speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def concatenate(cls, *segments):
return cls(samples, sample_rate, transcripts)

@classmethod
def slice_from_file(cls, filepath, start=None, end=None, transcript):
def slice_from_file(cls, filepath, transcript, start=None, end=None):
"""Loads a small section of a speech without having to load
the entire file into the memory which can be incredibly wasteful.
Expand Down
9 changes: 8 additions & 1 deletion deep_speech_2/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import argparse
import gzip
import distutils.util
import multiprocessing
import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import deep_speech2
Expand Down Expand Up @@ -38,6 +39,11 @@
default=True,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=multiprocessing.cpu_count(),
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
Expand Down Expand Up @@ -67,7 +73,8 @@ def infer():
data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath,
augmentation_config='{}')
augmentation_config='{}',
num_threads=args.num_threads_data)

# create network config
# paddle.data_type.dense_array is used for variable batch input.
Expand Down
3 changes: 1 addition & 2 deletions deep_speech_2/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
SoundFile==0.9.0.post1
wget==3.2
scikits.samplerate==0.3.3
scipy==0.13.0b1
scipy==0.13.1
30 changes: 30 additions & 0 deletions deep_speech_2/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash
#
# Install the dependencies used by deep_speech_2:
#   1. the Python packages listed in requirements.txt (when the file exists);
#   2. libsamplerate 0.1.9, downloaded, built from source and installed with
#      `make install`;
#   3. scikits.samplerate==0.3.3 through pip.
# Every step is checked; the script exits with status 1 on the first failure.
# Only POSIX shell constructs are used because callers invoke it as
# `sh setup.sh`.

# install python dependencies
if [ -f 'requirements.txt' ]; then
    pip install -r requirements.txt
    if [ $? != 0 ]; then
        echo "Install python dependencies failed !!!"
        exit 1
    fi
fi

# install scikits.samplerate
# -f makes curl exit non-zero on an HTTP error (e.g. 404) instead of saving
# the server's error page and reporting success.
curl -f -O "http://www.mega-nerd.com/SRC/libsamplerate-0.1.9.tar.gz"
if [ $? != 0 ]; then
    echo "Download libsamplerate-0.1.9.tar.gz failed !!!"
    exit 1
fi
tar -xvf libsamplerate-0.1.9.tar.gz
if [ $? != 0 ]; then
    echo "Extract libsamplerate-0.1.9.tar.gz failed !!!"
    rm -f libsamplerate-0.1.9.tar.gz
    exit 1
fi
cd libsamplerate-0.1.9
if [ $? != 0 ]; then
    echo "Enter libsamplerate-0.1.9 failed !!!"
    exit 1
fi
./configure && make && make install
# Remember the build result so the temporary files are cleaned up either way.
build_status=$?
cd -
rm -rf libsamplerate-0.1.9
rm libsamplerate-0.1.9.tar.gz
if [ $build_status != 0 ]; then
    echo "Build libsamplerate-0.1.9 failed !!!"
    exit 1
fi
pip install scikits.samplerate==0.3.3
if [ $? != 0 ]; then
    echo "Install scikits.samplerate failed !!!"
    exit 1
fi

echo "Install all dependencies successfully."
23 changes: 22 additions & 1 deletion deep_speech_2/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import gzip
import time
import distutils.util
import multiprocessing
import paddle.v2 as paddle
from model import deep_speech2
from data_utils.data import DataGenerator
Expand Down Expand Up @@ -52,6 +53,18 @@
default=True,
type=distutils.util.strtobool,
help="Use sortagrad or not. (default: %(default)s)")
parser.add_argument(
"--max_duration",
default=100.0,
type=float,
help="Audios with duration larger than this will be discarded. "
"(default: %(default)s)")
parser.add_argument(
"--min_duration",
default=0.0,
type=float,
help="Audios with duration smaller than this will be discarded. "
"(default: %(default)s)")
parser.add_argument(
"--shuffle_method",
default='instance_shuffle',
Expand All @@ -63,6 +76,11 @@
default=4,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=multiprocessing.cpu_count(),
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
Expand Down Expand Up @@ -107,7 +125,10 @@ def data_generator():
return DataGenerator(
vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath,
augmentation_config=args.augmentation_config)
augmentation_config=args.augmentation_config,
max_duration=args.max_duration,
min_duration=args.min_duration,
num_threads=args.num_threads_data)

train_generator = data_generator()
test_generator = data_generator()
Expand Down
51 changes: 18 additions & 33 deletions language_model/network_conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,56 +51,41 @@ def rnn_lm(vocab_size, emb_dim, rnn_type, hidden_size, num_layer):
return cost, output


def ngram_lm(vocab_size, emb_dim, hidden_size, num_layer):
def ngram_lm(vocab_size, emb_dim, hidden_size, num_layer, gram_num=4):
"""
N-Gram language model definition.
:param vocab_size: size of vocab.
:param emb_dim: embedding vector's dimension.
:param hidden_size: size of unit.
:param num_layer: layer number.
:param num_layer: number of hidden layers.
:param gram_num: number of input words in the n-gram context.
:return: cost and output layer of model.
"""

assert emb_dim > 0 and hidden_size > 0 and vocab_size > 0 and num_layer > 0

def wordemb(inlayer):
wordemb = paddle.layer.table_projection(
input=inlayer,
size=emb_dim,
param_attr=paddle.attr.Param(
name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0))
return wordemb

# input layers
first_word = paddle.layer.data(
name="first_word", type=paddle.data_type.integer_value(vocab_size))
second_word = paddle.layer.data(
name="second_word", type=paddle.data_type.integer_value(vocab_size))
third_word = paddle.layer.data(
name="third_word", type=paddle.data_type.integer_value(vocab_size))
fourth_word = paddle.layer.data(
name="fourth_word", type=paddle.data_type.integer_value(vocab_size))
emb_layers = []
for i in range(gram_num):
word = paddle.layer.data(
name="__word%02d__" % (i + 1),
type=paddle.data_type.integer_value(vocab_size))
emb = paddle.layer.embedding(
input=word,
size=emb_dim,
param_attr=paddle.attr.Param(name="_proj", initial_std=1e-3))
emb_layers.append(emb)
next_word = paddle.layer.data(
name="next_word", type=paddle.data_type.integer_value(vocab_size))

# embedding layer
first_emb = wordemb(first_word)
second_emb = wordemb(second_word)
third_emb = wordemb(third_word)
fourth_emb = wordemb(fourth_word)

context_emb = paddle.layer.concat(
input=[first_emb, second_emb, third_emb, fourth_emb])
name="__next_word__", type=paddle.data_type.integer_value(vocab_size))

# hidden layer
hidden = paddle.layer.fc(
input=context_emb, size=hidden_size, act=paddle.activation.Relu())
for _ in range(num_layer - 1):
for i in range(num_layer):
hidden = paddle.layer.fc(
input=hidden, size=hidden_size, act=paddle.activation.Relu())
input=hidden if i else paddle.layer.concat(input=emb_layers),
size=hidden_size,
act=paddle.activation.Relu())

# fc(full connected) and output layer
predict_word = paddle.layer.fc(
input=[hidden], size=vocab_size, act=paddle.activation.Softmax())

Expand Down
Loading

0 comments on commit 00eb42a

Please sign in to comment.