Score CTC prefix beams with KenLM #805

Merged
merged 11 commits on Sep 13, 2017
1 change: 1 addition & 0 deletions .gitattributes
@@ -1,2 +1,3 @@

*.binary filter=lfs diff=lfs merge=lfs -crlf
data/lm/trie filter=lfs diff=lfs merge=lfs -crlf
5 changes: 3 additions & 2 deletions .tc.training.yml
@@ -11,16 +11,17 @@ payload:
image: "ubuntu:14.04"
env:
TENSORFLOW_WHEEL: https://index.taskcluster.net/v1/task/project.deepspeech.tensorflow.pip.master.cpu/artifacts/public/tensorflow_warpctc-1.3.0rc0-cp27-cp27mu-linux_x86_64.whl
DEEPSPEECH_ARTIFACTS_ROOT: https://queue.taskcluster.net/v1/task/{{ TASK_ID }}/runs/0/artifacts/public
command:
- "/bin/bash"
- "--login"
- "-cxe"
- apt-get -qq update && apt-get -qq -y install git &&
- apt-get -qq update && apt-get -qq -y install git pixz &&
apt-get -qq -y install make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev xz-utils tk-dev &&
{{ SYSTEM_ADD_USER }} &&
echo -e "#!/bin/bash\nset -xe\nexport PATH=/home/build-user/bin:$PATH && env && id && wget https://github.com/git-lfs/git-lfs/releases/download/v2.2.1/git-lfs-linux-amd64-2.2.1.tar.gz -O - | tar -C /tmp -zxf - && PREFIX=/home/build-user/ /tmp/git-lfs-2.2.1/install.sh && mkdir ~/DeepSpeech/ && git clone --quiet {{ GITHUB_HEAD_REPO_URL }} ~/DeepSpeech/ds/ && cd ~/DeepSpeech/ds && git checkout --quiet {{ GITHUB_HEAD_SHA }}" > /tmp/clone.sh && chmod +x /tmp/clone.sh &&
{{ SYSTEM_DO_CLONE }} &&
sudo -H -u build-user TENSORFLOW_WHEEL=${TENSORFLOW_WHEEL} /bin/bash /home/build-user/DeepSpeech/ds/tc-train-tests.sh 2.7.13
sudo -H -u build-user TENSORFLOW_WHEEL=${TENSORFLOW_WHEEL} DEEPSPEECH_ARTIFACTS_ROOT=${DEEPSPEECH_ARTIFACTS_ROOT} /bin/bash /home/build-user/DeepSpeech/ds/tc-train-tests.sh 2.7.13
artifacts:
"public":
type: "directory"
39 changes: 34 additions & 5 deletions DeepSpeech.py
@@ -24,7 +24,6 @@
from util.feeding import DataSet, ModelFeeder
from util.gpu import get_available_gpus
from util.shared_lib import check_cupti
from util.spell import correction
from util.text import sparse_tensor_value_to_texts, wer, Alphabet
from xdg import BaseDirectory as xdg
import numpy as np
@@ -140,7 +139,16 @@
tf.app.flags.DEFINE_float ('estop_mean_thresh', 0.5, 'mean threshold for loss to determine the condition if early stopping is required')
tf.app.flags.DEFINE_float ('estop_std_thresh', 0.5, 'standard deviation threshold for loss to determine the condition if early stopping is required')

# Decoder

tf.app.flags.DEFINE_string ('decoder_library_path', 'native_client/libctc_decoder_with_kenlm.so', 'path to the libctc_decoder_with_kenlm.so library containing the decoder implementation.')
tf.app.flags.DEFINE_string ('alphabet_config_path', 'data/alphabet.txt', 'path to the configuration file specifying the alphabet used by the network. See the comment in data/alphabet.txt for a description of the format.')
tf.app.flags.DEFINE_string ('lm_binary_path', 'data/lm/lm.binary', 'path to the language model binary file created with KenLM')
tf.app.flags.DEFINE_string ('lm_trie_path', 'data/lm/trie', 'path to the language model trie file created with native_client/generate_trie')
tf.app.flags.DEFINE_integer ('beam_width', 1024, 'beam width used in the CTC decoder when building candidate transcriptions')
tf.app.flags.DEFINE_float ('lm_weight', 2.15, 'the alpha hyperparameter of the CTC decoder. Language Model weight.')
Contributor:
Could you give the reference which defines your α, β, and β'?

For example, the original Deep Speech paper[1] and the Deep Speech 2 paper[2] don't define β'.

Contributor (author):

I don't remember and can't find where I got the beta' terminology from. I'll reword the explanation.

tf.app.flags.DEFINE_float ('word_count_weight', -0.10, 'the beta hyperparameter of the CTC decoder. Word insertion weight (penalty).')
tf.app.flags.DEFINE_float ('valid_word_count_weight', 1.10, 'Valid word insertion weight. This is used to lessen the word insertion penalty when the inserted word is part of the vocabulary.')
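Taken together, these flags parameterize a Deep Speech style scoring function. As an illustrative sketch only (the actual arithmetic lives inside the C++ `ctc_beam_search_decoder_with_lm` op; the helper below and the exact way the terms combine are assumptions), a beam's score adds the CTC log-probability, the KenLM log-probability scaled by `lm_weight`, and the two word-count terms:

```python
def beam_score(ctc_log_prob, lm_log_prob, n_words, n_valid_words,
               lm_weight=2.15, word_count_weight=-0.10,
               valid_word_count_weight=1.10):
    """Illustrative combination of CTC and LM scores for one beam.

    Mirrors the Deep Speech style scoring
        Q = log P_ctc + alpha * log P_lm + beta * word_count,
    with an extra bonus for words found in the vocabulary. This is a
    sketch, not the op's exact implementation.
    """
    return (ctc_log_prob
            + lm_weight * lm_log_prob
            + word_count_weight * n_words
            + valid_word_count_weight * n_valid_words)
```

With `word_count_weight` negative and `valid_word_count_weight` positive, in-vocabulary words largely cancel the insertion penalty while out-of-vocabulary words pay it in full.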

for var in ['b1', 'h1', 'b2', 'h2', 'b3', 'h3', 'b5', 'h5', 'b6', 'h6']:
tf.app.flags.DEFINE_float('%s_stddev' % var, None, 'standard deviation to use when initialising %s' % var)
@@ -452,6 +460,28 @@ def BiRNN(batch_x, seq_length, dropout):
# Output shape: [n_steps, batch_size, n_hidden_6]
return layer_6

if not os.path.exists(os.path.abspath(FLAGS.decoder_library_path)):
print('ERROR: The decoder library file does not exist. Make sure you have ' \
'downloaded or built the native client binaries and pass the ' \
'appropriate path to the binaries in the --decoder_library_path parameter.')

custom_op_module = tf.load_op_library(FLAGS.decoder_library_path)

def decode_with_lm(inputs, sequence_length, beam_width=100,
top_paths=1, merge_repeated=True):
decoded_ixs, decoded_vals, decoded_shapes, log_probabilities = (
custom_op_module.ctc_beam_search_decoder_with_lm(
inputs, sequence_length, beam_width=beam_width,
model_path=FLAGS.lm_binary_path, trie_path=FLAGS.lm_trie_path, alphabet_path=FLAGS.alphabet_config_path,
lm_weight=FLAGS.lm_weight, word_count_weight=FLAGS.word_count_weight, valid_word_count_weight=FLAGS.valid_word_count_weight,
top_paths=top_paths, merge_repeated=merge_repeated))

return (
[tf.SparseTensor(ix, val, shape) for (ix, val, shape)
in zip(decoded_ixs, decoded_vals, decoded_shapes)],
log_probabilities)
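`decode_with_lm` returns its top paths as `SparseTensor`s. As a rough pure-Python sketch (a hypothetical helper, analogous to what `sparse_tensor_value_to_texts` in `util/text.py` does), the evaluated sparse components can be folded back into per-batch transcriptions like this:

```python
def sparse_to_texts(indices, values, dense_shape, alphabet):
    """Convert one evaluated SparseTensor (indices, values, shape) from
    the decoder into a list of strings, one per batch item.

    `alphabet` maps integer labels to characters; a real run would use
    the Alphabet class from util/text.py instead of a plain dict.
    """
    batch_size = dense_shape[0]
    texts = [''] * batch_size
    # Each index is (batch, time); indices arrive in row-major order,
    # so appending in sequence reconstructs each transcription.
    for (batch, _time), label in zip(indices, values):
        texts[batch] += alphabet[label]
    return texts
```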



# Accuracy and Loss
# =================
@@ -485,7 +515,7 @@ def calculate_mean_edit_distance_and_loss(model_feeder, tower, dropout):
avg_loss = tf.reduce_mean(total_loss)

# Beam search decode the batch
decoded, _ = tf.nn.ctc_beam_search_decoder(logits, batch_seq_len, merge_repeated=False)
decoded, _ = decode_with_lm(logits, batch_seq_len, merge_repeated=False, beam_width=FLAGS.beam_width)

# Compute the edit (Levenshtein) distance
distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), batch_y)
@@ -718,9 +748,8 @@ def calculate_report(results_tuple):
items = list(zip(*results_tuple))
mean_wer = 0.0
for label, decoding, distance, loss in items:
corrected = correction(decoding, alphabet)
sample_wer = wer(label, corrected)
sample = Sample(label, corrected, loss, distance, sample_wer)
sample_wer = wer(label, decoding)
sample = Sample(label, decoding, loss, distance, sample_wer)
samples.append(sample)
mean_wer += sample_wer

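With the `correction` spell-check pass removed, WER is now computed directly between the label and the raw decoder output. For reference, a self-contained sketch of word-level WER (the repo's `util/text.py` implementation may normalize differently):

```python
def word_error_rate(ground_truth, hypothesis):
    """Word error rate: word-level Levenshtein distance divided by the
    number of reference words. Sketch of what util/text.py's wer()
    computes; shown here for illustration only.
    """
    ref, hyp = ground_truth.split(), hypothesis.split()
    # Classic dynamic-programming edit distance over words.
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,          # deletion
                          d[i][j - 1] + 1,          # insertion
                          d[i - 1][j - 1] + cost)   # substitution
    return d[len(ref)][len(hyp)] / float(len(ref))
```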
9 changes: 9 additions & 0 deletions README.md
@@ -71,6 +71,15 @@ $ ./DeepSpeech.py --help
```

To get this output in a slightly better-formatted way, you can also look at the option definitions at the top of `DeepSpeech.py`.

You'll need to download `native_client.tar.xz` or build the native client files yourself to get the custom TensorFlow OP needed for decoding the outputs of the neural network. You can use `util/tc.py` to download the files for your architecture:

```bash
python util/tc.py destination/folder cpu
```

This will download the native client files for the x86_64 architecture without CUDA support, and extract them into `destination/folder`. If you prefer building the binaries from source, see the [native_client README file](native_client/README.md). We also have binaries with CUDA enabled ("gpu") and for ARM7 ("arm").

For executing pre-configured training scenarios, there is a collection of convenience scripts in the `bin` folder. Most of them are named after the corpora they are configured for. Keep in mind that the other speech corpora are *very large*, on the order of tens of gigabytes, and some aren't free. Downloading and preprocessing them can take a very long time, and training on them without a fast GPU (GTX 10 series recommended) takes even longer. If you experience GPU OOM errors while training, try reducing `batch_size`.

As a simple first example you can open a terminal, change to the directory of the DeepSpeech checkout and run:
3 changes: 2 additions & 1 deletion bin/run-tc-ldc93s1.sh
@@ -16,4 +16,5 @@ python -u DeepSpeech.py \
--test_files ${ldc93s1_csv} --test_batch_size 1 \
--n_hidden 494 --epoch 75 --random_seed 4567 --default_stddev 0.046875 \
--max_to_keep 1 --checkpoint_dir '/tmp/ckpt' --checkpoint_secs 0 \
--learning_rate 0.001 --dropout_rate 0.05 --export_dir "/tmp/train"
--learning_rate 0.001 --dropout_rate 0.05 --export_dir "/tmp/train" \
--decoder_library_path "/tmp/ds/libctc_decoder_with_kenlm.so"
3 changes: 3 additions & 0 deletions data/lm/trie
Git LFS file not shown
4 changes: 2 additions & 2 deletions data/spell/words.txt → data/lm/vocab.txt
@@ -6876,7 +6876,7 @@ i would like to conclude by reading an email i got from one of them cindy the da
as i worked i couldnt help but think about the individuals and the stories represented in the images one in particular
a photo of women of all ages from grandmother to little girl gathered around a baby struck a chord because a similar photo from my family my grandmother and mother myself and newborn daughter hangs on our wall
across the globe throughout the ages our basic needs are just the same arent they thank you
i along with hundreds of other volunteers knew we couldnt just sit at home so i decided to join them for three weeks on may the thirteenth i made my way to the town of ōfunato its a small fishing town in iwate prefecture
i along with hundreds of other volunteers knew we couldnt just sit at home so i decided to join them for three weeks on may the thirteenth i made my way to the town of ofunato its a small fishing town in iwate prefecture
about fifty thousand people one of the first that was hit by the wave
the waters here have been recorded at reaching over twenty four meters in height
and traveled over two miles inland as you can imagine the town had been devastated
@@ -48368,7 +48368,7 @@ what if we thought of fear as an amazing act of the imagination something that c
its easiest to see this link between fear and the imagination in young children whose fears are often extraordinarily vivid
when i was a child i lived in california which is you know mostly a very nice place to live
but at a certain point most of us learn to leave these kinds of visions behind and grow up we learn that there are no monsters hiding under the bed and not every earthquake brings buildings down
but maybe its no coincidence that some of our most creative minds fail to leave these kinds of fears behind as adults the same incredible imaginations that produced the origin of species jane eyre and the remembrance of things past also generated intense worries that haunted the adult lives of charles darwin charlotte brontăť
but maybe its no coincidence that some of our most creative minds fail to leave these kinds of fears behind as adults the same incredible imaginations that produced the origin of species jane eyre and the remembrance of things past also generated intense worries that haunted the adult lives of charles darwin charlotte brontat
and marcel proust so the question is what can the rest of us learn
lets take a look at the fears that their imaginations were generating as they drifted in the middle of the pacific twenty four hours had now passed since the capsizing of the ship the time had come for the men to make a plan but they had very few options
in his fascinating account of the disaster nathaniel philbrick wrote that these men were just about as far from land as it was possible to be anywhere on earth
42 changes: 40 additions & 2 deletions native_client/BUILD
@@ -19,13 +19,13 @@ cc_library(
srcs = ["deepspeech_utils.cc",
"c_speech_features/c_speech_features.c",
"kiss_fft130/kiss_fft.c",
"kiss_fft130/tools/kiss_fftr.c"],
hdrs = ["deepspeech_utils.h",
"kiss_fft130/tools/kiss_fftr.c",
"c_speech_features/c_speech_features.h",
"c_speech_features/c_speech_features_config.h",
"kiss_fft130/kiss_fft.h",
"kiss_fft130/_kiss_fft_guts.h",
"kiss_fft130/tools/kiss_fftr.h"],
hdrs = ["deepspeech_utils.h"],
includes = ["c_speech_features",
"kiss_fft130"],

@@ -34,3 +34,41 @@
copts = [] + if_linux_x86_64(["-mno-fma", "-mno-avx", "-mno-avx2"]),
nocopts = "(-fstack-protector|-fno-omit-frame-pointer)",
)


cc_library(
name = "ctc_decoder_with_kenlm",
srcs = [
"beam_search.cc",
"alphabet.h",
"trie_node.h"
] +
glob(["kenlm/lm/*.cc", "kenlm/util/*.cc", "kenlm/util/double-conversion/*.cc",
"kenlm/lm/*.hh", "kenlm/util/*.hh", "kenlm/util/double-conversion/*.h"],
exclude = ["kenlm/*/*test.cc", "kenlm/*/*main.cc"]) +
glob(["boost_locale/**/*.hpp"]),
includes = ["kenlm", "boost_locale"],
copts = ["-std=c++11"],
defines = ["KENLM_MAX_ORDER=6"],
deps = ["//tensorflow/core:core",
"//tensorflow/core/util/ctc",
"//third_party/eigen3",
],
)

cc_binary(
name = "generate_trie",
srcs = [
"generate_trie.cpp",
"trie_node.h",
"alphabet.h",
] +
glob(["kenlm/lm/*.cc", "kenlm/util/*.cc", "kenlm/util/double-conversion/*.cc",
"kenlm/lm/*.hh", "kenlm/util/*.hh", "kenlm/util/double-conversion/*.h"],
exclude = ["kenlm/*/*test.cc", "kenlm/*/*main.cc"]) +
glob(["boost_locale/**/*.hpp"]),
includes = ["kenlm", "boost_locale"],
copts = ["-std=c++11"],
linkopts = ["-lm"],
defines = ["KENLM_MAX_ORDER=6"],
)
8 changes: 8 additions & 0 deletions native_client/alphabet.h
@@ -3,6 +3,7 @@

#include <cassert>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>

@@ -46,6 +47,7 @@ class Alphabet {
if (it != str_to_label_.end()) {
return it->second;
} else {
std::cerr << "Invalid label " << string << std::endl;
abort();
}
}
@@ -54,6 +56,12 @@
return size_;
}

bool IsSpace(unsigned int label) const {
//TODO: we should probably do something more i18n-aware here
const std::string& str = StringFromLabel(label);
return str.size() == 1 && str[0] == ' ';
}

private:
size_t size_;
std::unordered_map<unsigned int, std::string> label_to_str_;
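The new `IsSpace` helper is what lets the beam-search scorer detect word boundaries before querying KenLM. A Python analogue of the class (assuming the same one-entry-per-label alphabet format) looks like:

```python
class Alphabet:
    """Python analogue of native_client/alphabet.h: maps integer labels
    to strings and back, built from one alphabet entry per label."""

    def __init__(self, entries):
        self.label_to_str = dict(enumerate(entries))
        self.str_to_label = {s: i for i, s in self.label_to_str.items()}

    def string_from_label(self, label):
        return self.label_to_str[label]

    def label_from_string(self, string):
        # The C++ version prints the offending label and abort()s;
        # raising is the idiomatic Python equivalent.
        if string not in self.str_to_label:
            raise KeyError('Invalid label %r' % string)
        return self.str_to_label[string]

    def is_space(self, label):
        # Mirrors Alphabet::IsSpace: a single ASCII space marks a word
        # boundary (not i18n-aware, as the C++ TODO notes).
        s = self.string_from_label(label)
        return len(s) == 1 and s == ' '
```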