diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 3f61fe4d..ac31ec55 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -10,6 +10,7 @@ jobs:
         uses: actions/checkout@v3
         with:
           path: sphinxtrain
+          submodules: recursive
       - name: Install
         run: |
           sudo apt-get install libfst-dev libngram-dev cmake \
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..01e9a7b6
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "src/upstream/Phonetisaurus"]
+	path = src/upstream/Phonetisaurus
+	url = https://github.com/cmusphinx/Phonetisaurus.git
diff --git a/README.md b/README.md
index 5a9d23f3..ece59283 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,10 @@ command above, namely:
 
     cmake -S . -B build -DBUILD_G2P=ON
     
+You must also fetch the git submodules, e.g.:
+
+    git submodule update --init --recursive
+    
 You can also enable shared libraries with `-DBUILD_SHARED_LIBS=ON`,
 but I suggest that you *not* do that unless you have a very good
 reason.
diff --git a/include/sphinxbase/fixpoint.h b/include/sphinxbase/fixpoint.h
index 30b5cb20..46132593 100644
--- a/include/sphinxbase/fixpoint.h
+++ b/include/sphinxbase/fixpoint.h
@@ -55,6 +55,10 @@ extern "C" {
 }
 #endif
 
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
 #ifndef DEFAULT_RADIX
 #define DEFAULT_RADIX 12
 #endif
diff --git a/include/sphinxbase/prim_type.h b/include/sphinxbase/prim_type.h
index aebb1982..9fa12f32 100644
--- a/include/sphinxbase/prim_type.h
+++ b/include/sphinxbase/prim_type.h
@@ -85,6 +85,10 @@ extern "C" {
 } /* Fool Emacs into not indenting things. */
 #endif
 
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
 /* Define some things for VisualDSP++ */
 #if defined(__ADSPBLACKFIN__) && !defined(__GNUC__)
 # ifndef HAVE_LONG_LONG
diff --git a/scripts/0000.g2p_train/calculateER.py b/scripts/0000.g2p_train/calculateER.py
index b24f5abc..b0ef6e5a 100755
--- a/scripts/0000.g2p_train/calculateER.py
+++ b/scripts/0000.g2p_train/calculateER.py
@@ -253,12 +253,12 @@ def split_sequence( self, sequence, usep=" ", fsep="" ):
         if usep=="":
             sequences = [
                   [ unit for unit in list(seq) ] 
-                    for seq in re.split( r' {2,}', sequence )
+                    for seq in re.split( r'(?:\t| {2,})', sequence )
                 ]
         else:
             sequences = [ 
                   [ unit for unit in re.split( usep, seq ) ] 
-                    for seq in re.split( r' {2,}', sequence ) 
+                    for seq in re.split( r'(?:\t| {2,})', sequence ) 
                 ]
         if len(sequences)==1: 
             return sequences[0]
@@ -284,7 +284,7 @@ def compute_PER_phonetisaurus( self, hypfile, reffile, usep=" ", fsep="", verbos
         words = []; hyps = []; refs = []
         for i,line in enumerate(open(hypfile,"r")):
             #There should be three fields
-            word, score, pron = re.split(r' {2,}', line.strip())
+            word, score, pron = re.split(r'(?:\t|  +)', line.strip())
             phons = re.split(usep, pron)
             #This assumes that we will never have a test situation
             # where the input list intentionally contains 2 repetitions
@@ -350,7 +350,7 @@ def print_ER( self, totals ):
     parser.add_argument('--hyp', "-y", help="The file/string containing G2P/ASR hypotheses.", required=True )
     parser.add_argument('--ref', "-r", help="The file/string containing G2P/ASR reference transcriptions.", required=True )
     parser.add_argument('--usep',     "-u", help="Character or regex separating units in a sequence. Defaults to ' '.", required=False, default=" " )
-    parser.add_argument('--fsep',     "-s", help="Character or regex separating fields in a sequence. Defaults to '  '.", required=False, default="r' {2,}'" )
+    parser.add_argument('--fsep',     "-s", help="Character or regex separating fields in a sequence. Defaults to a tab or two or more spaces.", required=False, default=r"(?:\t| {2,})" )
     parser.add_argument('--format',   "-f", help="Input format.  One of 'cmu', 'htk', 'g2p'. Defaults to 'g2p'.", required=False, default="g2p" )
     parser.add_argument('--ignore',   "-i", help="Ignore specified characters when encountered in a HYPOTHESIS.  A ' ' separated list.", required=False, default="" )
     parser.add_argument('--regex_ignore',   "-x", help="Ignore specified characters when encountered in a HYPOTHESIS.  A regular expression.", required=False, default="" )
diff --git a/scripts/0000.g2p_train/evaluate.py b/scripts/0000.g2p_train/evaluate.py
index d9eabc61..f8a5e77f 100755
--- a/scripts/0000.g2p_train/evaluate.py
+++ b/scripts/0000.g2p_train/evaluate.py
@@ -83,12 +83,12 @@ def evaluate_testset(
     references = {}
     for entry in open(referencefile,"r"):
 #        parts = entry.strip().split("  ")
-        parts = re.split(r' {2,}', entry.strip())
+        parts = re.split(r'(?:\t| {2,})', entry.strip())
         word  = parts.pop(0)
         references[word] = parts
     for entry in open(hypothesisfile,"r"):
         #word, score, hypothesis = entry.strip().split("  ")
-        word, score, hypothesis = re.split(r' {2,}', entry.strip())
+        word, score, hypothesis = re.split(r'(?:\t| {2,})', entry.strip())
 
     PERcalculator = ErrorRater( ignore=ignore, ignore_both=ignore_both, regex_ignore=regex_ignore )
     PERcalculator.compute_PER_phonetisaurus( hypothesisfile, referencefile, verbose=verbose )
diff --git a/scripts/0000.g2p_train/g2p_train.pl b/scripts/0000.g2p_train/g2p_train.pl
index 12a70b23..74ef81f9 100755
--- a/scripts/0000.g2p_train/g2p_train.pl
+++ b/scripts/0000.g2p_train/g2p_train.pl
@@ -78,7 +78,10 @@
     s/\(\d+\)//;
     s/^\s*//;
     s/\s*$//;
-    s/\s+/ /g;
+    # Collapse every whitespace run (including tabs inside the pron) to one space
+    s/\s+/ /g;
+    # Use tab to separate word and pron to be consistent with MFA and Phonetisaurus
+    s/ /\t/;
     print OUTDICT "$_\n";
 }
 close INDICT or die $!;
diff --git a/src/programs/g2p_eval/CMakeLists.txt b/src/programs/g2p_eval/CMakeLists.txt
index 649e8259..2f662c67 100644
--- a/src/programs/g2p_eval/CMakeLists.txt
+++ b/src/programs/g2p_eval/CMakeLists.txt
@@ -1,9 +1,10 @@
 set(PROGRAM g2p_eval)
 set(SRCS
-main.cpp
-Phonetisaurus.cpp
-phonetisaurus-g2p.cpp
-${CMAKE_SOURCE_DIR}/src/programs/g2p_train/FstPathFinder.cpp
+main.c
+${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/lib/M2MFstAligner.cc
+${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/lib/LatticePruner.cc
+${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/lib/util.cc
+phonetisaurus-g2p.cc
   )
 
 add_executable(${PROGRAM} ${SRCS})
@@ -11,6 +12,8 @@ target_link_libraries(${PROGRAM} sphinxtrain ${FST} ${NGRAM})
 target_include_directories(
   ${PROGRAM} PRIVATE ${CMAKE_BINARY_DIR}
   ${PROGRAM} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  ${PROGRAM} PRIVATE ${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src
+  ${PROGRAM} PRIVATE ${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/3rdparty/utfcpp
   ${PROGRAM} PUBLIC ${CMAKE_SOURCE_DIR}/include
   ${PROGRAM} INTERFACE ${CMAKE_SOURCE_DIR}/include
   )
diff --git a/src/programs/g2p_eval/Phonetisaurus.cpp b/src/programs/g2p_eval/Phonetisaurus.cpp
deleted file mode 100644
index dbb81936..00000000
--- a/src/programs/g2p_eval/Phonetisaurus.cpp
+++ /dev/null
@@ -1,308 +0,0 @@
-/*
- *  Phonetisaurus.cpp
- *
- Copyright (c) [2012-], Josef Robert Novak
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
-  modification, are permitted #provided that the following conditions
-  are met:
-
-  * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-    copyright notice, this list of #conditions and the following
-    disclaimer in the documentation and/or other materials provided
-    with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
- INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <stdio.h>
-#include <string>
-#include <fst/fstlib.h>
-#include <iostream>
-#include <set>
-#include <algorithm>
-#include "../g2p_train/FstPathFinder.hpp"
-#include "Phonetisaurus.hpp"
-
-using namespace fst;
-
-Phonetisaurus::Phonetisaurus()
-{
-    //Default constructor
-}
-
-Phonetisaurus::Phonetisaurus(const char *_g2pmodel_file)
-{
-    //Base constructor.  Load the clusters file, the models and setup shop.
-    eps = "<eps>";
-    sb = "<s>";
-    se = "</s>";
-    skip = "_";
-
-    skipSeqs.insert(eps);
-    skipSeqs.insert(sb);
-    skipSeqs.insert(se);
-    skipSeqs.insert(skip);
-    skipSeqs.insert("-");
-
-    g2pmodel = StdVectorFst::Read(_g2pmodel_file);
-
-    isyms = (SymbolTable *) g2pmodel->InputSymbols();
-    tie = isyms->Find(1);       //The separator symbol is reserved for index 1
-
-    osyms = (SymbolTable *) g2pmodel->OutputSymbols();
-
-    loadClusters();
-
-    epsMapper = makeEpsMapper();
-
-    //We need make sure the g2pmodel is arcsorted
-    ILabelCompare<StdArc> icomp;
-    ArcSort(g2pmodel, icomp);
-}
-
-
-void
-Phonetisaurus::loadClusters()
-{
-    /*
-       Load the clusters file containing the list of
-       subsequences generated during multiple-to-multiple alignment
-     */
-
-    for (size_t i = 2; i < isyms->NumSymbols(); i++) {
-        string sym = isyms->Find(i);
-
-        if (sym.find(tie) != string::npos) {
-            char *tmpstring = (char *) sym.c_str();
-            char *p = strtok(tmpstring, tie.c_str());
-            vector <string> cluster;
-
-            while (p) {
-                cluster.push_back(p);
-                p = strtok(NULL, tie.c_str());
-            }
-
-            clusters[cluster] = i;
-        }
-    }
-    return;
-}
-
-StdVectorFst
-Phonetisaurus::makeEpsMapper()
-{
-    /*
-       Generate a mapper FST to transform unwanted output symbols
-       to the epsilon symbol.
-
-       This can be used to remove unwanted symbols from the final
-       result, but in tests was 7x slower than manual removal
-       via the FstPathFinder object.
-     */
-
-    StdVectorFst mfst;
-    mfst.AddState();
-    mfst.SetStart(0);
-
-    set <string>::iterator sit;
-    for (size_t i = 0; i < osyms->NumSymbols(); i++) {
-        string sym = osyms->Find(i);
-        sit = skipSeqs.find(sym);
-        if (sit != skipSeqs.end())
-            mfst.AddArc(0, StdArc(i, 0, 0, 0));
-        else
-            mfst.AddArc(0, StdArc(i, i, 0, 0));
-    }
-    mfst.SetFinal(0, 0);
-    ILabelCompare<StdArc> icomp;
-    ArcSort(&mfst, icomp);
-    mfst.SetInputSymbols(osyms);
-    mfst.SetOutputSymbols(osyms);
-
-    return mfst;
-}
-
-StdVectorFst
-Phonetisaurus::entryToFSA(vector <string> entry)
-{
-    /*
-       Transform an input spelling/pronunciation into an equivalent
-       FSA, adding extra arcs as needed to accomodate clusters.
-     */
-
-    StdVectorFst efst;
-    efst.AddState();
-    efst.SetStart(0);
-
-    efst.AddState();
-    efst.AddArc(0, StdArc(isyms->Find(sb), isyms->Find(sb), 0, 1));
-    size_t i = 0;
-
-    //Build the basic FSA
-    for (i = 0; i < entry.size(); i++) {
-        efst.AddState();
-        string ch = entry[i];
-        efst.AddArc(i + 1,
-                    StdArc(isyms->Find(ch), isyms->Find(ch), 0, i + 2));
-        if (i == 0)
-            continue;
-
-    }
-
-    //Add any cluster arcs
-    map<vector<string>,int>::iterator it_i;
-    for (it_i = clusters.begin(); it_i != clusters.end(); it_i++) {
-        vector<string>::iterator it_j;
-        vector<string>::iterator start = entry.begin();
-        vector<string> cluster = (*it_i).first;
-        while (it_j != entry.end()) {
-            it_j =
-                search(start, entry.end(), cluster.begin(), cluster.end());
-            if (it_j != entry.end()) {
-                efst.AddArc(it_j - entry.begin() + 1, StdArc((*it_i).second,    //input symbol
-                            (*it_i).second,    //output symbol
-                            0, //weight
-                            it_j - entry.begin() + cluster.size() + 1  //destination state
-                                                            ));
-                start = it_j + cluster.size();
-            }
-        }
-    }
-
-    efst.AddState();
-    efst.AddArc(i + 1, StdArc(isyms->Find(se), isyms->Find(se), 0, i + 2));
-    efst.SetFinal(i + 2, 0);
-    efst.SetInputSymbols(isyms);
-    efst.SetOutputSymbols(isyms);
-    return efst;
-}
-
-vector<PathData> Phonetisaurus::phoneticize(vector <string> entry,
-        int nbest, int beam)
-{
-    /*
-       Generate pronunciation/spelling hypotheses for an
-       input entry.
-     */
-    StdVectorFst
-    result;
-    StdVectorFst
-    epsMapped;
-    StdVectorFst
-    shortest;
-    StdVectorFst
-    efst = entryToFSA(entry);
-    StdVectorFst
-    smbr;
-    Compose(efst, *g2pmodel, &result);
-
-    Project(&result, PROJECT_OUTPUT);
-    if (nbest > 1) {
-        //This is a cheesy hack.
-        ShortestPath(result, &shortest, beam);
-    }
-    else {
-        ShortestPath(result, &shortest, 1);
-    }
-    RmEpsilon(&shortest);
-    FstPathFinder
-    pathfinder(skipSeqs);
-    pathfinder.findAllStrings(shortest);
-
-    return pathfinder.paths;
-}
-
-void
-printPath(PathData * path, string onepath, int k, ofstream * hypfile,
-          string correct, string word, bool output_cost)
-{
-    if (word != "") {
-        if (k != 0) {
-            *hypfile << word << "(" << (k + 1) << ")" << "  ";
-        }
-        else {
-            *hypfile << word << "  ";
-        }
-    }
-    if (output_cost) {
-        if (path) {
-            *hypfile << path->pathcost << "  " << onepath;
-        }
-        else {
-            *hypfile << "999.999  " << onepath;
-        }
-    }
-    else {
-        *hypfile << onepath;
-    }
-    if (correct != "")
-        *hypfile << "  " << correct;
-    *hypfile << "\n";
-}
-
-bool
-Phonetisaurus::printPaths(vector<PathData> paths, int nbest,
-                          ofstream * hypfile, string correct, string word,
-                          bool output_cost)
-{
-    /*
-       Convenience function to print out a path vector.
-       Will print only the first N unique entries.
-     */
-
-    set <string> seen;
-    set <string>::iterator sit;
-
-    int numseen = 0;
-    string onepath;
-    size_t k;
-    bool empty_path = true;
-    for (k = 0; k < paths.size(); k++) {
-        if (k >= nbest)
-            break;
-
-        size_t j;
-        for (j = 0; j < paths[k].path.size(); j++) {
-            if (paths[k].path[j] != tie)
-                replace(paths[k].path[j].begin(),
-                        paths[k].path[j].end(), *tie.c_str(), ' ');
-            onepath += paths[k].path[j];
-
-            if (j != paths[k].path.size() - 1)
-                onepath += " ";
-        }
-        if (onepath == "") {
-            continue;
-        }
-        empty_path = false;
-        printPath(&paths[k], onepath, k, hypfile, correct, word,
-                  output_cost);
-        onepath = "";
-    }
-    if (empty_path) {
-        if (k == 0) {
-            printPath(NULL, "-", 0, hypfile, correct, word, output_cost);
-        }
-        else {
-            printPath(&paths[0], "-", 0, hypfile, correct, word,
-                      output_cost);
-        }
-    }
-
-    return empty_path;
-}
diff --git a/src/programs/g2p_eval/Phonetisaurus.hpp b/src/programs/g2p_eval/Phonetisaurus.hpp
deleted file mode 100644
index 45d5939f..00000000
--- a/src/programs/g2p_eval/Phonetisaurus.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- Phonetisaurus.hpp
-
- Copyright (c) [2012-], Josef Robert Novak
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
-  modification, are permitted #provided that the following conditions
-  are met:
-
-  * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-    copyright notice, this list of #conditions and the following
-    disclaimer in the documentation and/or other materials provided
-    with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
- INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- OF THE POSSIBILITY OF SUCH DAMAGE.
-*
-*/
-#ifndef PHONETISAURUS_H
-#define PHONETISAURUS_H
-#include <fst/fstlib.h>
-#include "../g2p_train/FstPathFinder.hpp"
-using namespace fst;
-typedef PhiMatcher<SortedMatcher<Fst<StdArc> > > PM;
-
-class Phonetisaurus {
-    /*
-       Load a G2P/P2G model and generate pronunciation/spelling
-       hypotheses for input items.
-     */
-public:
-    //Basics
-    string eps;
-    string se;
-    string sb;
-    string skip;
-    string tie;
-    set<string> skipSeqs;
-    map<vector<string>,int> clusters;
-    //FST stuff
-    StdVectorFst *g2pmodel;
-    StdVectorFst epsMapper;
-    SymbolTable *isyms;
-    SymbolTable *osyms;
-
-    Phonetisaurus();
-
-    Phonetisaurus(const char *_g2pmodel_file);
-
-    StdVectorFst entryToFSA(vector<string> entry);
-
-    StdVectorFst makeEpsMapper();
-
-    vector<PathData>  phoneticize(vector<string> entry, int nbest,
-                                    int beam = 500);
-
-    bool printPaths(vector<PathData> paths, int nbest,
-                    ofstream * hypfile, string correct = "", string word =
-                        "", bool output_cost = true);
-
-private:
-    void loadClusters();
-};
-
-#endif                          // PHONETISAURUS_H //
diff --git a/src/programs/g2p_eval/main.cpp b/src/programs/g2p_eval/main.c
similarity index 77%
rename from src/programs/g2p_eval/main.cpp
rename to src/programs/g2p_eval/main.c
index b05e00f3..ff954806 100644
--- a/src/programs/g2p_eval/main.cpp
+++ b/src/programs/g2p_eval/main.c
@@ -1,9 +1,7 @@
+#include "phonetisaurus-g2p.h"
 #include <sphinxbase/cmd_ln.h>
-#include "phonetisaurus-g2p.hpp"
 
 
-using namespace std;
-
 const char helpstr[] =
     "Usage: g2p_eval -model MODEL -input INPUT [-output OUTPUT] [-isfile] [-output_cost] \n\
 		               [-nbest NBEST] [-beam BEAM] [-sep SEP] [-words] \n\
@@ -53,25 +51,23 @@ main(int argc, char *argv[])
         printf("%s\n\n", helpstr);
     }
 
-    string model = cmd_ln_str("-model");
-    string input = cmd_ln_str("-input");
-    string output = cmd_ln_str("-output");
-    bool output_cost = cmd_ln_boolean("-output_cost");
-    bool isfile = cmd_ln_boolean("-isfile");
+    const char * model = cmd_ln_str("-model");
+    const char * input = cmd_ln_str("-input");
+    const char * output = cmd_ln_str("-output");
+    int output_cost = cmd_ln_boolean("-output_cost");
+    int isfile = cmd_ln_boolean("-isfile");
     int nbest = cmd_ln_int32("-nbest");
     int beam = cmd_ln_int32("-beam");
-    string sep = cmd_ln_str("-sep");
-    bool words = cmd_ln_boolean("-words");
+    const char * sep = cmd_ln_str("-sep");
+    int words = cmd_ln_boolean("-words");
 
     if (isfile) {
-        //If its a file, go for it
-        phoneticizeTestSet(model.c_str(), output.c_str(), input, nbest,
+        phoneticizeTestSet(model, output, input, nbest,
                            sep, beam, words, output_cost);
     }
     else {
-        //Otherwise we just have a word
-        phoneticizeWord(model.c_str(), output.c_str(), input, nbest, sep,
-                        beam, words);
+        phoneticizeWord(model, output, input, nbest, sep,
+                        beam, words, output_cost);
     }
 
     return 0;
diff --git a/src/programs/g2p_eval/phonetisaurus-g2p.cc b/src/programs/g2p_eval/phonetisaurus-g2p.cc
new file mode 100644
index 00000000..3d4b3231
--- /dev/null
+++ b/src/programs/g2p_eval/phonetisaurus-g2p.cc
@@ -0,0 +1,194 @@
+/*
+ phonetisaurus-g2p.cc
+
+ Copyright (c) [2012-], Josef Robert Novak
+ All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted #provided that the following conditions
+   are met:
+
+   * Redistributions of source code must retain the above copyright 
+     notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above 
+     copyright notice, this list of #conditions and the following 
+     disclaimer in the documentation and/or other materials provided 
+     with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
+   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 
+   OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+// NOTE(review): this using-directive precedes the includes, presumably
+// because the upstream Phonetisaurus headers rely on unqualified std
+// names -- confirm before moving it.
+using namespace std;
+#include <include/PhonetisaurusScript.h>
+#include <include/util.h>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <fst/fstlib.h>
+using namespace fst;
+
+#include "phonetisaurus-g2p.h"
+
+typedef unordered_map<int, vector<PathData> > RMAP;
+
+/**
+ * Write every hypothesis in `results` to `output`, one per line.
+ *
+ * Fields are tab-separated: the word (only if `output_words`), the score
+ * (only if `print_scores`), then the output symbols of the path joined
+ * by single spaces.
+ *
+ * @param output       Stream that receives the hypotheses.
+ * @param results      Hypotheses to print, in the order given.
+ * @param FLAGS_word   Input word printed in the first field.
+ * @param osyms        Symbol table mapping output labels to strings.
+ * @param print_scores Print the path weight before the symbols.
+ * @param nlog_probs   If true print PathWeight unchanged, otherwise
+ *                     print exp(-PathWeight) with a precision of 3.
+ * @param output_words Prefix each line with `FLAGS_word`.
+ */
+void PrintPathData(ofstream &output,
+                   const vector<PathData>& results, string FLAGS_word,
+                   const SymbolTable* osyms, bool print_scores,
+                   bool nlog_probs, bool output_words) {
+    // Index with size_t: size() is unsigned, so int would mix signedness.
+    for (size_t i = 0; i < results.size(); i++) {
+        if (output_words)
+            output << FLAGS_word << "\t";
+        if (print_scores) {
+            if (nlog_probs)
+                output << results[i].PathWeight << "\t";
+            else
+                output << std::setprecision (3) << exp(-results[i].PathWeight) << "\t";
+        }
+
+        for (size_t j = 0; j < results[i].Uniques.size(); j++) {
+            // The separator goes before every symbol except the first.
+            if (j > 0)
+                output << " ";
+            output << osyms->Find(results[i].Uniques[j]);
+        }
+        output << endl;
+    }
+}
+
+/**
+ * Open `path` for writing on `hypfile`.
+ *
+ * @return true on success; false, after reporting on stderr, when
+ *         `path` is NULL or the file cannot be opened.
+ */
+static bool OpenHypFile(ofstream &hypfile, const char *path) {
+    if (path == NULL) {
+        cerr << "No output file specified" << endl;
+        return false;
+    }
+    hypfile.open(path);
+    if (!hypfile.is_open()) {
+        cerr << "Problem opening output file " << path << endl;
+        return false;
+    }
+    return true;
+}
+
+/**
+ * Decode every entry of a word list and write the hypotheses to a file.
+ *
+ * @param g2pmodel_file Model path passed to PhonetisaurusScript.
+ * @param output        Path of the file the hypotheses are written to.
+ * @param testset_file  Path of the word list read by LoadWordList().
+ * @param nbest         Forwarded to Phoneticize().
+ * @param sep           Separator passed to PhonetisaurusScript.
+ * @param beam          Forwarded to Phoneticize().
+ * @param output_words  Non-zero to prefix each line with its entry.
+ * @param output_cost   Non-zero to print the path weight.
+ */
+extern "C"
+void
+phoneticizeTestSet(const char *g2pmodel_file, const char *output,
+                   const char *testset_file, int nbest, const char *sep,
+                   int beam, int output_words, int output_cost)
+{
+    // Check the word list before the model is loaded.  NOTE(review):
+    // LoadWordList() is not visible here; the probe keeps the diagnostic
+    // the previous implementation printed for an unreadable test file.
+    ifstream probe;
+    if (testset_file != NULL)
+        probe.open(testset_file);
+    if (!probe.is_open()) {
+        cerr << "Problem opening test file..." << endl;
+        return;
+    }
+    probe.close();
+
+    PhonetisaurusScript decoder(g2pmodel_file, sep);
+
+    vector<string> corpus;
+    bool write_fsts = false;
+    bool accumulate = false;
+    // NOTE(review): phoneticizeWord() passes 0.0 where this passes pmass;
+    // confirm which value is intended.
+    double pmass = 99.0;
+    // NOTE(review): the previous implementation split each line on TAB
+    // and decoded only the first field; confirm that the entries
+    // LoadWordList() returns are bare words.
+    LoadWordList(testset_file, &corpus);
+    ofstream hypfile;
+    if (!OpenHypFile(hypfile, output))
+        return;
+    for (size_t i = 0; i < corpus.size(); i++) {
+        vector<PathData> results = decoder.Phoneticize(corpus[i], nbest,
+                                                       beam, 99.0,
+                                                       write_fsts,
+                                                       accumulate, pmass);
+        PrintPathData(hypfile, results, corpus[i],
+                      decoder.osyms_, output_cost, true, output_words);
+    }
+}
+
+/**
+ * Decode a single word and write the hypotheses to a file.
+ *
+ * @param g2pmodel_file Model path passed to PhonetisaurusScript.
+ * @param output        Path of the file the hypotheses are written to.
+ * @param testword      Word passed to Phoneticize().
+ * @param nbest         Forwarded to Phoneticize().
+ * @param sep           Separator passed to PhonetisaurusScript.
+ * @param beam          Forwarded to Phoneticize().
+ * @param output_words  Non-zero to prefix each line with `testword`.
+ * @param output_cost   Non-zero to print the path weight.
+ */
+extern "C"
+void
+phoneticizeWord(const char *g2pmodel_file, const char *output,
+                const char *testword, int nbest, const char *sep, int beam,
+                int output_words, int output_cost)
+{
+    PhonetisaurusScript decoder(g2pmodel_file, sep);
+    bool write_fsts = false;
+    bool accumulate = false;
+    // NOTE(review): phoneticizeTestSet() passes 99.0 as the last argument
+    // where this passes 0.0; confirm which value is intended.
+    vector<PathData> results = decoder.Phoneticize(testword, nbest,
+                                                   beam, 99.0, write_fsts, accumulate, 0.0);
+    ofstream hypfile;
+    if (!OpenHypFile(hypfile, output))
+        return;
+
+    PrintPathData(hypfile, results, testword,
+                  decoder.osyms_, output_cost, true, output_words);
+}
diff --git a/src/programs/g2p_eval/phonetisaurus-g2p.cpp b/src/programs/g2p_eval/phonetisaurus-g2p.cpp
deleted file mode 100644
index 6e21627a..00000000
--- a/src/programs/g2p_eval/phonetisaurus-g2p.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- Copyright (c) [2012-], Josef Robert Novak
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted #provided that the following conditions
- are met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above
- copyright notice, this list of #conditions and the following
- disclaimer in the documentation and/or other materials provided
- with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
- INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <iostream>
-#include "Phonetisaurus.hpp"
-#include "util.hpp"
-
-using namespace fst;
-
-void
-phoneticizeWord(const char *g2pmodel_file, const char *output,
-                string testword, int nbest, string sep, int beam = 500,
-                int output_words = 0)
-{
-
-    Phonetisaurus phonetisaurus(g2pmodel_file);
-
-    vector <string> entry =
-        tokenize_entry(&testword, &sep, phonetisaurus.isyms);
-
-    vector<PathData> paths =
-        phonetisaurus.phoneticize(entry, nbest, beam);
-    ofstream hypfile;
-    hypfile.open(output);
-
-    if (output_words == 0) {
-        while (phonetisaurus.printPaths(paths, nbest, &hypfile) == true
-                && nbest <= paths.size()) {
-            nbest++;
-            paths = phonetisaurus.phoneticize(entry, nbest, beam);
-        }
-    }
-    else {
-        while (phonetisaurus.
-                printPaths(paths, nbest, &hypfile, "", testword)
-                == true && nbest <= paths.size()) {
-            nbest++;
-            paths = phonetisaurus.phoneticize(entry, nbest, beam);
-        }
-    }
-    hypfile.flush();
-    hypfile.close();
-
-    return;
-}
-
-void
-phoneticizeTestSet(const char *g2pmodel_file, const char *output,
-                   string testset_file, int nbest, string sep, int beam =
-                       500, int output_words = 0, bool output_cost = true)
-{
-
-    Phonetisaurus phonetisaurus(g2pmodel_file);
-
-    ifstream test_fp;
-    test_fp.open(testset_file.c_str());
-    string line;
-
-    if (test_fp.is_open()) {
-        ofstream hypfile;
-        hypfile.open(output);
-        while (test_fp.good()) {
-            getline(test_fp, line);
-            if (line.compare("") == 0)
-                continue;
-
-            char *tmpstring = (char *) line.c_str();
-            char *p = strtok(tmpstring, "\t");
-            string word;
-            string pron;
-
-            int i = 0;
-            while (p) {
-                if (i == 0)
-                    word = p;
-                else
-                    pron = p;
-                i++;
-                p = strtok(NULL, "\t");
-            }
-
-            vector <string> entry = tokenize_entry(&word, &sep,
-                                                   phonetisaurus.isyms);
-            vector<PathData> paths =
-                phonetisaurus.phoneticize(entry, nbest, beam);
-            int nbest_new = nbest;
-            if (output_words == 0) {
-                while (phonetisaurus.
-                        printPaths(paths, nbest_new, &hypfile, output,
-                                   pron) == true
-                        && nbest_new <= paths.size()) {
-                    nbest_new++;
-                    paths =
-                        phonetisaurus.phoneticize(entry, nbest_new, beam);
-                }
-            }
-            else {
-                while (phonetisaurus.
-                        printPaths(paths, nbest_new, &hypfile, pron, word,
-                                   output_cost) == true
-                        && nbest_new <= paths.size()) {
-                    nbest_new++;
-                    paths =
-                        phonetisaurus.phoneticize(entry, nbest_new, beam);
-                }
-            }
-        }
-        test_fp.close();
-        hypfile.flush();
-        hypfile.close();
-    }
-    else {
-        cout << "Problem opening test file..." << endl;
-    }
-
-    return;
-}
diff --git a/src/programs/g2p_eval/phonetisaurus-g2p.h b/src/programs/g2p_eval/phonetisaurus-g2p.h
new file mode 100644
index 00000000..b6c3d2d6
--- /dev/null
+++ b/src/programs/g2p_eval/phonetisaurus-g2p.h
@@ -0,0 +1,53 @@
+/*
+ phonetisaurus-g2p.h
+
+ Copyright (c) [2012-], Josef Robert Novak
+ All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   * Redistributions of source code must retain the above copyright 
+     notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above 
+     copyright notice, this list of conditions and the following 
+     disclaimer in the documentation and/or other materials provided 
+     with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
+   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 
+   OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#ifndef __PHONETISAURUS_G2PFST_H__
+#define __PHONETISAURUS_G2PFST_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if 0
+} /* Fool Emacs into not indenting things. */
+#endif
+
+void phoneticizeWord(const char *g2pmodel_file, const char *output,
+                     const char *testword, int nbest, const char *sep, int beam,
+                     int output_words, int output_cost);
+void phoneticizeTestSet(const char *g2pmodel_file, const char *output,
+                        const char *testset_file, int nbest, const char *sep,
+                        int beam, int output_words, int output_cost);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __PHONETISAURUS_G2PFST_H__ */
diff --git a/src/programs/g2p_eval/phonetisaurus-g2p.hpp b/src/programs/g2p_eval/phonetisaurus-g2p.hpp
deleted file mode 100644
index ad8be85c..00000000
--- a/src/programs/g2p_eval/phonetisaurus-g2p.hpp
+++ /dev/null
@@ -1,11 +0,0 @@
-#include <string>
-
-using namespace std;
-
-void phoneticizeWord(const char *g2pmodel_file, const char *output,
-                     string testword, int nbest, string sep, int beam =
-                         500, int output_words = 0);
-void phoneticizeTestSet(const char *g2pmodel_file, const char *output,
-                        string testset_file, int nbest, string sep,
-                        int beam = 500, int output_words =
-                            0, bool output_cost = true);
diff --git a/src/programs/g2p_eval/utf8.h b/src/programs/g2p_eval/utf8.h
deleted file mode 100644
index 4e445140..00000000
--- a/src/programs/g2p_eval/utf8.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright 2006 Nemanja Trifunovic
-
-/*
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-
-#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
-#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
-
-#include "utf8/checked.h"
-#include "utf8/unchecked.h"
-
-#endif // header guard
diff --git a/src/programs/g2p_eval/utf8/checked.h b/src/programs/g2p_eval/utf8/checked.h
deleted file mode 100644
index 9cb8d2c7..00000000
--- a/src/programs/g2p_eval/utf8/checked.h
+++ /dev/null
@@ -1,327 +0,0 @@
-// Copyright 2006 Nemanja Trifunovic
-
-/*
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-
-#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
-#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
-
-#include "core.h"
-#include <stdexcept>
-
-namespace utf8
-{
-    // Base for the exceptions that may be thrown from the library
-    class exception : public std::exception {
-    };
-
-    // Exceptions that may be thrown from the library functions.
-    class invalid_code_point : public exception {
-        uint32_t cp;
-    public:
-        invalid_code_point(uint32_t cp) : cp(cp) {}
-        virtual const char* what() const throw() { return "Invalid code point"; }
-        uint32_t code_point() const {return cp;}
-    };
-
-    class invalid_utf8 : public exception {
-        uint8_t u8;
-    public:
-        invalid_utf8 (uint8_t u) : u8(u) {}
-        virtual const char* what() const throw() { return "Invalid UTF-8"; }
-        uint8_t utf8_octet() const {return u8;}
-    };
-
-    class invalid_utf16 : public exception {
-        uint16_t u16;
-    public:
-        invalid_utf16 (uint16_t u) : u16(u) {}
-        virtual const char* what() const throw() { return "Invalid UTF-16"; }
-        uint16_t utf16_word() const {return u16;}
-    };
-
-    class not_enough_room : public exception {
-    public:
-        virtual const char* what() const throw() { return "Not enough space"; }
-    };
-
-    /// The library API - functions intended to be called by the users
-
-    template <typename octet_iterator, typename output_iterator>
-    output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
-    {
-        while (start != end) {
-            octet_iterator sequence_start = start;
-            internal::utf_error err_code = internal::validate_next(start, end);
-            switch (err_code) {
-                case internal::UTF8_OK :
-                    for (octet_iterator it = sequence_start; it != start; ++it)
-                        *out++ = *it;
-                    break;
-                case internal::NOT_ENOUGH_ROOM:
-                    throw not_enough_room();
-                case internal::INVALID_LEAD:
-                    append (replacement, out);
-                    ++start;
-                    break;
-                case internal::INCOMPLETE_SEQUENCE:
-                case internal::OVERLONG_SEQUENCE:
-                case internal::INVALID_CODE_POINT:
-                    append (replacement, out);
-                    ++start;
-                    // just one replacement mark for the sequence
-                    while (internal::is_trail(*start) && start != end)
-                        ++start;
-                    break;
-            }
-        }
-        return out;
-    }
-
-    template <typename octet_iterator, typename output_iterator>
-    inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
-    {
-        static const uint32_t replacement_marker = internal::mask16(0xfffd);
-        return replace_invalid(start, end, out, replacement_marker);
-    }
-
-    template <typename octet_iterator>
-    octet_iterator append(uint32_t cp, octet_iterator result)
-    {
-        if (!internal::is_code_point_valid(cp))
-            throw invalid_code_point(cp);
-
-        if (cp < 0x80)                        // one octet
-            *(result++) = static_cast<uint8_t>(cp);
-        else if (cp < 0x800) {                // two octets
-            *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
-            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
-        }
-        else if (cp < 0x10000) {              // three octets
-            *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
-            *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
-            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
-        }
-        else {      // four octets
-            *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
-            *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)  | 0x80);
-            *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
-            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
-        }
-        return result;
-    }
-
-    template <typename octet_iterator>
-    uint32_t next(octet_iterator& it, octet_iterator end)
-    {
-        uint32_t cp = 0;
-        internal::utf_error err_code = internal::validate_next(it, end, &cp);
-        switch (err_code) {
-            case internal::UTF8_OK :
-                break;
-            case internal::NOT_ENOUGH_ROOM :
-                throw not_enough_room();
-            case internal::INVALID_LEAD :
-            case internal::INCOMPLETE_SEQUENCE :
-            case internal::OVERLONG_SEQUENCE :
-                throw invalid_utf8(*it);
-            case internal::INVALID_CODE_POINT :
-                throw invalid_code_point(cp);
-        }
-        return cp;
-    }
-
-    template <typename octet_iterator>
-    uint32_t peek_next(octet_iterator it, octet_iterator end)
-    {
-        return next(it, end);
-    }
-
-    template <typename octet_iterator>
-    uint32_t prior(octet_iterator& it, octet_iterator start)
-    {
-        // can't do much if it == start
-        if (it == start)
-            throw not_enough_room();
-
-        octet_iterator end = it;
-        // Go back until we hit either a lead octet or start
-        while (internal::is_trail(*(--it)))
-            if (it == start)
-                throw invalid_utf8(*it); // error - no lead byte in the sequence
-        return peek_next(it, end);
-    }
-
-    /// Deprecated in versions that include "prior"
-    template <typename octet_iterator>
-    uint32_t previous(octet_iterator& it, octet_iterator pass_start)
-    {
-        octet_iterator end = it;
-        while (internal::is_trail(*(--it)))
-            if (it == pass_start)
-                throw invalid_utf8(*it); // error - no lead byte in the sequence
-        octet_iterator temp = it;
-        return next(temp, end);
-    }
-
-    template <typename octet_iterator, typename distance_type>
-    void advance (octet_iterator& it, distance_type n, octet_iterator end)
-    {
-        for (distance_type i = 0; i < n; ++i)
-            next(it, end);
-    }
-
-    template <typename octet_iterator>
-    typename std::iterator_traits<octet_iterator>::difference_type
-    distance (octet_iterator first, octet_iterator last)
-    {
-        typename std::iterator_traits<octet_iterator>::difference_type dist;
-        for (dist = 0; first < last; ++dist)
-            next(first, last);
-        return dist;
-    }
-
-    template <typename u16bit_iterator, typename octet_iterator>
-    octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
-    {
-        while (start != end) {
-            uint32_t cp = internal::mask16(*start++);
-            // Take care of surrogate pairs first
-            if (internal::is_lead_surrogate(cp)) {
-                if (start != end) {
-                    uint32_t trail_surrogate = internal::mask16(*start++);
-                    if (internal::is_trail_surrogate(trail_surrogate))
-                        cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
-                    else
-                        throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
-                }
-                else
-                    throw invalid_utf16(static_cast<uint16_t>(cp));
-
-            }
-            // Lone trail surrogate
-            else if (internal::is_trail_surrogate(cp))
-                throw invalid_utf16(static_cast<uint16_t>(cp));
-
-            result = append(cp, result);
-        }
-        return result;
-    }
-
-    template <typename u16bit_iterator, typename octet_iterator>
-    u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
-    {
-        while (start != end) {
-            uint32_t cp = next(start, end);
-            if (cp > 0xffff) { //make a surrogate pair
-                *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
-                *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
-            }
-            else
-                *result++ = static_cast<uint16_t>(cp);
-        }
-        return result;
-    }
-
-    template <typename octet_iterator, typename u32bit_iterator>
-    octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
-    {
-        while (start != end)
-            result = append(*(start++), result);
-
-        return result;
-    }
-
-    template <typename octet_iterator, typename u32bit_iterator>
-    u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
-    {
-        while (start != end)
-            (*result++) = next(start, end);
-
-        return result;
-    }
-
-    // The iterator class
-    template <typename octet_iterator>
-    class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
-      octet_iterator it;
-      octet_iterator range_start;
-      octet_iterator range_end;
-      public:
-      iterator () {};
-      explicit iterator (const octet_iterator& octet_it,
-                         const octet_iterator& range_start,
-                         const octet_iterator& range_end) :
-               it(octet_it), range_start(range_start), range_end(range_end)
-      {
-          if (it < range_start || it > range_end)
-              throw std::out_of_range("Invalid utf-8 iterator position");
-      }
-      // the default "big three" are OK
-      octet_iterator base () const { return it; }
-      uint32_t operator * () const
-      {
-          octet_iterator temp = it;
-          return next(temp, range_end);
-      }
-      bool operator == (const iterator& rhs) const
-      {
-          if (range_start != rhs.range_start || range_end != rhs.range_end)
-              throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
-          return (it == rhs.it);
-      }
-      bool operator != (const iterator& rhs) const
-      {
-          return !(operator == (rhs));
-      }
-      iterator& operator ++ ()
-      {
-          next(it, range_end);
-          return *this;
-      }
-      iterator operator ++ (int)
-      {
-          iterator temp = *this;
-          next(it, range_end);
-          return temp;
-      }
-      iterator& operator -- ()
-      {
-          prior(it, range_start);
-          return *this;
-      }
-      iterator operator -- (int)
-      {
-          iterator temp = *this;
-          prior(it, range_start);
-          return temp;
-      }
-    }; // class iterator
-
-} // namespace utf8
-
-#endif //header guard
-
-
diff --git a/src/programs/g2p_eval/utf8/core.h b/src/programs/g2p_eval/utf8/core.h
deleted file mode 100755
index 268cf7cd..00000000
--- a/src/programs/g2p_eval/utf8/core.h
+++ /dev/null
@@ -1,358 +0,0 @@
-// Copyright 2006 Nemanja Trifunovic
-
-/*
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-
-#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
-#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
-
-#include <iterator>
-
-namespace utf8
-{
-    // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
-    // You may need to change them to match your system.
-    // These typedefs have the same names as ones from cstdint, or boost/cstdint
-    typedef unsigned char   uint8_t;
-    typedef unsigned short  uint16_t;
-    typedef unsigned int    uint32_t;
-
-// Helper code - not intended to be directly called by the library users. May be changed at any time
-namespace internal
-{
-    // Unicode constants
-    // Leading (high) surrogates: 0xd800 - 0xdbff
-    // Trailing (low) surrogates: 0xdc00 - 0xdfff
-    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
-    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
-    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
-    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
-    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
-    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;
-
-    // Maximum valid value for a Unicode code point
-    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;
-
-    template<typename octet_type>
-    inline uint8_t mask8(octet_type oc)
-    {
-        return static_cast<uint8_t>(0xff & oc);
-    }
-    template<typename u16_type>
-    inline uint16_t mask16(u16_type oc)
-    {
-        return static_cast<uint16_t>(0xffff & oc);
-    }
-    template<typename octet_type>
-    inline bool is_trail(octet_type oc)
-    {
-        return ((mask8(oc) >> 6) == 0x2);
-    }
-
-    template <typename u16>
-    inline bool is_lead_surrogate(u16 cp)
-    {
-        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
-    }
-
-    template <typename u16>
-    inline bool is_trail_surrogate(u16 cp)
-    {
-        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
-    }
-
-    template <typename u16>
-    inline bool is_surrogate(u16 cp)
-    {
-        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
-    }
-
-    template <typename u32>
-    inline bool is_code_point_valid(u32 cp)
-    {
-        return (cp <= CODE_POINT_MAX && !is_surrogate(cp));
-    }
-
-    template <typename octet_iterator>
-    inline typename std::iterator_traits<octet_iterator>::difference_type
-    sequence_length(octet_iterator lead_it)
-    {
-        uint8_t lead = mask8(*lead_it);
-        if (lead < 0x80)
-            return 1;
-        else if ((lead >> 5) == 0x6)
-            return 2;
-        else if ((lead >> 4) == 0xe)
-            return 3;
-        else if ((lead >> 3) == 0x1e)
-            return 4;
-        else
-            return 0;
-    }
-
-    template <typename octet_difference_type>
-    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
-    {
-        if (cp < 0x80) {
-            if (length != 1) 
-                return true;
-        }
-        else if (cp < 0x800) {
-            if (length != 2) 
-                return true;
-        }
-        else if (cp < 0x10000) {
-            if (length != 3) 
-                return true;
-        }
-
-        return false;
-    }
-
-    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
-
-    /// get_sequence_x functions decode utf-8 sequences of the length x
-
-    template <typename octet_iterator>
-    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
-    {
-        if (it != end) {
-            if (code_point)
-                *code_point = mask8(*it);
-            return UTF8_OK;
-        }
-        return NOT_ENOUGH_ROOM;
-    }
-
-    template <typename octet_iterator>
-    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
-    {
-        utf_error ret_code = NOT_ENOUGH_ROOM;
-
-        if (it != end) {
-            uint32_t cp = mask8(*it);
-            if (++it != end) {
-                if (is_trail(*it)) {
-                    cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
-
-                    if (code_point)
-                        *code_point = cp;
-                    ret_code = UTF8_OK;
-                }
-                else
-                    ret_code = INCOMPLETE_SEQUENCE;
-            }
-            else
-                ret_code = NOT_ENOUGH_ROOM;
-        }
-
-        return ret_code;
-    }
-
-    template <typename octet_iterator>
-    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
-    {
-        utf_error ret_code = NOT_ENOUGH_ROOM;
-
-        if (it != end) {
-            uint32_t cp = mask8(*it);
-            if (++it != end) {
-                if (is_trail(*it)) {
-                    cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
-                    if (++it != end) {
-                        if (is_trail(*it)) {
-                            cp += (*it) & 0x3f;
-
-                            if (code_point)
-                                *code_point = cp;
-                            ret_code = UTF8_OK;
-                        }
-                        else 
-                            ret_code = INCOMPLETE_SEQUENCE;
-                    }
-                    else
-                        ret_code = NOT_ENOUGH_ROOM;
-                }
-                else
-                    ret_code = INCOMPLETE_SEQUENCE;
-            }
-            else
-                ret_code = NOT_ENOUGH_ROOM;
-        }
-
-        return ret_code;
-    }
-
-    template <typename octet_iterator>
-    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
-    {
-        utf_error ret_code = NOT_ENOUGH_ROOM;
-
-        if (it != end) {
-            uint32_t cp = mask8(*it);
-            if (++it != end) {
-                if (is_trail(*it)) {
-                    cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
-                    if (++it != end) {
-                        if (is_trail(*it)) {
-                            cp += (mask8(*it) << 6) & 0xfff;
-                            if (++it != end) {
-                                if (is_trail(*it)) {
-                                    cp += (*it) & 0x3f;
-
-                                    if (code_point)
-                                        *code_point = cp;
-                                    ret_code = UTF8_OK;
-                                }
-                                else
-                                    ret_code = INCOMPLETE_SEQUENCE;
-                            }
-                            else
-                                ret_code = NOT_ENOUGH_ROOM;
-                        }
-                        else
-                            ret_code = INCOMPLETE_SEQUENCE;
-                    }
-                    else
-                        ret_code = NOT_ENOUGH_ROOM;
-                }
-                else 
-                    ret_code = INCOMPLETE_SEQUENCE;
-            }
-            else
-                ret_code = NOT_ENOUGH_ROOM;
-        }
-
-        return ret_code;
-    }
-
-    template <typename octet_iterator>
-    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
-    {
-        // Save the original value of it so we can go back in case of failure
-        // Of course, it does not make much sense with i.e. stream iterators
-        octet_iterator original_it = it;
-
-        uint32_t cp = 0;
-        // Determine the sequence length based on the lead octet
-        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
-        octet_difference_type length = sequence_length(it);
-        if (length == 0)
-            return INVALID_LEAD;
-
-        // Now that we have a valid sequence length, get trail octets and calculate the code point
-        utf_error err = UTF8_OK;
-        switch (length) {
-            case 1:
-                err = get_sequence_1(it, end, &cp);
-                break;
-            case 2:
-                err = get_sequence_2(it, end, &cp);
-            break;
-            case 3:
-                err = get_sequence_3(it, end, &cp);
-            break;
-            case 4:
-                err = get_sequence_4(it, end, &cp);
-            break;
-        }
-
-        if (err == UTF8_OK) {
-            // Decoding succeeded. Now, security checks...
-            if (is_code_point_valid(cp)) {
-                if (!is_overlong_sequence(cp, length)){
-                    // Passed! Return here.
-                    if (code_point)
-                        *code_point = cp;
-                    ++it;
-                    return UTF8_OK;
-                }
-                else
-                    err = OVERLONG_SEQUENCE;
-            }
-            else 
-                err = INVALID_CODE_POINT;
-        }
-
-        // Failure branch - restore the original value of the iterator
-        it = original_it;
-        return err;
-    }
-
-    template <typename octet_iterator>
-    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
-        return validate_next(it, end, 0);
-    }
-
-} // namespace internal
-
-    /// The library API - functions intended to be called by the users
-
-    // Byte order mark
-    const uint8_t bom[] = {0xef, 0xbb, 0xbf};
-
-    template <typename octet_iterator>
-    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
-    {
-        octet_iterator result = start;
-        while (result != end) {
-            internal::utf_error err_code = internal::validate_next(result, end);
-            if (err_code != internal::UTF8_OK)
-                return result;
-        }
-        return result;
-    }
-
-    template <typename octet_iterator>
-    inline bool is_valid(octet_iterator start, octet_iterator end)
-    {
-        return (find_invalid(start, end) == end);
-    }
-
-    template <typename octet_iterator>
-    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
-    {
-        return (
-            ((it != end) && (internal::mask8(*it++)) == bom[0]) &&
-            ((it != end) && (internal::mask8(*it++)) == bom[1]) &&
-            ((it != end) && (internal::mask8(*it))   == bom[2])
-           );
-    }
-	
-	//Deprecated in release 2.3 
-    template <typename octet_iterator>
-    inline bool is_bom (octet_iterator it)
-    {
-        return (
-            (internal::mask8(*it++)) == bom[0] &&
-            (internal::mask8(*it++)) == bom[1] &&
-            (internal::mask8(*it))   == bom[2]
-           );
-    }
-} // namespace utf8
-
-#endif // header guard
-
-
diff --git a/src/programs/g2p_eval/utf8/unchecked.h b/src/programs/g2p_eval/utf8/unchecked.h
deleted file mode 100755
index 2f3eb4d1..00000000
--- a/src/programs/g2p_eval/utf8/unchecked.h
+++ /dev/null
@@ -1,228 +0,0 @@
-// Copyright 2006 Nemanja Trifunovic
-
-/*
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-
-#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
-#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
-
-#include "core.h"
-
-namespace utf8
-{
-    namespace unchecked 
-    {
-        template <typename octet_iterator>
-        octet_iterator append(uint32_t cp, octet_iterator result)
-        {
-            if (cp < 0x80)                        // one octet
-                *(result++) = static_cast<uint8_t>(cp);  
-            else if (cp < 0x800) {                // two octets
-                *(result++) = static_cast<uint8_t>((cp >> 6)          | 0xc0);
-                *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
-            }
-            else if (cp < 0x10000) {              // three octets
-                *(result++) = static_cast<uint8_t>((cp >> 12)         | 0xe0);
-                *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
-                *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
-            }
-            else {                                // four octets
-                *(result++) = static_cast<uint8_t>((cp >> 18)         | 0xf0);
-                *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80);
-                *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
-                *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
-            }
-            return result;
-        }
-
-        template <typename octet_iterator>
-        uint32_t next(octet_iterator& it)
-        {
-            uint32_t cp = internal::mask8(*it);
-            typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);
-            switch (length) {
-                case 1:
-                    break;
-                case 2:
-                    it++;
-                    cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
-                    break;
-                case 3:
-                    ++it; 
-                    cp = ((cp << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 0xfff);
-                    ++it;
-                    cp += (*it) & 0x3f;
-                    break;
-                case 4:
-                    ++it;
-                    cp = ((cp << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff);                
-                    ++it;
-                    cp += (internal::mask8(*it) << 6) & 0xfff;
-                    ++it;
-                    cp += (*it) & 0x3f; 
-                    break;
-            }
-            ++it;
-            return cp;        
-        }
-
-        template <typename octet_iterator>
-        uint32_t peek_next(octet_iterator it)
-        {
-            return next(it);    
-        }
-
-        template <typename octet_iterator>
-        uint32_t prior(octet_iterator& it)
-        {
-            while (internal::is_trail(*(--it))) ;
-            octet_iterator temp = it;
-            return next(temp);
-        }
-
-        // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
-        template <typename octet_iterator>
-        inline uint32_t previous(octet_iterator& it)
-        {
-            return prior(it);
-        }
-
-        template <typename octet_iterator, typename distance_type>
-        void advance (octet_iterator& it, distance_type n)
-        {
-            for (distance_type i = 0; i < n; ++i)
-                next(it);
-        }
-
-        template <typename octet_iterator>
-        typename std::iterator_traits<octet_iterator>::difference_type
-        distance (octet_iterator first, octet_iterator last)
-        {
-            typename std::iterator_traits<octet_iterator>::difference_type dist;
-            for (dist = 0; first < last; ++dist) 
-                next(first);
-            return dist;
-        }
-
-        template <typename u16bit_iterator, typename octet_iterator>
-        octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
-        {       
-            while (start != end) {
-                uint32_t cp = internal::mask16(*start++);
-            // Take care of surrogate pairs first
-                if (internal::is_lead_surrogate(cp)) {
-                    uint32_t trail_surrogate = internal::mask16(*start++);
-                    cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
-                }
-                result = append(cp, result);
-            }
-            return result;         
-        }
-
-        template <typename u16bit_iterator, typename octet_iterator>
-        u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
-        {
-            while (start < end) {
-                uint32_t cp = next(start);
-                if (cp > 0xffff) { //make a surrogate pair
-                    *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
-                    *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
-                }
-                else
-                    *result++ = static_cast<uint16_t>(cp);
-            }
-            return result;
-        }
-
-        template <typename octet_iterator, typename u32bit_iterator>
-        octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
-        {
-            while (start != end)
-                result = append(*(start++), result);
-
-            return result;
-        }
-
-        template <typename octet_iterator, typename u32bit_iterator>
-        u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
-        {
-            while (start < end)
-                (*result++) = next(start);
-
-            return result;
-        }
-
-        // The iterator class
-        template <typename octet_iterator>
-          class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { 
-            octet_iterator it;
-            public:
-            iterator () {};
-            explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
-            // the default "big three" are OK
-            octet_iterator base () const { return it; }
-            uint32_t operator * () const
-            {
-                octet_iterator temp = it;
-                return next(temp);
-            }
-            bool operator == (const iterator& rhs) const 
-            { 
-                return (it == rhs.it);
-            }
-            bool operator != (const iterator& rhs) const
-            {
-                return !(operator == (rhs));
-            }
-            iterator& operator ++ () 
-            {
-                std::advance(it, internal::sequence_length(it));
-                return *this;
-            }
-            iterator operator ++ (int)
-            {
-                iterator temp = *this;
-                std::advance(it, internal::sequence_length(it));
-                return temp;
-            }  
-            iterator& operator -- ()
-            {
-                prior(it);
-                return *this;
-            }
-            iterator operator -- (int)
-            {
-                iterator temp = *this;
-                prior(it);
-                return temp;
-            }
-          }; // class iterator
-
-    } // namespace utf8::unchecked
-} // namespace utf8 
-
-
-#endif // header guard
-
diff --git a/src/programs/g2p_eval/util.hpp b/src/programs/g2p_eval/util.hpp
deleted file mode 100644
index 21aa0186..00000000
--- a/src/programs/g2p_eval/util.hpp
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- Copyright (c) [2012-], Josef Robert Novak
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
-  modification, are permitted #provided that the following conditions
-  are met:
-
-  * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-    copyright notice, this list of #conditions and the following
-    disclaimer in the documentation and/or other materials provided
-    with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
- INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- OF THE POSSIBILITY OF SUCH DAMAGE.
-*
-*/
-#include <fst/fstlib.h>
-#include "utf8.h"
-using namespace fst;
-using namespace std;
-
-string
-convertInt(int number)
-{
-    stringstream ss;            //create a stringstream
-    ss << number;               //add number to the stream
-    return ss.str();            //return a string with the contents of the stream
-}
-
-vector <string> tokenize_utf8_string(string * utf8_string,
-                                     string * delimiter)
-{
-    /*
-       Support for tokenizing a utf-8 string. Adapted to also support a delimiter.
-       Note that leading, trailing or multiple consecutive delimiters will result in
-       empty vector elements.  Normally should not be a problem but just in case.
-       FIXME: NO, IT IS A SERIOUS PROBLEM!!! WTF!!! WORST TOKENIZER EVER!!!
-       Also note that any tokens that cannot be found in the model symbol table will be
-       deleted from the input word prior to grapheme-to-phoneme conversion.
-
-       http://stackoverflow.com/questions/2852895/c-iterate-or-split-utf-8-string-into-array-of-symbols#2856241
-     */
-    char *str = (char *) utf8_string->c_str();  // utf-8 string
-    char *str_i = str;          // string iterator
-    char *str_j = str;
-    char *end = str + strlen(str) + 1;  // end iterator
-    vector <string> string_vec;
-    if (delimiter->compare("") != 0)
-        string_vec.push_back("");
-
-    do {
-        str_j = str_i;
-        uint32_t code = utf8::next(str_i, end); // get 32 bit code of a utf-8 symbol
-        if (code == 0)
-            continue;
-        int start = strlen(str) - strlen(str_j);
-        int end = strlen(str) - strlen(str_i);
-        int len = end - start;
-
-        if (delimiter->compare("") == 0) {
-            string_vec.push_back(utf8_string->substr(start, len));
-        }
-        else {
-            if (delimiter->compare(utf8_string->substr(start, len)) == 0)
-                string_vec.push_back("");
-            else
-                string_vec[string_vec.size() - 1] +=
-                    utf8_string->substr(start, len);
-        }
-    } while (str_i < end);
-
-    return string_vec;
-}
-
-vector <string> tokenize_entry(string * testword, string * sep,
-                               SymbolTable * syms)
-{
-    vector<string> tokens = tokenize_utf8_string(testword, sep);
-    vector<string> entry;
-    for (int i = 0; i < tokens.size(); i++) {
-        if (syms->Find(tokens.at(i)) != -1) {
-            entry.push_back(tokens.at(i));
-        }
-    }
-
-    return entry;
-}
diff --git a/src/programs/g2p_train/CMakeLists.txt b/src/programs/g2p_train/CMakeLists.txt
index 14bd7f96..6a9cd4b1 100644
--- a/src/programs/g2p_train/CMakeLists.txt
+++ b/src/programs/g2p_train/CMakeLists.txt
@@ -1,9 +1,10 @@
 set(PROGRAM g2p_train)
 set(SRCS
-FstPathFinder.cpp
 g2p_train.cpp
-M2MFstAligner.cpp
 main.cpp
+${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/lib/M2MFstAligner.cc
+${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/lib/LatticePruner.cc
+${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/lib/util.cc
   )
 
 add_executable(${PROGRAM} ${SRCS})
@@ -12,6 +13,8 @@ target_link_libraries(${PROGRAM} sphinxtrain
 target_include_directories(
   ${PROGRAM} PRIVATE ${CMAKE_BINARY_DIR}
   ${PROGRAM} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  ${PROGRAM} PRIVATE ${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src
+  ${PROGRAM} PRIVATE ${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/3rdparty/utfcpp
   ${PROGRAM} PUBLIC ${CMAKE_SOURCE_DIR}/include
   ${PROGRAM} INTERFACE ${CMAKE_SOURCE_DIR}/include
   )
diff --git a/src/programs/g2p_train/FstPathFinder.cpp b/src/programs/g2p_train/FstPathFinder.cpp
deleted file mode 100644
index 5983cbe1..00000000
--- a/src/programs/g2p_train/FstPathFinder.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- FstPathFinder.cpp
-
- Copyright (c) [2012-], Josef Robert Novak
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
-  modification, are permitted #provided that the following conditions
-  are met:
-
-  * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-    copyright notice, this list of #conditions and the following
-    disclaimer in the documentation and/or other materials provided
-    with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
- INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- OF THE POSSIBILITY OF SUCH DAMAGE.
-  ----------------
-    Original author: chris taylor
-
-    OpenFst forum post title: "Natural code for printing all strings accepted by an FST?"
-    OpenFst forum post link: http://openfst.cs.nyu.edu/twiki/bin/view/Forum/FstForum#Natural_code_for_printing_all_st
-
-  ----------------
-
-    2011-04-07: Modified by Josef Novak
-
-    Modified to build a 'paths' object to store the individual paths
-    and associated weights, rather than just print them out from
-    inside the class.  Useful if you want to return the paths for further
-    processing.
-*/
-
-#include "FstPathFinder.hpp"
-
-FstPathFinder::FstPathFinder()
-{
-    //Default constructor
-}
-
-FstPathFinder::FstPathFinder(set<string> skipset)
-{
-    //Constructor for a non-empty skipset
-    skipSeqs = skipset;
-}
-
-void
-FstPathFinder::findAllStrings(VectorFst<StdArc> &fst)
-{
-    /*
-       Main search function.  Initiates the WFSA traversal.
-       We are making three potentially dangerous assumptions
-       here regarding the input FST:
-
-       1. It has *ALREADY* been run through the shortestpath algorithm
-       *This guarantees the the FST is acyclic and that the paths are
-       sorted according to path cost.
-       2. It has *ALREADY* been projected
-       *This just saves us some hassle.
-       3. The symbol tables have been stored in the input FST
-       *This just saves us some hassle.
-
-       If the input FST does not meet these conditions this will
-       cause problems.
-     */
-
-    vector<string> path;
-    if (fst.InputSymbols() != NULL)
-        isyms = (SymbolTable *) fst.InputSymbols();
-    findAllStringsHelper(fst, fst.Start(), path, TropicalWeight::One());
-
-    return;
-}
-
-void
-FstPathFinder::addOrDiscardPath(PathData pdata)
-{
-    /*
-       Determine whether or not the input path has been added
-       to the paths vector or not.  If it hasn't, add it, otherwise
-       discard it.
-     */
-
-    set< vector<string> >::iterator sit;
-    sit = uniqueStrings.find(pdata.path);
-
-    if (sit == uniqueStrings.end()) {
-        paths.push_back(pdata);
-        uniqueStrings.insert(pdata.path);
-    }
-    return;
-}
-
-void
-FstPathFinder::findAllStringsHelper(VectorFst<StdArc> &fst, int state,
-                                    vector<string> &path,
-                                    TropicalWeight cost)
-{
-    /*
-       Recursively traverse the WFSA and build up a vector of
-       unique paths and associated costs.
-     */
-
-    if (fst.Final(state) != TropicalWeight::Zero()) {
-
-        PathData pdata;
-        pdata.path = path;
-        pdata.pathcost = Times(cost, fst.Final(state)).Value();
-
-        addOrDiscardPath(pdata);
-
-        path.clear();
-
-        return;
-    }
-
-    for (ArcIterator<VectorFst<StdArc> > iter(fst, state);
-            !iter.Done(); iter.Next()) {
-        StdArc arc = iter.Value();
-
-        string symbol = isyms->Find(arc.ilabel);
-
-        bool skip = false;
-        for (set<string>::iterator sit = skipSeqs.begin();
-                sit != skipSeqs.end(); sit++)
-            if (symbol.compare(*sit) == 0)
-                skip = true;
-        if (skip == false)
-            path.push_back(symbol);
-
-        findAllStringsHelper(fst, arc.nextstate, path,
-                             Times(cost, arc.weight.Value()));
-    }
-}
diff --git a/src/programs/g2p_train/FstPathFinder.hpp b/src/programs/g2p_train/FstPathFinder.hpp
deleted file mode 100644
index 5ba037d8..00000000
--- a/src/programs/g2p_train/FstPathFinder.hpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * FstPathFinder.hpp
-
- Copyright (c) [2012-], Josef Robert Novak
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
-  modification, are permitted #provided that the following conditions
-  are met:
-
-  * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-  * Redistributions in binary form must reproduce the above
-    copyright notice, this list of #conditions and the following
-    disclaimer in the documentation and/or other materials provided
-    with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
- INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- OF THE POSSIBILITY OF SUCH DAMAGE.
-  ----------------
-    Original author: Chris Taylor
-
-    OpenFst forum post title: "Natural code for printing all strings accepted by an FST?"
-    OpenFst forum post link: http://openfst.cs.nyu.edu/twiki/bin/view/Forum/FstForum#Natural_code_for_printing_all_st
-
-  ----------------
-
-    2011-04-07: Modified by Josef Novak
-
-    Modified to build a 'paths' object to store the individual paths
-    and associated weights, rather than just print them out from
-    inside the class.  Useful if you want to return the paths for further
-    processing.
-*
-*/
-#ifndef __FSTPATHFINDER__
-#define __FSTPATHFINDER__
-
-#include <fst/fstlib.h>
-
-using namespace fst;
-using namespace std;
-
-struct PathData {
-    vector<string> path;
-    float pathcost;
-};
-
-class FstPathFinder {
-
-public:
-
-    vector<PathData> paths;
-
-    set<string> skipSeqs;
-
-    set<vector <string> > uniqueStrings;
-
-    SymbolTable *isyms;
-
-    FstPathFinder();
-
-    FstPathFinder(set<string> skipset);
-
-    void findAllStrings(StdVectorFst & fst);
-
-private:
-
-    void addOrDiscardPath(PathData pdata);
-
-    void findAllStringsHelper(StdVectorFst & fst,
-                              int state,
-                              vector<string> &str, TropicalWeight cost);
-
-};                              // end class
-
-#endif
diff --git a/src/programs/g2p_train/M2MFstAligner.cpp b/src/programs/g2p_train/M2MFstAligner.cpp
deleted file mode 100644
index 718218df..00000000
--- a/src/programs/g2p_train/M2MFstAligner.cpp
+++ /dev/null
@@ -1,680 +0,0 @@
-/*
- M2MFstAligner.cpp
-
- Copyright (c) [2012-], Josef Robert Novak
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
-  modification, are permitted #provided that the following conditions
-  are met:
-
-  * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-    copyright notice, this list of #conditions and the following
-    disclaimer in the documentation and/or other materials provided
-    with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
- INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <fst/fstlib.h>
-#include <iostream>
-#include <set>
-#include "M2MFstAligner.hpp"
-
-//Begin Utility functions (these really need to go somewhere else
-vector<string> &split(const string & s, string delim,
-                         vector<string> &elems)
-{
-    stringstream ss(s);
-    string item;
-    //delim.c_str()[0] is a VERY bad thing to do
-    // this will produce behavior that makes not sense
-    // to the user if they try to use a multi-char delimiter
-    //Actually, this is inexcusable but first things first let's
-    // get everything else working properly.
-    while (getline(ss, item, delim.c_str()[0])) {
-        elems.push_back(item);
-    }
-    return elems;
-}
-
-
-vector<string> split(const string & s, string delim)
-{
-    vector<string> elems;
-    return split(s, delim, elems);
-}
-
-
-string
-vec2str(vector<string> vec, string sep)
-{
-    string ss;
-    for (size_t i = 0; i < vec.size(); ++i) {
-        if (i != 0)
-            ss += sep;
-        ss += vec[i];
-    }
-    return ss;
-}
-
-string
-itoas(int i)
-{
-    std::stringstream ostring;
-    ostring << i;
-    return ostring.str();
-}
-
-int
-M2MFstAligner::get_max_length(string joint_label)
-{
-    //We can probably make this a LOT faster...
-    vector<string> parts = split(joint_label, s1s2_sep);
-    assert(parts.size() > 1);
-    vector<string> s1 = split(parts[0], seq1_sep);
-    vector<string> s2 = split(parts[1], seq2_sep);
-    int m = max(s1.size(), s2.size());
-    //Probably want to rethink this placement..
-    //At this point the model should not contain any of these
-    // transitions anyway.  So this is redundant...
-    if (s1.size() > 1 && s2.size() > 1)
-        m = -1;
-    return m;
-}
-
-//End utility functions
-
-
-M2MFstAligner::M2MFstAligner()
-{
-    //Default constructor
-}
-
-M2MFstAligner::M2MFstAligner(bool _seq1_del, bool _seq2_del, int _seq1_max,
-                             int _seq2_max, string _seq1_sep,
-                             string _seq2_sep, string _s1s2_sep,
-                             string _eps, string _skip, bool _penalize)
-{
-    //Base constructor.  Determine whether or not to allow deletions in seq1 and seq2
-    // as well as the maximum allowable subsequence size.
-    seq1_del = _seq1_del;
-    seq2_del = _seq2_del;
-    seq1_max = _seq1_max;
-    seq2_max = _seq2_max;
-    seq1_sep = _seq1_sep;
-    seq2_sep = _seq2_sep;
-    s1s2_sep = _s1s2_sep;
-    penalize = _penalize;
-    eps = _eps;
-    skip = _skip;
-    skipSeqs.insert(eps);
-    isyms = new SymbolTable("syms");
-    //Add all the important symbols to the table.  We can store these
-    // in the model that we train and then attach them to the fst model
-    // if we want to use it later on.
-    //Thus, in addition to eps->0, we reserve symbol ids 1-4 as well.
-    isyms->AddSymbol(eps);
-    isyms->AddSymbol(skip);
-    //The '_' as a separator here is dangerous
-    isyms->AddSymbol(seq1_sep + "_" + seq2_sep);
-    isyms->AddSymbol(s1s2_sep);
-    string s1_del_str = seq1_del ? "true" : "false";
-    string s2_del_str = seq2_del ? "true" : "false";
-    string s1_max_str = itoas(seq1_max);
-    string s2_max_str = itoas(seq2_max);
-    string model_params =
-        s1_del_str + "_" + s2_del_str + "_" + s1_max_str + "_" +
-        s2_max_str;
-    isyms->AddSymbol(model_params);
-    total = LogWeight::Zero();
-    prevTotal = LogWeight::Zero();
-}
-
-M2MFstAligner::M2MFstAligner(string _model_file)
-{
-    VectorFst<LogArc> *model = VectorFst<LogArc>::Read(_model_file);
-    for (StateIterator<VectorFst<LogArc> > siter(*model);
-            !siter.Done(); siter.Next()) {
-        LogArc::StateId q = siter.Value();
-        for (ArcIterator<VectorFst<LogArc> > aiter(*model, q);
-                !aiter.Done(); aiter.Next()) {
-            const LogArc & arc = aiter.Value();
-            alignment_model.insert(pair<LogArc::Label,LogWeight> (arc.ilabel, arc.weight));
-        }
-    }
-    isyms = (SymbolTable *) model->InputSymbols();
-    int i = 0;
-    eps = isyms->Find(i);       //Can't write '0' here for some reason...
-    skip = isyms->Find(1);
-    vector<string> seps = split(isyms->Find(2), "_");
-    seq1_sep = seps[0];
-    seq2_sep = seps[1];
-    s1s2_sep = isyms->Find(3);
-    vector<string> params = split(isyms->Find(4), "_");
-    seq1_del = params[0].compare("true") ? false : true;
-    seq2_del = params[1].compare("true") ? false : true;
-    seq1_max = atoi(params[2].c_str());
-    seq2_max = atoi(params[3].c_str());
-
-}
-
-void
-M2MFstAligner::write_model(string _model_file)
-{
-    VectorFst<LogArc> model;
-    model.AddState();
-    model.SetStart(0);
-    model.SetFinal(0, LogWeight::One());
-    map<LogArc::Label,LogWeight>::iterator it;
-    for (it = alignment_model.begin(); it != alignment_model.end(); it++)
-        model.AddArc(0, LogArc((*it).first, (*it).first, (*it).second, 0));
-    model.SetInputSymbols(isyms);
-    model.Write(_model_file);
-    return;
-}
-
-void
-M2MFstAligner::expectation()
-{
-    for (int i = 0; i < fsas.size(); i++) {
-        //Comput Forward and Backward probabilities
-        ShortestDistance(fsas.at(i), &alpha);
-        ShortestDistance(fsas.at(i), &beta, true);
-
-        //Compute the normalized Gamma probabilities and
-        // update our running tally
-        for (StateIterator<VectorFst<LogArc> > siter(fsas.at(i));
-                !siter.Done(); siter.Next()) {
-            LogArc::StateId q = siter.Value();
-            for (ArcIterator<VectorFst<LogArc> > aiter(fsas.at(i), q);
-                    !aiter.Done(); aiter.Next()) {
-                const LogArc & arc = aiter.Value();
-                const LogWeight & gamma =
-                    Divide(Times
-                           (Times(alpha[q], arc.weight),
-                            beta[arc.nextstate]), beta[0]);
-                //Check for any BadValue results, otherwise add to the tally.
-                //We call this 'prev_alignment_model' which may seem misleading, but
-                // this conventions leads to 'alignment_model' being the final version.
-                if (gamma.Value() == gamma.Value()) {
-                    prev_alignment_model[arc.ilabel] =
-                        Plus(prev_alignment_model[arc.ilabel], gamma);
-                    total = Plus(total, gamma);
-                }
-            }
-        }
-        alpha.clear();
-        beta.clear();
-    }
-}
-
-void
-M2MFstAligner::Sequences2FST(VectorFst<LogArc> *fst,
-                             vector<string> *seq1,
-                             vector<string> *seq2)
-{
-    /*
-       Build an FST that represents all possible alignments between seq1 and seq2, given the
-       parameter values input by the user.  Here we encode the input and output labels, in fact
-       creating a WFSA.  This simplifies the training process, but means that we can only
-       easily compute a joint maximization.  In practice joint maximization seems to give the
-       best results anyway, so it probably doesn't matter.
-
-       Note: this also performs the initizization routine.  It performs a UNIFORM initialization
-       meaning that every non-null alignment sequence is eventually initialized to 1/Num(unique_alignments).
-       It might be more appropriate to consider subsequence length here, but for now we stick
-       to the m2m-aligner approach.
-
-       TODO: Add an FST version and support for conditional maximization.  May be useful for languages
-       like Japanese where there is a distinct imbalance in the seq1->seq2 length correspondences.
-     */
-    int istate = 0;
-    int ostate = 0;
-    for (int i = 0; i <= seq1->size(); i++) {
-        for (int j = 0; j <= seq2->size(); j++) {
-            fst->AddState();
-            istate = i * (seq2->size() + 1) + j;
-
-            //Epsilon arcs for seq1
-            if (seq1_del == true)
-                for (int l = 1; l <= seq2_max; l++) {
-                    if (j + l <= seq2->size()) {
-                        vector<string> subseq2(seq2->begin() + j,
-                                                  seq2->begin() + j + l);
-                        string sym = skip + s1s2_sep +
-                            vec2str(subseq2, seq2_sep);
-                        int is =
-                            isyms->AddSymbol(sym);
-                        ostate = i * (seq2->size() + 1) + (j + l);
-                        //LogArc arc( is, is, LogWeight::One().Value()*(l+1)*2, ostate );
-                        LogArc arc(is, is, 99, ostate);
-                        //LogArc arc( is, is, LogWeight::Zero(), ostate );
-                        fst->AddArc(istate, arc);
-                        if (prev_alignment_model.find(arc.ilabel) ==
-                                prev_alignment_model.end())
-                            prev_alignment_model.insert(pair <
-                                                        LogArc::Label,
-                                                        LogWeight >
-                                                        (arc.ilabel,
-                                                         arc.weight));
-                        else
-                            prev_alignment_model[arc.ilabel] =
-                                Plus(prev_alignment_model[arc.ilabel],
-                                     arc.weight);
-                        total = Plus(total, arc.weight);
-                    }
-                }
-
-            //Epsilon arcs for seq2
-            if (seq2_del == true)
-                for (int k = 1; k <= seq1_max; k++) {
-                    if (i + k <= seq1->size()) {
-                        vector<string> subseq1(seq1->begin() + i,
-                                                  seq1->begin() + i + k);
-                        string sym = vec2str(subseq1, seq1_sep) +
-                            s1s2_sep + skip;
-                        int is =
-                            isyms->AddSymbol(sym);
-                        ostate = (i + k) * (seq2->size() + 1) + j;
-                        //LogArc arc( is, is, LogWeight::One().Value()*(k+1)*2, ostate );
-                        LogArc arc(is, is, 99, ostate);
-                        //LogArc arc( is, is, LogWeight::Zero(), ostate );
-                        fst->AddArc(istate, arc);
-                        if (prev_alignment_model.find(arc.ilabel) ==
-                                prev_alignment_model.end())
-                            prev_alignment_model.insert(pair <
-                                                        LogArc::Label,
-                                                        LogWeight >
-                                                        (arc.ilabel,
-                                                         arc.weight));
-                        else
-                            prev_alignment_model[arc.ilabel] =
-                                Plus(prev_alignment_model[arc.ilabel],
-                                     arc.weight);
-                        total = Plus(total, arc.weight);
-                    }
-                }
-
-            //All the other arcs
-            for (int k = 1; k <= seq1_max; k++) {
-                for (int l = 1; l <= seq2_max; l++) {
-                    if (i + k <= seq1->size() && j + l <= seq2->size()) {
-                        vector<string> subseq1(seq1->begin() + i,
-                                               seq1->begin() + i + k);
-                        string s1 = vec2str(subseq1, seq1_sep);
-                        vector<string> subseq2(seq2->begin() + j,
-                                               seq2->begin() + j + l);
-                        string s2 = vec2str(subseq2, seq2_sep);
-                        if (l > 1 && k > 1)
-                            continue;
-                        string sym = s1 + s1s2_sep + s2;
-                        int is = isyms->AddSymbol(sym);
-                        ostate = (i + k) * (seq2->size() + 1) + (j + l);
-                        LogArc arc(is, is,
-                                   LogWeight::One().Value() * (k + l),
-                                   ostate);
-                        //LogArc arc( is, is, LogWeight::One().Value(), ostate );
-                        fst->AddArc(istate, arc);
-                        //During the initialization phase, just count non-eps transitions
-                        //We currently initialize to uniform probability so there is also
-                        // no need to tally anything here.
-                        if (prev_alignment_model.find(arc.ilabel) ==
-                                prev_alignment_model.end())
-                            prev_alignment_model.insert(pair <
-                                                        LogArc::Label,
-                                                        LogWeight >
-                                                        (arc.ilabel,
-                                                         arc.weight));
-                        else
-                            prev_alignment_model[arc.ilabel] =
-                                Plus(prev_alignment_model[arc.ilabel],
-                                     arc.weight);
-                        total = Plus(total, arc.weight);
-                    }
-                }
-            }
-
-        }
-    }
-
-    fst->SetStart(0);
-    fst->SetFinal(((seq1->size() + 1) * (seq2->size() + 1)) - 1,
-                  LogWeight::One());
-    //Unless seq1_del==true && seq2_del==true we will have unconnected states
-    // thus we need to run connect to clean out these states
-    //fst->SetInputSymbols(isyms);
-    //fst->Write("right.nc.fsa");
-    if (seq1_del == false or seq2_del == false)
-        Connect(fst);
-    //fst->Write("right.c.fsa");
-    return;
-}
-
-void
-M2MFstAligner::Sequences2FSTNoInit(VectorFst<LogArc> *fst,
-                                   vector<string> *seq1,
-                                   vector<string> *seq2)
-{
-    /*
-       Build an FST that represents all possible alignments between seq1 and seq2, given the
-       parameter values input by the user.  Here we encode the input and output labels, in fact
-       creating a WFSA.  This simplifies the training process, but means that we can only
-       easily compute a joint maximization.  In practice joint maximization seems to give the
-       best results anyway, so it probably doesn't matter.
-
-       It might be more appropriate to consider subsequence length here, but for now we stick
-       to the m2m-aligner approach.
-     */
-    int istate = 0;
-    int ostate = 0;
-    for (int i = 0; i <= seq1->size(); i++) {
-        for (int j = 0; j <= seq2->size(); j++) {
-            fst->AddState();
-            istate = i * (seq2->size() + 1) + j;
-
-            //Epsilon arcs for seq1
-            if (seq1_del == true)
-                for (int l = 1; l <= seq2_max; l++) {
-                    if (j + l <= seq2->size()) {
-                        vector<string> subseq2(seq2->begin() + j,
-                                                  seq2->begin() + j + l);
-                        int is =
-                            isyms->Find(skip + s1s2_sep +
-                                        vec2str(subseq2, seq2_sep));
-                        ostate = i * (seq2->size() + 1) + (j + l);
-                        //LogArc arc( is, is, LogWeight::One().Value()*(l+1)*2, ostate );
-                        LogArc arc(is, is, 99, ostate);
-                        fst->AddArc(istate, arc);
-                    }
-                }
-
-            //Epsilon arcs for seq2
-            if (seq2_del == true)
-                for (int k = 1; k <= seq1_max; k++) {
-                    if (i + k <= seq1->size()) {
-                        vector<string> subseq1(seq1->begin() + i,
-                                                  seq1->begin() + i + k);
-                        int is =
-                            isyms->Find(vec2str(subseq1, seq1_sep) +
-                                        s1s2_sep + skip);
-                        ostate = (i + k) * (seq2->size() + 1) + j;
-                        //LogArc arc( is, is, LogWeight::One().Value()*(k+1)*2, ostate );
-                        LogArc arc(is, is, 99, ostate);
-                        fst->AddArc(istate, arc);
-                    }
-                }
-
-            //All the other arcs
-            for (int k = 1; k <= seq1_max; k++) {
-                for (int l = 1; l <= seq2_max; l++) {
-                    if (i + k <= seq1->size() && j + l <= seq2->size()) {
-                        vector<string> subseq1(seq1->begin() + i,
-                                                  seq1->begin() + i + k);
-                        string s1 = vec2str(subseq1, seq1_sep);
-                        vector<string> subseq2(seq2->begin() + j,
-                                                  seq2->begin() + j + l);
-                        string s2 = vec2str(subseq2, seq2_sep);
-                        if (l > 1 && k > 1)
-                            continue;
-                        int is = isyms->Find(s1 + s1s2_sep + s2);
-                        ostate = (i + k) * (seq2->size() + 1) + (j + l);
-                        LogArc arc(is, is,
-                                   LogWeight::One().Value() * (k + l),
-                                   ostate);
-                        fst->AddArc(istate, arc);
-                    }
-                }
-            }
-
-        }
-    }
-
-    fst->SetStart(0);
-    fst->SetFinal(((seq1->size() + 1) * (seq2->size() + 1)) - 1,
-                  LogWeight::One());
-    //Unless seq1_del==true && seq2_del==true we will have unconnected states
-    // thus we need to run connect to clean out these states
-    if (seq1_del == false or seq2_del == false)
-        Connect(fst);
-    return;
-}
-
-//Build the composed alignment FST and add it to the list of training data
-void
-M2MFstAligner::entry2alignfst(vector<string> seq1,
-                              vector<string> seq2)
-{
-    VectorFst<LogArc> fst;
-    Sequences2FST(&fst, &seq1, &seq2);
-    fsas.push_back(fst);
-    return;
-}
-
-vector<PathData> M2MFstAligner::entry2alignfstnoinit(vector<string>
-        seq1,
-        vector<string>
-        seq2, int nbest,
-        string lattice)
-{
-    VectorFst<LogArc> fst;
-    Sequences2FSTNoInit(&fst, &seq1, &seq2);
-    if (lattice.compare("") != 0)
-        fst.Write(lattice);
-    return write_alignment(fst, nbest);
-}
-
-float
-M2MFstAligner::maximization(bool lastiter)
-{
-    //Maximization. Simple count normalization.  Probably get an improvement
-    // by using a more sophisticated regularization approach.
-    map<LogArc::Label,LogWeight>::iterator it;
-    float change = abs(total.Value() - prevTotal.Value());
-    //cout << "Total: " << total << " Change: " << abs(total.Value()-prevTotal.Value()) << endl;
-    prevTotal = total;
-
-    //Normalize and iterate to the next model.  We apply it dynamically
-    // during the expectation step.
-    for (it = prev_alignment_model.begin();
-            it != prev_alignment_model.end(); it++) {
-        alignment_model[(*it).first] = Divide((*it).second, total);
-        (*it).second = LogWeight::Zero();
-    }
-
-    for (int i = 0; i < fsas.size(); i++) {
-        for (StateIterator<VectorFst<LogArc> > siter(fsas[i]);
-                !siter.Done(); siter.Next()) {
-            LogArc::StateId q = siter.Value();
-            for (MutableArcIterator<VectorFst<LogArc> > aiter(&fsas[i], q); !aiter.Done(); aiter.Next()) {
-                LogArc arc = aiter.Value();
-                arc.weight = alignment_model[arc.ilabel];
-                aiter.SetValue(arc);
-            }
-        }
-    }
-
-    total = LogWeight::Zero();
-    return change;
-}
-
-int
-M2MFstAligner::num_fsas()
-{
-    //A getter function because I'm retarded.
-    return fsas.size();
-}
-
-vector<PathData> M2MFstAligner::write_alignment(const VectorFst<LogArc> &ifst,
-        int nbest)
-{
-    //Generic alignment generator
-    VectorFst<StdArc> fst;
-    Map(ifst, &fst, LogToStdMapper());
-
-    for (StateIterator<VectorFst<StdArc> > siter(fst); !siter.Done();
-            siter.Next()) {
-        StdArc::StateId q = siter.Value();
-        for (MutableArcIterator<VectorFst<StdArc> > aiter(&fst, q);
-                !aiter.Done(); aiter.Next()) {
-            //Prior to decoding we make several 'heuristic' modifications to the weights:
-            // 1. A multiplier is applied to any multi-token substrings
-            // 2. Any LogWeight::Zero() arc weights are reset to '99'.
-            //    We are basically resetting 'Infinity' values to a 'smallest non-Infinity'
-            //     so that the ShortestPath algorithm actually produces something no matter what.
-            // 3. Any arcs that consist of subseq1:subseq2 being the same length and subseq1>1
-            //       are set to '99' this forces shortestpath to choose arcs where one of the
-            //       following conditions holds true
-            //      * len(subseq1)>1 && len(subseq2)!=len(subseq1)
-            //      * len(subseq2)>1 && len(subseq1)!=len(subseq2)
-            //      * len(subseq1)==len(subseq2)==1
-            //I suspect these heuristics can be eliminated with a better choice of the initialization
-            // function and maximization function, but this is the way that m2m-aligner works, so
-            // it makes sense for our first cut implementation.
-            //In any case, this guarantees that M2MFstAligner produces results identical to those
-            // produced by m2m-aligner - but with a bit more reliability.
-            //UPDATE: this now produces a better alignment than m2m-aligner.
-            //  The maxl heuristic is still in place.  The aligner will produce *better* 1-best alignments
-            //  *without* the maxl heuristic below, BUT this comes at the cost of producing a less
-            //  flexible corpus.  That is, for a small training corpus like nettalk, if we use the
-            //  best alignment we wind up with more 'chunks' and thus get a worse coverage for unseen
-            //  data.  Using the aignment lattices to train the joint ngram model solves this problem.
-            //  Oh baby.  Can't wait to for everyone to see the paper!
-            //NOTE: this is going to fail if we encounter any alignments in a new test item that never
-            // occurred in the original model.
-            StdArc
-            arc = aiter.Value();
-            int
-            maxl = get_max_length(isyms->Find(arc.ilabel));
-            if (maxl == -1) {
-                arc.weight = 999;
-            }
-            else {
-                //Optionally penalize m-to-1 / 1-to-m links.  This produces
-                // WORSE 1-best alignments, but results in better joint n-gram
-                // models for small training corpora when using only the 1-best
-                // alignment.  By further favoring 1-to-1 alignments the 1-best
-                // alignment corpus results in a more flexible joint n-gram model
-                // with regard to previously unseen data.
-                //if( penalize==true ){
-                arc.weight = alignment_model[arc.ilabel].Value() * maxl;
-                //}else{
-                //For larger corpora this is probably unnecessary.
-                //arc.weight = alignment_model[arc.ilabel].Value();
-                //}
-            }
-            if (arc.weight == LogWeight::Zero())
-                arc.weight = 999;
-            if (arc.weight != arc.weight)
-                arc.weight = 999;
-            aiter.SetValue(arc);
-        }
-    }
-
-    VectorFst<StdArc> shortest;
-    ShortestPath(fst, &shortest, nbest);
-    RmEpsilon(&shortest);
-    //Skip empty results.  This should only happen
-    // in the following situations:
-    //  1. seq1_del=false && len(seq1)<len(seq2)
-    //  2. seq2_del=false && len(seq1)>len(seq2)
-    //In both 1.and 2. the issue is that we need to
-    // insert a 'skip' in order to guarantee at least
-    // one valid alignment path through seq1*seq2, but
-    // user params didn't allow us to.
-    //Probably better to insert these where necessary
-    // during initialization, regardless of user prefs.
-    if (shortest.NumStates() == 0) {
-        vector<PathData> dummy;
-        return dummy;
-    }
-    FstPathFinder
-    pathfinder(skipSeqs);
-    pathfinder.isyms = isyms;
-    pathfinder.findAllStrings(shortest);
-    return pathfinder.paths;
-}
-
-void
-M2MFstAligner::write_all_alignments(int nbest)
-{
-    //Convenience function for the python bindings
-    for (int i = 0; i < fsas.size(); i++)
-        write_alignment(fsas[i], nbest);
-
-    return;
-}
-
-vector<PathData> M2MFstAligner::write_alignment_wrapper(int i,
-        int nbest)
-{
-    //Wrapper for the python bindings.
-    return write_alignment(fsas[i], nbest);
-}
-
-void
-M2MFstAligner::write_lattice(string lattice)
-{
-    //Write out the entire training set in lattice format
-    //Perform the union first.  This output can then
-    // be plugged directly in to a counter to obtain expected
-    // alignment counts for the EM-trained corpus.  Yields
-    // far higher-quality joint n-gram models, which are also
-    // more robust for smaller training corpora.
-    //Make sure you call this BEFORE any call to
-    // write_all_alignments
-    // as the latter function will override some of the weights
-
-    //Chaining the standard Union operation, including using a
-    // rational FST still performs very poorly in the log semiring.
-    //Presumably it's running push or something at each step.  It
-    // should be fine to do that just once at the end.
-    //Rolling our own union turns out to be MUCH faster.
-    VectorFst<LogArc> ufst;
-    ufst.AddState();
-    ufst.SetStart(0);
-    int total_states = 0;
-    for (int i = 0; i < fsas.size(); i++) {
-        TopSort(&fsas[i]);
-        for (StateIterator<VectorFst<LogArc> > siter(fsas[i]);
-                !siter.Done(); siter.Next()) {
-            LogArc::StateId q = siter.Value();
-            LogArc::StateId r;
-            if (q == 0)
-                r = 0;
-            else
-                r = ufst.AddState();
-
-            for (ArcIterator <VectorFst<LogArc> > aiter(fsas[i], q);
-                    !aiter.Done(); aiter.Next()) {
-                const LogArc & arc = aiter.Value();
-                ufst.AddArc(r,
-                            LogArc(arc.ilabel, arc.ilabel, arc.weight,
-                                   arc.nextstate + total_states));
-            }
-            if (fsas[i].Final(q) != LogWeight::Zero())
-                ufst.SetFinal(r, LogWeight::One());
-        }
-        total_states += fsas[i].NumStates() - 1;
-    }
-    //Normalize weights
-    Push(&ufst, REWEIGHT_TO_INITIAL);
-    //Write the resulting lattice to disk
-    ufst.Write(lattice);
-    //Write the syms table too.
-    isyms->WriteText("lattice.syms");
-    return;
-}
diff --git a/src/programs/g2p_train/M2MFstAligner.hpp b/src/programs/g2p_train/M2MFstAligner.hpp
deleted file mode 100644
index da45cc25..00000000
--- a/src/programs/g2p_train/M2MFstAligner.hpp
+++ /dev/null
@@ -1,128 +0,0 @@
-#ifndef M2MFSTALIGNER_H
-#define M2MFSTALIGNER_H
-/*
- M2MFstAligner.hpp
-
- Copyright (c) [2012-], Josef Robert Novak
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
-  modification, are permitted #provided that the following conditions
-  are met:
-
-* Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-* Redistributions in binary form must reproduce the above
-    copyright notice, this list of #conditions and the following
-    disclaimer in the documentation and/or other materials provided
-    with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
- INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-#include <fst/fstlib.h>
-#include <vector>
-#include "FstPathFinder.hpp"
-using namespace std;
-
-namespace fst {
-class M2MFstAligner {
-    /*
-       Read in pairs of sequences of the form SEQ1 and SEQ2 and
-       transform them into an FST that encodes all possible
-       alignments between the symbols in the two sequences.
-       Note that this may include a combination of multi-symbol
-       subsequences depending on user specifications.
-
-       This is achieved by simply generating the entire alignment
-       graph during a single nested loop through the two input
-       sequences that are to be aligned.
-
-       The user may optionally specify whether to allow deletions
-       for SEQ1 or SEQ2, as well as a maximum subsequence length
-       for each sequence.
-     */
-public:
-    //Basics declarations
-    bool seq1_del;
-    bool seq2_del;
-    int seq1_max;
-    int seq2_max;
-    string seq1_sep;
-    string seq2_sep;
-    string s1s2_sep;
-    string eps;
-    string skip;
-    bool penalize;
-    vector<LogWeight> alpha, beta;
-    //This will be used during decoding to clean the paths
-    set<string> skipSeqs;
-    //OpenFst stuff
-    //These will be overwritten after each FST construction
-    vector<VectorFst<LogArc> > fsas;
-
-    //This will be maintained for the life of object
-    //These symbol tables will be maintained entire life of
-    // the object.  This will ensure that any resulting 'corpus'
-    // shares the same symbol tables.
-    SymbolTable *isyms;
-    map<LogArc::Label,LogWeight> alignment_model;
-    map<LogArc::Label,LogWeight> prev_alignment_model;
-    LogWeight total;
-    LogWeight prevTotal;
-
-    //Constructors
-    M2MFstAligner();
-    M2MFstAligner(bool _seq1_del, bool _seq2_del, int _seq1_max,
-                  int _seq2_max, string _seq1_sep, string _seq2_sep,
-                  string _s1s2_sep, string _eps, string _skip,
-                  bool _penalize);
-    M2MFstAligner(string _model_file);
-
-    //Write an aligner model to disk.  Critical info is stored in the
-    // the symbol table so that it can be restored when the model is loaded.
-    void write_model(string _model_name);
-    //Transform a sequence pair into an equivalent multiple-to-multiple FST,
-    // encoding all possible alignments between the two sequences
-    void Sequences2FST(VectorFst<LogArc> *fst,
-                       vector<string> *seq1,
-                       vector<string> *seq2);
-    void Sequences2FSTNoInit(VectorFst<LogArc> *fst,
-                             vector<string> *seq1,
-                             vector<string> *seq2);
-    //Initialize all of the training data
-    void entry2alignfst(vector<string> seq1,
-                        vector<string> seq2);
-    vector<PathData> entry2alignfstnoinit(vector<string> seq1,
-            vector<string> seq2,
-            int nbest,
-            string lattice = "");
-    vector<PathData> write_alignment_wrapper(int i, int nbest);
-    //The expectation routine
-    void expectation();
-    //The maximization routine.  Returns the change since the last iteration
-    float maximization(bool lastiter);
-    //Print out the EM-optimized alignment for the training data
-    vector<PathData> write_alignment(const VectorFst<LogArc>
-                                        &ifst, int nbest);
-    //Write out the union of the weighted alignment lattices for the training corpus
-    void write_lattice(string lattice);
-    //Convenience function to output all the alignments
-    void write_all_alignments(int nbest);
-    //max routine
-    int get_max_length(string joint_label);
-    int num_fsas();
-
-};
-}
-#endif                          // M2MFSTALIGNER_H //
diff --git a/src/programs/g2p_train/g2p_train.cpp b/src/programs/g2p_train/g2p_train.cpp
index 4dfb09c6..980c89ae 100644
--- a/src/programs/g2p_train/g2p_train.cpp
+++ b/src/programs/g2p_train/g2p_train.cpp
@@ -39,7 +39,7 @@
  * NGram language modeling toolkit instead of MITLM.
  *
  * for more details about phonetisaurus see
- * http://code.google.com/p/phonetisaurus/
+ * https://github.com/AdolfVonKleist/Phonetisaurus
  * http://www.openfst.org/twiki/bin/view/GRM/NGramLibrary
  */
 
@@ -64,8 +64,10 @@
 #include <ngram/ngram-witten-bell.h>
 #include <ngram/ngram-unsmoothed.h>
 #include <sphinxbase/err.h>
-#include "M2MFstAligner.hpp"
-#include "../g2p_eval/util.hpp"
+#include <assert.h>
+#include "include/PhonetisaurusRex.h"
+#include "include/M2MFstAligner.h"
+#include "include/LatticePruner.h"
 
 #define arc_type "standard"
 #define fst_type "vector"
@@ -113,10 +115,20 @@ template <class Arc> struct ToLog64Mapper {
         return props;
     }
 };
-} using namespace std;
+}
+
+using namespace std;
 using namespace ngram;
 using namespace fst;
 
+string
+convertInt(int number)
+{
+    stringstream ss;            //create a stringstream
+    ss << number;               //add number to the stream
+    return ss.str();            //return a string with the contents of the stream
+}
+
 void
 addarcs(StateId state_id, StateId newstate, const SymbolTable * oldsyms,
         SymbolTable * isyms, SymbolTable * osyms, SymbolTable * ssyms,
@@ -127,7 +139,11 @@ addarcs(StateId state_id, StateId newstate, const SymbolTable * oldsyms,
             aiter.Next()) {
         StdArc arc = aiter.Value();
         string oldlabel = oldsyms->Find(arc.ilabel);
+        // Make sure <eps> on its own maps to <eps>:<eps>
         if (oldlabel == eps) {
+            // Ensure we aren't losing any output symbol (it's an
+            // acceptor so this should not happen)
+            assert(oldsyms->Find(arc.olabel) == eps);
             oldlabel = oldlabel.append("}");
             oldlabel = oldlabel.append(eps);
         }
@@ -142,9 +158,8 @@ addarcs(StateId state_id, StateId newstate, const SymbolTable * oldsyms,
 
         int64 nextstate = ssyms->Find(convertInt(arc.nextstate));
         if (nextstate == -1) {
-            out->AddState();
-            ssyms->AddSymbol(convertInt(arc.nextstate));
-            nextstate = ssyms->Find(convertInt(arc.nextstate));
+            nextstate = out->AddState();
+            ssyms->AddSymbol(convertInt(arc.nextstate), nextstate);
         }
         out->AddArc(newstate,
                     StdArc(ilabel, olabel,
@@ -155,10 +170,65 @@ addarcs(StateId state_id, StateId newstate, const SymbolTable * oldsyms,
     }
 }
 
+void
+patch_labels(StdMutableFst *arpafst, SymbolTable* syms, int64 skip_id, bool input) {
+    /*
+      Patch all labels.  In some edge cases it is possible
+      to end up with grapheme subsequences: e.g. 'QU' where one or
+      both tokens are only mapped to the multi-subsequence.  In this
+      case the independent 'Q' and/or 'U' token will never appear
+      in isolation.
+      This bit resolves this by:
+
+      a.) finding and adding these missing subsequence tokens
+      b.) adding backoff loops to the LM
+
+    */
+    string tie = "|";
+    for (unsigned int i = skip_id + 1; i < syms->NumSymbols(); i++) {
+        string sym = syms->Find(i);
+        vector<string> parts  = tokenize_utf8_string(&sym, &tie);
+        if (parts.size() > 1) {
+            for (unsigned int j = 0; j < parts.size(); j++) {
+                if (syms->Find(parts[j]) == -1) {
+                    // Add the missing symbol
+                    int k = syms->AddSymbol(parts[j]);
+                    // Add a backoff loop mapped to the 'skip'
+                    // FIXME: phonetisaurus hard-codes this as 1 but I
+                    // believe that is wrong, it should maybe actually be
+                    // ssyms->Find("<s>")?
+                    int64 start_state = 1;
+                    if (input == true)
+                        arpafst->AddArc(start_state, StdArc(k, skip_id, 99, start_state));
+                    else
+                        arpafst->AddArc(start_state, StdArc(skip_id, k, 99, start_state));
+                }
+            }
+        }
+    }
+}
+
 void
 relabel(StdMutableFst * fst, StdMutableFst * out, string prefix,
         string eps, string skip, string s1s2_sep, string seq_sep)
 {
+    /*
+      Transform a statistical language model in ARPA format
+      to an equivalent Weighted Finite-State Acceptor.
+      This implementation adopts the Google format for the output
+      WFSA.  This differs from previous implementations in several ways:
+
+      Start-state and <s> arcs:
+      * There are no explicit sentence-begin (<s>) arcs
+      * There is a single <s> start-state.
+
+      Final-state and </s> arcs:
+      * There are no explicit sentence-end (</s>) arcs
+      * There is no explicit </s> state
+      * NGrams ending in </s> are designated as final
+      states, and any probability is assigned
+      to the final weight of said state.
+    */
     namespace s = fst::script;
     using fst::ostream;
     using fst::SymbolTable;
@@ -172,59 +242,41 @@ relabel(StdMutableFst * fst, StdMutableFst * out, string prefix,
     SymbolTable *osyms = new SymbolTable("osyms");
 
     out->AddState();
-    ssyms->AddSymbol("s0");
+    ssyms->AddSymbol("<s>");
     out->SetStart(0);
 
-    out->AddState();
-    ssyms->AddSymbol("s1");
-    out->SetFinal(1, TropicalWeight::One());
-
+    string tie = "|";
     isyms->AddSymbol(eps);
     osyms->AddSymbol(eps);
-
-    //Add separator, phi, start and end symbols
-    isyms->AddSymbol(seq_sep);
-    osyms->AddSymbol(seq_sep);
-    isyms->AddSymbol("<phi>");
-    osyms->AddSymbol("<phi>");
-    int istart = isyms->AddSymbol("<s>");
-    int iend = isyms->AddSymbol("</s>");
-    int ostart = osyms->AddSymbol("<s>");
-    int oend = osyms->AddSymbol("</s>");
-
-    out->AddState();
-    ssyms->AddSymbol("s2");
-    out->AddArc(0, StdArc(istart, ostart, TropicalWeight::One(), 2));
+    isyms->AddSymbol(tie);
+    osyms->AddSymbol(tie);
+    isyms->AddSymbol(skip);
+    osyms->AddSymbol(skip);
 
     for (StateIterator<StdFst> siter(*fst); !siter.Done(); siter.Next()) {
         StateId state_id = siter.Value();
 
         int64 newstate;
         if (state_id == fst->Start()) {
-            newstate = 2;
+            newstate = 0;
         }
         else {
             newstate = ssyms->Find(convertInt(state_id));
             if (newstate == -1) {
-                out->AddState();
-                ssyms->AddSymbol(convertInt(state_id));
-                newstate = ssyms->Find(convertInt(state_id));
+                newstate = out->AddState();
+                ssyms->AddSymbol(convertInt(state_id), newstate);
             }
         }
 
         TropicalWeight weight = fst->Final(state_id);
-
-        if (weight != TropicalWeight::Zero()) {
-            // this is a final state
-            StdArc a = StdArc(iend, oend, weight, 1);
-            out->AddArc(newstate, a);
-            out->SetFinal(newstate, TropicalWeight::Zero());
-        }
+        if (weight != TropicalWeight::Zero())
+            out->SetFinal(newstate, weight);
         addarcs(state_id, newstate, oldsyms, isyms, osyms, ssyms, eps,
                 s1s2_sep, fst, out);
     }
 
-
+    patch_labels(out, isyms, isyms->Find(skip), true);
+    patch_labels(out, osyms, osyms->Find(skip), false);
     out->SetInputSymbols(isyms);
     out->SetOutputSymbols(osyms);
 
@@ -252,14 +304,16 @@ train_model(string eps, string s1s2_sep, string skip, int order,
     using fst::script::VectorFstClass;
     using fst::script::WeightClass;
 
-    // create symbols file
-    cout << "Generating symbols..." << endl;
-    NGramInput *ingram =
-        new NGramInput(prefix + ".corpus.aligned", prefix + ".corpus.syms",
-                       "", eps, unknown_symbol, "", "");
-    ingram->ReadInput(0, 1);
+    // create symbols file (ngramsymbols)
+    {
+        cout << "Generating symbols..." << endl;
+        NGramInput ingram(prefix + ".corpus.aligned", prefix + ".corpus.syms",
+                          "", eps, unknown_symbol, "", "");
+        // NOTE(review): the (0, 1) arguments are undocumented here; confirm against NGramInput::ReadInput
+        ingram.ReadInput(0, 1);
+    }
 
-    // compile strings into a far archive
+    // compile strings into a far archive (farcompilestrings)
     cout << "Compiling symbols into FAR archive..." << endl;
     fst::FarEntryType fet;
     fst::script::GetFarEntryType(entry_type, &fet);
@@ -268,11 +322,8 @@ train_model(string eps, string s1s2_sep, string skip, int order,
     // Lovely inconsistent API you got there, OpenFST...
     fst::FarType fartype = fst::script::GetFarType(far_type);
 
-    delete ingram;
-
     vector<string> in_fname;
     in_fname.push_back(prefix + ".corpus.aligned");
-
     fst::script::FarCompileStrings(in_fname,
                                    prefix + ".corpus.far", arc_type,
                                    fst_type, fartype,
@@ -282,13 +333,12 @@ train_model(string eps, string s1s2_sep, string skip, int order,
                                    initial_symbols, allow_negative_labels,
                                    key_prefix, key_suffix);
 
-    //count n-grams
+    // count n-grams (ngramcount)
     cout << "Counting n-grams..." << endl;
     NGramCounter<Log64Weight> ngram_counter(order, epsilon_as_backoff);
-
     FstReadOptions opts;
-    FarReader<StdArc> *far_reader;
-    far_reader = FarReader<StdArc>::Open(prefix + ".corpus.far");
+    // FarReader::Open hands back a raw pointer; it is deleted manually after the loop
+    FarReader<StdArc> *far_reader = FarReader<StdArc>::Open(prefix + ".corpus.far");
     int fstnumber = 1;
     const Fst<StdArc> *ifst = 0, *lfst = 0;
     while (!far_reader->Done()) {
@@ -323,18 +373,16 @@ train_model(string eps, string s1s2_sep, string skip, int order,
         ++fstnumber;
     }
     delete far_reader;
-
-    if (!lfst) {
+    if (!lfst)
         E_FATAL("None of the input FSTs had a symbol table\n");
-        //exit(1);
-    }
-
     VectorFst<StdArc> vfst;
     ngram_counter.GetFst(&vfst);
     ArcSort(&vfst, StdILabelCompare());
     vfst.SetInputSymbols(lfst->InputSymbols());
     vfst.SetOutputSymbols(lfst->InputSymbols());
     vfst.Write(prefix + ".corpus.cnts");
+
+    // Make smoothed N-Grams (ngrammake)
     StdMutableFst *fst =
         StdMutableFst::Read(prefix + ".corpus.cnts", true);
     if (smooth != "no") {
@@ -384,9 +432,9 @@ train_model(string eps, string s1s2_sep, string skip, int order,
             E_FATAL("Bad smoothing method: %s\n", smooth.c_str());
         }
     }
+    // fst->Write(prefix + ".smooth.mod");
     if (prune != "no") {
         cout << "Pruning model..." << endl;
-
         if (prune == "count_prune") {
             NGramCountPrune ngramsh(fst, count_pattern,
                                     shrink_opt, total_unigram_count,
@@ -410,12 +458,15 @@ train_model(string eps, string s1s2_sep, string skip, int order,
             E_FATAL("Bad shrink method:  %s\n", prune.c_str());
         }
     }
+    // fst->Write(prefix + ".shrink.mod");
 
     cout << "Minimizing model..." << endl;
     MutableFstClass *minimized = new s::MutableFstClass(*fst);
     Minimize(minimized, 0, fst::kDelta);
     fst = minimized->GetMutableFst<StdArc>();
+    // fst->Write(prefix + ".min.mod");
 
+    // Split input/output labels (phonetisaurus-arpa2wfst)
     cout << "Correcting final model..." << endl;
     StdMutableFst *out = new StdVectorFst();
     relabel(fst, out, prefix, eps, skip, s1s2_sep, seq_sep);
@@ -425,6 +476,77 @@ train_model(string eps, string s1s2_sep, string skip, int order,
 }
 
 
+void write_alignments (M2MFstAligner* aligner, std::ofstream &ofile,
+               StdArc::Weight threshold, int nbest,
+               bool fb, bool penalize) {
+  /*
+     Write the raw alignments to a file in text-based corpus format.
+
+     aligner   - source of the `fsas` lattices, the `penalties` and the
+                 `isyms` symbol table used to print labels.
+     ofile     - output stream; one line is written per entry of the
+                 path filter's ordered_paths, symbols space-separated.
+     threshold, nbest, fb, penalize - forwarded to LatticePruner; nbest
+                 is additionally passed to the shortest-path options.
+
+     NOTE: Although N-best and other pruning strategies are supported,
+           the final format is that of a standard text corpus.  All
+           relative token and pronunciation scores will be stripped.  In
+           general this means that, unless you are very lucky with your
+           combined pruning strategy, the un-ranked N-best hypotheses
+           will result in a lower-quality joint N-gram model.
+
+           This approach is best used with simple 1-best.
+  */
+
+  //Build us a lattice pruner
+  LatticePruner pruner (aligner->penalties, threshold, nbest, fb, penalize);
+
+  for (size_t i = 0; i < aligner->fsas.size (); i++) {
+    //Map to Tropical semiring.  The FST is a local object, so it is
+    // released on every exit path, including exceptions.
+    VectorFst<StdArc> tfst;
+    Map (aligner->fsas.at (i), &tfst, LogToStdMapper ());
+    pruner.prune_fst (&tfst);
+    RmEpsilon (&tfst);
+    //Skip empty results.  This should only happen
+    // in the following situations:
+    //  1. seq1_del=false && len(seq1)<len(seq2)
+    //  2. seq2_del=false && len(seq1)>len(seq2)
+    //In both 1. and 2. the issue is that we need to
+    // insert a 'skip' in order to guarantee at least
+    // one valid alignment path through seq1*seq2, but
+    // user params didn't allow us to.
+    //Probably better to insert these where necessary
+    // during initialization, regardless of user prefs.
+    if (tfst.NumStates () == 0)
+      continue;
+    StdArc::Weight weight_threshold = 99;
+    StdArc::StateId state_threshold = kNoStateId;
+    AnyArcFilter<StdArc> arc_filter;
+    vector<StdArc::Weight> distance;
+    VectorFst<StdArc> ofst;
+    AutoQueue<StdArc::StateId> state_queue (tfst, &distance, arc_filter);
+    IdentityPathFilter<StdArc> path_filter;
+    ShortestPathOptions<StdArc, AutoQueue<StdArc::StateId>,
+                        AnyArcFilter<StdArc> >
+      opts (&state_queue, arc_filter, nbest, false, false,
+            kDelta, false, weight_threshold, state_threshold);
+    ShortestPathSpecialized (tfst, &ofst, &distance,
+                             &path_filter, 10000, opts);
+    for (size_t p = 0; p < path_filter.ordered_paths.size (); p++) {
+      const vector<int>& path = path_filter.ordered_paths[p];
+      for (size_t j = 0; j < path.size (); j++) {
+        if (j > 0)
+          ofile << " ";
+        ofile << aligner->isyms->Find (path [j]);
+      }
+      ofile << "\n";
+    }
+  }
+}
+
+
 void
 align(string input_file, string prefix, bool seq1_del, bool seq2_del,
       int seq1_max, int seq2_max, string seq_sep, string s1s2_sep,
@@ -436,24 +558,37 @@ align(string input_file, string prefix, bool seq1_del, bool seq2_del,
     ofstream ofile(o.c_str(), ifstream::out);
     cout << "Loading..." << endl;
     M2MFstAligner fstaligner(seq1_del, seq2_del, seq1_max, seq2_max,
-                             seq_sep, seq_sep, s1s2_sep, eps, skip, true);
+                             seq_sep, seq_sep, s1s2_sep, eps, skip,
+                             // Four positional boolean options (see the M2MFstAligner constructor)
+                             false, false, true, false);
 
-    string sep1 = "";
-    string sep2 = " ";
+    string sepnone = "";
+    string septab = "\t";
+    string sepspace = " ";
     string line;
     if (dict.is_open()) {
         while (dict.good()) {
             getline(dict, line);
             if (line.empty())
                 continue;
-            vector<string> tokens = tokenize_utf8_string(&line, &sep2);
-            if (tokens.size() < 2) {
-                cout << "Cannot parse line:" << line << endl;
-                continue;
+            /* First try with tab */
+            vector<string> tokens = tokenize_utf8_string(&line, &septab);
+            if (tokens.size() != 2) {
+                vector<string> tokens = tokenize_utf8_string(&line, &sepspace);
+                if (tokens.size() < 2) {
+                    cout << "Cannot parse line (must use tab or single space "
+                         << "to separate word and phones):" << line << endl;
+                    continue;
+                }
+                vector<string> seq1 = tokenize_utf8_string(&tokens.at(0), &sepnone);
+                vector<string> seq2(tokens.begin() + 1, tokens.end());
+                fstaligner.entry2alignfst(seq1, seq2);
+            }
+            else {
+                vector<string> seq1 = tokenize_utf8_string(&tokens.at(0), &sepnone);
+                vector<string> seq2 = tokenize_utf8_string(&tokens.at(1), &sepspace);
+                fstaligner.entry2alignfst(seq1, seq2);
             }
-            vector <string> seq1 = tokenize_utf8_string(&tokens.at(0), &sep1);
-            vector <string> seq2(tokens.begin() + 1, tokens.end());
-            fstaligner.entry2alignfst(seq1, seq2);
         }
     }
     dict.close();
@@ -465,25 +600,14 @@ align(string input_file, string prefix, bool seq1_del, bool seq2_del,
     for (i = 1; i <= iter; i++) {
         fstaligner.expectation();
         change = fstaligner.maximization(false);
-        cout << "Iteration " << i << ": " << change << endl;
+        cout << "Iteration " << i << " Change: " << change << endl;
     }
     fstaligner.expectation();
     change = fstaligner.maximization(true);
-    cout << "Iteration " << i << ": " << change << endl;
+    cout << "Last iteration: " << change << endl;
 
     cout << "Generating best alignments..." << endl;
-    for (int i = 0; i < fstaligner.fsas.size(); i++) {
-        vector<PathData> paths =
-            fstaligner.write_alignment(fstaligner.fsas[i], 1);
-        for (int k = 0; k < paths.size(); k++) {
-            for (int j = 0; j < paths[k].path.size(); j++) {
-                ofile << paths[k].path[j];
-                //if (j < paths[k].path.size() - 1)
-                ofile << " ";
-            }
-            ofile << endl;
-        }
-    }
+    write_alignments(&fstaligner, ofile, -99.0, 1, false, true);
     ofile.flush();
     ofile.close();
 }
diff --git a/src/programs/g2p_train/main.cpp b/src/programs/g2p_train/main.cpp
index 6a50719c..35526ed7 100644
--- a/src/programs/g2p_train/main.cpp
+++ b/src/programs/g2p_train/main.cpp
@@ -53,7 +53,7 @@ main(int argc, char *argv[])
         {   "-seq1_del", ARG_BOOLEAN, "no",
             "Allow deletions in sequence 1"
         },
-        {   "-seq2_del", ARG_BOOLEAN, "no",
+        {   "-seq2_del", ARG_BOOLEAN, "yes",
             "Allow deletions in sequence 2"
         },
         {   "-noalign", ARG_BOOLEAN, "no",
@@ -71,7 +71,9 @@ main(int argc, char *argv[])
         {   "-iter", ARG_INT32, "10",
             "Maximum number of iterations for EM"
         },
-        {"-order", ARG_INT32, "6", "N-gram order"},
+        {   "-order", ARG_INT32, "5",
+            "N-gram order"
+        },
         {   "-prune", ARG_STRING, "no",
             "Pruning method. Available options are: 'no', 'count_prune', 'relative_entropy', 'seymore'"
         },
@@ -149,6 +151,11 @@ main(int argc, char *argv[])
               eps, skip, iter);
     }
 
+    /* 
+       phonetisaurus-align --input=INPUT --ofile=model.corpus --seq1_del=false --seq2_del=true --seq1_max=2 --seq2_max=2 --grow=false
+       farcompilestrings model.corpus | ngramcount --order=5 | ngrammake
+       phonetisaurus-arpa2wfst # NOTE: this is unnecessary as ngrammake produces the same representation
+    */
     train_model(eps, s1s2_sep, skip, order, smooth, prefix, seq_sep, prune,
                 theta, count_pattern);
 
diff --git a/src/upstream/Phonetisaurus b/src/upstream/Phonetisaurus
new file mode 160000
index 00000000..321443f9
--- /dev/null
+++ b/src/upstream/Phonetisaurus
@@ -0,0 +1 @@
+Subproject commit 321443f948220142d5cac4c9bb94ec766e9c86a0