diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3f61fe4d..ac31ec55 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -10,6 +10,7 @@ jobs: uses: actions/checkout@v3 with: path: sphinxtrain + submodules: recursive - name: Install run: | sudo apt-get install libfst-dev libngram-dev cmake \ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..01e9a7b6 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "src/upstream/Phonetisaurus"] + path = src/upstream/Phonetisaurus + url = git@github.com:cmusphinx/Phonetisaurus.git diff --git a/README.md b/README.md index 5a9d23f3..ece59283 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,10 @@ command above, namely: cmake -S . -B build -DBUILD_G2P=ON +You must also initialize and fetch the git submodules, e.g.: + + git submodule update --init --recursive + You can also enable shared libraries with `-DBUILD_SHARED_LIBS=ON`, but I suggest that you *not* do that unless you have a very good reason. diff --git a/include/sphinxbase/fixpoint.h b/include/sphinxbase/fixpoint.h index 30b5cb20..46132593 100644 --- a/include/sphinxbase/fixpoint.h +++ b/include/sphinxbase/fixpoint.h @@ -55,6 +55,10 @@ extern "C" { } #endif +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + #ifndef DEFAULT_RADIX #define DEFAULT_RADIX 12 #endif diff --git a/include/sphinxbase/prim_type.h b/include/sphinxbase/prim_type.h index aebb1982..9fa12f32 100644 --- a/include/sphinxbase/prim_type.h +++ b/include/sphinxbase/prim_type.h @@ -85,6 +85,10 @@ extern "C" { } /* Fool Emacs into not indenting things. 
*/ #endif +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + /* Define some things for VisualDSP++ */ #if defined(__ADSPBLACKFIN__) && !defined(__GNUC__) # ifndef HAVE_LONG_LONG diff --git a/scripts/0000.g2p_train/calculateER.py b/scripts/0000.g2p_train/calculateER.py index b24f5abc..b0ef6e5a 100755 --- a/scripts/0000.g2p_train/calculateER.py +++ b/scripts/0000.g2p_train/calculateER.py @@ -253,12 +253,12 @@ def split_sequence( self, sequence, usep=" ", fsep="" ): if usep=="": sequences = [ [ unit for unit in list(seq) ] - for seq in re.split( r' {2,}', sequence ) + for seq in re.split( r'(?:\t| {2,})', sequence ) ] else: sequences = [ [ unit for unit in re.split( usep, seq ) ] - for seq in re.split( r' {2,}', sequence ) + for seq in re.split( r'(?:\t| {2,})', sequence ) ] if len(sequences)==1: return sequences[0] @@ -284,7 +284,7 @@ def compute_PER_phonetisaurus( self, hypfile, reffile, usep=" ", fsep="", verbos words = []; hyps = []; refs = [] for i,line in enumerate(open(hypfile,"r")): #There should be three fields - word, score, pron = re.split(r' {2,}', line.strip()) + word, score, pron = re.split(r'(?:\t| +)', line.strip()) phons = re.split(usep, pron) #This assumes that we will never have a test situation # where the input list intentionally contains 2 repetitions @@ -350,7 +350,7 @@ def print_ER( self, totals ): parser.add_argument('--hyp', "-y", help="The file/string containing G2P/ASR hypotheses.", required=True ) parser.add_argument('--ref', "-r", help="The file/string containing G2P/ASR reference transcriptions.", required=True ) parser.add_argument('--usep', "-u", help="Character or regex separating units in a sequence. Defaults to ' '.", required=False, default=" " ) - parser.add_argument('--fsep', "-s", help="Character or regex separating fields in a sequence. Defaults to ' '.", required=False, default="r' {2,}'" ) + parser.add_argument('--fsep', "-s", help="Character or regex separating fields in a sequence. 
Defaults to ' '.", required=False, default="r'(?:\t| {2,})'" ) parser.add_argument('--format', "-f", help="Input format. One of 'cmu', 'htk', 'g2p'. Defaults to 'g2p'.", required=False, default="g2p" ) parser.add_argument('--ignore', "-i", help="Ignore specified characters when encountered in a HYPOTHESIS. A ' ' separated list.", required=False, default="" ) parser.add_argument('--regex_ignore', "-x", help="Ignore specified characters when encountered in a HYPOTHESIS. A regular expression.", required=False, default="" ) diff --git a/scripts/0000.g2p_train/evaluate.py b/scripts/0000.g2p_train/evaluate.py index d9eabc61..f8a5e77f 100755 --- a/scripts/0000.g2p_train/evaluate.py +++ b/scripts/0000.g2p_train/evaluate.py @@ -83,12 +83,12 @@ def evaluate_testset( references = {} for entry in open(referencefile,"r"): # parts = entry.strip().split(" ") - parts = re.split(r' {2,}', entry.strip()) + parts = re.split(r'(?:\t| {2,})', entry.strip()) word = parts.pop(0) references[word] = parts for entry in open(hypothesisfile,"r"): #word, score, hypothesis = entry.strip().split(" ") - word, score, hypothesis = re.split(r' {2,}', entry.strip()) + word, score, hypothesis = re.split(r'(?:\t| {2,})', entry.strip()) PERcalculator = ErrorRater( ignore=ignore, ignore_both=ignore_both, regex_ignore=regex_ignore ) PERcalculator.compute_PER_phonetisaurus( hypothesisfile, referencefile, verbose=verbose ) diff --git a/scripts/0000.g2p_train/g2p_train.pl b/scripts/0000.g2p_train/g2p_train.pl index 12a70b23..74ef81f9 100755 --- a/scripts/0000.g2p_train/g2p_train.pl +++ b/scripts/0000.g2p_train/g2p_train.pl @@ -78,7 +78,10 @@ s/\(\d+\)//; s/^\s*//; s/\s*$//; - s/\s+/ /g; + # Use tab to separate word and pron to be consistent with MFA and Phonetisaurus + s/\s+/\t/; + # Collapse any multiple spaces that remain + s/ +/ /g; print OUTDICT "$_\n"; } close INDICT or die $!; diff --git a/src/programs/g2p_eval/CMakeLists.txt b/src/programs/g2p_eval/CMakeLists.txt index 649e8259..2f662c67 100644 --- 
a/src/programs/g2p_eval/CMakeLists.txt +++ b/src/programs/g2p_eval/CMakeLists.txt @@ -1,9 +1,10 @@ set(PROGRAM g2p_eval) set(SRCS -main.cpp -Phonetisaurus.cpp -phonetisaurus-g2p.cpp -${CMAKE_SOURCE_DIR}/src/programs/g2p_train/FstPathFinder.cpp +main.c +${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/lib/M2MFstAligner.cc +${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/lib/LatticePruner.cc +${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/lib/util.cc +phonetisaurus-g2p.cc ) add_executable(${PROGRAM} ${SRCS}) @@ -11,6 +12,8 @@ target_link_libraries(${PROGRAM} sphinxtrain ${FST} ${NGRAM}) target_include_directories( ${PROGRAM} PRIVATE ${CMAKE_BINARY_DIR} ${PROGRAM} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} + ${PROGRAM} PRIVATE ${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src + ${PROGRAM} PRIVATE ${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/3rdparty/utfcpp ${PROGRAM} PUBLIC ${CMAKE_SOURCE_DIR}/include ${PROGRAM} INTERFACE ${CMAKE_SOURCE_DIR}/include ) diff --git a/src/programs/g2p_eval/Phonetisaurus.cpp b/src/programs/g2p_eval/Phonetisaurus.cpp deleted file mode 100644 index dbb81936..00000000 --- a/src/programs/g2p_eval/Phonetisaurus.cpp +++ /dev/null @@ -1,308 +0,0 @@ -/* - * Phonetisaurus.cpp - * - Copyright (c) [2012-], Josef Robert Novak - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted #provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of #conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, - INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ -#include -#include -#include -#include -#include -#include -#include "../g2p_train/FstPathFinder.hpp" -#include "Phonetisaurus.hpp" - -using namespace fst; - -Phonetisaurus::Phonetisaurus() -{ - //Default constructor -} - -Phonetisaurus::Phonetisaurus(const char *_g2pmodel_file) -{ - //Base constructor. Load the clusters file, the models and setup shop. 
- eps = ""; - sb = ""; - se = ""; - skip = "_"; - - skipSeqs.insert(eps); - skipSeqs.insert(sb); - skipSeqs.insert(se); - skipSeqs.insert(skip); - skipSeqs.insert("-"); - - g2pmodel = StdVectorFst::Read(_g2pmodel_file); - - isyms = (SymbolTable *) g2pmodel->InputSymbols(); - tie = isyms->Find(1); //The separator symbol is reserved for index 1 - - osyms = (SymbolTable *) g2pmodel->OutputSymbols(); - - loadClusters(); - - epsMapper = makeEpsMapper(); - - //We need make sure the g2pmodel is arcsorted - ILabelCompare icomp; - ArcSort(g2pmodel, icomp); -} - - -void -Phonetisaurus::loadClusters() -{ - /* - Load the clusters file containing the list of - subsequences generated during multiple-to-multiple alignment - */ - - for (size_t i = 2; i < isyms->NumSymbols(); i++) { - string sym = isyms->Find(i); - - if (sym.find(tie) != string::npos) { - char *tmpstring = (char *) sym.c_str(); - char *p = strtok(tmpstring, tie.c_str()); - vector cluster; - - while (p) { - cluster.push_back(p); - p = strtok(NULL, tie.c_str()); - } - - clusters[cluster] = i; - } - } - return; -} - -StdVectorFst -Phonetisaurus::makeEpsMapper() -{ - /* - Generate a mapper FST to transform unwanted output symbols - to the epsilon symbol. - - This can be used to remove unwanted symbols from the final - result, but in tests was 7x slower than manual removal - via the FstPathFinder object. 
- */ - - StdVectorFst mfst; - mfst.AddState(); - mfst.SetStart(0); - - set ::iterator sit; - for (size_t i = 0; i < osyms->NumSymbols(); i++) { - string sym = osyms->Find(i); - sit = skipSeqs.find(sym); - if (sit != skipSeqs.end()) - mfst.AddArc(0, StdArc(i, 0, 0, 0)); - else - mfst.AddArc(0, StdArc(i, i, 0, 0)); - } - mfst.SetFinal(0, 0); - ILabelCompare icomp; - ArcSort(&mfst, icomp); - mfst.SetInputSymbols(osyms); - mfst.SetOutputSymbols(osyms); - - return mfst; -} - -StdVectorFst -Phonetisaurus::entryToFSA(vector entry) -{ - /* - Transform an input spelling/pronunciation into an equivalent - FSA, adding extra arcs as needed to accomodate clusters. - */ - - StdVectorFst efst; - efst.AddState(); - efst.SetStart(0); - - efst.AddState(); - efst.AddArc(0, StdArc(isyms->Find(sb), isyms->Find(sb), 0, 1)); - size_t i = 0; - - //Build the basic FSA - for (i = 0; i < entry.size(); i++) { - efst.AddState(); - string ch = entry[i]; - efst.AddArc(i + 1, - StdArc(isyms->Find(ch), isyms->Find(ch), 0, i + 2)); - if (i == 0) - continue; - - } - - //Add any cluster arcs - map,int>::iterator it_i; - for (it_i = clusters.begin(); it_i != clusters.end(); it_i++) { - vector::iterator it_j; - vector::iterator start = entry.begin(); - vector cluster = (*it_i).first; - while (it_j != entry.end()) { - it_j = - search(start, entry.end(), cluster.begin(), cluster.end()); - if (it_j != entry.end()) { - efst.AddArc(it_j - entry.begin() + 1, StdArc((*it_i).second, //input symbol - (*it_i).second, //output symbol - 0, //weight - it_j - entry.begin() + cluster.size() + 1 //destination state - )); - start = it_j + cluster.size(); - } - } - } - - efst.AddState(); - efst.AddArc(i + 1, StdArc(isyms->Find(se), isyms->Find(se), 0, i + 2)); - efst.SetFinal(i + 2, 0); - efst.SetInputSymbols(isyms); - efst.SetOutputSymbols(isyms); - return efst; -} - -vector Phonetisaurus::phoneticize(vector entry, - int nbest, int beam) -{ - /* - Generate pronunciation/spelling hypotheses for an - input entry. 
- */ - StdVectorFst - result; - StdVectorFst - epsMapped; - StdVectorFst - shortest; - StdVectorFst - efst = entryToFSA(entry); - StdVectorFst - smbr; - Compose(efst, *g2pmodel, &result); - - Project(&result, PROJECT_OUTPUT); - if (nbest > 1) { - //This is a cheesy hack. - ShortestPath(result, &shortest, beam); - } - else { - ShortestPath(result, &shortest, 1); - } - RmEpsilon(&shortest); - FstPathFinder - pathfinder(skipSeqs); - pathfinder.findAllStrings(shortest); - - return pathfinder.paths; -} - -void -printPath(PathData * path, string onepath, int k, ofstream * hypfile, - string correct, string word, bool output_cost) -{ - if (word != "") { - if (k != 0) { - *hypfile << word << "(" << (k + 1) << ")" << " "; - } - else { - *hypfile << word << " "; - } - } - if (output_cost) { - if (path) { - *hypfile << path->pathcost << " " << onepath; - } - else { - *hypfile << "999.999 " << onepath; - } - } - else { - *hypfile << onepath; - } - if (correct != "") - *hypfile << " " << correct; - *hypfile << "\n"; -} - -bool -Phonetisaurus::printPaths(vector paths, int nbest, - ofstream * hypfile, string correct, string word, - bool output_cost) -{ - /* - Convenience function to print out a path vector. - Will print only the first N unique entries. 
- */ - - set seen; - set ::iterator sit; - - int numseen = 0; - string onepath; - size_t k; - bool empty_path = true; - for (k = 0; k < paths.size(); k++) { - if (k >= nbest) - break; - - size_t j; - for (j = 0; j < paths[k].path.size(); j++) { - if (paths[k].path[j] != tie) - replace(paths[k].path[j].begin(), - paths[k].path[j].end(), *tie.c_str(), ' '); - onepath += paths[k].path[j]; - - if (j != paths[k].path.size() - 1) - onepath += " "; - } - if (onepath == "") { - continue; - } - empty_path = false; - printPath(&paths[k], onepath, k, hypfile, correct, word, - output_cost); - onepath = ""; - } - if (empty_path) { - if (k == 0) { - printPath(NULL, "-", 0, hypfile, correct, word, output_cost); - } - else { - printPath(&paths[0], "-", 0, hypfile, correct, word, - output_cost); - } - } - - return empty_path; -} diff --git a/src/programs/g2p_eval/Phonetisaurus.hpp b/src/programs/g2p_eval/Phonetisaurus.hpp deleted file mode 100644 index 45d5939f..00000000 --- a/src/programs/g2p_eval/Phonetisaurus.hpp +++ /dev/null @@ -1,78 +0,0 @@ -/* - Phonetisaurus.hpp - - Copyright (c) [2012-], Josef Robert Novak - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted #provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of #conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, - INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - OF THE POSSIBILITY OF SUCH DAMAGE. -* -*/ -#ifndef PHONETISAURUS_H -#define PHONETISAURUS_H -#include -#include "../g2p_train/FstPathFinder.hpp" -using namespace fst; -typedef PhiMatcher > > PM; - -class Phonetisaurus { - /* - Load a G2P/P2G model and generate pronunciation/spelling - hypotheses for input items. - */ -public: - //Basics - string eps; - string se; - string sb; - string skip; - string tie; - set skipSeqs; - map,int> clusters; - //FST stuff - StdVectorFst *g2pmodel; - StdVectorFst epsMapper; - SymbolTable *isyms; - SymbolTable *osyms; - - Phonetisaurus(); - - Phonetisaurus(const char *_g2pmodel_file); - - StdVectorFst entryToFSA(vector entry); - - StdVectorFst makeEpsMapper(); - - vector phoneticize(vector entry, int nbest, - int beam = 500); - - bool printPaths(vector paths, int nbest, - ofstream * hypfile, string correct = "", string word = - "", bool output_cost = true); - -private: - void loadClusters(); -}; - -#endif // PHONETISAURUS_H // diff --git a/src/programs/g2p_eval/main.cpp b/src/programs/g2p_eval/main.c similarity index 77% rename from src/programs/g2p_eval/main.cpp rename to src/programs/g2p_eval/main.c index b05e00f3..ff954806 100644 --- a/src/programs/g2p_eval/main.cpp +++ b/src/programs/g2p_eval/main.c @@ -1,9 +1,7 @@ +#include "phonetisaurus-g2p.h" #include -#include "phonetisaurus-g2p.hpp" -using namespace std; - const char helpstr[] = "Usage: g2p_eval -model MODEL -input INPUT [-output OUTPUT] [-isfile] [-output_cost] \n\ [-nbest NBEST] [-beam 
BEAM] [-sep SEP] [-words] \n\ @@ -53,25 +51,23 @@ main(int argc, char *argv[]) printf("%s\n\n", helpstr); } - string model = cmd_ln_str("-model"); - string input = cmd_ln_str("-input"); - string output = cmd_ln_str("-output"); - bool output_cost = cmd_ln_boolean("-output_cost"); - bool isfile = cmd_ln_boolean("-isfile"); + const char * model = cmd_ln_str("-model"); + const char * input = cmd_ln_str("-input"); + const char * output = cmd_ln_str("-output"); + int output_cost = cmd_ln_boolean("-output_cost"); + int isfile = cmd_ln_boolean("-isfile"); int nbest = cmd_ln_int32("-nbest"); int beam = cmd_ln_int32("-beam"); - string sep = cmd_ln_str("-sep"); - bool words = cmd_ln_boolean("-words"); + const char * sep = cmd_ln_str("-sep"); + int words = cmd_ln_boolean("-words"); if (isfile) { - //If its a file, go for it - phoneticizeTestSet(model.c_str(), output.c_str(), input, nbest, + phoneticizeTestSet(model, output, input, nbest, sep, beam, words, output_cost); } else { - //Otherwise we just have a word - phoneticizeWord(model.c_str(), output.c_str(), input, nbest, sep, - beam, words); + phoneticizeWord(model, output, input, nbest, sep, + beam, words, output_cost); } return 0; diff --git a/src/programs/g2p_eval/phonetisaurus-g2p.cc b/src/programs/g2p_eval/phonetisaurus-g2p.cc new file mode 100644 index 00000000..3d4b3231 --- /dev/null +++ b/src/programs/g2p_eval/phonetisaurus-g2p.cc @@ -0,0 +1,107 @@ +/* + phonetisaurus-g2pfst.cc + + Copyright (c) [2012-], Josef Robert Novak + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted #provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above + copyright notice, this list of #conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ +using namespace std; +#include +#include +#include +#include +using namespace fst; + +#include "phonetisaurus-g2p.h" + +typedef unordered_map > RMAP; + +void PrintPathData(ofstream &output, + const vector& results, string FLAGS_word, + const SymbolTable* osyms, bool print_scores, + bool nlog_probs, bool output_words) { + for (int i = 0; i < results.size(); i++) { + if (output_words) + output << FLAGS_word << "\t"; + if (print_scores == true) { + if (nlog_probs == true) + output << results[i].PathWeight << "\t"; + else + output << std::setprecision (3) << exp(-results[i].PathWeight) << "\t"; + } + + for (int j = 0; j < results[i].Uniques.size(); j++) { + output << osyms->Find(results[i].Uniques[j]); + if (j < results[i].Uniques.size() - 1) + output << " "; + } + output << endl; + } +} + +extern "C" +void +phoneticizeTestSet(const char *g2pmodel_file, const char *output, + const char *testset_file, int nbest, const char *sep, + int beam, int output_words, int output_cost) +{ + 
PhonetisaurusScript decoder(g2pmodel_file, sep); + + vector corpus; + bool write_fsts = false; + bool accumulate = false; + double pmass = 99.0; + LoadWordList(testset_file, &corpus); + ofstream hypfile; + hypfile.open(output); + for (int i = 0; i < corpus.size(); i++) { + vector results = decoder.Phoneticize(corpus[i], nbest, + beam, 99.0, + write_fsts, + accumulate, pmass); + PrintPathData(hypfile, results, corpus[i], + decoder.osyms_, output_cost, true, output_words); + } +} + +extern "C" +void +phoneticizeWord(const char *g2pmodel_file, const char *output, + const char *testword, int nbest, const char *sep, int beam, + int output_words, int output_cost) +{ + PhonetisaurusScript decoder(g2pmodel_file, sep); + bool write_fsts = false; + bool accumulate = false; + vector results = decoder.Phoneticize(testword, nbest, + beam, 99.0, write_fsts, accumulate, 0.0); + ofstream hypfile; + hypfile.open(output); + + PrintPathData(hypfile, results, testword, + decoder.osyms_, output_cost, true, output_words); +} diff --git a/src/programs/g2p_eval/phonetisaurus-g2p.cpp b/src/programs/g2p_eval/phonetisaurus-g2p.cpp deleted file mode 100644 index 6e21627a..00000000 --- a/src/programs/g2p_eval/phonetisaurus-g2p.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* - Copyright (c) [2012-], Josef Robert Novak - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted #provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of #conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, - INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include -#include "Phonetisaurus.hpp" -#include "util.hpp" - -using namespace fst; - -void -phoneticizeWord(const char *g2pmodel_file, const char *output, - string testword, int nbest, string sep, int beam = 500, - int output_words = 0) -{ - - Phonetisaurus phonetisaurus(g2pmodel_file); - - vector entry = - tokenize_entry(&testword, &sep, phonetisaurus.isyms); - - vector paths = - phonetisaurus.phoneticize(entry, nbest, beam); - ofstream hypfile; - hypfile.open(output); - - if (output_words == 0) { - while (phonetisaurus.printPaths(paths, nbest, &hypfile) == true - && nbest <= paths.size()) { - nbest++; - paths = phonetisaurus.phoneticize(entry, nbest, beam); - } - } - else { - while (phonetisaurus. 
- printPaths(paths, nbest, &hypfile, "", testword) - == true && nbest <= paths.size()) { - nbest++; - paths = phonetisaurus.phoneticize(entry, nbest, beam); - } - } - hypfile.flush(); - hypfile.close(); - - return; -} - -void -phoneticizeTestSet(const char *g2pmodel_file, const char *output, - string testset_file, int nbest, string sep, int beam = - 500, int output_words = 0, bool output_cost = true) -{ - - Phonetisaurus phonetisaurus(g2pmodel_file); - - ifstream test_fp; - test_fp.open(testset_file.c_str()); - string line; - - if (test_fp.is_open()) { - ofstream hypfile; - hypfile.open(output); - while (test_fp.good()) { - getline(test_fp, line); - if (line.compare("") == 0) - continue; - - char *tmpstring = (char *) line.c_str(); - char *p = strtok(tmpstring, "\t"); - string word; - string pron; - - int i = 0; - while (p) { - if (i == 0) - word = p; - else - pron = p; - i++; - p = strtok(NULL, "\t"); - } - - vector entry = tokenize_entry(&word, &sep, - phonetisaurus.isyms); - vector paths = - phonetisaurus.phoneticize(entry, nbest, beam); - int nbest_new = nbest; - if (output_words == 0) { - while (phonetisaurus. - printPaths(paths, nbest_new, &hypfile, output, - pron) == true - && nbest_new <= paths.size()) { - nbest_new++; - paths = - phonetisaurus.phoneticize(entry, nbest_new, beam); - } - } - else { - while (phonetisaurus. - printPaths(paths, nbest_new, &hypfile, pron, word, - output_cost) == true - && nbest_new <= paths.size()) { - nbest_new++; - paths = - phonetisaurus.phoneticize(entry, nbest_new, beam); - } - } - } - test_fp.close(); - hypfile.flush(); - hypfile.close(); - } - else { - cout << "Problem opening test file..." 
<< endl; - } - - return; -} diff --git a/src/programs/g2p_eval/phonetisaurus-g2p.h b/src/programs/g2p_eval/phonetisaurus-g2p.h new file mode 100644 index 00000000..b6c3d2d6 --- /dev/null +++ b/src/programs/g2p_eval/phonetisaurus-g2p.h @@ -0,0 +1,53 @@ +/* + phonetisaurus-g2pfst.h + + Copyright (c) [2012-], Josef Robert Novak + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted #provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of #conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ +#ifndef __PHONETISAURUS_G2PFST_H__ +#define __PHONETISAURUS_G2PFST_H__ + +#ifdef __cplusplus +extern "C" { +#endif +#if 0 +} /* Fool Emacs into not indenting things. 
*/ +#endif + +void phoneticizeWord(const char *g2pmodel_file, const char *output, + const char *testword, int nbest, const char *sep, int beam, + int output_words, int output_cost); +void phoneticizeTestSet(const char *g2pmodel_file, const char *output, + const char *testset_file, int nbest, const char *sep, + int beam, int output_words, int output_cost); + +#ifdef __cplusplus +} +#endif + +#endif /* __PHONETISAURUS_G2PFST_H__ */ diff --git a/src/programs/g2p_eval/phonetisaurus-g2p.hpp b/src/programs/g2p_eval/phonetisaurus-g2p.hpp deleted file mode 100644 index ad8be85c..00000000 --- a/src/programs/g2p_eval/phonetisaurus-g2p.hpp +++ /dev/null @@ -1,11 +0,0 @@ -#include - -using namespace std; - -void phoneticizeWord(const char *g2pmodel_file, const char *output, - string testword, int nbest, string sep, int beam = - 500, int output_words = 0); -void phoneticizeTestSet(const char *g2pmodel_file, const char *output, - string testset_file, int nbest, string sep, - int beam = 500, int output_words = - 0, bool output_cost = true); diff --git a/src/programs/g2p_eval/utf8.h b/src/programs/g2p_eval/utf8.h deleted file mode 100644 index 4e445140..00000000 --- a/src/programs/g2p_eval/utf8.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are 
solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "utf8/checked.h" -#include "utf8/unchecked.h" - -#endif // header guard diff --git a/src/programs/g2p_eval/utf8/checked.h b/src/programs/g2p_eval/utf8/checked.h deleted file mode 100644 index 9cb8d2c7..00000000 --- a/src/programs/g2p_eval/utf8/checked.h +++ /dev/null @@ -1,327 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" -#include - -namespace utf8 -{ - // Base for the exceptions that may be thrown from the library - class exception : public std::exception { - }; - - // Exceptions that may be thrown from the library functions. - class invalid_code_point : public exception { - uint32_t cp; - public: - invalid_code_point(uint32_t cp) : cp(cp) {} - virtual const char* what() const throw() { return "Invalid code point"; } - uint32_t code_point() const {return cp;} - }; - - class invalid_utf8 : public exception { - uint8_t u8; - public: - invalid_utf8 (uint8_t u) : u8(u) {} - virtual const char* what() const throw() { return "Invalid UTF-8"; } - uint8_t utf8_octet() const {return u8;} - }; - - class invalid_utf16 : public exception { - uint16_t u16; - public: - invalid_utf16 (uint16_t u) : u16(u) {} - virtual const char* what() const throw() { return "Invalid UTF-16"; } - uint16_t utf16_word() const {return u16;} - }; - - class not_enough_room : public exception { - public: - virtual const char* what() const throw() { return "Not enough space"; } - }; - - /// The library API - functions intended to be called by the users - - template - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) - { - while (start != end) { - octet_iterator sequence_start = start; - internal::utf_error err_code 
= internal::validate_next(start, end); - switch (err_code) { - case internal::UTF8_OK : - for (octet_iterator it = sequence_start; it != start; ++it) - *out++ = *it; - break; - case internal::NOT_ENOUGH_ROOM: - throw not_enough_room(); - case internal::INVALID_LEAD: - append (replacement, out); - ++start; - break; - case internal::INCOMPLETE_SEQUENCE: - case internal::OVERLONG_SEQUENCE: - case internal::INVALID_CODE_POINT: - append (replacement, out); - ++start; - // just one replacement mark for the sequence - while (internal::is_trail(*start) && start != end) - ++start; - break; - } - } - return out; - } - - template - inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) - { - static const uint32_t replacement_marker = internal::mask16(0xfffd); - return replace_invalid(start, end, out, replacement_marker); - } - - template - octet_iterator append(uint32_t cp, octet_iterator result) - { - if (!internal::is_code_point_valid(cp)) - throw invalid_code_point(cp); - - if (cp < 0x80) // one octet - *(result++) = static_cast(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - return result; - } - - template - uint32_t next(octet_iterator& it, octet_iterator end) - { - uint32_t cp = 0; - internal::utf_error err_code = internal::validate_next(it, end, &cp); - switch (err_code) { - case internal::UTF8_OK : - break; - case internal::NOT_ENOUGH_ROOM : - throw not_enough_room(); - case 
internal::INVALID_LEAD : - case internal::INCOMPLETE_SEQUENCE : - case internal::OVERLONG_SEQUENCE : - throw invalid_utf8(*it); - case internal::INVALID_CODE_POINT : - throw invalid_code_point(cp); - } - return cp; - } - - template - uint32_t peek_next(octet_iterator it, octet_iterator end) - { - return next(it, end); - } - - template - uint32_t prior(octet_iterator& it, octet_iterator start) - { - // can't do much if it == start - if (it == start) - throw not_enough_room(); - - octet_iterator end = it; - // Go back until we hit either a lead octet or start - while (internal::is_trail(*(--it))) - if (it == start) - throw invalid_utf8(*it); // error - no lead byte in the sequence - return peek_next(it, end); - } - - /// Deprecated in versions that include "prior" - template - uint32_t previous(octet_iterator& it, octet_iterator pass_start) - { - octet_iterator end = it; - while (internal::is_trail(*(--it))) - if (it == pass_start) - throw invalid_utf8(*it); // error - no lead byte in the sequence - octet_iterator temp = it; - return next(temp, end); - } - - template - void advance (octet_iterator& it, distance_type n, octet_iterator end) - { - for (distance_type i = 0; i < n; ++i) - next(it, end); - } - - template - typename std::iterator_traits::difference_type - distance (octet_iterator first, octet_iterator last) - { - typename std::iterator_traits::difference_type dist; - for (dist = 0; first < last; ++dist) - next(first, last); - return dist; - } - - template - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - uint32_t cp = internal::mask16(*start++); - // Take care of surrogate pairs first - if (internal::is_lead_surrogate(cp)) { - if (start != end) { - uint32_t trail_surrogate = internal::mask16(*start++); - if (internal::is_trail_surrogate(trail_surrogate)) - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - else - throw invalid_utf16(static_cast(trail_surrogate)); - } - 
else - throw invalid_utf16(static_cast(cp)); - - } - // Lone trail surrogate - else if (internal::is_trail_surrogate(cp)) - throw invalid_utf16(static_cast(cp)); - - result = append(cp, result); - } - return result; - } - - template - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start != end) { - uint32_t cp = next(start, end); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast(cp); - } - return result; - } - - template - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = append(*(start++), result); - - return result; - } - - template - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start != end) - (*result++) = next(start, end); - - return result; - } - - // The iterator class - template - class iterator : public std::iterator { - octet_iterator it; - octet_iterator range_start; - octet_iterator range_end; - public: - iterator () {}; - explicit iterator (const octet_iterator& octet_it, - const octet_iterator& range_start, - const octet_iterator& range_end) : - it(octet_it), range_start(range_start), range_end(range_end) - { - if (it < range_start || it > range_end) - throw std::out_of_range("Invalid utf-8 iterator position"); - } - // the default "big three" are OK - octet_iterator base () const { return it; } - uint32_t operator * () const - { - octet_iterator temp = it; - return next(temp, range_end); - } - bool operator == (const iterator& rhs) const - { - if (range_start != rhs.range_start || range_end != rhs.range_end) - throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator 
== (rhs)); - } - iterator& operator ++ () - { - next(it, range_end); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - next(it, range_end); - return temp; - } - iterator& operator -- () - { - prior(it, range_start); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - prior(it, range_start); - return temp; - } - }; // class iterator - -} // namespace utf8 - -#endif //header guard - - diff --git a/src/programs/g2p_eval/utf8/core.h b/src/programs/g2p_eval/utf8/core.h deleted file mode 100755 index 268cf7cd..00000000 --- a/src/programs/g2p_eval/utf8/core.h +++ /dev/null @@ -1,358 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. 
-*/ - - -#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include - -namespace utf8 -{ - // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers - // You may need to change them to match your system. - // These typedefs have the same names as ones from cstdint, or boost/cstdint - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; - -// Helper code - not intended to be directly called by the library users. May be changed at any time -namespace internal -{ - // Unicode constants - // Leading (high) surrogates: 0xd800 - 0xdbff - // Trailing (low) surrogates: 0xdc00 - 0xdfff - const uint16_t LEAD_SURROGATE_MIN = 0xd800u; - const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; - const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; - const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; - const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); - const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; - - // Maximum valid value for a Unicode code point - const uint32_t CODE_POINT_MAX = 0x0010ffffu; - - template - inline uint8_t mask8(octet_type oc) - { - return static_cast(0xff & oc); - } - template - inline uint16_t mask16(u16_type oc) - { - return static_cast(0xffff & oc); - } - template - inline bool is_trail(octet_type oc) - { - return ((mask8(oc) >> 6) == 0x2); - } - - template - inline bool is_lead_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); - } - - template - inline bool is_trail_surrogate(u16 cp) - { - return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } - - template - inline bool is_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } - - template - inline bool is_code_point_valid(u32 cp) - { - return (cp <= CODE_POINT_MAX && !is_surrogate(cp)); - } - - template - inline typename 
std::iterator_traits::difference_type - sequence_length(octet_iterator lead_it) - { - uint8_t lead = mask8(*lead_it); - if (lead < 0x80) - return 1; - else if ((lead >> 5) == 0x6) - return 2; - else if ((lead >> 4) == 0xe) - return 3; - else if ((lead >> 3) == 0x1e) - return 4; - else - return 0; - } - - template - inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) - { - if (cp < 0x80) { - if (length != 1) - return true; - } - else if (cp < 0x800) { - if (length != 2) - return true; - } - else if (cp < 0x10000) { - if (length != 3) - return true; - } - - return false; - } - - enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; - - /// get_sequence_x functions decode utf-8 sequences of the length x - - template - utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point) - { - if (it != end) { - if (code_point) - *code_point = mask8(*it); - return UTF8_OK; - } - return NOT_ENOUGH_ROOM; - } - - template - utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point) - { - utf_error ret_code = NOT_ENOUGH_ROOM; - - if (it != end) { - uint32_t cp = mask8(*it); - if (++it != end) { - if (is_trail(*it)) { - cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); - - if (code_point) - *code_point = cp; - ret_code = UTF8_OK; - } - else - ret_code = INCOMPLETE_SEQUENCE; - } - else - ret_code = NOT_ENOUGH_ROOM; - } - - return ret_code; - } - - template - utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point) - { - utf_error ret_code = NOT_ENOUGH_ROOM; - - if (it != end) { - uint32_t cp = mask8(*it); - if (++it != end) { - if (is_trail(*it)) { - cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff); - if (++it != end) { - if (is_trail(*it)) { - cp += (*it) & 0x3f; - - if (code_point) - *code_point = cp; - ret_code = UTF8_OK; - } - else - ret_code = INCOMPLETE_SEQUENCE; - } - else - ret_code = NOT_ENOUGH_ROOM; - } - 
else - ret_code = INCOMPLETE_SEQUENCE; - } - else - ret_code = NOT_ENOUGH_ROOM; - } - - return ret_code; - } - - template - utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point) - { - utf_error ret_code = NOT_ENOUGH_ROOM; - - if (it != end) { - uint32_t cp = mask8(*it); - if (++it != end) { - if (is_trail(*it)) { - cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff); - if (++it != end) { - if (is_trail(*it)) { - cp += (mask8(*it) << 6) & 0xfff; - if (++it != end) { - if (is_trail(*it)) { - cp += (*it) & 0x3f; - - if (code_point) - *code_point = cp; - ret_code = UTF8_OK; - } - else - ret_code = INCOMPLETE_SEQUENCE; - } - else - ret_code = NOT_ENOUGH_ROOM; - } - else - ret_code = INCOMPLETE_SEQUENCE; - } - else - ret_code = NOT_ENOUGH_ROOM; - } - else - ret_code = INCOMPLETE_SEQUENCE; - } - else - ret_code = NOT_ENOUGH_ROOM; - } - - return ret_code; - } - - template - utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point) - { - // Save the original value of it so we can go back in case of failure - // Of course, it does not make much sense with i.e. stream iterators - octet_iterator original_it = it; - - uint32_t cp = 0; - // Determine the sequence length based on the lead octet - typedef typename std::iterator_traits::difference_type octet_difference_type; - octet_difference_type length = sequence_length(it); - if (length == 0) - return INVALID_LEAD; - - // Now that we have a valid sequence length, get trail octets and calculate the code point - utf_error err = UTF8_OK; - switch (length) { - case 1: - err = get_sequence_1(it, end, &cp); - break; - case 2: - err = get_sequence_2(it, end, &cp); - break; - case 3: - err = get_sequence_3(it, end, &cp); - break; - case 4: - err = get_sequence_4(it, end, &cp); - break; - } - - if (err == UTF8_OK) { - // Decoding succeeded. Now, security checks... - if (is_code_point_valid(cp)) { - if (!is_overlong_sequence(cp, length)){ - // Passed! Return here. 
- if (code_point) - *code_point = cp; - ++it; - return UTF8_OK; - } - else - err = OVERLONG_SEQUENCE; - } - else - err = INVALID_CODE_POINT; - } - - // Failure branch - restore the original value of the iterator - it = original_it; - return err; - } - - template - inline utf_error validate_next(octet_iterator& it, octet_iterator end) { - return validate_next(it, end, 0); - } - -} // namespace internal - - /// The library API - functions intended to be called by the users - - // Byte order mark - const uint8_t bom[] = {0xef, 0xbb, 0xbf}; - - template - octet_iterator find_invalid(octet_iterator start, octet_iterator end) - { - octet_iterator result = start; - while (result != end) { - internal::utf_error err_code = internal::validate_next(result, end); - if (err_code != internal::UTF8_OK) - return result; - } - return result; - } - - template - inline bool is_valid(octet_iterator start, octet_iterator end) - { - return (find_invalid(start, end) == end); - } - - template - inline bool starts_with_bom (octet_iterator it, octet_iterator end) - { - return ( - ((it != end) && (internal::mask8(*it++)) == bom[0]) && - ((it != end) && (internal::mask8(*it++)) == bom[1]) && - ((it != end) && (internal::mask8(*it)) == bom[2]) - ); - } - - //Deprecated in release 2.3 - template - inline bool is_bom (octet_iterator it) - { - return ( - (internal::mask8(*it++)) == bom[0] && - (internal::mask8(*it++)) == bom[1] && - (internal::mask8(*it)) == bom[2] - ); - } -} // namespace utf8 - -#endif // header guard - - diff --git a/src/programs/g2p_eval/utf8/unchecked.h b/src/programs/g2p_eval/utf8/unchecked.h deleted file mode 100755 index 2f3eb4d1..00000000 --- a/src/programs/g2p_eval/utf8/unchecked.h +++ /dev/null @@ -1,228 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, 
distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. 
-*/ - - -#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" - -namespace utf8 -{ - namespace unchecked - { - template - octet_iterator append(uint32_t cp, octet_iterator result) - { - if (cp < 0x80) // one octet - *(result++) = static_cast(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - return result; - } - - template - uint32_t next(octet_iterator& it) - { - uint32_t cp = internal::mask8(*it); - typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); - switch (length) { - case 1: - break; - case 2: - it++; - cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); - break; - case 3: - ++it; - cp = ((cp << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 0xfff); - ++it; - cp += (*it) & 0x3f; - break; - case 4: - ++it; - cp = ((cp << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff); - ++it; - cp += (internal::mask8(*it) << 6) & 0xfff; - ++it; - cp += (*it) & 0x3f; - break; - } - ++it; - return cp; - } - - template - uint32_t peek_next(octet_iterator it) - { - return next(it); - } - - template - uint32_t prior(octet_iterator& it) - { - while (internal::is_trail(*(--it))) ; - octet_iterator temp = it; - return next(temp); - } - - // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) - template - inline uint32_t previous(octet_iterator& it) - { - return 
prior(it); - } - - template - void advance (octet_iterator& it, distance_type n) - { - for (distance_type i = 0; i < n; ++i) - next(it); - } - - template - typename std::iterator_traits::difference_type - distance (octet_iterator first, octet_iterator last) - { - typename std::iterator_traits::difference_type dist; - for (dist = 0; first < last; ++dist) - next(first); - return dist; - } - - template - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - uint32_t cp = internal::mask16(*start++); - // Take care of surrogate pairs first - if (internal::is_lead_surrogate(cp)) { - uint32_t trail_surrogate = internal::mask16(*start++); - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - } - result = append(cp, result); - } - return result; - } - - template - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start < end) { - uint32_t cp = next(start); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast(cp); - } - return result; - } - - template - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = append(*(start++), result); - - return result; - } - - template - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start < end) - (*result++) = next(start); - - return result; - } - - // The iterator class - template - class iterator : public std::iterator { - octet_iterator it; - public: - iterator () {}; - explicit iterator (const octet_iterator& octet_it): it(octet_it) {} - // the default "big three" are OK - octet_iterator base () const { return it; } - uint32_t operator * () const - { - octet_iterator temp = it; - return next(temp); - } - 
bool operator == (const iterator& rhs) const - { - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator == (rhs)); - } - iterator& operator ++ () - { - std::advance(it, internal::sequence_length(it)); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - std::advance(it, internal::sequence_length(it)); - return temp; - } - iterator& operator -- () - { - prior(it); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - prior(it); - return temp; - } - }; // class iterator - - } // namespace utf8::unchecked -} // namespace utf8 - - -#endif // header guard - diff --git a/src/programs/g2p_eval/util.hpp b/src/programs/g2p_eval/util.hpp deleted file mode 100644 index 21aa0186..00000000 --- a/src/programs/g2p_eval/util.hpp +++ /dev/null @@ -1,100 +0,0 @@ -/* - Copyright (c) [2012-], Josef Robert Novak - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted #provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of #conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, - INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - OF THE POSSIBILITY OF SUCH DAMAGE. -* -*/ -#include -#include "utf8.h" -using namespace fst; -using namespace std; - -string -convertInt(int number) -{ - stringstream ss; //create a stringstream - ss << number; //add number to the stream - return ss.str(); //return a string with the contents of the stream -} - -vector tokenize_utf8_string(string * utf8_string, - string * delimiter) -{ - /* - Support for tokenizing a utf-8 string. Adapted to also support a delimiter. - Note that leading, trailing or multiple consecutive delimiters will result in - empty vector elements. Normally should not be a problem but just in case. - FIXME: NO, IT IS A SERIOUS PROBLEM!!! WTF!!! WORST TOKENIZER EVER!!! - Also note that any tokens that cannot be found in the model symbol table will be - deleted from the input word prior to grapheme-to-phoneme conversion. 
- - http://stackoverflow.com/questions/2852895/c-iterate-or-split-utf-8-string-into-array-of-symbols#2856241 - */ - char *str = (char *) utf8_string->c_str(); // utf-8 string - char *str_i = str; // string iterator - char *str_j = str; - char *end = str + strlen(str) + 1; // end iterator - vector string_vec; - if (delimiter->compare("") != 0) - string_vec.push_back(""); - - do { - str_j = str_i; - uint32_t code = utf8::next(str_i, end); // get 32 bit code of a utf-8 symbol - if (code == 0) - continue; - int start = strlen(str) - strlen(str_j); - int end = strlen(str) - strlen(str_i); - int len = end - start; - - if (delimiter->compare("") == 0) { - string_vec.push_back(utf8_string->substr(start, len)); - } - else { - if (delimiter->compare(utf8_string->substr(start, len)) == 0) - string_vec.push_back(""); - else - string_vec[string_vec.size() - 1] += - utf8_string->substr(start, len); - } - } while (str_i < end); - - return string_vec; -} - -vector tokenize_entry(string * testword, string * sep, - SymbolTable * syms) -{ - vector tokens = tokenize_utf8_string(testword, sep); - vector entry; - for (int i = 0; i < tokens.size(); i++) { - if (syms->Find(tokens.at(i)) != -1) { - entry.push_back(tokens.at(i)); - } - } - - return entry; -} diff --git a/src/programs/g2p_train/CMakeLists.txt b/src/programs/g2p_train/CMakeLists.txt index 14bd7f96..6a9cd4b1 100644 --- a/src/programs/g2p_train/CMakeLists.txt +++ b/src/programs/g2p_train/CMakeLists.txt @@ -1,9 +1,10 @@ set(PROGRAM g2p_train) set(SRCS -FstPathFinder.cpp g2p_train.cpp -M2MFstAligner.cpp main.cpp +${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/lib/M2MFstAligner.cc +${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/lib/LatticePruner.cc +${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/lib/util.cc ) add_executable(${PROGRAM} ${SRCS}) @@ -12,6 +13,8 @@ target_link_libraries(${PROGRAM} sphinxtrain target_include_directories( ${PROGRAM} PRIVATE ${CMAKE_BINARY_DIR} ${PROGRAM} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} + 
${PROGRAM} PRIVATE ${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src + ${PROGRAM} PRIVATE ${CMAKE_SOURCE_DIR}/src/upstream/Phonetisaurus/src/3rdparty/utfcpp ${PROGRAM} PUBLIC ${CMAKE_SOURCE_DIR}/include ${PROGRAM} INTERFACE ${CMAKE_SOURCE_DIR}/include ) diff --git a/src/programs/g2p_train/FstPathFinder.cpp b/src/programs/g2p_train/FstPathFinder.cpp deleted file mode 100644 index 5983cbe1..00000000 --- a/src/programs/g2p_train/FstPathFinder.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* - FstPathFinder.cpp - - Copyright (c) [2012-], Josef Robert Novak - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted #provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of #conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, - INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - OF THE POSSIBILITY OF SUCH DAMAGE. - ---------------- - Original author: chris taylor - - OpenFst forum post title: "Natural code for printing all strings accepted by an FST?" 
- OpenFst forum post link: http://openfst.cs.nyu.edu/twiki/bin/view/Forum/FstForum#Natural_code_for_printing_all_st - - ---------------- - - 2011-04-07: Modified by Josef Novak - - Modified to build a 'paths' object to store the individual paths - and associated weights, rather than just print them out from - inside the class. Useful if you want to return the paths for further - processing. -*/ - -#include "FstPathFinder.hpp" - -FstPathFinder::FstPathFinder() -{ - //Default constructor -} - -FstPathFinder::FstPathFinder(set skipset) -{ - //Constructor for a non-empty skipset - skipSeqs = skipset; -} - -void -FstPathFinder::findAllStrings(VectorFst &fst) -{ - /* - Main search function. Initiates the WFSA traversal. - We are making three potentially dangerous assumptions - here regarding the input FST: - - 1. It has *ALREADY* been run through the shortestpath algorithm - *This guarantees the the FST is acyclic and that the paths are - sorted according to path cost. - 2. It has *ALREADY* been projected - *This just saves us some hassle. - 3. The symbol tables have been stored in the input FST - *This just saves us some hassle. - - If the input FST does not meet these conditions this will - cause problems. - */ - - vector path; - if (fst.InputSymbols() != NULL) - isyms = (SymbolTable *) fst.InputSymbols(); - findAllStringsHelper(fst, fst.Start(), path, TropicalWeight::One()); - - return; -} - -void -FstPathFinder::addOrDiscardPath(PathData pdata) -{ - /* - Determine whether or not the input path has been added - to the paths vector or not. If it hasn't, add it, otherwise - discard it. 
- */ - - set< vector >::iterator sit; - sit = uniqueStrings.find(pdata.path); - - if (sit == uniqueStrings.end()) { - paths.push_back(pdata); - uniqueStrings.insert(pdata.path); - } - return; -} - -void -FstPathFinder::findAllStringsHelper(VectorFst &fst, int state, - vector &path, - TropicalWeight cost) -{ - /* - Recursively traverse the WFSA and build up a vector of - unique paths and associated costs. - */ - - if (fst.Final(state) != TropicalWeight::Zero()) { - - PathData pdata; - pdata.path = path; - pdata.pathcost = Times(cost, fst.Final(state)).Value(); - - addOrDiscardPath(pdata); - - path.clear(); - - return; - } - - for (ArcIterator > iter(fst, state); - !iter.Done(); iter.Next()) { - StdArc arc = iter.Value(); - - string symbol = isyms->Find(arc.ilabel); - - bool skip = false; - for (set::iterator sit = skipSeqs.begin(); - sit != skipSeqs.end(); sit++) - if (symbol.compare(*sit) == 0) - skip = true; - if (skip == false) - path.push_back(symbol); - - findAllStringsHelper(fst, arc.nextstate, path, - Times(cost, arc.weight.Value())); - } -} diff --git a/src/programs/g2p_train/FstPathFinder.hpp b/src/programs/g2p_train/FstPathFinder.hpp deleted file mode 100644 index 5ba037d8..00000000 --- a/src/programs/g2p_train/FstPathFinder.hpp +++ /dev/null @@ -1,87 +0,0 @@ -/* - * FstPathFinder.hpp - - Copyright (c) [2012-], Josef Robert Novak - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted #provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of #conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, - INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - OF THE POSSIBILITY OF SUCH DAMAGE. - ---------------- - Original author: Chris Taylor - - OpenFst forum post title: "Natural code for printing all strings accepted by an FST?" - OpenFst forum post link: http://openfst.cs.nyu.edu/twiki/bin/view/Forum/FstForum#Natural_code_for_printing_all_st - - ---------------- - - 2011-04-07: Modified by Josef Novak - - Modified to build a 'paths' object to store the individual paths - and associated weights, rather than just print them out from - inside the class. Useful if you want to return the paths for further - processing. 
-* -*/ -#ifndef __FSTPATHFINDER__ -#define __FSTPATHFINDER__ - -#include - -using namespace fst; -using namespace std; - -struct PathData { - vector path; - float pathcost; -}; - -class FstPathFinder { - -public: - - vector paths; - - set skipSeqs; - - set > uniqueStrings; - - SymbolTable *isyms; - - FstPathFinder(); - - FstPathFinder(set skipset); - - void findAllStrings(StdVectorFst & fst); - -private: - - void addOrDiscardPath(PathData pdata); - - void findAllStringsHelper(StdVectorFst & fst, - int state, - vector &str, TropicalWeight cost); - -}; // end class - -#endif diff --git a/src/programs/g2p_train/M2MFstAligner.cpp b/src/programs/g2p_train/M2MFstAligner.cpp deleted file mode 100644 index 718218df..00000000 --- a/src/programs/g2p_train/M2MFstAligner.cpp +++ /dev/null @@ -1,680 +0,0 @@ -/* - M2MFstAligner.cpp - - Copyright (c) [2012-], Josef Robert Novak - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted #provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of #conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, - INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ -#include -#include -#include -#include "M2MFstAligner.hpp" - -//Begin Utility functions (these really need to go somewhere else -vector &split(const string & s, string delim, - vector &elems) -{ - stringstream ss(s); - string item; - //delim.c_str()[0] is a VERY bad thing to do - // this will produce behavior that makes not sense - // to the user if they try to use a multi-char delimiter - //Actually, this is inexcusable but first things first let's - // get everything else working properly. - while (getline(ss, item, delim.c_str()[0])) { - elems.push_back(item); - } - return elems; -} - - -vector split(const string & s, string delim) -{ - vector elems; - return split(s, delim, elems); -} - - -string -vec2str(vector vec, string sep) -{ - string ss; - for (size_t i = 0; i < vec.size(); ++i) { - if (i != 0) - ss += sep; - ss += vec[i]; - } - return ss; -} - -string -itoas(int i) -{ - std::stringstream ostring; - ostring << i; - return ostring.str(); -} - -int -M2MFstAligner::get_max_length(string joint_label) -{ - //We can probably make this a LOT faster... - vector parts = split(joint_label, s1s2_sep); - assert(parts.size() > 1); - vector s1 = split(parts[0], seq1_sep); - vector s2 = split(parts[1], seq2_sep); - int m = max(s1.size(), s2.size()); - //Probably want to rethink this placement.. - //At this point the model should not contain any of these - // transitions anyway. So this is redundant... 
- if (s1.size() > 1 && s2.size() > 1) - m = -1; - return m; -} - -//End utility functions - - -M2MFstAligner::M2MFstAligner() -{ - //Default constructor -} - -M2MFstAligner::M2MFstAligner(bool _seq1_del, bool _seq2_del, int _seq1_max, - int _seq2_max, string _seq1_sep, - string _seq2_sep, string _s1s2_sep, - string _eps, string _skip, bool _penalize) -{ - //Base constructor. Determine whether or not to allow deletions in seq1 and seq2 - // as well as the maximum allowable subsequence size. - seq1_del = _seq1_del; - seq2_del = _seq2_del; - seq1_max = _seq1_max; - seq2_max = _seq2_max; - seq1_sep = _seq1_sep; - seq2_sep = _seq2_sep; - s1s2_sep = _s1s2_sep; - penalize = _penalize; - eps = _eps; - skip = _skip; - skipSeqs.insert(eps); - isyms = new SymbolTable("syms"); - //Add all the important symbols to the table. We can store these - // in the model that we train and then attach them to the fst model - // if we want to use it later on. - //Thus, in addition to eps->0, we reserve symbol ids 1-4 as well. - isyms->AddSymbol(eps); - isyms->AddSymbol(skip); - //The '_' as a separator here is dangerous - isyms->AddSymbol(seq1_sep + "_" + seq2_sep); - isyms->AddSymbol(s1s2_sep); - string s1_del_str = seq1_del ? "true" : "false"; - string s2_del_str = seq2_del ? 
"true" : "false"; - string s1_max_str = itoas(seq1_max); - string s2_max_str = itoas(seq2_max); - string model_params = - s1_del_str + "_" + s2_del_str + "_" + s1_max_str + "_" + - s2_max_str; - isyms->AddSymbol(model_params); - total = LogWeight::Zero(); - prevTotal = LogWeight::Zero(); -} - -M2MFstAligner::M2MFstAligner(string _model_file) -{ - VectorFst *model = VectorFst::Read(_model_file); - for (StateIterator > siter(*model); - !siter.Done(); siter.Next()) { - LogArc::StateId q = siter.Value(); - for (ArcIterator > aiter(*model, q); - !aiter.Done(); aiter.Next()) { - const LogArc & arc = aiter.Value(); - alignment_model.insert(pair (arc.ilabel, arc.weight)); - } - } - isyms = (SymbolTable *) model->InputSymbols(); - int i = 0; - eps = isyms->Find(i); //Can't write '0' here for some reason... - skip = isyms->Find(1); - vector seps = split(isyms->Find(2), "_"); - seq1_sep = seps[0]; - seq2_sep = seps[1]; - s1s2_sep = isyms->Find(3); - vector params = split(isyms->Find(4), "_"); - seq1_del = params[0].compare("true") ? false : true; - seq2_del = params[1].compare("true") ? 
false : true; - seq1_max = atoi(params[2].c_str()); - seq2_max = atoi(params[3].c_str()); - -} - -void -M2MFstAligner::write_model(string _model_file) -{ - VectorFst model; - model.AddState(); - model.SetStart(0); - model.SetFinal(0, LogWeight::One()); - map::iterator it; - for (it = alignment_model.begin(); it != alignment_model.end(); it++) - model.AddArc(0, LogArc((*it).first, (*it).first, (*it).second, 0)); - model.SetInputSymbols(isyms); - model.Write(_model_file); - return; -} - -void -M2MFstAligner::expectation() -{ - for (int i = 0; i < fsas.size(); i++) { - //Comput Forward and Backward probabilities - ShortestDistance(fsas.at(i), &alpha); - ShortestDistance(fsas.at(i), &beta, true); - - //Compute the normalized Gamma probabilities and - // update our running tally - for (StateIterator > siter(fsas.at(i)); - !siter.Done(); siter.Next()) { - LogArc::StateId q = siter.Value(); - for (ArcIterator > aiter(fsas.at(i), q); - !aiter.Done(); aiter.Next()) { - const LogArc & arc = aiter.Value(); - const LogWeight & gamma = - Divide(Times - (Times(alpha[q], arc.weight), - beta[arc.nextstate]), beta[0]); - //Check for any BadValue results, otherwise add to the tally. - //We call this 'prev_alignment_model' which may seem misleading, but - // this conventions leads to 'alignment_model' being the final version. - if (gamma.Value() == gamma.Value()) { - prev_alignment_model[arc.ilabel] = - Plus(prev_alignment_model[arc.ilabel], gamma); - total = Plus(total, gamma); - } - } - } - alpha.clear(); - beta.clear(); - } -} - -void -M2MFstAligner::Sequences2FST(VectorFst *fst, - vector *seq1, - vector *seq2) -{ - /* - Build an FST that represents all possible alignments between seq1 and seq2, given the - parameter values input by the user. Here we encode the input and output labels, in fact - creating a WFSA. This simplifies the training process, but means that we can only - easily compute a joint maximization. 
In practice joint maximization seems to give the - best results anyway, so it probably doesn't matter. - - Note: this also performs the initizization routine. It performs a UNIFORM initialization - meaning that every non-null alignment sequence is eventually initialized to 1/Num(unique_alignments). - It might be more appropriate to consider subsequence length here, but for now we stick - to the m2m-aligner approach. - - TODO: Add an FST version and support for conditional maximization. May be useful for languages - like Japanese where there is a distinct imbalance in the seq1->seq2 length correspondences. - */ - int istate = 0; - int ostate = 0; - for (int i = 0; i <= seq1->size(); i++) { - for (int j = 0; j <= seq2->size(); j++) { - fst->AddState(); - istate = i * (seq2->size() + 1) + j; - - //Epsilon arcs for seq1 - if (seq1_del == true) - for (int l = 1; l <= seq2_max; l++) { - if (j + l <= seq2->size()) { - vector subseq2(seq2->begin() + j, - seq2->begin() + j + l); - string sym = skip + s1s2_sep + - vec2str(subseq2, seq2_sep); - int is = - isyms->AddSymbol(sym); - ostate = i * (seq2->size() + 1) + (j + l); - //LogArc arc( is, is, LogWeight::One().Value()*(l+1)*2, ostate ); - LogArc arc(is, is, 99, ostate); - //LogArc arc( is, is, LogWeight::Zero(), ostate ); - fst->AddArc(istate, arc); - if (prev_alignment_model.find(arc.ilabel) == - prev_alignment_model.end()) - prev_alignment_model.insert(pair < - LogArc::Label, - LogWeight > - (arc.ilabel, - arc.weight)); - else - prev_alignment_model[arc.ilabel] = - Plus(prev_alignment_model[arc.ilabel], - arc.weight); - total = Plus(total, arc.weight); - } - } - - //Epsilon arcs for seq2 - if (seq2_del == true) - for (int k = 1; k <= seq1_max; k++) { - if (i + k <= seq1->size()) { - vector subseq1(seq1->begin() + i, - seq1->begin() + i + k); - string sym = vec2str(subseq1, seq1_sep) + - s1s2_sep + skip; - int is = - isyms->AddSymbol(sym); - ostate = (i + k) * (seq2->size() + 1) + j; - //LogArc arc( is, is, 
LogWeight::One().Value()*(k+1)*2, ostate ); - LogArc arc(is, is, 99, ostate); - //LogArc arc( is, is, LogWeight::Zero(), ostate ); - fst->AddArc(istate, arc); - if (prev_alignment_model.find(arc.ilabel) == - prev_alignment_model.end()) - prev_alignment_model.insert(pair < - LogArc::Label, - LogWeight > - (arc.ilabel, - arc.weight)); - else - prev_alignment_model[arc.ilabel] = - Plus(prev_alignment_model[arc.ilabel], - arc.weight); - total = Plus(total, arc.weight); - } - } - - //All the other arcs - for (int k = 1; k <= seq1_max; k++) { - for (int l = 1; l <= seq2_max; l++) { - if (i + k <= seq1->size() && j + l <= seq2->size()) { - vector subseq1(seq1->begin() + i, - seq1->begin() + i + k); - string s1 = vec2str(subseq1, seq1_sep); - vector subseq2(seq2->begin() + j, - seq2->begin() + j + l); - string s2 = vec2str(subseq2, seq2_sep); - if (l > 1 && k > 1) - continue; - string sym = s1 + s1s2_sep + s2; - int is = isyms->AddSymbol(sym); - ostate = (i + k) * (seq2->size() + 1) + (j + l); - LogArc arc(is, is, - LogWeight::One().Value() * (k + l), - ostate); - //LogArc arc( is, is, LogWeight::One().Value(), ostate ); - fst->AddArc(istate, arc); - //During the initialization phase, just count non-eps transitions - //We currently initialize to uniform probability so there is also - // no need to tally anything here. 
- if (prev_alignment_model.find(arc.ilabel) == - prev_alignment_model.end()) - prev_alignment_model.insert(pair < - LogArc::Label, - LogWeight > - (arc.ilabel, - arc.weight)); - else - prev_alignment_model[arc.ilabel] = - Plus(prev_alignment_model[arc.ilabel], - arc.weight); - total = Plus(total, arc.weight); - } - } - } - - } - } - - fst->SetStart(0); - fst->SetFinal(((seq1->size() + 1) * (seq2->size() + 1)) - 1, - LogWeight::One()); - //Unless seq1_del==true && seq2_del==true we will have unconnected states - // thus we need to run connect to clean out these states - //fst->SetInputSymbols(isyms); - //fst->Write("right.nc.fsa"); - if (seq1_del == false or seq2_del == false) - Connect(fst); - //fst->Write("right.c.fsa"); - return; -} - -void -M2MFstAligner::Sequences2FSTNoInit(VectorFst *fst, - vector *seq1, - vector *seq2) -{ - /* - Build an FST that represents all possible alignments between seq1 and seq2, given the - parameter values input by the user. Here we encode the input and output labels, in fact - creating a WFSA. This simplifies the training process, but means that we can only - easily compute a joint maximization. In practice joint maximization seems to give the - best results anyway, so it probably doesn't matter. - - It might be more appropriate to consider subsequence length here, but for now we stick - to the m2m-aligner approach. 
- */ - int istate = 0; - int ostate = 0; - for (int i = 0; i <= seq1->size(); i++) { - for (int j = 0; j <= seq2->size(); j++) { - fst->AddState(); - istate = i * (seq2->size() + 1) + j; - - //Epsilon arcs for seq1 - if (seq1_del == true) - for (int l = 1; l <= seq2_max; l++) { - if (j + l <= seq2->size()) { - vector subseq2(seq2->begin() + j, - seq2->begin() + j + l); - int is = - isyms->Find(skip + s1s2_sep + - vec2str(subseq2, seq2_sep)); - ostate = i * (seq2->size() + 1) + (j + l); - //LogArc arc( is, is, LogWeight::One().Value()*(l+1)*2, ostate ); - LogArc arc(is, is, 99, ostate); - fst->AddArc(istate, arc); - } - } - - //Epsilon arcs for seq2 - if (seq2_del == true) - for (int k = 1; k <= seq1_max; k++) { - if (i + k <= seq1->size()) { - vector subseq1(seq1->begin() + i, - seq1->begin() + i + k); - int is = - isyms->Find(vec2str(subseq1, seq1_sep) + - s1s2_sep + skip); - ostate = (i + k) * (seq2->size() + 1) + j; - //LogArc arc( is, is, LogWeight::One().Value()*(k+1)*2, ostate ); - LogArc arc(is, is, 99, ostate); - fst->AddArc(istate, arc); - } - } - - //All the other arcs - for (int k = 1; k <= seq1_max; k++) { - for (int l = 1; l <= seq2_max; l++) { - if (i + k <= seq1->size() && j + l <= seq2->size()) { - vector subseq1(seq1->begin() + i, - seq1->begin() + i + k); - string s1 = vec2str(subseq1, seq1_sep); - vector subseq2(seq2->begin() + j, - seq2->begin() + j + l); - string s2 = vec2str(subseq2, seq2_sep); - if (l > 1 && k > 1) - continue; - int is = isyms->Find(s1 + s1s2_sep + s2); - ostate = (i + k) * (seq2->size() + 1) + (j + l); - LogArc arc(is, is, - LogWeight::One().Value() * (k + l), - ostate); - fst->AddArc(istate, arc); - } - } - } - - } - } - - fst->SetStart(0); - fst->SetFinal(((seq1->size() + 1) * (seq2->size() + 1)) - 1, - LogWeight::One()); - //Unless seq1_del==true && seq2_del==true we will have unconnected states - // thus we need to run connect to clean out these states - if (seq1_del == false or seq2_del == false) - Connect(fst); - 
return; -} - -//Build the composed alignment FST and add it to the list of training data -void -M2MFstAligner::entry2alignfst(vector seq1, - vector seq2) -{ - VectorFst fst; - Sequences2FST(&fst, &seq1, &seq2); - fsas.push_back(fst); - return; -} - -vector M2MFstAligner::entry2alignfstnoinit(vector - seq1, - vector - seq2, int nbest, - string lattice) -{ - VectorFst fst; - Sequences2FSTNoInit(&fst, &seq1, &seq2); - if (lattice.compare("") != 0) - fst.Write(lattice); - return write_alignment(fst, nbest); -} - -float -M2MFstAligner::maximization(bool lastiter) -{ - //Maximization. Simple count normalization. Probably get an improvement - // by using a more sophisticated regularization approach. - map::iterator it; - float change = abs(total.Value() - prevTotal.Value()); - //cout << "Total: " << total << " Change: " << abs(total.Value()-prevTotal.Value()) << endl; - prevTotal = total; - - //Normalize and iterate to the next model. We apply it dynamically - // during the expectation step. - for (it = prev_alignment_model.begin(); - it != prev_alignment_model.end(); it++) { - alignment_model[(*it).first] = Divide((*it).second, total); - (*it).second = LogWeight::Zero(); - } - - for (int i = 0; i < fsas.size(); i++) { - for (StateIterator > siter(fsas[i]); - !siter.Done(); siter.Next()) { - LogArc::StateId q = siter.Value(); - for (MutableArcIterator > aiter(&fsas[i], q); !aiter.Done(); aiter.Next()) { - LogArc arc = aiter.Value(); - arc.weight = alignment_model[arc.ilabel]; - aiter.SetValue(arc); - } - } - } - - total = LogWeight::Zero(); - return change; -} - -int -M2MFstAligner::num_fsas() -{ - //A getter function because I'm retarded. 
- return fsas.size(); -} - -vector M2MFstAligner::write_alignment(const VectorFst &ifst, - int nbest) -{ - //Generic alignment generator - VectorFst fst; - Map(ifst, &fst, LogToStdMapper()); - - for (StateIterator > siter(fst); !siter.Done(); - siter.Next()) { - StdArc::StateId q = siter.Value(); - for (MutableArcIterator > aiter(&fst, q); - !aiter.Done(); aiter.Next()) { - //Prior to decoding we make several 'heuristic' modifications to the weights: - // 1. A multiplier is applied to any multi-token substrings - // 2. Any LogWeight::Zero() arc weights are reset to '99'. - // We are basically resetting 'Infinity' values to a 'smallest non-Infinity' - // so that the ShortestPath algorithm actually produces something no matter what. - // 3. Any arcs that consist of subseq1:subseq2 being the same length and subseq1>1 - // are set to '99' this forces shortestpath to choose arcs where one of the - // following conditions holds true - // * len(subseq1)>1 && len(subseq2)!=len(subseq1) - // * len(subseq2)>1 && len(subseq1)!=len(subseq2) - // * len(subseq1)==len(subseq2)==1 - //I suspect these heuristics can be eliminated with a better choice of the initialization - // function and maximization function, but this is the way that m2m-aligner works, so - // it makes sense for our first cut implementation. - //In any case, this guarantees that M2MFstAligner produces results identical to those - // produced by m2m-aligner - but with a bit more reliability. - //UPDATE: this now produces a better alignment than m2m-aligner. - // The maxl heuristic is still in place. The aligner will produce *better* 1-best alignments - // *without* the maxl heuristic below, BUT this comes at the cost of producing a less - // flexible corpus. That is, for a small training corpus like nettalk, if we use the - // best alignment we wind up with more 'chunks' and thus get a worse coverage for unseen - // data. Using the aignment lattices to train the joint ngram model solves this problem. 
- // Oh baby. Can't wait to for everyone to see the paper! - //NOTE: this is going to fail if we encounter any alignments in a new test item that never - // occurred in the original model. - StdArc - arc = aiter.Value(); - int - maxl = get_max_length(isyms->Find(arc.ilabel)); - if (maxl == -1) { - arc.weight = 999; - } - else { - //Optionally penalize m-to-1 / 1-to-m links. This produces - // WORSE 1-best alignments, but results in better joint n-gram - // models for small training corpora when using only the 1-best - // alignment. By further favoring 1-to-1 alignments the 1-best - // alignment corpus results in a more flexible joint n-gram model - // with regard to previously unseen data. - //if( penalize==true ){ - arc.weight = alignment_model[arc.ilabel].Value() * maxl; - //}else{ - //For larger corpora this is probably unnecessary. - //arc.weight = alignment_model[arc.ilabel].Value(); - //} - } - if (arc.weight == LogWeight::Zero()) - arc.weight = 999; - if (arc.weight != arc.weight) - arc.weight = 999; - aiter.SetValue(arc); - } - } - - VectorFst shortest; - ShortestPath(fst, &shortest, nbest); - RmEpsilon(&shortest); - //Skip empty results. This should only happen - // in the following situations: - // 1. seq1_del=false && len(seq1)len(seq2) - //In both 1.and 2. the issue is that we need to - // insert a 'skip' in order to guarantee at least - // one valid alignment path through seq1*seq2, but - // user params didn't allow us to. - //Probably better to insert these where necessary - // during initialization, regardless of user prefs. 
- if (shortest.NumStates() == 0) { - vector dummy; - return dummy; - } - FstPathFinder - pathfinder(skipSeqs); - pathfinder.isyms = isyms; - pathfinder.findAllStrings(shortest); - return pathfinder.paths; -} - -void -M2MFstAligner::write_all_alignments(int nbest) -{ - //Convenience function for the python bindings - for (int i = 0; i < fsas.size(); i++) - write_alignment(fsas[i], nbest); - - return; -} - -vector M2MFstAligner::write_alignment_wrapper(int i, - int nbest) -{ - //Wrapper for the python bindings. - return write_alignment(fsas[i], nbest); -} - -void -M2MFstAligner::write_lattice(string lattice) -{ - //Write out the entire training set in lattice format - //Perform the union first. This output can then - // be plugged directly in to a counter to obtain expected - // alignment counts for the EM-trained corpus. Yields - // far higher-quality joint n-gram models, which are also - // more robust for smaller training corpora. - //Make sure you call this BEFORE any call to - // write_all_alignments - // as the latter function will override some of the weights - - //Chaining the standard Union operation, including using a - // rational FST still performs very poorly in the log semiring. - //Presumably it's running push or something at each step. It - // should be fine to do that just once at the end. - //Rolling our own union turns out to be MUCH faster. 
- VectorFst ufst; - ufst.AddState(); - ufst.SetStart(0); - int total_states = 0; - for (int i = 0; i < fsas.size(); i++) { - TopSort(&fsas[i]); - for (StateIterator > siter(fsas[i]); - !siter.Done(); siter.Next()) { - LogArc::StateId q = siter.Value(); - LogArc::StateId r; - if (q == 0) - r = 0; - else - r = ufst.AddState(); - - for (ArcIterator > aiter(fsas[i], q); - !aiter.Done(); aiter.Next()) { - const LogArc & arc = aiter.Value(); - ufst.AddArc(r, - LogArc(arc.ilabel, arc.ilabel, arc.weight, - arc.nextstate + total_states)); - } - if (fsas[i].Final(q) != LogWeight::Zero()) - ufst.SetFinal(r, LogWeight::One()); - } - total_states += fsas[i].NumStates() - 1; - } - //Normalize weights - Push(&ufst, REWEIGHT_TO_INITIAL); - //Write the resulting lattice to disk - ufst.Write(lattice); - //Write the syms table too. - isyms->WriteText("lattice.syms"); - return; -} diff --git a/src/programs/g2p_train/M2MFstAligner.hpp b/src/programs/g2p_train/M2MFstAligner.hpp deleted file mode 100644 index da45cc25..00000000 --- a/src/programs/g2p_train/M2MFstAligner.hpp +++ /dev/null @@ -1,128 +0,0 @@ -#ifndef M2MFSTALIGNER_H -#define M2MFSTALIGNER_H -/* - M2MFstAligner.hpp - - Copyright (c) [2012-], Josef Robert Novak - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted #provided that the following conditions - are met: - -* Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. -* Redistributions in binary form must reproduce the above - copyright notice, this list of #conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, - INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - OF THE POSSIBILITY OF SUCH DAMAGE. -*/ -#include -#include -#include "FstPathFinder.hpp" -using namespace std; - -namespace fst { -class M2MFstAligner { - /* - Read in pairs of sequences of the form SEQ1 and SEQ2 and - transform them into an FST that encodes all possible - alignments between the symbols in the two sequences. - Note that this may include a combination of multi-symbol - subsequences depending on user specifications. - - This is achieved by simply generating the entire alignment - graph during a single nested loop through the two input - sequences that are to be aligned. - - The user may optionally specify whether to allow deletions - for SEQ1 or SEQ2, as well as a maximum subsequence length - for each sequence. - */ -public: - //Basics declarations - bool seq1_del; - bool seq2_del; - int seq1_max; - int seq2_max; - string seq1_sep; - string seq2_sep; - string s1s2_sep; - string eps; - string skip; - bool penalize; - vector alpha, beta; - //This will be used during decoding to clean the paths - set skipSeqs; - //OpenFst stuff - //These will be overwritten after each FST construction - vector > fsas; - - //This will be maintained for the life of object - //These symbol tables will be maintained entire life of - // the object. This will ensure that any resulting 'corpus' - // shares the same symbol tables. 
- SymbolTable *isyms; - map alignment_model; - map prev_alignment_model; - LogWeight total; - LogWeight prevTotal; - - //Constructors - M2MFstAligner(); - M2MFstAligner(bool _seq1_del, bool _seq2_del, int _seq1_max, - int _seq2_max, string _seq1_sep, string _seq2_sep, - string _s1s2_sep, string _eps, string _skip, - bool _penalize); - M2MFstAligner(string _model_file); - - //Write an aligner model to disk. Critical info is stored in the - // the symbol table so that it can be restored when the model is loaded. - void write_model(string _model_name); - //Transform a sequence pair into an equivalent multiple-to-multiple FST, - // encoding all possible alignments between the two sequences - void Sequences2FST(VectorFst *fst, - vector *seq1, - vector *seq2); - void Sequences2FSTNoInit(VectorFst *fst, - vector *seq1, - vector *seq2); - //Initialize all of the training data - void entry2alignfst(vector seq1, - vector seq2); - vector entry2alignfstnoinit(vector seq1, - vector seq2, - int nbest, - string lattice = ""); - vector write_alignment_wrapper(int i, int nbest); - //The expectation routine - void expectation(); - //The maximization routine. Returns the change since the last iteration - float maximization(bool lastiter); - //Print out the EM-optimized alignment for the training data - vector write_alignment(const VectorFst - &ifst, int nbest); - //Write out the union of the weighted alignment lattices for the training corpus - void write_lattice(string lattice); - //Convenience function to output all the alignments - void write_all_alignments(int nbest); - //max routine - int get_max_length(string joint_label); - int num_fsas(); - -}; -} -#endif // M2MFSTALIGNER_H // diff --git a/src/programs/g2p_train/g2p_train.cpp b/src/programs/g2p_train/g2p_train.cpp index 4dfb09c6..980c89ae 100644 --- a/src/programs/g2p_train/g2p_train.cpp +++ b/src/programs/g2p_train/g2p_train.cpp @@ -39,7 +39,7 @@ * NGram language modeling toolkit instead of MITLM. 
* * for more details about phonetisaurus see - * http://code.google.com/p/phonetisaurus/ + * https://github.com/AdolfVonKleist/Phonetisaurus * http://www.openfst.org/twiki/bin/view/GRM/NGramLibrary */ @@ -64,8 +64,10 @@ #include #include #include -#include "M2MFstAligner.hpp" -#include "../g2p_eval/util.hpp" +#include +#include "include/PhonetisaurusRex.h" +#include "include/M2MFstAligner.h" +#include "include/LatticePruner.h" #define arc_type "standard" #define fst_type "vector" @@ -113,10 +115,20 @@ template struct ToLog64Mapper { return props; } }; -} using namespace std; +} + +using namespace std; using namespace ngram; using namespace fst; +string +convertInt(int number) +{ + stringstream ss; //create a stringstream + ss << number; //add number to the stream + return ss.str(); //return a string with the contents of the stream +} + void addarcs(StateId state_id, StateId newstate, const SymbolTable * oldsyms, SymbolTable * isyms, SymbolTable * osyms, SymbolTable * ssyms, @@ -127,7 +139,11 @@ addarcs(StateId state_id, StateId newstate, const SymbolTable * oldsyms, aiter.Next()) { StdArc arc = aiter.Value(); string oldlabel = oldsyms->Find(arc.ilabel); + // Make sure on its own maps to : if (oldlabel == eps) { + // Ensure we aren't losing any output symbol (it's an + // acceptor so this should not happen) + assert(oldsyms->Find(arc.olabel) == eps); oldlabel = oldlabel.append("}"); oldlabel = oldlabel.append(eps); } @@ -142,9 +158,8 @@ addarcs(StateId state_id, StateId newstate, const SymbolTable * oldsyms, int64 nextstate = ssyms->Find(convertInt(arc.nextstate)); if (nextstate == -1) { - out->AddState(); - ssyms->AddSymbol(convertInt(arc.nextstate)); - nextstate = ssyms->Find(convertInt(arc.nextstate)); + nextstate = out->AddState(); + ssyms->AddSymbol(convertInt(arc.nextstate), nextstate); } out->AddArc(newstate, StdArc(ilabel, olabel, @@ -155,10 +170,65 @@ addarcs(StateId state_id, StateId newstate, const SymbolTable * oldsyms, } } +void 
+patch_labels(StdMutableFst *arpafst, SymbolTable* syms, int64 skip_id, bool input) { + /* + Patch all labels. In some edge cases it is possible + to end up grapheme subsequences: e.g. 'QU' where one or + both tokens is only mapped to the multi-subsequence. In thise + case the independent 'Q' and/or 'U' token will never appear + in isolation. + This bit resolves this by: + + a.) finding and adding these missing subsequence tokens + b.) adding backoff loops to the LM + + */ + string tie = "|"; + for (unsigned int i = skip_id + 1; i < syms->NumSymbols(); i++) { + string sym = syms->Find(i); + vector parts = tokenize_utf8_string(&sym, &tie); + if (parts.size() > 1) { + for (unsigned int j = 0; j < parts.size(); j++) { + if (syms->Find(parts[j]) == -1) { + // Add the missing symbol + int k = syms->AddSymbol(parts[j]); + // Add a backoff loop mapped to the 'skip' + // FIXME: phonetisaurus hard-codes this as 1 but I + // believe that is wrong, it should maybe actually be + // ssyms->Find("")? + int64 start_state = 1; + if (input == true) + arpafst->AddArc(start_state, StdArc(k, skip_id, 99, start_state)); + else + arpafst->AddArc(start_state, StdArc(skip_id, k, 99, start_state)); + } + } + } + } +} + void relabel(StdMutableFst * fst, StdMutableFst * out, string prefix, string eps, string skip, string s1s2_sep, string seq_sep) { + /* + Transform a statistical language model in ARPA format + to an equivalent Weighted Finite-State Acceptor. + This implementation adopts the Google format for the output + WFSA. This differs from previous implementations in several ways: + + Start-state and arcs: + * There are no explicit sentence-begin () arcs + * There is a single start-state. + + Final-state and arcs: + * There are no explicit sentence-end () arcs + * There is no explicit state + * NGrams ending in are designated as final + states, and any probability is assigned + to the final weight of said state. 
+ */ namespace s = fst::script; using fst::ostream; using fst::SymbolTable; @@ -172,59 +242,41 @@ relabel(StdMutableFst * fst, StdMutableFst * out, string prefix, SymbolTable *osyms = new SymbolTable("osyms"); out->AddState(); - ssyms->AddSymbol("s0"); + ssyms->AddSymbol(""); out->SetStart(0); - out->AddState(); - ssyms->AddSymbol("s1"); - out->SetFinal(1, TropicalWeight::One()); - + string tie = "|"; isyms->AddSymbol(eps); osyms->AddSymbol(eps); - - //Add separator, phi, start and end symbols - isyms->AddSymbol(seq_sep); - osyms->AddSymbol(seq_sep); - isyms->AddSymbol(""); - osyms->AddSymbol(""); - int istart = isyms->AddSymbol(""); - int iend = isyms->AddSymbol(""); - int ostart = osyms->AddSymbol(""); - int oend = osyms->AddSymbol(""); - - out->AddState(); - ssyms->AddSymbol("s2"); - out->AddArc(0, StdArc(istart, ostart, TropicalWeight::One(), 2)); + isyms->AddSymbol(tie); + osyms->AddSymbol(tie); + isyms->AddSymbol(skip); + osyms->AddSymbol(skip); for (StateIterator siter(*fst); !siter.Done(); siter.Next()) { StateId state_id = siter.Value(); int64 newstate; if (state_id == fst->Start()) { - newstate = 2; + newstate = 0; } else { newstate = ssyms->Find(convertInt(state_id)); if (newstate == -1) { - out->AddState(); - ssyms->AddSymbol(convertInt(state_id)); - newstate = ssyms->Find(convertInt(state_id)); + newstate = out->AddState(); + ssyms->AddSymbol(convertInt(state_id), newstate); } } TropicalWeight weight = fst->Final(state_id); - - if (weight != TropicalWeight::Zero()) { - // this is a final state - StdArc a = StdArc(iend, oend, weight, 1); - out->AddArc(newstate, a); - out->SetFinal(newstate, TropicalWeight::Zero()); - } + if (weight != TropicalWeight::Zero()) + out->SetFinal(newstate, weight); addarcs(state_id, newstate, oldsyms, isyms, osyms, ssyms, eps, s1s2_sep, fst, out); } - + patch_labels(out, isyms, isyms->Find(skip), true); + patch_labels(out, osyms, osyms->Find(skip), false); out->SetInputSymbols(isyms); out->SetOutputSymbols(osyms); @@ -252,14 
+304,16 @@ train_model(string eps, string s1s2_sep, string skip, int order, using fst::script::VectorFstClass; using fst::script::WeightClass; - // create symbols file - cout << "Generating symbols..." << endl; - NGramInput *ingram = - new NGramInput(prefix + ".corpus.aligned", prefix + ".corpus.syms", - "", eps, unknown_symbol, "", ""); - ingram->ReadInput(0, 1); + // create symbols file (ngramsymbols) + { + cout << "Generating symbols..." << endl; + NGramInput ingram(prefix + ".corpus.aligned", prefix + ".corpus.syms", + "", eps, unknown_symbol, "", ""); + // Magic!? + ingram.ReadInput(0, 1); + } - // compile strings into a far archive + // compile strings into a far archive (farcompilestrings) cout << "Compiling symbols into FAR archive..." << endl; fst::FarEntryType fet; fst::script::GetFarEntryType(entry_type, &fet); @@ -268,11 +322,8 @@ train_model(string eps, string s1s2_sep, string skip, int order, // Lovely inconsistent API you got there, OpenFST... fst::FarType fartype = fst::script::GetFarType(far_type); - delete ingram; - vector in_fname; in_fname.push_back(prefix + ".corpus.aligned"); - fst::script::FarCompileStrings(in_fname, prefix + ".corpus.far", arc_type, fst_type, fartype, @@ -282,13 +333,12 @@ train_model(string eps, string s1s2_sep, string skip, int order, initial_symbols, allow_negative_labels, key_prefix, key_suffix); - //count n-grams + // count n-grams (ngramcount) cout << "Counting n-grams..." << endl; NGramCounter ngram_counter(order, epsilon_as_backoff); - FstReadOptions opts; - FarReader *far_reader; - far_reader = FarReader::Open(prefix + ".corpus.far"); + // NO RAII FOR YOU! NO! 
+ FarReader *far_reader = FarReader::Open(prefix + ".corpus.far"); int fstnumber = 1; const Fst *ifst = 0, *lfst = 0; while (!far_reader->Done()) { @@ -323,18 +373,16 @@ train_model(string eps, string s1s2_sep, string skip, int order, ++fstnumber; } delete far_reader; - - if (!lfst) { + if (!lfst) E_FATAL("None of the input FSTs had a symbol table\n"); - //exit(1); - } - VectorFst vfst; ngram_counter.GetFst(&vfst); ArcSort(&vfst, StdILabelCompare()); vfst.SetInputSymbols(lfst->InputSymbols()); vfst.SetOutputSymbols(lfst->InputSymbols()); vfst.Write(prefix + ".corpus.cnts"); + + // Make smoothed N-Grams (ngrammake) StdMutableFst *fst = StdMutableFst::Read(prefix + ".corpus.cnts", true); if (smooth != "no") { @@ -384,9 +432,9 @@ train_model(string eps, string s1s2_sep, string skip, int order, E_FATAL("Bad smoothing method: %s\n", smooth.c_str()); } } + // fst->Write(prefix + ".smooth.mod"); if (prune != "no") { cout << "Pruning model..." << endl; - if (prune == "count_prune") { NGramCountPrune ngramsh(fst, count_pattern, shrink_opt, total_unigram_count, @@ -410,12 +458,15 @@ train_model(string eps, string s1s2_sep, string skip, int order, E_FATAL("Bad shrink method: %s\n", prune.c_str()); } } + // fst->Write(prefix + ".shrink.mod"); cout << "Minimizing model..." << endl; MutableFstClass *minimized = new s::MutableFstClass(*fst); Minimize(minimized, 0, fst::kDelta); fst = minimized->GetMutableFst(); + // fst->Write(prefix + ".min.mod"); + // Split input/output labels (phonetisaurus-arpa2wfst) cout << "Correcting final model..." << endl; StdMutableFst *out = new StdVectorFst(); relabel(fst, out, prefix, eps, skip, s1s2_sep, seq_sep); @@ -425,6 +476,77 @@ train_model(string eps, string s1s2_sep, string skip, int order, } +void write_alignments (M2MFstAligner* aligner, std::ofstream &ofile, + StdArc::Weight threshold, int nbest, + bool fb, bool penalize) { + /* + Write the raw alignments to a file in text-based corpus format. 
+ + NOTE: Although N-best and other pruning strategies are supported, + the final format is that of a standard text corpus. All relative + token and pronunciation scores will be stripped. In general + this means that, unless you are very lucky with your combined + pruning strategy the un-ranked N-best hypotheses will result in a + lower-quality joint N-gram model. + + This approach is best used with simple 1-best. + */ + + //Build us a lattice pruner + LatticePruner pruner (aligner->penalties, threshold, nbest, fb, penalize); + + VetoSet veto_set_; + veto_set_.insert (0); + for (unsigned int i = 0; i < aligner->fsas.size (); i++) { + //Map to Tropical semiring + VectorFst* tfst = new VectorFst (); + Map (aligner->fsas.at (i), tfst, LogToStdMapper ()); + pruner.prune_fst (tfst); + RmEpsilon (tfst); + //Skip empty results. This should only happen + // in the following situations: + // 1. seq1_del=false && len(seq1)len(seq2) + //In both 1.and 2. the issue is that we need to + // insert a 'skip' in order to guarantee at least + // one valid alignment path through seq1*seq2, but + // user params didn't allow us to. + //Probably better to insert these where necessary + // during initialization, regardless of user prefs. 
+ if (tfst->NumStates () > 0) { + StdArc::Weight weight_threshold = 99; + StdArc::StateId state_threshold = kNoStateId; + AnyArcFilter arc_filter; + vector distance; + VectorFst ofst; + + AutoQueue state_queue (*tfst, &distance, arc_filter); + IdentityPathFilter path_filter; + + ShortestPathOptions, + AnyArcFilter > + opts (&state_queue, arc_filter, nbest, false, false, + kDelta, false, weight_threshold, + state_threshold); + ShortestPathSpecialized (*tfst, &ofst, &distance, + &path_filter, 10000, opts); + for (size_t i = 0; i < path_filter.ordered_paths.size (); i++) { + const vector& path = path_filter.ordered_paths[i]; + for (size_t j = 0; j < path.size (); j++) { + ofile << aligner->isyms->Find (path [j]); + if (j < path.size () - 1) + ofile << " "; + } + ofile << "\n"; + } + } + delete tfst; + } + + return; +} + + void align(string input_file, string prefix, bool seq1_del, bool seq2_del, int seq1_max, int seq2_max, string seq_sep, string s1s2_sep, @@ -436,24 +558,37 @@ align(string input_file, string prefix, bool seq1_del, bool seq2_del, ofstream ofile(o.c_str(), ifstream::out); cout << "Loading..." 
<< endl; M2MFstAligner fstaligner(seq1_del, seq2_del, seq1_max, seq2_max, - seq_sep, seq_sep, s1s2_sep, eps, skip, true); + seq_sep, seq_sep, s1s2_sep, eps, skip, + // Fuck this stupid API + false, false, true, false); - string sep1 = ""; - string sep2 = " "; + string sepnone = ""; + string septab = "\t"; + string sepspace = " "; string line; if (dict.is_open()) { while (dict.good()) { getline(dict, line); if (line.empty()) continue; - vector tokens = tokenize_utf8_string(&line, &sep2); - if (tokens.size() < 2) { - cout << "Cannot parse line:" << line << endl; - continue; + /* First try with tab */ + vector tokens = tokenize_utf8_string(&line, &septab); + if (tokens.size() != 2) { + vector tokens = tokenize_utf8_string(&line, &sepspace); + if (tokens.size() < 2) { + cout << "Cannot parse line (must use tab or single space " + << "to separate word and phones):" << line << endl; + continue; + } + vector seq1 = tokenize_utf8_string(&tokens.at(0), &sepnone); + vector seq2(tokens.begin() + 1, tokens.end()); + fstaligner.entry2alignfst(seq1, seq2); + } + else { + vector seq1 = tokenize_utf8_string(&tokens.at(0), &sepnone); + vector seq2 = tokenize_utf8_string(&tokens.at(1), &sepspace); + fstaligner.entry2alignfst(seq1, seq2); } - vector seq1 = tokenize_utf8_string(&tokens.at(0), &sep1); - vector seq2(tokens.begin() + 1, tokens.end()); - fstaligner.entry2alignfst(seq1, seq2); } } dict.close(); @@ -465,25 +600,14 @@ align(string input_file, string prefix, bool seq1_del, bool seq2_del, for (i = 1; i <= iter; i++) { fstaligner.expectation(); change = fstaligner.maximization(false); - cout << "Iteration " << i << ": " << change << endl; + cout << "Iteration " << i << " Change: " << change << endl; } fstaligner.expectation(); change = fstaligner.maximization(true); - cout << "Iteration " << i << ": " << change << endl; + cout << "Last iteration: " << change << endl; cout << "Generating best alignments..." 
<< endl; - for (int i = 0; i < fstaligner.fsas.size(); i++) { - vector paths = - fstaligner.write_alignment(fstaligner.fsas[i], 1); - for (int k = 0; k < paths.size(); k++) { - for (int j = 0; j < paths[k].path.size(); j++) { - ofile << paths[k].path[j]; - //if (j < paths[k].path.size() - 1) - ofile << " "; - } - ofile << endl; - } - } + write_alignments(&fstaligner, ofile, -99.0, 1, false, true); ofile.flush(); ofile.close(); } diff --git a/src/programs/g2p_train/main.cpp b/src/programs/g2p_train/main.cpp index 6a50719c..35526ed7 100644 --- a/src/programs/g2p_train/main.cpp +++ b/src/programs/g2p_train/main.cpp @@ -53,7 +53,7 @@ main(int argc, char *argv[]) { "-seq1_del", ARG_BOOLEAN, "no", "Allow deletions in sequence 1" }, - { "-seq2_del", ARG_BOOLEAN, "no", + { "-seq2_del", ARG_BOOLEAN, "yes", "Allow deletions in sequence 2" }, { "-noalign", ARG_BOOLEAN, "no", @@ -71,7 +71,9 @@ main(int argc, char *argv[]) { "-iter", ARG_INT32, "10", "Maximum number of iterations for EM" }, - {"-order", ARG_INT32, "6", "N-gram order"}, + { "-order", ARG_INT32, "5", + "N-gram order" + }, { "-prune", ARG_STRING, "no", "Pruning method. Available options are: 'no', 'count_prune', 'relative_entropy', 'seymore'" }, @@ -149,6 +151,11 @@ main(int argc, char *argv[]) eps, skip, iter); } + /* + phonetisaurus-align --input=INPUT --ofile=model.corpus --seq1_del=false --seq2_del=true --seq1_max=2 --seq2_max=2 --grow=false + farcompilestrings model.corpus | ngramcount --order=5 | ngrammake + phonetisaurus-arpa2wfst # NOTE: this is unnecessary as ngrammake produces the same representation + */ train_model(eps, s1s2_sep, skip, order, smooth, prefix, seq_sep, prune, theta, count_pattern); diff --git a/src/upstream/Phonetisaurus b/src/upstream/Phonetisaurus new file mode 160000 index 00000000..321443f9 --- /dev/null +++ b/src/upstream/Phonetisaurus @@ -0,0 +1 @@ +Subproject commit 321443f948220142d5cac4c9bb94ec766e9c86a0