fix: adjust all calls to make_g2p to its new signature
joanise committed May 1, 2023
1 parent f99774f commit bea7cec
Showing 9 changed files with 52 additions and 43 deletions.
16 changes: 8 additions & 8 deletions g2p/__init__.py
@@ -4,12 +4,12 @@
The main entry points for the g2p module are:
- make_g2p() to create a mapper from one lang to another
- make_tokenizer() to create a tokenizeer for a given language
- make_tokenizer() to create a tokenizer for a given language
- get_arpabet_langs() to get the list of languages with a path to eng-arpabet
Basic Usage:
from g2p import make_g2p
converter = make_g2p(in_lang, out_lang, tok_lang)
converter = make_g2p(in_lang, out_lang)
transduction_graph = converter(input_text_in_in_lang)
converted_text_in_out_lang = transduction_graph.output_string
@@ -51,26 +51,26 @@ def make_g2p(  # noqa: C901
"""Make a g2p Transducer for mapping text from in_lang to out_lang via the
shortest path between them.
In general you should also add `tok_lang` to specify the language
for tokenization (probably the same as `in_lang`), because
transducers are not guaranteed to deal with whitespace,
By default, the input is tokenized using the path of mappings from in_lang
to out_lang, because transducers are not guaranteed to deal with whitespace,
punctuation, etc, properly.
Args:
in_lang (str): input language code
out_lang (str): output language code
tok_lang (Optional[str]): DEPRECATED language for tokenization
tokenize (bool): whether tokenization should happen (default: yes)
custom_tokenizer (Tokenizer): the tokenizer to use (default: a tokenizer built on the)
tokenize (bool): whether tokenization should happen (default: True)
custom_tokenizer (Tokenizer): the tokenizer to use (default: a tokenizer
built on the path from in_lang and out_lang)
Returns:
Transducer from in_lang to out_lang, optionally with a tokenizer.
Raises:
InvalidLanguageCode: if in_lang or out_lang don't exist
NoPath: if there is no path between in_lang and out_lang
"""

if (in_lang, out_lang, tok_lang, tokenize, id(custom_tokenizer)) in _g2p_cache:
return _g2p_cache[(in_lang, out_lang, tok_lang, tokenize, id(custom_tokenizer))]

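To make the new signature concrete, here is a minimal usage sketch based on the updated docstring above; the language codes are illustrative, taken from the tests further down.

    from g2p import make_g2p, make_tokenizer

    # Default: the input is tokenized along the mapping path from in_lang to out_lang.
    converter = make_g2p("fra", "eng-arpabet")
    print(converter("ceci est un test").output_string)

    # Disable tokenization entirely, as the api.py, app.py and test call sites below now do.
    raw_converter = make_g2p("fra", "fra-ipa", tokenize=False)

    # Or pass an explicit tokenizer, as the reworked cli.py code below does.
    custom = make_g2p("fra", "eng-arpabet", custom_tokenizer=make_tokenizer("fra"))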
2 changes: 1 addition & 1 deletion g2p/api.py
@@ -119,7 +119,7 @@ def get(self):
index = args["index"]
debugger = args["debugger"]
try:
transducer = make_g2p(in_lang, out_lang)
transducer = make_g2p(in_lang, out_lang, tokenize=False)
tg = transducer(text)
text = tg.output_string
input_text = tg.input_string
6 changes: 4 additions & 2 deletions g2p/app.py
@@ -202,12 +202,14 @@ def change_table(message):
# because it is the individual ones which are cached by g2p
path = shortest_path(LANGS_NETWORK, message["in_lang"], message["out_lang"])
if len(path) == 1:
transducer = make_g2p(message["in_lang"], message["out_lang"])
transducer = make_g2p(
message["in_lang"], message["out_lang"], tokenize=False
)
mappings = [transducer.mapping]
else:
mappings = []
for lang1, lang2 in zip(path[:-1], path[1:]):
transducer = make_g2p(lang1, lang2)
transducer = make_g2p(lang1, lang2, tokenize=False)
mappings.append(transducer.mapping)
emit(
"table response",
12 changes: 6 additions & 6 deletions g2p/cli.py
@@ -13,7 +13,7 @@
from flask.cli import FlaskGroup
from networkx import has_path

from g2p import make_g2p
from g2p import make_g2p, make_tokenizer
from g2p._version import VERSION
from g2p.api import update_docs
from g2p.app import APP
@@ -492,7 +492,6 @@ def convert(  # noqa: C901
in_lang,
out_lang,
input_text,
path,
tok,
check,
debugger,
@@ -556,11 +555,12 @@ def convert(  # noqa: C901
# Determine which tokenizer to use, if any
if tok is not None and not tok and tok_lang is not None:
raise click.UsageError("Specified conflicting --no-tok and --tok-lang options.")
if tok and tok_lang is None:
tok_lang = "path"
custom_tokenizer = make_tokenizer(tok_lang) if tok_lang else None
# Transduce!!!
assert in_lang and out_lang
transducer = make_g2p(in_lang, out_lang, tok_lang=tok_lang)
transducer = make_g2p(
in_lang, out_lang, tokenize=tok, custom_tokenizer=custom_tokenizer
)
tg = transducer(input_text)
if check:
transducer.check(tg, display_warnings=True)
@@ -740,7 +740,7 @@ def show_mappings(lang1, lang2, verbose, csv):

if lang1 is not None and lang2 is not None:
try:
transducer = make_g2p(lang1, lang2)
transducer = make_g2p(lang1, lang2, tokenize=False)
except (NoPath, InvalidLanguageCode) as e:
raise click.UsageError(
f'Cannot find mapping from "{lang1}" to "{lang2}": {e}'
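For reference, a short sketch of how the CLI's --tok/--no-tok and --tok-lang options now map onto the new keyword arguments instead of the removed tok_lang parameter; the values are hard-coded here for illustration, whereas in the real command they come from click.

    from g2p import make_g2p, make_tokenizer

    tok = True        # --tok / --no-tok
    tok_lang = "fra"  # --tok-lang, or None when not given

    # Conflicting options are rejected, mirroring the click.UsageError above.
    if tok is not None and not tok and tok_lang is not None:
        raise ValueError("Specified conflicting --no-tok and --tok-lang options.")

    custom_tokenizer = make_tokenizer(tok_lang) if tok_lang else None
    transducer = make_g2p("fra", "eng-arpabet", tokenize=tok, custom_tokenizer=custom_tokenizer)
    print(transducer("ceci est un test").output_string)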
18 changes: 9 additions & 9 deletions g2p/tests/test_check_ipa_arpabet.py
@@ -33,7 +33,7 @@ def test_check_arpabet(self):
self.assertFalse(transducer.check(transducer("ñ")))

def test_check_ipa(self):
transducer = make_g2p("fra", "fra-ipa")
transducer = make_g2p("fra", "fra-ipa", tokenize=False)
self.assertTrue(transducer.check(transducer("ceci")))
self.assertFalse(transducer.check(transducer("ñ")))
with self.assertLogs(LOGGER, level="WARNING"):
@@ -49,12 +49,12 @@ def test_is_ipa_with_panphon_preprocessor(self):
self.assertTrue(utils.is_panphon("ɻ̊j̊ oⁿk oᵐp"))

def test_check_composite_transducer(self):
transducer = make_g2p("fra", "eng-arpabet")
transducer = make_g2p("fra", "eng-arpabet", tokenize=False)
self.assertTrue(transducer.check(transducer("ceci est un test été à")))
self.assertFalse(transducer.check(transducer("ñ")))

def test_check_tokenizing_transducer(self):
transducer = make_g2p("fra", "fra-ipa", tok_lang="fra")
transducer = make_g2p("fra", "fra-ipa")
self.assertTrue(transducer.check(transducer("ceci est un test été à")))
self.assertFalse(transducer.check(transducer("ñ oǹ")))
self.assertTrue(
@@ -65,7 +65,7 @@ def test_check_tokenizing_transducer(self):
)

def test_check_tokenizing_composite_transducer(self):
transducer = make_g2p("fra", "eng-arpabet", tok_lang="fra")
transducer = make_g2p("fra", "eng-arpabet")
self.assertTrue(transducer.check(transducer("ceci est un test été à")))
self.assertFalse(transducer.check(transducer("ñ oǹ")))
self.assertTrue(
@@ -83,7 +83,7 @@ def test_check_tokenizing_composite_transducer(self):
)

def test_shallow_check(self):
transducer = make_g2p("win", "eng-arpabet", tok_lang="win")
transducer = make_g2p("win", "eng-arpabet")
# This is False, but should be True! It's False because the mapping outputs :
# instead of ː
# EJJ 2022-06-16 With #100 fixed, this check is no longer failing.
@@ -92,16 +92,16 @@ def test_shallow_check(self):
self.assertTrue(transducer.check(transducer("uu"), shallow=True))

def test_check_with_equiv(self):
transducer = make_g2p("tau", "eng-arpabet", tok_lang="tau")
tau_ipa = make_g2p("tau", "tau-ipa", tok_lang="tau")(
transducer = make_g2p("tau", "eng-arpabet")
tau_ipa = make_g2p("tau", "tau-ipa")(
"sh'oo Jign maasee' do'eent'aa shyyyh"
).output_string
self.assertTrue(utils.is_panphon(tau_ipa))
eng_ipa = make_g2p("tau", "eng-ipa", tok_lang="tau")(
eng_ipa = make_g2p("tau", "eng-ipa")(
"sh'oo Jign maasee' do'eent'aa shyyyh"
).output_string
self.assertTrue(utils.is_panphon(eng_ipa))
eng_arpabet = make_g2p("tau", "eng-arpabet", tok_lang="tau")(
eng_arpabet = make_g2p("tau", "eng-arpabet")(
"sh'oo Jign maasee' do'eent'aa shyyyh"
).output_string
self.assertTrue(utils.is_arpabet(eng_arpabet))
5 changes: 5 additions & 0 deletions g2p/tests/test_lexicon_transducer.py
@@ -203,6 +203,11 @@ def test_eng_transducer(self):
tg = transducer("hello")
self.assertEqual(tg.output_string, "HH AH L OW ")

# since we tokenize by default now, this works:
self.assertEqual(
transducer("hello my friend").output_string, "HH AH L OW M AY F R EH N D "
)


if __name__ == "__main__":
main()
4 changes: 2 additions & 2 deletions g2p/tests/test_network.py
@@ -27,12 +27,12 @@ def test_no_path(self):
make_g2p("hei", "git")

def test_valid_composite(self):
transducer = make_g2p("atj", "eng-ipa")
transducer = make_g2p("atj", "eng-ipa", tokenize=False)
self.assertTrue(isinstance(transducer, CompositeTransducer))
self.assertEqual("niɡiɡw", transducer("nikikw").output_string)

def test_valid_transducer(self):
transducer = make_g2p("atj", "atj-ipa")
transducer = make_g2p("atj", "atj-ipa", tokenize=False)
self.assertTrue(isinstance(transducer, Transducer))
self.assertEqual("niɡiɡw", transducer("nikikw").output_string)

24 changes: 13 additions & 11 deletions g2p/tests/test_tokenize_and_map.py
@@ -17,7 +17,7 @@ def contextualize(self, word: str):

def test_tok_and_map_fra(self):
"""Chaining tests: tokenize and map a string"""
transducer = g2p.make_g2p("fra", "fra-ipa")
transducer = g2p.make_g2p("fra", "fra-ipa", tokenize=False)
tokenizer = g2p.make_tokenizer("fra")
# "teste" in isolation is at string and word end and beginning
word_ipa = transducer("teste").output_string
@@ -28,7 +28,7 @@ def test_tok_and_map_fra(self):
self.assertEqual(string_ipa, self.contextualize(word_ipa))

def test_tok_and_map_mic(self):
transducer = g2p.make_g2p("mic", "mic-ipa")
transducer = g2p.make_g2p("mic", "mic-ipa", tokenize=False)
tokenizer = g2p.make_tokenizer("mic")
word_ipa = transducer("sq").output_string
string_ipa = g2p.tokenize_and_map(
@@ -37,8 +37,10 @@ def test_tok_and_map_mic(self):
self.assertEqual(string_ipa, self.contextualize(word_ipa))

def test_tokenizing_transducer(self):
ref_word_ipa = g2p.make_g2p("mic", "mic-ipa")("sq").output_string
transducer = g2p.make_g2p("mic", "mic-ipa", tok_lang="mic")
ref_word_ipa = g2p.make_g2p("mic", "mic-ipa", tokenize=False)(
"sq"
).output_string
transducer = g2p.make_g2p("mic", "mic-ipa") # tokenizes on "mic" via "path"
self.assertEqual(transducer.transducer.in_lang, transducer.in_lang)
self.assertEqual(transducer.transducer.out_lang, transducer.out_lang)
self.assertEqual(transducer.transducer, transducer.transducers[0])
@@ -48,19 +50,19 @@ def test_tokenizing_transducer(self):
self.assertEqual(string_ipa, self.contextualize(ref_word_ipa))

def test_tokenizing_transducer_chain(self):
transducer = g2p.make_g2p("fra", "eng-arpabet", tok_lang="fra")
transducer = g2p.make_g2p("fra", "eng-arpabet")
self.assertEqual(
self.contextualize(transducer("teste").output_string),
transducer(self.contextualize("teste")).output_string,
)

def test_tokenizing_transducer_debugger(self):
transducer = g2p.make_g2p("fra", "fra-ipa", tok_lang="fra")
transducer = g2p.make_g2p("fra", "fra-ipa")
debugger = transducer("ceci est un test.").debugger
self.assertEqual(len(debugger), 4)

def test_tokenizing_transducer_edges(self):
transducer = g2p.make_g2p("fra", "fra-ipa", tok_lang="fra")
transducer = g2p.make_g2p("fra", "fra-ipa")
tg = transducer("est est")
# est -> ɛ, so edges are (0, 0), (1, 0), (2, 0) for each "est", plus the
# space to the space, and the second set of edges being offset
@@ -70,12 +72,12 @@ def test_tokenizing_transducer_edges(self):
self.assertEqual(tg.substring_alignments(), ref_alignments)

def test_tokenizing_transducer_edges2(self):
ref_edges = g2p.make_g2p("fra", "fra-ipa")("ça ça").edges
edges = g2p.make_g2p("fra", "fra-ipa", tok_lang="fra")("ça ça").edges
ref_edges = g2p.make_g2p("fra", "fra-ipa", tokenize=False)("ça ça").edges
edges = g2p.make_g2p("fra", "fra-ipa")("ça ça").edges
self.assertEqual(edges, ref_edges)

def test_tokenizing_transducer_edge_chain(self):
transducer = g2p.make_g2p("fra", "eng-arpabet", tok_lang="fra")
transducer = g2p.make_g2p("fra", "eng-arpabet")
# .edges on a transducer is always a single array with the
# end-to-end mapping, for a composed transducer we can access
# the individual tiers with .tiers
@@ -128,7 +130,7 @@ def test_tokenizing_transducer_edge_chain(self):
self.assertEqual(tier_alignments, ref_tier_alignments)

def test_tokenizing_transducer_edge_spaces(self):
transducer = g2p.make_g2p("fra", "eng-arpabet", tok_lang="fra")
transducer = g2p.make_g2p("fra", "eng-arpabet")
ref_edges = [
# " a, " -> " AA , "
(0, 0),
8 changes: 4 additions & 4 deletions g2p/tests/test_unidecode_transducer.py
@@ -18,7 +18,7 @@ def test_unidecode_mapping(self):
self.assertEqual(tg.output_string, "ete Nunavut nonafot")

def test_unidecode_g2p(self):
transducer = make_g2p("und", "und-ascii")
transducer = make_g2p("und", "und-ascii", tokenize=False)
tg = transducer(normalize("éçà", "NFD"))
self.assertEqual(tg.output_string, "eca")
self.assertEqual(tg.edges, [(0, 0), (1, 0), (2, 1), (3, 1), (4, 2), (5, 2)])
@@ -28,14 +28,14 @@ def test_unidecode_g2p(self):
self.assertEqual(tg.edges, [(0, 0), (1, 1), (2, 2)])

def test_unidecode_empty_output(self):
transducer = make_g2p("und", "und-ascii")
transducer = make_g2p("und", "und-ascii", tokenize=False)
# \u0361 on its own gets deleted completely by unidecode
tg = transducer("\u0361")
self.assertEqual(tg.output_string, "")
self.assertEqual(tg.edges, [])

def test_unidecode_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
transducer = make_g2p("und", "eng-arpabet", tokenize=False)
tg = transducer("été Nunavut ᓄᓇᕗᑦ")
self.assertEqual(
tg.output_string, "EY T EY N UW N AA V UW T N OW N AA F OW T "
@@ -57,7 +57,7 @@ def test_unidecode_kanji_to_arpabet(self):
self.assertEqual(tg.output_string, "D IY B EY N Y UW ")

def test_unidecode_hanzi_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
transducer = make_g2p("und", "eng-arpabet", tokenize=False)
tg = transducer("你们好!你们说汉语马?")
self.assertEqual(
tg.output_string,