fix: remove spontaneous extraneous spaces from und-ipa

If spaces are not present in the input, they should not appear in the output. If they are present in the input, they will get stripped from the output; this should not be a problem in practice, as they are still visible in the alignments.
roedoejet · Mar 14, 2023 · 9e64b7f · 9e64b7f
1 parent ffba389
commit 9e64b7f
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 8 deletions.
diff --git a/g2p/mappings/langs/langs.pkl b/g2p/mappings/langs/langs.pkl
diff --git a/g2p/mappings/langs/und/config.yaml b/g2p/mappings/langs/und/config.yaml
@@ -27,7 +27,7 @@ mappings:
     authors:
       - Patrick Littell
     <<: *shared
-  - display_name: Undertermined Unicode to ASCII
+  - display_name: Undetermined Unicode to ASCII
     type: unidecode
     norm: NFD
     in_lang: und

diff --git a/g2p/mappings/langs/und/und_to_ipa.json b/g2p/mappings/langs/und/und_to_ipa.json
@@ -29,5 +29,6 @@
     {"in": "?", "out": "ʔ"},
     {"in": "'", "out": "ʔ"},
     {"in": ",", "out": "ʔ"},
-    {"in": ":", "out": ""}
+    {"in": ":", "out": ""},
+    {"in": " ", "out": ""}
 ]
diff --git a/g2p/tests/test_unidecode_transducer.py b/g2p/tests/test_unidecode_transducer.py
@@ -37,24 +37,49 @@ def test_unidecode_empty_output(self):
     def test_unidecode_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
         tg = transducer("été Nunavut ᓄᓇᕗᑦ")
-        self.assertEqual(
-            tg.output_string, "EY T EY  N UW N AA V UW T  N OW N AA F OW T "
-        )
+        self.assertEqual(tg.output_string, "EY T EY N UW N AA V UW T N OW N AA F OW T ")
+        # NOTE: spaces are deleted in the output, but they should be
+        # retained in the input alignments and thus recoverable
+        pe = tg.pretty_edges()
+        self.assertIn((" ", " "), (tuple(x) for x in pe[0]))  # present in und-ascii
+        self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))  # absent in und-ipa
 
     def test_unidecode_arabic_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
         tg = transducer("السلام عليكم")
-        self.assertEqual(tg.output_string, "L S L M  L Y K M ")
+        self.assertEqual(tg.output_string, "L S L M L Y K M ")
+        pe = tg.pretty_edges()
+        self.assertIn((" ", " "), (tuple(x) for x in pe[0]))  # present in und-ascii
+        self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))  # absent in und-ipa
 
     def test_unidecode_arabic_presentation_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
         tg = transducer("ﺷﻜﺮﺍﹰ")
-        self.assertEqual(tg.output_string, "S HH K D  AA N ")
+        self.assertEqual(tg.output_string, "S HH K D AA N ")
+        # No input spaces, so no spaces in first transduction
+        pe = tg.pretty_edges()
+        self.assertNotIn((" ", " "), (tuple(x) for x in pe[0]))
+        self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))
 
     def test_unidecode_kanji_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
         tg = transducer("日本語")
-        self.assertEqual(tg.output_string, "D IY  B EY N  Y UW  ")
+        self.assertEqual(tg.output_string, "D IY B EY N Y UW ")
+        pe = tg.pretty_edges()
+        self.assertNotIn((" ", " "), (tuple(x) for x in pe[0]))
+        self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))
+
+    def test_unidecode_hanzi_to_arpabet(self):
+        transducer = make_g2p("und", "eng-arpabet")
+        tg = transducer("你们好!你们说汉语马?")
+        self.assertEqual(
+            tg.output_string,
+            "N IY M EY N HH AA OW N IY M EY N S HH UW OW Y IY Y UW M AA HH ",
+        )
+        pe = tg.pretty_edges()
+        self.assertNotIn((" ", " "), (tuple(x) for x in pe[0]))
+        self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))
+
 
 if __name__ == "__main__":
     main()