fix: most sensible possible behaviour, keep spaces if user wanted them

roedoejet · Mar 14, 2023 · 70ab1e6 · 70ab1e6
1 parent bd1b1ec
commit 70ab1e6
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 12 deletions.
diff --git a/g2p/tests/test_unidecode_transducer.py b/g2p/tests/test_unidecode_transducer.py
@@ -15,7 +15,7 @@ def test_unidecode_mapping(self):
         self.assertEqual(m.kwargs["type"], "unidecode")
         t = Transducer(m)
         tg = t("été Nunavut ᓄᓇᕗᑦ")
-        self.assertEqual(tg.output_string, "eteNunavutnonafot")
+        self.assertEqual(tg.output_string, "ete Nunavut nonafot")
 
     def test_unidecode_g2p(self):
         transducer = make_g2p("und", "und-ascii")
@@ -37,12 +37,14 @@ def test_unidecode_empty_output(self):
     def test_unidecode_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
         tg = transducer("été Nunavut ᓄᓇᕗᑦ")
-        self.assertEqual(tg.output_string, "EY T EY N UW N AA V UW T N OW N AA F OW T ")
+        self.assertEqual(
+            tg.output_string, "EY T EY  N UW N AA V UW T  N OW N AA F OW T "
+        )
 
     def test_unidecode_arabic_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
         tg = transducer("السلام عليكم")
-        self.assertEqual(tg.output_string, "L S L M L Y K M ")
+        self.assertEqual(tg.output_string, "L S L M  L Y K M ")
 
     def test_unidecode_arabic_presentation_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")

diff --git a/g2p/transducer/__init__.py b/g2p/transducer/__init__.py
@@ -45,10 +45,6 @@
 UNIDECODE_SPECIALS = ["@", "?", "'", ",", ":"]
 
 
-def sanitize_unidecode_output(s: str) -> str:
-    return "".join(c if c.isalpha() or c in UNIDECODE_SPECIALS else "" for c in s)
-
-
 class TransductionGraph:
     """This is the object returned after performing a transduction using a Transducer.
 
@@ -529,11 +525,18 @@ def apply_unidecode(self, to_convert: str):
         tg = TransductionGraph(to_convert)
 
         # Conversion is done character by character using unidecode
-        converted = [
-            text_unidecode.unidecode(unicodedata.normalize("NFKC", c))
-            for c in to_convert
-        ]
-        converted = [sanitize_unidecode_output(c) for c in converted]
+        # Input whitespace is kept; spaces unidecode adds for other chars are removed
+        converted = []
+        for in_char in to_convert:
+            unidecode_str = text_unidecode.unidecode(
+                unicodedata.normalize("NFKC", in_char)
+            )
+            cc = [
+                c
+                for c in unidecode_str
+                if c.isalpha() or c in UNIDECODE_SPECIALS or in_char.isspace()
+            ]
+            converted.append("".join(cc))
         tg.output_string = "".join(converted)
 
         # Edges are calculated to follow the conversion step by step