fix: remove spaces in sanitize_unidecode_output as suggested by @littell
roedoejet · Mar 14, 2023 · bd1b1ec · bd1b1ec
1 parent 7af2f0b
commit bd1b1ec
Show file tree

Hide file tree

Showing 5 changed files with 9 additions and 25 deletions.
diff --git a/g2p/mappings/langs/langs.pkl b/g2p/mappings/langs/langs.pkl
diff --git a/g2p/mappings/langs/und/config.yaml b/g2p/mappings/langs/und/config.yaml
@@ -7,6 +7,7 @@ mappings:
     out_lang: und-ipa
     norm: NFD
     case_sensitive: false
+    escape_special: true
     authors:
       - Patrick Littell
     <<: *shared

diff --git a/g2p/mappings/langs/und/und_to_ipa.json b/g2p/mappings/langs/und/und_to_ipa.json
@@ -26,9 +26,8 @@
     {"in": "y", "out": "j"},
     {"in": "z", "out": "z"},
     {"in": "@", "out": "ə"},
-    {"in": "\\?", "out": "ʔ"},
+    {"in": "?", "out": "ʔ"},
     {"in": "'", "out": "ʔ"},
     {"in": ",", "out": "ʔ"},
-    {"in": ":", "out": ""},
-    {"in": " ", "out": ""}
+    {"in": ":", "out": ""}
 ]
diff --git a/g2p/tests/test_unidecode_transducer.py b/g2p/tests/test_unidecode_transducer.py
@@ -15,7 +15,7 @@ def test_unidecode_mapping(self):
         self.assertEqual(m.kwargs["type"], "unidecode")
         t = Transducer(m)
         tg = t("été Nunavut ᓄᓇᕗᑦ")
-        self.assertEqual(tg.output_string, "ete Nunavut nonafot")
+        self.assertEqual(tg.output_string, "eteNunavutnonafot")
 
     def test_unidecode_g2p(self):
         transducer = make_g2p("und", "und-ascii")
@@ -38,36 +38,21 @@ def test_unidecode_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
         tg = transducer("été Nunavut ᓄᓇᕗᑦ")
         self.assertEqual(tg.output_string, "EY T EY N UW N AA V UW T N OW N AA F OW T ")
-        # NOTE: spaces are deleted in the output, but they should be
-        # retained in the input alignments and thus recoverable
-        pe = tg.pretty_edges()
-        self.assertIn((" ", " "), (tuple(x) for x in pe[0]))  # present in und-ascii
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))  # absent in und-ipa
 
     def test_unidecode_arabic_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
         tg = transducer("السلام عليكم")
         self.assertEqual(tg.output_string, "L S L M L Y K M ")
-        pe = tg.pretty_edges()
-        self.assertIn((" ", " "), (tuple(x) for x in pe[0]))  # present in und-ascii
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))  # absent in und-ipa
 
     def test_unidecode_arabic_presentation_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
         tg = transducer("ﺷﻜﺮﺍﹰ")
         self.assertEqual(tg.output_string, "S HH K D AA N ")
-        # No input spaces, so no spaces in first transduction
-        pe = tg.pretty_edges()
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[0]))
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))
 
     def test_unidecode_kanji_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
         tg = transducer("日本語")
         self.assertEqual(tg.output_string, "D IY B EY N Y UW ")
-        pe = tg.pretty_edges()
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[0]))
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))
 
     def test_unidecode_hanzi_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
@@ -76,9 +61,6 @@ def test_unidecode_hanzi_to_arpabet(self):
             tg.output_string,
             "N IY M EY N HH AA OW N IY M EY N S HH UW OW Y IY Y UW M AA HH ",
         )
-        pe = tg.pretty_edges()
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[0]))
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))
 
 
 if __name__ == "__main__":

diff --git a/g2p/transducer/__init__.py b/g2p/transducer/__init__.py
@@ -6,9 +6,9 @@
 
 import copy
 import re
+import unicodedata
 from collections import defaultdict
 from typing import Dict, List
-import unicodedata
 
 import text_unidecode
 
@@ -42,11 +42,13 @@
 # [[0,1],[2,-1]]
 ChangeLog = List[List[int]]
 
-UNIDECODE_SPECIALS = ["@", "?", "'", ",", ":", " "]
+UNIDECODE_SPECIALS = ["@", "?", "'", ",", ":"]
 
-def sanitize_unidecode_output(s: str) -> bool:
+
+def sanitize_unidecode_output(s: str) -> str:
     return "".join(c if c.isalpha() or c in UNIDECODE_SPECIALS else "" for c in s)
 
+
 class TransductionGraph:
     """This is the object returned after performing a transduction using a Transducer.