From 70ab1e61e67d2684d350c6dcb7f74b8cc14d68bd Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 6 Mar 2023 14:27:40 -0500 Subject: [PATCH] fix: most sensible possible behaviour, keep spaces if user wanted them --- g2p/tests/test_unidecode_transducer.py | 8 +++++--- g2p/transducer/__init__.py | 21 ++++++++++++--------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/g2p/tests/test_unidecode_transducer.py b/g2p/tests/test_unidecode_transducer.py index 024acc02..50bff6a3 100755 --- a/g2p/tests/test_unidecode_transducer.py +++ b/g2p/tests/test_unidecode_transducer.py @@ -15,7 +15,7 @@ def test_unidecode_mapping(self): self.assertEqual(m.kwargs["type"], "unidecode") t = Transducer(m) tg = t("été Nunavut ᓄᓇᕗᑦ") - self.assertEqual(tg.output_string, "eteNunavutnonafot") + self.assertEqual(tg.output_string, "ete Nunavut nonafot") def test_unidecode_g2p(self): transducer = make_g2p("und", "und-ascii") @@ -37,12 +37,14 @@ def test_unidecode_empty_output(self): def test_unidecode_to_arpabet(self): transducer = make_g2p("und", "eng-arpabet") tg = transducer("été Nunavut ᓄᓇᕗᑦ") - self.assertEqual(tg.output_string, "EY T EY N UW N AA V UW T N OW N AA F OW T ") + self.assertEqual( + tg.output_string, "EY T EY N UW N AA V UW T N OW N AA F OW T " + ) def test_unidecode_arabic_to_arpabet(self): transducer = make_g2p("und", "eng-arpabet") tg = transducer("السلام عليكم") - self.assertEqual(tg.output_string, "L S L M L Y K M ") + self.assertEqual(tg.output_string, "L S L M L Y K M ") def test_unidecode_arabic_presentation_to_arpabet(self): transducer = make_g2p("und", "eng-arpabet") diff --git a/g2p/transducer/__init__.py b/g2p/transducer/__init__.py index fd88f3c2..79e28193 100644 --- a/g2p/transducer/__init__.py +++ b/g2p/transducer/__init__.py @@ -45,10 +45,6 @@ UNIDECODE_SPECIALS = ["@", "?", "'", ",", ":"] -def sanitize_unidecode_output(s: str) -> str: - return "".join(c if c.isalpha() or c in UNIDECODE_SPECIALS else "" for c in s) - - class TransductionGraph: """This is the object returned after performing a transduction using a Transducer. @@ -529,11 +525,18 @@ def apply_unidecode(self, to_convert: str): tg = TransductionGraph(to_convert) # Conversion is done character by character using unidecode - converted = [ - text_unidecode.unidecode(unicodedata.normalize("NFKC", c)) - for c in to_convert - ] - converted = [sanitize_unidecode_output(c) for c in converted] + # We retain spaces in the input, but spaces from unidecode are removed + converted = [] + for in_char in to_convert: + unidecode_str = text_unidecode.unidecode( + unicodedata.normalize("NFKC", in_char) + ) + cc = [ + c + for c in unidecode_str + if c.isalpha() or c in UNIDECODE_SPECIALS or in_char.isspace() + ] + converted.append("".join(cc)) tg.output_string = "".join(converted) # Edges are calculated to follow the conversion step by step