From bd1b1ece13ce7c596608b56f2897ad301a622c59 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 6 Mar 2023 14:11:51 -0500 Subject: [PATCH] fix: remove spaces in `sanitize_unidecode_output` as suggested by @littell --- g2p/mappings/langs/langs.pkl | Bin 308938 -> 308926 bytes g2p/mappings/langs/und/config.yaml | 1 + g2p/mappings/langs/und/und_to_ipa.json | 5 ++--- g2p/tests/test_unidecode_transducer.py | 20 +------------------- g2p/transducer/__init__.py | 8 +++++--- 5 files changed, 9 insertions(+), 25 deletions(-) diff --git a/g2p/mappings/langs/langs.pkl b/g2p/mappings/langs/langs.pkl index a533dc75a5da715f49d865dcd5fad6dfed4c5fd6..9f4eca76f647d53214736af578c9aa73ef5c6c88 100644 GIT binary patch delta 802 zcmX^0R%qW_p@tU57N!>FDJ&)&P1_j2V0--!=C@4SHJMo+&7XWKTWvC|8Cm__QrY8xPG8!*&nctK>mFlDqLGFq52`4E|W44Et` zMo=IC0~yQ$$xrU#<%GDQ8KSEhLsv!xM5Y2$rWhhqj47i5knX6+DN?U=fX6~Hc^9(aiv8ot30wF;POZ-_(G2Ap9O(FTQYRvOIT!04Wd8QndM z_EUhSO?Q~jEW!iUa6t!R1VW_P3|-`)4MZf-k41HQ!dgb&={)OM_@^_iW=WmC?jWQ7 Tbbcphc~C-E-+p>I%YQxqf0Ee7 delta 803 zcmdn@R_N4Qp@tU57N!>FDJ&)&ecKqoV0--!=C@4SHJMo+&7U4n%)&c;e;K3T^gFYe z`KIsPz#^u^9?QUxQJBR6rZSW>R5DaER7z{7Xk>w9s_%y9;}mMe4z78z5>MFw|yvL1erzWwanNT9`8V5Se@onJg(ru&;pxhy{|L z+{4QWaYHjiS2KpLj0%WM1*S|fM5Y*1MgtEBkdq)uOV gkWqg+zZ0`OD1CEIo5ajIegA3}$?f}>v;5}+0Jf{t;{X5v diff --git a/g2p/mappings/langs/und/config.yaml b/g2p/mappings/langs/und/config.yaml index 326304ad..5dae9405 100644 --- a/g2p/mappings/langs/und/config.yaml +++ b/g2p/mappings/langs/und/config.yaml @@ -7,6 +7,7 @@ mappings: out_lang: und-ipa norm: NFD case_sensitive: false + escape_special: true authors: - Patrick Littell <<: *shared diff --git a/g2p/mappings/langs/und/und_to_ipa.json b/g2p/mappings/langs/und/und_to_ipa.json index 277e2147..713e9648 100644 --- a/g2p/mappings/langs/und/und_to_ipa.json +++ b/g2p/mappings/langs/und/und_to_ipa.json @@ -26,9 +26,8 @@ {"in": "y", "out": "j"}, {"in": "z", "out": "z"}, {"in": "@", "out": "ə"}, - {"in": "\\?", "out": "ʔ"}, + {"in": "?", "out": "ʔ"}, {"in": "'", "out": "ʔ"}, {"in": ",", "out": "ʔ"}, - {"in": ":", "out": ""}, - {"in": " ", "out": ""} + {"in": ":", "out": ""} ] diff --git a/g2p/tests/test_unidecode_transducer.py b/g2p/tests/test_unidecode_transducer.py index d583441c..024acc02 100755 --- a/g2p/tests/test_unidecode_transducer.py +++ b/g2p/tests/test_unidecode_transducer.py @@ -15,7 +15,7 @@ def test_unidecode_mapping(self): self.assertEqual(m.kwargs["type"], "unidecode") t = Transducer(m) tg = t("été Nunavut ᓄᓇᕗᑦ") - self.assertEqual(tg.output_string, "ete Nunavut nonafot") + self.assertEqual(tg.output_string, "eteNunavutnonafot") def test_unidecode_g2p(self): transducer = make_g2p("und", "und-ascii") @@ -38,36 +38,21 @@ def test_unidecode_to_arpabet(self): transducer = make_g2p("und", "eng-arpabet") tg = transducer("été Nunavut ᓄᓇᕗᑦ") self.assertEqual(tg.output_string, "EY T EY N UW N AA V UW T N OW N AA F OW T ") - # NOTE: spaces are deleted in the output, but they should be - # retained in the input alignments and thus recoverable - pe = tg.pretty_edges() - self.assertIn((" ", " "), (tuple(x) for x in pe[0])) # present in und-ascii - self.assertNotIn((" ", " "), (tuple(x) for x in pe[1])) # absent in und-ipa def test_unidecode_arabic_to_arpabet(self): transducer = make_g2p("und", "eng-arpabet") tg = transducer("السلام عليكم") self.assertEqual(tg.output_string, "L S L M L Y K M ") - pe = tg.pretty_edges() - self.assertIn((" ", " "), (tuple(x) for x in pe[0])) # present in und-ascii - self.assertNotIn((" ", " "), (tuple(x) for x in pe[1])) # absent in und-ipa def test_unidecode_arabic_presentation_to_arpabet(self): transducer = make_g2p("und", "eng-arpabet") tg = transducer("ﺷﻜﺮﺍﹰ") self.assertEqual(tg.output_string, "S HH K D AA N ") - # No input spaces, so no spaces in first transduction - pe = tg.pretty_edges() - self.assertNotIn((" ", " "), (tuple(x) for x in pe[0])) - self.assertNotIn((" ", " "), (tuple(x) for x in pe[1])) def test_unidecode_kanji_to_arpabet(self): transducer = make_g2p("und", "eng-arpabet") tg = transducer("日本語") self.assertEqual(tg.output_string, "D IY B EY N Y UW ") - pe = tg.pretty_edges() - self.assertNotIn((" ", " "), (tuple(x) for x in pe[0])) - self.assertNotIn((" ", " "), (tuple(x) for x in pe[1])) def test_unidecode_hanzi_to_arpabet(self): transducer = make_g2p("und", "eng-arpabet") @@ -76,9 +61,6 @@ def test_unidecode_hanzi_to_arpabet(self): tg.output_string, "N IY M EY N HH AA OW N IY M EY N S HH UW OW Y IY Y UW M AA HH ", ) - pe = tg.pretty_edges() - self.assertNotIn((" ", " "), (tuple(x) for x in pe[0])) - self.assertNotIn((" ", " "), (tuple(x) for x in pe[1])) if __name__ == "__main__": diff --git a/g2p/transducer/__init__.py b/g2p/transducer/__init__.py index efcae67c..fd88f3c2 100644 --- a/g2p/transducer/__init__.py +++ b/g2p/transducer/__init__.py @@ -6,9 +6,9 @@ import copy import re +import unicodedata from collections import defaultdict from typing import Dict, List -import unicodedata import text_unidecode @@ -42,11 +42,13 @@ # [[0,1],[2,-1]] ChangeLog = List[List[int]] -UNIDECODE_SPECIALS = ["@", "?", "'", ",", ":", " "] +UNIDECODE_SPECIALS = ["@", "?", "'", ",", ":"] -def sanitize_unidecode_output(s: str) -> bool: + +def sanitize_unidecode_output(s: str) -> str: return "".join(c if c.isalpha() or c in UNIDECODE_SPECIALS else "" for c in s) + class TransductionGraph: """This is the object returned after performing a transduction using a Transducer.