From bd1b1ece13ce7c596608b56f2897ad301a622c59 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines <dhd@ecolingui.ca>
Date: Mon, 6 Mar 2023 14:11:51 -0500
Subject: [PATCH] fix: remove spaces in `sanitize_unidecode_output` as
 suggested by @littell

---
 g2p/mappings/langs/langs.pkl           | Bin 308938 -> 308926 bytes
 g2p/mappings/langs/und/config.yaml     |   1 +
 g2p/mappings/langs/und/und_to_ipa.json |   5 ++---
 g2p/tests/test_unidecode_transducer.py |  20 +-------------------
 g2p/transducer/__init__.py             |   8 +++++---
 5 files changed, 9 insertions(+), 25 deletions(-)

diff --git a/g2p/mappings/langs/langs.pkl b/g2p/mappings/langs/langs.pkl
index a533dc75a5da715f49d865dcd5fad6dfed4c5fd6..9f4eca76f647d53214736af578c9aa73ef5c6c88 100644
GIT binary patch
delta 802
zcmX^0R%qW_p@tU57N!>FDJ&)&P1_j2V0--!=C@4SHJMo+&7XWKTW<QkGDe~4dsnk?
zPPaJ4$TR)I1{Qv0j#vhUjKVBVFqNU4p^~ARp;B5qMI#F=Qw5Zn&Ul$o+#bwzfhcsr
zROkVb@xYMDQet6X$l{#txQ0bE3T%iISZNQ_$%RueRA;n8%xT4x(TB+BW6CH1Wu|}l
z%_O1*Hop|2uoOdK7Bko>vC|8Cm__QrY8xPG8!*&nctK>mFlDqLGFq52`4E|W44Et`
zMo=IC0~yQ$$xrU#<%GDQ8KSEhLsv!xM5Y2$rWhhqj47i5k<q}A$&!co9_R)T3nV`|
zCXf>nX6+DN?U=fX6~Hc^9(aiv8ot30wF;POZ-_(G2Ap9O(FTQYRvOIT!04Wd8QndM
z_EUhSO?Q~jEW!iUa6t!R1VW_P3|-`)4MZf-k41HQ!dgb&={)OM_@^_iW=WmC?jWQ7
Tbbcphc~C-E-+p>I%YQxqf0Ee7

delta 803
zcmdn@R_N4Qp@tU57N!>FDJ&)&ecKqoV0--!=C@4SHJMo+&7U4n%)&c;e;K3T^gFYe
z`KIsPz#^u^9?QUxQJBR6rZSW>R5DaER7z{7Xk>w9s<JqyH!fz8Z~*gMAnIH&)p<Z<
zJTPRklvo%TfRfWUx-yG!fK71%npwDtMbrk&YlY}+#n73d50TNwlu-bhFcD~i8rXzV
zh`Le?by>_%y9;}mMe4z78z5>MFw|yvL1erzWwanNT9`8V5Se@onJg(ru&;pxhy{|L
z+{4QWaYHjiS2KpLj0%WM1*S|fM5Y*1Mgt<FfgzJ65Ai+F4ImasesWA8rz$uc+9A5y
zF?AIyfLuOZ5E#4=rNI!T3Ybc7h=Y|H0+n)uOwCFI`+9mHP=E&{(8CmCKLx0Ly2E^C
z5wJ+s1s#wcWRYSsgoq~Cn1eQ85ulq@FfEN&geXt+V^N)+u$GZ`+IklL>EBkdq)uOV
gkWqg+zZ0`OD1CEIo5ajIegA3}$?f}>v;5}+0Jf{t;{X5v

diff --git a/g2p/mappings/langs/und/config.yaml b/g2p/mappings/langs/und/config.yaml
index 326304ad..5dae9405 100644
--- a/g2p/mappings/langs/und/config.yaml
+++ b/g2p/mappings/langs/und/config.yaml
@@ -7,6 +7,7 @@ mappings:
     out_lang: und-ipa
     norm: NFD
     case_sensitive: false
+    escape_special: true
     authors:
       - Patrick Littell
     <<: *shared
diff --git a/g2p/mappings/langs/und/und_to_ipa.json b/g2p/mappings/langs/und/und_to_ipa.json
index 277e2147..713e9648 100644
--- a/g2p/mappings/langs/und/und_to_ipa.json
+++ b/g2p/mappings/langs/und/und_to_ipa.json
@@ -26,9 +26,8 @@
     {"in": "y", "out": "j"},
     {"in": "z", "out": "z"},
     {"in": "@", "out": "ə"},
-    {"in": "\\?", "out": "ʔ"},
+    {"in": "?", "out": "ʔ"},
     {"in": "'", "out": "ʔ"},
     {"in": ",", "out": "ʔ"},
-    {"in": ":", "out": ""},
-    {"in": " ", "out": ""}
+    {"in": ":", "out": ""}
 ]
diff --git a/g2p/tests/test_unidecode_transducer.py b/g2p/tests/test_unidecode_transducer.py
index d583441c..024acc02 100755
--- a/g2p/tests/test_unidecode_transducer.py
+++ b/g2p/tests/test_unidecode_transducer.py
@@ -15,7 +15,7 @@ def test_unidecode_mapping(self):
         self.assertEqual(m.kwargs["type"], "unidecode")
         t = Transducer(m)
         tg = t("été Nunavut ᓄᓇᕗᑦ")
-        self.assertEqual(tg.output_string, "ete Nunavut nonafot")
+        self.assertEqual(tg.output_string, "eteNunavutnonafot")
 
     def test_unidecode_g2p(self):
         transducer = make_g2p("und", "und-ascii")
@@ -38,36 +38,21 @@ def test_unidecode_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
         tg = transducer("été Nunavut ᓄᓇᕗᑦ")
         self.assertEqual(tg.output_string, "EY T EY N UW N AA V UW T N OW N AA F OW T ")
-        # NOTE: spaces are deleted in the output, but they should be
-        # retained in the input alignments and thus recoverable
-        pe = tg.pretty_edges()
-        self.assertIn((" ", " "), (tuple(x) for x in pe[0]))  # present in und-ascii
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))  # absent in und-ipa
 
     def test_unidecode_arabic_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
         tg = transducer("السلام عليكم")
         self.assertEqual(tg.output_string, "L S L M L Y K M ")
-        pe = tg.pretty_edges()
-        self.assertIn((" ", " "), (tuple(x) for x in pe[0]))  # present in und-ascii
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))  # absent in und-ipa
 
     def test_unidecode_arabic_presentation_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
         tg = transducer("ﺷﻜﺮﺍﹰ")
         self.assertEqual(tg.output_string, "S HH K D AA N ")
-        # No input spaces, so no spaces in first transduction
-        pe = tg.pretty_edges()
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[0]))
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))
 
     def test_unidecode_kanji_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
         tg = transducer("日本語")
         self.assertEqual(tg.output_string, "D IY B EY N Y UW ")
-        pe = tg.pretty_edges()
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[0]))
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))
 
     def test_unidecode_hanzi_to_arpabet(self):
         transducer = make_g2p("und", "eng-arpabet")
@@ -76,9 +61,6 @@ def test_unidecode_hanzi_to_arpabet(self):
             tg.output_string,
             "N IY M EY N HH AA OW N IY M EY N S HH UW OW Y IY Y UW M AA HH ",
         )
-        pe = tg.pretty_edges()
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[0]))
-        self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))
 
 
 if __name__ == "__main__":
diff --git a/g2p/transducer/__init__.py b/g2p/transducer/__init__.py
index efcae67c..fd88f3c2 100644
--- a/g2p/transducer/__init__.py
+++ b/g2p/transducer/__init__.py
@@ -6,9 +6,9 @@
 
 import copy
 import re
+import unicodedata
 from collections import defaultdict
 from typing import Dict, List
-import unicodedata
 
 import text_unidecode
 
@@ -42,11 +42,13 @@
 # [[0,1],[2,-1]]
 ChangeLog = List[List[int]]
 
-UNIDECODE_SPECIALS = ["@", "?", "'", ",", ":", " "]
+UNIDECODE_SPECIALS = ["@", "?", "'", ",", ":"]
 
-def sanitize_unidecode_output(s: str) -> bool:
+
+def sanitize_unidecode_output(s: str) -> str:
     return "".join(c if c.isalpha() or c in UNIDECODE_SPECIALS else "" for c in s)
 
+
 class TransductionGraph:
     """This is the object returned after performing a transduction using a Transducer.