Skip to content

Commit

Permalink
fix: most sensible possible behaviour, keep spaces if user wanted them
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines authored and roedoejet committed Mar 14, 2023
1 parent bd1b1ec commit 70ab1e6
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 12 deletions.
8 changes: 5 additions & 3 deletions g2p/tests/test_unidecode_transducer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def test_unidecode_mapping(self):
self.assertEqual(m.kwargs["type"], "unidecode")
t = Transducer(m)
tg = t("été Nunavut ᓄᓇᕗᑦ")
self.assertEqual(tg.output_string, "eteNunavutnonafot")
self.assertEqual(tg.output_string, "ete Nunavut nonafot")

def test_unidecode_g2p(self):
transducer = make_g2p("und", "und-ascii")
Expand All @@ -37,12 +37,14 @@ def test_unidecode_empty_output(self):
def test_unidecode_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
tg = transducer("été Nunavut ᓄᓇᕗᑦ")
self.assertEqual(tg.output_string, "EY T EY N UW N AA V UW T N OW N AA F OW T ")
self.assertEqual(
tg.output_string, "EY T EY N UW N AA V UW T N OW N AA F OW T "
)

def test_unidecode_arabic_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
tg = transducer("السلام عليكم")
self.assertEqual(tg.output_string, "L S L M L Y K M ")
self.assertEqual(tg.output_string, "L S L M L Y K M ")

def test_unidecode_arabic_presentation_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
Expand Down
21 changes: 12 additions & 9 deletions g2p/transducer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,6 @@
UNIDECODE_SPECIALS = ["@", "?", "'", ",", ":"]


def sanitize_unidecode_output(s: str) -> str:
    """Return *s* with every disallowed character dropped.

    A character is kept when it is alphabetic (``str.isalpha``) or when it
    appears in the module-level ``UNIDECODE_SPECIALS`` list; anything else
    (spaces, digits, other punctuation, ...) is removed.
    """
    kept = [ch for ch in s if ch.isalpha() or ch in UNIDECODE_SPECIALS]
    return "".join(kept)


class TransductionGraph:
"""This is the object returned after performing a transduction using a Transducer.
Expand Down Expand Up @@ -529,11 +525,18 @@ def apply_unidecode(self, to_convert: str):
tg = TransductionGraph(to_convert)

# Conversion is done character by character using unidecode
converted = [
text_unidecode.unidecode(unicodedata.normalize("NFKC", c))
for c in to_convert
]
converted = [sanitize_unidecode_output(c) for c in converted]
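# Whitespace characters in the input are retained (their unidecode output is
# kept in full); for any other input character, only letters and
# UNIDECODE_SPECIALS from the unidecode output are kept, so spaces, digits and
# other symbols introduced by unidecode are removed.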
converted = []
for in_char in to_convert:
unidecode_str = text_unidecode.unidecode(
unicodedata.normalize("NFKC", in_char)
)
cc = [
c
for c in unidecode_str
if c.isalpha() or c in UNIDECODE_SPECIALS or in_char.isspace()
]
converted.append("".join(cc))
tg.output_string = "".join(converted)

# Edges are calculated to follow the conversion step by step
Expand Down

0 comments on commit 70ab1e6

Please sign in to comment.