Skip to content

Commit

Permalink
fix: remove spaces in sanitize_unidecode_output as suggested by @li…
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines authored and roedoejet committed Mar 14, 2023
1 parent 7af2f0b commit bd1b1ec
Show file tree
Hide file tree
Showing 5 changed files with 9 additions and 25 deletions.
Binary file modified g2p/mappings/langs/langs.pkl
Binary file not shown.
1 change: 1 addition & 0 deletions g2p/mappings/langs/und/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ mappings:
out_lang: und-ipa
norm: NFD
case_sensitive: false
escape_special: true
authors:
- Patrick Littell
<<: *shared
Expand Down
5 changes: 2 additions & 3 deletions g2p/mappings/langs/und/und_to_ipa.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,8 @@
{"in": "y", "out": "j"},
{"in": "z", "out": "z"},
{"in": "@", "out": "ə"},
{"in": "\\?", "out": "ʔ"},
{"in": "?", "out": "ʔ"},
{"in": "'", "out": "ʔ"},
{"in": ",", "out": "ʔ"},
{"in": ":", "out": ""},
{"in": " ", "out": ""}
{"in": ":", "out": ""}
]
20 changes: 1 addition & 19 deletions g2p/tests/test_unidecode_transducer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def test_unidecode_mapping(self):
self.assertEqual(m.kwargs["type"], "unidecode")
t = Transducer(m)
tg = t("été Nunavut ᓄᓇᕗᑦ")
self.assertEqual(tg.output_string, "ete Nunavut nonafot")
self.assertEqual(tg.output_string, "eteNunavutnonafot")

def test_unidecode_g2p(self):
transducer = make_g2p("und", "und-ascii")
Expand All @@ -38,36 +38,21 @@ def test_unidecode_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
tg = transducer("été Nunavut ᓄᓇᕗᑦ")
self.assertEqual(tg.output_string, "EY T EY N UW N AA V UW T N OW N AA F OW T ")
# NOTE: spaces are deleted in the output, but they should be
# retained in the input alignments and thus recoverable
pe = tg.pretty_edges()
self.assertIn((" ", " "), (tuple(x) for x in pe[0])) # present in und-ascii
self.assertNotIn((" ", " "), (tuple(x) for x in pe[1])) # absent in und-ipa

def test_unidecode_arabic_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
tg = transducer("السلام عليكم")
self.assertEqual(tg.output_string, "L S L M L Y K M ")
pe = tg.pretty_edges()
self.assertIn((" ", " "), (tuple(x) for x in pe[0])) # present in und-ascii
self.assertNotIn((" ", " "), (tuple(x) for x in pe[1])) # absent in und-ipa

def test_unidecode_arabic_presentation_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
tg = transducer("ﺷﻜﺮﺍﹰ")
self.assertEqual(tg.output_string, "S HH K D AA N ")
# No input spaces, so no spaces in first transduction
pe = tg.pretty_edges()
self.assertNotIn((" ", " "), (tuple(x) for x in pe[0]))
self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))

def test_unidecode_kanji_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
tg = transducer("日本語")
self.assertEqual(tg.output_string, "D IY B EY N Y UW ")
pe = tg.pretty_edges()
self.assertNotIn((" ", " "), (tuple(x) for x in pe[0]))
self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))

def test_unidecode_hanzi_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
Expand All @@ -76,9 +61,6 @@ def test_unidecode_hanzi_to_arpabet(self):
tg.output_string,
"N IY M EY N HH AA OW N IY M EY N S HH UW OW Y IY Y UW M AA HH ",
)
pe = tg.pretty_edges()
self.assertNotIn((" ", " "), (tuple(x) for x in pe[0]))
self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))


if __name__ == "__main__":
Expand Down
8 changes: 5 additions & 3 deletions g2p/transducer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

import copy
import re
import unicodedata
from collections import defaultdict
from typing import Dict, List
import unicodedata

import text_unidecode

Expand Down Expand Up @@ -42,11 +42,13 @@
# [[0,1],[2,-1]]
ChangeLog = List[List[int]]

UNIDECODE_SPECIALS = ["@", "?", "'", ",", ":", " "]
UNIDECODE_SPECIALS = ["@", "?", "'", ",", ":"]

def sanitize_unidecode_output(s: str) -> bool:

def sanitize_unidecode_output(s: str) -> str:
    """Strip every character that is neither a letter nor a known special.

    A character of ``s`` is kept when ``str.isalpha`` is true for it or when
    it appears in the module-level ``UNIDECODE_SPECIALS`` list; all other
    characters are dropped. Kept characters stay in their original order.
    """
    kept = [ch for ch in s if ch.isalpha() or ch in UNIDECODE_SPECIALS]
    return "".join(kept)


class TransductionGraph:
"""This is the object returned after performing a transduction using a Transducer.
Expand Down

0 comments on commit bd1b1ec

Please sign in to comment.