Skip to content

Commit

Permalink
fix: remove spontaneous extraneous spaces from und-ipa
Browse files Browse the repository at this point in the history
If spaces are not present in the input, they should not appear in the output.
If they are present in the input, they are stripped from the output; this should
not be a problem in practice, as they are still visible in the alignments.
  • Loading branch information
dhdaines authored and roedoejet committed Mar 14, 2023
1 parent ffba389 commit 9e64b7f
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 8 deletions.
Binary file modified g2p/mappings/langs/langs.pkl
Binary file not shown.
2 changes: 1 addition & 1 deletion g2p/mappings/langs/und/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ mappings:
authors:
- Patrick Littell
<<: *shared
- display_name: Undertermined Unicode to ASCII
- display_name: Undetermined Unicode to ASCII
type: unidecode
norm: NFD
in_lang: und
Expand Down
3 changes: 2 additions & 1 deletion g2p/mappings/langs/und/und_to_ipa.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,6 @@
{"in": "?", "out": "ʔ"},
{"in": "'", "out": "ʔ"},
{"in": ",", "out": "ʔ"},
{"in": ":", "out": ""}
{"in": ":", "out": ""},
{"in": " ", "out": ""}
]
37 changes: 31 additions & 6 deletions g2p/tests/test_unidecode_transducer.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,24 +37,49 @@ def test_unidecode_empty_output(self):
def test_unidecode_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
tg = transducer("été Nunavut ᓄᓇᕗᑦ")
self.assertEqual(
tg.output_string, "EY T EY N UW N AA V UW T N OW N AA F OW T "
)
self.assertEqual(tg.output_string, "EY T EY N UW N AA V UW T N OW N AA F OW T ")
# NOTE: spaces are deleted in the output, but they should be
# retained in the input alignments and thus recoverable
pe = tg.pretty_edges()
self.assertIn((" ", " "), (tuple(x) for x in pe[0])) # present in und-ascii
self.assertNotIn((" ", " "), (tuple(x) for x in pe[1])) # absent in und-ipa

def test_unidecode_arabic_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
tg = transducer("السلام عليكم")
self.assertEqual(tg.output_string, "L S L M L Y K M ")
self.assertEqual(tg.output_string, "L S L M L Y K M ")
pe = tg.pretty_edges()
self.assertIn((" ", " "), (tuple(x) for x in pe[0])) # present in und-ascii
self.assertNotIn((" ", " "), (tuple(x) for x in pe[1])) # absent in und-ipa

def test_unidecode_arabic_presentation_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
tg = transducer("ﺷﻜﺮﺍﹰ")
self.assertEqual(tg.output_string, "S HH K D AA N ")
self.assertEqual(tg.output_string, "S HH K D AA N ")
# No input spaces, so no spaces in first transduction
pe = tg.pretty_edges()
self.assertNotIn((" ", " "), (tuple(x) for x in pe[0]))
self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))

def test_unidecode_kanji_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
tg = transducer("日本語")
self.assertEqual(tg.output_string, "D IY B EY N Y UW ")
self.assertEqual(tg.output_string, "D IY B EY N Y UW ")
pe = tg.pretty_edges()
self.assertNotIn((" ", " "), (tuple(x) for x in pe[0]))
self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))

def test_unidecode_hanzi_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
tg = transducer("你们好!你们说汉语马?")
self.assertEqual(
tg.output_string,
"N IY M EY N HH AA OW N IY M EY N S HH UW OW Y IY Y UW M AA HH ",
)
pe = tg.pretty_edges()
self.assertNotIn((" ", " "), (tuple(x) for x in pe[0]))
self.assertNotIn((" ", " "), (tuple(x) for x in pe[1]))


if __name__ == "__main__":
main()

0 comments on commit 9e64b7f

Please sign in to comment.