fix: adjust all calls to make_g2p to its new signature
joanise committed May 1, 2023
1 parent f99774f commit bea7cec
Showing 9 changed files with 52 additions and 43 deletions.
16 changes: 8 additions & 8 deletions g2p/__init__.py
@@ -4,12 +4,12 @@
The main entry points for the g2p module are:
- make_g2p() to create a mapper from one lang to another
- make_tokenizer() to create a tokenizeer for a given language
- make_tokenizer() to create a tokenizer for a given language
- get_arpabet_langs() to get the list of languages with a path to eng-arpabet
Basic Usage:
from g2p import make_g2p
converter = make_g2p(in_lang, out_lang, tok_lang)
converter = make_g2p(in_lang, out_lang)
transduction_graph = converter(input_text_in_in_lang)
converted_text_in_out_lang = transduction_graph.output_string
@@ -51,26 +51,26 @@ def make_g2p(  # noqa: C901
"""Make a g2p Transducer for mapping text from in_lang to out_lang via the
shortest path between them.
In general you should also add `tok_lang` to specify the language
for tokenization (probably the same as `in_lang`), because
transducers are not guaranteed to deal with whitespace,
By default, the input is tokenized using the path of mappings from in_lang
to out_lang, because transducers are not guaranteed to deal with whitespace,
punctuation, etc, properly.
Args:
in_lang (str): input language code
out_lang (str): output language code
tok_lang (Optional[str]): DEPRECATED language for tokenization
tokenize (bool): whether tokenization should happen (default: yes)
custom_tokenizer (Tokenizer): the tokenizer to use (default: a tokenizer built on the)
tokenize (bool): whether tokenization should happen (default: True)
custom_tokenizer (Tokenizer): the tokenizer to use (default: a tokenizer
built on the path from in_lang and out_lang)
Returns:
Transducer from in_lang to out_lang, optionally with a tokenizer.
Raises:
InvalidLanguageCode: if in_lang or out_lang don't exist
NoPath: if there is no path between in_lang and out_lang
"""

if (in_lang, out_lang, tok_lang, tokenize, id(custom_tokenizer)) in _g2p_cache:
return _g2p_cache[(in_lang, out_lang, tok_lang, tokenize, id(custom_tokenizer))]

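To make the new signature concrete, here is a minimal usage sketch based on the updated docstring above; the language codes are illustrative, taken from the tests further down.

    from g2p import make_g2p, make_tokenizer

    # Default: the input is tokenized along the mapping path from in_lang to out_lang.
    converter = make_g2p("fra", "eng-arpabet")
    print(converter("ceci est un test").output_string)

    # Disable tokenization entirely, as the api.py, app.py and test call sites below now do.
    raw_converter = make_g2p("fra", "fra-ipa", tokenize=False)

    # Or pass an explicit tokenizer, as the reworked cli.py code below does.
    custom = make_g2p("fra", "eng-arpabet", custom_tokenizer=make_tokenizer("fra"))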
2 changes: 1 addition & 1 deletion g2p/api.py
@@ -119,7 +119,7 @@ def get(self):
index = args["index"]
debugger = args["debugger"]
try:
transducer = make_g2p(in_lang, out_lang)
transducer = make_g2p(in_lang, out_lang, tokenize=False)
tg = transducer(text)
text = tg.output_string
input_text = tg.input_string
6 changes: 4 additions & 2 deletions g2p/app.py
@@ -202,12 +202,14 @@ def change_table(message):
# because it is the individual ones which are cached by g2p
path = shortest_path(LANGS_NETWORK, message["in_lang"], message["out_lang"])
if len(path) == 1:
transducer = make_g2p(message["in_lang"], message["out_lang"])
transducer = make_g2p(
message["in_lang"], message["out_lang"], tokenize=False
)
mappings = [transducer.mapping]
else:
mappings = []
for lang1, lang2 in zip(path[:-1], path[1:]):
transducer = make_g2p(lang1, lang2)
transducer = make_g2p(lang1, lang2, tokenize=False)
mappings.append(transducer.mapping)
emit(
"table response",
12 changes: 6 additions & 6 deletions g2p/cli.py
@@ -13,7 +13,7 @@
from flask.cli import FlaskGroup
from networkx import has_path

from g2p import make_g2p
from g2p import make_g2p, make_tokenizer
from g2p._version import VERSION
from g2p.api import update_docs
from g2p.app import APP
@@ -492,7 +492,6 @@ def convert(  # noqa: C901
in_lang,
out_lang,
input_text,
path,
tok,
check,
debugger,
@@ -556,11 +555,12 @@ def convert(  # noqa: C901
# Determine which tokenizer to use, if any
if tok is not None and not tok and tok_lang is not None:
raise click.UsageError("Specified conflicting --no-tok and --tok-lang options.")
if tok and tok_lang is None:
tok_lang = "path"
custom_tokenizer = make_tokenizer(tok_lang) if tok_lang else None
# Transduce!!!
assert in_lang and out_lang
transducer = make_g2p(in_lang, out_lang, tok_lang=tok_lang)
transducer = make_g2p(
in_lang, out_lang, tokenize=tok, custom_tokenizer=custom_tokenizer
)
tg = transducer(input_text)
if check:
transducer.check(tg, display_warnings=True)
@@ -740,7 +740,7 @@ def show_mappings(lang1, lang2, verbose, csv):

if lang1 is not None and lang2 is not None:
try:
transducer = make_g2p(lang1, lang2)
transducer = make_g2p(lang1, lang2, tokenize=False)
except (NoPath, InvalidLanguageCode) as e:
raise click.UsageError(
f'Cannot find mapping from "{lang1}" to "{lang2}": {e}'
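For reference, a short sketch of how the CLI's --tok/--no-tok and --tok-lang options now map onto the new keyword arguments instead of the removed tok_lang parameter; the values are hard-coded here for illustration, whereas in the real command they come from click.

    from g2p import make_g2p, make_tokenizer

    tok = True        # --tok / --no-tok
    tok_lang = "fra"  # --tok-lang, or None when not given

    # Conflicting options are rejected, mirroring the click.UsageError above.
    if tok is not None and not tok and tok_lang is not None:
        raise ValueError("Specified conflicting --no-tok and --tok-lang options.")

    custom_tokenizer = make_tokenizer(tok_lang) if tok_lang else None
    transducer = make_g2p("fra", "eng-arpabet", tokenize=tok, custom_tokenizer=custom_tokenizer)
    print(transducer("ceci est un test").output_string)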
18 changes: 9 additions & 9 deletions g2p/tests/test_check_ipa_arpabet.py
@@ -33,7 +33,7 @@ def test_check_arpabet(self):
self.assertFalse(transducer.check(transducer("ñ")))

def test_check_ipa(self):
transducer = make_g2p("fra", "fra-ipa")
transducer = make_g2p("fra", "fra-ipa", tokenize=False)
self.assertTrue(transducer.check(transducer("ceci")))
self.assertFalse(transducer.check(transducer("ñ")))
with self.assertLogs(LOGGER, level="WARNING"):
@@ -49,12 +49,12 @@ def test_is_ipa_with_panphon_preprocessor(self):
self.assertTrue(utils.is_panphon("ɻ̊j̊ oⁿk oᵐp"))

def test_check_composite_transducer(self):
transducer = make_g2p("fra", "eng-arpabet")
transducer = make_g2p("fra", "eng-arpabet", tokenize=False)
self.assertTrue(transducer.check(transducer("ceci est un test été à")))
self.assertFalse(transducer.check(transducer("ñ")))

def test_check_tokenizing_transducer(self):
transducer = make_g2p("fra", "fra-ipa", tok_lang="fra")
transducer = make_g2p("fra", "fra-ipa")
self.assertTrue(transducer.check(transducer("ceci est un test été à")))
self.assertFalse(transducer.check(transducer("ñ oǹ")))
self.assertTrue(
@@ -65,7 +65,7 @@ def test_check_tokenizing_transducer(self):
)

def test_check_tokenizing_composite_transducer(self):
transducer = make_g2p("fra", "eng-arpabet", tok_lang="fra")
transducer = make_g2p("fra", "eng-arpabet")
self.assertTrue(transducer.check(transducer("ceci est un test été à")))
self.assertFalse(transducer.check(transducer("ñ oǹ")))
self.assertTrue(
@@ -83,7 +83,7 @@ def test_check_tokenizing_composite_transducer(self):
)

def test_shallow_check(self):
transducer = make_g2p("win", "eng-arpabet", tok_lang="win")
transducer = make_g2p("win", "eng-arpabet")
# This is False, but should be True! It's False because the mapping outputs :
# instead of ː
# EJJ 2022-06-16 With #100 fixed, this check is no longer failing.
@@ -92,16 +92,16 @@ def test_shallow_check(self):
self.assertTrue(transducer.check(transducer("uu"), shallow=True))

def test_check_with_equiv(self):
transducer = make_g2p("tau", "eng-arpabet", tok_lang="tau")
tau_ipa = make_g2p("tau", "tau-ipa", tok_lang="tau")(
transducer = make_g2p("tau", "eng-arpabet")
tau_ipa = make_g2p("tau", "tau-ipa")(
"sh'oo Jign maasee' do'eent'aa shyyyh"
).output_string
self.assertTrue(utils.is_panphon(tau_ipa))
eng_ipa = make_g2p("tau", "eng-ipa", tok_lang="tau")(
eng_ipa = make_g2p("tau", "eng-ipa")(
"sh'oo Jign maasee' do'eent'aa shyyyh"
).output_string
self.assertTrue(utils.is_panphon(eng_ipa))
eng_arpabet = make_g2p("tau", "eng-arpabet", tok_lang="tau")(
eng_arpabet = make_g2p("tau", "eng-arpabet")(
"sh'oo Jign maasee' do'eent'aa shyyyh"
).output_string
self.assertTrue(utils.is_arpabet(eng_arpabet))
5 changes: 5 additions & 0 deletions g2p/tests/test_lexicon_transducer.py
@@ -203,6 +203,11 @@ def test_eng_transducer(self):
tg = transducer("hello")
self.assertEqual(tg.output_string, "HH AH L OW ")

# since we tokenize by default now, this works:
self.assertEqual(
transducer("hello my friend").output_string, "HH AH L OW M AY F R EH N D "
)


if __name__ == "__main__":
main()
4 changes: 2 additions & 2 deletions g2p/tests/test_network.py
@@ -27,12 +27,12 @@ def test_no_path(self):
make_g2p("hei", "git")

def test_valid_composite(self):
transducer = make_g2p("atj", "eng-ipa")
transducer = make_g2p("atj", "eng-ipa", tokenize=False)
self.assertTrue(isinstance(transducer, CompositeTransducer))
self.assertEqual("niɡiɡw", transducer("nikikw").output_string)

def test_valid_transducer(self):
transducer = make_g2p("atj", "atj-ipa")
transducer = make_g2p("atj", "atj-ipa", tokenize=False)
self.assertTrue(isinstance(transducer, Transducer))
self.assertEqual("niɡiɡw", transducer("nikikw").output_string)

24 changes: 13 additions & 11 deletions g2p/tests/test_tokenize_and_map.py
@@ -17,7 +17,7 @@ def contextualize(self, word: str):

def test_tok_and_map_fra(self):
"""Chaining tests: tokenize and map a string"""
transducer = g2p.make_g2p("fra", "fra-ipa")
transducer = g2p.make_g2p("fra", "fra-ipa", tokenize=False)
tokenizer = g2p.make_tokenizer("fra")
# "teste" in isolation is at string and word end and beginning
word_ipa = transducer("teste").output_string
@@ -28,7 +28,7 @@ def test_tok_and_map_fra(self):
self.assertEqual(string_ipa, self.contextualize(word_ipa))

def test_tok_and_map_mic(self):
transducer = g2p.make_g2p("mic", "mic-ipa")
transducer = g2p.make_g2p("mic", "mic-ipa", tokenize=False)
tokenizer = g2p.make_tokenizer("mic")
word_ipa = transducer("sq").output_string
string_ipa = g2p.tokenize_and_map(
@@ -37,8 +37,10 @@ def test_tok_and_map_mic(self):
self.assertEqual(string_ipa, self.contextualize(word_ipa))

def test_tokenizing_transducer(self):
ref_word_ipa = g2p.make_g2p("mic", "mic-ipa")("sq").output_string
transducer = g2p.make_g2p("mic", "mic-ipa", tok_lang="mic")
ref_word_ipa = g2p.make_g2p("mic", "mic-ipa", tokenize=False)(
"sq"
).output_string
transducer = g2p.make_g2p("mic", "mic-ipa") # tokenizes on "mic" via "path"
self.assertEqual(transducer.transducer.in_lang, transducer.in_lang)
self.assertEqual(transducer.transducer.out_lang, transducer.out_lang)
self.assertEqual(transducer.transducer, transducer.transducers[0])
@@ -48,19 +50,19 @@ def test_tokenizing_transducer(self):
self.assertEqual(string_ipa, self.contextualize(ref_word_ipa))

def test_tokenizing_transducer_chain(self):
transducer = g2p.make_g2p("fra", "eng-arpabet", tok_lang="fra")
transducer = g2p.make_g2p("fra", "eng-arpabet")
self.assertEqual(
self.contextualize(transducer("teste").output_string),
transducer(self.contextualize("teste")).output_string,
)

def test_tokenizing_transducer_debugger(self):
transducer = g2p.make_g2p("fra", "fra-ipa", tok_lang="fra")
transducer = g2p.make_g2p("fra", "fra-ipa")
debugger = transducer("ceci est un test.").debugger
self.assertEqual(len(debugger), 4)

def test_tokenizing_transducer_edges(self):
transducer = g2p.make_g2p("fra", "fra-ipa", tok_lang="fra")
transducer = g2p.make_g2p("fra", "fra-ipa")
tg = transducer("est est")
# est -> ɛ, so edges are (0, 0), (1, 0), (2, 0) for each "est", plus the
# space to the space, and the second set of edges being offset
@@ -70,12 +72,12 @@ def test_tokenizing_transducer_edges(self):
self.assertEqual(tg.substring_alignments(), ref_alignments)

def test_tokenizing_transducer_edges2(self):
ref_edges = g2p.make_g2p("fra", "fra-ipa")("ça ça").edges
edges = g2p.make_g2p("fra", "fra-ipa", tok_lang="fra")("ça ça").edges
ref_edges = g2p.make_g2p("fra", "fra-ipa", tokenize=False)("ça ça").edges
edges = g2p.make_g2p("fra", "fra-ipa")("ça ça").edges
self.assertEqual(edges, ref_edges)

def test_tokenizing_transducer_edge_chain(self):
transducer = g2p.make_g2p("fra", "eng-arpabet", tok_lang="fra")
transducer = g2p.make_g2p("fra", "eng-arpabet")
# .edges on a transducer is always a single array with the
# end-to-end mapping, for a composed transducer we can access
# the individual tiers with .tiers
@@ -128,7 +130,7 @@ def test_tokenizing_transducer_edge_chain(self):
self.assertEqual(tier_alignments, ref_tier_alignments)

def test_tokenizing_transducer_edge_spaces(self):
transducer = g2p.make_g2p("fra", "eng-arpabet", tok_lang="fra")
transducer = g2p.make_g2p("fra", "eng-arpabet")
ref_edges = [
# " a, " -> " AA , "
(0, 0),
8 changes: 4 additions & 4 deletions g2p/tests/test_unidecode_transducer.py
@@ -18,7 +18,7 @@ def test_unidecode_mapping(self):
self.assertEqual(tg.output_string, "ete Nunavut nonafot")

def test_unidecode_g2p(self):
transducer = make_g2p("und", "und-ascii")
transducer = make_g2p("und", "und-ascii", tokenize=False)
tg = transducer(normalize("éçà", "NFD"))
self.assertEqual(tg.output_string, "eca")
self.assertEqual(tg.edges, [(0, 0), (1, 0), (2, 1), (3, 1), (4, 2), (5, 2)])
@@ -28,14 +28,14 @@ def test_unidecode_g2p(self):
self.assertEqual(tg.edges, [(0, 0), (1, 1), (2, 2)])

def test_unidecode_empty_output(self):
transducer = make_g2p("und", "und-ascii")
transducer = make_g2p("und", "und-ascii", tokenize=False)
# \u0361 on its own gets deleted completely by unidecode
tg = transducer("\u0361")
self.assertEqual(tg.output_string, "")
self.assertEqual(tg.edges, [])

def test_unidecode_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
transducer = make_g2p("und", "eng-arpabet", tokenize=False)
tg = transducer("été Nunavut ᓄᓇᕗᑦ")
self.assertEqual(
tg.output_string, "EY T EY N UW N AA V UW T N OW N AA F OW T "
@@ -57,7 +57,7 @@ def test_unidecode_kanji_to_arpabet(self):
self.assertEqual(tg.output_string, "D IY B EY N Y UW ")

def test_unidecode_hanzi_to_arpabet(self):
transducer = make_g2p("und", "eng-arpabet")
transducer = make_g2p("und", "eng-arpabet", tokenize=False)
tg = transducer("你们好!你们说汉语马?")
self.assertEqual(
tg.output_string,