Skip to content

Commit

Permalink
perf: build only in_seq or mappings as needed for alignments
Browse files Browse the repository at this point in the history
  • Loading branch information
joanise committed May 16, 2023
1 parent 6543214 commit 4e6de3b
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 5 deletions.
38 changes: 36 additions & 2 deletions g2p/mappings/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,40 @@ def load_abbreviations_from_file(path):
return abbs


def get_alignment_input_string(alignment: str) -> str:
    """Parse one alignment of the format in *.aligned.txt and return just the input.

    An alignment is a whitespace-separated sequence of mappings.  Each mapping
    has the form ``in_tok|in_tok}out_tok|out_tok``: input tokens, a ``}``, then
    output tokens, with ``|`` separating tokens on either side and ``_``
    standing for an empty token.

    Args:
        alignment: one alignment line, e.g. ``"a}b c|d}e _}f"``.

    Returns:
        The concatenation of all non-empty input tokens, in order
        (``"acd"`` for the example above).

    Raises:
        ValueError: if any mapping does not contain a ``}`` separator.
    """
    # rindex (not index) so that the *last* "}" in a mapping is the separator.
    # Only the input side is built here; the output side is never touched.
    return "".join(
        [
            tok
            for mapping in alignment.split()
            for tok in mapping[: mapping.rindex("}")].split("|")
            if tok != "_"
        ]
    )


def get_alignment_output_tuple(alignment: str, delimiter="") -> Tuple:
    """Parse one alignment of the format in *.aligned.txt and return just the output seq.

    Each whitespace-separated mapping ``in|in}out|out`` contributes two
    consecutive items to the result: the number of input *characters* it
    covers, and its output tokens joined with ``delimiter``.  Tokens equal to
    ``_`` are treated as empty and dropped on both sides.

    Args:
        alignment: one alignment line, e.g. ``"a}b c|d}e|f"``.
        delimiter: string placed between the output tokens of one mapping.

    Returns:
        A flat tuple ``(in_len, out_seq, in_len, out_seq, ...)``.

    Raises:
        ValueError: if any mapping does not contain a ``}`` separator.
    """
    flat: List[Union[int, str]] = []
    for pair in alignment.split():
        # The last "}" in the mapping separates input tokens from output tokens.
        brace = pair.rindex("}")
        source_toks = [t for t in pair[:brace].split("|") if t != "_"]
        target_toks = [t for t in pair[brace + 1 :].split("|") if t != "_"]
        # We care about *character* counts, so measure the joined input tokens.
        # Storing only the count (not the characters) keeps the result compact.
        flat.append(len("".join(source_toks)))
        flat.append(delimiter.join(target_toks))
    return tuple(flat)


def parse_alignment(alignment: str, delimiter="") -> Tuple[str, Tuple]:
"""Parse one alignment of the format in *.aligned.txt
Expand All @@ -491,7 +525,7 @@ def parse_alignment(alignment: str, delimiter="") -> Tuple[str, Tuple]:
return ("".join(chars), tuple(mappings))


def load_alignments_from_file(path, delimiter="") -> Dict[str, Tuple]:
def load_alignments_from_file(path, delimiter="") -> Dict[str, str]:
"""Load alignments in Phonetisaurus default format.
Returns a mapping of input words to output alignments used to
Expand All @@ -508,7 +542,7 @@ def load_alignments_from_file(path, delimiter="") -> Dict[str, Tuple]:
spam = spam.strip()
if not spam:
continue
(word, mappings) = parse_alignment(spam, delimiter)
word = get_alignment_input_string(spam)
alignments[word] = spam
return alignments

Expand Down
5 changes: 2 additions & 3 deletions g2p/transducer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
from g2p.mappings.tokenizer import Tokenizer
from g2p.mappings.utils import (
compose_indices,
get_alignment_output_tuple,
is_ipa,
normalize,
normalize_with_indices,
parse_alignment,
unicode_escape,
)

Expand Down Expand Up @@ -807,8 +807,7 @@ def apply_lexicon(self, to_convert: str):
tg.edges = []
tg.output_string = ""
else:
(word, alignment) = parse_alignment(alignment_str, self.out_delimiter)
assert word == to_convert
alignment = get_alignment_output_tuple(alignment_str, self.out_delimiter)
tg.output_string = ""
edges: List[Tuple[int, int]] = []
in_pos = 0
Expand Down

0 comments on commit 4e6de3b

Please sign in to comment.