Skip to content

Commit

Permalink
refactor(config): require a 'mappings' key
Browse files Browse the repository at this point in the history
  • Loading branch information
roedoejet committed Sep 5, 2023
1 parent 30dc282 commit a753e07
Show file tree
Hide file tree
Showing 22 changed files with 262 additions and 231 deletions.
14 changes: 7 additions & 7 deletions g2p/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
import g2p.deprecation
from g2p.exceptions import InvalidLanguageCode, NoPath
from g2p.log import LOGGER
from g2p.mappings import Mapping
from g2p.mappings.langs import LANGS, LANGS_NETWORK
from g2p.mappings import LANGS, Mapping
from g2p.mappings.langs import LANGS_NETWORK
from g2p.mappings.tokenizer import Tokenizer, make_tokenizer
from g2p.transducer import CompositeTransducer, TokenizingTransducer, Transducer

Expand Down Expand Up @@ -177,13 +177,13 @@ def get_arpabet_langs():
# this will be the set of all langs in g2p, which we need temporarily
full_lang_names = {}

for _, v in LANGS.items():
for mapping in v["mappings"]:
for v in LANGS.values():
for mapping in v.mappings:
# add mapping to names hash table
full_lang_names[mapping["in_lang"]] = mapping["language_name"]
full_lang_names[mapping.in_lang] = mapping.language_name
# add input id to all available langs list
if mapping["in_lang"] not in langs_available:
langs_available.append(mapping["in_lang"])
if mapping.in_lang not in langs_available:
langs_available.append(mapping.in_lang)

# get the key from all networks in g2p module that have a path to 'eng-arpabet',
# which is needed for the readalongs
Expand Down
30 changes: 12 additions & 18 deletions g2p/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from typing import List, Tuple

import click
import yaml
from flask.cli import FlaskGroup
from networkx import has_path

Expand All @@ -20,7 +19,7 @@
from g2p.app import APP
from g2p.exceptions import InvalidLanguageCode, MappingMissing, NoPath
from g2p.log import LOGGER
from g2p.mappings import MAPPINGS_AVAILABLE, Mapping, Rule
from g2p.mappings import MAPPINGS_AVAILABLE, Mapping, MappingConfig, Rule
from g2p.mappings.create_fallback_mapping import (
DUMMY_INVENTORY,
align_to_dummy_fallback,
Expand Down Expand Up @@ -517,29 +516,25 @@ def convert( # noqa: C901
if config:
# This isn't that DRY - copied from g2p/mappings/langs/__init__.py
mappings_legal_pairs = []
with open(config, encoding="utf8") as f:
data = yaml.safe_load(f)
if "mappings" in data:
for index, mapping in enumerate(data["mappings"]):
mappings_legal_pairs.append(
(
data["mappings"][index]["in_lang"],
data["mappings"][index]["out_lang"],
)
mapping_config = MappingConfig.load_mapping_config_from_path(config)
for index in range(len(mapping_config.mappings)):
mappings_legal_pairs.append(
(
mapping_config.mappings[index].in_lang,
mapping_config.mappings[index].out_lang,
)
data["mappings"][index] = Mapping.load_mapping_from_path(config, index)
else:
mapping = Mapping.load_mapping_from_path(config)
data["mappings"] = [mapping]
mappings_legal_pairs.append((mapping.in_lang, mapping.out_lang))
)
mapping_config.mappings[index] = Mapping.load_mapping_from_path(
config, index
)
for pair in mappings_legal_pairs:
if pair[0] in LANGS_NETWORK.nodes:
LOGGER.warning(
f"A mapping with the name '{pair[0]}' is already defined in g2p. "
"Your local mapping with the same name might not function properly."
)
LANGS_NETWORK.add_edges_from(mappings_legal_pairs)
MAPPINGS_AVAILABLE.extend(data["mappings"])
MAPPINGS_AVAILABLE.extend(mapping_config.mappings)
# Check input lang exists
if in_lang not in LANGS_NETWORK.nodes:
raise click.UsageError(f"'{in_lang}' is not a valid value for 'IN_LANG'")
Expand Down Expand Up @@ -687,7 +682,6 @@ def scan(lang, path):
Displays the set of un-mapped characters in a document.
Accounts for case sensitivity in the configuration.
"""
# breakpoint()
# Check input lang exists
if lang not in LANGS_NETWORK.nodes:
raise click.UsageError(f"'{lang}' is not a valid value for 'LANG'")
Expand Down
94 changes: 61 additions & 33 deletions g2p/mappings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@
import re
from copy import deepcopy
from pathlib import Path
from typing import List, Pattern, Union
from typing import Dict, List, Pattern, Union

import yaml
from pydantic import BaseModel

from g2p import exceptions
from g2p.log import LOGGER
from g2p.mappings.langs import _MAPPINGS_AVAILABLE
from g2p.mappings.langs import _LANGS, _MAPPINGS_AVAILABLE
from g2p.mappings.langs import __file__ as LANGS_FILE
from g2p.mappings.utils import (
MAPPING_TYPE,
Expand Down Expand Up @@ -67,21 +68,10 @@ def load_mapping_from_path(path_to_mapping_config: Union[str, Path], index=0):
"""Loads a mapping from a path, if there is more than one mapping, then it loads based on the int
provided to the 'index' argument. Default is 0.
"""
if isinstance(path_to_mapping_config, str):
path = Path(path_to_mapping_config)
else:
path = path_to_mapping_config
parent_dir = path.parent
with open(path, encoding="utf8") as f:
loaded_config = yaml.safe_load(f)
if not isinstance(loaded_config, dict):
raise exceptions.MalformedMapping(
f"The mapping config at {path} is malformed, please check it is properly formed."
)
if "mappings" in loaded_config:
loaded_config = loaded_config["mappings"][index]
loaded_config["parent_dir"] = parent_dir
return Mapping(**loaded_config)
mapping_config = MappingConfig.load_mapping_config_from_path(
path_to_mapping_config
)
return mapping_config.mappings[index]

def model_post_init(self, *args, **kwargs) -> None:
"""After the model is constructed, we process the model specs by applying all the configuration to the rules (ie prevent feeding, unicode normalization etc..)"""
Expand Down Expand Up @@ -342,6 +332,13 @@ def mapping_to_file(self, output_path: str = GEN_DIR, file_type: str = "json"):
with open(fn, "w", encoding="utf8", newline="\n") as f:
self.mapping_to_stream(f, file_type)

def export_to_dict(self, mapping_type="json"):
    """Return this mapping's configuration as a plain dictionary.

    The model is dumped to JSON (leaving out None-valued fields and
    'parent_dir') and parsed back into a dict. The 'rules' entry is then
    set to the file name '<in_lang>_to_<out_lang>.<mapping_type>'.
    """
    serialized = self.model_dump_json(
        exclude_none=True, exclude={"parent_dir": True}
    )
    exported = json.loads(serialized)
    # Point 'rules' at a rules file name rather than keeping the dumped value.
    rules_file = f"{self.in_lang}_to_{self.out_lang}.{mapping_type}"
    exported["rules"] = rules_file
    return exported

def config_to_file(
self,
output_path: str = os.path.join(GEN_DIR, "config.yaml"),
Expand All @@ -357,34 +354,65 @@ def config_to_file(
else:
LOGGER.warning(f"writing mapping config to file at {output_path}")
fn = output_path
config_template = json.loads(
self.model_dump_json(exclude_none=True, exclude={"parent_dir": True})
)
config_template["rules"] = f"{self.in_lang}_to_{self.out_lang}.{mapping_type}"
template = {"mappings": [config_template]}
config_template = self.export_to_dict()
# Serialize piece-by-piece, which is why this is a list of type dict and not type Mapping
# If config file exists already, just add the mapping.
to_export = None
if add_config:
with open(fn, encoding="utf8") as f:
existing_data = yaml.safe_load(f.read())
existing_data = MappingConfig.load_mapping_config_from_path(fn)
updated = False
for i, mapping in enumerate(existing_data["mappings"]):
for i, mapping in enumerate(existing_data.mappings):
# if the mapping exists, just update the generation data
if (
mapping["in_lang"] == template["mappings"][0]["in_lang"]
and mapping["out_lang"] == template["mappings"][0]["out_lang"]
mapping.in_lang == config_template["in_lang"]
and mapping.out_lang == config_template["out_lang"]
):
existing_data["mappings"][i]["authors"] = template["mappings"][0][
"authors"
]
existing_data.mappings[i].authors = config_template["authors"]
updated = True
break
if not updated:
existing_data["mappings"].append(template["mappings"][0])
template = existing_data
existing_data.mappings.append(config_template)
to_export = {
"mappings": [
x.export_to_dict() if isinstance(x, Mapping) else x
for x in existing_data.mappings
]
}
else:
to_export = {"mappings": [config_template]}
with open(fn, "w", encoding="utf8", newline="\n") as f:
yaml.dump(template, f, Dumper=IndentDumper, default_flow_style=False)
yaml.dump(to_export, f, Dumper=IndentDumper, default_flow_style=False)


# Build a Mapping object from each raw entry of _MAPPINGS_AVAILABLE (imported
# from g2p.mappings.langs) when this module is imported.
MAPPINGS_AVAILABLE: List[Mapping] = [
Mapping(**mapping) for mapping in _MAPPINGS_AVAILABLE
]


class MappingConfig(BaseModel):
    """This is the format used by g2p for configuring mappings.

    A config has a single required 'mappings' field holding a list of Mapping
    definitions.
    """

    mappings: List[Mapping]

    @staticmethod
    def load_mapping_config_from_path(path_to_mapping_config: Union[str, Path]):
        """Load a mapping configuration from a YAML file.

        If you only want one specific mapping from the config, use
        Mapping.load_mapping_from_path instead.

        Args:
            path_to_mapping_config: location of the YAML config file.

        Returns:
            The MappingConfig built from the file. Each mapping entry is given
            the config file's directory as its 'parent_dir' before the model
            is constructed.

        Raises:
            exceptions.MalformedMapping: if the top level of the file is not a
                key/value document (for example an empty file or a bare list),
                or if constructing the model raises a TypeError.
        """
        path = Path(path_to_mapping_config)
        parent_dir = path.parent
        with open(path, encoding="utf8") as f:
            loaded_config = yaml.safe_load(f)
        # safe_load yields None for an empty file and may yield a list or a
        # scalar; reject those up front instead of failing with a bare
        # TypeError on the membership test below.
        if not isinstance(loaded_config, dict):
            raise exceptions.MalformedMapping(
                f"The mapping config at {path} is malformed, please check it is properly formed."
            )
        mappings = loaded_config.get("mappings")
        # Only annotate well-formed entries; anything else is left untouched so
        # that model validation reports it rather than this loop crashing.
        if isinstance(mappings, list):
            for mapping in mappings:
                if isinstance(mapping, dict):
                    mapping["parent_dir"] = parent_dir
        try:
            return MappingConfig(**loaded_config)
        except TypeError as e:
            raise exceptions.MalformedMapping from e


# Maps each key of the raw _LANGS cache (imported from g2p.mappings.langs) to a
# MappingConfig built from its cached dict.
# NOTE(review): this dict is built once when the module is imported, while
# g2p.mappings.langs.reload_db() refreshes _LANGS in place; it looks like LANGS
# would not pick up a reload. Confirm whether it needs rebuilding there.
LANGS: Dict[str, MappingConfig] = {k: MappingConfig(**v) for k, v in _LANGS.items()}
25 changes: 11 additions & 14 deletions g2p/mappings/langs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,9 @@ def get_available_languages(langs: dict) -> list:
for k, v in langs.items():
if k in ["generated", "font-encodings"]:
continue
if "mappings" in v:
for vv in v["mappings"]:
if "language_name" in vv:
language_names.add(vv["language_name"])
elif "language_name" in v:
language_names.add(v["language_name"])
for vv in v["mappings"]:
if "language_name" in vv:
language_names.add(vv["language_name"])
return sorted(language_names)


Expand All @@ -56,11 +53,11 @@ def get_available_mappings(langs: dict) -> list:
return mappings_available


LANGS = load_langs()
LANGS_NETWORK = load_network()
LANGS_AVAILABLE = get_available_languages(LANGS)
# Making private because it should be imported from g2p.mappings instead
_MAPPINGS_AVAILABLE = get_available_mappings(LANGS)
_LANGS = load_langs()
LANGS_AVAILABLE = get_available_languages(_LANGS)
_MAPPINGS_AVAILABLE = get_available_mappings(_LANGS)


def reload_db():
Expand All @@ -69,9 +66,9 @@ def reload_db():
# We update all structures in place, so that another module having done from
# g2p.mappings.langs import VAR will see the updates without any code changes.

global LANGS
LANGS.clear()
LANGS.update(load_langs())
global _LANGS
_LANGS.clear()
_LANGS.update(load_langs())

global LANGS_NETWORK
LANGS_NETWORK.clear()
Expand All @@ -80,8 +77,8 @@ def reload_db():

global LANGS_AVAILABLE
LANGS_AVAILABLE.clear()
LANGS_AVAILABLE.extend(get_available_languages(LANGS))
LANGS_AVAILABLE.extend(get_available_languages(_LANGS))

global _MAPPINGS_AVAILABLE
_MAPPINGS_AVAILABLE.clear()
_MAPPINGS_AVAILABLE.extend(get_available_mappings(LANGS))
_MAPPINGS_AVAILABLE.extend(get_available_mappings(_LANGS))
Binary file modified g2p/mappings/langs/langs.pkl
Binary file not shown.
41 changes: 15 additions & 26 deletions g2p/mappings/langs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,12 @@
import pickle
from pathlib import Path

import yaml
from networkx import DiGraph, write_gpickle
from networkx.algorithms.dag import ancestors, descendants

from g2p.exceptions import MalformedMapping, MappingNotInitializedProperlyError
from g2p.log import LOGGER
from g2p.mappings import MAPPINGS_AVAILABLE, Mapping
from g2p.mappings import MAPPINGS_AVAILABLE, Mapping, MappingConfig
from g2p.mappings.langs import LANGS_DIR, LANGS_NETWORK, LANGS_NWORK_PATH, LANGS_PKL
from g2p.mappings.utils import MAPPING_TYPE, Rule, is_ipa

Expand Down Expand Up @@ -159,33 +158,23 @@ def cache_langs(
mappings_legal_pairs = []
for path in paths:
code = path.parent.stem
with open(path, encoding="utf8") as f:
data = yaml.safe_load(f)
# If there is a mappings key, there is more than one mapping
mapping_config = MappingConfig.load_mapping_config_from_path(path)
# TODO: should put in some measure to prioritize non-generated
# mappings and warn when they override
if "mappings" in data:
for index, mapping in enumerate(data["mappings"]):
in_lang = data["mappings"][index]["in_lang"]
out_lang = data["mappings"][index]["out_lang"]
mappings_legal_pairs.append((in_lang, out_lang))
if "language_name" not in mapping:
raise MalformedMapping(
f"language_name missing in {path} from mapping "
f"from {in_lang} to {out_lang}"
)
data["mappings"][index] = json.loads(
Mapping.load_mapping_from_path(path, index).model_dump_json(
exclude={"parent_dir": True}
)
for index, mapping in enumerate(mapping_config.mappings):
in_lang = mapping_config.mappings[index].in_lang
out_lang = mapping_config.mappings[index].out_lang
mappings_legal_pairs.append((in_lang, out_lang))

if not mapping.language_name:
raise MalformedMapping(
f"language_name missing in {path} from mapping "
f"from {in_lang} to {out_lang}"
)
# if "abbreviations" in data['mappings'][index] and data['mappings'][index]['abbreviations'] is not None:
# breakpoint()
else:
data = Mapping.load_mapping_from_path(path)
if "language_name" not in data:
raise MalformedMapping(f"language_name missing in {path}")
langs[code] = data
mapping_config.mappings[index] = Mapping.load_mapping_from_path(path, index)
# Exclude the parent directory when caching
mapping_config.mappings[index].parent_dir = None
langs[code] = mapping_config.model_dump()

# Save as a Directional Graph
lang_network = DiGraph()
Expand Down
12 changes: 2 additions & 10 deletions g2p/mappings/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,10 +644,10 @@ class _MappingModelDefinition(BaseModel):
id: Optional[str] = None
"""A unique ID for the mapping"""

in_lang: str = "und"
in_lang: str = "standalone"
"""The input language ID"""

out_lang: str = "und"
out_lang: str = "standalone"
"""The output language ID"""

language_name: Optional[str] = None
Expand Down Expand Up @@ -788,11 +788,3 @@ def create_display_name(cls, display_name, values):
if display_name is None:
display_name = f"{values['in_lang']} {find_mapping_type(values['in_lang'])} to {values['out_lang']} {find_mapping_type(values['out_lang'])}"
return display_name


class MappingConfig(BaseModel):
"""This is the format used by g2p for configuring mappings.
It is solely used to generate schemas to validate
"""

mappings: List[_MappingModelDefinition]
Loading

0 comments on commit a753e07

Please sign in to comment.