Skip to content

Commit

Permalink
refactor: change langs.pkl to langs.json
Browse files Browse the repository at this point in the history
  • Loading branch information
roedoejet committed Sep 5, 2023
1 parent b448523 commit 5a67040
Show file tree
Hide file tree
Showing 10 changed files with 37 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .github/pull_request_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Pull request template for adding a new language
- [ ] Mapping files are added in `g2p/mappings/langs`
- [ ] Mapping is either added to an existing folder or a new folder has been added
- [ ] Language folder and files use appropriate [ISO 639-3 codes](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes)
- [ ] Config.yaml file includes all author names, and settings necessary
- [ ] `config-g2p.yaml` file includes all author names, and settings necessary
- [ ] Please add some test data in `g2p/tests/public/data`. The added file should be a csv/tsv/psv file and each row should have the format `[input_mapping_code,output_mapping_code,input_string,output_string]`
- [ ] As the last step, G2P has been updated by running `g2p update` locally and committing the change
- [ ] You agree to license your contribution under the same license as this project (see [LICENSE](https://github.com/roedoejet/g2p/blob/main/LICENSE) file).
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:
sleep 5
coverage combine
coverage xml
if git status | grep -E 'static.*json|mapping.*pkl'; then echo 'g2p databases out of date, please run "g2p update" and commit the results.'; false; else echo OK; fi
if git status | grep -E 'static.*json|mapping.*pkl|mapping.*langs.json.gz'; then echo 'g2p databases out of date, please run "g2p update" and commit the results.'; false; else echo OK; fi
- name: Upload coverage information
uses: codecov/codecov-action@v3
with:
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ Gen DB: this is the part of the textual database that is generated when running
* g2p/mappings/generated/\*

Compiled DB: this contains the same info as Text DB + Gen DB, but in a format optimized for fast reading by the machine. This is what any program using `g2p` reads: `g2p convert`, `readalongs align`, `convertextract`, and also `g2p generate-mapping`. It consists of these files:
* g2p/mappings/langs/langs.pkl
* g2p/mappings/langs/langs.json.gz
* g2p/mappings/langs/network.pkl
* g2p/mappings/langs/static/languages-network.json
* g2p/mappings/langs/static/swagger.json
Expand Down
6 changes: 3 additions & 3 deletions g2p/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@
)
from g2p.mappings.langs import (
LANGS_DIR,
LANGS_JSON_NAME,
LANGS_NETWORK,
LANGS_PKL_NAME,
NETWORK_PKL_NAME,
reload_db,
)
Expand Down Expand Up @@ -654,10 +654,10 @@ def update(in_dir, out_dir):
if in_dir is None:
in_dir = LANGS_DIR
if out_dir is None:
langs_path = os.path.join(in_dir, LANGS_PKL_NAME)
langs_path = os.path.join(in_dir, LANGS_JSON_NAME)
network_path = os.path.join(in_dir, NETWORK_PKL_NAME)
else:
langs_path = os.path.join(out_dir, LANGS_PKL_NAME)
langs_path = os.path.join(out_dir, LANGS_JSON_NAME)
network_path = os.path.join(out_dir, NETWORK_PKL_NAME)
cache_langs(dir_path=in_dir, langs_path=langs_path, network_path=network_path)

Expand Down
3 changes: 3 additions & 0 deletions g2p/mappings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,9 @@ class MappingConfig(BaseModel):

mappings: List[Mapping]

def export_to_dict(self):
return {"mappings": [mapping.export_to_dict() for mapping in self.mappings]}

@staticmethod
def load_mapping_config_from_path(path_to_mapping_config: Union[str, Path]):
"""Loads a mapping configuration from a path, if you just want one specific mapping
Expand Down
11 changes: 6 additions & 5 deletions g2p/mappings/langs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,25 @@
"""
Language mappings for g2p.
"""
import gzip
import json
import os
import pickle

from networkx import DiGraph, read_gpickle

from g2p.log import LOGGER

LANGS_DIR = os.path.dirname(__file__)
LANGS_PKL_NAME = "langs.pkl"
LANGS_PKL = os.path.join(LANGS_DIR, LANGS_PKL_NAME)
LANGS_JSON_NAME = "langs.json.gz"
LANGS_PKL = os.path.join(LANGS_DIR, LANGS_JSON_NAME)
NETWORK_PKL_NAME = "network.pkl"
LANGS_NWORK_PATH = os.path.join(LANGS_DIR, NETWORK_PKL_NAME)


def load_langs(path: str = LANGS_PKL):
try:
with open(path, "rb") as f:
return pickle.load(f)
with gzip.open(path, "rt", encoding="utf8") as f:
return json.load(f)
except Exception as e:
LOGGER.warning(f"Failed to read language cache from {path}: {e}")
return {}
Expand Down
Binary file added g2p/mappings/langs/langs.json.gz
Binary file not shown.
Binary file removed g2p/mappings/langs/langs.pkl
Binary file not shown.
22 changes: 15 additions & 7 deletions g2p/mappings/langs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
"""

import gzip
import io
import json
import pickle
from pathlib import Path

from networkx import DiGraph, write_gpickle
Expand Down Expand Up @@ -146,8 +147,8 @@ def cache_langs(
Args:
dir_path: Path to scan for config-g2p.yaml files. Default is the
installed g2p/mappings/langs directory.
langs_path: Path to output langs.pkl pickle file. Default is
the installed g2p/mappings/langs/langs.pkl
langs_path: Path to output langs.json.gz file. Default is
the installed g2p/mappings/langs/langs.json.gz
network_path: Path to output pickle file. Default is the
installed g2p/mappings/langs/network.pkl.
"""
Expand Down Expand Up @@ -180,7 +181,7 @@ def cache_langs(
mapping_config.mappings[index] = Mapping.load_mapping_from_path(path, index)
# Exclude the parent directory when caching
mapping_config.mappings[index].parent_dir = None
langs[code] = mapping_config.model_dump()
langs[code] = mapping_config.model_dump(exclude_none=True)

# Save as a Directional Graph
lang_network = DiGraph()
Expand All @@ -189,9 +190,16 @@ def cache_langs(
with open(network_path, "wb") as f:
write_gpickle(lang_network, f, protocol=4)

with open(langs_path, "wb") as f:
pickle.dump(langs, f, protocol=4)

with open(langs_path, "w", encoding="utf8") as f:
with gzip.GzipFile(langs_path, "wb", mtime=0) as zipfile_raw:
with io.TextIOWrapper(zipfile_raw, encoding="utf-8") as zipfile:
json.dump(
langs,
zipfile,
separators=(",", ":"),
ensure_ascii=False,
sort_keys=True,
)
return langs


Expand Down
14 changes: 7 additions & 7 deletions g2p/tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,30 +36,30 @@ def test_update(self):
os.path.join(lang1_dir, "config-g2p.yaml"),
)
result = self.runner.invoke(update, ["-i", tmpdir])
langs_pkl = os.path.join(tmpdir, "langs.pkl")
langs_json = os.path.join(tmpdir, "langs.json.gz")
network_pkl = os.path.join(tmpdir, "network.pkl")
self.assertTrue(os.path.exists(langs_pkl))
self.assertTrue(os.path.exists(langs_json))
self.assertTrue(os.path.exists(network_pkl))

# Make sure it produces output
with tempfile.TemporaryDirectory() as tmpdir:
result = self.runner.invoke(update, ["-o", tmpdir])
self.assertEqual(result.exit_code, 0)
langs_pkl = os.path.join(tmpdir, "langs.pkl")
langs_json = os.path.join(tmpdir, "langs.json.gz")
network_pkl = os.path.join(tmpdir, "network.pkl")
self.assertTrue(os.path.exists(langs_pkl))
self.assertTrue(os.path.exists(langs_json))
self.assertTrue(os.path.exists(network_pkl))
langs = load_langs(langs_pkl)
langs = load_langs(langs_json)
self.assertTrue(langs is not None)
network = load_network(network_pkl)
self.assertTrue(network is not None)
# Corrupt the output and make sure we still can run
with open(langs_pkl, "wb") as fh:
with open(langs_json, "wb") as fh:
fh.write(b"spam spam spam")
with open(network_pkl, "wb") as fh:
fh.write(b"eggs bacon spam")
with self.assertLogs(LOGGER, "WARNING"):
langs = load_langs(langs_pkl)
langs = load_langs(langs_json)
self.assertTrue(langs is not None)
with self.assertLogs(LOGGER, "WARNING"):
network = load_network(network_pkl)
Expand Down

0 comments on commit 5a67040

Please sign in to comment.