Merge pull request #244 from roedoejet/main
roedoejet authored Apr 12, 2023
2 parents b29435b + bd5690a commit 7836f9d
Showing 62 changed files with 128,293 additions and 133,656 deletions.
12 changes: 5 additions & 7 deletions .github/workflows/pythonpublish.yml
@@ -39,18 +39,16 @@ jobs:
- name: Determine tag
id: determine_tag
run: |
echo "::set-output name=TAG_VERSION::$(ls dist/g2p-*.tar.gz | sed -e 's/.*g2p-//' -e 's/.tar.gz.*//')"
echo "TAG_VERSION=$(ls dist/g2p-*.tar.gz | sed -e 's/.*g2p-//' -e 's/.tar.gz.*//')" >> $GITHUB_OUTPUT
- name: Bump version and push tag
id: tag_version
uses: mathieudutour/github-tag-action@v5.5
uses: mathieudutour/github-tag-action@v6.1
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
custom_tag: ${{ steps.determine_tag.outputs.TAG_VERSION }}
- name: Create a GitHub release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
uses: ncipollo/release-action@v1
with:
tag_name: ${{ steps.tag_version.outputs.new_tag }}
release_name: Release ${{ steps.tag_version.outputs.new_tag }}
tag: ${{ steps.tag_version.outputs.new_tag }}
name: Release ${{ steps.tag_version.outputs.new_tag }}
body: ${{ steps.tag_version.outputs.changelog }}
14 changes: 12 additions & 2 deletions .github/workflows/studio-release-tests.yml
@@ -19,10 +19,20 @@ jobs:
pip install -r requirements.txt
pip install -r requirements/requirements.test.txt
pip install -e .
pip install coverage coveralls
- name: Ensure browser is installed
run: python -m playwright install --with-deps chromium
- name: Run tests
run: |
gunicorn --worker-class eventlet -w 1 g2p.app:APP --no-sendfile --bind 0.0.0.0:5000 --daemon
coverage run --include g2p/app.py --parallel-mode \
-m gunicorn --worker-class eventlet -w 1 g2p.app:APP --no-sendfile --bind 0.0.0.0:5000 --daemon
sleep 5
cd g2p/tests && python test_studio.py
python g2p/tests/test_studio.py
pkill coverage
sleep 5
coverage combine
coverage xml
- uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: false # too many upload errors to keep "true"
8 changes: 6 additions & 2 deletions .github/workflows/tests.yml
@@ -23,9 +23,13 @@ jobs:
pip install coverage coveralls
- name: Run tests
run: |
gunicorn --worker-class eventlet -w 1 g2p.app:APP --no-sendfile --bind 0.0.0.0:5000 --daemon
coverage run --omit g2p/app.py --omit 'g2p/tests/*' --parallel-mode \
-m gunicorn --worker-class eventlet -w 1 g2p.app:APP --no-sendfile --bind 0.0.0.0:5000 --daemon
sleep 5
coverage run run_tests.py dev
coverage run --omit g2p/app.py --omit 'g2p/tests/*' --parallel-mode run_tests.py dev
pkill coverage
sleep 5
coverage combine
coverage xml
if git status | grep -E 'static.*json|mapping.*pkl'; then echo 'g2p databases out of date, please run "g2p update" and commit the results.'; false; else echo OK; fi
- uses: codecov/codecov-action@v3
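A note on the coverage steps added in the two workflow files above: gunicorn is started under `coverage run --parallel-mode`, `pkill coverage` stops the daemonized server so its coverage data gets flushed, and `coverage combine` merges the per-process data files before `coverage xml` produces the report that the codecov action uploads. The sketch below is only an illustration of that combine/report step using coverage.py's Python API; the file names are assumptions, not part of the diff.

```python
# Illustrative sketch of what "coverage combine && coverage xml" do in the CI
# steps above, via coverage.py's Python API; paths here are assumptions.
from coverage import Coverage

cov = Coverage(data_file=".coverage")
# Merge the .coverage.* files written by each parallel-mode process
# (the gunicorn worker and the test run).
cov.combine()
cov.save()
cov.xml_report(outfile="coverage.xml")  # the file the codecov action uploads
```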
2 changes: 1 addition & 1 deletion README.md
@@ -5,7 +5,7 @@
[![Build Status](https://github.com/roedoejet/g2p/actions/workflows/tests.yml/badge.svg)](https://github.com/roedoejet/g2p/actions)
[![PyPI package](https://img.shields.io/pypi/v/g2p.svg)](https://pypi.org/project/g2p/)
[![license](https://img.shields.io/badge/Licence-MIT-green)](LICENSE)
[![standard-readme compliant](https://img.shields.io/badge/readme%20style-standard-brightgreen.svg?style=flat-square)](https://github.com/roedoejet/g2p)
[![standard-readme compliant](https://img.shields.io/badge/readme%20style-standard-brightgreen.svg)](https://github.com/roedoejet/g2p)

> Grapheme-to-Phoneme transformations that preserve input and output indices!
28 changes: 20 additions & 8 deletions g2p/__init__.py
@@ -17,6 +17,7 @@
"""
import io
import sys
from typing import Dict, Optional, Tuple, Union

from networkx import shortest_path
from networkx.exception import NetworkXNoPath
@@ -28,11 +29,11 @@
from g2p.mappings.tokenizer import make_tokenizer
from g2p.transducer import CompositeTransducer, TokenizingTransducer, Transducer

if sys.stdout.encoding != "utf8" and hasattr(sys.stdout, "buffer"):
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8")

if sys.stderr.encoding != "utf8" and hasattr(sys.stderr, "buffer"):
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf8")
if "pytest" not in sys.modules:
if sys.stdout.encoding != "utf8" and hasattr(sys.stdout, "buffer"):
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8")
if sys.stderr.encoding != "utf8" and hasattr(sys.stderr, "buffer"):
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf8")

if sys.version_info < (3, 6):
sys.exit(
@@ -41,23 +42,33 @@
)


_g2p_cache = {}
_g2p_cache: Dict[
Tuple[str, str, Optional[str]],
Union[Transducer, CompositeTransducer, TokenizingTransducer],
] = {}


def make_g2p(in_lang: str, out_lang: str, tok_lang=None):
def make_g2p(in_lang: str, out_lang: str, tok_lang: Optional[str] = None):
"""Make a g2p Transducer for mapping text from in_lang to out_lang via the
shortest path between them.
In general you should also add `tok_lang` to specify the language
for tokenization (probably the same as `in_lang`), because
transducers are not guaranteed to deal with whitespace,
punctuation, etc, properly.
Args:
in_lang (str): input language code
out_lang (str): output language code
tok_lang (Optional[str]): language for tokenization
Returns:
Transducer from in_lang to out_lang
Transducer from in_lang to out_lang, optionally with a tokenizer.
Raises:
InvalidLanguageCode: if in_lang or out_lang don't exist
NoPath: if there is no path between in_lang and out_lang
"""
if (in_lang, out_lang, tok_lang) in _g2p_cache:
return _g2p_cache[(in_lang, out_lang, tok_lang)]
@@ -97,6 +108,7 @@ def make_g2p(in_lang: str, out_lang: str, tok_lang=None):
mappings_needed.append(mapping)

# Either construct a Transducer or Composite Transducer
transducer: Union[Transducer, CompositeTransducer, TokenizingTransducer]
if len(mappings_needed) == 1:
transducer = Transducer(mappings_needed[0])
else:
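The docstring above describes the public entry point; a minimal usage sketch follows. It is not part of the diff: the language codes are illustrative placeholders for mappings that exist in the g2p network, and the attributes read from the result (`output_string`, `edges`) are assumed from the TransductionGraph API.

```python
# Minimal usage sketch of make_g2p (not part of this commit); the language
# codes are illustrative and must name mappings present in the g2p network.
from g2p import make_g2p

transducer = make_g2p("fra", "fra-ipa", tok_lang="fra")  # tok_lang: tokenize as the input language
tg = transducer("ceci est un test")
print(tg.output_string)   # the converted string
print(tg.edges)           # input/output index alignments preserved by g2p

# Repeated calls with the same (in_lang, out_lang, tok_lang) triple return the
# object stored in _g2p_cache rather than rebuilding the transducer.
assert make_g2p("fra", "fra-ipa", tok_lang="fra") is transducer
```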
129 changes: 57 additions & 72 deletions g2p/app.py
@@ -3,22 +3,18 @@
Views and config to the g2p Studio web app
"""
import json
import os
from typing import Union

from flask import Flask, render_template
from flask_cors import CORS
from flask_socketio import SocketIO, emit
from networkx.algorithms.dag import ancestors, descendants
from networkx import shortest_path

from g2p import make_g2p
from g2p.api import g2p_api
from g2p.log import LOGGER
from g2p.mappings import Mapping
from g2p.mappings.langs import LANGS_NETWORK
from g2p.mappings.utils import expand_abbreviations_format, flatten_abbreviations_format
from g2p.static import __file__ as static_file
from g2p.transducer import (
CompositeTransducer,
CompositeTransductionGraph,
@@ -56,37 +52,6 @@ def contrasting_text_color(hex_str):
)


def network_to_echart(write_to_file: bool = False, layout: bool = False):
nodes = []
no_nodes = len(LANGS_NETWORK.nodes)
for node in LANGS_NETWORK.nodes:
lang_name = node.split("-")[0]
no_ancestors = len(ancestors(LANGS_NETWORK, node))
no_descendants = len(descendants(LANGS_NETWORK, node))
size = min(
20,
max(
2, ((no_ancestors / no_nodes) * 100 + (no_descendants / no_nodes) * 100)
),
)
node = {"name": node, "symbolSize": size, "id": node, "category": lang_name}
nodes.append(node)
nodes.sort(key=lambda x: x["name"])
edges = []
for edge in LANGS_NETWORK.edges:
edges.append({"source": edge[0], "target": edge[1]})
if write_to_file:
with open(
os.path.join(os.path.dirname(static_file), "languages-network.json"),
"w",
encoding="utf-8",
newline="\n",
) as f:
f.write(json.dumps({"nodes": nodes, "edges": edges}) + "\n")
LOGGER.info("Wrote network nodes and edges to static file.")
return nodes, edges


def return_echart_data(tg: Union[CompositeTransductionGraph, TransductionGraph]):
x = 100
diff = 200
@@ -152,23 +117,6 @@ def return_echart_data(tg: Union[CompositeTransductionGraph, TransductionGraph])
return nodes, edges


def return_empty_mappings(n=DEFAULT_N):
"""Return 'n' * empty mappings"""
y = 0
mappings = []
while y < n:
mappings.append(
{"in": "", "out": "", "context_before": "", "context_after": ""}
)
y += 1
return mappings


def return_descendant_nodes(node: str):
"""Return possible outputs for a given input"""
return [x for x in descendants(LANGS_NETWORK, node)]


@APP.route("/")
def home():
"""Return homepage of g2p studio"""
@@ -194,6 +142,9 @@ def convert(message):
)
transducer = Transducer(mappings_obj)
transducers.append(transducer)
if len(transducers) == 0:
emit("conversion response", {"output_string": message["data"]["input_string"]})
return
transducer = CompositeTransducer(transducers)
if message["data"]["index"]:
tg = transducer(message["data"]["input_string"])
@@ -215,26 +166,60 @@
def change_table(message):
"""Change the lookup table"""
if message["in_lang"] == "custom" or message["out_lang"] == "custom":
mappings = Mapping(return_empty_mappings())
else:
transducer = make_g2p(message["in_lang"], message["out_lang"])
if isinstance(transducer, Transducer):
mappings = [transducer.mapping]
elif isinstance(transducer, CompositeTransducer):
mappings = [x.mapping for x in transducer._transducers]
# These are only used to generate JSON to send to the client,
# so it's safe to create a list of references to the same thing.
mappings = [
{"in": "", "out": "", "context_before": "", "context_after": ""}
] * DEFAULT_N
abbs = [[""] * 6] * DEFAULT_N
kwargs = {
"language_name": "Custom",
"display_name": "Custom",
"in_lang": "custom",
"out_lang": "custom",
"include": False,
"type": "mapping",
"case_sensitive": True,
"norm_form": "NFC",
"escape_special": False,
"prevent_feeding": False,
"reverse": False,
"rule_ordering": "as-written",
"out_delimiter": "",
}
emit(
"table response",
[
{
"mappings": mappings,
"abbs": abbs,
"kwargs": kwargs,
}
],
)
else:
pass
emit(
"table response",
[
{
"mappings": x.plain_mapping(),
"abbs": expand_abbreviations_format(x.abbreviations),
"kwargs": x.kwargs,
}
for x in mappings
],
)
# Do not create a composite transducer just to decompose it,
# because it is the individual ones which are cached by g2p
path = shortest_path(LANGS_NETWORK, message["in_lang"], message["out_lang"])
if len(path) == 1:
transducer = make_g2p(message["in_lang"], message["out_lang"])
mappings = [transducer.mapping]
else:
mappings = []
for lang1, lang2 in zip(path[:-1], path[1:]):
transducer = make_g2p(lang1, lang2)
mappings.append(transducer.mapping)
emit(
"table response",
[
{
"mappings": x.plain_mapping(),
"abbs": expand_abbreviations_format(x.abbreviations),
"kwargs": x.kwargs,
}
for x in mappings
],
)


@SOCKETIO.on("connect", namespace="/connect")
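Two idioms in the rewritten `change_table` handler above may be worth spelling out: `[d] * DEFAULT_N` builds a list of N references to one dict, which is fine for read-only JSON serialization (as the in-code comment notes), and `zip(path[:-1], path[1:])` walks the shortest path edge by edge so each hop goes through `make_g2p` and its cache. A small standalone sketch with placeholder values, not part of the diff:

```python
# Standalone illustration of two idioms used above (not part of the diff).
DEFAULT_N = 10  # placeholder; the real constant is defined in g2p.app

# A list of N references to the *same* dict -- harmless when it is only
# serialized to JSON, but mutating one element would appear to mutate all.
mappings = [{"in": "", "out": "", "context_before": "", "context_after": ""}] * DEFAULT_N
assert all(m is mappings[0] for m in mappings)

# Pairwise iteration over a node path, e.g. the result of shortest_path():
path = ["lang-a", "lang-a-ipa", "lang-b-ipa", "lang-b"]  # illustrative nodes
for lang1, lang2 in zip(path[:-1], path[1:]):
    print(f"{lang1} -> {lang2}")  # each hop is looked up (and cached) via make_g2p
```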
23 changes: 20 additions & 3 deletions g2p/cli.py
@@ -15,7 +15,7 @@
from g2p import make_g2p
from g2p._version import VERSION
from g2p.api import update_docs
from g2p.app import APP, network_to_echart
from g2p.app import APP
from g2p.exceptions import InvalidLanguageCode, MappingMissing, NoPath
from g2p.log import LOGGER
from g2p.mappings import Mapping
@@ -35,8 +35,13 @@
MAPPINGS_AVAILABLE,
NETWORK_PKL_NAME,
)
from g2p.mappings.langs.utils import cache_langs, check_ipa_known_segs
from g2p.mappings.langs.utils import (
cache_langs,
check_ipa_known_segs,
network_to_echart,
)
from g2p.mappings.utils import is_ipa, is_xsampa, load_mapping_from_path, normalize
from g2p.static import __file__ as static_file
from g2p.transducer import Transducer

PRINTER = pprint.PrettyPrinter(indent=4)
@@ -423,6 +428,13 @@ def generate_mapping( # noqa: C901
new_mapping.mapping_to_file()


@click.option(
"--substring-alignments",
"-a",
default=False,
is_flag=True,
help="Show the minimal monotonic substring alignments.",
)
@click.option(
"--pretty-edges",
"-e",
@@ -484,6 +496,7 @@ def convert( # noqa: C901
pretty_edges,
tok_lang,
config,
substring_alignments,
):
"""Convert INPUT_TEXT through g2p mapping(s) from IN_LANG to OUT_LANG.
@@ -551,6 +564,8 @@ def convert( # noqa: C901
if check:
transducer.check(tg, display_warnings=True)
outputs = [tg.output_string]
if substring_alignments:
outputs += [tg.substring_alignments()]
if pretty_edges:
outputs += [tg.pretty_edges()]
if debugger:
@@ -648,7 +663,9 @@ def update(in_dir, out_dir):
network_path = os.path.join(out_dir, NETWORK_PKL_NAME)
cache_langs(dir_path=in_dir, langs_path=langs_path, network_path=network_path)
update_docs()
network_to_echart(write_to_file=True)
network_to_echart(
outfile=os.path.join(os.path.dirname(static_file), "languages-network.json")
)


@click.argument("path", type=click.Path(exists=True, file_okay=True, dir_okay=False))
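The new `--substring-alignments/-a` flag surfaces `tg.substring_alignments()` in the `convert` command's output. A hedged sketch of exercising it through click's test runner follows; the argument order (INPUT_TEXT IN_LANG OUT_LANG) is taken from the command's docstring and the language codes are illustrative, not part of the diff.

```python
# Sketch of exercising the new -a flag via click's test runner (not part of
# the diff); language codes are illustrative placeholders.
from click.testing import CliRunner

from g2p.cli import convert

runner = CliRunner()
result = runner.invoke(convert, ["ceci est un test", "fra", "fra-ipa", "-a"])
print(result.exit_code)  # 0 on success
print(result.output)     # converted text plus the minimal monotonic substring alignments
```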
4 changes: 0 additions & 4 deletions g2p/log.py
@@ -15,10 +15,6 @@
def setup_logger(name):
"""Create logger and configure with cool colors!"""

logging.basicConfig(
level=logging.INFO
# filename="logger.log"
)
logger = logging.getLogger(name)
coloredlogs.install(
level="INFO",