Merge pull request #244 from roedoejet/main
roedoejet authored Apr 12, 2023
2 parents b29435b + bd5690a commit 7836f9d
Showing 62 changed files with 128,293 additions and 133,656 deletions.
12 changes: 5 additions & 7 deletions .github/workflows/pythonpublish.yml
@@ -39,18 +39,16 @@ jobs:
- name: Determine tag
id: determine_tag
run: |
echo "::set-output name=TAG_VERSION::$(ls dist/g2p-*.tar.gz | sed -e 's/.*g2p-//' -e 's/.tar.gz.*//')"
echo "TAG_VERSION=$(ls dist/g2p-*.tar.gz | sed -e 's/.*g2p-//' -e 's/.tar.gz.*//')" >> $GITHUB_OUTPUT
- name: Bump version and push tag
id: tag_version
uses: mathieudutour/github-tag-action@v5.5
uses: mathieudutour/github-tag-action@v6.1
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
custom_tag: ${{ steps.determine_tag.outputs.TAG_VERSION }}
- name: Create a GitHub release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
uses: ncipollo/release-action@v1
with:
tag_name: ${{ steps.tag_version.outputs.new_tag }}
release_name: Release ${{ steps.tag_version.outputs.new_tag }}
tag: ${{ steps.tag_version.outputs.new_tag }}
name: Release ${{ steps.tag_version.outputs.new_tag }}
body: ${{ steps.tag_version.outputs.changelog }}
14 changes: 12 additions & 2 deletions .github/workflows/studio-release-tests.yml
@@ -19,10 +19,20 @@ jobs:
pip install -r requirements.txt
pip install -r requirements/requirements.test.txt
pip install -e .
pip install coverage coveralls
- name: Ensure browser is installed
run: python -m playwright install --with-deps chromium
- name: Run tests
run: |
gunicorn --worker-class eventlet -w 1 g2p.app:APP --no-sendfile --bind 0.0.0.0:5000 --daemon
coverage run --include g2p/app.py --parallel-mode \
-m gunicorn --worker-class eventlet -w 1 g2p.app:APP --no-sendfile --bind 0.0.0.0:5000 --daemon
sleep 5
cd g2p/tests && python test_studio.py
python g2p/tests/test_studio.py
pkill coverage
sleep 5
coverage combine
coverage xml
- uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: false # too many upload errors to keep "true"
8 changes: 6 additions & 2 deletions .github/workflows/tests.yml
@@ -23,9 +23,13 @@ jobs:
pip install coverage coveralls
- name: Run tests
run: |
gunicorn --worker-class eventlet -w 1 g2p.app:APP --no-sendfile --bind 0.0.0.0:5000 --daemon
coverage run --omit g2p/app.py --omit 'g2p/tests/*' --parallel-mode \
-m gunicorn --worker-class eventlet -w 1 g2p.app:APP --no-sendfile --bind 0.0.0.0:5000 --daemon
sleep 5
coverage run run_tests.py dev
coverage run --omit g2p/app.py --omit 'g2p/tests/*' --parallel-mode run_tests.py dev
pkill coverage
sleep 5
coverage combine
coverage xml
if git status | grep -E 'static.*json|mapping.*pkl'; then echo 'g2p databases out of date, please run "g2p update" and commit the results.'; false; else echo OK; fi
- uses: codecov/codecov-action@v3
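A note on the coverage steps added in the two workflow files above: gunicorn is started under `coverage run --parallel-mode`, `pkill coverage` stops the daemonized server so its coverage data gets flushed, and `coverage combine` merges the per-process data files before `coverage xml` produces the report that the codecov action uploads. The sketch below is only an illustration of that combine/report step using coverage.py's Python API; the file names are assumptions, not part of the diff.

```python
# Illustrative sketch of what "coverage combine && coverage xml" do in the CI
# steps above, via coverage.py's Python API; paths here are assumptions.
from coverage import Coverage

cov = Coverage(data_file=".coverage")
# Merge the .coverage.* files written by each parallel-mode process
# (the gunicorn worker and the test run).
cov.combine()
cov.save()
cov.xml_report(outfile="coverage.xml")  # the file the codecov action uploads
```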
2 changes: 1 addition & 1 deletion README.md
@@ -5,7 +5,7 @@
[![Build Status](https://github.com/roedoejet/g2p/actions/workflows/tests.yml/badge.svg)](https://github.com/roedoejet/g2p/actions)
[![PyPI package](https://img.shields.io/pypi/v/g2p.svg)](https://pypi.org/project/g2p/)
[![license](https://img.shields.io/badge/Licence-MIT-green)](LICENSE)
[![standard-readme compliant](https://img.shields.io/badge/readme%20style-standard-brightgreen.svg?style=flat-square)](https://github.com/roedoejet/g2p)
[![standard-readme compliant](https://img.shields.io/badge/readme%20style-standard-brightgreen.svg)](https://github.com/roedoejet/g2p)

> Grapheme-to-Phoneme transformations that preserve input and output indices!
28 changes: 20 additions & 8 deletions g2p/__init__.py
@@ -17,6 +17,7 @@
"""
import io
import sys
from typing import Dict, Optional, Tuple, Union

from networkx import shortest_path
from networkx.exception import NetworkXNoPath
@@ -28,11 +29,11 @@
from g2p.mappings.tokenizer import make_tokenizer
from g2p.transducer import CompositeTransducer, TokenizingTransducer, Transducer

if sys.stdout.encoding != "utf8" and hasattr(sys.stdout, "buffer"):
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8")

if sys.stderr.encoding != "utf8" and hasattr(sys.stderr, "buffer"):
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf8")
if "pytest" not in sys.modules:
if sys.stdout.encoding != "utf8" and hasattr(sys.stdout, "buffer"):
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8")
if sys.stderr.encoding != "utf8" and hasattr(sys.stderr, "buffer"):
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf8")

if sys.version_info < (3, 6):
sys.exit(
@@ -41,23 +42,33 @@
)


_g2p_cache = {}
_g2p_cache: Dict[
Tuple[str, str, Optional[str]],
Union[Transducer, CompositeTransducer, TokenizingTransducer],
] = {}


def make_g2p(in_lang: str, out_lang: str, tok_lang=None):
def make_g2p(in_lang: str, out_lang: str, tok_lang: Optional[str] = None):
"""Make a g2p Transducer for mapping text from in_lang to out_lang via the
shortest path between them.
In general you should also add `tok_lang` to specify the language
for tokenization (probably the same as `in_lang`), because
transducers are not guaranteed to deal with whitespace,
punctuation, etc, properly.
Args:
in_lang (str): input language code
out_lang (str): output language code
tok_lang (Optional[str]): language for tokenization
Returns:
Transducer from in_lang to out_lang
Transducer from in_lang to out_lang, optionally with a tokenizer.
Raises:
InvalidLanguageCode: if in_lang or out_lang don't exist
NoPath: if there is no path between in_lang and out_lang
"""
if (in_lang, out_lang, tok_lang) in _g2p_cache:
return _g2p_cache[(in_lang, out_lang, tok_lang)]
@@ -97,6 +108,7 @@ def make_g2p(in_lang: str, out_lang: str, tok_lang=None):
mappings_needed.append(mapping)

# Either construct a Transducer or Composite Transducer
transducer: Union[Transducer, CompositeTransducer, TokenizingTransducer]
if len(mappings_needed) == 1:
transducer = Transducer(mappings_needed[0])
else:
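The docstring above describes the public entry point; a minimal usage sketch follows. It is not part of the diff: the language codes are illustrative placeholders for mappings that exist in the g2p network, and the attributes read from the result (`output_string`, `edges`) are assumed from the TransductionGraph API.

```python
# Minimal usage sketch of make_g2p (not part of this commit); the language
# codes are illustrative and must name mappings present in the g2p network.
from g2p import make_g2p

transducer = make_g2p("fra", "fra-ipa", tok_lang="fra")  # tok_lang: tokenize as the input language
tg = transducer("ceci est un test")
print(tg.output_string)   # the converted string
print(tg.edges)           # input/output index alignments preserved by g2p

# Repeated calls with the same (in_lang, out_lang, tok_lang) triple return the
# object stored in _g2p_cache rather than rebuilding the transducer.
assert make_g2p("fra", "fra-ipa", tok_lang="fra") is transducer
```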
129 changes: 57 additions & 72 deletions g2p/app.py
@@ -3,22 +3,18 @@
Views and config to the g2p Studio web app
"""
import json
import os
from typing import Union

from flask import Flask, render_template
from flask_cors import CORS
from flask_socketio import SocketIO, emit
from networkx.algorithms.dag import ancestors, descendants
from networkx import shortest_path

from g2p import make_g2p
from g2p.api import g2p_api
from g2p.log import LOGGER
from g2p.mappings import Mapping
from g2p.mappings.langs import LANGS_NETWORK
from g2p.mappings.utils import expand_abbreviations_format, flatten_abbreviations_format
from g2p.static import __file__ as static_file
from g2p.transducer import (
CompositeTransducer,
CompositeTransductionGraph,
@@ -56,37 +52,6 @@ def contrasting_text_color(hex_str):
)


def network_to_echart(write_to_file: bool = False, layout: bool = False):
nodes = []
no_nodes = len(LANGS_NETWORK.nodes)
for node in LANGS_NETWORK.nodes:
lang_name = node.split("-")[0]
no_ancestors = len(ancestors(LANGS_NETWORK, node))
no_descendants = len(descendants(LANGS_NETWORK, node))
size = min(
20,
max(
2, ((no_ancestors / no_nodes) * 100 + (no_descendants / no_nodes) * 100)
),
)
node = {"name": node, "symbolSize": size, "id": node, "category": lang_name}
nodes.append(node)
nodes.sort(key=lambda x: x["name"])
edges = []
for edge in LANGS_NETWORK.edges:
edges.append({"source": edge[0], "target": edge[1]})
if write_to_file:
with open(
os.path.join(os.path.dirname(static_file), "languages-network.json"),
"w",
encoding="utf-8",
newline="\n",
) as f:
f.write(json.dumps({"nodes": nodes, "edges": edges}) + "\n")
LOGGER.info("Wrote network nodes and edges to static file.")
return nodes, edges


def return_echart_data(tg: Union[CompositeTransductionGraph, TransductionGraph]):
x = 100
diff = 200
@@ -152,23 +117,6 @@ def return_echart_data(tg: Union[CompositeTransductionGraph, TransductionGraph])
return nodes, edges


def return_empty_mappings(n=DEFAULT_N):
"""Return 'n' * empty mappings"""
y = 0
mappings = []
while y < n:
mappings.append(
{"in": "", "out": "", "context_before": "", "context_after": ""}
)
y += 1
return mappings


def return_descendant_nodes(node: str):
"""Return possible outputs for a given input"""
return [x for x in descendants(LANGS_NETWORK, node)]


@APP.route("/")
def home():
"""Return homepage of g2p studio"""
@@ -194,6 +142,9 @@ def convert(message):
)
transducer = Transducer(mappings_obj)
transducers.append(transducer)
if len(transducers) == 0:
emit("conversion response", {"output_string": message["data"]["input_string"]})
return
transducer = CompositeTransducer(transducers)
if message["data"]["index"]:
tg = transducer(message["data"]["input_string"])
@@ -215,26 +166,60 @@
def change_table(message):
"""Change the lookup table"""
if message["in_lang"] == "custom" or message["out_lang"] == "custom":
mappings = Mapping(return_empty_mappings())
else:
transducer = make_g2p(message["in_lang"], message["out_lang"])
if isinstance(transducer, Transducer):
mappings = [transducer.mapping]
elif isinstance(transducer, CompositeTransducer):
mappings = [x.mapping for x in transducer._transducers]
# These are only used to generate JSON to send to the client,
# so it's safe to create a list of references to the same thing.
mappings = [
{"in": "", "out": "", "context_before": "", "context_after": ""}
] * DEFAULT_N
abbs = [[""] * 6] * DEFAULT_N
kwargs = {
"language_name": "Custom",
"display_name": "Custom",
"in_lang": "custom",
"out_lang": "custom",
"include": False,
"type": "mapping",
"case_sensitive": True,
"norm_form": "NFC",
"escape_special": False,
"prevent_feeding": False,
"reverse": False,
"rule_ordering": "as-written",
"out_delimiter": "",
}
emit(
"table response",
[
{
"mappings": mappings,
"abbs": abbs,
"kwargs": kwargs,
}
],
)
else:
pass
emit(
"table response",
[
{
"mappings": x.plain_mapping(),
"abbs": expand_abbreviations_format(x.abbreviations),
"kwargs": x.kwargs,
}
for x in mappings
],
)
# Do not create a composite transducer just to decompose it,
# because it is the individual ones which are cached by g2p
path = shortest_path(LANGS_NETWORK, message["in_lang"], message["out_lang"])
if len(path) == 1:
transducer = make_g2p(message["in_lang"], message["out_lang"])
mappings = [transducer.mapping]
else:
mappings = []
for lang1, lang2 in zip(path[:-1], path[1:]):
transducer = make_g2p(lang1, lang2)
mappings.append(transducer.mapping)
emit(
"table response",
[
{
"mappings": x.plain_mapping(),
"abbs": expand_abbreviations_format(x.abbreviations),
"kwargs": x.kwargs,
}
for x in mappings
],
)


@SOCKETIO.on("connect", namespace="/connect")
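Two idioms in the rewritten `change_table` handler above may be worth spelling out: `[d] * DEFAULT_N` builds a list of N references to one dict, which is fine for read-only JSON serialization (as the in-code comment notes), and `zip(path[:-1], path[1:])` walks the shortest path edge by edge so each hop goes through `make_g2p` and its cache. A small standalone sketch with placeholder values, not part of the diff:

```python
# Standalone illustration of two idioms used above (not part of the diff).
DEFAULT_N = 10  # placeholder; the real constant is defined in g2p.app

# A list of N references to the *same* dict -- harmless when it is only
# serialized to JSON, but mutating one element would appear to mutate all.
mappings = [{"in": "", "out": "", "context_before": "", "context_after": ""}] * DEFAULT_N
assert all(m is mappings[0] for m in mappings)

# Pairwise iteration over a node path, e.g. the result of shortest_path():
path = ["lang-a", "lang-a-ipa", "lang-b-ipa", "lang-b"]  # illustrative nodes
for lang1, lang2 in zip(path[:-1], path[1:]):
    print(f"{lang1} -> {lang2}")  # each hop is looked up (and cached) via make_g2p
```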
23 changes: 20 additions & 3 deletions g2p/cli.py
@@ -15,7 +15,7 @@
from g2p import make_g2p
from g2p._version import VERSION
from g2p.api import update_docs
from g2p.app import APP, network_to_echart
from g2p.app import APP
from g2p.exceptions import InvalidLanguageCode, MappingMissing, NoPath
from g2p.log import LOGGER
from g2p.mappings import Mapping
@@ -35,8 +35,13 @@
MAPPINGS_AVAILABLE,
NETWORK_PKL_NAME,
)
from g2p.mappings.langs.utils import cache_langs, check_ipa_known_segs
from g2p.mappings.langs.utils import (
cache_langs,
check_ipa_known_segs,
network_to_echart,
)
from g2p.mappings.utils import is_ipa, is_xsampa, load_mapping_from_path, normalize
from g2p.static import __file__ as static_file
from g2p.transducer import Transducer

PRINTER = pprint.PrettyPrinter(indent=4)
@@ -423,6 +428,13 @@ def generate_mapping( # noqa: C901
new_mapping.mapping_to_file()


@click.option(
"--substring-alignments",
"-a",
default=False,
is_flag=True,
help="Show the minimal monotonic substring alignments.",
)
@click.option(
"--pretty-edges",
"-e",
@@ -484,6 +496,7 @@ def convert( # noqa: C901
pretty_edges,
tok_lang,
config,
substring_alignments,
):
"""Convert INPUT_TEXT through g2p mapping(s) from IN_LANG to OUT_LANG.
@@ -551,6 +564,8 @@ def convert( # noqa: C901
if check:
transducer.check(tg, display_warnings=True)
outputs = [tg.output_string]
if substring_alignments:
outputs += [tg.substring_alignments()]
if pretty_edges:
outputs += [tg.pretty_edges()]
if debugger:
@@ -648,7 +663,9 @@ def update(in_dir, out_dir):
network_path = os.path.join(out_dir, NETWORK_PKL_NAME)
cache_langs(dir_path=in_dir, langs_path=langs_path, network_path=network_path)
update_docs()
network_to_echart(write_to_file=True)
network_to_echart(
outfile=os.path.join(os.path.dirname(static_file), "languages-network.json")
)


@click.argument("path", type=click.Path(exists=True, file_okay=True, dir_okay=False))
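The new `--substring-alignments/-a` flag surfaces `tg.substring_alignments()` in the `convert` command's output. A hedged sketch of exercising it through click's test runner follows; the argument order (INPUT_TEXT IN_LANG OUT_LANG) is taken from the command's docstring and the language codes are illustrative, not part of the diff.

```python
# Sketch of exercising the new -a flag via click's test runner (not part of
# the diff); language codes are illustrative placeholders.
from click.testing import CliRunner

from g2p.cli import convert

runner = CliRunner()
result = runner.invoke(convert, ["ceci est un test", "fra", "fra-ipa", "-a"])
print(result.exit_code)  # 0 on success
print(result.output)     # converted text plus the minimal monotonic substring alignments
```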
4 changes: 0 additions & 4 deletions g2p/log.py
@@ -15,10 +15,6 @@
def setup_logger(name):
"""Create logger and configure with cool colors!"""

logging.basicConfig(
level=logging.INFO
# filename="logger.log"
)
logger = logging.getLogger(name)
coloredlogs.install(
level="INFO",