From 1c009c2d093fdb3dc470ad2bcc607ded4dd49652 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Mon, 10 Apr 2023 19:57:09 -0700 Subject: [PATCH] fix: Improve UI for the fine-grained NER model --- docs/tutorials/basic.ipynb | 36 ++++++++++++++++++------------------ pyproject.toml | 1 + src/dacy/__init__.py | 2 +- src/dacy/download.py | 28 ++++++++++++++++++++++++++-- src/dacy/ner/fine_grained.py | 19 ++++++++++++------- 5 files changed, 58 insertions(+), 28 deletions(-) diff --git a/docs/tutorials/basic.ipynb b/docs/tutorials/basic.ipynb index 9047d029..85e3372f 100644 --- a/docs/tutorials/basic.ipynb +++ b/docs/tutorials/basic.ipynb @@ -319,7 +319,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 7, @@ -339,7 +339,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -394,7 +394,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -466,7 +466,7 @@ { "data": { "text/html": [ - "\n", + "\n", "\n", " DaCy\n", " PROPN\n", @@ -508,57 +508,57 @@ "\n", "\n", "\n", - " \n", + " \n", " \n", - " nsubj\n", + " nsubj\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " cop\n", + " cop\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " det\n", + " det\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " amod\n", + " amod\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " case\n", + " case\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " amod\n", + " amod\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " nmod\n", + " nmod\n", " \n", " \n", "\n", diff --git a/pyproject.toml b/pyproject.toml index d24e8c95..2d556740 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -184,6 +184,7 @@ exclude = [ "training/v0.0.0/**", "training/v0.1.0/**", "training/v0.1.1/**", + "training/ner_fine_grained/**", "papers/DaCy-A-Unified-Framework-for-Danish-NLP/**" ] # Allow 
unused variables when underscore-prefixed. diff --git a/src/dacy/__init__.py b/src/dacy/__init__.py index d390a690..5191c647 100644 --- a/src/dacy/__init__.py +++ b/src/dacy/__init__.py @@ -2,5 +2,5 @@ from dacy.sentiment import make_emotion_transformer # noqa from .about import __download_url__, __title__, __version__ # noqa -from .download import download_model # noqa +from .download import download_model, get_latest_version # noqa from .load import load, models, where_is_my_dacy # noqa diff --git a/src/dacy/download.py b/src/dacy/download.py index 8e907376..551c8149 100644 --- a/src/dacy/download.py +++ b/src/dacy/download.py @@ -1,11 +1,14 @@ """Functions for downloading DaCy models.""" import os +from distutils.version import StrictVersion from importlib.metadata import version from pathlib import Path from spacy.util import get_installed_models from tqdm import tqdm +versions = ["1.1.2", "1.0.0", "1.3.3", "1.0.12", "1.0.2"] +versions.sort(key=StrictVersion) DACY_DEFAULT_PATH = Path.home() / ".dacy" DEFAULT_CACHE_DIR = os.getenv( @@ -26,6 +29,26 @@ } +def get_latest_version(model: str) -> str: + """Returns the latest version of a DaCy model. + + Args: + model: string indicating the model + + Returns: + str: latest version of the model + """ + if model in {"small", "medium", "large"}: + model = f"da_dacy_{model}_trf" + versions = [mdl.split("-")[-1] for mdl in models_url if model in mdl] + versions = sorted( + versions, + key=lambda s: [int(u) for u in s.split(".")], + reverse=True, + ) + return versions[0] + + def models() -> list[str]: """Returns a list of valid DaCy models. 
@@ -82,12 +105,13 @@ def download_model( >>> download_model(model="da_dacy_medium_trf-0.1.0") """ if model in {"small", "medium", "large"}: - model = f"da_dacy_{model}_trf-0.1.0" + latest_version = get_latest_version(model) + model = f"da_dacy_{model}_trf-{latest_version}" mdl_version = model.split("-")[-1] if model not in models_url: raise ValueError( - "The model is not available in DaCy. Please use dacy.models() to see a" + f"The model '{model}' is not available in DaCy. Please use dacy.models() to see a" + " list of all models", ) diff --git a/src/dacy/ner/fine_grained.py b/src/dacy/ner/fine_grained.py index a9967db6..afd07763 100644 --- a/src/dacy/ner/fine_grained.py +++ b/src/dacy/ner/fine_grained.py @@ -1,4 +1,4 @@ -from typing import Callable, Literal +from typing import Callable, Literal, Optional from spacy.lang.da import Danish from spacy.language import Language @@ -9,14 +9,18 @@ @Danish.factory( "dacy/ner-fine-grained", - default_config={}, + default_config={ + "version": None, + "size": "medium", + "transformer_name": "ner-transformer", + }, ) def create_finegrained_ner_component( nlp: Language, name: str, - size: Literal["small", "medium", "large"] = "small", - transformer_name: str = "ner-transformer", - version: str = "0.1.0", + size: Literal["small", "medium", "large"], + transformer_name: str, + version: Optional[str], ) -> Callable[[Doc], Doc]: """Create a fine grained NER component using the dacy models. @@ -25,9 +29,10 @@ def create_finegrained_ner_component( name: The name of the component size: The size of the model to use. Can be "small", "medium" or "large" transformer_name: The name of the transformer component which the NER moel will listen to - version: The version of the model to use + version: The version of the model to use. 
If None, the latest version will be used """ - + if version is None: + version = dacy.get_latest_version(f"da_dacy_{size}_ner_fine_grained") nlp_ner = dacy.load(f"da_dacy_{size}_ner_fine_grained-{version}") nlp.add_pipe(factory_name="transformer", name=transformer_name, source=nlp_ner) name_, component = nlp_ner.components[-1]