Skip to content

Commit

Permalink
Merge pull request #801 from NatLibFi/issue799-language-detection-cli
Browse files Browse the repository at this point in the history
Add detect-language CLI command
  • Loading branch information
osma authored Sep 17, 2024
2 parents c81ed1c + 0dda55b commit 2611586
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 13 deletions.
34 changes: 34 additions & 0 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
OperationFailedException,
)
from annif.project import Access
from annif.simplemma_util import detect_language
from annif.util import metric_code

logger = annif.logger
Expand Down Expand Up @@ -735,5 +736,38 @@ def run_completion(shell):
click.echo(script)


@cli.command("detect-language")
@click.argument("languages")
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
def run_detect_language(languages, paths):
    """
    Detect the language of a single text document from standard input or for one or more
    document file(s) given its/their path(s).
    """
    # NOTE: the docstring above is the help text click shows for this command,
    # so it is kept verbatim; implementation notes live in comments instead.
    #
    # languages: comma-separated candidate language codes, e.g. "fi,sv,en".
    # paths: zero or more document paths; no paths, or a lone "-", means stdin.

    # Tolerate whitespace around the separators, e.g. "fi, sv, en".
    langs = tuple(code.strip() for code in languages.split(","))

    def detect_language_and_show(text, languages):
        # Print one "<language>\t<score>" line per entry returned by
        # detect_language(); the "unk" entry is displayed as "?".
        try:
            proportions = detect_language(text, languages)
        except ValueError as err:
            # click exceptions take a text message, so pass the message string
            # rather than the exception object, and keep the cause chained.
            raise click.UsageError(str(err)) from err
        for lang, score in proportions.items():
            if lang == "unk":
                lang = "?"
            click.echo(f"{lang}\t{score:.04f}")

    if paths and not (len(paths) == 1 and paths[0] == "-"):
        # One or more paths were given: report each document under its own
        # heading line.
        doclist = cli_util.open_text_documents(paths, docs_limit=None)
        for doc, path in zip(doclist.documents, paths):
            click.echo(f"Detected languages for {path}")
            detect_language_and_show(doc.text, langs)
    else:
        # No paths, or only "-": read the single document from standard input.
        text = sys.stdin.read()
        detect_language_and_show(text, langs)


# Invoke the click command group when this module is executed as a script.
if __name__ == "__main__":
    cli()
17 changes: 6 additions & 11 deletions annif/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
import connexion

import annif.registry
import annif.simplemma_util
from annif.corpus import Document, DocumentList, SubjectSet
from annif.exception import AnnifException
from annif.project import Access
from annif.simplemma_util import get_language_detector

if TYPE_CHECKING:
from connexion.lifecycle import ConnexionResponse
Expand Down Expand Up @@ -89,9 +89,8 @@ def detect_language(body: dict[str, Any]):
text = body.get("text")
languages = body.get("languages")

detector = get_language_detector(tuple(languages))
try:
proportions = detector.proportion_in_each_language(text)
proportions = annif.simplemma_util.detect_language(text, tuple(languages))
except ValueError:
return connexion.problem(
status=400,
Expand All @@ -100,14 +99,10 @@ def detect_language(body: dict[str, Any]):
)

result = {
"results": sorted(
[
{"language": lang if lang != "unk" else None, "score": score}
for lang, score in proportions.items()
],
key=lambda x: x["score"],
reverse=True,
)
"results": [
{"language": lang if lang != "unk" else None, "score": score}
for lang, score in proportions.items()
]
}
return result, 200, {"Content-Type": "application/json"}

Expand Down
8 changes: 7 additions & 1 deletion annif/simplemma_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Wrapper code for using Simplemma functionality in Annif"""

from typing import Tuple, Union
from typing import Dict, Tuple, Union

from simplemma import LanguageDetector, Lemmatizer
from simplemma.strategies import DefaultStrategy
Expand All @@ -15,3 +15,9 @@

def get_language_detector(lang: Union[str, Tuple[str, ...]]) -> LanguageDetector:
    """Build a Simplemma ``LanguageDetector`` for the given language(s).

    ``lang`` is a single language code or a tuple of codes. The detector is
    configured with this module's shared lemmatization strategy.
    """
    detector = LanguageDetector(
        lang,
        lemmatization_strategy=_lemmatization_strategy,
    )
    return detector


def detect_language(text: str, languages: Tuple[str, ...]) -> Dict[str, float]:
    """Return the proportion of ``text`` attributed to each candidate language.

    The result maps each key reported by the detector to its score, with
    entries ordered from the highest score to the lowest.
    """
    scores = get_language_detector(languages).proportion_in_each_language(text)
    ranked = sorted(scores.items(), key=lambda entry: entry[1], reverse=True)
    return dict(ranked)
36 changes: 36 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1391,3 +1391,39 @@ def test_completion_show_project_project_ids_dummy():
def test_completion_load_vocab_vocab_ids_all():
    """Completing ``load-vocab`` with an empty prefix offers these vocabulary ids."""
    expected_ids = ["dummy", "dummy-noname", "yso"]
    offered = get_completions(annif.cli.cli, ["load-vocab"], "")
    assert offered == expected_ids


def test_detect_language_stdin():
    """English text piped via stdin is reported as fully ``en``, unknown share last."""
    cli_args = ["detect-language", "fi,sv,en"]
    result = runner.invoke(
        annif.cli.cli, cli_args, input="This is some example text"
    )
    assert not result.exception
    assert result.exit_code == 0
    # The output ends with a newline, so the last real line is at index -2.
    output_lines = result.output.split("\n")
    assert output_lines[0] == "en\t1.0000"
    assert output_lines[-2] == "?\t0.0000"


def test_detect_language_unknown_language():
    """An unsupported language code makes the command fail with a usage error."""
    cli_args = ["detect-language", "xxx"]
    outcome = runner.invoke(
        annif.cli.cli, cli_args, input="This is some example text"
    )
    assert outcome.exception
    assert outcome.exit_code != 0
    assert "Error: Unsupported language: xxx" in outcome.output


def test_detect_language_file_and_stdin(tmpdir):
    """A file path and ``-`` given together each get their own heading line."""
    doc_path = tmpdir.join("doc-1.txt")
    doc_path.write("nothing special")

    cli_args = ["detect-language", "fi,en", str(doc_path), "-"]
    result = runner.invoke(annif.cli.cli, cli_args, input="kissa")

    assert not result.exception
    for shown_path in (doc_path, "-"):
        assert f"Detected languages for {shown_path}" in result.output
11 changes: 10 additions & 1 deletion tests/test_simplemma_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pytest

from annif.simplemma_util import get_language_detector
from annif.simplemma_util import detect_language, get_language_detector


def test_get_language_detector():
Expand All @@ -17,3 +17,12 @@ def test_get_language_detector_many():
text = "She said 'au revoir' and left"
proportion = detector.proportion_in_target_languages(text)
assert proportion == pytest.approx(1.0)


def test_detect_language():
    """Mixed English/French text yields 0.75/0.25 shares with ``en`` ranked first."""
    proportions = detect_language("She said 'au revoir' and left", ("fr", "en"))
    expected_shares = {"en": 0.75, "fr": 0.25}
    for lang, share in expected_shares.items():
        assert proportions[lang] == pytest.approx(share)
    # Results are ordered by descending score, so the top key must be English.
    assert next(iter(proportions)) == "en"

0 comments on commit 2611586

Please sign in to comment.