Skip to content

Commit

Permalink
support detecting the language of multiple files in a single CLI command
Browse files Browse the repository at this point in the history
  • Loading branch information
osma committed Sep 17, 2024
1 parent 2c06655 commit 0dda55b
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 28 deletions.
43 changes: 28 additions & 15 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -737,23 +737,36 @@ def run_completion(shell):


def _echo_language_proportions(text, langs):
    """Detect the language of ``text`` among ``langs`` and print the scores.

    Every entry of the mapping returned by ``detect_language`` is written with
    ``click.echo`` as ``<language><TAB><score>``, the score being formatted
    with four decimals. The ``unk`` key is displayed as ``?``.

    Raises:
        click.UsageError: if ``detect_language`` raises ``ValueError``.
    """
    try:
        proportions = detect_language(text, langs)
    except ValueError as err:
        # Hand click a plain message string and keep the original error as
        # the cause so tracebacks stay informative.
        raise click.UsageError(str(err)) from err
    for lang, score in proportions.items():
        label = "?" if lang == "unk" else lang
        click.echo(f"{label}\t{score:.04f}")


@cli.command("detect-language")
@click.argument("languages")
@click.argument(
    "paths", type=click.Path(dir_okay=False, exists=True, allow_dash=True), nargs=-1
)
def run_detect_language(languages, paths):
    """
    Detect the language of a single text document from standard input or of
    one or more document files given by their paths.

    LANGUAGES is a comma-separated list of candidate languages, e.g. "fi,sv,en".
    """
    # Tolerate whitespace around the commas and ignore empty entries, so that
    # input such as "fi, sv" or "fi,,sv" does not yield bogus language codes.
    langs = tuple(code.strip() for code in languages.split(",") if code.strip())
    if not langs:
        raise click.UsageError("At least one language is required as an argument")

    # No paths at all, or a lone "-", means: read one document from stdin.
    stdin_only = not paths or (len(paths) == 1 and paths[0] == "-")
    if stdin_only:
        _echo_language_proportions(sys.stdin.read(), langs)
        return

    doclist = cli_util.open_text_documents(paths, docs_limit=None)
    # NOTE(review): assumes doclist.documents yields one document per path,
    # in the same order as `paths` - confirm against open_text_documents.
    for doc, path in zip(doclist.documents, paths):
        click.echo(f"Detected languages for {path}")
        _echo_language_proportions(doc.text, langs)


if __name__ == "__main__":
Expand Down
28 changes: 15 additions & 13 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1393,10 +1393,10 @@ def test_completion_load_vocab_vocab_ids_all():
assert completions == ["dummy", "dummy-noname", "yso"]


def test_detect_language():
def test_detect_language_stdin():
result = runner.invoke(
annif.cli.cli,
["detect-language", "fi", "sv", "en"],
["detect-language", "fi,sv,en"],
input="This is some example text",
)
assert not result.exception
Expand All @@ -1405,17 +1405,6 @@ def test_detect_language():
assert result.output.split("\n")[-2] == "?\t0.0000"


def test_detect_language_no_candidates():
    """Invoking detect-language without candidate languages must fail."""
    cli_args = ["detect-language"]
    outcome = runner.invoke(
        annif.cli.cli, cli_args, input="This is some example text"
    )

    # The command has to abort and tell the user what is missing.
    expected_message = "At least one language is required as an argument"
    assert outcome.exception
    assert outcome.exit_code != 0
    assert expected_message in outcome.output


def test_detect_language_unknown_language():
failed_result = runner.invoke(
annif.cli.cli,
Expand All @@ -1425,3 +1414,16 @@ def test_detect_language_unknown_language():
assert failed_result.exception
assert failed_result.exit_code != 0
assert "Error: Unsupported language: xxx" in failed_result.output


def test_detect_language_file_and_stdin(tmpdir):
    """Run detect-language on a document file and on stdin ("-") together."""
    docfile1 = tmpdir.join("doc-1.txt")
    docfile1.write("nothing special")
    cli_args = ["detect-language", "fi,en", str(docfile1), "-"]

    outcome = runner.invoke(annif.cli.cli, cli_args, input="kissa")

    assert not outcome.exception
    # One header line is expected per input: the file path and the stdin dash.
    output = outcome.output
    assert f"Detected languages for {docfile1}" in output
    assert "Detected languages for -" in output

0 comments on commit 0dda55b

Please sign in to comment.