Add spaCy analyzer #527

Merged: 13 commits, Jan 24, 2022
6 changes: 6 additions & 0 deletions .github/workflows/python-package.yml
@@ -37,6 +37,12 @@ jobs:
if [[ ${{ matrix.python-version }} != '3.8' ]]; then pip install .[omikuji,yake]; fi
# Install the optional fastText dependencies for Python 3.8 only
if [[ ${{ matrix.python-version }} == '3.8' ]]; then pip install .[fasttext]; fi
# Install the optional spaCy dependencies for Python 3.8 only
if [[ ${{ matrix.python-version }} == '3.8' ]]; then
pip install .[spacy]
# Download the small English pretrained spaCy model needed by the spacy analyzer
python -m spacy download en_core_web_sm
fi
# For Python 3.7
# - voikko and pycld3 dependencies
if [[ ${{ matrix.python-version }} == '3.7' ]]; then python -m pip install .[voikko,pycld3]; fi
13 changes: 11 additions & 2 deletions Dockerfile
@@ -3,7 +3,7 @@ FROM python:3.8-slim-bullseye AS builder
LABEL maintainer="Juho Inkinen <juho.inkinen@helsinki.fi>"

SHELL ["/bin/bash", "-c"]
ARG optional_dependencies=dev,voikko,pycld3,fasttext,nn,omikuji,yake
ARG optional_dependencies=dev,voikko,pycld3,fasttext,nn,omikuji,yake,spacy
# Building fastText needs some system packages
RUN if [[ $optional_dependencies =~ "fasttext" ]]; then \
apt-get update && \
@@ -16,6 +16,7 @@ RUN if [[ $optional_dependencies =~ "fasttext" ]]; then \

FROM python:3.8-slim-bullseye

SHELL ["/bin/bash", "-c"]
COPY --from=builder /usr/local/lib/python3.8 /usr/local/lib/python3.8

# Install system dependencies needed at runtime:
@@ -32,13 +33,21 @@ RUN pip install --upgrade pip --no-cache-dir

COPY setup.py README.md LICENSE.txt projects.cfg.dist /Annif/
# Install dependencies for optional features.
ARG optional_dependencies=dev,voikko,pycld3,fasttext,nn,omikuji,yake
ARG optional_dependencies=dev,voikko,pycld3,fasttext,nn,omikuji,yake,spacy
RUN echo "Installing dependencies for optional features: $optional_dependencies" \
&& pip install .[$optional_dependencies] --no-cache-dir

# Download nltk data (handle occasional timeout with 3 tries):
RUN for i in 1 2 3; do python -m nltk.downloader punkt -d /usr/share/nltk_data && break || sleep 1; done

# Download spaCy models, if the optional feature was selected
ARG spacy_models=en_core_web_sm
RUN if [[ $optional_dependencies =~ "spacy" ]]; then \
for model in $(echo $spacy_models | tr "," "\n"); do \
python -m spacy download $model; \
done; \
fi

# Install Annif by copying source and make the installation editable:
COPY annif /Annif/annif
COPY tests /Annif/tests
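The spacy_models build argument makes the bundled models configurable at image build time: the for/tr loop above splits its comma-separated value and downloads each model in turn. For example, a build that also bundles the small Finnish model could pass --build-arg spacy_models=en_core_web_sm,fi_core_news_sm to docker build (fi_core_news_sm is an illustrative model name, not something this PR pins).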
6 changes: 6 additions & 0 deletions annif/analyzer/__init__.py
@@ -37,3 +37,9 @@ def get_analyzer(analyzerspec):
register_analyzer(voikko.VoikkoAnalyzer)
except ImportError:
annif.logger.debug("voikko not available, not enabling voikko analyzer")

try:
from . import spacy
register_analyzer(spacy.SpacyAnalyzer)
except ImportError:
annif.logger.debug("spaCy not available, not enabling spacy analyzer")
20 changes: 11 additions & 9 deletions annif/analyzer/analyzer.py
@@ -8,9 +8,9 @@


class Analyzer(metaclass=abc.ABCMeta):
"""Base class for language-specific analyzers. The non-implemented
methods should be overridden in subclasses. Tokenize functions may
be overridden when necessary."""
"""Base class for language-specific analyzers. Either tokenize_words or
_normalize_word must be overridden in subclasses. Other methods may be
overridden when necessary."""

name = None
token_min_length = 3 # default value, can be overridden in instances
@@ -35,14 +35,16 @@ def is_valid_token(self, word):
return True
return False

def tokenize_words(self, text):
"""Tokenize a piece of text (e.g. a sentence) into words."""
def tokenize_words(self, text, filter=True):
"""Tokenize a piece of text (e.g. a sentence) into words. If
filter=True (default), only return valid tokens (e.g. not
punctuation, numbers or very short words)"""

import nltk.tokenize
return [self.normalize_word(word)
return [self._normalize_word(word)
for word in nltk.tokenize.word_tokenize(text)
if self.is_valid_token(word)]
if (not filter or self.is_valid_token(word))]

@abc.abstractmethod
def normalize_word(self, word):
def _normalize_word(self, word):
"""Normalize (stem or lemmatize) a word form into a normal form."""
pass # pragma: no cover
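
To make the revised contract concrete: a subclass only needs to supply _normalize_word, inheriting NLTK tokenization and the new filter flag from the base class, or it can override tokenize_words wholesale as the spaCy analyzer below does. A minimal hypothetical subclass (the class and its name are illustrative, not part of the PR):

from annif.analyzer import analyzer

class LowercaseAnalyzer(analyzer.Analyzer):
    name = "lowercase"  # hypothetical analyzer name

    def _normalize_word(self, word):
        # illustrative normalization; real analyzers stem or lemmatize
        return word.lower()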
2 changes: 1 addition & 1 deletion annif/analyzer/simple.py
@@ -10,5 +10,5 @@ def __init__(self, param, **kwargs):
self.param = param
super().__init__(**kwargs)

def normalize_word(self, word):
def _normalize_word(self, word):
return word.lower()
2 changes: 1 addition & 1 deletion annif/analyzer/snowball.py
@@ -14,5 +14,5 @@ def __init__(self, param, **kwargs):
super().__init__(**kwargs)

@functools.lru_cache(maxsize=500000)
def normalize_word(self, word):
def _normalize_word(self, word):
return self.stemmer.stem(word.lower())
36 changes: 36 additions & 0 deletions annif/analyzer/spacy.py
@@ -0,0 +1,36 @@
"""spaCy analyzer for Annif which uses spaCy for lemmatization"""

import spacy
from . import analyzer
from annif.exception import OperationFailedException
import annif.util

_KEY_LOWERCASE = 'lowercase'


class SpacyAnalyzer(analyzer.Analyzer):
name = "spacy"

def __init__(self, param, **kwargs):
self.param = param
try:
self.nlp = spacy.load(param, exclude=['ner', 'parser'])
except IOError as err:
raise OperationFailedException(
f"Loading spaCy model '{param}' failed - " +
f"please download the model.\n{err}")
if _KEY_LOWERCASE in kwargs:
self.lowercase = annif.util.boolean(kwargs[_KEY_LOWERCASE])
else:
self.lowercase = False
super().__init__(**kwargs)

def tokenize_words(self, text, filter=True):
lemmas = [lemma
for lemma in (token.lemma_
for token in self.nlp(text.strip()))
if (not filter or self.is_valid_token(lemma))]
if self.lowercase:
return [lemma.lower() for lemma in lemmas]
else:
return lemmas
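
The lowercase option only post-processes the lemmas: spaCy's lemmatizer preserves case, so without it proper nouns stay capitalized. A sketch of the difference, abridged from the unit tests below (requires the en_core_web_sm model):

import annif.analyzer

plain = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
lower = annif.analyzer.get_analyzer("spacy(en_core_web_sm,lowercase=1)")

plain.tokenize_words("The lazy dogs in Paris")
# -> ['the', 'lazy', 'dog', 'Paris']   ('in' is filtered out as too short)
lower.tokenize_words("The lazy dogs in Paris")
# -> ['the', 'lazy', 'dog', 'paris']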
2 changes: 1 addition & 1 deletion annif/analyzer/voikko.py
@@ -21,7 +21,7 @@ def __getstate__(self):
return {'param': self.param, 'voikko': None}

@functools.lru_cache(maxsize=500000)
def normalize_word(self, word):
def _normalize_word(self, word):
if self.voikko is None:
self.voikko = voikko.libvoikko.Voikko(self.param)
result = self.voikko.analyze(word)
7 changes: 2 additions & 5 deletions annif/backend/yake.py
@@ -104,11 +104,8 @@ def _normalize_label(self, label):
return self._sort_phrase(normalized_label)

def _normalize_phrase(self, phrase):
normalized = []
for word in phrase.split():
normalized.append(
self.project.analyzer.normalize_word(word).lower())
return ' '.join(normalized)
return ' '.join(self.project.analyzer.tokenize_words(phrase,
filter=False))

def _sort_phrase(self, phrase):
words = phrase.split()
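The rewritten _normalize_phrase delegates to tokenize_words with filter=False, so every word of a candidate phrase is kept and normalized. For a snowball English analyzer the result is roughly what the old loop produced (stems taken from the tests in this PR), with one subtle difference: tokenization now uses the analyzer's tokenizer rather than str.split().

# old: ' '.join(self.project.analyzer.normalize_word(word).lower() for word in phrase.split())
# new behaviour, illustrated with a snowball(english) analyzer:
analyzer.tokenize_words("Running Words", filter=False)  # -> ['run', 'word']
# which _normalize_phrase joins into 'run word'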
1 change: 1 addition & 0 deletions setup.py
@@ -46,6 +46,7 @@ def read(fname):
'omikuji': ['omikuji==0.4.*'],
'yake': ['yake==0.4.5'],
'pycld3': ['pycld3'],
'spacy': ['spacy==3.2.*'],
'dev': [
'codecov',
'pytest-cov',
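Like the other optional backends, the spacy extra is opt-in at install time. Getting a working setup follows the same two steps as the CI workflow above: pip install .[spacy] (or the corresponding extra from a PyPI release), then python -m spacy download en_core_web_sm to fetch the model.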
22 changes: 15 additions & 7 deletions tests/test_analyzer.py
@@ -16,8 +16,8 @@ def test_get_analyzer_badspec():

def test_english_analyzer_normalize_word():
analyzer = annif.analyzer.get_analyzer("snowball(english)")
assert analyzer.normalize_word("running") == "run"
assert analyzer.normalize_word("words") == "word"
assert analyzer._normalize_word("running") == "run"
assert analyzer._normalize_word("words") == "word"


def test_english_tokenize_sentences():
@@ -51,6 +51,14 @@ def test_english_tokenize_words():
assert len(words) == 14


def test_english_tokenize_words_no_filter():
analyzer = annif.analyzer.get_analyzer("snowball(english)")
text = """To take a trivial example, which of us ever undertakes
laborious physical exercise, except to obtain some advantage from it?"""
words = analyzer.tokenize_words(text, filter=False)
assert len(words) == 23


def test_english_filter_words_min_token():
analyzer = annif.analyzer.get_analyzer(
"snowball(english,token_min_length=2)")
@@ -66,19 +74,19 @@ def test_english_filter_words_min_token():

def test_swedish_analyzer_normalize_word():
analyzer = annif.analyzer.get_analyzer("snowball(swedish)")
assert analyzer.normalize_word("gamla") == "gaml"
assert analyzer.normalize_word("hundar") == "hund"
assert analyzer._normalize_word("gamla") == "gaml"
assert analyzer._normalize_word("hundar") == "hund"


def test_snowball_finnish_analyzer_normalize_word():
analyzer = annif.analyzer.get_analyzer("snowball(finnish)")
assert analyzer.normalize_word("vanhat") == "vanh"
assert analyzer.normalize_word("koirien") == "koir"
assert analyzer._normalize_word("vanhat") == "vanh"
assert analyzer._normalize_word("koirien") == "koir"


def test_simple_analyzer():
analyzer = annif.analyzer.get_analyzer("simple")
assert analyzer.normalize_word("Big") == "big"
assert analyzer._normalize_word("Big") == "big"


def test_simple_analyzer_token_size():
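The two token counts pin down the filter semantics: with the default filter=True, tokens rejected by is_valid_token (per its docstring: punctuation, numbers and words shorter than token_min_length, 3 by default) are dropped, leaving 14 of the 23 raw tokens. An illustrative sketch with a shorter sentence (outputs inferred from the documented behaviour, not taken from this PR's tests):

import annif.analyzer

analyzer = annif.analyzer.get_analyzer("snowball(english)")
analyzer.tokenize_words("To be, or not to be?")
# -> ['not']   (the only token at least 3 characters long)
analyzer.tokenize_words("To be, or not to be?", filter=False)
# -> ['to', 'be', ',', 'or', 'not', 'to', 'be', '?']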
40 changes: 40 additions & 0 deletions tests/test_analyzer_spacy.py
@@ -0,0 +1,40 @@
"""Unit tests for spacy analyzer in Annif"""

import pytest
import annif.analyzer
from annif.exception import OperationFailedException

spacy = pytest.importorskip("annif.analyzer.spacy")


def test_spacy_model_not_found():
with pytest.raises(OperationFailedException) as excinfo:
annif.analyzer.get_analyzer("spacy(not_found)")
assert "Loading spaCy model 'not_found' failed" in str(excinfo.value)


def test_spacy_english_tokenize_words():
analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
words = analyzer.tokenize_words("""
The quick brown foxes jumped over the lazy dogs in Paris.
""")
assert words == ['the', 'quick', 'brown', 'fox',
'jump', 'over', 'the', 'lazy', 'dog', 'Paris']


def test_spacy_english_tokenize_words_no_filter():
analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
words = analyzer.tokenize_words("""
The quick brown foxes jumped over the lazy dogs in Paris.
""", filter=False)
assert words == ['the', 'quick', 'brown', 'fox',
'jump', 'over', 'the', 'lazy', 'dog', 'in', 'Paris', '.']


def test_spacy_english_tokenize_words_lowercase():
analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm,lowercase=1)")
words = analyzer.tokenize_words("""
The quick brown foxes jumped over the lazy dogs in Paris.
""")
assert words == ['the', 'quick', 'brown', 'fox',
'jump', 'over', 'the', 'lazy', 'dog', 'paris']
6 changes: 3 additions & 3 deletions tests/test_analyzer_voikko.py
@@ -14,6 +14,6 @@ def test_voikko_getstate():

def test_voikko_finnish_analyzer_normalize_word():
analyzer = annif.analyzer.get_analyzer("voikko(fi)")
assert analyzer.normalize_word("xyzzy") == "xyzzy"
assert analyzer.normalize_word("vanhat") == "vanha"
assert analyzer.normalize_word("koirien") == "koira"
assert analyzer._normalize_word("xyzzy") == "xyzzy"
assert analyzer._normalize_word("vanhat") == "vanha"
assert analyzer._normalize_word("koirien") == "koira"