Add spaCy analyzer #527

Merged: 13 commits, Jan 24, 2022
6 changes: 6 additions & 0 deletions .github/workflows/python-package.yml
@@ -37,6 +37,12 @@ jobs:
if [[ ${{ matrix.python-version }} != '3.8' ]]; then pip install .[omikuji,yake]; fi
# Install the optional fastText dependencies for Python 3.8 only
if [[ ${{ matrix.python-version }} == '3.8' ]]; then pip install .[fasttext]; fi
# Install the optional spaCy dependencies for Python 3.8 only
if [[ ${{ matrix.python-version }} == '3.8' ]]; then
pip install .[spacy]
# Download the small English pretrained spaCy model needed by the spacy analyzer
python -m spacy download en_core_web_sm
fi
# For Python 3.7
# - voikko and pycld3 dependencies
if [[ ${{ matrix.python-version }} == '3.7' ]]; then python -m pip install .[voikko,pycld3]; fi
13 changes: 11 additions & 2 deletions Dockerfile
@@ -3,7 +3,7 @@ FROM python:3.8-slim-bullseye AS builder
LABEL maintainer="Juho Inkinen <juho.inkinen@helsinki.fi>"

SHELL ["/bin/bash", "-c"]
ARG optional_dependencies=dev,voikko,pycld3,fasttext,nn,omikuji,yake
ARG optional_dependencies=dev,voikko,pycld3,fasttext,nn,omikuji,yake,spacy
# Building fastText needs some system packages
RUN if [[ $optional_dependencies =~ "fasttext" ]]; then \
apt-get update && \
@@ -16,6 +16,7 @@ RUN if [[ $optional_dependencies =~ "fasttext" ]]; then \

FROM python:3.8-slim-bullseye

SHELL ["/bin/bash", "-c"]
COPY --from=builder /usr/local/lib/python3.8 /usr/local/lib/python3.8

# Install system dependencies needed at runtime:
@@ -32,13 +33,21 @@ RUN pip install --upgrade pip --no-cache-dir

COPY setup.py README.md LICENSE.txt projects.cfg.dist /Annif/
# Install dependencies for optional features.
ARG optional_dependencies=dev,voikko,pycld3,fasttext,nn,omikuji,yake
ARG optional_dependencies=dev,voikko,pycld3,fasttext,nn,omikuji,yake,spacy
RUN echo "Installing dependencies for optional features: $optional_dependencies" \
&& pip install .[$optional_dependencies] --no-cache-dir

# Download nltk data (handle occasional timeout with 3 tries):
RUN for i in 1 2 3; do python -m nltk.downloader punkt -d /usr/share/nltk_data && break || sleep 1; done

# Download spaCy models, if the optional feature was selected
ARG spacy_models=en_core_web_sm
RUN if [[ $optional_dependencies =~ "spacy" ]]; then \
for model in $(echo $spacy_models | tr "," "\n"); do \
python -m spacy download $model; \
done; \
fi

# Install Annif by copying source and make the installation editable:
COPY annif /Annif/annif
COPY tests /Annif/tests
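The spacy_models build argument makes the bundled models configurable at image build time: the for/tr loop above splits its comma-separated value and downloads each model in turn. For example, a build that also bundles the small Finnish model could pass --build-arg spacy_models=en_core_web_sm,fi_core_news_sm to docker build (fi_core_news_sm is an illustrative model name, not something this PR pins).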
6 changes: 6 additions & 0 deletions annif/analyzer/__init__.py
@@ -37,3 +37,9 @@ def get_analyzer(analyzerspec):
register_analyzer(voikko.VoikkoAnalyzer)
except ImportError:
annif.logger.debug("voikko not available, not enabling voikko analyzer")

try:
from . import spacy
register_analyzer(spacy.SpacyAnalyzer)
except ImportError:
annif.logger.debug("spaCy not available, not enabling spacy analyzer")
20 changes: 11 additions & 9 deletions annif/analyzer/analyzer.py
@@ -8,9 +8,9 @@


class Analyzer(metaclass=abc.ABCMeta):
"""Base class for language-specific analyzers. The non-implemented
methods should be overridden in subclasses. Tokenize functions may
be overridden when necessary."""
"""Base class for language-specific analyzers. Either tokenize_words or
_normalize_word must be overridden in subclasses. Other methods may be
overridden when necessary."""

name = None
token_min_length = 3 # default value, can be overridden in instances
@@ -35,14 +35,16 @@ def is_valid_token(self, word):
return True
return False

def tokenize_words(self, text):
"""Tokenize a piece of text (e.g. a sentence) into words."""
def tokenize_words(self, text, filter=True):
"""Tokenize a piece of text (e.g. a sentence) into words. If
filter=True (default), only return valid tokens (e.g. not
punctuation, numbers or very short words)"""

import nltk.tokenize
return [self.normalize_word(word)
return [self._normalize_word(word)
for word in nltk.tokenize.word_tokenize(text)
if self.is_valid_token(word)]
if (not filter or self.is_valid_token(word))]

@abc.abstractmethod
def normalize_word(self, word):
def _normalize_word(self, word):
"""Normalize (stem or lemmatize) a word form into a normal form."""
pass # pragma: no cover
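
To make the revised contract concrete: a subclass only needs to supply _normalize_word, inheriting NLTK tokenization and the new filter flag from the base class, or it can override tokenize_words wholesale as the spaCy analyzer below does. A minimal hypothetical subclass (the class and its name are illustrative, not part of the PR):

from annif.analyzer import analyzer

class LowercaseAnalyzer(analyzer.Analyzer):
    name = "lowercase"  # hypothetical analyzer name

    def _normalize_word(self, word):
        # illustrative normalization; real analyzers stem or lemmatize
        return word.lower()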
2 changes: 1 addition & 1 deletion annif/analyzer/simple.py
@@ -10,5 +10,5 @@ def __init__(self, param, **kwargs):
self.param = param
super().__init__(**kwargs)

def normalize_word(self, word):
def _normalize_word(self, word):
return word.lower()
2 changes: 1 addition & 1 deletion annif/analyzer/snowball.py
@@ -14,5 +14,5 @@ def __init__(self, param, **kwargs):
super().__init__(**kwargs)

@functools.lru_cache(maxsize=500000)
def normalize_word(self, word):
def _normalize_word(self, word):
return self.stemmer.stem(word.lower())
36 changes: 36 additions & 0 deletions annif/analyzer/spacy.py
@@ -0,0 +1,36 @@
"""spaCy analyzer for Annif which uses spaCy for lemmatization"""

import spacy
from . import analyzer
from annif.exception import OperationFailedException
import annif.util

_KEY_LOWERCASE = 'lowercase'


class SpacyAnalyzer(analyzer.Analyzer):
name = "spacy"

def __init__(self, param, **kwargs):
self.param = param
try:
self.nlp = spacy.load(param, exclude=['ner', 'parser'])
except IOError as err:
raise OperationFailedException(
f"Loading spaCy model '{param}' failed - " +
f"please download the model.\n{err}")
if _KEY_LOWERCASE in kwargs:
self.lowercase = annif.util.boolean(kwargs[_KEY_LOWERCASE])
else:
self.lowercase = False
super().__init__(**kwargs)

def tokenize_words(self, text, filter=True):
lemmas = [lemma
for lemma in (token.lemma_
for token in self.nlp(text.strip()))
if (not filter or self.is_valid_token(lemma))]
if self.lowercase:
return [lemma.lower() for lemma in lemmas]
else:
return lemmas
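
The lowercase option only post-processes the lemmas: spaCy's lemmatizer preserves case, so without it proper nouns stay capitalized. A sketch of the difference, abridged from the unit tests below (requires the en_core_web_sm model):

import annif.analyzer

plain = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
lower = annif.analyzer.get_analyzer("spacy(en_core_web_sm,lowercase=1)")

plain.tokenize_words("The lazy dogs in Paris")
# -> ['the', 'lazy', 'dog', 'Paris']   ('in' is filtered out as too short)
lower.tokenize_words("The lazy dogs in Paris")
# -> ['the', 'lazy', 'dog', 'paris']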
2 changes: 1 addition & 1 deletion annif/analyzer/voikko.py
@@ -21,7 +21,7 @@ def __getstate__(self):
return {'param': self.param, 'voikko': None}

@functools.lru_cache(maxsize=500000)
def normalize_word(self, word):
def _normalize_word(self, word):
if self.voikko is None:
self.voikko = voikko.libvoikko.Voikko(self.param)
result = self.voikko.analyze(word)
7 changes: 2 additions & 5 deletions annif/backend/yake.py
@@ -104,11 +104,8 @@ def _normalize_label(self, label):
return self._sort_phrase(normalized_label)

def _normalize_phrase(self, phrase):
normalized = []
for word in phrase.split():
normalized.append(
self.project.analyzer.normalize_word(word).lower())
return ' '.join(normalized)
return ' '.join(self.project.analyzer.tokenize_words(phrase,
filter=False))

def _sort_phrase(self, phrase):
words = phrase.split()
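The rewritten _normalize_phrase delegates to tokenize_words with filter=False, so every word of a candidate phrase is kept and normalized. For a snowball English analyzer the result is roughly what the old loop produced (stems taken from the tests in this PR), with one subtle difference: tokenization now uses the analyzer's tokenizer rather than str.split().

# old: ' '.join(self.project.analyzer.normalize_word(word).lower() for word in phrase.split())
# new behaviour, illustrated with a snowball(english) analyzer:
analyzer.tokenize_words("Running Words", filter=False)  # -> ['run', 'word']
# which _normalize_phrase joins into 'run word'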
1 change: 1 addition & 0 deletions setup.py
@@ -46,6 +46,7 @@ def read(fname):
'omikuji': ['omikuji==0.4.*'],
'yake': ['yake==0.4.5'],
'pycld3': ['pycld3'],
'spacy': ['spacy==3.2.*'],
'dev': [
'codecov',
'pytest-cov',
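Like the other optional backends, the spacy extra is opt-in at install time. Getting a working setup follows the same two steps as the CI workflow above: pip install .[spacy] (or the corresponding extra from a PyPI release), then python -m spacy download en_core_web_sm to fetch the model.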
22 changes: 15 additions & 7 deletions tests/test_analyzer.py
@@ -16,8 +16,8 @@ def test_get_analyzer_badspec():

def test_english_analyzer_normalize_word():
analyzer = annif.analyzer.get_analyzer("snowball(english)")
assert analyzer.normalize_word("running") == "run"
assert analyzer.normalize_word("words") == "word"
assert analyzer._normalize_word("running") == "run"
assert analyzer._normalize_word("words") == "word"


def test_english_tokenize_sentences():
@@ -51,6 +51,14 @@ def test_english_tokenize_words():
assert len(words) == 14


def test_english_tokenize_words_no_filter():
analyzer = annif.analyzer.get_analyzer("snowball(english)")
text = """To take a trivial example, which of us ever undertakes
laborious physical exercise, except to obtain some advantage from it?"""
words = analyzer.tokenize_words(text, filter=False)
assert len(words) == 23


def test_english_filter_words_min_token():
analyzer = annif.analyzer.get_analyzer(
"snowball(english,token_min_length=2)")
@@ -66,19 +74,19 @@ def test_english_filter_words_min_token():

def test_swedish_analyzer_normalize_word():
analyzer = annif.analyzer.get_analyzer("snowball(swedish)")
assert analyzer.normalize_word("gamla") == "gaml"
assert analyzer.normalize_word("hundar") == "hund"
assert analyzer._normalize_word("gamla") == "gaml"
assert analyzer._normalize_word("hundar") == "hund"


def test_snowball_finnish_analyzer_normalize_word():
analyzer = annif.analyzer.get_analyzer("snowball(finnish)")
assert analyzer.normalize_word("vanhat") == "vanh"
assert analyzer.normalize_word("koirien") == "koir"
assert analyzer._normalize_word("vanhat") == "vanh"
assert analyzer._normalize_word("koirien") == "koir"


def test_simple_analyzer():
analyzer = annif.analyzer.get_analyzer("simple")
assert analyzer.normalize_word("Big") == "big"
assert analyzer._normalize_word("Big") == "big"


def test_simple_analyzer_token_size():
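The two token counts pin down the filter semantics: with the default filter=True, tokens rejected by is_valid_token (per its docstring: punctuation, numbers and words shorter than token_min_length, 3 by default) are dropped, leaving 14 of the 23 raw tokens. An illustrative sketch with a shorter sentence (outputs inferred from the documented behaviour, not taken from this PR's tests):

import annif.analyzer

analyzer = annif.analyzer.get_analyzer("snowball(english)")
analyzer.tokenize_words("To be, or not to be?")
# -> ['not']   (the only token at least 3 characters long)
analyzer.tokenize_words("To be, or not to be?", filter=False)
# -> ['to', 'be', ',', 'or', 'not', 'to', 'be', '?']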
40 changes: 40 additions & 0 deletions tests/test_analyzer_spacy.py
@@ -0,0 +1,40 @@
"""Unit tests for spacy analyzer in Annif"""

import pytest
import annif.analyzer
from annif.exception import OperationFailedException

spacy = pytest.importorskip("annif.analyzer.spacy")


def test_spacy_model_not_found():
with pytest.raises(OperationFailedException) as excinfo:
annif.analyzer.get_analyzer("spacy(not_found)")
assert "Loading spaCy model 'not_found' failed" in str(excinfo.value)


def test_spacy_english_tokenize_words():
analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
words = analyzer.tokenize_words("""
The quick brown foxes jumped over the lazy dogs in Paris.
""")
assert words == ['the', 'quick', 'brown', 'fox',
'jump', 'over', 'the', 'lazy', 'dog', 'Paris']


def test_spacy_english_tokenize_words_no_filter():
analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
words = analyzer.tokenize_words("""
The quick brown foxes jumped over the lazy dogs in Paris.
""", filter=False)
assert words == ['the', 'quick', 'brown', 'fox',
'jump', 'over', 'the', 'lazy', 'dog', 'in', 'Paris', '.']


def test_spacy_english_tokenize_words_lowercase():
analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm,lowercase=1)")
words = analyzer.tokenize_words("""
The quick brown foxes jumped over the lazy dogs in Paris.
""")
assert words == ['the', 'quick', 'brown', 'fox',
'jump', 'over', 'the', 'lazy', 'dog', 'paris']
6 changes: 3 additions & 3 deletions tests/test_analyzer_voikko.py
@@ -14,6 +14,6 @@ def test_voikko_getstate():

def test_voikko_finnish_analyzer_normalize_word():
analyzer = annif.analyzer.get_analyzer("voikko(fi)")
assert analyzer.normalize_word("xyzzy") == "xyzzy"
assert analyzer.normalize_word("vanhat") == "vanha"
assert analyzer.normalize_word("koirien") == "koira"
assert analyzer._normalize_word("xyzzy") == "xyzzy"
assert analyzer._normalize_word("vanhat") == "vanha"
assert analyzer._normalize_word("koirien") == "koira"