diff --git a/Dockerfile b/Dockerfile
index 0ed3d7f..72bbb9b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,10 +3,10 @@
 
 ## STAGE 1 - Core package(s)
 
-FROM ghcr.io/pyo3/maturin:main as maturin
+FROM ghcr.io/pyo3/maturin:main AS maturin
 
 RUN mkdir -p /app/build/bonn
-WORKDIR /app/build/test_data
+# WORKDIR /app/build/test_data
 # RUN curl -L -O "...wiki/wiki.en.fifu"
 
 WORKDIR /app/build
@@ -19,8 +19,8 @@
 COPY README.md /app/build
 RUN RUSTFLAGS="-L /usr/lib64/atlas -C link-args=-lsatlas -ltatlas -llapack" cargo install finalfusion-utils --features=opq
 
-COPY pyproject.toml /app/build
 COPY src /app/build/src
-COPY bonn /app/build/bonn
+COPY pyproject.toml /app/build
+COPY python/bonn /app/build/bonn
 
 WORKDIR /app/build
diff --git a/Makefile b/Makefile
index 6f2fca5..dd8460d 100644
--- a/Makefile
+++ b/Makefile
@@ -13,14 +13,11 @@ RESET := $(shell tput -Txterm sgr0)
 all: build
 
 .PHONY: build
-build: Dockerfile
+build:
 	@mkdir -p $(BUILD)/wheels
 	docker build -t bonn_py_build -f Dockerfile .
 	docker run --platform "linux/amd64" --entrypoint maturin -v $(shell pwd)/$(BUILD)/wheels:/app/build/target/wheels bonn_py_build build --find-interpreter
 
-Dockerfile:
-	m4 Dockerfile.in > Dockerfile
-
 test_data/wiki.en.fifu:
 	curl -o test_data/wiki.en.fifu http://www.sfs.uni-tuebingen.de/a3-public-data/finalfusion-fasttext/wiki/wiki.en.fifu
 
diff --git a/pyproject.toml b/pyproject.toml
index 769328f..bf6ad48 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "bonn"
-version = "0.1.5"
+version = "0.1.6"
 description = "Created for ONS. Proof-of-concept mmap'd Rust word2vec implementation linked with category matching"
 readme = "README.md"
 license = { "file" = "LICENSE.md" }
@@ -29,5 +29,5 @@ classifiers = [
 ]
 
 [tool.maturin]
-python-source = "python"
+python-source = ""
 module-name = "bonn._bonn"
diff --git a/python/bonn/category_manager.py b/python/bonn/category_manager.py
index a98aa47..83b650f 100644
--- a/python/bonn/category_manager.py
+++ b/python/bonn/category_manager.py
@@ -2,10 +2,12 @@
 import math
 import re
 from sortedcontainers import SortedDict
-from nltk.corpus import stopwords
+from nltk.corpus import stopwords, wordnet
 from nltk.stem.wordnet import WordNetLemmatizer
 from .utils import cosine_similarities
 
+stopwords.ensure_loaded()
+wordnet.ensure_loaded()
 re_ws = re.compile(r"\s+")
 re_num = re.compile(r"[^\w\s\']", flags=re.UNICODE)
 