diff --git a/docker/Dockerfile b/docker/Dockerfile index 3ba9e3c284..b3a82c8465 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -3,7 +3,7 @@ FROM ubuntu:16.04 MAINTAINER Parul Sethi ENV GENSIM_REPOSITORY https://github.com/RaRe-Technologies/gensim.git -ENV GENSIM_VERSION bd6db9a41baf219ecc4a1770cc21b01c8ff122e5 +ENV GENSIM_VERSION b818c91c698b4a149c55455b88953714d1701031 # Installs python, pip and setup tools (with fixed versions) RUN apt-get update \ @@ -47,6 +47,7 @@ RUN pip2 install \ matplotlib==2.0.0 \ nltk==3.2.2 \ pandas==0.19.2 \ + spacy==1.8.1 \ git+https://github.com/mila-udem/blocks.git@7beb788f1fcfc78d56c59a5edf9b4e8d98f8d7d9 \ -r https://raw.githubusercontent.com/mila-udem/blocks/stable/requirements.txt @@ -56,6 +57,7 @@ RUN pip3 install \ matplotlib==2.0.0 \ nltk==3.2.2 \ pandas==0.19.2 \ + spacy==1.8.1 \ git+https://github.com/mila-udem/blocks.git@7beb788f1fcfc78d56c59a5edf9b4e8d98f8d7d9 \ -r https://raw.githubusercontent.com/mila-udem/blocks/stable/requirements.txt @@ -63,6 +65,10 @@ RUN pip3 install \ RUN pip2 install -U numpy RUN pip3 install -U numpy +# Download english model of Spacy +RUN python2 -m spacy download en +RUN python3 -m spacy download en + # Download gensim from Github RUN git clone $GENSIM_REPOSITORY \ && cd /gensim \ @@ -76,12 +82,14 @@ RUN git clone $GENSIM_REPOSITORY \ RUN mkdir /gensim/gensim_dependencies # Set ENV variables for wrappers +ENV WR_HOME /gensim/gensim_dependencies/wordrank ENV FT_HOME /gensim/gensim_dependencies/fastText ENV MALLET_HOME /gensim/gensim_dependencies/mallet ENV DTM_PATH /gensim/gensim_dependencies/dtm/dtm/main ENV VOWPAL_WABBIT_PATH /gensim/gensim_dependencies/vowpal_wabbit/vowpalwabbit/vw -# For fixed version downloads of gensim wrappers dependencies +# For fixed version downloads of gensim wrappers dependencies +ENV WORDRANK_VERSION 44f3f7786f76c79c083dfad9d64e20bacfb4a0b0 ENV FASTTEXT_VERSION f24a781021862f0e475a5fb9c55b7c1cec3b6e2e ENV MORPHOLOGICALPRIORSFORWORDEMBEDDINGS_VERSION 
ec2e37a3bcb8bd7b56b75b043c47076bc5decf22 ENV DTM_VERSION 67139e6f526b2bc33aef56dc36176a1b8b210056 @@ -90,7 +98,17 @@ ENV VOWPAL_WABBIT_VERSION 69ecc2847fa0c876c6e0557af409f386f0ced59a # Install custom dependencies -# TODO: Install wordrank (need to install mpich/openmpi with multithreading enabled) +# Install mpich (a wordrank dependency) and remove openmpi to avoid mpirun conflict +RUN apt-get purge -y openmpi-common openmpi-bin libopenmpi1.10 +RUN apt-get install -y mpich + +# Install wordrank +RUN cd /gensim/gensim_dependencies \ + && git clone https://bitbucket.org/shihaoji/wordrank \ + && cd /gensim/gensim_dependencies/wordrank \ + && git checkout $WORDRANK_VERSION \ + && sed -i -e 's/#export CC=gcc CXX=g++/export CC=gcc CXX=g++/g' install.sh \ + && sh ./install.sh # Install fastText RUN cd /gensim/gensim_dependencies \ diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index dbcca9ebb9..51fcc870c7 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -13,6 +13,8 @@ .. [1] https://bitbucket.org/shihaoji/wordrank/ .. [2] https://arxiv.org/pdf/1506.02761v3.pdf + +Note that the wrapper might not work in a docker container for large datasets due to memory limits (caused by MPI). """ from __future__ import division @@ -47,13 +49,13 @@ class Wordrank(KeyedVectors): @classmethod def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0, sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100, - beta=99, loss='hinge', memory=4.0, cleanup_files=False, sorted_vocab=1, ensemble=0): + beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0): """ The word and context embedding files are generated by wordrank binary and are saved in "out_name" directory which is created inside wordrank directory. 
The vocab and cooccurence files are generated using glove code available inside the wordrank directory. These files are used by the wordrank binary for training. - `wr_path` is the path to the Wordrank directory. + `wr_path` is the absolute path to the Wordrank directory. `corpus_file` is the filename of the text file to be used for training the Wordrank model. Expects file to contain space-separated tokens in a single line `out_name` is name of the directory which will be created (in wordrank folder) to save embeddings and training data. @@ -79,6 +81,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, `beta` is the beta parameter of gamma distribution. `loss` = name of the loss (logistic, hinge). `memory` = soft limit for memory consumption, in GB. + `np` = number of copies to execute. (mpirun option) `cleanup_files` if True, delete directory and files used by this wrapper, setting to False can be useful for debugging `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before assigning word indexes. 
`ensemble` = 0 (default), use ensemble of word and context vectors @@ -137,7 +140,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, ) wr_args = { - 'path': 'meta', + 'path': meta_dir, 'nthread': multiprocessing.cpu_count(), 'sgd_num': sgd_num, 'lrate': lrate, @@ -153,9 +156,10 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, 'loss': loss } - os.chdir('..') # run wordrank executable with wr_args - cmd = ['mpirun', '-np', '1', '../wordrank'] + cmd = ['mpirun', '-np'] + cmd.append(str(np)) + cmd.append(os.path.join(wr_path, 'wordrank')) for option, value in wr_args.items(): cmd.append('--%s' % option) cmd.append(str(value)) diff --git a/gensim/test/test_wordrank_wrapper.py b/gensim/test/test_wordrank_wrapper.py index bc07ff5776..5b8260fdf2 100644 --- a/gensim/test/test_wordrank_wrapper.py +++ b/gensim/test/test_wordrank_wrapper.py @@ -34,7 +34,7 @@ def setUp(self): self.wr_file = datapath('test_glove.txt') if not self.wr_path: return - self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_name, iter=6, dump_period=5, period=5, cleanup_files=True) + self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_name, iter=6, dump_period=5, period=5, np=2, cleanup_files=True) def testLoadWordrankFormat(self): """Test model successfully loaded from Wordrank format file"""