From 0faf08a217be92603500fa05f3b2fa3a71f7ac0b Mon Sep 17 00:00:00 2001 From: parulsethi Date: Tue, 4 Jul 2017 02:44:28 +0530 Subject: [PATCH 1/7] add wordrank in docker --- docker/Dockerfile | 20 ++++++++++++++++---- docker/wordrank_install.sh | 20 ++++++++++++++++++++ 2 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 docker/wordrank_install.sh diff --git a/docker/Dockerfile b/docker/Dockerfile index 3ba9e3c284..fb0e617c82 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,8 +2,8 @@ FROM ubuntu:16.04 MAINTAINER Parul Sethi -ENV GENSIM_REPOSITORY https://github.com/RaRe-Technologies/gensim.git -ENV GENSIM_VERSION bd6db9a41baf219ecc4a1770cc21b01c8ff122e5 +ENV GENSIM_REPOSITORY https://github.com/parulsethi/gensim.git +ENV GENSIM_VERSION add_wordrank_in_docker # Installs python, pip and setup tools (with fixed versions) RUN apt-get update \ @@ -76,12 +76,14 @@ RUN git clone $GENSIM_REPOSITORY \ RUN mkdir /gensim/gensim_dependencies # Set ENV variables for wrappers +ENV WR_HOME /gensim/gensim_dependencies/wordrank ENV FT_HOME /gensim/gensim_dependencies/fastText ENV MALLET_HOME /gensim/gensim_dependencies/mallet ENV DTM_PATH /gensim/gensim_dependencies/dtm/dtm/main ENV VOWPAL_WABBIT_PATH /gensim/gensim_dependencies/vowpal_wabbit/vowpalwabbit/vw -# For fixed version downloads of gensim wrappers dependencies +# For fixed version downloads of gensim wrappers dependencies +ENV WORDRANK_VERSION 44f3f7786f76c79c083dfad9d64e20bacfb4a0b0 ENV FASTTEXT_VERSION f24a781021862f0e475a5fb9c55b7c1cec3b6e2e ENV MORPHOLOGICALPRIORSFORWORDEMBEDDINGS_VERSION ec2e37a3bcb8bd7b56b75b043c47076bc5decf22 ENV DTM_VERSION 67139e6f526b2bc33aef56dc36176a1b8b210056 @@ -90,7 +92,17 @@ ENV VOWPAL_WABBIT_VERSION 69ecc2847fa0c876c6e0557af409f386f0ced59a # Install custom dependencies -# TODO: Install wordrank (need to install mpich/openmpi with multithreading enabled) +# Install mpich (a wordrank dependency) and remove openmpi to avoid mpirun conflict +RUN apt-get purge -y 
openmpi-common openmpi-bin libopenmpi1.10 +RUN apt-get install -y mpich + +# Install wordrank +RUN cd /gensim/gensim_dependencies \ + && git clone https://bitbucket.org/shihaoji/wordrank \ + && cp /gensim/docker/wordrank_install.sh /gensim/gensim_dependencies/wordrank/install.sh \ + && cd /gensim/gensim_dependencies/wordrank \ + && git checkout $WORDRANK_VERSION \ + && sh ./install.sh # Install fastText RUN cd /gensim/gensim_dependencies \ diff --git a/docker/wordrank_install.sh b/docker/wordrank_install.sh new file mode 100644 index 0000000000..b6859655cb --- /dev/null +++ b/docker/wordrank_install.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +printf "1. clean up workspace\n" +./clean.sh + +printf "\n2. install glove to construct cooccurrence matrix\n" +wget http://nlp.stanford.edu/software/GloVe-1.0.tar.gz # if failed, check http://nlp.stanford.edu/projects/glove/ for the original version +tar -xvzf GloVe-1.0.tar.gz; rm GloVe-1.0.tar.gz +patch -p0 -i glove.patch +cd glove; make clean all; cd .. + +printf "\n3. install hyperwords for evaluation\n" +hg clone -r 56 https://bitbucket.org/omerlevy/hyperwords +patch -p0 -i hyperwords.patch + +printf "\n4. build wordrank\n" +#export CC=icc CXX=icpc +export CC=gcc CXX=g++ # uncomment this line if you don't have an Intel compiler, but with gcc all #pragma simd are ignored as of now +cmake . 
+make clean all From 0158e0b9d2a57685419f19a40138894350afc699 Mon Sep 17 00:00:00 2001 From: parulsethi Date: Tue, 4 Jul 2017 04:58:09 +0530 Subject: [PATCH 2/7] add spacy also --- docker/Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index fb0e617c82..0048f7c8e9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -47,6 +47,7 @@ RUN pip2 install \ matplotlib==2.0.0 \ nltk==3.2.2 \ pandas==0.19.2 \ + spacy==1.8.1 \ git+https://github.com/mila-udem/blocks.git@7beb788f1fcfc78d56c59a5edf9b4e8d98f8d7d9 \ -r https://raw.githubusercontent.com/mila-udem/blocks/stable/requirements.txt @@ -56,6 +57,7 @@ RUN pip3 install \ matplotlib==2.0.0 \ nltk==3.2.2 \ pandas==0.19.2 \ + spacy==1.8.1 \ git+https://github.com/mila-udem/blocks.git@7beb788f1fcfc78d56c59a5edf9b4e8d98f8d7d9 \ -r https://raw.githubusercontent.com/mila-udem/blocks/stable/requirements.txt @@ -63,6 +65,10 @@ RUN pip3 install \ RUN pip2 install -U numpy RUN pip3 install -U numpy +# Download english model of Spacy +python2 -m spacy download en +python3 -m spacy download en + # Download gensim from Github RUN git clone $GENSIM_REPOSITORY \ && cd /gensim \ From 2c4459a9f1b488cc8ee6a80bf1999d44b631040d Mon Sep 17 00:00:00 2001 From: parulsethi Date: Tue, 4 Jul 2017 18:36:18 +0530 Subject: [PATCH 3/7] fix RUN syntax --- docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 0048f7c8e9..3163dc0665 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -66,8 +66,8 @@ RUN pip2 install -U numpy RUN pip3 install -U numpy # Download english model of Spacy -python2 -m spacy download en -python3 -m spacy download en +RUN python2 -m spacy download en +RUN python3 -m spacy download en # Download gensim from Github RUN git clone $GENSIM_REPOSITORY \ From 99099fd0d83fec5b59c6184d98c0bd76282c2cf5 Mon Sep 17 00:00:00 2001 From: parulsethi Date: Mon, 17 Jul 2017 17:28:10 +0530 Subject: [PATCH 
4/7] add np param --- docker/Dockerfile | 2 +- gensim/models/wrappers/wordrank.py | 8 +++++--- gensim/test/test_wordrank_wrapper.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 3163dc0665..ac6c0ae079 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -105,9 +105,9 @@ RUN apt-get install -y mpich # Install wordrank RUN cd /gensim/gensim_dependencies \ && git clone https://bitbucket.org/shihaoji/wordrank \ - && cp /gensim/docker/wordrank_install.sh /gensim/gensim_dependencies/wordrank/install.sh \ && cd /gensim/gensim_dependencies/wordrank \ && git checkout $WORDRANK_VERSION \ + && sed -i -e 's/#export CC=gcc CXX=g++/export CC=gcc CXX=g++/g' install.sh \ && sh ./install.sh # Install fastText diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index dbcca9ebb9..2633a218b9 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -47,7 +47,7 @@ class Wordrank(KeyedVectors): @classmethod def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0, sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100, - beta=99, loss='hinge', memory=4.0, cleanup_files=False, sorted_vocab=1, ensemble=0): + beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0): """ The word and context embedding files are generated by wordrank binary and are saved in "out_name" directory which is created inside wordrank directory. The vocab and cooccurence files are generated using glove code @@ -79,6 +79,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, `beta` is the beta parameter of gamma distribution. `loss` = name of the loss (logistic, hinge). `memory` = soft limit for memory consumption, in GB. + `np` number of copies to execute. 
(mpirun option) `cleanup_files` if True, delete directory and files used by this wrapper, setting to False can be useful for debugging `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before assigning word indexes. `ensemble` = 0 (default), use ensemble of word and context vectors @@ -153,9 +154,10 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, 'loss': loss } - os.chdir('..') # run wordrank executable with wr_args - cmd = ['mpirun', '-np', '1', '../wordrank'] + cmd = ['mpirun', '-np'] + cmd.append(str(np)) + cmd.append(wr_path+'/wordrank') for option, value in wr_args.items(): cmd.append('--%s' % option) cmd.append(str(value)) diff --git a/gensim/test/test_wordrank_wrapper.py b/gensim/test/test_wordrank_wrapper.py index bc07ff5776..5b8260fdf2 100644 --- a/gensim/test/test_wordrank_wrapper.py +++ b/gensim/test/test_wordrank_wrapper.py @@ -34,7 +34,7 @@ def setUp(self): self.wr_file = datapath('test_glove.txt') if not self.wr_path: return - self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_name, iter=6, dump_period=5, period=5, cleanup_files=True) + self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_name, iter=6, dump_period=5, period=5, np=2, cleanup_files=True) def testLoadWordrankFormat(self): """Test model successfully loaded from Wordrank format file""" From 02d88db6d8477704bcb6d756e3ede7d08658df34 Mon Sep 17 00:00:00 2001 From: parulsethi Date: Tue, 18 Jul 2017 16:46:38 +0530 Subject: [PATCH 5/7] made requested changes --- docker/wordrank_install.sh | 20 -------------------- gensim/models/wrappers/wordrank.py | 4 +++- 2 files changed, 3 insertions(+), 21 deletions(-) delete mode 100644 docker/wordrank_install.sh diff --git a/docker/wordrank_install.sh b/docker/wordrank_install.sh deleted file mode 100644 index b6859655cb..0000000000 --- a/docker/wordrank_install.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -printf "1. 
clean up workspace\n" -./clean.sh - -printf "\n2. install glove to construct cooccurrence matrix\n" -wget http://nlp.stanford.edu/software/GloVe-1.0.tar.gz # if failed, check http://nlp.stanford.edu/projects/glove/ for the original version -tar -xvzf GloVe-1.0.tar.gz; rm GloVe-1.0.tar.gz -patch -p0 -i glove.patch -cd glove; make clean all; cd .. - -printf "\n3. install hyperwords for evaluation\n" -hg clone -r 56 https://bitbucket.org/omerlevy/hyperwords -patch -p0 -i hyperwords.patch - -printf "\n4. build wordrank\n" -#export CC=icc CXX=icpc -export CC=gcc CXX=g++ # uncomment this line if you don't have an Intel compiler, but with gcc all #pragma simd are ignored as of now -cmake . -make clean all diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index 2633a218b9..8970a65fd2 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -13,6 +13,8 @@ .. [1] https://bitbucket.org/shihaoji/wordrank/ .. [2] https://arxiv.org/pdf/1506.02761v3.pdf + +Note that the wrapper might not work in a docker container for large datasets due to memory limits (caused by MPI). """ from __future__ import division @@ -53,7 +55,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, which is created inside wordrank directory. The vocab and cooccurence files are generated using glove code available inside the wordrank directory. These files are used by the wordrank binary for training. - `wr_path` is the path to the Wordrank directory. + `wr_path` is the absolute path to the Wordrank directory. `corpus_file` is the filename of the text file to be used for training the Wordrank model. Expects file to contain space-separated tokens in a single line `out_name` is name of the directory which will be created (in wordrank folder) to save embeddings and training data. 
From aa573c41d04e19a029dfd10d812acfbeaa6d7799 Mon Sep 17 00:00:00 2001 From: parulsethi Date: Tue, 18 Jul 2017 18:10:06 +0530 Subject: [PATCH 6/7] use os.path.join for wordrank binary --- gensim/models/wrappers/wordrank.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index 8970a65fd2..51fcc870c7 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -140,7 +140,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, ) wr_args = { - 'path': 'meta', + 'path': meta_dir, 'nthread': multiprocessing.cpu_count(), 'sgd_num': sgd_num, 'lrate': lrate, @@ -159,7 +159,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, # run wordrank executable with wr_args cmd = ['mpirun', '-np'] cmd.append(str(np)) - cmd.append(wr_path+'/wordrank') + cmd.append(os.path.join(wr_path, 'wordrank')) for option, value in wr_args.items(): cmd.append('--%s' % option) cmd.append(str(value)) From 3a51301e5b0a6656078b92732a73a680492c27dd Mon Sep 17 00:00:00 2001 From: parulsethi Date: Wed, 19 Jul 2017 20:03:19 +0530 Subject: [PATCH 7/7] change to original repo --- docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index ac6c0ae079..b3a82c8465 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,8 +2,8 @@ FROM ubuntu:16.04 MAINTAINER Parul Sethi -ENV GENSIM_REPOSITORY https://github.com/parulsethi/gensim.git -ENV GENSIM_VERSION add_wordrank_in_docker +ENV GENSIM_REPOSITORY https://github.com/RaRe-Technologies/gensim.git +ENV GENSIM_VERSION b818c91c698b4a149c55455b88953714d1701031 # Installs python, pip and setup tools (with fixed versions) RUN apt-get update \