Capturing Word Order in Averaging Based Sentence Embeddings
@inproceedings{lee_capturing_2020,
  author    = {Lee, Jae Hee and Camacho-Collados, Jose and Espinosa Anke, Luis and Schockaert, Steven},
  title     = {Capturing Word Order in Averaging Based Sentence Embeddings},
  booktitle = {24th European Conference on Artificial Intelligence},
  year      = {2020}
}
- gitpython
- humanize
- matplotlib
- nltk
- ray[tune]
- pytorch
- tqdm
- cupy
cd capturing-word-order
pip install -e .
# Store the path of the current folder as cwd
cwd=$(pwd)
# Initialize and update the submodules
git submodule init
git submodule update
# Create a folder for raw data
mkdir -p $cwd/data/raw
# Download an archived Wikipedia dump and save it as data/raw/wiki.bz2
# (alternatively, you can download a more recent dump from
# https://dumps.wikimedia.org/enwiki/).
wget https://archive.org/download/enwiki-20190201/enwiki-20190201-pages-articles-multistream.xml.bz2 \
    -O $cwd/data/raw/wiki.bz2
# Create a folder for interim data
mkdir -p $cwd/data/interim
# Extract text from the Wikipedia dump into data/interim/wiki
# (as compressed JSON shards). Replace --process 32 with --process n,
# where n is the number of available CPU cores.
cd $cwd/src/data/wikiextractor
python WikiExtractor.py --process 32 --json -co ../../../data/interim/wiki ../../../data/raw/wiki.bz2
# Combine the extracted articles into a single JSON file, data/interim/wiki.json.
cd $cwd/data/interim
find wiki -name '*bz2' -exec bunzip2 -k -c {} \; > wiki.json
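WikiExtractor's --json mode writes one JSON object per line, so the combined wiki.json can be streamed article by article. The following Python snippet is only an illustrative sketch (not part of the setup script); the field names are WikiExtractor's defaults.
import json

# Stream articles from the combined dump; each line is a JSON object
# with (at least) "id", "url", "title" and "text" fields.
# Path is relative to the repository root.
with open("data/interim/wiki.json", encoding="utf-8") as f:
    for line in f:
        article = json.loads(line)
        print(article["title"], len(article["text"]))
        break  # peek at the first article only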
# Download the nltk 'punkt' and 'stopwords' packages, which are necessary for tokenization and for training the models.
python -c "import nltk; nltk.download('punkt')"
python -c "import nltk; nltk.download('stopwords')"
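For reference, the snippet below sketches how the downloaded 'punkt' and 'stopwords' resources are typically used; it is illustrative only, the actual preprocessing lives in src/data/tokenize_wiki.py.
import nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

sentence = "The quick brown fox jumps over the lazy dog."

# word_tokenize relies on the 'punkt' models downloaded above.
tokens = nltk.word_tokenize(sentence.lower())

# Remove stopwords when only content words are needed.
content_tokens = [t for t in tokens if t not in stop_words]
print(tokens)
print(content_tokens)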
# Create a folder for processed data
mkdir -p $cwd/data/processed
# Generate 1 million tokenized Wikipedia sentences. The output is stored in the processed data folder (data/processed).
cd $cwd/src/data/
python tokenize_wiki.py
# Generate 1 million tokenized Wikipedia sentences of length <= 25.
python tokenize_wiki.py --max_len 25
# Generate 2 million tokenized Wikipedia sentences of length <= 25.
python tokenize_wiki.py --n_sents 2000000 --max_len 25
cd $cwd/
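# Create the data splits used by the experiments.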
python -m src.data.make_splits
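# Download the pretrained fastText word vectors (crawl-300d-2M).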
wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P $cwd/data/raw/
cd $cwd/data/raw
unzip crawl-300d-2M.vec.zip
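The unzipped crawl-300d-2M.vec file uses the plain-text fastText format: a header line with the vocabulary size and dimension, then one word and its 300 values per line. Below is a minimal, self-contained sketch of loading a subset of the vectors and averaging them into a sentence embedding; it is illustrative only and independent of the repository's own loaders.
import numpy as np

def load_vectors(path, limit=50000):
    """Load the first `limit` word vectors from a fastText .vec file."""
    vectors = {}
    with open(path, encoding="utf-8", errors="ignore") as f:
        f.readline()  # skip the header line: "<vocabulary size> <dimension>"
        for i, line in enumerate(f):
            if i >= limit:
                break
            word, *values = line.rstrip().split(" ")
            vectors[word] = np.asarray(values, dtype=np.float32)
    return vectors

# Path relative to the repository root.
vecs = load_vectors("data/raw/crawl-300d-2M.vec")

# An averaging-based sentence embedding is simply the mean of the word vectors.
tokens = ["the", "cat", "sat", "on", "the", "mat"]
embedding = np.mean([vecs[t] for t in tokens if t in vecs], axis=0)
print(embedding.shape)  # (300,)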
cd $cwd/src
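# Run the hyperparameter search for the bigram models.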
python tune_bigram_models.py
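tune_bigram_models.py performs a hyperparameter search; given the ray[tune] dependency listed above, it presumably follows the pattern sketched below. The objective and search space here are made up purely for illustration (classic tune.run/tune.report API), not the script's actual configuration.
from ray import tune

def objective(config):
    # Placeholder: the real script would train and evaluate a bigram
    # model with the hyperparameters in `config`.
    score = -(config["lr"] - 0.01) ** 2
    tune.report(score=score)

analysis = tune.run(
    objective,
    config={"lr": tune.grid_search([0.001, 0.01, 0.1])},
)
print(analysis.get_best_config(metric="score", mode="max"))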
Run notebooks/generate_figure_1_and_table_1.ipynb to generate Figure 1 and Table 1.
Run notebooks/generate_table_2.ipynb to generate Table 2.