-
Notifications
You must be signed in to change notification settings - Fork 4
/
_fetch_data.sh
60 lines (49 loc) · 2.41 KB
/
_fetch_data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env bash
# Downloads the data files used by this project into data/.
# Each section of the script is gated by one of the true/false flags below.
GET_SEMEVAL_DATA=true
# (sic) the misspelled name is kept because the script tests it further down.
GET_PEPROCESSED_SEMEVAL_DATA=true
GET_GOOGLE_NEWS_EMBEDDINGS=false

# Destination directory for the Friends corpus files.
friends_subfolder=data/friends/
echo ">> Downloading Data into ${friends_subfolder}"

# mkdir -p is a no-op when the directory already exists, so no -d check is needed.
mkdir -p -- "$friends_subfolder"
# Fetch the SemEval-2018 Task 4 files: per-split CoNLL files, the reference
# output, the entity map, and the two trial files extracted from a zip archive.
if [[ "$GET_SEMEVAL_DATA" == true ]]; then
  echo ">> Downloading SemEval-Task-4 data (test data)."
  github_path="https://raw.githubusercontent.com/emorynlp/semeval-2018-task4/master/dat/"
  file_prefix="friends."
  for split in train test; do
    for setting in episode scene; do
      fname="${file_prefix}${split}.${setting}_delim.conll"
      wget "${github_path}${fname}" -O "${friends_subfolder}/${fname}"
      # For the test split, additionally fetch the '.nokey' variant of the file.
      if [[ "$split" == test ]]; then
        fname="${fname}.nokey"
        wget "${github_path}${fname}" -O "${friends_subfolder}/${fname}"
      fi
    done
  done
  wget "${github_path}ref.out" -O "${friends_subfolder}/ref.out"
  wget "${github_path}friends_entity_map.txt" -O "${friends_subfolder}/friends_entity_map.txt"

  echo ">> Downloading SemEval-Task-4 trial data."
  # Repository directory the files above come from (this line used to be a bare
  # URL that the shell tried to execute as a command):
  # https://github.com/emorynlp/semeval-2018-task4/tree/master/dat
  OUT_F_FRIENDS=data/friends_train.trial.zip
  if wget "https://competitions.codalab.org/my/datasets/download/d8e0b7e1-1c4f-4171-93e9-74339e6c759e" -O "$OUT_F_FRIENDS"; then
    # -j drops the archive's directory structure; only the two trial files are extracted.
    #unzip $OUT_F_FRIENDS -d $friends_subfolder/.
    unzip -j "$OUT_F_FRIENDS" 'friends.trial.episode_delim.conll' -d "${friends_subfolder}/."
    unzip -j "$OUT_F_FRIENDS" 'friends.trial.scene_delim.conll' -d "${friends_subfolder}/."
  else
    echo ">> Warning: could not download the trial data archive; skipping extraction." >&2
  fi
  # wget -O creates the output file even when the download fails, so always clean up.
  rm -f -- "$OUT_F_FRIENDS"
fi
# Placeholder section: the download of the preprocessed SemEval data is not
# implemented yet, so this only prints a marker.
# The flag is quoted so that an empty or unset value compares as false instead
# of raising a "unary operator expected" error.
if [[ "$GET_PEPROCESSED_SEMEVAL_DATA" == true ]]; then
  echo ">> Downloading data obtained by preprocessing SemEval-Task-4 data."
  echo "TODO"
fi
# Fetch the GoogleNews word2vec binary from Google Drive. This takes two
# requests: the first stores session cookies and returns a page containing a
# "confirm=<token>" parameter; the second repeats the request with that token
# and the stored cookies.
if [[ "$GET_GOOGLE_NEWS_EMBEDDINGS" == true ]]; then
  echo ">> Downloading GoogleNews skipgram embeddings"
  # Acknowledgements to https://gist.github.com/yanaiela/ for the script:
  # https://gist.github.com/yanaiela/cfef50380de8a5bfc8c272bb0c91d6e1.js
  gdrive_id='0B7XkCwpI5KDYNlNUTTlSS21pQmM'

  # Keep the cookie jar in a private temp file rather than ./cookies.txt so
  # nothing is left behind in (or clobbered in) the working directory.
  cookie_jar=$(mktemp) || { echo ">> Error: mktemp failed" >&2; exit 1; }

  # NOTE(review): --no-check-certificate disables TLS verification for this
  # request; kept from the original, consider removing it.
  # sed prints every confirm token it finds; tail keeps the last one, which is
  # what the original "strip up to the last 'Code: '" expansion selected.
  CODE=$(
    wget --save-cookies "$cookie_jar" --keep-session-cookies --no-check-certificate \
      "https://docs.google.com/uc?export=download&id=${gdrive_id}" -O- \
      | sed -En 's/.*confirm=([0-9A-Za-z_]+).*/\1/p' \
      | tail -n 1
  )
  printf '%s\n' "$CODE"
  if [[ -z "$CODE" ]]; then
    echo ">> Warning: no confirmation token found; the download below may fail." >&2
  fi

  F='GoogleNews-vectors-negative300.bin.gz'
  OUT_F="data/${F}"
  mkdir -p -- data
  wget --load-cookies "$cookie_jar" \
    "https://docs.google.com/uc?export=download&confirm=${CODE}&id=${gdrive_id}" \
    -O "$OUT_F"

  rm -f -- "$cookie_jar"
fi