
Commit

Merge pull request #4 from Microsoft/master
merge master
SparkSnail authored Sep 17, 2018
2 parents 86243e7 + 45650c4 commit 6d09780
Showing 19 changed files with 375 additions and 110 deletions.
64 changes: 34 additions & 30 deletions examples/trials/ga_squad/data.py
@@ -19,6 +19,10 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

'''
Data processing script for the QA model.
'''

import csv
import json
from random import shuffle
@@ -73,19 +77,19 @@ def load_from_file(path, fmt=None, is_training=True):
for doc in data:
for paragraph in doc['paragraphs']:
passage = paragraph['context']
for qa in paragraph['qas']:
question = qa['question']
id = qa['id']
for qa_pair in paragraph['qas']:
question = qa_pair['question']
qa_id = qa_pair['id']
if not is_training:
qp_pairs.append(
{'passage': passage, 'question': question, 'id': id})
{'passage': passage, 'question': question, 'id': qa_id})
else:
for answer in qa['answers']:
for answer in qa_pair['answers']:
answer_begin = int(answer['answer_start'])
answer_end = answer_begin + len(answer['text'])
qp_pairs.append({'passage': passage,
'question': question,
'id': id,
'id': qa_id,
'answer_begin': answer_begin,
'answer_end': answer_end})
else:
@@ -121,21 +125,21 @@ def collect_vocab(qp_pairs):
Build the vocabulary from the corpus.
'''
vocab = set()
for qp in qp_pairs:
for word in qp['question_tokens']:
for qp_pair in qp_pairs:
for word in qp_pair['question_tokens']:
vocab.add(word['word'])
for word in qp['passage_tokens']:
for word in qp_pair['passage_tokens']:
vocab.add(word['word'])
return vocab


def shuffle_step(l, step):
def shuffle_step(entries, step):
'''
Shuffle entries within each consecutive chunk of `step` elements.
'''
answer = []
for i in range(0, len(l), step):
sub = l[i:i+step]
for i in range(0, len(entries), step):
sub = entries[i:i+step]
shuffle(sub)
answer += sub
return answer
@@ -163,13 +167,13 @@ def get_char_input(data, char_dict, max_char_length):
char_id = np.zeros((max_char_length, sequence_length,
batch_size), dtype=np.int32)
char_lengths = np.zeros((sequence_length, batch_size), dtype=np.float32)
for b in range(0, min(len(data), batch_size)):
d = data[b]
for s in range(0, min(len(d), sequence_length)):
word = d[s]['word']
char_lengths[s, b] = min(len(word), max_char_length)
for batch_idx in range(0, min(len(data), batch_size)):
batch_data = data[batch_idx]
for sample_idx in range(0, min(len(batch_data), sequence_length)):
word = batch_data[sample_idx]['word']
char_lengths[sample_idx, batch_idx] = min(len(word), max_char_length)
for i in range(0, min(len(word), max_char_length)):
char_id[i, s, b] = get_id(char_dict, word[i])
char_id[i, sample_idx, batch_idx] = get_id(char_dict, word[i])
return char_id, char_lengths


@@ -180,26 +184,26 @@ def get_word_input(data, word_dict, embed, embed_dim):
batch_size = len(data)
max_sequence_length = max(len(d) for d in data)
sequence_length = max_sequence_length
t = np.zeros((max_sequence_length, batch_size,
embed_dim), dtype=np.float32)
word_input = np.zeros((max_sequence_length, batch_size,
embed_dim), dtype=np.float32)
ids = np.zeros((sequence_length, batch_size), dtype=np.int32)
masks = np.zeros((sequence_length, batch_size), dtype=np.float32)
lengths = np.zeros([batch_size], dtype=np.int32)

for b in range(0, min(len(data), batch_size)):
d = data[b]
for batch_idx in range(0, min(len(data), batch_size)):
batch_data = data[batch_idx]

lengths[b] = len(d)
lengths[batch_idx] = len(batch_data)

for s in range(0, min(len(d), sequence_length)):
word = d[s]['word'].lower()
for sample_idx in range(0, min(len(batch_data), sequence_length)):
word = batch_data[sample_idx]['word'].lower()
if word in word_dict.keys():
t[s, b] = embed[word_dict[word]]
ids[s, b] = word_dict[word]
masks[s, b] = 1
word_input[sample_idx, batch_idx] = embed[word_dict[word]]
ids[sample_idx, batch_idx] = word_dict[word]
masks[sample_idx, batch_idx] = 1

t = np.reshape(t, (-1, embed_dim))
return t, ids, masks, lengths
word_input = np.reshape(word_input, (-1, embed_dim))
return word_input, ids, masks, lengths


def get_word_index(tokens, char_index):
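For reference, each record that `load_from_file` now appends to `qp_pairs` in training mode has the shape sketched below; the key names follow the code above, while the values are made up for illustration (the rename from `id` to `qa_id` also stops shadowing Python's builtin `id()`).

```
# Illustrative qp_pair record built by load_from_file in training mode.
qp_pair = {
    'passage': 'The city was founded in 1854 on the banks of ...',
    'question': 'When was the city founded?',
    'id': '5733be284776f41900661182',   # qa_pair['id']
    'answer_begin': 24,                 # int(answer['answer_start'])
    'answer_end': 28,                   # answer_begin + len(answer['text'])
}
```
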
6 changes: 6 additions & 0 deletions examples/trials/ga_squad/download.sh
@@ -0,0 +1,6 @@
#!/bin/bash

wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
wget http://nlp.stanford.edu/data/glove.840B.300d.zip
unzip glove.840B.300d.zip
27 changes: 14 additions & 13 deletions examples/trials/ga_squad/evaluate.py
@@ -19,6 +19,10 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

'''
Evaluation script for the QA model.
'''

from __future__ import print_function
from collections import Counter
import string
@@ -68,8 +72,8 @@ def f1_score(prediction, ground_truth):
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
f1_result = (2 * precision * recall) / (precision + recall)
return f1_result

def exact_match_score(prediction, ground_truth):
'''
@@ -91,28 +95,25 @@ def _evaluate(dataset, predictions):
'''
Evaluate function.
'''
f1 = exact_match = total = 0
f1_result = exact_match = total = 0
count = 0
for article in dataset:
for paragraph in article['paragraphs']:
for qa in paragraph['qas']:
for qa_pair in paragraph['qas']:
total += 1
if qa['id'] not in predictions:
message = 'Unanswered question ' + qa['id'] + \
' will receive score 0.'
#print(message, file=sys.stderr)
if qa_pair['id'] not in predictions:
count += 1
continue
ground_truths = list(map(lambda x: x['text'], qa['answers']))
prediction = predictions[qa['id']]
ground_truths = list(map(lambda x: x['text'], qa_pair['answers']))
prediction = predictions[qa_pair['id']]
exact_match += metric_max_over_ground_truths(
exact_match_score, prediction, ground_truths)
f1 += metric_max_over_ground_truths(
f1_result += metric_max_over_ground_truths(
f1_score, prediction, ground_truths)
print('total', total, 'exact_match', exact_match, 'unanswer_question ', count)
exact_match = 100.0 * exact_match / total
f1 = 100.0 * f1 / total
return {'exact_match': exact_match, 'f1': f1}
f1_result = 100.0 * f1_result / total
return {'exact_match': exact_match, 'f1': f1_result}

def evaluate(data_file, pred_file):
'''
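As a quick hand-worked check of the token-overlap F1 that is now stored in `f1_result`, using made-up tokens:

```
# prediction_tokens   = ['in', 'the', 'city', 'of', 'denver']
# ground_truth_tokens = ['denver', 'colorado']
# num_same = 1 (only 'denver' is shared)
precision = 1.0 * 1 / 5                                      # 0.2
recall = 1.0 * 1 / 2                                         # 0.5
f1_result = (2 * precision * recall) / (precision + recall)  # ~0.2857
```
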
20 changes: 10 additions & 10 deletions examples/trials/ga_squad/graph.py
@@ -43,8 +43,8 @@ class Layer(object):
'''
Layer class, which contains the information of the graph.
'''
def __init__(self, graph_type, input=None, output=None, size=None):
self.input = input if input is not None else []
def __init__(self, graph_type, inputs=None, output=None, size=None):
self.input = inputs if inputs is not None else []
self.output = output if output is not None else []
self.graph_type = graph_type
self.is_delete = False
@@ -117,11 +117,11 @@ class Graph(object):
'''
Customized Graph class.
'''
def __init__(self, max_layer_num, input, output, hide):
def __init__(self, max_layer_num, inputs, output, hide):
self.layers = []
self.max_layer_num = max_layer_num

for layer in input:
for layer in inputs:
self.layers.append(layer)
for layer in output:
self.layers.append(layer)
@@ -240,7 +240,7 @@ def mutation(self, only_add=False):
if graph_type <= 1:
new_id = len(layers)
out = random.choice(layers_out)
input = []
inputs = []
output = [out]
pos = random.randint(0, len(layers[out].input) - 1)
last_in = layers[out].input[pos]
@@ -250,13 +250,13 @@
if graph_type == 1:
layers[last_in].output.remove(out)
layers[last_in].output.append(new_id)
input = [last_in]
lay = Layer(graph_type=layer_type, input=input, output=output)
while len(input) < lay.input_size:
inputs = [last_in]
lay = Layer(graph_type=layer_type, inputs=inputs, output=output)
while len(inputs) < lay.input_size:
layer1 = random.choice(layers_in)
input.append(layer1)
inputs.append(layer1)
layers[layer1].output.append(new_id)
lay.input = input
lay.input = inputs
layers.append(lay)
else:
layer1 = random.choice(layers_del)
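Because the `Layer` constructor keyword changed from `input` to `inputs`, call sites follow suit; a minimal before/after sketch using the names from the `mutation` code above:

```
# Before this change (the keyword shadowed the builtin input):
#   lay = Layer(graph_type=layer_type, input=[last_in], output=output)
# After this change:
lay = Layer(graph_type=layer_type, inputs=[last_in], output=output)
```
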
29 changes: 26 additions & 3 deletions examples/trials/ga_squad/readme.md
@@ -1,9 +1,32 @@
## How to download data
# Download data

## Use downloading script

Execute the following commands to download the needed files
using the downloading script:

```
chmod +x ./download.sh
./download.sh
```

## Download manually

1. download "dev-v1.1.json" and "train-v1.1.json" from https://rajpurkar.github.io/SQuAD-explorer/
2. download "glove.840B.300d.txt" in "https://nlp.stanford.edu/projects/glove/"

## How to submit this job
```
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
```

2. download "glove.840B.300d.txt" from https://nlp.stanford.edu/projects/glove/

```
wget http://nlp.stanford.edu/data/glove.840B.300d.zip
unzip glove.840B.300d.zip
```

# How to submit this job

1. run "$NNI_ROOT_DIR/auto_run.py" as described in "$NNI_ROOT_DIR/README-AUTO.md".
2. use the dockerImage openpai.azurecr.io/nni_v0.0.1, which means it uses the CPU version of TensorFlow.
7 changes: 6 additions & 1 deletion examples/trials/ga_squad/train_model.py
@@ -32,12 +32,13 @@


class GAGConfig:
"""The class for model hyper-parameter configuration."""
def __init__(self):
self.batch_size = 128

self.dropout = 0.1

self.char_vcb_size = 1371
self.char_vcb_size = 1500
self.max_char_length = 20
self.char_embed_dim = 100

@@ -56,6 +57,7 @@ def __init__(self):


class GAG:
"""The class for the computation graph based QA model."""
def __init__(self, cfg, embed, graph):
self.cfg = cfg
self.embed = embed
@@ -83,6 +85,7 @@ def __init__(self, cfg, embed, graph):


def build_net(self, is_training):
"""Build the whole neural network for the QA model."""
cfg = self.cfg
with tf.device('/cpu:0'):
word_embed = tf.get_variable(
@@ -202,6 +205,7 @@ def build_net(self, is_training):

if is_training:
def label_smoothing(inputs, masks, epsilon=0.1):
"""Modify target for label smoothing."""
epsilon = cfg.labelsmoothing
num_of_channel = tf.shape(inputs)[-1] # number of channels
inputs = tf.cast(inputs, tf.float32)
@@ -229,6 +233,7 @@ def label_smoothing(inputs, masks, epsilon=0.1):
return tf.stack([self.begin_prob, self.end_prob])

def build_char_states(self, char_embed, is_training, reuse, char_ids, char_lengths):
"""Build char embedding network for the QA model."""
max_char_length = self.cfg.max_char_length

inputs = dropout(tf.nn.embedding_lookup(char_embed, char_ids),
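The newly documented `label_smoothing` helper follows the standard smoothing transform; the sketch below is the generic, mask-free form for illustration only (the in-repo version also takes the sequence masks, as its signature shows):

```
import numpy as np

def label_smoothing_sketch(targets, epsilon=0.1):
    # Mix one-hot targets with a uniform distribution over the classes.
    # Generic illustration; the function in train_model.py additionally uses masks.
    num_classes = targets.shape[-1]
    return (1.0 - epsilon) * targets + epsilon / num_classes

# Example: a 3-class one-hot target [0, 1, 0] becomes roughly [0.033, 0.933, 0.033].
```
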
8 changes: 4 additions & 4 deletions examples/trials/ga_squad/trial.py
@@ -58,7 +58,7 @@ def get_config():
default='./glove.840B.300d.txt', help='dev file')
parser.add_argument('--root_path', default='./data/',
type=str, help='Root path of models')
parser.add_argument('--batch_size', type=int, default=2, help='batch size')
parser.add_argument('--batch_size', type=int, default=64, help='batch size')
parser.add_argument('--save_path', type=str,
default='./save', help='save path dir')
parser.add_argument('--learning_rate', type=float, default=0.0001,
@@ -88,11 +88,13 @@ def load_embedding(path):
'''
Return the embedding dictionary loaded from the given file path.
'''
EMBEDDING_DIM = 300
embedding_dict = {}
with open(path, 'r', encoding='utf-8') as file:
pairs = [line.strip('\r\n').split() for line in file.readlines()]
for pair in pairs:
embedding_dict[pair[0]] = [float(x) for x in pair[1:]]
if len(pair) == EMBEDDING_DIM + 1:
embedding_dict[pair[0]] = [float(x) for x in pair[1:]]
logger.debug('embedding_dict size: %d', len(embedding_dict))
return embedding_dict

@@ -241,8 +243,6 @@ def run_epoch(batches, answer_net, is_training):
if count % 100 == 0:
logger.debug('%d %g except:%g' %
(count, used, used / count * len(batches)))
if count % 100 == 0:
break
loss = loss_sum / len(batches)
if is_training:
return loss
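The added length check in `load_embedding` keeps only well-formed GloVe lines, i.e. one token followed by exactly `EMBEDDING_DIM` floats; a standalone sketch of the same guard (mirroring the code above, file name as in the example):

```
# Standalone sketch of the guarded GloVe parse added in load_embedding.
EMBEDDING_DIM = 300
embedding_dict = {}
with open('glove.840B.300d.txt', 'r', encoding='utf-8') as glove_file:
    for line in glove_file:
        pair = line.strip('\r\n').split()
        # Skip malformed lines, e.g. entries whose token itself contains spaces.
        if len(pair) == EMBEDDING_DIM + 1:
            embedding_dict[pair[0]] = [float(x) for x in pair[1:]]
```
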
20 changes: 20 additions & 0 deletions examples/trials/mnist-batch-tune-keras/config.yml
@@ -0,0 +1,20 @@
authorName: default
experimentName: example_mnist-keras
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 6
#choice: local, remote
trainingServicePlatform: local
searchSpacePath: ~/nni/examples/trials/mnist-batch-tune-keras/search_space.json
#choice: true, false
useAnnotation: false
tuner:
#choice: TPE, Random, Anneal, Evolution, BatchTuner
builtinTunerName: BatchTuner
classArgs:
#choice: maximize, minimize
optimize_mode: maximize
trial:
command: python3 mnist-keras.py
codeDir: ~/nni/examples/trials/mnist-batch-tune-keras
gpuNum: 0
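
The config above points BatchTuner at `search_space.json`; one hypothetical way to generate such a file is sketched below, assuming BatchTuner's convention of a single `combine_params` choice whose values are complete trial configurations (the parameter names are invented for illustration, not taken from the mnist example):

```
import json

# Hypothetical BatchTuner search space: each entry in _value is one full
# hyper-parameter combination, run as its own trial.
search_space = {
    "combine_params": {
        "_type": "choice",
        "_value": [
            {"optimizer": "Adam", "learning_rate": 0.001},
            {"optimizer": "Adam", "learning_rate": 0.0001},
            {"optimizer": "SGD", "learning_rate": 0.01},
        ],
    }
}

with open("search_space.json", "w") as search_space_file:
    json.dump(search_space, search_space_file, indent=4)
```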
