From fb240d2f10204dee85c8d91375a0a06b279896a9 Mon Sep 17 00:00:00 2001 From: livc Date: Wed, 19 Jul 2017 13:12:11 +0800 Subject: [PATCH] fix bug --- sequence_tagging_for_ner/data/download.sh | 6 +++++- sequence_tagging_for_ner/reader.py | 4 ++-- sequence_tagging_for_ner/train.py | 3 ++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/sequence_tagging_for_ner/data/download.sh b/sequence_tagging_for_ner/data/download.sh index fc9de3d7f2..99d81c1e09 100644 --- a/sequence_tagging_for_ner/data/download.sh +++ b/sequence_tagging_for_ner/data/download.sh @@ -1,4 +1,8 @@ -wget http://cs224d.stanford.edu/assignment2/assignment2.zip +if [ -f assignment2.zip ]; then + echo "data exist" +else + wget http://cs224d.stanford.edu/assignment2/assignment2.zip +fi if [ $? -eq 0 ];then unzip assignment2.zip diff --git a/sequence_tagging_for_ner/reader.py b/sequence_tagging_for_ner/reader.py index 2662abe80b..5050d0bf49 100644 --- a/sequence_tagging_for_ner/reader.py +++ b/sequence_tagging_for_ner/reader.py @@ -21,7 +21,7 @@ def canonicalize_word(word, wordset=None, digits=True): if (wordset != None) and (word in wordset): return word word = canonicalize_digits(word) # try to canonicalize numbers if (wordset == None) or (word in wordset): return word - else: return "" # unknown token + else: return "UUUNKKK" # unknown token def data_reader(data_file, word_dict, label_dict): @@ -35,7 +35,7 @@ def data_reader(data_file, word_dict, label_dict): """ def reader(): - UNK_IDX = word_dict[""] + UNK_IDX = word_dict["UUUNKKK"] sentence = [] labels = [] diff --git a/sequence_tagging_for_ner/train.py b/sequence_tagging_for_ner/train.py index dd041b6aaa..5facfeda0d 100644 --- a/sequence_tagging_for_ner/train.py +++ b/sequence_tagging_for_ner/train.py @@ -106,4 +106,5 @@ def event_handler(event): test_data_file="data/test", vocab_file="data/vocab.txt", target_file="data/target.txt", - emb_file="data/wordVectors.txt") + emb_file="data/wordVectors.txt", + model_save_dir="model/")