From dc7f7527270483528c0b1a9b4a2348096b0c378d Mon Sep 17 00:00:00 2001 From: henrye Date: Mon, 17 Jul 2017 15:15:03 +0100 Subject: [PATCH 1/3] minor comment and print statement fixes --- parlai/core/dict.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parlai/core/dict.py b/parlai/core/dict.py index df79bb7d83b..3a5b4a61acd 100644 --- a/parlai/core/dict.py +++ b/parlai/core/dict.py @@ -133,7 +133,7 @@ def __init__(self, opt, shared=None): self.ind2tok[0] = self.null_token if self.eos_token: - # set special unknown word token + # set special end of sentence token index = len(self.tok2ind) self.tok2ind[self.eos_token] = index self.ind2tok[index] = self.eos_token @@ -267,7 +267,7 @@ def load(self, filename): """Load pre-existing dictionary in 'token[count]' format. Initialize counts from other dictionary, or 0 if they aren't included. """ - print('Dictionary: loading existing dictionary from {}.'.format( + print('Dictionary: loading existing dictionary from {}'.format( filename)) with open(filename) as read: for line in read: @@ -292,7 +292,7 @@ def save(self, filename=None, append=False, sort=True): If ``sort`` (default ``True``), then first sort the dictionary before saving. """ filename = self.opt['model_file'] if filename is None else filename - print('Dictionary: saving dictionary to {}.'.format(filename)) + print('Dictionary: saving dictionary to {}'.format(filename)) if sort: self.sort() From 973d53d33be29ab88454728a79f5c24d3620a1c4 Mon Sep 17 00:00:00 2001 From: henrye Date: Tue, 18 Jul 2017 13:54:10 +0100 Subject: [PATCH 2/3] add start of sentence token to dict.py --- parlai/core/dict.py | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/parlai/core/dict.py b/parlai/core/dict.py index 3a5b4a61acd..5e20cb97c98 100644 --- a/parlai/core/dict.py +++ b/parlai/core/dict.py @@ -71,8 +71,9 @@ class DictionaryAgent(Agent): default_maxngram = -1 default_minfreq = 0 default_null = '__NULL__' - default_eos = '__EOS__' default_unk = '__UNK__' + default_eos = '__EOS__' + default_go = '__GO__' @staticmethod def add_cmdline_args(argparser): @@ -100,12 +101,15 @@ def add_cmdline_args(argparser): dictionary.add_argument( '--dict-nulltoken', default=DictionaryAgent.default_null, help='empty token, can be used for padding or just empty values') + dictionary.add_argument( + '--dict-unktoken', default=DictionaryAgent.default_unk, + help='token to return for unavailable words') dictionary.add_argument( '--dict-eostoken', default=DictionaryAgent.default_eos, help='token for end of sentence markers, if needed') dictionary.add_argument( - '--dict-unktoken', default=DictionaryAgent.default_unk, - help='token to return for unavailable words') + '--dict-gotoken', default=DictionaryAgent.default_go, + help='token for starting sentence generation, if needed') dictionary.add_argument( '--dict-maxexs', default=100000, type=int, help='max number of examples to build dict on') @@ -115,8 +119,9 @@ def __init__(self, opt, shared=None): # initialize fields self.opt = copy.deepcopy(opt) self.null_token = opt['dict_nulltoken'] - self.eos_token = opt['dict_eostoken'] self.unk_token = opt['dict_unktoken'] + self.eos_token = opt['dict_eostoken'] + self.go_token = opt['dict_gotoken'] self.max_ngram_size = opt['dict_max_ngram_size'] if shared: @@ -132,17 +137,23 @@ def __init__(self, opt, shared=None): self.tok2ind[self.null_token] = 0 self.ind2tok[0] = self.null_token + if self.unk_token: + # set special unknown word token + index = len(self.tok2ind) + self.tok2ind[self.unk_token] = index + self.ind2tok[index] = self.unk_token + if self.eos_token: - # set special end of sentence token + # set special end of sentence word token index = len(self.tok2ind) self.tok2ind[self.eos_token] = index self.ind2tok[index] = self.eos_token - if self.unk_token: - # set special unknown word token + if self.go_token: + # set special start of sentence word token index = len(self.tok2ind) - self.tok2ind[self.unk_token] = index - self.ind2tok[index] = self.unk_token + self.tok2ind[self.go_token] = index + self.ind2tok[index] = self.go_token if opt.get('dict_file') and os.path.isfile(opt['dict_file']): # load pre-existing dictionary @@ -164,13 +175,17 @@ def __init__(self, opt, shared=None): if not shared: - if self.null_token: - # fix count for null token to one billion and two - self.freq[self.null_token] = 1000000002 + if self.go_token: + # fix count for start of sentence token to one billion and three + self.freq[self.go_token] = 1000000003 if self.eos_token: - # fix count for end of sentence token to one billion and one - self.freq[self.eos_token] = 1000000001 + # fix count for end of sentence token to one billion and two + self.freq[self.eos_token] = 1000000002 + + if self.null_token: + # fix count for null token to one billion and one + self.freq[self.null_token] = 1000000001 if self.unk_token: # fix count for unknown token to one billion From 8a0e4aaba6cddbe34012b6dfbe7e7d3448cd39b1 Mon Sep 17 00:00:00 2001 From: henrye Date: Wed, 19 Jul 2017 08:39:58 +0100 Subject: [PATCH 3/3] return original order of special tokens; change names of start and end tokens --- parlai/core/dict.py | 50 ++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/parlai/core/dict.py b/parlai/core/dict.py index 5e20cb97c98..a287ab99677 100644 --- a/parlai/core/dict.py +++ b/parlai/core/dict.py @@ -71,9 +71,9 @@ class DictionaryAgent(Agent): default_maxngram = -1 default_minfreq = 0 default_null = '__NULL__' + default_end = '__END__' default_unk = '__UNK__' - default_eos = '__EOS__' - default_go = '__GO__' + default_start = '__START__' @staticmethod def add_cmdline_args(argparser): @@ -101,14 +101,14 @@ def add_cmdline_args(argparser): dictionary.add_argument( '--dict-nulltoken', default=DictionaryAgent.default_null, help='empty token, can be used for padding or just empty values') + dictionary.add_argument( + '--dict-endtoken', default=DictionaryAgent.default_end, + help='token for end of sentence markers, if needed') dictionary.add_argument( '--dict-unktoken', default=DictionaryAgent.default_unk, help='token to return for unavailable words') dictionary.add_argument( - '--dict-eostoken', default=DictionaryAgent.default_eos, - help='token for end of sentence markers, if needed') - dictionary.add_argument( - '--dict-gotoken', default=DictionaryAgent.default_go, + '--dict-starttoken', default=DictionaryAgent.default_start, help='token for starting sentence generation, if needed') dictionary.add_argument( '--dict-maxexs', default=100000, type=int, @@ -119,9 +119,9 @@ def __init__(self, opt, shared=None): # initialize fields self.opt = copy.deepcopy(opt) self.null_token = opt['dict_nulltoken'] + self.end_token = opt['dict_endtoken'] self.unk_token = opt['dict_unktoken'] - self.eos_token = opt['dict_eostoken'] - self.go_token = opt['dict_gotoken'] + self.start_token = opt['dict_starttoken'] self.max_ngram_size = opt['dict_max_ngram_size'] if shared: @@ -137,23 +137,23 @@ def __init__(self, opt, shared=None): self.tok2ind[self.null_token] = 0 self.ind2tok[0] = self.null_token + if self.end_token: + # set special end of sentence word token + index = len(self.tok2ind) + self.tok2ind[self.end_token] = index + self.ind2tok[index] = self.end_token + if self.unk_token: # set special unknown word token index = len(self.tok2ind) self.tok2ind[self.unk_token] = index self.ind2tok[index] = self.unk_token - if self.eos_token: - # set special end of sentence word token - index = len(self.tok2ind) - self.tok2ind[self.eos_token] = index - self.ind2tok[index] = self.eos_token - - if self.go_token: + if self.start_token: # set special start of sentence word token index = len(self.tok2ind) - self.tok2ind[self.go_token] = index - self.ind2tok[index] = self.go_token + self.tok2ind[self.start_token] = index + self.ind2tok[index] = self.start_token if opt.get('dict_file') and os.path.isfile(opt['dict_file']): # load pre-existing dictionary @@ -175,17 +175,17 @@ def __init__(self, opt, shared=None): if not shared: - if self.go_token: + if self.start_token: # fix count for start of sentence token to one billion and three - self.freq[self.go_token] = 1000000003 - - if self.eos_token: - # fix count for end of sentence token to one billion and two - self.freq[self.eos_token] = 1000000002 + self.freq[self.start_token] = 1000000003 if self.null_token: - # fix count for null token to one billion and one - self.freq[self.null_token] = 1000000001 + # fix count for null token to one billion and two + self.freq[self.null_token] = 1000000002 + + if self.end_token: + # fix count for end of sentence token to one billion and one + self.freq[self.end_token] = 1000000001 if self.unk_token: # fix count for unknown token to one billion