From dc7f7527270483528c0b1a9b4a2348096b0c378d Mon Sep 17 00:00:00 2001
From: henrye <c.henry.elder@gmail.com>
Date: Mon, 17 Jul 2017 15:15:03 +0100
Subject: [PATCH 1/3] minor comment and print statement fixes

---
 parlai/core/dict.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/parlai/core/dict.py b/parlai/core/dict.py
index df79bb7d83b..3a5b4a61acd 100644
--- a/parlai/core/dict.py
+++ b/parlai/core/dict.py
@@ -133,7 +133,7 @@ def __init__(self, opt, shared=None):
                 self.ind2tok[0] = self.null_token
 
             if self.eos_token:
-                # set special unknown word token
+                # set special end of sentence token
                 index = len(self.tok2ind)
                 self.tok2ind[self.eos_token] = index
                 self.ind2tok[index] = self.eos_token
@@ -267,7 +267,7 @@ def load(self, filename):
         """Load pre-existing dictionary in 'token[<TAB>count]' format.
         Initialize counts from other dictionary, or 0 if they aren't included.
         """
-        print('Dictionary: loading existing dictionary from {}.'.format(
+        print('Dictionary: loading existing dictionary from {}'.format(
               filename))
         with open(filename) as read:
             for line in read:
@@ -292,7 +292,7 @@ def save(self, filename=None, append=False, sort=True):
         If ``sort`` (default ``True``), then first sort the dictionary before saving.
         """
         filename = self.opt['model_file'] if filename is None else filename
-        print('Dictionary: saving dictionary to {}.'.format(filename))
+        print('Dictionary: saving dictionary to {}'.format(filename))
         if sort:
             self.sort()
 

From 973d53d33be29ab88454728a79f5c24d3620a1c4 Mon Sep 17 00:00:00 2001
From: henrye <c.henry.elder@gmail.com>
Date: Tue, 18 Jul 2017 13:54:10 +0100
Subject: [PATCH 2/3] add start of sentence token to dict.py

---
 parlai/core/dict.py | 43 +++++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/parlai/core/dict.py b/parlai/core/dict.py
index 3a5b4a61acd..5e20cb97c98 100644
--- a/parlai/core/dict.py
+++ b/parlai/core/dict.py
@@ -71,8 +71,9 @@ class DictionaryAgent(Agent):
     default_maxngram = -1
     default_minfreq = 0
     default_null = '__NULL__'
-    default_eos = '__EOS__'
     default_unk = '__UNK__'
+    default_eos = '__EOS__'
+    default_go = '__GO__'
 
     @staticmethod
     def add_cmdline_args(argparser):
@@ -100,12 +101,15 @@ def add_cmdline_args(argparser):
         dictionary.add_argument(
            '--dict-nulltoken', default=DictionaryAgent.default_null,
            help='empty token, can be used for padding or just empty values')
+        dictionary.add_argument(
+            '--dict-unktoken', default=DictionaryAgent.default_unk,
+            help='token to return for unavailable words')
         dictionary.add_argument(
            '--dict-eostoken', default=DictionaryAgent.default_eos,
            help='token for end of sentence markers, if needed')
         dictionary.add_argument(
-            '--dict-unktoken', default=DictionaryAgent.default_unk,
-            help='token to return for unavailable words')
+           '--dict-gotoken', default=DictionaryAgent.default_go,
+           help='token for starting sentence generation, if needed')
         dictionary.add_argument(
             '--dict-maxexs', default=100000, type=int,
             help='max number of examples to build dict on')
@@ -115,8 +119,9 @@ def __init__(self, opt, shared=None):
         # initialize fields
         self.opt = copy.deepcopy(opt)
         self.null_token = opt['dict_nulltoken']
-        self.eos_token = opt['dict_eostoken']
         self.unk_token = opt['dict_unktoken']
+        self.eos_token = opt['dict_eostoken']
+        self.go_token = opt['dict_gotoken']
         self.max_ngram_size = opt['dict_max_ngram_size']
 
         if shared:
@@ -132,17 +137,23 @@ def __init__(self, opt, shared=None):
                 self.tok2ind[self.null_token] = 0
                 self.ind2tok[0] = self.null_token
 
+            if self.unk_token:
+                # set special unknown word token
+                index = len(self.tok2ind)
+                self.tok2ind[self.unk_token] = index
+                self.ind2tok[index] = self.unk_token
+
             if self.eos_token:
-                # set special end of sentence token
+                # set special end of sentence word token
                 index = len(self.tok2ind)
                 self.tok2ind[self.eos_token] = index
                 self.ind2tok[index] = self.eos_token
 
-            if self.unk_token:
-                # set special unknown word token
+            if self.go_token:
+                # set special start of sentence word token
                 index = len(self.tok2ind)
-                self.tok2ind[self.unk_token] = index
-                self.ind2tok[index] = self.unk_token
+                self.tok2ind[self.go_token] = index
+                self.ind2tok[index] = self.go_token
 
             if opt.get('dict_file') and os.path.isfile(opt['dict_file']):
                 # load pre-existing dictionary
@@ -164,13 +175,17 @@ def __init__(self, opt, shared=None):
 
         if not shared:
 
-            if self.null_token:
-                # fix count for null token to one billion and two
-                self.freq[self.null_token] = 1000000002
+            if self.go_token:
+                # fix count for start of sentence token to one billion and three
+                self.freq[self.go_token] = 1000000003
 
             if self.eos_token:
-                # fix count for end of sentence token to one billion and one
-                self.freq[self.eos_token] = 1000000001
+                # fix count for end of sentence token to one billion and two
+                self.freq[self.eos_token] = 1000000002
+
+            if self.null_token:
+                # fix count for null token to one billion and one
+                self.freq[self.null_token] = 1000000001
 
             if self.unk_token:
                 # fix count for unknown token to one billion

From 8a0e4aaba6cddbe34012b6dfbe7e7d3448cd39b1 Mon Sep 17 00:00:00 2001
From: henrye <c.henry.elder@gmail.com>
Date: Wed, 19 Jul 2017 08:39:58 +0100
Subject: [PATCH 3/3] restore original order of special tokens; rename start
 and end tokens

---
 parlai/core/dict.py | 50 ++++++++++++++++++++++-----------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/parlai/core/dict.py b/parlai/core/dict.py
index 5e20cb97c98..a287ab99677 100644
--- a/parlai/core/dict.py
+++ b/parlai/core/dict.py
@@ -71,9 +71,9 @@ class DictionaryAgent(Agent):
     default_maxngram = -1
     default_minfreq = 0
     default_null = '__NULL__'
+    default_end = '__END__'
     default_unk = '__UNK__'
-    default_eos = '__EOS__'
-    default_go = '__GO__'
+    default_start = '__START__'
 
     @staticmethod
     def add_cmdline_args(argparser):
@@ -101,14 +101,14 @@ def add_cmdline_args(argparser):
         dictionary.add_argument(
            '--dict-nulltoken', default=DictionaryAgent.default_null,
            help='empty token, can be used for padding or just empty values')
+        dictionary.add_argument(
+           '--dict-endtoken', default=DictionaryAgent.default_end,
+           help='token for end of sentence markers, if needed')
         dictionary.add_argument(
             '--dict-unktoken', default=DictionaryAgent.default_unk,
             help='token to return for unavailable words')
         dictionary.add_argument(
-           '--dict-eostoken', default=DictionaryAgent.default_eos,
-           help='token for end of sentence markers, if needed')
-        dictionary.add_argument(
-           '--dict-gotoken', default=DictionaryAgent.default_go,
+           '--dict-starttoken', default=DictionaryAgent.default_start,
            help='token for starting sentence generation, if needed')
         dictionary.add_argument(
             '--dict-maxexs', default=100000, type=int,
@@ -119,9 +119,9 @@ def __init__(self, opt, shared=None):
         # initialize fields
         self.opt = copy.deepcopy(opt)
         self.null_token = opt['dict_nulltoken']
+        self.end_token = opt['dict_endtoken']
         self.unk_token = opt['dict_unktoken']
-        self.eos_token = opt['dict_eostoken']
-        self.go_token = opt['dict_gotoken']
+        self.start_token = opt['dict_starttoken']
         self.max_ngram_size = opt['dict_max_ngram_size']
 
         if shared:
@@ -137,23 +137,23 @@ def __init__(self, opt, shared=None):
                 self.tok2ind[self.null_token] = 0
                 self.ind2tok[0] = self.null_token
 
+            if self.end_token:
+                # set special end of sentence word token
+                index = len(self.tok2ind)
+                self.tok2ind[self.end_token] = index
+                self.ind2tok[index] = self.end_token
+
             if self.unk_token:
                 # set special unknown word token
                 index = len(self.tok2ind)
                 self.tok2ind[self.unk_token] = index
                 self.ind2tok[index] = self.unk_token
 
-            if self.eos_token:
-                # set special end of sentence word token
-                index = len(self.tok2ind)
-                self.tok2ind[self.eos_token] = index
-                self.ind2tok[index] = self.eos_token
-
-            if self.go_token:
+            if self.start_token:
                 # set special start of sentence word token
                 index = len(self.tok2ind)
-                self.tok2ind[self.go_token] = index
-                self.ind2tok[index] = self.go_token
+                self.tok2ind[self.start_token] = index
+                self.ind2tok[index] = self.start_token
 
             if opt.get('dict_file') and os.path.isfile(opt['dict_file']):
                 # load pre-existing dictionary
@@ -175,17 +175,17 @@ def __init__(self, opt, shared=None):
 
         if not shared:
 
-            if self.go_token:
+            if self.start_token:
                 # fix count for start of sentence token to one billion and three
-                self.freq[self.go_token] = 1000000003
-
-            if self.eos_token:
-                # fix count for end of sentence token to one billion and two
-                self.freq[self.eos_token] = 1000000002
+                self.freq[self.start_token] = 1000000003
 
             if self.null_token:
-                # fix count for null token to one billion and one
-                self.freq[self.null_token] = 1000000001
+                # fix count for null token to one billion and two
+                self.freq[self.null_token] = 1000000002
+
+            if self.end_token:
+                # fix count for end of sentence token to one billion and one
+                self.freq[self.end_token] = 1000000001
 
             if self.unk_token:
                 # fix count for unknown token to one billion