huggingface · thomwolf · Dec 28, 2019 · Dec 28, 2019
diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py
@@ -56,7 +56,6 @@
     write_predictions,
     write_predictions_extended,
 )
-
 # The follwing import is the official SQuAD evaluation script (2.0).
 # You can remove it from the dependencies if you are using this script outside of the library
 # We've added it here for automated tests (see examples/test_examples.py file)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
@@ -84,7 +84,7 @@ def __init__(self, tokenizer, args, file_path="train", block_size=512):
         assert os.path.isfile(file_path)
         directory, filename = os.path.split(file_path)
         cached_features_file = os.path.join(
-            directory, args.model_name_or_path + "_cached_lm_" + str(block_size) + "_" + filename
+            directory, args.model_name_or_path + "_cached_lm_" + str(block_size) + "_" + filename + ".bin"
         )
 
         if os.path.exists(cached_features_file) and not args.overwrite_cache:
@@ -625,6 +625,12 @@ def main():
         args.config_name if args.config_name else args.model_name_or_path,
         cache_dir=args.cache_dir if args.cache_dir else None,
     )
+
+    # Deactivate past output for now (reduces memory usage)
+    # we use it only for GPT/GPT2 generation or specific XLNet/Transformer-XL trainings (not implemented currently)
+    if hasattr(config, "output_past"):
+        config.output_past = False
+
     tokenizer = tokenizer_class.from_pretrained(
         args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
         do_lower_case=args.do_lower_case,

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -31,7 +31,6 @@
 from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
 from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
 from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
-
 # Configurations
 from .configuration_utils import PretrainedConfig
 from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig
@@ -56,7 +55,6 @@
     xnli_processors,
     xnli_tasks_num_labels,
 )
-
 # Files and general utilities
 from .file_utils import (
     CONFIG_NAME,
@@ -73,10 +71,8 @@
     is_tf_available,
     is_torch_available,
 )
-
 # Model Cards
 from .modelcard import ModelCard
-
 # TF 2.0 <=> PyTorch conversion utilities
 from .modeling_tf_pytorch_utils import (
     convert_tf_weight_name_to_pt_weight_name,
@@ -87,7 +83,6 @@
     load_tf2_model_in_pytorch_model,
     load_tf2_weights_in_pytorch_model,
 )
-
 # Pipelines
 from .pipelines import (
     CsvPipelineDataFormat,
@@ -113,7 +108,6 @@
 from .tokenization_roberta import RobertaTokenizer
 from .tokenization_t5 import T5Tokenizer
 from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer
-
 # Tokenizers
 from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_xlm import XLMTokenizer

diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py
@@ -103,6 +103,7 @@ class Attention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False):
         super(Attention, self).__init__()
         self.output_attentions = config.output_attentions
+        self.output_past = config.output_past
 
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
         # [switch nx => n_state from Block to Attention to keep identical to TF implem]
@@ -159,10 +160,11 @@ def _attn(self, q, k, v, attention_mask=None, head_mask=None):
         if head_mask is not None:
             w = w * head_mask
 
-        outputs = [torch.matmul(w, v)]
+        x = torch.matmul(w, v)
         if self.output_attentions:
-            outputs.append(w)
-        return outputs
+            return (x, w)
+        else:
+            return (x,)
 
     def merge_heads(self, x):
         x = x.permute(0, 2, 1, 3).contiguous()
@@ -187,7 +189,8 @@ def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
             past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1]  # transpose back cf below
             key = torch.cat((past_key, key), dim=-1)
             value = torch.cat((past_value, value), dim=-2)
-        present = torch.stack((key.transpose(-2, -1), value))  # transpose to have same shapes for stacking
+        if self.output_past:
+            present = torch.stack((key.transpose(-2, -1), value))  # transpose to have same shapes for stacking
 
         attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
         a = attn_outputs[0]
@@ -196,8 +199,11 @@ def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
         a = self.c_proj(a)
         a = self.resid_dropout(a)
 
-        outputs = [a, present] + attn_outputs[1:]
-        return outputs  # a, present, (attentions)
+        if self.output_past:
+            outputs = (a, present) + attn_outputs[1:]
+        else:
+            outputs = (a,) + attn_outputs[1:]
+        return outputs  # a, (present), (attentions)
 
 
 class MLP(nn.Module):
@@ -228,14 +234,14 @@ def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
         output_attn = self.attn(
             self.ln_1(x), layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask
         )
-        a = output_attn[0]  # output_attn: a, present, (attentions)
+        a = output_attn[0]  # output_attn: a, (present), (attentions)
 
         x = x + a
         m = self.mlp(self.ln_2(x))
         x = x + m
 
-        outputs = [x] + output_attn[1:]
-        return outputs  # x, present, (attentions)
+        outputs = (x,) + output_attn[1:]
+        return outputs  # x, (present), (attentions)
 
 
 class GPT2PreTrainedModel(PreTrainedModel):
@@ -465,7 +471,7 @@ def forward(
         output_shape = input_shape + (hidden_states.size(-1),)
 
         presents = ()
-        all_attentions = []
+        all_attentions = ()
         all_hidden_states = ()
         for i, (block, layer_past) in enumerate(zip(self.h, past)):
             if self.output_hidden_states:
@@ -475,12 +481,12 @@ def forward(
                 hidden_states, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i]
             )
 
-            hidden_states, present = outputs[:2]
+            hidden_states = outputs[0]
             if self.output_past:
-                presents = presents + (present,)
+                presents = presents + (outputs[1],)
 
             if self.output_attentions:
-                all_attentions.append(outputs[2])
+                all_attentions = all_attentions + (outputs[2 if self.output_past else 1],)
 
         hidden_states = self.ln_f(hidden_states)
 

diff --git a/templates/adding_a_new_example_script/run_xxx.py b/templates/adding_a_new_example_script/run_xxx.py
@@ -52,7 +52,6 @@
     write_predictions,
     write_predictions_extended,
 )
-
 # The follwing import is the official SQuAD evaluation script (2.0).
 # You can remove it from the dependencies if you are using this script outside of the library
 # We've added it here for automated tests (see examples/test_examples.py file)

diff --git a/templates/adding_a_new_example_script/utils_xxx.py b/templates/adding_a_new_example_script/utils_xxx.py
@@ -21,7 +21,6 @@
 import math
 
 from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
-
 # Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
 from utils_squad_evaluate import find_all_best_thresh_v2, get_raw_scores, make_qid_to_has_ans