From 81422c4e6d213767dc075f20049e8fd201675029 Mon Sep 17 00:00:00 2001
From: Aymeric Augustin <aymeric.augustin@fractalideas.com>
Date: Mon, 23 Dec 2019 22:23:44 +0100
Subject: [PATCH 1/7] Remove unused variables in examples.

---
 examples/contrib/run_openai_gpt.py         | 6 ------
 examples/contrib/run_transfo_xl.py         | 4 +---
 examples/run_multiple_choice.py            | 3 +--
 examples/summarization/modeling_bertabs.py | 5 -----
 4 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/examples/contrib/run_openai_gpt.py b/examples/contrib/run_openai_gpt.py
index 80331f3402..136e25821f 100644
--- a/examples/contrib/run_openai_gpt.py
+++ b/examples/contrib/run_openai_gpt.py
@@ -44,13 +44,10 @@ from transformers import (
     AdamW,
     OpenAIGPTDoubleHeadsModel,
     OpenAIGPTTokenizer,
-    cached_path,
     get_linear_schedule_with_warmup,
 )
 
 
-ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
-
 logging.basicConfig(
     format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
 )
@@ -182,9 +179,6 @@ def main():
     model.to(device)
 
     # Load and encode the datasets
-    if not args.train_dataset and not args.eval_dataset:
-        roc_stories = cached_path(ROCSTORIES_URL)
-
     def tokenize_and_encode(obj):
         """ Tokenize and encode a nested object """
         if isinstance(obj, str):
diff --git a/examples/contrib/run_transfo_xl.py b/examples/contrib/run_transfo_xl.py
index ae4efbe00e..84e2806a7b 100644
--- a/examples/contrib/run_transfo_xl.py
+++ b/examples/contrib/run_transfo_xl.py
@@ -28,7 +28,7 @@ import time
 
 import torch
 
-from transformers import TransfoXLCorpus, TransfoXLLMHeadModel, TransfoXLTokenizer
+from transformers import TransfoXLCorpus, TransfoXLLMHeadModel
 
 
 logging.basicConfig(
@@ -73,9 +73,7 @@ def main():
     # The pre-processing involve computing word frequencies to prepare the Adaptive input and SoftMax
     # and tokenizing the dataset
     # The pre-processed corpus is a convertion (using the conversion script )
-    tokenizer = TransfoXLTokenizer.from_pretrained(args.model_name)
     corpus = TransfoXLCorpus.from_pretrained(args.model_name)
-    ntokens = len(corpus.vocab)
 
     va_iter = corpus.get_iterator("valid", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
     te_iter = corpus.get_iterator("test", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py
index 7989422889..69202ab5d5 100644
--- a/examples/run_multiple_choice.py
+++ b/examples/run_multiple_choice.py
@@ -141,7 +141,7 @@ def train(args, train_dataset, model, tokenizer):
 
     global_step = 0
     tr_loss, logging_loss = 0.0, 0.0
-    best_dev_acc, best_dev_loss = 0.0, 99999999999.0
+    best_dev_acc = 0.0
     best_steps = 0
     model.zero_grad()
     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
@@ -193,7 +193,6 @@ def train(args, train_dataset, model, tokenizer):
                             tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                         if results["eval_acc"] > best_dev_acc:
                             best_dev_acc = results["eval_acc"]
-                            best_dev_loss = results["eval_loss"]
                             best_steps = global_step
                             if args.do_test:
                                 results_test = evaluate(args, model, tokenizer, test=True)
diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/modeling_bertabs.py
index 22e50b5e78..4dd89ada88 100644
--- a/examples/summarization/modeling_bertabs.py
+++ b/examples/summarization/modeling_bertabs.py
@@ -446,8 +446,6 @@ class MultiHeadedAttention(nn.Module):
         batch_size = key.size(0)
         dim_per_head = self.dim_per_head
         head_count = self.head_count
-        key_len = key.size(1)
-        query_len = query.size(1)
 
         def shape(x):
             """  projection """
@@ -504,9 +502,6 @@ class MultiHeadedAttention(nn.Module):
 
         query = shape(query)
 
-        key_len = key.size(2)
-        query_len = query.size(2)
-
         # 2) Calculate and scale scores.
         query = query / math.sqrt(dim_per_head)
         scores = torch.matmul(query, key.transpose(2, 3))

From 71f94a8a1c89577ec4482b3e5600fbcdfb3dd1a8 Mon Sep 17 00:00:00 2001
From: Aymeric Augustin <aymeric.augustin@fractalideas.com>
Date: Mon, 23 Dec 2019 22:28:34 +0100
Subject: [PATCH 2/7] Remove unused variables in src.

---
 src/transformers/data/metrics/__init__.py            | 2 +-
 src/transformers/modeling_albert.py                  | 5 -----
 src/transformers/modeling_t5.py                      | 1 -
 src/transformers/modeling_tf_pytorch_utils.py        | 7 +++----
 src/transformers/modeling_tf_t5.py                   | 1 -
 src/transformers/modeling_tf_transfo_xl_utilities.py | 1 -
 src/transformers/modeling_tf_utils.py                | 6 +++---
 7 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/src/transformers/data/metrics/__init__.py b/src/transformers/data/metrics/__init__.py
index f65c76faeb..48cd3b99af 100644
--- a/src/transformers/data/metrics/__init__.py
+++ b/src/transformers/data/metrics/__init__.py
@@ -19,7 +19,7 @@ try:
     from sklearn.metrics import matthews_corrcoef, f1_score
 
     _has_sklearn = True
-except (AttributeError, ImportError) as e:
+except (AttributeError, ImportError):
     _has_sklearn = False
 
 
diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py
index 5162a1d1de..c663b7b8ec 100644
--- a/src/transformers/modeling_albert.py
+++ b/src/transformers/modeling_albert.py
@@ -241,8 +241,6 @@ class AlbertAttention(BertSelfAttention):
         context_layer = torch.matmul(attention_probs, value_layer)
 
         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
-        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
-        reshaped_context_layer = context_layer.view(*new_context_layer_shape)
 
         # Should find a better way to do this
         w = (
@@ -334,9 +332,6 @@ class AlbertTransformer(nn.Module):
             # Index of the hidden group
             group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
 
-            # Index of the layer inside the group
-            layer_idx = int(i - group_idx * layers_per_group)
-
             layer_group_output = self.albert_layer_groups[group_idx](
                 hidden_states,
                 attention_mask,
diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py
index 9c169d5016..576eb89d88 100644
--- a/src/transformers/modeling_t5.py
+++ b/src/transformers/modeling_t5.py
@@ -629,7 +629,6 @@ class T5Stack(T5PreTrainedModel):
                 all_attentions = all_attentions + (layer_outputs[1],)  # We keep only self-attention weights for now
 
         hidden_states = self.final_layer_norm(hidden_states)
-        layer_output = self.dropout(hidden_states)
 
         # Add last layer
         if self.output_hidden_states:
diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py
index 94d15ba74b..81290326c9 100644
--- a/src/transformers/modeling_tf_pytorch_utils.py
+++ b/src/transformers/modeling_tf_pytorch_utils.py
@@ -122,7 +122,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
         tf_inputs = tf_model.dummy_inputs
 
     if tf_inputs is not None:
-        tfo = tf_model(tf_inputs, training=False)  # Make sure model is built
+        tf_model(tf_inputs, training=False)  # Make sure model is built
 
     # Adapt state dict - TODO remove this and update the AWS weights files instead
     # Convert old format to new format if needed from a PyTorch state_dict
@@ -187,7 +187,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
     K.batch_set_value(weight_value_tuples)
 
     if tf_inputs is not None:
-        tfo = tf_model(tf_inputs, training=False)  # Make sure restore ops are run
+        tf_model(tf_inputs, training=False)  # Make sure restore ops are run
 
     logger.info("Loaded {:,} parameters in the TF 2.0 model.".format(tf_loaded_numel))
 
@@ -218,7 +218,6 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
 
     import transformers
 
-    tf_path = os.path.abspath(tf_checkpoint_path)
     logger.info("Loading TensorFlow weights from {}".format(tf_checkpoint_path))
 
     # Instantiate and load the associated TF 2.0 model
@@ -230,7 +229,7 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
         tf_inputs = tf_model.dummy_inputs
 
     if tf_inputs is not None:
-        tfo = tf_model(tf_inputs, training=False)  # Make sure model is built
+        tf_model(tf_inputs, training=False)  # Make sure model is built
 
     tf_model.load_weights(tf_checkpoint_path, by_name=True)
 
diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py
index 5840407273..43f5517be9 100644
--- a/src/transformers/modeling_tf_t5.py
+++ b/src/transformers/modeling_tf_t5.py
@@ -491,7 +491,6 @@ class TFT5MainLayer(tf.keras.layers.Layer):
                 all_attentions = all_attentions + (layer_outputs[1],)
 
         hidden_states = self.final_layer_norm(hidden_states)
-        layer_output = self.dropout(hidden_states, training=training)
 
         # Add last layer
         if self.output_hidden_states:
diff --git a/src/transformers/modeling_tf_transfo_xl_utilities.py b/src/transformers/modeling_tf_transfo_xl_utilities.py
index cd32d86390..23ffb639f7 100644
--- a/src/transformers/modeling_tf_transfo_xl_utilities.py
+++ b/src/transformers/modeling_tf_transfo_xl_utilities.py
@@ -118,7 +118,6 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
         hidden, target = inputs
         head_logprob = 0
         if self.n_clusters == 0:
-            softmax_b = tf.get_variable("bias", [self.config.vocab_size], initializer=tf.zeros_initializer())
             output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
             if target is not None:
                 loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index bfb773e38a..b9c0adac38 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -320,7 +320,7 @@ class TFPreTrainedModel(tf.keras.Model):
             # Load from a PyTorch checkpoint
             return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True)
 
-        ret = model(model.dummy_inputs, training=False)  # build the network with dummy inputs
+        model(model.dummy_inputs, training=False)  # build the network with dummy inputs
 
         assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file)
         # 'by_name' allow us to do transfer learning by skipping/adding layers
@@ -333,7 +333,7 @@ class TFPreTrainedModel(tf.keras.Model):
                 "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. "
             )
 
-        ret = model(model.dummy_inputs, training=False)  # Make sure restore ops are run
+        model(model.dummy_inputs, training=False)  # Make sure restore ops are run
 
         # Check if the models are the same to output loading informations
         with h5py.File(resolved_archive_file, "r") as f:
@@ -515,7 +515,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
             cls_index = inputs[1] if len(inputs) > 1 else None
             assert len(inputs) <= 2, "Too many inputs."
         else:
-            input_ids = inputs.get("input_ids")
+            hidden_states = inputs.get("hidden_states")
             cls_index = inputs.get("cls_index", None)
 
         if self.summary_type == "last":

From 495580dad193b7a8405b717b60089574de6563c7 Mon Sep 17 00:00:00 2001
From: Aymeric Augustin <aymeric.augustin@fractalideas.com>
Date: Mon, 23 Dec 2019 22:36:21 +0100
Subject: [PATCH 3/7] Remove unused variables in templates.

---
 templates/adding_a_new_example_script/utils_xxx.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/templates/adding_a_new_example_script/utils_xxx.py b/templates/adding_a_new_example_script/utils_xxx.py
index d766cf32a6..b8f8cdf2b9 100644
--- a/templates/adding_a_new_example_script/utils_xxx.py
+++ b/templates/adding_a_new_example_script/utils_xxx.py
@@ -868,8 +868,6 @@ def write_predictions_extended(
         orig_data = json.load(reader)["data"]
 
     qid_to_has_ans = make_qid_to_has_ans(orig_data)
-    has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
-    no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
     exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions)
     out_eval = {}
 

From e6c0019c80d03b86e7fc051a9c51c55d9a4e7ba7 Mon Sep 17 00:00:00 2001
From: Aymeric Augustin <aymeric.augustin@fractalideas.com>
Date: Mon, 23 Dec 2019 22:31:39 +0100
Subject: [PATCH 4/7] Remove unused variables in tests.

---
 tests/test_modeling_common.py     | 3 +--
 tests/test_modeling_tf_common.py  | 2 +-
 tests/test_modeling_tf_xlm.py     | 1 -
 tests/test_tokenization_common.py | 2 --
 4 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 2c634ed958..719debcb3c 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -284,7 +284,6 @@ class ModelTesterMixin:
             multihead_outputs = head_mask.grad
 
             attentions = outputs[-1]
-            hidden_states = outputs[-2]
 
             # Remove Nan
             for t in attentions:
@@ -590,7 +589,7 @@ class ModelTesterMixin:
                 inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)
 
             with torch.no_grad():
-                outputs = model(**inputs_dict)
+                model(**inputs_dict)
 
 
 class ConfigTester(object):
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 114d69ed1b..e6fb1439c1 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -332,7 +332,7 @@ class TFModelTesterMixin:
                 inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
                 inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)
 
-            outputs = model(inputs_dict)
+            model(inputs_dict)
 
 
 def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
diff --git a/tests/test_modeling_tf_xlm.py b/tests/test_modeling_tf_xlm.py
index 5160fde479..53719f63f4 100644
--- a/tests/test_modeling_tf_xlm.py
+++ b/tests/test_modeling_tf_xlm.py
@@ -224,7 +224,6 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
 
             inputs = {"input_ids": input_ids, "lengths": input_lengths}
 
-            outputs = model(inputs)
             start_logits, end_logits = model(inputs)
 
             result = {
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 035a0dc27f..c10dfb4785 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -159,7 +159,6 @@ class TokenizerTesterMixin:
         self.assertEqual(all_size_2, all_size + len(new_toks))
 
         tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
-        out_string = tokenizer.decode(tokens)
 
         self.assertGreaterEqual(len(tokens), 4)
         self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
@@ -178,7 +177,6 @@ class TokenizerTesterMixin:
         tokens = tokenizer.encode(
             ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
         )
-        out_string = tokenizer.decode(tokens)
 
         self.assertGreaterEqual(len(tokens), 6)
         self.assertGreater(tokens[0], tokenizer.vocab_size - 1)

From e74c73a85d28fe3d4a1bc6199565056b614eb341 Mon Sep 17 00:00:00 2001
From: Aymeric Augustin <aymeric.augustin@fractalideas.com>
Date: Mon, 23 Dec 2019 22:38:23 +0100
Subject: [PATCH 5/7] Enable F841 warning in flake8.

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index f59ce55df7..47f12faf3e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -24,5 +24,5 @@ multi_line_output = 3
 use_parentheses = True
 
 [flake8]
-ignore = E203, E501, F841, W503
+ignore = E203, E501, W503
 max-line-length = 119

From 35d32308def0bf40ae5f348e088a92d2948a0bd7 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 24 Dec 2019 11:29:49 +0100
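Subject: [PATCH 6/7] adding back final dropout in T5

PATCH 2/7 removed `layer_output = self.dropout(hidden_states)` from T5Stack
because the assigned variable was never used. Restore the dropout call and
assign its result back to `hidden_states`, so that the output of the final
dropout is what the code following the final layer norm operates on.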

---
 src/transformers/modeling_t5.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py
index 576eb89d88..459e49d0b2 100644
--- a/src/transformers/modeling_t5.py
+++ b/src/transformers/modeling_t5.py
@@ -629,6 +629,7 @@ class T5Stack(T5PreTrainedModel):
                 all_attentions = all_attentions + (layer_outputs[1],)  # We keep only self-attention weights for now
 
         hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.dropout(hidden_states)
 
         # Add last layer
         if self.output_hidden_states:

From 3e0cf49514d2185d3142000a717afa33c807546e Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 24 Dec 2019 11:30:56 +0100
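Subject: [PATCH 7/7] adding back last dropout in TF 2.0 T5

PATCH 2/7 removed `layer_output = self.dropout(hidden_states, training=training)`
from TFT5MainLayer because the assigned variable was never used. Restore the
dropout call and assign its result back to `hidden_states`, mirroring the
PyTorch change in PATCH 6/7.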

---
 src/transformers/modeling_tf_t5.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py
index 43f5517be9..af928cf395 100644
--- a/src/transformers/modeling_tf_t5.py
+++ b/src/transformers/modeling_tf_t5.py
@@ -491,6 +491,7 @@ class TFT5MainLayer(tf.keras.layers.Layer):
                 all_attentions = all_attentions + (layer_outputs[1],)
 
         hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.dropout(hidden_states, training=training)
 
         # Add last layer
         if self.output_hidden_states: