From 81422c4e6d213767dc075f20049e8fd201675029 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Mon, 23 Dec 2019 22:23:44 +0100 Subject: [PATCH 1/7] Remove unused variables in examples. --- examples/contrib/run_openai_gpt.py | 6 ------ examples/contrib/run_transfo_xl.py | 4 +--- examples/run_multiple_choice.py | 3 +-- examples/summarization/modeling_bertabs.py | 5 ----- 4 files changed, 2 insertions(+), 16 deletions(-) diff --git a/examples/contrib/run_openai_gpt.py b/examples/contrib/run_openai_gpt.py index 80331f3402..136e25821f 100644 --- a/examples/contrib/run_openai_gpt.py +++ b/examples/contrib/run_openai_gpt.py @@ -44,13 +44,10 @@ from transformers import ( AdamW, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, - cached_path, get_linear_schedule_with_warmup, ) -ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz" - logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO ) @@ -182,9 +179,6 @@ def main(): model.to(device) # Load and encode the datasets - if not args.train_dataset and not args.eval_dataset: - roc_stories = cached_path(ROCSTORIES_URL) - def tokenize_and_encode(obj): """ Tokenize and encode a nested object """ if isinstance(obj, str): diff --git a/examples/contrib/run_transfo_xl.py b/examples/contrib/run_transfo_xl.py index ae4efbe00e..84e2806a7b 100644 --- a/examples/contrib/run_transfo_xl.py +++ b/examples/contrib/run_transfo_xl.py @@ -28,7 +28,7 @@ import time import torch -from transformers import TransfoXLCorpus, TransfoXLLMHeadModel, TransfoXLTokenizer +from transformers import TransfoXLCorpus, TransfoXLLMHeadModel logging.basicConfig( @@ -73,9 +73,7 @@ def main(): # The pre-processing involve computing word frequencies to prepare the Adaptive input and SoftMax # and tokenizing the dataset # The pre-processed corpus is a convertion (using the conversion script ) - tokenizer = TransfoXLTokenizer.from_pretrained(args.model_name) corpus = TransfoXLCorpus.from_pretrained(args.model_name) - ntokens = len(corpus.vocab) va_iter = corpus.get_iterator("valid", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len) te_iter = corpus.get_iterator("test", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len) diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index 7989422889..69202ab5d5 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -141,7 +141,7 @@ def train(args, train_dataset, model, tokenizer): global_step = 0 tr_loss, logging_loss = 0.0, 0.0 - best_dev_acc, best_dev_loss = 0.0, 99999999999.0 + best_dev_acc = 0.0 best_steps = 0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) @@ -193,7 +193,6 @@ def train(args, train_dataset, model, tokenizer): tb_writer.add_scalar("eval_{}".format(key), value, global_step) if results["eval_acc"] > best_dev_acc: best_dev_acc = results["eval_acc"] - best_dev_loss = results["eval_loss"] best_steps = global_step if args.do_test: results_test = evaluate(args, model, tokenizer, test=True) diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/modeling_bertabs.py index 22e50b5e78..4dd89ada88 100644 --- a/examples/summarization/modeling_bertabs.py +++ b/examples/summarization/modeling_bertabs.py @@ -446,8 +446,6 @@ class MultiHeadedAttention(nn.Module): batch_size = key.size(0) dim_per_head = self.dim_per_head head_count = self.head_count - key_len = key.size(1) - query_len = query.size(1) def shape(x): """ projection """ @@ -504,9 +502,6 @@ class MultiHeadedAttention(nn.Module): query = shape(query) - key_len = key.size(2) - query_len = query.size(2) - # 2) Calculate and scale scores. query = query / math.sqrt(dim_per_head) scores = torch.matmul(query, key.transpose(2, 3)) From 71f94a8a1c89577ec4482b3e5600fbcdfb3dd1a8 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Mon, 23 Dec 2019 22:28:34 +0100 Subject: [PATCH 2/7] Remove unused variables in src. --- src/transformers/data/metrics/__init__.py | 2 +- src/transformers/modeling_albert.py | 5 ----- src/transformers/modeling_t5.py | 1 - src/transformers/modeling_tf_pytorch_utils.py | 7 +++---- src/transformers/modeling_tf_t5.py | 1 - src/transformers/modeling_tf_transfo_xl_utilities.py | 1 - src/transformers/modeling_tf_utils.py | 6 +++--- 7 files changed, 7 insertions(+), 16 deletions(-) diff --git a/src/transformers/data/metrics/__init__.py b/src/transformers/data/metrics/__init__.py index f65c76faeb..48cd3b99af 100644 --- a/src/transformers/data/metrics/__init__.py +++ b/src/transformers/data/metrics/__init__.py @@ -19,7 +19,7 @@ try: from sklearn.metrics import matthews_corrcoef, f1_score _has_sklearn = True -except (AttributeError, ImportError) as e: +except (AttributeError, ImportError): _has_sklearn = False diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py index 5162a1d1de..c663b7b8ec 100644 --- a/src/transformers/modeling_albert.py +++ b/src/transformers/modeling_albert.py @@ -241,8 +241,6 @@ class AlbertAttention(BertSelfAttention): context_layer = torch.matmul(attention_probs, value_layer) context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - reshaped_context_layer = context_layer.view(*new_context_layer_shape) # Should find a better way to do this w = ( @@ -334,9 +332,6 @@ class AlbertTransformer(nn.Module): # Index of the hidden group group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups)) - # Index of the layer inside the group - layer_idx = int(i - group_idx * layers_per_group) - layer_group_output = self.albert_layer_groups[group_idx]( hidden_states, attention_mask, diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py index 9c169d5016..576eb89d88 100644 --- a/src/transformers/modeling_t5.py +++ b/src/transformers/modeling_t5.py @@ -629,7 +629,6 @@ class T5Stack(T5PreTrainedModel): all_attentions = all_attentions + (layer_outputs[1],) # We keep only self-attention weights for now hidden_states = self.final_layer_norm(hidden_states) - layer_output = self.dropout(hidden_states) # Add last layer if self.output_hidden_states: diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py index 94d15ba74b..81290326c9 100644 --- a/src/transformers/modeling_tf_pytorch_utils.py +++ b/src/transformers/modeling_tf_pytorch_utils.py @@ -122,7 +122,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a tf_inputs = tf_model.dummy_inputs if tf_inputs is not None: - tfo = tf_model(tf_inputs, training=False) # Make sure model is built + tf_model(tf_inputs, training=False) # Make sure model is built # Adapt state dict - TODO remove this and update the AWS weights files instead # Convert old format to new format if needed from a PyTorch state_dict @@ -187,7 +187,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a K.batch_set_value(weight_value_tuples) if tf_inputs is not None: - tfo = tf_model(tf_inputs, training=False) # Make sure restore ops are run + tf_model(tf_inputs, training=False) # Make sure restore ops are run logger.info("Loaded {:,} parameters in the TF 2.0 model.".format(tf_loaded_numel)) @@ -218,7 +218,6 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs import transformers - tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Loading TensorFlow weights from {}".format(tf_checkpoint_path)) # Instantiate and load the associated TF 2.0 model @@ -230,7 +229,7 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs tf_inputs = tf_model.dummy_inputs if tf_inputs is not None: - tfo = tf_model(tf_inputs, training=False) # Make sure model is built + tf_model(tf_inputs, training=False) # Make sure model is built tf_model.load_weights(tf_checkpoint_path, by_name=True) diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py index 5840407273..43f5517be9 100644 --- a/src/transformers/modeling_tf_t5.py +++ b/src/transformers/modeling_tf_t5.py @@ -491,7 +491,6 @@ class TFT5MainLayer(tf.keras.layers.Layer): all_attentions = all_attentions + (layer_outputs[1],) hidden_states = self.final_layer_norm(hidden_states) - layer_output = self.dropout(hidden_states, training=training) # Add last layer if self.output_hidden_states: diff --git a/src/transformers/modeling_tf_transfo_xl_utilities.py b/src/transformers/modeling_tf_transfo_xl_utilities.py index cd32d86390..23ffb639f7 100644 --- a/src/transformers/modeling_tf_transfo_xl_utilities.py +++ b/src/transformers/modeling_tf_transfo_xl_utilities.py @@ -118,7 +118,6 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): hidden, target = inputs head_logprob = 0 if self.n_clusters == 0: - softmax_b = tf.get_variable("bias", [self.config.vocab_size], initializer=tf.zeros_initializer()) output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0]) if target is not None: loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index bfb773e38a..b9c0adac38 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -320,7 +320,7 @@ class TFPreTrainedModel(tf.keras.Model): # Load from a PyTorch checkpoint return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True) - ret = model(model.dummy_inputs, training=False) # build the network with dummy inputs + model(model.dummy_inputs, training=False) # build the network with dummy inputs assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file) # 'by_name' allow us to do transfer learning by skipping/adding layers @@ -333,7 +333,7 @@ class TFPreTrainedModel(tf.keras.Model): "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. " ) - ret = model(model.dummy_inputs, training=False) # Make sure restore ops are run + model(model.dummy_inputs, training=False) # Make sure restore ops are run # Check if the models are the same to output loading informations with h5py.File(resolved_archive_file, "r") as f: @@ -515,7 +515,7 @@ class TFSequenceSummary(tf.keras.layers.Layer): cls_index = inputs[1] if len(inputs) > 1 else None assert len(inputs) <= 2, "Too many inputs." else: - input_ids = inputs.get("input_ids") + hidden_states = inputs.get("hidden_states") cls_index = inputs.get("cls_index", None) if self.summary_type == "last": From 495580dad193b7a8405b717b60089574de6563c7 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Mon, 23 Dec 2019 22:36:21 +0100 Subject: [PATCH 3/7] Remove unused variables in templates. --- templates/adding_a_new_example_script/utils_xxx.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/templates/adding_a_new_example_script/utils_xxx.py b/templates/adding_a_new_example_script/utils_xxx.py index d766cf32a6..b8f8cdf2b9 100644 --- a/templates/adding_a_new_example_script/utils_xxx.py +++ b/templates/adding_a_new_example_script/utils_xxx.py @@ -868,8 +868,6 @@ def write_predictions_extended( orig_data = json.load(reader)["data"] qid_to_has_ans = make_qid_to_has_ans(orig_data) - has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] - no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions) out_eval = {} From e6c0019c80d03b86e7fc051a9c51c55d9a4e7ba7 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Mon, 23 Dec 2019 22:31:39 +0100 Subject: [PATCH 4/7] Remove unused variables in tests. --- tests/test_modeling_common.py | 3 +-- tests/test_modeling_tf_common.py | 2 +- tests/test_modeling_tf_xlm.py | 1 - tests/test_tokenization_common.py | 2 -- 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 2c634ed958..719debcb3c 100644 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -284,7 +284,6 @@ class ModelTesterMixin: multihead_outputs = head_mask.grad attentions = outputs[-1] - hidden_states = outputs[-2] # Remove Nan for t in attentions: @@ -590,7 +589,7 @@ class ModelTesterMixin: inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids) with torch.no_grad(): - outputs = model(**inputs_dict) + model(**inputs_dict) class ConfigTester(object): diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 114d69ed1b..e6fb1439c1 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -332,7 +332,7 @@ class TFModelTesterMixin: inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids) inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids) - outputs = model(inputs_dict) + model(inputs_dict) def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): diff --git a/tests/test_modeling_tf_xlm.py b/tests/test_modeling_tf_xlm.py index 5160fde479..53719f63f4 100644 --- a/tests/test_modeling_tf_xlm.py +++ b/tests/test_modeling_tf_xlm.py @@ -224,7 +224,6 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase): inputs = {"input_ids": input_ids, "lengths": input_lengths} - outputs = model(inputs) start_logits, end_logits = model(inputs) result = { diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 035a0dc27f..c10dfb4785 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -159,7 +159,6 @@ class TokenizerTesterMixin: self.assertEqual(all_size_2, all_size + len(new_toks)) tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False) - out_string = tokenizer.decode(tokens) self.assertGreaterEqual(len(tokens), 4) self.assertGreater(tokens[0], tokenizer.vocab_size - 1) @@ -178,7 +177,6 @@ class TokenizerTesterMixin: tokens = tokenizer.encode( ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False ) - out_string = tokenizer.decode(tokens) self.assertGreaterEqual(len(tokens), 6) self.assertGreater(tokens[0], tokenizer.vocab_size - 1) From e74c73a85d28fe3d4a1bc6199565056b614eb341 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Mon, 23 Dec 2019 22:38:23 +0100 Subject: [PATCH 5/7] Enable F841 warning in flake8. --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index f59ce55df7..47f12faf3e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,5 +24,5 @@ multi_line_output = 3 use_parentheses = True [flake8] -ignore = E203, E501, F841, W503 +ignore = E203, E501, W503 max-line-length = 119 From 35d32308def0bf40ae5f348e088a92d2948a0bd7 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 24 Dec 2019 11:29:49 +0100 Subject: [PATCH 6/7] adding back final dropout in T5 --- src/transformers/modeling_t5.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py index 576eb89d88..459e49d0b2 100644 --- a/src/transformers/modeling_t5.py +++ b/src/transformers/modeling_t5.py @@ -629,6 +629,7 @@ class T5Stack(T5PreTrainedModel): all_attentions = all_attentions + (layer_outputs[1],) # We keep only self-attention weights for now hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) # Add last layer if self.output_hidden_states: From 3e0cf49514d2185d3142000a717afa33c807546e Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 24 Dec 2019 11:30:56 +0100 Subject: [PATCH 7/7] adding back last dropout in TF 2.0 T5 --- src/transformers/modeling_tf_t5.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py index 43f5517be9..af928cf395 100644 --- a/src/transformers/modeling_tf_t5.py +++ b/src/transformers/modeling_tf_t5.py @@ -491,6 +491,7 @@ class TFT5MainLayer(tf.keras.layers.Layer): all_attentions = all_attentions + (layer_outputs[1],) hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) # Add last layer if self.output_hidden_states: