From 448c467256332e4be8c122a159b482c1ef039b98 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Thu, 14 May 2020 13:14:26 -0400
Subject: [PATCH] Fix: unpin flake8 and fix cs errors (#4367)

* Fix: unpin flake8 and fix cs errors

* Ok we still need to quote those
---
 examples/benchmarks.py                              |  2 +-
 examples/distillation/distiller.py                  |  8 ++++----
 examples/distillation/scripts/binarized_data.py     |  2 +-
 examples/distillation/scripts/extract.py            |  2 +-
 examples/distillation/scripts/extract_distilbert.py | 10 +++++-----
 examples/distillation/train.py                      |  4 ++--
 setup.cfg                                           |  2 +-
 setup.py                                            |  2 +-
 src/transformers/convert_marian_to_pytorch.py       |  2 +-
 src/transformers/data/datasets/glue.py              |  2 +-
 src/transformers/data/datasets/language_modeling.py |  2 +-
 src/transformers/pipelines.py                       |  6 +++++-
 tests/test_tokenization_common.py                   | 12 +++++++++++-
 13 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/examples/benchmarks.py b/examples/benchmarks.py
index fb3f51d1c4..f215482999 100644
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -478,7 +478,7 @@ def _compute_pytorch(
                             dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
 
                         if not no_speed:
-                            print_fn("Going through model with sequence of shape".format(sequence.shape))
+                            print_fn("Going through model with sequence of shape {}".format(sequence.shape))
                             runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
                             average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                             dictionary[model_name]["time"][batch_size][slice_size] = average_time
diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py
index 53669623b6..893d9916a9 100644
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -80,7 +80,7 @@ class Distiller:
 
         self.mlm = params.mlm
         if self.mlm:
-            logger.info(f"Using MLM loss for LM step.")
+            logger.info("Using MLM loss for LM step.")
             self.mlm_mask_prop = params.mlm_mask_prop
             assert 0.0 <= self.mlm_mask_prop <= 1.0
             assert params.word_mask + params.word_keep + params.word_rand == 1.0
@@ -91,7 +91,7 @@ class Distiller:
                 self.pred_probs = self.pred_probs.half()
                 self.token_probs = self.token_probs.half()
         else:
-            logger.info(f"Using CLM loss for LM step.")
+            logger.info("Using CLM loss for LM step.")
 
         self.epoch = 0
         self.n_iter = 0
@@ -365,8 +365,8 @@ class Distiller:
             self.end_epoch()
 
         if self.is_master:
-            logger.info(f"Save very last checkpoint as `pytorch_model.bin`.")
-            self.save_checkpoint(checkpoint_name=f"pytorch_model.bin")
+            logger.info("Save very last checkpoint as `pytorch_model.bin`.")
+            self.save_checkpoint(checkpoint_name="pytorch_model.bin")
             logger.info("Training is finished")
 
     def step(self, input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor):
diff --git a/examples/distillation/scripts/binarized_data.py b/examples/distillation/scripts/binarized_data.py
index 2dcca18396..8e34b29dcc 100644
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -60,7 +60,7 @@ def main():
     with open(args.file_path, "r", encoding="utf8") as fp:
         data = fp.readlines()
 
-    logger.info(f"Start encoding")
+    logger.info("Start encoding")
     logger.info(f"{len(data)} examples to process.")
 
     rslt = []
diff --git a/examples/distillation/scripts/extract.py b/examples/distillation/scripts/extract.py
index 8d102c0cda..b4bea90d53 100644
--- a/examples/distillation/scripts/extract.py
+++ b/examples/distillation/scripts/extract.py
@@ -93,7 +93,7 @@ if __name__ == "__main__":
     elif args.model_type == "gpt2":
         for w in ["weight", "bias"]:
             compressed_sd[f"{prefix}.ln_f.{w}"] = state_dict[f"{prefix}.ln_f.{w}"]
-        compressed_sd[f"lm_head.weight"] = state_dict[f"lm_head.weight"]
+        compressed_sd["lm_head.weight"] = state_dict["lm_head.weight"]
 
     print(f"N layers selected for distillation: {std_idx}")
     print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
diff --git a/examples/distillation/scripts/extract_distilbert.py b/examples/distillation/scripts/extract_distilbert.py
index 972418b56b..d709268cf0 100644
--- a/examples/distillation/scripts/extract_distilbert.py
+++ b/examples/distillation/scripts/extract_distilbert.py
@@ -37,7 +37,7 @@ if __name__ == "__main__":
         model = BertForMaskedLM.from_pretrained(args.model_name)
         prefix = "bert"
     else:
-        raise ValueError(f'args.model_type should be "bert".')
+        raise ValueError('args.model_type should be "bert".')
 
     state_dict = model.state_dict()
     compressed_sd = {}
@@ -78,12 +78,12 @@ if __name__ == "__main__":
             ]
         std_idx += 1
 
-    compressed_sd[f"vocab_projector.weight"] = state_dict[f"cls.predictions.decoder.weight"]
-    compressed_sd[f"vocab_projector.bias"] = state_dict[f"cls.predictions.bias"]
+    compressed_sd["vocab_projector.weight"] = state_dict["cls.predictions.decoder.weight"]
+    compressed_sd["vocab_projector.bias"] = state_dict["cls.predictions.bias"]
     if args.vocab_transform:
         for w in ["weight", "bias"]:
-            compressed_sd[f"vocab_transform.{w}"] = state_dict[f"cls.predictions.transform.dense.{w}"]
-            compressed_sd[f"vocab_layer_norm.{w}"] = state_dict[f"cls.predictions.transform.LayerNorm.{w}"]
+            compressed_sd[f"vocab_transform.{w}"] = state_dict[f"cls.predictions.transform.dense.{w}"]
+            compressed_sd[f"vocab_layer_norm.{w}"] = state_dict[f"cls.predictions.transform.LayerNorm.{w}"]
 
     print(f"N layers selected for distillation: {std_idx}")
     print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
diff --git a/examples/distillation/train.py b/examples/distillation/train.py
index 670d03ea16..0d21ae04f8 100644
--- a/examples/distillation/train.py
+++ b/examples/distillation/train.py
@@ -273,7 +273,7 @@ def main():
         token_probs = None
 
     train_lm_seq_dataset = LmSeqsDataset(params=args, data=data)
-    logger.info(f"Data loader created.")
+    logger.info("Data loader created.")
 
     # STUDENT #
     logger.info(f"Loading student config from {args.student_config}")
@@ -288,7 +288,7 @@ def main():
 
     if args.n_gpu > 0:
         student.to(f"cuda:{args.local_rank}")
-    logger.info(f"Student loaded.")
+    logger.info("Student loaded.")
 
     # TEACHER #
     teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True)
diff --git a/setup.cfg b/setup.cfg
index 2a081a8acb..79c4d49e3e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -36,5 +36,5 @@ multi_line_output = 3
 use_parentheses = True
 
 [flake8]
-ignore = E203, E501, W503
+ignore = E203, E501, E741, W503
 max-line-length = 119
diff --git a/setup.py b/setup.py
index 5f186ce000..c45abe8655 100644
--- a/setup.py
+++ b/setup.py
@@ -79,7 +79,7 @@ extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rt
 extras["quality"] = [
     "black",
     "isort @ git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort",
-    "flake8==3.7.9",
+    "flake8",
 ]
 extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow", "torch"]
 
diff --git a/src/transformers/convert_marian_to_pytorch.py b/src/transformers/convert_marian_to_pytorch.py
index c140fafca4..bd58534ed3 100644
--- a/src/transformers/convert_marian_to_pytorch.py
+++ b/src/transformers/convert_marian_to_pytorch.py
@@ -226,7 +226,7 @@ def lmap(f, x) -> List:
 def fetch_test_set(test_set_url):
     import wget
 
-    fname = wget.download(test_set_url, f"opus_test.txt")
+    fname = wget.download(test_set_url, "opus_test.txt")
     lns = Path(fname).open().readlines()
     src = lmap(str.strip, lns[::4])
     gold = lmap(str.strip, lns[1::4])
diff --git a/src/transformers/data/datasets/glue.py b/src/transformers/data/datasets/glue.py
index 3c963c2da3..944eb83a3a 100644
--- a/src/transformers/data/datasets/glue.py
+++ b/src/transformers/data/datasets/glue.py
@@ -114,7 +114,7 @@ class GlueDataset(Dataset):
                 torch.save(self.features, cached_features_file)
                 # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
                 logger.info(
-                    f"Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
+                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                 )
 
     def __len__(self):
diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py
index db2497ba49..6fae7b55c5 100644
--- a/src/transformers/data/datasets/language_modeling.py
+++ b/src/transformers/data/datasets/language_modeling.py
@@ -65,7 +65,7 @@ class TextDataset(Dataset):
                 with open(cached_features_file, "wb") as handle:
                     pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                 logger.info(
-                    f"Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
+                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                 )
 
     def __len__(self):
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 6f666c016a..52f1f64d16 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from itertools import chain
 from os.path import abspath, exists
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union
 
 import numpy as np
 
@@ -58,6 +58,10 @@ if is_torch_available():
         AutoModelWithLMHead,
     )
 
+if TYPE_CHECKING:
+    from .modeling_utils import PreTrainedModel
+    from .modeling_tf_utils import TFPreTrainedModel
+
 
 logger = logging.getLogger(__name__)
 
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index d92f143899..acad8d655f 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -19,11 +19,21 @@ import pickle
 import shutil
 import tempfile
 from collections import OrderedDict
-from typing import Dict, Tuple, Union
+from typing import TYPE_CHECKING, Dict, Tuple, Union
 
 from tests.utils import require_tf, require_torch
 
 
+if TYPE_CHECKING:
+    from transformers import (
+        PretrainedConfig,
+        PreTrainedTokenizer,
+        PreTrainedTokenizerFast,
+        PreTrainedModel,
+        TFPreTrainedModel,
+    )
+
+
 def merge_model_tokenizer_mappings(
     model_mapping: Dict["PretrainedConfig", Union["PreTrainedModel", "TFPreTrainedModel"]],
     tokenizer_mapping: Dict["PretrainedConfig", Tuple["PreTrainedTokenizer", "PreTrainedTokenizerFast"]],