diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py
index bfa1cd1666..7d1a9e8e8c 100644
--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -108,7 +108,7 @@ def read_swag_examples(input_file, is_training=True):
         lines = []
         for line in reader:
             if sys.version_info[0] == 2:
-                line = list(unicode(cell, "utf-8") for cell in line)
+                line = list(unicode(cell, "utf-8") for cell in line)  # noqa: F821
             lines.append(line)
 
     if is_training and lines[0][-1] != "label":
diff --git a/examples/run_generation.py b/examples/run_generation.py
index 629b9348a0..531c485326 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -225,7 +225,7 @@ def main():
     # Batch size == 1. to add more examples please use num_return_sequences > 1
     generated_sequence = output_sequences[0].tolist()
     text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
-    text = text[: t.find(args.stop_token) if args.stop_token else None]
+    text = text[: text.find(args.stop_token) if args.stop_token else None]
 
     print(text)
 
diff --git a/examples/utils_multiple_choice.py b/examples/utils_multiple_choice.py
index 987ffbc0ef..1eea8f3352 100644
--- a/examples/utils_multiple_choice.py
+++ b/examples/utils_multiple_choice.py
@@ -184,7 +184,7 @@ class SwagProcessor(DataProcessor):
             lines = []
             for line in reader:
                 if sys.version_info[0] == 2:
-                    line = list(unicode(cell, "utf-8") for cell in line)
+                    line = list(unicode(cell, "utf-8") for cell in line)  # noqa: F821
                 lines.append(line)
             return lines
 
diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py
index 7b576a65dc..edebd8ab05 100644
--- a/templates/adding_a_new_model/modeling_tf_xxx.py
+++ b/templates/adding_a_new_model/modeling_tf_xxx.py
@@ -68,6 +68,14 @@ TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP = {
 #
 # See the conversion methods in modeling_tf_pytorch_utils.py for more details
 ####################################################
+
+TFXxxAttention = tf.keras.layers.Layer  # NOTE(review): stub alias of the base Keras layer; confirm
+
+TFXxxIntermediate = tf.keras.layers.Layer  # NOTE(review): stub alias of the base Keras layer; confirm
+
+TFXxxOutput = tf.keras.layers.Layer  # NOTE(review): stub alias of the base Keras layer; confirm
+
+
 class TFXxxLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFXxxLayer, self).__init__(**kwargs)
@@ -316,6 +324,9 @@ class TFXxxModel(TFXxxPreTrainedModel):
         return outputs
 
 
+TFXxxMLMHead = tf.keras.layers.Layer  # NOTE(review): stub alias of the base Keras layer; confirm
+
+
 @add_start_docstrings(
     """Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING
 )
diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py
index 0779b65214..c4bcc55fdd 100644
--- a/templates/adding_a_new_model/modeling_xxx.py
+++ b/templates/adding_a_new_model/modeling_xxx.py
@@ -135,6 +135,14 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
 #
 # See the conversion methods in modeling_tf_pytorch_utils.py for more details
 ####################################################
+
+XxxAttention = nn.Module  # NOTE(review): stub alias of the base nn.Module; confirm
+
+XxxIntermediate = nn.Module  # NOTE(review): stub alias of the base nn.Module; confirm
+
+XxxOutput = nn.Module  # NOTE(review): stub alias of the base nn.Module; confirm
+
+
 class XxxLayer(nn.Module):
     def __init__(self, config):
         super(XxxLayer, self).__init__()
@@ -160,6 +168,16 @@ class XxxLayer(nn.Module):
 # pointers for your model and the weights initialization
 # method if its not fully covered by PreTrainedModel's default method
 ####################################################
+
+XxxLayerNorm = torch.nn.LayerNorm  # alias of PyTorch's built-in LayerNorm
+
+XxxEmbeddings = nn.Module  # NOTE(review): stub alias of the base nn.Module; confirm
+
+XxxEncoder = nn.Module  # NOTE(review): stub alias of the base nn.Module; confirm
+
+XxxPooler = nn.Module  # NOTE(review): stub alias of the base nn.Module; confirm
+
+
 class XxxPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
diff --git a/transformers/commands/user.py b/transformers/commands/user.py
index 6800920cfa..65761ae982 100644
--- a/transformers/commands/user.py
+++ b/transformers/commands/user.py
@@ -1,6 +1,7 @@
 import os
 from argparse import ArgumentParser
 from getpass import getpass
+from typing import List, Union
 
 from transformers.commands import BaseTransformersCLICommand
 from transformers.hf_api import HfApi, HfFolder, HTTPError
@@ -96,8 +97,7 @@ class LogoutCommand(BaseUserCommand):
 
 
 class ListObjsCommand(BaseUserCommand):
-    def tabulate(self, rows, headers):
-        # type: (List[List[Union[str, int]]], List[str]) -> str
+    def tabulate(self, rows: List[List[Union[str, int]]], headers: List[str]) -> str:
         """
         Inspired by:
         stackoverflow.com/a/8356620/593036
diff --git a/transformers/data/processors/utils.py b/transformers/data/processors/utils.py
index 7e044438af..0ac98bf0f5 100644
--- a/transformers/data/processors/utils.py
+++ b/transformers/data/processors/utils.py
@@ -102,7 +102,7 @@ class DataProcessor(object):
             lines = []
             for line in reader:
                 if sys.version_info[0] == 2:
-                    line = list(unicode(cell, "utf-8") for cell in line)
+                    line = list(unicode(cell, "utf-8") for cell in line)  # noqa: F821
                 lines.append(line)
             return lines
 
diff --git a/transformers/file_utils.py b/transformers/file_utils.py
index b1a4d240d2..2334ff06b3 100644
--- a/transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -419,7 +419,7 @@ def get_from_cache(
                 with open(meta_path, "w") as meta_file:
                     output_string = json.dumps(meta)
                     if sys.version_info[0] == 2 and isinstance(output_string, str):
-                        output_string = unicode(output_string, "utf-8")  # The beauty of python 2
+                        output_string = unicode(output_string, "utf-8")  # noqa: F821
                     meta_file.write(output_string)
 
     return cache_path
diff --git a/transformers/hf_api.py b/transformers/hf_api.py
index b1c7650835..13469728dc 100644
--- a/transformers/hf_api.py
+++ b/transformers/hf_api.py
@@ -14,8 +14,10 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import io
 import os
 from os.path import expanduser
+from typing import List
 
 import requests
 import six
@@ -93,7 +95,7 @@ class HfApi:
         return d["user"]
 
     def logout(self, token):
-        # type: (...) -> void
+        # type: (...) -> None
         """
         Call HF API to log out.
         """
@@ -135,8 +137,7 @@ class HfApi:
             pf.close()
         return urls.access
 
-    def list_objs(self, token):
-        # type: (...) -> List[S3Obj]
+    def list_objs(self, token) -> List[S3Obj]:
         """
         Call HF API to list all stored files for user.
         """
@@ -156,9 +157,7 @@ class TqdmProgressFileReader:
     for implementation details.
     """
 
-    def __init__(
-        self, f  # type: io.BufferedReader
-    ):
+    def __init__(self, f: io.BufferedReader):
         self.f = f
         self.total_size = os.fstat(f.fileno()).st_size  # type: int
         self.pbar = tqdm(total=self.total_size, leave=False)
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index e2e4f2b5fd..9b56bc45d4 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -339,7 +339,9 @@ class BertIntermediate(nn.Module):
     def __init__(self, config):
         super(BertIntermediate, self).__init__()
         self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
-        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
+        if isinstance(config.hidden_act, str) or (
+            sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)  # noqa: F821
+        ):
             self.intermediate_act_fn = ACT2FN[config.hidden_act]
         else:
             self.intermediate_act_fn = config.hidden_act
@@ -459,7 +461,9 @@ class BertPredictionHeadTransform(nn.Module):
     def __init__(self, config):
         super(BertPredictionHeadTransform, self).__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
+        if isinstance(config.hidden_act, str) or (
+            sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)  # noqa: F821
+        ):
             self.transform_act_fn = ACT2FN[config.hidden_act]
         else:
             self.transform_act_fn = config.hidden_act
diff --git a/transformers/modeling_tf_albert.py b/transformers/modeling_tf_albert.py
index 7cc4621437..ab9d14ab14 100644
--- a/transformers/modeling_tf_albert.py
+++ b/transformers/modeling_tf_albert.py
@@ -311,7 +311,9 @@ class TFAlbertLayer(tf.keras.layers.Layer):
             config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn"
         )
 
-        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
+        if isinstance(config.hidden_act, str) or (
+            sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)  # noqa: F821
+        ):
             self.activation = ACT2FN[config.hidden_act]
         else:
             self.activation = config.hidden_act
@@ -452,7 +454,9 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
         self.dense = tf.keras.layers.Dense(
             config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
-        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
+        if isinstance(config.hidden_act, str) or (
+            sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)  # noqa: F821
+        ):
             self.activation = ACT2FN[config.hidden_act]
         else:
             self.activation = config.hidden_act
diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py
index 9ce83fe4d6..bd41893b68 100644
--- a/transformers/modeling_tf_auto.py
+++ b/transformers/modeling_tf_auto.py
@@ -690,9 +690,9 @@ class TFAutoModelForQuestionAnswering(object):
         elif isinstance(config, BertConfig):
             return TFBertForQuestionAnswering(config)
         elif isinstance(config, XLNetConfig):
-            return TFXLNetForQuestionAnswering(config)
+            raise NotImplementedError("TFXLNetForQuestionAnswering isn't implemented")
         elif isinstance(config, XLMConfig):
-            return TFXLMForQuestionAnswering(config)
+            raise NotImplementedError("TFXLMForQuestionAnswering isn't implemented")
         raise ValueError("Unrecognized configuration class {}".format(config))
 
     @classmethod
diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py
index 1360b1951c..f67901618a 100644
--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
@@ -315,7 +315,9 @@ class TFBertIntermediate(tf.keras.layers.Layer):
         self.dense = tf.keras.layers.Dense(
             config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
-        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
+        if isinstance(config.hidden_act, str) or (
+            sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)  # noqa: F821
+        ):
             self.intermediate_act_fn = ACT2FN[config.hidden_act]
         else:
             self.intermediate_act_fn = config.hidden_act
@@ -420,7 +422,9 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
-        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
+        if isinstance(config.hidden_act, str) or (
+            sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)  # noqa: F821
+        ):
             self.transform_act_fn = ACT2FN[config.hidden_act]
         else:
             self.transform_act_fn = config.hidden_act
diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py
index e913a0513b..9e48856a64 100644
--- a/transformers/modeling_tf_xlnet.py
+++ b/transformers/modeling_tf_xlnet.py
@@ -295,7 +295,7 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
         )
         self.dropout = tf.keras.layers.Dropout(config.dropout)
         if isinstance(config.ff_activation, str) or (
-            sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)
+            sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)  # noqa: F821
         ):
             self.activation_function = ACT2FN[config.ff_activation]
         else:
@@ -483,7 +483,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
             if dtype is not None and dtype != tf.float32:
                 fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype)
             if self.clamp_len > 0:
-                fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -clamp_len, clamp_len)
+                fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len)
             pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz)
 
         return pos_emb
diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py
index 423ba8cb7c..d749f1d122 100644
--- a/transformers/modeling_xlnet.py
+++ b/transformers/modeling_xlnet.py
@@ -431,7 +431,7 @@ class XLNetFeedForward(nn.Module):
         self.layer_2 = nn.Linear(config.d_inner, config.d_model)
         self.dropout = nn.Dropout(config.dropout)
         if isinstance(config.ff_activation, str) or (
-            sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)
+            sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)  # noqa: F821
         ):
             self.activation_function = ACT2FN[config.ff_activation]
         else:
diff --git a/transformers/tests/tokenization_utils_test.py b/transformers/tests/tokenization_utils_test.py
index 76681b1af3..8865110663 100644
--- a/transformers/tests/tokenization_utils_test.py
+++ b/transformers/tests/tokenization_utils_test.py
@@ -35,7 +35,7 @@ class TokenizerUtilsTest(unittest.TestCase):
 
             for special_tok in tokenizer.all_special_tokens:
                 if six.PY2:
-                    self.assertIsInstance(special_tok, unicode)
+                    self.assertIsInstance(special_tok, unicode)  # noqa: F821
                 else:
                     self.assertIsInstance(special_tok, str)
                 special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
diff --git a/transformers/tokenization_albert.py b/transformers/tokenization_albert.py
index 276a33cbf2..699304bb5d 100644
--- a/transformers/tokenization_albert.py
+++ b/transformers/tokenization_albert.py
@@ -156,7 +156,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
         """
         text = self.preprocess_text(text)
         # note(zhiliny): in some systems, sentencepiece only accepts str for py2
-        if six.PY2 and isinstance(text, unicode):
+        if six.PY2 and isinstance(text, unicode):  # noqa: F821
             text = text.encode("utf-8")
 
         if not sample:
diff --git a/transformers/tokenization_gpt2.py b/transformers/tokenization_gpt2.py
index c8f97f0526..6b2b85093f 100644
--- a/transformers/tokenization_gpt2.py
+++ b/transformers/tokenization_gpt2.py
@@ -80,7 +80,7 @@ def bytes_to_unicode():
     This is a signficant percentage of your normal, say, 32K bpe vocab.
     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
     """
-    _chr = unichr if sys.version_info[0] == 2 else chr
+    _chr = unichr if sys.version_info[0] == 2 else chr  # noqa: F821
     bs = (
         list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
     )
diff --git a/transformers/tokenization_transfo_xl.py b/transformers/tokenization_transfo_xl.py
index ce058580ba..9f5dc63f63 100644
--- a/transformers/tokenization_transfo_xl.py
+++ b/transformers/tokenization_transfo_xl.py
@@ -36,10 +36,10 @@ try:
 except ImportError:
     pass
 
-# if sys.version_info[0] == 2:
-#     import cPickle as pickle
-# else:
-#     import pickle
+if sys.version_info[0] == 2:
+    import cPickle as pickle
+else:
+    import pickle
 
 
 logger = logging.getLogger(__name__)
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index f0df6de60f..8c60beb9d3 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -252,10 +252,10 @@ class PreTrainedTokenizer(object):
             if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                 if key == "additional_special_tokens":
                     assert isinstance(value, (list, tuple)) and all(
-                        isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value
+                        isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value  # noqa: F821
                     )
                 else:
-                    assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
+                    assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))  # noqa: F821
                 setattr(self, key, value)
 
     @classmethod
@@ -567,7 +567,7 @@ class PreTrainedTokenizer(object):
 
         to_add_tokens = []
         for token in new_tokens:
-            assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
+            assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))  # noqa: F821
             if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens:
                 token = token.lower()
             if (
@@ -650,11 +650,11 @@ class PreTrainedTokenizer(object):
             assert key in self.SPECIAL_TOKENS_ATTRIBUTES
             if key == "additional_special_tokens":
                 assert isinstance(value, (list, tuple)) and all(
-                    isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value
+                    isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value  # noqa: F821
                 )
                 added_tokens += self.add_tokens(value)
             else:
-                assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
+                assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))  # noqa: F821
                 added_tokens += self.add_tokens([value])
             logger.info("Assigning %s to the %s key of the tokenizer", value, key)
             setattr(self, key, value)
@@ -746,7 +746,7 @@ class PreTrainedTokenizer(object):
         if tokens is None:
             return None
 
-        if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
+        if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):  # noqa: F821
             return self._convert_token_to_id_with_added_voc(tokens)
 
         ids = []
diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py
index 6c016728e1..ac41afb802 100644
--- a/transformers/tokenization_xlnet.py
+++ b/transformers/tokenization_xlnet.py
@@ -156,7 +156,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         """
         text = self.preprocess_text(text)
         # note(zhiliny): in some systems, sentencepiece only accepts str for py2
-        if six.PY2 and isinstance(text, unicode):
+        if six.PY2 and isinstance(text, unicode):  # noqa: F821
             text = text.encode("utf-8")
 
         if not sample: