From 8af25b166486ec0cedbd2ef9147c3700dba88e0b Mon Sep 17 00:00:00 2001
From: Aymeric Augustin <aymeric.augustin@fractalideas.com>
Date: Sun, 22 Dec 2019 17:56:09 +0100
Subject: [PATCH] Remove six and the Python 2 code paths it guarded.

---
 src/transformers/file_utils.py                | 39 ++++++-------------
 src/transformers/hf_api.py                    | 19 ++-------
 src/transformers/pipelines.py                 |  3 +-
 src/transformers/tokenization_albert.py       | 34 +++-------------
 src/transformers/tokenization_bert.py         |  4 +-
 .../tokenization_bert_japanese.py             |  7 +---
 src/transformers/tokenization_camembert.py    |  4 +-
 src/transformers/tokenization_ctrl.py         |  4 +-
 src/transformers/tokenization_gpt2.py         |  4 +-
 src/transformers/tokenization_openai.py       |  2 +-
 src/transformers/tokenization_t5.py           | 22 ++---------
 src/transformers/tokenization_transfo_xl.py   |  2 +-
 src/transformers/tokenization_utils.py        | 26 +++++--------
 src/transformers/tokenization_xlm.py          |  4 +-
 src/transformers/tokenization_xlm_roberta.py  |  4 +-
 src/transformers/tokenization_xlnet.py        | 34 +++-------------
 .../adding_a_new_model/tokenization_xxx.py    |  4 +-
 tests/test_hf_api.py                          |  5 +--
 tests/test_tokenization_utils.py              |  7 +---
 19 files changed, 61 insertions(+), 167 deletions(-)

diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 55062e7f0e..0477bf452e 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -18,7 +18,6 @@ from io import open
 
 import boto3
 import requests
-import six
 from botocore.config import Config
 from botocore.exceptions import ClientError
 from filelock import FileLock
@@ -107,36 +106,20 @@ def is_tf_available():
     return _tf_available
 
 
-if not six.PY2:
+def add_start_docstrings(*docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = "".join(docstr) + (fn.__doc__ or "")  # __doc__ is None if undocumented or under -OO
+        return fn
 
-    def add_start_docstrings(*docstr):
-        def docstring_decorator(fn):
-            fn.__doc__ = "".join(docstr) + fn.__doc__
-            return fn
-
-        return docstring_decorator
-
-    def add_end_docstrings(*docstr):
-        def docstring_decorator(fn):
-            fn.__doc__ = fn.__doc__ + "".join(docstr)
-            return fn
-
-        return docstring_decorator
+    return docstring_decorator
 
 
-else:
-    # Not possible to update class docstrings on python2
-    def add_start_docstrings(*docstr):
-        def docstring_decorator(fn):
-            return fn
+def add_end_docstrings(*docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = (fn.__doc__ or "") + "".join(docstr)  # __doc__ is None if undocumented or under -OO
+        return fn
 
-        return docstring_decorator
-
-    def add_end_docstrings(*docstr):
-        def docstring_decorator(fn):
-            return fn
-
-        return docstring_decorator
+    return docstring_decorator
 
 
 def is_remote_url(url_or_filename):
@@ -297,7 +280,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None):
     ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
     if isinstance(user_agent, dict):
         ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
-    elif isinstance(user_agent, six.string_types):
+    elif isinstance(user_agent, str):
         ua += "; " + user_agent
     headers = {"user-agent": ua}
     if resume_size > 0:
diff --git a/src/transformers/hf_api.py b/src/transformers/hf_api.py
index 9666282196..a6964737a4 100644
--- a/src/transformers/hf_api.py
+++ b/src/transformers/hf_api.py
@@ -20,7 +20,6 @@ from os.path import expanduser
 from typing import List
 
 import requests
-import six
 from tqdm import tqdm
 
 
@@ -160,11 +159,8 @@ class TqdmProgressFileReader:
         self.f = f
         self.total_size = os.fstat(f.fileno()).st_size  # type: int
         self.pbar = tqdm(total=self.total_size, leave=False)
-        if six.PY3:
-            # does not work unless PY3
-            # no big deal as the CLI does not currently support PY2 anyways.
-            self.read = f.read
-            f.read = self._read
+        self.read = f.read
+        f.read = self._read
 
     def _read(self, n=-1):
         self.pbar.update(n)
@@ -182,16 +178,7 @@ class HfFolder:
         """
         Save token, creating folder as needed.
         """
-        if six.PY3:
-            os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
-        else:
-            # Python 2
-            try:
-                os.makedirs(os.path.dirname(cls.path_token))
-            except OSError as e:
-                if e.errno != os.errno.EEXIST:
-                    raise e
-                pass
+        os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
         with open(cls.path_token, "w+") as f:
             f.write(token)
 
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 605d8b84e3..996521ac5c 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -26,7 +26,6 @@ from os.path import abspath, exists
 from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
-import six
 
 from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
 from .configuration_utils import PretrainedConfig
@@ -939,7 +938,7 @@ def pipeline(
             modelcard = config
 
     # Instantiate tokenizer if needed
-    if isinstance(tokenizer, six.string_types):
+    if isinstance(tokenizer, str):
         tokenizer = AutoTokenizer.from_pretrained(tokenizer)
 
     # Instantiate config if needed
diff --git a/src/transformers/tokenization_albert.py b/src/transformers/tokenization_albert.py
index 04f0bf00af..541ae7ae68 100644
--- a/src/transformers/tokenization_albert.py
+++ b/src/transformers/tokenization_albert.py
@@ -20,8 +20,6 @@ import os
 import unicodedata
 from shutil import copyfile
 
-import six
-
 from .tokenization_utils import PreTrainedTokenizer
 
 
@@ -139,9 +137,6 @@ class AlbertTokenizer(PreTrainedTokenizer):
             outputs = inputs
         outputs = outputs.replace("``", '"').replace("''", '"')
 
-        if six.PY2 and isinstance(outputs, str):
-            outputs = outputs.decode("utf-8")
-
         if not self.keep_accents:
             outputs = unicodedata.normalize("NFKD", outputs)
             outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
@@ -150,14 +145,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
 
         return outputs
 
-    def _tokenize(self, text, return_unicode=True, sample=False):
-        """ Tokenize a string.
-            return_unicode is used only for py2
-        """
+    def _tokenize(self, text, sample=False):
+        """ Tokenize a string. """
         text = self.preprocess_text(text)
-        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
-        if six.PY2 and isinstance(text, unicode):  # noqa: F821
-            text = text.encode("utf-8")
 
         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
@@ -177,27 +167,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
             else:
                 new_pieces.append(piece)
 
-        # note(zhiliny): convert back to unicode for py2
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in new_pieces:
-                if isinstance(piece, str):
-                    piece = piece.decode("utf-8")
-                ret_pieces.append(piece)
-            new_pieces = ret_pieces
-
         return new_pieces
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.sp_model.PieceToId(token)
 
-    def _convert_id_to_token(self, index, return_unicode=True):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode("utf-8")
-        return token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.sp_model.IdToPiece(index)
 
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py
index ef8a7d8391..9a9b6626f1 100644
--- a/src/transformers/tokenization_bert.py
+++ b/src/transformers/tokenization_bert.py
@@ -202,11 +202,11 @@ class BertTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.vocab.get(token, self.vocab.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.ids_to_tokens.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
diff --git a/src/transformers/tokenization_bert_japanese.py b/src/transformers/tokenization_bert_japanese.py
index 893db031d4..57dde09734 100644
--- a/src/transformers/tokenization_bert_japanese.py
+++ b/src/transformers/tokenization_bert_japanese.py
@@ -20,8 +20,6 @@ import logging
 import os
 import unicodedata
 
-import six
-
 from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab
 
 
@@ -194,10 +192,7 @@ class MecabTokenizer(object):
         never_split = self.never_split + (never_split if never_split is not None else [])
         tokens = []
 
-        if six.PY2:
-            mecab_output = self.mecab.parse(text.encode("utf-8")).decode("utf-8")
-        else:
-            mecab_output = self.mecab.parse(text)
+        mecab_output = self.mecab.parse(text)
 
         cursor = 0
         for line in mecab_output.split("\n"):
diff --git a/src/transformers/tokenization_camembert.py b/src/transformers/tokenization_camembert.py
index 09104bbbf0..dd33d510da 100644
--- a/src/transformers/tokenization_camembert.py
+++ b/src/transformers/tokenization_camembert.py
@@ -155,7 +155,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
         return self.sp_model.EncodeAsPieces(text)
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
         elif self.sp_model.PieceToId(token) == 0:
@@ -164,7 +164,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
         return self.fairseq_offset + self.sp_model.PieceToId(token)
 
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         if index in self.fairseq_ids_to_tokens:
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)
diff --git a/src/transformers/tokenization_ctrl.py b/src/transformers/tokenization_ctrl.py
index e40df10448..28873ba036 100644
--- a/src/transformers/tokenization_ctrl.py
+++ b/src/transformers/tokenization_ctrl.py
@@ -204,11 +204,11 @@ class CTRLTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
diff --git a/src/transformers/tokenization_gpt2.py b/src/transformers/tokenization_gpt2.py
index ff96b07945..44d615061f 100644
--- a/src/transformers/tokenization_gpt2.py
+++ b/src/transformers/tokenization_gpt2.py
@@ -224,11 +224,11 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         return bpe_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index)
 
     def convert_tokens_to_string(self, tokens):
diff --git a/src/transformers/tokenization_openai.py b/src/transformers/tokenization_openai.py
index 693a76377e..566bf14cb4 100644
--- a/src/transformers/tokenization_openai.py
+++ b/src/transformers/tokenization_openai.py
@@ -177,7 +177,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
diff --git a/src/transformers/tokenization_t5.py b/src/transformers/tokenization_t5.py
index d5a051107c..bd7a6e3144 100644
--- a/src/transformers/tokenization_t5.py
+++ b/src/transformers/tokenization_t5.py
@@ -20,8 +20,6 @@ import os
 import re
 from shutil import copyfile
 
-import six
-
 from .tokenization_utils import PreTrainedTokenizer
 
 
@@ -137,41 +135,29 @@ class T5Tokenizer(PreTrainedTokenizer):
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(self.vocab_file)
 
-    def _tokenize(self, text, return_unicode=True, sample=False):
+    def _tokenize(self, text, sample=False):
         """ Take as input a string and return a list of strings (tokens) for words/sub-words
         """
         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
         else:
             pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
-
-        # convert back to unicode for py2
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in pieces:
-                if isinstance(piece, str):
-                    piece = piece.decode("utf-8")
-                ret_pieces.append(piece)
-            pieces = ret_pieces
-
         return pieces
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if token.startswith("<extra_id_"):
             match = re.match(r"<extra_id_(\d+)>", token)
             num = int(match.group(1))
             return self.vocab_size - num - 1
         return self.sp_model.piece_to_id(token)
 
-    def _convert_id_to_token(self, index, return_unicode=True):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
         if index < self.sp_model.get_piece_size():
             token = self.sp_model.IdToPiece(index)
         else:
             token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
-        if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode("utf-8")
         return token
 
     def convert_tokens_to_string(self, tokens):
diff --git a/src/transformers/tokenization_transfo_xl.py b/src/transformers/tokenization_transfo_xl.py
index 0ed96302d6..43cb03b73a 100644
--- a/src/transformers/tokenization_transfo_xl.py
+++ b/src/transformers/tokenization_transfo_xl.py
@@ -238,7 +238,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         return self.idx2sym[idx]
 
     def _convert_token_to_id(self, sym):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if sym in self.sym2idx:
             return self.sym2idx[sym]
         else:
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index e7f5d99182..6394090d24 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -23,8 +23,6 @@ import os
 import re
 from io import open
 
-import six
-
 from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available
 
 
@@ -251,11 +249,9 @@ class PreTrainedTokenizer(object):
         for key, value in kwargs.items():
             if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                 if key == "additional_special_tokens":
-                    assert isinstance(value, (list, tuple)) and all(
-                        isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value  # noqa: F821
-                    )
+                    assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
                 else:
-                    assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))  # noqa: F821
+                    assert isinstance(value, str)
                 setattr(self, key, value)
 
     @classmethod
@@ -567,7 +563,7 @@ class PreTrainedTokenizer(object):
 
         to_add_tokens = []
         for token in new_tokens:
-            assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))  # noqa: F821
+            assert isinstance(token, str)
             if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens:
                 token = token.lower()
             if (
@@ -649,12 +645,10 @@ class PreTrainedTokenizer(object):
         for key, value in special_tokens_dict.items():
             assert key in self.SPECIAL_TOKENS_ATTRIBUTES
             if key == "additional_special_tokens":
-                assert isinstance(value, (list, tuple)) and all(
-                    isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value  # noqa: F821
-                )
+                assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
                 added_tokens += self.add_tokens(value)
             else:
-                assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))  # noqa: F821
+                assert isinstance(value, str)
                 added_tokens += self.add_tokens([value])
             logger.info("Assigning %s to the %s key of the tokenizer", value, key)
             setattr(self, key, value)
@@ -740,13 +734,13 @@ class PreTrainedTokenizer(object):
         raise NotImplementedError
 
     def convert_tokens_to_ids(self, tokens):
-        """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
+        """ Converts a single token, or a sequence of tokens, (str) in a single integer id
             (resp. a sequence of ids), using the vocabulary.
         """
         if tokens is None:
             return None
 
-        if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):  # noqa: F821
+        if isinstance(tokens, str):
             return self._convert_token_to_id_with_added_voc(tokens)
 
         ids = []
@@ -901,9 +895,9 @@ class PreTrainedTokenizer(object):
         """
 
         def get_input_ids(text):
-            if isinstance(text, six.string_types):
+            if isinstance(text, str):
                 return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
-            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types):
+            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                 return self.convert_tokens_to_ids(text)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                 return text
@@ -1297,7 +1291,7 @@ class PreTrainedTokenizer(object):
 
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         """ Converts a single index or a sequence of indices (integers) in a token "
-            (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
+            (resp.) a sequence of tokens (str), using the vocabulary and added tokens.
 
             Args:
                 skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
diff --git a/src/transformers/tokenization_xlm.py b/src/transformers/tokenization_xlm.py
index ffc4c5679f..94e2054690 100644
--- a/src/transformers/tokenization_xlm.py
+++ b/src/transformers/tokenization_xlm.py
@@ -798,11 +798,11 @@ class XLMTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
diff --git a/src/transformers/tokenization_xlm_roberta.py b/src/transformers/tokenization_xlm_roberta.py
index 525a7f5c27..dde2382f8b 100644
--- a/src/transformers/tokenization_xlm_roberta.py
+++ b/src/transformers/tokenization_xlm_roberta.py
@@ -171,13 +171,13 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         return self.sp_model.EncodeAsPieces(text)
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
         return self.sp_model.PieceToId(token) + self.fairseq_offset
 
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         if index in self.fairseq_ids_to_tokens:
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)
diff --git a/src/transformers/tokenization_xlnet.py b/src/transformers/tokenization_xlnet.py
index f8251b2a90..6d0a6d0f18 100644
--- a/src/transformers/tokenization_xlnet.py
+++ b/src/transformers/tokenization_xlnet.py
@@ -20,8 +20,6 @@ import os
 import unicodedata
 from shutil import copyfile
 
-import six
-
 from .tokenization_utils import PreTrainedTokenizer
 
 
@@ -139,9 +137,6 @@ class XLNetTokenizer(PreTrainedTokenizer):
             outputs = inputs
         outputs = outputs.replace("``", '"').replace("''", '"')
 
-        if six.PY2 and isinstance(outputs, str):
-            outputs = outputs.decode("utf-8")
-
         if not self.keep_accents:
             outputs = unicodedata.normalize("NFKD", outputs)
             outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
@@ -150,14 +145,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
 
         return outputs
 
-    def _tokenize(self, text, return_unicode=True, sample=False):
-        """ Tokenize a string.
-            return_unicode is used only for py2
-        """
+    def _tokenize(self, text, sample=False):
+        """ Tokenize a string. """
         text = self.preprocess_text(text)
-        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
-        if six.PY2 and isinstance(text, unicode):  # noqa: F821
-            text = text.encode("utf-8")
 
         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
@@ -177,27 +167,15 @@ class XLNetTokenizer(PreTrainedTokenizer):
             else:
                 new_pieces.append(piece)
 
-        # note(zhiliny): convert back to unicode for py2
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in new_pieces:
-                if isinstance(piece, str):
-                    piece = piece.decode("utf-8")
-                ret_pieces.append(piece)
-            new_pieces = ret_pieces
-
         return new_pieces
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.sp_model.PieceToId(token)
 
-    def _convert_id_to_token(self, index, return_unicode=True):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode("utf-8")
-        return token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.sp_model.IdToPiece(index)
 
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py
index af26102832..9c2fd2667b 100644
--- a/templates/adding_a_new_model/tokenization_xxx.py
+++ b/templates/adding_a_new_model/tokenization_xxx.py
@@ -145,11 +145,11 @@ class XxxTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.vocab.get(token, self.vocab.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.ids_to_tokens.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py
index cec8b9879b..f8c89b103a 100644
--- a/tests/test_hf_api.py
+++ b/tests/test_hf_api.py
@@ -19,7 +19,6 @@ import time
 import unittest
 
 import requests
-import six
 from requests.exceptions import HTTPError
 
 from transformers.hf_api import HfApi, HfFolder, PresignedUrl, S3Obj
@@ -50,7 +49,7 @@ class HfApiLoginTest(HfApiCommonTest):
 
     def test_login_valid(self):
         token = self._api.login(username=USER, password=PASS)
-        self.assertIsInstance(token, six.string_types)
+        self.assertIsInstance(token, str)
 
 
 class HfApiEndpointsTest(HfApiCommonTest):
@@ -74,7 +73,7 @@ class HfApiEndpointsTest(HfApiCommonTest):
     def test_presign_and_upload(self):
         for FILE_KEY, FILE_PATH in FILES:
             access_url = self._api.presign_and_upload(token=self._token, filename=FILE_KEY, filepath=FILE_PATH)
-            self.assertIsInstance(access_url, six.string_types)
+            self.assertIsInstance(access_url, str)
             with open(FILE_PATH, "r") as f:
                 body = f.read()
             r = requests.get(access_url)
diff --git a/tests/test_tokenization_utils.py b/tests/test_tokenization_utils.py
index 11721f94c1..2909b4f9da 100644
--- a/tests/test_tokenization_utils.py
+++ b/tests/test_tokenization_utils.py
@@ -16,8 +16,6 @@
 
 import unittest
 
-import six
-
 from transformers import PreTrainedTokenizer
 from transformers.tokenization_gpt2 import GPT2Tokenizer
 
@@ -34,10 +32,7 @@ class TokenizerUtilsTest(unittest.TestCase):
             self.assertIsInstance(tokenizer, PreTrainedTokenizer)
 
             for special_tok in tokenizer.all_special_tokens:
-                if six.PY2:
-                    self.assertIsInstance(special_tok, unicode)  # noqa: F821
-                else:
-                    self.assertIsInstance(special_tok, str)
+                self.assertIsInstance(special_tok, str)
                 special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
                 self.assertIsInstance(special_tok_id, int)