From 8af25b166486ec0cedbd2ef9147c3700dba88e0b Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sun, 22 Dec 2019 17:56:09 +0100 Subject: [PATCH] Remove six. --- src/transformers/file_utils.py | 39 ++++++------------- src/transformers/hf_api.py | 19 ++------- src/transformers/pipelines.py | 3 +- src/transformers/tokenization_albert.py | 34 +++------------- src/transformers/tokenization_bert.py | 4 +- .../tokenization_bert_japanese.py | 7 +--- src/transformers/tokenization_camembert.py | 4 +- src/transformers/tokenization_ctrl.py | 4 +- src/transformers/tokenization_gpt2.py | 4 +- src/transformers/tokenization_openai.py | 2 +- src/transformers/tokenization_t5.py | 22 ++--------- src/transformers/tokenization_transfo_xl.py | 2 +- src/transformers/tokenization_utils.py | 26 +++++-------- src/transformers/tokenization_xlm.py | 4 +- src/transformers/tokenization_xlm_roberta.py | 4 +- src/transformers/tokenization_xlnet.py | 34 +++------------- .../adding_a_new_model/tokenization_xxx.py | 4 +- tests/test_hf_api.py | 5 +-- tests/test_tokenization_utils.py | 7 +--- 19 files changed, 61 insertions(+), 167 deletions(-) diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 55062e7f0e..0477bf452e 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -18,7 +18,6 @@ from io import open import boto3 import requests -import six from botocore.config import Config from botocore.exceptions import ClientError from filelock import FileLock @@ -107,36 +106,20 @@ def is_tf_available(): return _tf_available -if not six.PY2: +def add_start_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = "".join(docstr) + fn.__doc__ + return fn - def add_start_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = "".join(docstr) + fn.__doc__ - return fn - - return docstring_decorator - - def add_end_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + "".join(docstr) - return fn - - return docstring_decorator + return docstring_decorator -else: - # Not possible to update class docstrings on python2 - def add_start_docstrings(*docstr): - def docstring_decorator(fn): - return fn +def add_end_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + "".join(docstr) + return fn - return docstring_decorator - - def add_end_docstrings(*docstr): - def docstring_decorator(fn): - return fn - - return docstring_decorator + return docstring_decorator def is_remote_url(url_or_filename): @@ -297,7 +280,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None): ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0]) if isinstance(user_agent, dict): ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items()) - elif isinstance(user_agent, six.string_types): + elif isinstance(user_agent, str): ua += "; " + user_agent headers = {"user-agent": ua} if resume_size > 0: diff --git a/src/transformers/hf_api.py b/src/transformers/hf_api.py index 9666282196..a6964737a4 100644 --- a/src/transformers/hf_api.py +++ b/src/transformers/hf_api.py @@ -20,7 +20,6 @@ from os.path import expanduser from typing import List import requests -import six from tqdm import tqdm @@ -160,11 +159,8 @@ class TqdmProgressFileReader: self.f = f self.total_size = os.fstat(f.fileno()).st_size # type: int self.pbar = tqdm(total=self.total_size, leave=False) - if six.PY3: - # does not work unless PY3 - # no big deal as the CLI does not currently support PY2 anyways. - self.read = f.read - f.read = self._read + self.read = f.read + f.read = self._read def _read(self, n=-1): self.pbar.update(n) @@ -182,16 +178,7 @@ class HfFolder: """ Save token, creating folder as needed. """ - if six.PY3: - os.makedirs(os.path.dirname(cls.path_token), exist_ok=True) - else: - # Python 2 - try: - os.makedirs(os.path.dirname(cls.path_token)) - except OSError as e: - if e.errno != os.errno.EEXIST: - raise e - pass + os.makedirs(os.path.dirname(cls.path_token), exist_ok=True) with open(cls.path_token, "w+") as f: f.write(token) diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 605d8b84e3..996521ac5c 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -26,7 +26,6 @@ from os.path import abspath, exists from typing import Dict, List, Optional, Tuple, Union import numpy as np -import six from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig from .configuration_utils import PretrainedConfig @@ -939,7 +938,7 @@ def pipeline( modelcard = config # Instantiate tokenizer if needed - if isinstance(tokenizer, six.string_types): + if isinstance(tokenizer, str): tokenizer = AutoTokenizer.from_pretrained(tokenizer) # Instantiate config if needed diff --git a/src/transformers/tokenization_albert.py b/src/transformers/tokenization_albert.py index 04f0bf00af..541ae7ae68 100644 --- a/src/transformers/tokenization_albert.py +++ b/src/transformers/tokenization_albert.py @@ -20,8 +20,6 @@ import os import unicodedata from shutil import copyfile -import six - from .tokenization_utils import PreTrainedTokenizer @@ -139,9 +137,6 @@ class AlbertTokenizer(PreTrainedTokenizer): outputs = inputs outputs = outputs.replace("``", '"').replace("''", '"') - if six.PY2 and isinstance(outputs, str): - outputs = outputs.decode("utf-8") - if not self.keep_accents: outputs = unicodedata.normalize("NFKD", outputs) outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) @@ -150,14 +145,9 @@ class AlbertTokenizer(PreTrainedTokenizer): return outputs - def _tokenize(self, text, return_unicode=True, sample=False): - """ Tokenize a string. - return_unicode is used only for py2 - """ + def _tokenize(self, text, sample=False): + """ Tokenize a string. """ text = self.preprocess_text(text) - # note(zhiliny): in some systems, sentencepiece only accepts str for py2 - if six.PY2 and isinstance(text, unicode): # noqa: F821 - text = text.encode("utf-8") if not sample: pieces = self.sp_model.EncodeAsPieces(text) @@ -177,27 +167,15 @@ class AlbertTokenizer(PreTrainedTokenizer): else: new_pieces.append(piece) - # note(zhiliny): convert back to unicode for py2 - if six.PY2 and return_unicode: - ret_pieces = [] - for piece in new_pieces: - if isinstance(piece, str): - piece = piece.decode("utf-8") - ret_pieces.append(piece) - new_pieces = ret_pieces - return new_pieces def _convert_token_to_id(self, token): - """ Converts a token (str/unicode) in an id using the vocab. """ + """ Converts a token (str) in an id using the vocab. """ return self.sp_model.PieceToId(token) - def _convert_id_to_token(self, index, return_unicode=True): - """Converts an index (integer) in a token (string/unicode) using the vocab.""" - token = self.sp_model.IdToPiece(index) - if six.PY2 and return_unicode and isinstance(token, str): - token = token.decode("utf-8") - return token + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index) def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py index ef8a7d8391..9a9b6626f1 100644 --- a/src/transformers/tokenization_bert.py +++ b/src/transformers/tokenization_bert.py @@ -202,11 +202,11 @@ class BertTokenizer(PreTrainedTokenizer): return split_tokens def _convert_token_to_id(self, token): - """ Converts a token (str/unicode) in an id using the vocab. """ + """ Converts a token (str) in an id using the vocab. """ return self.vocab.get(token, self.vocab.get(self.unk_token)) def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (string/unicode) using the vocab.""" + """Converts an index (integer) in a token (str) using the vocab.""" return self.ids_to_tokens.get(index, self.unk_token) def convert_tokens_to_string(self, tokens): diff --git a/src/transformers/tokenization_bert_japanese.py b/src/transformers/tokenization_bert_japanese.py index 893db031d4..57dde09734 100644 --- a/src/transformers/tokenization_bert_japanese.py +++ b/src/transformers/tokenization_bert_japanese.py @@ -20,8 +20,6 @@ import logging import os import unicodedata -import six - from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab @@ -194,10 +192,7 @@ class MecabTokenizer(object): never_split = self.never_split + (never_split if never_split is not None else []) tokens = [] - if six.PY2: - mecab_output = self.mecab.parse(text.encode("utf-8")).decode("utf-8") - else: - mecab_output = self.mecab.parse(text) + mecab_output = self.mecab.parse(text) cursor = 0 for line in mecab_output.split("\n"): diff --git a/src/transformers/tokenization_camembert.py b/src/transformers/tokenization_camembert.py index 09104bbbf0..dd33d510da 100644 --- a/src/transformers/tokenization_camembert.py +++ b/src/transformers/tokenization_camembert.py @@ -155,7 +155,7 @@ class CamembertTokenizer(PreTrainedTokenizer): return self.sp_model.EncodeAsPieces(text) def _convert_token_to_id(self, token): - """ Converts a token (str/unicode) in an id using the vocab. """ + """ Converts a token (str) in an id using the vocab. """ if token in self.fairseq_tokens_to_ids: return self.fairseq_tokens_to_ids[token] elif self.sp_model.PieceToId(token) == 0: @@ -164,7 +164,7 @@ class CamembertTokenizer(PreTrainedTokenizer): return self.fairseq_offset + self.sp_model.PieceToId(token) def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (string/unicode) using the vocab.""" + """Converts an index (integer) in a token (str) using the vocab.""" if index in self.fairseq_ids_to_tokens: return self.fairseq_ids_to_tokens[index] return self.sp_model.IdToPiece(index - self.fairseq_offset) diff --git a/src/transformers/tokenization_ctrl.py b/src/transformers/tokenization_ctrl.py index e40df10448..28873ba036 100644 --- a/src/transformers/tokenization_ctrl.py +++ b/src/transformers/tokenization_ctrl.py @@ -204,11 +204,11 @@ class CTRLTokenizer(PreTrainedTokenizer): return split_tokens def _convert_token_to_id(self, token): - """ Converts a token (str/unicode) in an id using the vocab. """ + """ Converts a token (str) in an id using the vocab. """ return self.encoder.get(token, self.encoder.get(self.unk_token)) def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (string/unicode) using the vocab.""" + """Converts an index (integer) in a token (str) using the vocab.""" return self.decoder.get(index, self.unk_token) def convert_tokens_to_string(self, tokens): diff --git a/src/transformers/tokenization_gpt2.py b/src/transformers/tokenization_gpt2.py index ff96b07945..44d615061f 100644 --- a/src/transformers/tokenization_gpt2.py +++ b/src/transformers/tokenization_gpt2.py @@ -224,11 +224,11 @@ class GPT2Tokenizer(PreTrainedTokenizer): return bpe_tokens def _convert_token_to_id(self, token): - """ Converts a token (str/unicode) in an id using the vocab. """ + """ Converts a token (str) in an id using the vocab. """ return self.encoder.get(token, self.encoder.get(self.unk_token)) def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (string/unicode) using the vocab.""" + """Converts an index (integer) in a token (str) using the vocab.""" return self.decoder.get(index) def convert_tokens_to_string(self, tokens): diff --git a/src/transformers/tokenization_openai.py b/src/transformers/tokenization_openai.py index 693a76377e..566bf14cb4 100644 --- a/src/transformers/tokenization_openai.py +++ b/src/transformers/tokenization_openai.py @@ -177,7 +177,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): return split_tokens def _convert_token_to_id(self, token): - """ Converts a token (str/unicode) in an id using the vocab. """ + """ Converts a token (str) in an id using the vocab. """ return self.encoder.get(token, self.encoder.get(self.unk_token)) def _convert_id_to_token(self, index): diff --git a/src/transformers/tokenization_t5.py b/src/transformers/tokenization_t5.py index d5a051107c..bd7a6e3144 100644 --- a/src/transformers/tokenization_t5.py +++ b/src/transformers/tokenization_t5.py @@ -20,8 +20,6 @@ import os import re from shutil import copyfile -import six - from .tokenization_utils import PreTrainedTokenizer @@ -137,41 +135,29 @@ class T5Tokenizer(PreTrainedTokenizer): self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) - def _tokenize(self, text, return_unicode=True, sample=False): + def _tokenize(self, text, sample=False): """ Take as input a string and return a list of strings (tokens) for words/sub-words """ if not sample: pieces = self.sp_model.EncodeAsPieces(text) else: pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) - - # convert back to unicode for py2 - if six.PY2 and return_unicode: - ret_pieces = [] - for piece in pieces: - if isinstance(piece, str): - piece = piece.decode("utf-8") - ret_pieces.append(piece) - pieces = ret_pieces - return pieces def _convert_token_to_id(self, token): - """ Converts a token (str/unicode) in an id using the vocab. """ + """ Converts a token (str) in an id using the vocab. """ if token.startswith("", token) num = int(match.group(1)) return self.vocab_size - num - 1 return self.sp_model.piece_to_id(token) - def _convert_id_to_token(self, index, return_unicode=True): - """Converts an index (integer) in a token (string/unicode) using the vocab.""" + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" if index < self.sp_model.get_piece_size(): token = self.sp_model.IdToPiece(index) else: token = "".format(self.vocab_size - 1 - index) - if six.PY2 and return_unicode and isinstance(token, str): - token = token.decode("utf-8") return token def convert_tokens_to_string(self, tokens): diff --git a/src/transformers/tokenization_transfo_xl.py b/src/transformers/tokenization_transfo_xl.py index 0ed96302d6..43cb03b73a 100644 --- a/src/transformers/tokenization_transfo_xl.py +++ b/src/transformers/tokenization_transfo_xl.py @@ -238,7 +238,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer): return self.idx2sym[idx] def _convert_token_to_id(self, sym): - """ Converts a token (str/unicode) in an id using the vocab. """ + """ Converts a token (str) in an id using the vocab. """ if sym in self.sym2idx: return self.sym2idx[sym] else: diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index e7f5d99182..6394090d24 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -23,8 +23,6 @@ import os import re from io import open -import six - from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available @@ -251,11 +249,9 @@ class PreTrainedTokenizer(object): for key, value in kwargs.items(): if key in self.SPECIAL_TOKENS_ATTRIBUTES: if key == "additional_special_tokens": - assert isinstance(value, (list, tuple)) and all( - isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value # noqa: F821 - ) + assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) else: - assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) # noqa: F821 + assert isinstance(value, str) setattr(self, key, value) @classmethod @@ -567,7 +563,7 @@ class PreTrainedTokenizer(object): to_add_tokens = [] for token in new_tokens: - assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode)) # noqa: F821 + assert isinstance(token, str) if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens: token = token.lower() if ( @@ -649,12 +645,10 @@ class PreTrainedTokenizer(object): for key, value in special_tokens_dict.items(): assert key in self.SPECIAL_TOKENS_ATTRIBUTES if key == "additional_special_tokens": - assert isinstance(value, (list, tuple)) and all( - isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value # noqa: F821 - ) + assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) added_tokens += self.add_tokens(value) else: - assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) # noqa: F821 + assert isinstance(value, str) added_tokens += self.add_tokens([value]) logger.info("Assigning %s to the %s key of the tokenizer", value, key) setattr(self, key, value) @@ -740,13 +734,13 @@ class PreTrainedTokenizer(object): raise NotImplementedError def convert_tokens_to_ids(self, tokens): - """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id + """ Converts a single token, or a sequence of tokens, (str) in a single integer id (resp. a sequence of ids), using the vocabulary. """ if tokens is None: return None - if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)): # noqa: F821 + if isinstance(tokens, str): return self._convert_token_to_id_with_added_voc(tokens) ids = [] @@ -901,9 +895,9 @@ class PreTrainedTokenizer(object): """ def get_input_ids(text): - if isinstance(text, six.string_types): + if isinstance(text, str): return self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) - elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types): + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): return self.convert_tokens_to_ids(text) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): return text @@ -1297,7 +1291,7 @@ class PreTrainedTokenizer(object): def convert_ids_to_tokens(self, ids, skip_special_tokens=False): """ Converts a single index or a sequence of indices (integers) in a token " - (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens. + (resp.) a sequence of tokens (str), using the vocabulary and added tokens. Args: skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False diff --git a/src/transformers/tokenization_xlm.py b/src/transformers/tokenization_xlm.py index ffc4c5679f..94e2054690 100644 --- a/src/transformers/tokenization_xlm.py +++ b/src/transformers/tokenization_xlm.py @@ -798,11 +798,11 @@ class XLMTokenizer(PreTrainedTokenizer): return split_tokens def _convert_token_to_id(self, token): - """ Converts a token (str/unicode) in an id using the vocab. """ + """ Converts a token (str) in an id using the vocab. """ return self.encoder.get(token, self.encoder.get(self.unk_token)) def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (string/unicode) using the vocab.""" + """Converts an index (integer) in a token (str) using the vocab.""" return self.decoder.get(index, self.unk_token) def convert_tokens_to_string(self, tokens): diff --git a/src/transformers/tokenization_xlm_roberta.py b/src/transformers/tokenization_xlm_roberta.py index 525a7f5c27..dde2382f8b 100644 --- a/src/transformers/tokenization_xlm_roberta.py +++ b/src/transformers/tokenization_xlm_roberta.py @@ -171,13 +171,13 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): return self.sp_model.EncodeAsPieces(text) def _convert_token_to_id(self, token): - """ Converts a token (str/unicode) in an id using the vocab. """ + """ Converts a token (str) in an id using the vocab. """ if token in self.fairseq_tokens_to_ids: return self.fairseq_tokens_to_ids[token] return self.sp_model.PieceToId(token) + self.fairseq_offset def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (string/unicode) using the vocab.""" + """Converts an index (integer) in a token (str) using the vocab.""" if index in self.fairseq_ids_to_tokens: return self.fairseq_ids_to_tokens[index] return self.sp_model.IdToPiece(index - self.fairseq_offset) diff --git a/src/transformers/tokenization_xlnet.py b/src/transformers/tokenization_xlnet.py index f8251b2a90..6d0a6d0f18 100644 --- a/src/transformers/tokenization_xlnet.py +++ b/src/transformers/tokenization_xlnet.py @@ -20,8 +20,6 @@ import os import unicodedata from shutil import copyfile -import six - from .tokenization_utils import PreTrainedTokenizer @@ -139,9 +137,6 @@ class XLNetTokenizer(PreTrainedTokenizer): outputs = inputs outputs = outputs.replace("``", '"').replace("''", '"') - if six.PY2 and isinstance(outputs, str): - outputs = outputs.decode("utf-8") - if not self.keep_accents: outputs = unicodedata.normalize("NFKD", outputs) outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) @@ -150,14 +145,9 @@ class XLNetTokenizer(PreTrainedTokenizer): return outputs - def _tokenize(self, text, return_unicode=True, sample=False): - """ Tokenize a string. - return_unicode is used only for py2 - """ + def _tokenize(self, text, sample=False): + """ Tokenize a string. """ text = self.preprocess_text(text) - # note(zhiliny): in some systems, sentencepiece only accepts str for py2 - if six.PY2 and isinstance(text, unicode): # noqa: F821 - text = text.encode("utf-8") if not sample: pieces = self.sp_model.EncodeAsPieces(text) @@ -177,27 +167,15 @@ class XLNetTokenizer(PreTrainedTokenizer): else: new_pieces.append(piece) - # note(zhiliny): convert back to unicode for py2 - if six.PY2 and return_unicode: - ret_pieces = [] - for piece in new_pieces: - if isinstance(piece, str): - piece = piece.decode("utf-8") - ret_pieces.append(piece) - new_pieces = ret_pieces - return new_pieces def _convert_token_to_id(self, token): - """ Converts a token (str/unicode) in an id using the vocab. """ + """ Converts a token (str) in an id using the vocab. """ return self.sp_model.PieceToId(token) - def _convert_id_to_token(self, index, return_unicode=True): - """Converts an index (integer) in a token (string/unicode) using the vocab.""" - token = self.sp_model.IdToPiece(index) - if six.PY2 and return_unicode and isinstance(token, str): - token = token.decode("utf-8") - return token + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index) def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py index af26102832..9c2fd2667b 100644 --- a/templates/adding_a_new_model/tokenization_xxx.py +++ b/templates/adding_a_new_model/tokenization_xxx.py @@ -145,11 +145,11 @@ class XxxTokenizer(PreTrainedTokenizer): return split_tokens def _convert_token_to_id(self, token): - """ Converts a token (str/unicode) in an id using the vocab. """ + """ Converts a token (str) in an id using the vocab. """ return self.vocab.get(token, self.vocab.get(self.unk_token)) def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (string/unicode) using the vocab.""" + """Converts an index (integer) in a token (str) using the vocab.""" return self.ids_to_tokens.get(index, self.unk_token) def convert_tokens_to_string(self, tokens): diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py index cec8b9879b..f8c89b103a 100644 --- a/tests/test_hf_api.py +++ b/tests/test_hf_api.py @@ -19,7 +19,6 @@ import time import unittest import requests -import six from requests.exceptions import HTTPError from transformers.hf_api import HfApi, HfFolder, PresignedUrl, S3Obj @@ -50,7 +49,7 @@ class HfApiLoginTest(HfApiCommonTest): def test_login_valid(self): token = self._api.login(username=USER, password=PASS) - self.assertIsInstance(token, six.string_types) + self.assertIsInstance(token, str) class HfApiEndpointsTest(HfApiCommonTest): @@ -74,7 +73,7 @@ class HfApiEndpointsTest(HfApiCommonTest): def test_presign_and_upload(self): for FILE_KEY, FILE_PATH in FILES: access_url = self._api.presign_and_upload(token=self._token, filename=FILE_KEY, filepath=FILE_PATH) - self.assertIsInstance(access_url, six.string_types) + self.assertIsInstance(access_url, str) with open(FILE_PATH, "r") as f: body = f.read() r = requests.get(access_url) diff --git a/tests/test_tokenization_utils.py b/tests/test_tokenization_utils.py index 11721f94c1..2909b4f9da 100644 --- a/tests/test_tokenization_utils.py +++ b/tests/test_tokenization_utils.py @@ -16,8 +16,6 @@ import unittest -import six - from transformers import PreTrainedTokenizer from transformers.tokenization_gpt2 import GPT2Tokenizer @@ -34,10 +32,7 @@ class TokenizerUtilsTest(unittest.TestCase): self.assertIsInstance(tokenizer, PreTrainedTokenizer) for special_tok in tokenizer.all_special_tokens: - if six.PY2: - self.assertIsInstance(special_tok, unicode) # noqa: F821 - else: - self.assertIsInstance(special_tok, str) + self.assertIsInstance(special_tok, str) special_tok_id = tokenizer.convert_tokens_to_ids(special_tok) self.assertIsInstance(special_tok_id, int)