Remove six.
This commit is contained in:
@@ -18,7 +18,6 @@ from io import open
|
|||||||
|
|
||||||
import boto3
|
import boto3
|
||||||
import requests
|
import requests
|
||||||
import six
|
|
||||||
from botocore.config import Config
|
from botocore.config import Config
|
||||||
from botocore.exceptions import ClientError
|
from botocore.exceptions import ClientError
|
||||||
from filelock import FileLock
|
from filelock import FileLock
|
||||||
@@ -107,8 +106,6 @@ def is_tf_available():
|
|||||||
return _tf_available
|
return _tf_available
|
||||||
|
|
||||||
|
|
||||||
if not six.PY2:
|
|
||||||
|
|
||||||
def add_start_docstrings(*docstr):
|
def add_start_docstrings(*docstr):
|
||||||
def docstring_decorator(fn):
|
def docstring_decorator(fn):
|
||||||
fn.__doc__ = "".join(docstr) + fn.__doc__
|
fn.__doc__ = "".join(docstr) + fn.__doc__
|
||||||
@@ -116,6 +113,7 @@ if not six.PY2:
|
|||||||
|
|
||||||
return docstring_decorator
|
return docstring_decorator
|
||||||
|
|
||||||
|
|
||||||
def add_end_docstrings(*docstr):
|
def add_end_docstrings(*docstr):
|
||||||
def docstring_decorator(fn):
|
def docstring_decorator(fn):
|
||||||
fn.__doc__ = fn.__doc__ + "".join(docstr)
|
fn.__doc__ = fn.__doc__ + "".join(docstr)
|
||||||
@@ -124,21 +122,6 @@ if not six.PY2:
|
|||||||
return docstring_decorator
|
return docstring_decorator
|
||||||
|
|
||||||
|
|
||||||
else:
|
|
||||||
# Not possible to update class docstrings on python2
|
|
||||||
def add_start_docstrings(*docstr):
|
|
||||||
def docstring_decorator(fn):
|
|
||||||
return fn
|
|
||||||
|
|
||||||
return docstring_decorator
|
|
||||||
|
|
||||||
def add_end_docstrings(*docstr):
|
|
||||||
def docstring_decorator(fn):
|
|
||||||
return fn
|
|
||||||
|
|
||||||
return docstring_decorator
|
|
||||||
|
|
||||||
|
|
||||||
def is_remote_url(url_or_filename):
|
def is_remote_url(url_or_filename):
|
||||||
parsed = urlparse(url_or_filename)
|
parsed = urlparse(url_or_filename)
|
||||||
return parsed.scheme in ("http", "https", "s3")
|
return parsed.scheme in ("http", "https", "s3")
|
||||||
@@ -297,7 +280,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None):
|
|||||||
ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
|
ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
|
||||||
if isinstance(user_agent, dict):
|
if isinstance(user_agent, dict):
|
||||||
ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
|
ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
|
||||||
elif isinstance(user_agent, six.string_types):
|
elif isinstance(user_agent, str):
|
||||||
ua += "; " + user_agent
|
ua += "; " + user_agent
|
||||||
headers = {"user-agent": ua}
|
headers = {"user-agent": ua}
|
||||||
if resume_size > 0:
|
if resume_size > 0:
|
||||||
|
|||||||
@@ -20,7 +20,6 @@ from os.path import expanduser
|
|||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import six
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
@@ -160,9 +159,6 @@ class TqdmProgressFileReader:
|
|||||||
self.f = f
|
self.f = f
|
||||||
self.total_size = os.fstat(f.fileno()).st_size # type: int
|
self.total_size = os.fstat(f.fileno()).st_size # type: int
|
||||||
self.pbar = tqdm(total=self.total_size, leave=False)
|
self.pbar = tqdm(total=self.total_size, leave=False)
|
||||||
if six.PY3:
|
|
||||||
# does not work unless PY3
|
|
||||||
# no big deal as the CLI does not currently support PY2 anyways.
|
|
||||||
self.read = f.read
|
self.read = f.read
|
||||||
f.read = self._read
|
f.read = self._read
|
||||||
|
|
||||||
@@ -182,16 +178,7 @@ class HfFolder:
|
|||||||
"""
|
"""
|
||||||
Save token, creating folder as needed.
|
Save token, creating folder as needed.
|
||||||
"""
|
"""
|
||||||
if six.PY3:
|
|
||||||
os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
|
os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
|
||||||
else:
|
|
||||||
# Python 2
|
|
||||||
try:
|
|
||||||
os.makedirs(os.path.dirname(cls.path_token))
|
|
||||||
except OSError as e:
|
|
||||||
if e.errno != os.errno.EEXIST:
|
|
||||||
raise e
|
|
||||||
pass
|
|
||||||
with open(cls.path_token, "w+") as f:
|
with open(cls.path_token, "w+") as f:
|
||||||
f.write(token)
|
f.write(token)
|
||||||
|
|
||||||
|
|||||||
@@ -26,7 +26,6 @@ from os.path import abspath, exists
|
|||||||
from typing import Dict, List, Optional, Tuple, Union
|
from typing import Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import six
|
|
||||||
|
|
||||||
from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
|
from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
|
||||||
from .configuration_utils import PretrainedConfig
|
from .configuration_utils import PretrainedConfig
|
||||||
@@ -939,7 +938,7 @@ def pipeline(
|
|||||||
modelcard = config
|
modelcard = config
|
||||||
|
|
||||||
# Instantiate tokenizer if needed
|
# Instantiate tokenizer if needed
|
||||||
if isinstance(tokenizer, six.string_types):
|
if isinstance(tokenizer, str):
|
||||||
tokenizer = AutoTokenizer.from_pretrained(tokenizer)
|
tokenizer = AutoTokenizer.from_pretrained(tokenizer)
|
||||||
|
|
||||||
# Instantiate config if needed
|
# Instantiate config if needed
|
||||||
|
|||||||
@@ -20,8 +20,6 @@ import os
|
|||||||
import unicodedata
|
import unicodedata
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
|
|
||||||
import six
|
|
||||||
|
|
||||||
from .tokenization_utils import PreTrainedTokenizer
|
from .tokenization_utils import PreTrainedTokenizer
|
||||||
|
|
||||||
|
|
||||||
@@ -139,9 +137,6 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
|||||||
outputs = inputs
|
outputs = inputs
|
||||||
outputs = outputs.replace("``", '"').replace("''", '"')
|
outputs = outputs.replace("``", '"').replace("''", '"')
|
||||||
|
|
||||||
if six.PY2 and isinstance(outputs, str):
|
|
||||||
outputs = outputs.decode("utf-8")
|
|
||||||
|
|
||||||
if not self.keep_accents:
|
if not self.keep_accents:
|
||||||
outputs = unicodedata.normalize("NFKD", outputs)
|
outputs = unicodedata.normalize("NFKD", outputs)
|
||||||
outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
|
outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
|
||||||
@@ -150,14 +145,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
def _tokenize(self, text, return_unicode=True, sample=False):
|
def _tokenize(self, text, sample=False):
|
||||||
""" Tokenize a string.
|
""" Tokenize a string. """
|
||||||
return_unicode is used only for py2
|
|
||||||
"""
|
|
||||||
text = self.preprocess_text(text)
|
text = self.preprocess_text(text)
|
||||||
# note(zhiliny): in some systems, sentencepiece only accepts str for py2
|
|
||||||
if six.PY2 and isinstance(text, unicode): # noqa: F821
|
|
||||||
text = text.encode("utf-8")
|
|
||||||
|
|
||||||
if not sample:
|
if not sample:
|
||||||
pieces = self.sp_model.EncodeAsPieces(text)
|
pieces = self.sp_model.EncodeAsPieces(text)
|
||||||
@@ -177,27 +167,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
|||||||
else:
|
else:
|
||||||
new_pieces.append(piece)
|
new_pieces.append(piece)
|
||||||
|
|
||||||
# note(zhiliny): convert back to unicode for py2
|
|
||||||
if six.PY2 and return_unicode:
|
|
||||||
ret_pieces = []
|
|
||||||
for piece in new_pieces:
|
|
||||||
if isinstance(piece, str):
|
|
||||||
piece = piece.decode("utf-8")
|
|
||||||
ret_pieces.append(piece)
|
|
||||||
new_pieces = ret_pieces
|
|
||||||
|
|
||||||
return new_pieces
|
return new_pieces
|
||||||
|
|
||||||
def _convert_token_to_id(self, token):
|
def _convert_token_to_id(self, token):
|
||||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
""" Converts a token (str) in an id using the vocab. """
|
||||||
return self.sp_model.PieceToId(token)
|
return self.sp_model.PieceToId(token)
|
||||||
|
|
||||||
def _convert_id_to_token(self, index, return_unicode=True):
|
def _convert_id_to_token(self, index):
|
||||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||||
token = self.sp_model.IdToPiece(index)
|
return self.sp_model.IdToPiece(index)
|
||||||
if six.PY2 and return_unicode and isinstance(token, str):
|
|
||||||
token = token.decode("utf-8")
|
|
||||||
return token
|
|
||||||
|
|
||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
|
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
|
||||||
|
|||||||
@@ -202,11 +202,11 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
return split_tokens
|
return split_tokens
|
||||||
|
|
||||||
def _convert_token_to_id(self, token):
|
def _convert_token_to_id(self, token):
|
||||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
""" Converts a token (str) in an id using the vocab. """
|
||||||
return self.vocab.get(token, self.vocab.get(self.unk_token))
|
return self.vocab.get(token, self.vocab.get(self.unk_token))
|
||||||
|
|
||||||
def _convert_id_to_token(self, index):
|
def _convert_id_to_token(self, index):
|
||||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||||
return self.ids_to_tokens.get(index, self.unk_token)
|
return self.ids_to_tokens.get(index, self.unk_token)
|
||||||
|
|
||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
|
|||||||
@@ -20,8 +20,6 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
||||||
import six
|
|
||||||
|
|
||||||
from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab
|
from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab
|
||||||
|
|
||||||
|
|
||||||
@@ -194,9 +192,6 @@ class MecabTokenizer(object):
|
|||||||
never_split = self.never_split + (never_split if never_split is not None else [])
|
never_split = self.never_split + (never_split if never_split is not None else [])
|
||||||
tokens = []
|
tokens = []
|
||||||
|
|
||||||
if six.PY2:
|
|
||||||
mecab_output = self.mecab.parse(text.encode("utf-8")).decode("utf-8")
|
|
||||||
else:
|
|
||||||
mecab_output = self.mecab.parse(text)
|
mecab_output = self.mecab.parse(text)
|
||||||
|
|
||||||
cursor = 0
|
cursor = 0
|
||||||
|
|||||||
@@ -155,7 +155,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
|||||||
return self.sp_model.EncodeAsPieces(text)
|
return self.sp_model.EncodeAsPieces(text)
|
||||||
|
|
||||||
def _convert_token_to_id(self, token):
|
def _convert_token_to_id(self, token):
|
||||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
""" Converts a token (str) in an id using the vocab. """
|
||||||
if token in self.fairseq_tokens_to_ids:
|
if token in self.fairseq_tokens_to_ids:
|
||||||
return self.fairseq_tokens_to_ids[token]
|
return self.fairseq_tokens_to_ids[token]
|
||||||
elif self.sp_model.PieceToId(token) == 0:
|
elif self.sp_model.PieceToId(token) == 0:
|
||||||
@@ -164,7 +164,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
|||||||
return self.fairseq_offset + self.sp_model.PieceToId(token)
|
return self.fairseq_offset + self.sp_model.PieceToId(token)
|
||||||
|
|
||||||
def _convert_id_to_token(self, index):
|
def _convert_id_to_token(self, index):
|
||||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||||
if index in self.fairseq_ids_to_tokens:
|
if index in self.fairseq_ids_to_tokens:
|
||||||
return self.fairseq_ids_to_tokens[index]
|
return self.fairseq_ids_to_tokens[index]
|
||||||
return self.sp_model.IdToPiece(index - self.fairseq_offset)
|
return self.sp_model.IdToPiece(index - self.fairseq_offset)
|
||||||
|
|||||||
@@ -204,11 +204,11 @@ class CTRLTokenizer(PreTrainedTokenizer):
|
|||||||
return split_tokens
|
return split_tokens
|
||||||
|
|
||||||
def _convert_token_to_id(self, token):
|
def _convert_token_to_id(self, token):
|
||||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
""" Converts a token (str) in an id using the vocab. """
|
||||||
return self.encoder.get(token, self.encoder.get(self.unk_token))
|
return self.encoder.get(token, self.encoder.get(self.unk_token))
|
||||||
|
|
||||||
def _convert_id_to_token(self, index):
|
def _convert_id_to_token(self, index):
|
||||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||||
return self.decoder.get(index, self.unk_token)
|
return self.decoder.get(index, self.unk_token)
|
||||||
|
|
||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
|
|||||||
@@ -224,11 +224,11 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
|||||||
return bpe_tokens
|
return bpe_tokens
|
||||||
|
|
||||||
def _convert_token_to_id(self, token):
|
def _convert_token_to_id(self, token):
|
||||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
""" Converts a token (str) in an id using the vocab. """
|
||||||
return self.encoder.get(token, self.encoder.get(self.unk_token))
|
return self.encoder.get(token, self.encoder.get(self.unk_token))
|
||||||
|
|
||||||
def _convert_id_to_token(self, index):
|
def _convert_id_to_token(self, index):
|
||||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||||
return self.decoder.get(index)
|
return self.decoder.get(index)
|
||||||
|
|
||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
|
|||||||
@@ -177,7 +177,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
|
|||||||
return split_tokens
|
return split_tokens
|
||||||
|
|
||||||
def _convert_token_to_id(self, token):
|
def _convert_token_to_id(self, token):
|
||||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
""" Converts a token (str) in an id using the vocab. """
|
||||||
return self.encoder.get(token, self.encoder.get(self.unk_token))
|
return self.encoder.get(token, self.encoder.get(self.unk_token))
|
||||||
|
|
||||||
def _convert_id_to_token(self, index):
|
def _convert_id_to_token(self, index):
|
||||||
|
|||||||
@@ -20,8 +20,6 @@ import os
|
|||||||
import re
|
import re
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
|
|
||||||
import six
|
|
||||||
|
|
||||||
from .tokenization_utils import PreTrainedTokenizer
|
from .tokenization_utils import PreTrainedTokenizer
|
||||||
|
|
||||||
|
|
||||||
@@ -137,41 +135,29 @@ class T5Tokenizer(PreTrainedTokenizer):
|
|||||||
self.sp_model = spm.SentencePieceProcessor()
|
self.sp_model = spm.SentencePieceProcessor()
|
||||||
self.sp_model.Load(self.vocab_file)
|
self.sp_model.Load(self.vocab_file)
|
||||||
|
|
||||||
def _tokenize(self, text, return_unicode=True, sample=False):
|
def _tokenize(self, text, sample=False):
|
||||||
""" Take as input a string and return a list of strings (tokens) for words/sub-words
|
""" Take as input a string and return a list of strings (tokens) for words/sub-words
|
||||||
"""
|
"""
|
||||||
if not sample:
|
if not sample:
|
||||||
pieces = self.sp_model.EncodeAsPieces(text)
|
pieces = self.sp_model.EncodeAsPieces(text)
|
||||||
else:
|
else:
|
||||||
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
|
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
|
||||||
|
|
||||||
# convert back to unicode for py2
|
|
||||||
if six.PY2 and return_unicode:
|
|
||||||
ret_pieces = []
|
|
||||||
for piece in pieces:
|
|
||||||
if isinstance(piece, str):
|
|
||||||
piece = piece.decode("utf-8")
|
|
||||||
ret_pieces.append(piece)
|
|
||||||
pieces = ret_pieces
|
|
||||||
|
|
||||||
return pieces
|
return pieces
|
||||||
|
|
||||||
def _convert_token_to_id(self, token):
|
def _convert_token_to_id(self, token):
|
||||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
""" Converts a token (str) in an id using the vocab. """
|
||||||
if token.startswith("<extra_id_"):
|
if token.startswith("<extra_id_"):
|
||||||
match = re.match(r"<extra_id_(\d+)>", token)
|
match = re.match(r"<extra_id_(\d+)>", token)
|
||||||
num = int(match.group(1))
|
num = int(match.group(1))
|
||||||
return self.vocab_size - num - 1
|
return self.vocab_size - num - 1
|
||||||
return self.sp_model.piece_to_id(token)
|
return self.sp_model.piece_to_id(token)
|
||||||
|
|
||||||
def _convert_id_to_token(self, index, return_unicode=True):
|
def _convert_id_to_token(self, index):
|
||||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||||
if index < self.sp_model.get_piece_size():
|
if index < self.sp_model.get_piece_size():
|
||||||
token = self.sp_model.IdToPiece(index)
|
token = self.sp_model.IdToPiece(index)
|
||||||
else:
|
else:
|
||||||
token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
|
token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
|
||||||
if six.PY2 and return_unicode and isinstance(token, str):
|
|
||||||
token = token.decode("utf-8")
|
|
||||||
return token
|
return token
|
||||||
|
|
||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
|
|||||||
@@ -238,7 +238,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
|||||||
return self.idx2sym[idx]
|
return self.idx2sym[idx]
|
||||||
|
|
||||||
def _convert_token_to_id(self, sym):
|
def _convert_token_to_id(self, sym):
|
||||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
""" Converts a token (str) in an id using the vocab. """
|
||||||
if sym in self.sym2idx:
|
if sym in self.sym2idx:
|
||||||
return self.sym2idx[sym]
|
return self.sym2idx[sym]
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -23,8 +23,6 @@ import os
|
|||||||
import re
|
import re
|
||||||
from io import open
|
from io import open
|
||||||
|
|
||||||
import six
|
|
||||||
|
|
||||||
from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available
|
from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available
|
||||||
|
|
||||||
|
|
||||||
@@ -251,11 +249,9 @@ class PreTrainedTokenizer(object):
|
|||||||
for key, value in kwargs.items():
|
for key, value in kwargs.items():
|
||||||
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
|
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
|
||||||
if key == "additional_special_tokens":
|
if key == "additional_special_tokens":
|
||||||
assert isinstance(value, (list, tuple)) and all(
|
assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
|
||||||
isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value # noqa: F821
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) # noqa: F821
|
assert isinstance(value, str)
|
||||||
setattr(self, key, value)
|
setattr(self, key, value)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -567,7 +563,7 @@ class PreTrainedTokenizer(object):
|
|||||||
|
|
||||||
to_add_tokens = []
|
to_add_tokens = []
|
||||||
for token in new_tokens:
|
for token in new_tokens:
|
||||||
assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode)) # noqa: F821
|
assert isinstance(token, str)
|
||||||
if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens:
|
if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens:
|
||||||
token = token.lower()
|
token = token.lower()
|
||||||
if (
|
if (
|
||||||
@@ -649,12 +645,10 @@ class PreTrainedTokenizer(object):
|
|||||||
for key, value in special_tokens_dict.items():
|
for key, value in special_tokens_dict.items():
|
||||||
assert key in self.SPECIAL_TOKENS_ATTRIBUTES
|
assert key in self.SPECIAL_TOKENS_ATTRIBUTES
|
||||||
if key == "additional_special_tokens":
|
if key == "additional_special_tokens":
|
||||||
assert isinstance(value, (list, tuple)) and all(
|
assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
|
||||||
isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value # noqa: F821
|
|
||||||
)
|
|
||||||
added_tokens += self.add_tokens(value)
|
added_tokens += self.add_tokens(value)
|
||||||
else:
|
else:
|
||||||
assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) # noqa: F821
|
assert isinstance(value, str)
|
||||||
added_tokens += self.add_tokens([value])
|
added_tokens += self.add_tokens([value])
|
||||||
logger.info("Assigning %s to the %s key of the tokenizer", value, key)
|
logger.info("Assigning %s to the %s key of the tokenizer", value, key)
|
||||||
setattr(self, key, value)
|
setattr(self, key, value)
|
||||||
@@ -740,13 +734,13 @@ class PreTrainedTokenizer(object):
|
|||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def convert_tokens_to_ids(self, tokens):
|
def convert_tokens_to_ids(self, tokens):
|
||||||
""" Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
|
""" Converts a single token, or a sequence of tokens, (str) in a single integer id
|
||||||
(resp. a sequence of ids), using the vocabulary.
|
(resp. a sequence of ids), using the vocabulary.
|
||||||
"""
|
"""
|
||||||
if tokens is None:
|
if tokens is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)): # noqa: F821
|
if isinstance(tokens, str):
|
||||||
return self._convert_token_to_id_with_added_voc(tokens)
|
return self._convert_token_to_id_with_added_voc(tokens)
|
||||||
|
|
||||||
ids = []
|
ids = []
|
||||||
@@ -901,9 +895,9 @@ class PreTrainedTokenizer(object):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def get_input_ids(text):
|
def get_input_ids(text):
|
||||||
if isinstance(text, six.string_types):
|
if isinstance(text, str):
|
||||||
return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
|
return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
|
||||||
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types):
|
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
|
||||||
return self.convert_tokens_to_ids(text)
|
return self.convert_tokens_to_ids(text)
|
||||||
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
|
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
|
||||||
return text
|
return text
|
||||||
@@ -1297,7 +1291,7 @@ class PreTrainedTokenizer(object):
|
|||||||
|
|
||||||
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
|
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
|
||||||
""" Converts a single index or a sequence of indices (integers) in a token "
|
""" Converts a single index or a sequence of indices (integers) in a token "
|
||||||
(resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
|
(resp.) a sequence of tokens (str), using the vocabulary and added tokens.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
|
skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
|
||||||
|
|||||||
@@ -798,11 +798,11 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
return split_tokens
|
return split_tokens
|
||||||
|
|
||||||
def _convert_token_to_id(self, token):
|
def _convert_token_to_id(self, token):
|
||||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
""" Converts a token (str) in an id using the vocab. """
|
||||||
return self.encoder.get(token, self.encoder.get(self.unk_token))
|
return self.encoder.get(token, self.encoder.get(self.unk_token))
|
||||||
|
|
||||||
def _convert_id_to_token(self, index):
|
def _convert_id_to_token(self, index):
|
||||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||||
return self.decoder.get(index, self.unk_token)
|
return self.decoder.get(index, self.unk_token)
|
||||||
|
|
||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
|
|||||||
@@ -171,13 +171,13 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
|
|||||||
return self.sp_model.EncodeAsPieces(text)
|
return self.sp_model.EncodeAsPieces(text)
|
||||||
|
|
||||||
def _convert_token_to_id(self, token):
|
def _convert_token_to_id(self, token):
|
||||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
""" Converts a token (str) in an id using the vocab. """
|
||||||
if token in self.fairseq_tokens_to_ids:
|
if token in self.fairseq_tokens_to_ids:
|
||||||
return self.fairseq_tokens_to_ids[token]
|
return self.fairseq_tokens_to_ids[token]
|
||||||
return self.sp_model.PieceToId(token) + self.fairseq_offset
|
return self.sp_model.PieceToId(token) + self.fairseq_offset
|
||||||
|
|
||||||
def _convert_id_to_token(self, index):
|
def _convert_id_to_token(self, index):
|
||||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||||
if index in self.fairseq_ids_to_tokens:
|
if index in self.fairseq_ids_to_tokens:
|
||||||
return self.fairseq_ids_to_tokens[index]
|
return self.fairseq_ids_to_tokens[index]
|
||||||
return self.sp_model.IdToPiece(index - self.fairseq_offset)
|
return self.sp_model.IdToPiece(index - self.fairseq_offset)
|
||||||
|
|||||||
@@ -20,8 +20,6 @@ import os
|
|||||||
import unicodedata
|
import unicodedata
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
|
|
||||||
import six
|
|
||||||
|
|
||||||
from .tokenization_utils import PreTrainedTokenizer
|
from .tokenization_utils import PreTrainedTokenizer
|
||||||
|
|
||||||
|
|
||||||
@@ -139,9 +137,6 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
outputs = inputs
|
outputs = inputs
|
||||||
outputs = outputs.replace("``", '"').replace("''", '"')
|
outputs = outputs.replace("``", '"').replace("''", '"')
|
||||||
|
|
||||||
if six.PY2 and isinstance(outputs, str):
|
|
||||||
outputs = outputs.decode("utf-8")
|
|
||||||
|
|
||||||
if not self.keep_accents:
|
if not self.keep_accents:
|
||||||
outputs = unicodedata.normalize("NFKD", outputs)
|
outputs = unicodedata.normalize("NFKD", outputs)
|
||||||
outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
|
outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
|
||||||
@@ -150,14 +145,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
def _tokenize(self, text, return_unicode=True, sample=False):
|
def _tokenize(self, text, sample=False):
|
||||||
""" Tokenize a string.
|
""" Tokenize a string. """
|
||||||
return_unicode is used only for py2
|
|
||||||
"""
|
|
||||||
text = self.preprocess_text(text)
|
text = self.preprocess_text(text)
|
||||||
# note(zhiliny): in some systems, sentencepiece only accepts str for py2
|
|
||||||
if six.PY2 and isinstance(text, unicode): # noqa: F821
|
|
||||||
text = text.encode("utf-8")
|
|
||||||
|
|
||||||
if not sample:
|
if not sample:
|
||||||
pieces = self.sp_model.EncodeAsPieces(text)
|
pieces = self.sp_model.EncodeAsPieces(text)
|
||||||
@@ -177,27 +167,15 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
else:
|
else:
|
||||||
new_pieces.append(piece)
|
new_pieces.append(piece)
|
||||||
|
|
||||||
# note(zhiliny): convert back to unicode for py2
|
|
||||||
if six.PY2 and return_unicode:
|
|
||||||
ret_pieces = []
|
|
||||||
for piece in new_pieces:
|
|
||||||
if isinstance(piece, str):
|
|
||||||
piece = piece.decode("utf-8")
|
|
||||||
ret_pieces.append(piece)
|
|
||||||
new_pieces = ret_pieces
|
|
||||||
|
|
||||||
return new_pieces
|
return new_pieces
|
||||||
|
|
||||||
def _convert_token_to_id(self, token):
|
def _convert_token_to_id(self, token):
|
||||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
""" Converts a token (str) in an id using the vocab. """
|
||||||
return self.sp_model.PieceToId(token)
|
return self.sp_model.PieceToId(token)
|
||||||
|
|
||||||
def _convert_id_to_token(self, index, return_unicode=True):
|
def _convert_id_to_token(self, index):
|
||||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||||
token = self.sp_model.IdToPiece(index)
|
return self.sp_model.IdToPiece(index)
|
||||||
if six.PY2 and return_unicode and isinstance(token, str):
|
|
||||||
token = token.decode("utf-8")
|
|
||||||
return token
|
|
||||||
|
|
||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
|
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
|
||||||
|
|||||||
@@ -145,11 +145,11 @@ class XxxTokenizer(PreTrainedTokenizer):
|
|||||||
return split_tokens
|
return split_tokens
|
||||||
|
|
||||||
def _convert_token_to_id(self, token):
|
def _convert_token_to_id(self, token):
|
||||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
""" Converts a token (str) in an id using the vocab. """
|
||||||
return self.vocab.get(token, self.vocab.get(self.unk_token))
|
return self.vocab.get(token, self.vocab.get(self.unk_token))
|
||||||
|
|
||||||
def _convert_id_to_token(self, index):
|
def _convert_id_to_token(self, index):
|
||||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||||
return self.ids_to_tokens.get(index, self.unk_token)
|
return self.ids_to_tokens.get(index, self.unk_token)
|
||||||
|
|
||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
|
|||||||
@@ -19,7 +19,6 @@ import time
|
|||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import six
|
|
||||||
from requests.exceptions import HTTPError
|
from requests.exceptions import HTTPError
|
||||||
|
|
||||||
from transformers.hf_api import HfApi, HfFolder, PresignedUrl, S3Obj
|
from transformers.hf_api import HfApi, HfFolder, PresignedUrl, S3Obj
|
||||||
@@ -50,7 +49,7 @@ class HfApiLoginTest(HfApiCommonTest):
|
|||||||
|
|
||||||
def test_login_valid(self):
|
def test_login_valid(self):
|
||||||
token = self._api.login(username=USER, password=PASS)
|
token = self._api.login(username=USER, password=PASS)
|
||||||
self.assertIsInstance(token, six.string_types)
|
self.assertIsInstance(token, str)
|
||||||
|
|
||||||
|
|
||||||
class HfApiEndpointsTest(HfApiCommonTest):
|
class HfApiEndpointsTest(HfApiCommonTest):
|
||||||
@@ -74,7 +73,7 @@ class HfApiEndpointsTest(HfApiCommonTest):
|
|||||||
def test_presign_and_upload(self):
|
def test_presign_and_upload(self):
|
||||||
for FILE_KEY, FILE_PATH in FILES:
|
for FILE_KEY, FILE_PATH in FILES:
|
||||||
access_url = self._api.presign_and_upload(token=self._token, filename=FILE_KEY, filepath=FILE_PATH)
|
access_url = self._api.presign_and_upload(token=self._token, filename=FILE_KEY, filepath=FILE_PATH)
|
||||||
self.assertIsInstance(access_url, six.string_types)
|
self.assertIsInstance(access_url, str)
|
||||||
with open(FILE_PATH, "r") as f:
|
with open(FILE_PATH, "r") as f:
|
||||||
body = f.read()
|
body = f.read()
|
||||||
r = requests.get(access_url)
|
r = requests.get(access_url)
|
||||||
|
|||||||
@@ -16,8 +16,6 @@
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
import six
|
|
||||||
|
|
||||||
from transformers import PreTrainedTokenizer
|
from transformers import PreTrainedTokenizer
|
||||||
from transformers.tokenization_gpt2 import GPT2Tokenizer
|
from transformers.tokenization_gpt2 import GPT2Tokenizer
|
||||||
|
|
||||||
@@ -34,9 +32,6 @@ class TokenizerUtilsTest(unittest.TestCase):
|
|||||||
self.assertIsInstance(tokenizer, PreTrainedTokenizer)
|
self.assertIsInstance(tokenizer, PreTrainedTokenizer)
|
||||||
|
|
||||||
for special_tok in tokenizer.all_special_tokens:
|
for special_tok in tokenizer.all_special_tokens:
|
||||||
if six.PY2:
|
|
||||||
self.assertIsInstance(special_tok, unicode) # noqa: F821
|
|
||||||
else:
|
|
||||||
self.assertIsInstance(special_tok, str)
|
self.assertIsInstance(special_tok, str)
|
||||||
special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
|
special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
|
||||||
self.assertIsInstance(special_tok_id, int)
|
self.assertIsInstance(special_tok_id, int)
|
||||||
|
|||||||
Reference in New Issue
Block a user