tokenization abstract class - tests for examples

This commit is contained in:
thomwolf
2019-07-05 15:02:59 +02:00
parent a4f980547f
commit 36bca545ff
33 changed files with 815 additions and 566 deletions

View File

@@ -27,15 +27,24 @@ import unicodedata
import six
from .file_utils import cached_path
from .model_utils import clean_up_tokenization
from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'xlnet-large-cased': 512,
}
VOCAB_NAME = 'spiece.model'
SPECIAL_TOKENS_NAME = 'special_tokens.txt'
SPIECE_UNDERLINE = u''
@@ -46,7 +55,7 @@ SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4
class XLNetTokenizer(object):
class XLNetTokenizer(PreTrainedTokenizer):
"""
SentencePiece based tokenizer. Peculiarities:
- requires SentencePiece: https://github.com/google/sentencepiece
@@ -63,64 +72,11 @@ class XLNetTokenizer(object):
"<eod>" : 7,
"<eop>" : 8,
}
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
special_tokens_file = None
if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
logger.warning("The pre-trained model you are loading is a cased model but you have not set "
"`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
"you may want to check this behavior.")
kwargs['do_lower_case'] = False
elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):
logger.warning("The pre-trained model you are loading is an uncased model but you have set "
"`do_lower_case` to False. We are setting `do_lower_case=True` for you "
"but you may want to check this behavior.")
kwargs['do_lower_case'] = True
else:
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
if not os.path.exists(special_tokens_file):
special_tokens_file = None
else:
logger.info("loading special tokens file {}".format(special_tokens_file))
# redirect to the cache, if necessary
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
except EnvironmentError:
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download vocabulary.".format(
vocab_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {}"
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
vocab_file))
return None
if resolved_vocab_file == vocab_file:
logger.info("loading vocabulary file {}".format(vocab_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
# Instantiate tokenizer.
if special_tokens_file and 'special_tokens' not in kwargs:
special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
else:
special_tokens = kwargs.pop('special_tokens', [])
tokenizer = cls(resolved_vocab_file, special_tokens=special_tokens, *inputs, **kwargs)
return tokenizer
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file, special_tokens=None, max_len=None,
def __init__(self, vocab_file, max_len=None,
do_lower_case=False, remove_space=True, keep_accents=False):
try:
import sentencepiece as spm
@@ -136,9 +92,6 @@ class XLNetTokenizer(object):
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(vocab_file)
self.special_tokens = {}
self.special_tokens_decoder = {}
self.set_special_tokens(special_tokens)
@property
def UNK_TOKEN(self):
@@ -181,7 +134,7 @@ class XLNetTokenizer(object):
return self.special_symbols["<mask>"]
def __len__(self):
return len(self.encoder) + len(self.special_tokens)
return len(self.sp_model)
def __getstate__(self):
state = self.__dict__.copy()
@@ -198,19 +151,6 @@ class XLNetTokenizer(object):
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(self.vocab_file)
def set_special_tokens(self, special_tokens):
""" Add a list of additional tokens to the encoder.
The additional tokens are indexed starting from the last index of the
current vocabulary in the order of the `special_tokens` list.
"""
if not special_tokens:
self.special_tokens = {}
self.special_tokens_decoder = {}
return
self.special_tokens = dict((tok, len(self.sp_model) + i) for i, tok in enumerate(special_tokens))
self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
logger.info("Special tokens: %s", str(self.special_tokens))
def preprocess_text(self, inputs):
if self.remove_space:
outputs = ' '.join(inputs.strip().split())
@@ -272,15 +212,9 @@ class XLNetTokenizer(object):
""" Converts a sequence of tokens into ids using the vocab. """
ids = []
if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
if tokens in self.special_tokens:
return self.special_tokens[tokens]
else:
return self.sp_model.PieceToId(tokens)
return self.sp_model.PieceToId(tokens)
for token in tokens:
if token in self.special_tokens:
ids.append(self.special_tokens[token])
else:
ids.append(self.sp_model.PieceToId(token))
ids.append(self.sp_model.PieceToId(token))
if len(ids) > self.max_len:
logger.warning(
"Token indices sequence length is longer than the specified maximum "
@@ -289,15 +223,11 @@ class XLNetTokenizer(object):
)
return ids
def convert_ids_to_tokens(self, ids, return_unicode=True, skip_special_tokens=False):
def convert_ids_to_tokens(self, ids, return_unicode=True):
"""Converts a sequence of ids in tokens."""
tokens = []
for i in ids:
if i in self.special_tokens_decoder:
if not skip_special_tokens:
tokens.append(self.special_tokens_decoder[i])
else:
tokens.append(self.sp_model.IdToPiece(i))
tokens.append(self.sp_model.IdToPiece(i))
if six.PY2 and return_unicode:
ret_pieces = []
@@ -311,9 +241,9 @@ class XLNetTokenizer(object):
def encode(self, text, sample=False):
return self.convert_tokens_to_ids(self.tokenize(text, sample=sample))
def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
def decode(self, ids, clean_up_tokenization_spaces=True):
"""Converts a sequence of ids in a string."""
tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
tokens = self.convert_ids_to_tokens(ids)
out_string = ''.join(tokens)
if clean_up_tokenization_spaces:
out_string = out_string.strip().replace('<unk>', '')
@@ -328,18 +258,7 @@ class XLNetTokenizer(object):
logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
return
out_vocab_file = os.path.join(vocab_path, VOCAB_NAME)
special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
copyfile(self.vocab_file, out_vocab_file)
index = len(self.sp_model)
with open(special_tokens_file, 'w', encoding='utf-8') as writer:
for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(special_tokens_file))
index = token_index
writer.write(token + u'\n')
index += 1
return out_vocab_file, special_tokens_file
return (out_vocab_file,)