Compare commits
15 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b0892fa0e8 | ||
|
|
f1e2e423ab | ||
|
|
5787e4c159 | ||
|
|
21f28c34b7 | ||
|
|
9d9b872b66 | ||
|
|
d6b0b9d451 | ||
|
|
7833b21a5a | ||
|
|
c473484087 | ||
|
|
1bbc28bee7 | ||
|
|
1bc13697b1 | ||
|
|
b2309cc6bf | ||
|
|
7ecff0ccbb | ||
|
|
58cca47c16 | ||
|
|
991172922f | ||
|
|
b58a15a31e |
@@ -26,7 +26,7 @@ author = u'huggingface'
|
||||
# The short X.Y version
|
||||
version = u''
|
||||
# The full version, including alpha/beta/rc tags
|
||||
release = u'3.0.0'
|
||||
release = u'3.0.2'
|
||||
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
||||
@@ -39,7 +39,7 @@ of the specified model are used to initialize the model. The
|
||||
library also includes a number of task-specific final layers or 'heads' whose
|
||||
weights are instantiated randomly when not present in the specified
|
||||
pre-trained model. For example, instantiating a model with
|
||||
``BertForSequenceClassification.from_pretrained('bert-base-uncased', num_classes=2)``
|
||||
``BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)``
|
||||
will create a BERT model instance with encoder weights copied from the
|
||||
``bert-base-uncased`` model and a randomly initialized sequence
|
||||
classification head on top of the encoder with an output size of 2. Models
|
||||
@@ -272,7 +272,7 @@ optimize.
|
||||
:func:`~transformers.Trainer` uses a built-in default function to collate
|
||||
batches and prepare them to be fed into the model. If needed, you can also
|
||||
use the ``data_collator`` argument to pass your own collator function which
|
||||
takes in the data in the format provides by your dataset and returns a
|
||||
takes in the data in the format provided by your dataset and returns a
|
||||
batch ready to be fed into the model. Note that
|
||||
:func:`~transformers.TFTrainer` expects the passed datasets to be dataset
|
||||
objects from ``tensorflow_datasets``.
|
||||
|
||||
@@ -214,8 +214,14 @@ def main():
|
||||
if requires_preprocessing:
|
||||
prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type)
|
||||
preprocessed_prompt_text = prepare_input(args, model, tokenizer, prompt_text)
|
||||
|
||||
if model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
|
||||
tokenizer_kwargs = {"add_space_before_punct_symbol": True}
|
||||
else:
|
||||
tokenizer_kwargs = {}
|
||||
|
||||
encoded_prompt = tokenizer.encode(
|
||||
preprocessed_prompt_text, add_special_tokens=False, return_tensors="pt", add_space_before_punct_symbol=True
|
||||
preprocessed_prompt_text, add_special_tokens=False, return_tensors="pt", **tokenizer_kwargs
|
||||
)
|
||||
else:
|
||||
encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
|
||||
|
||||
4
setup.py
4
setup.py
@@ -101,7 +101,7 @@ extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3<1", "sci
|
||||
|
||||
setup(
|
||||
name="transformers",
|
||||
version="3.0.1",
|
||||
version="3.0.2",
|
||||
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
|
||||
author_email="thomas@huggingface.co",
|
||||
description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
|
||||
@@ -114,7 +114,7 @@ setup(
|
||||
packages=find_packages("src"),
|
||||
install_requires=[
|
||||
"numpy",
|
||||
"tokenizers == 0.8.0-rc4",
|
||||
"tokenizers == 0.8.1.rc1",
|
||||
# dataclasses for Python versions that don't have it
|
||||
"dataclasses;python_version<'3.7'",
|
||||
# utilities from PyPA to e.g. compare versions
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
__version__ = "3.0.1"
|
||||
__version__ = "3.0.2"
|
||||
|
||||
# Work around to update TensorFlow's absl.logging threshold which alters the
|
||||
# default Python logging output behavior when present.
|
||||
|
||||
@@ -71,10 +71,10 @@ from transformers import (
|
||||
XLMRobertaConfig,
|
||||
XLNetConfig,
|
||||
cached_path,
|
||||
hf_bucket_url,
|
||||
is_torch_available,
|
||||
load_pytorch_checkpoint_in_tf2_model,
|
||||
)
|
||||
from transformers.file_utils import hf_bucket_url
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
|
||||
@@ -347,6 +347,10 @@ class TFGenerationMixin:
|
||||
encoder_outputs = None
|
||||
cur_len = shape_list(input_ids)[-1]
|
||||
|
||||
assert (
|
||||
cur_len < max_length
|
||||
), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`"
|
||||
|
||||
if num_beams > 1:
|
||||
output = self._generate_beam_search(
|
||||
input_ids,
|
||||
|
||||
@@ -428,6 +428,10 @@ class GenerationMixin:
|
||||
encoder_outputs = None
|
||||
cur_len = input_ids.shape[-1]
|
||||
|
||||
assert (
|
||||
cur_len < max_length
|
||||
), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`"
|
||||
|
||||
if num_beams > 1:
|
||||
output = self._generate_beam_search(
|
||||
input_ids,
|
||||
|
||||
@@ -478,7 +478,7 @@ class TFT5Block(tf.keras.layers.Layer):
|
||||
return outputs # hidden-states, present_key_value_states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
|
||||
|
||||
|
||||
class _NoLayerEmbedTokens(object):
|
||||
class _NoLayerEmbedTokens:
|
||||
"""
|
||||
this class wraps the TFSharedEmbeddingTokens layer into a python 'no-keras-layer'
|
||||
class to avoid problems with weight restoring. Also it makes sure that the layer is
|
||||
@@ -655,7 +655,7 @@ class TFT5MainLayer(tf.keras.layers.Layer):
|
||||
# Since we are adding it to the raw scores before the softmax, this is
|
||||
# effectively the same as removing these entirely.
|
||||
|
||||
# T5 has a mask that can compare sequence ids, we can simulate this here with this transposistion
|
||||
# T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
|
||||
# Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
|
||||
# extended_attention_mask = tf.math.equal(extended_attention_mask,
|
||||
# tf.transpose(extended_attention_mask, perm=(-1, -2)))
|
||||
@@ -682,16 +682,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
|
||||
else:
|
||||
encoder_extended_attention_mask = None
|
||||
|
||||
# Prepare head mask if needed
|
||||
# 1.0 in head_mask indicate we keep the head
|
||||
# attention_probs has shape bsz x n_heads x N x N
|
||||
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
|
||||
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
|
||||
if head_mask is not None:
|
||||
raise NotImplementedError
|
||||
else:
|
||||
head_mask = [None] * self.num_hidden_layers
|
||||
# head_mask = tf.constant([0] * self.num_hidden_layers)
|
||||
assert head_mask is None, "Head mask not supported"
|
||||
head_mask = [None] * self.num_hidden_layers
|
||||
|
||||
present_key_value_states = ()
|
||||
all_hidden_states = ()
|
||||
@@ -1054,8 +1046,6 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
|
||||
r"""
|
||||
Returns:
|
||||
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs:
|
||||
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided):
|
||||
Classification loss (cross entropy).
|
||||
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``):
|
||||
|
||||
@@ -606,7 +606,7 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
|
||||
mask_token="[MASK]",
|
||||
clean_text=True,
|
||||
tokenize_chinese_chars=True,
|
||||
strip_accents=True,
|
||||
strip_accents=None,
|
||||
wordpieces_prefix="##",
|
||||
**kwargs
|
||||
):
|
||||
|
||||
@@ -102,17 +102,26 @@ def get_pairs(word):
|
||||
|
||||
class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
"""
|
||||
GPT-2 BPE tokenizer. Peculiarities:
|
||||
GPT-2 BPE tokenizer, using byte-level Byte-Pair-Encoding.
|
||||
|
||||
- Byte-level Byte-Pair-Encoding
|
||||
- Requires a space to start the input string => the encoding methods should be called with the
|
||||
``add_prefix_space`` flag set to ``True``.
|
||||
Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
|
||||
the absence of a space at the beginning of a string:
|
||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||
|
||||
::
|
||||
|
||||
tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
|
||||
>>> from transformers import GPT2Tokenizer
|
||||
>>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
|
||||
>>> tokenizer("Hello world")['input_ids']
|
||||
[15496, 995]
|
||||
>>> tokenizer(" Hello world")['input_ids']
|
||||
[18435, 995]
|
||||
|
||||
You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
|
||||
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
|
||||
|
||||
.. note::
|
||||
|
||||
When used with ``is_pretokenized=True``, this tokenizer will add a space before each word (even the first one).
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
||||
should refer to the superclass for more information regarding methods.
|
||||
@@ -137,6 +146,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
model_input_names = ["attention_mask"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -287,21 +297,30 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
|
||||
class GPT2TokenizerFast(PreTrainedTokenizerFast):
|
||||
"""
|
||||
Constructs a "Fast" GPT-2 BPE tokenizer (backed by HuggingFace's `tokenizers` library).
|
||||
Constructs a "Fast" GPT-2 BPE tokenizer (backed by HuggingFace's `tokenizers` library), using byte-level
|
||||
Byte-Pair-Encoding.
|
||||
|
||||
Peculiarities:
|
||||
|
||||
- Byte-level Byte-Pair-Encoding
|
||||
- Requires a space to start the input string => the encoding methods should be called with the
|
||||
``add_prefix_space`` flag set to ``True``.
|
||||
Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
|
||||
the absence of a space at the beginning of a string:
|
||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||
|
||||
::
|
||||
|
||||
tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
|
||||
>>> from transformers import GPT2TokenizerFast
|
||||
>>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
|
||||
>>> tokenizer("Hello world")['input_ids']
|
||||
[15496, 995]
|
||||
>>> tokenizer(" Hello world")['input_ids']
|
||||
[18435, 995]
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
|
||||
You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
|
||||
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
|
||||
|
||||
.. note::
|
||||
|
||||
When used with ``is_pretokenized=True``, this tokenizer needs to be instantiated with
|
||||
``add_prefix_space=True``.
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
|
||||
should refer to the superclass for more information regarding methods.
|
||||
|
||||
Args:
|
||||
@@ -330,6 +349,7 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
model_input_names = ["attention_mask"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@@ -96,6 +96,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
model_input_names = ["attention_mask"]
|
||||
|
||||
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
|
||||
super().__init__(unk_token=unk_token, **kwargs)
|
||||
@@ -261,6 +262,7 @@ class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
model_input_names = ["attention_mask"]
|
||||
|
||||
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
|
||||
kwargs.setdefault("unk_token", unk_token)
|
||||
|
||||
@@ -62,17 +62,26 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
|
||||
class RobertaTokenizer(GPT2Tokenizer):
|
||||
"""
|
||||
Constructs a RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
|
||||
Constructs a RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.
|
||||
|
||||
- Byte-level Byte-Pair-Encoding
|
||||
- Requires a space to start the input string => the encoding methods should be called with the
|
||||
``add_prefix_space`` flag set to ``True``.
|
||||
Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
|
||||
the absence of a space at the beginning of a string:
|
||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||
|
||||
::
|
||||
|
||||
tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
|
||||
>>> from transformers import RobertaTokenizer
|
||||
>>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
|
||||
>>> tokenizer("Hello world")['input_ids']
|
||||
[0, 31414, 232, 2]
|
||||
>>> tokenizer(" Hello world")['input_ids']
|
||||
[0, 20920, 232, 2]
|
||||
|
||||
You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
|
||||
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
|
||||
|
||||
.. note::
|
||||
|
||||
When used with ``is_pretokenized=True``, this tokenizer will add a space before each word (even the first one).
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
||||
should refer to the superclass for more information regarding methods.
|
||||
@@ -251,19 +260,28 @@ class RobertaTokenizer(GPT2Tokenizer):
|
||||
|
||||
class RobertaTokenizerFast(GPT2TokenizerFast):
|
||||
"""
|
||||
Constructs a "Fast" RoBERTa BPE tokenizer (backed by HuggingFace's `tokenizers` library).
|
||||
Constructs a "Fast" RoBERTa BPE tokenizer (backed by HuggingFace's `tokenizers` library), derived from the GPT-2
|
||||
tokenizer, using byte-level Byte-Pair-Encoding.
|
||||
|
||||
Peculiarities:
|
||||
|
||||
- Byte-level Byte-Pair-Encoding
|
||||
- Requires a space to start the input string => the encoding methods should be called with the
|
||||
``add_prefix_space`` flag set to ``True``.
|
||||
Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
|
||||
the absence of a space at the beginning of a string:
|
||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||
|
||||
::
|
||||
|
||||
tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
|
||||
>>> from transformers import RobertaTokenizerFast
|
||||
>>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
|
||||
>>> tokenizer("Hello world")['input_ids']
|
||||
[0, 31414, 232, 2]
|
||||
>>> tokenizer(" Hello world")['input_ids']
|
||||
[0, 20920, 232, 2]
|
||||
|
||||
You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
|
||||
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
|
||||
|
||||
.. note::
|
||||
|
||||
When used with ``is_pretokenized=True``, this tokenizer needs to be instantiated with
|
||||
``add_prefix_space=True``.
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
|
||||
should refer to the superclass for more information regarding methods.
|
||||
|
||||
@@ -588,8 +588,8 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
|
||||
first_ids,
|
||||
second_ids,
|
||||
add_special_tokens=add_special_tokens,
|
||||
padding_strategy=PaddingStrategy.DO_NOT_PAD, # we pad in batch afterward
|
||||
truncation_strategy=truncation_strategy,
|
||||
padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward
|
||||
truncation=truncation_strategy.value,
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
pad_to_multiple_of=None, # we pad in batch afterward
|
||||
@@ -649,7 +649,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
|
||||
|
||||
def convert_ids_to_tokens(
|
||||
self, ids: Union[int, List[int]], skip_special_tokens: bool = False
|
||||
) -> Union[int, List[int]]:
|
||||
) -> Union[str, List[str]]:
|
||||
""" Converts a single index or a sequence of indices (integers) in a token "
|
||||
(resp.) a sequence of tokens (str), using the vocabulary and added tokens.
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -24,6 +24,7 @@ from typing import TYPE_CHECKING, Dict, List, Tuple, Union
|
||||
|
||||
from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
|
||||
from transformers.testing_utils import require_tf, require_torch, slow
|
||||
from transformers.tokenization_utils import AddedToken
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -233,6 +234,12 @@ class TokenizerTesterMixin:
|
||||
|
||||
self.assertListEqual(subwords, subwords_loaded)
|
||||
|
||||
def test_pickle_added_tokens(self):
|
||||
tok1 = AddedToken("<s>", rstrip=True, lstrip=True, normalized=False, single_word=True)
|
||||
tok2 = pickle.loads(pickle.dumps(tok1))
|
||||
|
||||
self.assertEqual(tok1.__getstate__(), tok2.__getstate__())
|
||||
|
||||
def test_added_tokens_do_lower_case(self):
|
||||
# TODO(thom) activate fast tokenizer tests once Rust tokenizers accepts white spaces in added tokens
|
||||
tokenizers = self.get_tokenizers(fast=False, do_lower_case=True)
|
||||
|
||||
@@ -91,8 +91,6 @@ class CommonFastTokenizerTest(unittest.TestCase):
|
||||
self.assert_padding(tokenizer_r, tokenizer_p)
|
||||
self.assert_create_token_type_ids(tokenizer_r, tokenizer_p)
|
||||
self.assert_prepare_for_model(tokenizer_r, tokenizer_p)
|
||||
# TODO: enable for v3.0.0
|
||||
# self.assert_empty_output_no_special_tokens(tokenizer_r, tokenizer_p)
|
||||
|
||||
def fast_only(self, tokenizer_r):
|
||||
# Ensure None raises an error
|
||||
@@ -748,29 +746,41 @@ class WordPieceFastTokenizerTest(CommonFastTokenizerTest):
|
||||
add_special_tokens=True,
|
||||
)
|
||||
|
||||
expected_results = [
|
||||
((0, 1), "A"),
|
||||
((1, 2), ","),
|
||||
((3, 8), "naive"), # BERT normalizes this away
|
||||
# Append MASK here after lower-casing
|
||||
((16, 21), "Allen"),
|
||||
((22, 24), "##NL"),
|
||||
((24, 25), "##P"),
|
||||
((26, 34), "sentence"),
|
||||
((35, 36), "."),
|
||||
]
|
||||
|
||||
# Check if the tokenizer is uncased
|
||||
if tokenizer_r.init_kwargs.get("do_lower_case"):
|
||||
expected_results = [(offset, token.lower()) for (offset, token) in expected_results]
|
||||
|
||||
# Append the special tokens
|
||||
expected_results.insert(3, ((9, 15), "[MASK]"))
|
||||
expected_results.insert(0, (None, "[CLS]"))
|
||||
expected_results.append((None, "[SEP]"))
|
||||
do_lower_case = tokenizer_r.init_kwargs.get("do_lower_case")
|
||||
expected_results = (
|
||||
[
|
||||
((0, 0), "[CLS]"),
|
||||
((0, 1), "A"),
|
||||
((1, 2), ","),
|
||||
((3, 5), "na"),
|
||||
((5, 6), "##ï"),
|
||||
((6, 8), "##ve"),
|
||||
((9, 15), "[MASK]"),
|
||||
((16, 21), "Allen"),
|
||||
((21, 23), "##NL"),
|
||||
((23, 24), "##P"),
|
||||
((25, 33), "sentence"),
|
||||
((33, 34), "."),
|
||||
((0, 0), "[SEP]"),
|
||||
]
|
||||
if not do_lower_case
|
||||
else [
|
||||
((0, 0), "[CLS]"),
|
||||
((0, 1), "a"),
|
||||
((1, 2), ","),
|
||||
((3, 8), "naive"),
|
||||
((9, 15), "[MASK]"),
|
||||
((16, 21), "allen"),
|
||||
((21, 23), "##nl"),
|
||||
((23, 24), "##p"),
|
||||
((25, 33), "sentence"),
|
||||
((33, 34), "."),
|
||||
((0, 0), "[SEP]"),
|
||||
]
|
||||
)
|
||||
|
||||
self.assertEqual([e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"]))
|
||||
# self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
|
||||
self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
|
||||
|
||||
|
||||
class RobertaFastTokenizerTest(CommonFastTokenizerTest):
|
||||
|
||||
Reference in New Issue
Block a user