[breaking|pipelines|tokenizers] Adding slow-fast tokenizers equivalence tests pipelines - Removing sentencepiece as a required dependency (#8073)
* Fixing roberta for slow-fast tests * WIP getting equivalence on pipelines * slow-to-fast equivalence - working on question-answering pipeline * optional FAISS tests * Pipeline Q&A * Move pipeline tests to their own test job again * update tokenizer to add sequence id methods * update to tokenizers 0.9.4 * set sentencepiece as optional * clean up squad * clean up pipelines to use sequence_ids * style/quality * wording * Switch to use_fast = True by default * update tests for use_fast at True by default * fix rag tokenizer test * removing protobuf from required dependencies * fix NER test for use_fast = True by default * fixing example tests (Q&A examples use slow tokenizers for now) * protobuf in main deps extras["sentencepiece"] and example deps * fix protobuf install test * try to fix seq2seq by switching to slow tokenizers for now * Update src/transformers/tokenization_utils_base.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/tokenization_utils_base.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
@@ -182,7 +182,9 @@ def to_py_obj(obj):
|
||||
"""
|
||||
Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.
|
||||
"""
|
||||
if isinstance(obj, (list, tuple)):
|
||||
if isinstance(obj, (dict, BatchEncoding)):
|
||||
return {k: to_py_obj(v) for k, v in obj.items()}
|
||||
elif isinstance(obj, (list, tuple)):
|
||||
return [to_py_obj(o) for o in obj]
|
||||
elif is_tf_available() and isinstance(obj, tf.Tensor):
|
||||
return obj.numpy().tolist()
|
||||
@@ -216,6 +218,9 @@ class BatchEncoding(UserDict):
|
||||
initialization.
|
||||
prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to add a batch axis when converting to tensors (see :obj:`tensor_type` above).
|
||||
n_sequences (:obj:`Optional[int]`, `optional`):
|
||||
The number of sequences used to generate each sample from the batch encoded in this
|
||||
:class:`~transformers.BatchEncoding`.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -224,6 +229,7 @@ class BatchEncoding(UserDict):
|
||||
encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
|
||||
tensor_type: Union[None, str, TensorType] = None,
|
||||
prepend_batch_axis: bool = False,
|
||||
n_sequences: Optional[int] = None,
|
||||
):
|
||||
super().__init__(data)
|
||||
|
||||
@@ -232,8 +238,22 @@ class BatchEncoding(UserDict):
|
||||
|
||||
self._encodings = encoding
|
||||
|
||||
if n_sequences is None and encoding is not None and len(encoding):
|
||||
n_sequences = encoding[0].n_sequences
|
||||
|
||||
self._n_sequences = n_sequences
|
||||
|
||||
self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
|
||||
|
||||
@property
def n_sequences(self) -> Optional[int]:
    """
    :obj:`Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
    :class:`~transformers.BatchEncoding`. Currently can be one of :obj:`None` (unknown), :obj:`1` (a single
    sentence) or :obj:`2` (a pair of sentences)
    """
    # Return the private backing attribute. Reading ``self.n_sequences`` here would re-enter this very
    # property and recurse until a ``RecursionError`` is raised.
    return self._n_sequences
|
||||
|
||||
@property
|
||||
def is_fast(self) -> bool:
|
||||
"""
|
||||
@@ -311,6 +331,27 @@ class BatchEncoding(UserDict):
|
||||
raise ValueError("tokens() is not available when using Python-based tokenizers")
|
||||
return self._encodings[batch_index].tokens
|
||||
|
||||
def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
    """
    Return, for one entry of the batch, the id of the original sequence each token comes from:

        - :obj:`None` for special tokens added around or between sequences,
        - :obj:`0` for tokens corresponding to words in the first sequence,
        - :obj:`1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly
          encoded.

    Args:
        batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.

    Returns:
        :obj:`List[Optional[int]]`: One sequence id per token. Special tokens added by the tokenizer are mapped to
        :obj:`None` and other tokens are mapped to the index of their corresponding sequence.

    Raises:
        :obj:`ValueError`: If this object holds no fast-tokenizer encodings (``self._encodings`` is empty or
        :obj:`None`).
    """
    fast_encodings = self._encodings
    if fast_encodings:
        return fast_encodings[batch_index].sequence_ids
    raise ValueError("sequence_ids() is not available when using Python-based tokenizers")
|
||||
|
||||
def words(self, batch_index: int = 0) -> List[Optional[int]]:
|
||||
"""
|
||||
Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
|
||||
@@ -325,7 +366,67 @@ class BatchEncoding(UserDict):
|
||||
"""
|
||||
if not self._encodings:
|
||||
raise ValueError("words() is not available when using Python-based tokenizers")
|
||||
return self._encodings[batch_index].words
|
||||
warnings.warn(
|
||||
"`BatchEncoding.words()` property is deprecated and should be replaced with the identical, "
|
||||
"but more self-explanatory `BatchEncoding.word_ids()` property.",
|
||||
FutureWarning,
|
||||
)
|
||||
return self.word_ids(batch_index)
|
||||
|
||||
def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
    """
    Return, for one entry of the batch, the index of the word each token belongs to in the initial sentence
    (fast tokenizers only).

    Args:
        batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.

    Returns:
        :obj:`List[Optional[int]]`: One word index per token. Special tokens added by the tokenizer are mapped to
        :obj:`None` and other tokens are mapped to the index of their corresponding word (several tokens will be
        mapped to the same word index if they are parts of that word).

    Raises:
        :obj:`ValueError`: If this object holds no fast-tokenizer encodings (``self._encodings`` is empty or
        :obj:`None`).
    """
    fast_encodings = self._encodings
    if fast_encodings:
        return fast_encodings[batch_index].word_ids
    raise ValueError("word_ids() is not available when using Python-based tokenizers")
|
||||
|
||||
def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
    """
    Get the index of the sequence represented by the given token. In the general use case, this method returns
    :obj:`0` for a single sequence or the first sequence of a pair, and :obj:`1` for the second sequence of a pair

    Can be called as:

    - ``self.token_to_sequence(token_index)`` if batch size is 1
    - ``self.token_to_sequence(batch_index, token_index)`` if batch size is greater than 1

    Args:
        batch_or_token_index (:obj:`int`):
            Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
            the token in the sequence.
        token_index (:obj:`int`, `optional`):
            If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the
            sequence.

    Returns:
        :obj:`int`: Index of the sequence the token belongs to, as reported by the underlying fast-tokenizer
        encoding.

    Raises:
        :obj:`ValueError`: If this object holds no fast-tokenizer encodings (``self._encodings`` is empty or
        :obj:`None`).
    """

    if not self._encodings:
        raise ValueError("token_to_sequence() is not available when using Python based tokenizers")

    if token_index is not None:
        # Two-argument form: the first argument selects the batch entry.
        batch_index = batch_or_token_index
    else:
        # One-argument form: the only argument is the token index, read from the first batch entry.
        batch_index = 0
        token_index = batch_or_token_index

    # Negative indices are shifted by the stored batch size / sequence length before the lookup.
    if batch_index < 0:
        batch_index = self._batch_size + batch_index
    if token_index < 0:
        token_index = self._seq_len + token_index

    return self._encodings[batch_index].token_to_sequence(token_index)
|
||||
|
||||
def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
|
||||
"""
|
||||
@@ -365,9 +466,11 @@ class BatchEncoding(UserDict):
|
||||
token_index = self._seq_len + token_index
|
||||
return self._encodings[batch_index].token_to_word(token_index)
|
||||
|
||||
def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = None) -> Optional[TokenSpan]:
|
||||
def word_to_tokens(
|
||||
self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
|
||||
) -> Optional[TokenSpan]:
|
||||
"""
|
||||
Get the encoded token span corresponding to a word in the sequence of the batch.
|
||||
Get the encoded token span corresponding to a word in a sequence of the batch.
|
||||
|
||||
Token spans are returned as a :class:`~transformers.tokenization_utils_base.TokenSpan` with:
|
||||
|
||||
@@ -376,8 +479,9 @@ class BatchEncoding(UserDict):
|
||||
|
||||
Can be called as:
|
||||
|
||||
- ``self.word_to_tokens(word_index)`` if batch size is 1
|
||||
- ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1
|
||||
- ``self.word_to_tokens(word_index, sequence_index: int = 0)`` if batch size is 1
|
||||
- ``self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)`` if batch size is greater or equal
|
||||
to 1
|
||||
|
||||
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
|
||||
are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
|
||||
@@ -390,6 +494,9 @@ class BatchEncoding(UserDict):
|
||||
word_index (:obj:`int`, `optional`):
|
||||
If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
|
||||
sequence.
|
||||
sequence_index (:obj:`int`, `optional`, defaults to 0):
|
||||
If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
|
||||
or 1) the provided word index belongs to.
|
||||
|
||||
Returns:
|
||||
Optional :class:`~transformers.tokenization_utils_base.TokenSpan` Span of tokens in the encoded sequence.
|
||||
@@ -407,7 +514,7 @@ class BatchEncoding(UserDict):
|
||||
batch_index = self._batch_size + batch_index
|
||||
if word_index < 0:
|
||||
word_index = self._seq_len + word_index
|
||||
span = self._encodings[batch_index].word_to_tokens(word_index)
|
||||
span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)
|
||||
return TokenSpan(*span) if span is not None else None
|
||||
|
||||
def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
|
||||
@@ -446,7 +553,9 @@ class BatchEncoding(UserDict):
|
||||
token_index = batch_or_token_index
|
||||
return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index)))
|
||||
|
||||
def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
|
||||
def char_to_token(
|
||||
self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0
|
||||
) -> int:
|
||||
"""
|
||||
Get the index of the token in the encoded output comprising a character in the original string for a sequence
|
||||
of the batch.
|
||||
@@ -467,6 +576,9 @@ class BatchEncoding(UserDict):
|
||||
char_index (:obj:`int`, `optional`):
|
||||
If a batch index is provided in `batch_or_char_index`, this can be the index of the character in the
|
||||
original string.
|
||||
sequence_index (:obj:`int`, `optional`, defaults to 0):
|
||||
If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
|
||||
or 1) the provided character index belongs to.
|
||||
|
||||
|
||||
Returns:
|
||||
@@ -480,9 +592,11 @@ class BatchEncoding(UserDict):
|
||||
else:
|
||||
batch_index = 0
|
||||
char_index = batch_or_char_index
|
||||
return self._encodings[batch_index].char_to_token(char_index)
|
||||
return self._encodings[batch_index].char_to_token(char_index, sequence_index)
|
||||
|
||||
def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan:
|
||||
def word_to_chars(
|
||||
self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
|
||||
) -> CharSpan:
|
||||
"""
|
||||
Get the character span in the original string corresponding to given word in a sequence of the batch.
|
||||
|
||||
@@ -503,6 +617,9 @@ class BatchEncoding(UserDict):
|
||||
word_index (:obj:`int`, `optional`):
|
||||
If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
|
||||
sequence.
|
||||
sequence_index (:obj:`int`, `optional`, defaults to 0):
|
||||
If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
|
||||
or 1) the provided word index belongs to.
|
||||
|
||||
Returns:
|
||||
:obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string.
|
||||
@@ -520,9 +637,9 @@ class BatchEncoding(UserDict):
|
||||
else:
|
||||
batch_index = 0
|
||||
word_index = batch_or_word_index
|
||||
return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index)))
|
||||
return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index)))
|
||||
|
||||
def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
|
||||
def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int:
|
||||
"""
|
||||
Get the word in the original string corresponding to a character in the original string of a sequence of the
|
||||
batch.
|
||||
@@ -543,6 +660,9 @@ class BatchEncoding(UserDict):
|
||||
char_index (:obj:`int`, `optional`):
|
||||
If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the
|
||||
original string.
|
||||
sequence_index (:obj:`int`, `optional`, defaults to 0):
|
||||
If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
|
||||
or 1) the provided character index belongs to.
|
||||
|
||||
|
||||
Returns:
|
||||
@@ -556,7 +676,7 @@ class BatchEncoding(UserDict):
|
||||
else:
|
||||
batch_index = 0
|
||||
char_index = batch_or_char_index
|
||||
return self._encodings[batch_index].char_to_word(char_index)
|
||||
return self._encodings[batch_index].char_to_word(char_index, sequence_index)
|
||||
|
||||
def convert_to_tensors(
|
||||
self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
|
||||
@@ -1872,6 +1992,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
"Only fast tokenizers (instances of PretrainedTokenizerFast) can be saved in non legacy format."
|
||||
)
|
||||
|
||||
save_directory = str(save_directory)
|
||||
|
||||
added_tokens_file = os.path.join(
|
||||
save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user