[breaking|pipelines|tokenizers] Adding slow-fast tokenizers equivalence tests pipelines - Removing sentencepiece as a required dependency (#8073)

* Fixing roberta for slow-fast tests

* WIP getting equivalence on pipelines

* slow-to-fast equivalence - working on question-answering pipeline

* optional FAISS tests

* Pipeline Q&A

* Move pipeline tests to their own test job again

* update tokenizer to add sequence id methods

* update to tokenizers 0.9.4

* set sentencepiecce as optional

* clean up squad

* clean up pipelines to use sequence_ids

* style/quality

* wording

* Switch to use_fast = True by default

* update tests for use_fast at True by default

* fix rag tokenizer test

* removing protobuf from required dependencies

* fix NER test for use_fast = True by default

* fixing example tests (Q&A examples use slow tokenizers for now)

* protobuf in main deps extras["sentencepiece"] and example deps

* fix protobug install test

* try to fix seq2seq by switching to slow tokenizers for now

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
Thomas Wolf
2020-11-15 22:50:59 +01:00
committed by GitHub
parent 24184e73c4
commit f4e04cd2c6
23 changed files with 689 additions and 262 deletions

View File

@@ -182,7 +182,9 @@ def to_py_obj(obj):
"""
Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.
"""
if isinstance(obj, (list, tuple)):
if isinstance(obj, (dict, BatchEncoding)):
return {k: to_py_obj(v) for k, v in obj.items()}
elif isinstance(obj, (list, tuple)):
return [to_py_obj(o) for o in obj]
elif is_tf_available() and isinstance(obj, tf.Tensor):
return obj.numpy().tolist()
@@ -216,6 +218,9 @@ class BatchEncoding(UserDict):
initialization.
prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add a batch axis when converting to tensors (see :obj:`tensor_type` above).
n_sequences (:obj:`Optional[int]`, `optional`):
You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
initialization.
"""
def __init__(
@@ -224,6 +229,7 @@ class BatchEncoding(UserDict):
encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
tensor_type: Union[None, str, TensorType] = None,
prepend_batch_axis: bool = False,
n_sequences: Optional[int] = None,
):
super().__init__(data)
@@ -232,8 +238,22 @@ class BatchEncoding(UserDict):
self._encodings = encoding
if n_sequences is None and encoding is not None and len(encoding):
n_sequences = encoding[0].n_sequences
self._n_sequences = n_sequences
self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
@property
def n_sequences(self) -> Optional[int]:
"""
:obj:`Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
:class:`~transformers.BatchEncoding`. Currently can be one of :obj:`None` (unknown), :obj:`1` (a single
sentence) or :obj:`2` (a pair of sentences)
"""
return self.n_sequences
@property
def is_fast(self) -> bool:
"""
@@ -311,6 +331,27 @@ class BatchEncoding(UserDict):
raise ValueError("tokens() is not available when using Python-based tokenizers")
return self._encodings[batch_index].tokens
def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
"""
Return a list mapping the tokens to the id of their original sentences:
- :obj:`None` for special tokens added around or between sequences,
- :obj:`0` for tokens corresponding to words in the first sequence,
- :obj:`1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly
encoded.
Args:
batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
Returns:
:obj:`List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens
added by the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their
corresponding sequence.
"""
if not self._encodings:
raise ValueError("sequence_ids() is not available when using Python-based tokenizers")
return self._encodings[batch_index].sequence_ids
def words(self, batch_index: int = 0) -> List[Optional[int]]:
"""
Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
@@ -325,7 +366,67 @@ class BatchEncoding(UserDict):
"""
if not self._encodings:
raise ValueError("words() is not available when using Python-based tokenizers")
return self._encodings[batch_index].words
warnings.warn(
"`BatchEncoding.words()` property is deprecated and should be replaced with the identical, "
"but more self-explanatory `BatchEncoding.word_ids()` property.",
FutureWarning,
)
return self.word_ids(batch_index)
def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
"""
Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
Args:
batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
Returns:
:obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by
the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding
word (several tokens will be mapped to the same word index if they are parts of that word).
"""
if not self._encodings:
raise ValueError("word_ids() is not available when using Python-based tokenizers")
return self._encodings[batch_index].word_ids
def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
"""
Get the index of the sequence represented by the given token. In the general use case, this method returns
:obj:`0` for a single sequence or the first sequence of a pair, and :obj:`1` for the second sequence of a pair
Can be called as:
- ``self.token_to_sequence(token_index)`` if batch size is 1
- ``self.token_to_sequence(batch_index, token_index)`` if batch size is greater than 1
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
words are defined by the user). In this case it allows to easily associate encoded tokens with provided
tokenized words.
Args:
batch_or_token_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
the token in the sequence.
token_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the
sequence.
Returns:
:obj:`int`: Index of the word in the input sequence.
"""
if not self._encodings:
raise ValueError("token_to_sequence() is not available when using Python based tokenizers")
if token_index is not None:
batch_index = batch_or_token_index
else:
batch_index = 0
token_index = batch_or_token_index
if batch_index < 0:
batch_index = self._batch_size + batch_index
if token_index < 0:
token_index = self._seq_len + token_index
return self._encodings[batch_index].token_to_sequence(token_index)
def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
"""
@@ -365,9 +466,11 @@ class BatchEncoding(UserDict):
token_index = self._seq_len + token_index
return self._encodings[batch_index].token_to_word(token_index)
def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = None) -> Optional[TokenSpan]:
def word_to_tokens(
self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
) -> Optional[TokenSpan]:
"""
Get the encoded token span corresponding to a word in the sequence of the batch.
Get the encoded token span corresponding to a word in a sequence of the batch.
Token spans are returned as a :class:`~transformers.tokenization_utils_base.TokenSpan` with:
@@ -376,8 +479,9 @@ class BatchEncoding(UserDict):
Can be called as:
- ``self.word_to_tokens(word_index)`` if batch size is 1
- ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1
- ``self.word_to_tokens(word_index, sequence_index: int = 0)`` if batch size is 1
- ``self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)`` if batch size is greater or equal
to 1
This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
@@ -390,6 +494,9 @@ class BatchEncoding(UserDict):
word_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
sequence.
sequence_index (:obj:`int`, `optional`, defaults to 0):
If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
or 1) the provided word index belongs to.
Returns:
Optional :class:`~transformers.tokenization_utils_base.TokenSpan` Span of tokens in the encoded sequence.
@@ -407,7 +514,7 @@ class BatchEncoding(UserDict):
batch_index = self._batch_size + batch_index
if word_index < 0:
word_index = self._seq_len + word_index
span = self._encodings[batch_index].word_to_tokens(word_index)
span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)
return TokenSpan(*span) if span is not None else None
def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
@@ -446,7 +553,9 @@ class BatchEncoding(UserDict):
token_index = batch_or_token_index
return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index)))
def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
def char_to_token(
self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0
) -> int:
"""
Get the index of the token in the encoded output comprising a character in the original string for a sequence
of the batch.
@@ -467,6 +576,9 @@ class BatchEncoding(UserDict):
char_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
sequence.
sequence_index (:obj:`int`, `optional`, defaults to 0):
If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
or 1) the provided character index belongs to.
Returns:
@@ -480,9 +592,11 @@ class BatchEncoding(UserDict):
else:
batch_index = 0
char_index = batch_or_char_index
return self._encodings[batch_index].char_to_token(char_index)
return self._encodings[batch_index].char_to_token(char_index, sequence_index)
def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan:
def word_to_chars(
self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
) -> CharSpan:
"""
Get the character span in the original string corresponding to given word in a sequence of the batch.
@@ -503,6 +617,9 @@ class BatchEncoding(UserDict):
word_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
sequence.
sequence_index (:obj:`int`, `optional`, defaults to 0):
If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
or 1) the provided word index belongs to.
Returns:
:obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string.
@@ -520,9 +637,9 @@ class BatchEncoding(UserDict):
else:
batch_index = 0
word_index = batch_or_word_index
return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index)))
return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index)))
def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int:
"""
Get the word in the original string corresponding to a character in the original string of a sequence of the
batch.
@@ -543,6 +660,9 @@ class BatchEncoding(UserDict):
char_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the
original string.
sequence_index (:obj:`int`, `optional`, defaults to 0):
If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
or 1) the provided character index belongs to.
Returns:
@@ -556,7 +676,7 @@ class BatchEncoding(UserDict):
else:
batch_index = 0
char_index = batch_or_char_index
return self._encodings[batch_index].char_to_word(char_index)
return self._encodings[batch_index].char_to_word(char_index, sequence_index)
def convert_to_tensors(
self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
@@ -1872,6 +1992,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
"Only fast tokenizers (instances of PretrainedTokenizerFast) can be saved in non legacy format."
)
save_directory = str(save_directory)
added_tokens_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
)