Introduce a new tensor type for return_tensors on tokenizer for NumPy (#4585)
* Refactor tensor creation in tokenizers.
* Make sure to convert string to TensorType
* Refactor convert_to_tensors_
* Introduce numpy tensor creation
* Format
* Add unittest for TensorType creation from str
* sorting imports
* Added unittests for numpy tensor conversion.
* Do not use the in-place version of squeeze, as NumPy doesn't provide one.
* Added extra parameter prepend_batch_axis: bool on prepare_for_model.
* Ensure test_np_encode_plus_sent_to_model is not executed for encoder/decoder models.
* style.
* NumPy tests use require_torch for now, while Flax is not yet merged.
* Hopefully will make flake8 happy.
* One more time 🎶
This commit is contained in:
@@ -832,3 +832,47 @@ class TokenizerTesterMixin:
|
||||
# This should not fail
|
||||
model(encoded_sequence_fast)
|
||||
model(batch_encoded_sequence_fast)
|
||||
|
||||
# TODO: Check if require_torch is the best to test for numpy here ... Maybe move to require_flax when available
|
||||
@require_torch
def test_np_encode_plus_sent_to_model(self):
    """Encode a short sequence with ``return_tensors="np"`` and check a result comes back.

    The test returns early (without failing) when the tokenizer class has no
    entry in the merged model/tokenizer mapping, when the default config is an
    encoder-decoder, or when the default config has no ``pad_token_id``.

    Raises:
        ValueError: if ``encode_plus`` or ``batch_encode_plus`` returns ``None``
            for the tokenizer under test, or for the Rust tokenizer when
            ``self.test_rust_tokenizer`` is truthy.
    """
    from transformers import MODEL_MAPPING, TOKENIZER_MAPPING

    MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING)

    tokenizer = self.get_tokenizer()
    if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
        return

    # Only the config class is needed here; the model class is not used yet.
    config_class, _model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
    config = config_class()

    if config.is_encoder_decoder or config.pad_token_id is None:
        return

    # Build the input text out of the first ten vocabulary entries.
    sequence = " ".join(list(tokenizer.get_vocab().keys())[:10])
    pair = [sequence, sequence]

    # Both encodings are produced before either one is inspected.
    single_np = tokenizer.encode_plus(sequence, return_tensors="np")
    batch_np = tokenizer.batch_encode_plus(pair, return_tensors="np")

    # TODO: add forward through JAX/Flax when PR is merged
    # This is currently here to make flake8 happy !
    for encoding, failure in (
        (single_np, "Cannot convert list to numpy tensor on encode_plus()"),
        (batch_np, "Cannot convert list to numpy tensor on batch_encode_plus()"),
    ):
        if encoding is None:
            raise ValueError(failure)

    if not self.test_rust_tokenizer:
        return

    fast_tokenizer = self.get_rust_tokenizer()
    single_np_fast = fast_tokenizer.encode_plus(sequence, return_tensors="np")
    batch_np_fast = fast_tokenizer.batch_encode_plus(pair, return_tensors="np")

    # TODO: add forward through JAX/Flax when PR is merged
    # This is currently here to make flake8 happy !
    for encoding, failure in (
        (single_np_fast, "Cannot convert list to numpy tensor on encode_plus() (fast)"),
        (batch_np_fast, "Cannot convert list to numpy tensor on batch_encode_plus() (fast)"),
    ):
        if encoding is None:
            raise ValueError(failure)
|
||||
|
||||
Reference in New Issue
Block a user