Introduce a new tensor type for return_tensors on tokenizer for NumPy (#4585)
* Refactor tensor creation in tokenizers.
* Make sure to convert string to TensorType
* Refactor convert_to_tensors_
* Introduce numpy tensor creation
* Format
* Add unittest for TensorType creation from str
* sorting imports
* Added unittests for numpy tensor conversion.
* Do not use the in-place version of squeeze, as NumPy doesn't provide such a feature.
* Added extra parameter prepend_batch_axis: bool on prepare_for_model.
* Ensure test_np_encode_plus_sent_to_model is not executed for encoder/decoder models.
* style.
* NumPy tests use require_torch for now, while Flax is not merged.
* Hopefully will make flake8 happy.
* One more time 🎶
This commit is contained in:
@@ -832,3 +832,47 @@ class TokenizerTesterMixin:
|
||||
# This should not fail
|
||||
model(encoded_sequence_fast)
|
||||
model(batch_encoded_sequence_fast)
|
||||
|
||||
# TODO: Check if require_torch is the best to test for numpy here ... Maybe move to require_flax when available
|
||||
@require_torch
def test_np_encode_plus_sent_to_model(self):
    """Check that encoding with ``return_tensors="np"`` yields a result.

    The test returns early (without failing) when the tokenizer class has no
    entry in the merged model/tokenizer mapping, when the default config is
    an encoder-decoder, or when the default config has no ``pad_token_id``.

    Raises:
        ValueError: if any of the ``encode_plus`` / ``batch_encode_plus``
            calls returns ``None``.
    """
    from transformers import MODEL_MAPPING, TOKENIZER_MAPPING

    model_tokenizer_mapping = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING)

    tokenizer = self.get_tokenizer()
    tokenizer_cls = tokenizer.__class__
    if tokenizer_cls not in model_tokenizer_mapping:
        return

    config_class, _model_class = model_tokenizer_mapping[tokenizer_cls]
    config = config_class()
    if config.is_encoder_decoder or config.pad_token_id is None:
        return

    # Build a short sequence out of the first ten vocabulary entries.
    vocab_tokens = list(tokenizer.get_vocab().keys())
    sequence = " ".join(vocab_tokens[:10])

    # TODO: add forward through JAX/Flax when PR is merged
    # Until then the results are only checked against None, which also keeps
    # flake8 from flagging the encodings as unused.
    slow_results = (
        (
            tokenizer.encode_plus(sequence, return_tensors="np"),
            "Cannot convert list to numpy tensor on encode_plus()",
        ),
        (
            tokenizer.batch_encode_plus([sequence, sequence], return_tensors="np"),
            "Cannot convert list to numpy tensor on batch_encode_plus()",
        ),
    )
    for encoded, failure_message in slow_results:
        if encoded is None:
            raise ValueError(failure_message)

    if not self.test_rust_tokenizer:
        return

    fast_tokenizer = self.get_rust_tokenizer()

    # TODO: add forward through JAX/Flax when PR is merged
    fast_results = (
        (
            fast_tokenizer.encode_plus(sequence, return_tensors="np"),
            "Cannot convert list to numpy tensor on encode_plus() (fast)",
        ),
        (
            fast_tokenizer.batch_encode_plus([sequence, sequence], return_tensors="np"),
            "Cannot convert list to numpy tensor on batch_encode_plus() (fast)",
        ),
    )
    for encoded, failure_message in fast_results:
        if encoded is None:
            raise ValueError(failure_message)
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers import PreTrainedTokenizer
|
||||
from transformers import PreTrainedTokenizer, TensorType
|
||||
from transformers.tokenization_gpt2 import GPT2Tokenizer
|
||||
|
||||
from .utils import slow
|
||||
@@ -39,3 +39,8 @@ class TokenizerUtilsTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_pretrained_tokenizers(self):
|
||||
self.check_tokenizer_from_pretrained(GPT2Tokenizer)
|
||||
|
||||
def check_tensor_type_from_str(self):
    """Assert that each string shortcut builds the matching ``TensorType`` member."""
    self.assertEqual(TensorType("tf"), TensorType.TENSORFLOW)
    self.assertEqual(TensorType("pt"), TensorType.PYTORCH)
    self.assertEqual(TensorType("np"), TensorType.NUMPY)

def test_tensor_type_from_str(self):
    """Run the ``TensorType``-from-string checks as a collected test.

    Test runners only collect methods whose names start with ``test``, so
    ``check_tensor_type_from_str`` on its own would never be executed.
    """
    self.check_tensor_type_from_str()
|
||||
|
||||
Reference in New Issue
Block a user