Introduce a new tensor type for return_tensors on tokenizer for NumPy (#4585)

* Refactor tensor creation in tokenizers.

* Make sure to convert string to TensorType

* Refactor convert_to_tensors_

* Introduce numpy tensor creation

* Format

* Add unittest for TensorType creation from str

* sorting imports

* Added unittests for numpy tensor conversion.

* Do not use in-place version for squeeze as numpy doesn't provide such feature.

* Added extra parameter prepend_batch_axis: bool on prepare_for_model.

* Ensure test_np_encode_plus_sent_to_model is not executed if encoder/decoder model.

* style.

* numpy tests require_torch for now while flax not merged.

* Hopefully will make flake8 happy.

* One more time 🎶
This commit is contained in:
Funtowicz Morgan
2020-06-04 04:57:01 +00:00
committed by GitHub
parent efae154929
commit 5bf9afbf35
4 changed files with 133 additions and 85 deletions

View File

@@ -832,3 +832,47 @@ class TokenizerTesterMixin:
# This should not fail
model(encoded_sequence_fast)
model(batch_encoded_sequence_fast)
# TODO: Check if require_torch is the best to test for numpy here ... Maybe move to require_flax when available
@require_torch
def test_np_encode_plus_sent_to_model(self):
from transformers import MODEL_MAPPING, TOKENIZER_MAPPING
MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING)
tokenizer = self.get_tokenizer()
if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
return
config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
config = config_class()
if config.is_encoder_decoder or config.pad_token_id is None:
return
# Build sequence
first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
sequence = " ".join(first_ten_tokens)
encoded_sequence = tokenizer.encode_plus(sequence, return_tensors="np")
batch_encoded_sequence = tokenizer.batch_encode_plus([sequence, sequence], return_tensors="np")
# TODO: add forward through JAX/Flax when PR is merged
# This is currently here to make flake8 happy !
if encoded_sequence is None:
raise ValueError("Cannot convert list to numpy tensor on encode_plus()")
if batch_encoded_sequence is None:
raise ValueError("Cannot convert list to numpy tensor on batch_encode_plus()")
if self.test_rust_tokenizer:
fast_tokenizer = self.get_rust_tokenizer()
encoded_sequence_fast = fast_tokenizer.encode_plus(sequence, return_tensors="np")
batch_encoded_sequence_fast = fast_tokenizer.batch_encode_plus([sequence, sequence], return_tensors="np")
# TODO: add forward through JAX/Flax when PR is merged
# This is currently here to make flake8 happy !
if encoded_sequence_fast is None:
raise ValueError("Cannot convert list to numpy tensor on encode_plus() (fast)")
if batch_encoded_sequence_fast is None:
raise ValueError("Cannot convert list to numpy tensor on batch_encode_plus() (fast)")