Deprecate models (#24787)
* Deprecate some models * Fix imports * Fix inits too * Remove tests * Add deprecated banner to documentation * Remove from init * Fix auto classes * Style * Remote upgrade strategy 1 * Remove site package cache * Revert this part * Fix typo... * Update utils * Update docs/source/en/model_doc/bort.md Co-authored-by: Lysandre Debut <lysandre.debut@reseau.eseo.fr> * Address review comments * With all files saved --------- Co-authored-by: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
This commit is contained in:
@@ -1,51 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers import is_torch_available
|
||||
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers import AutoModel
|
||||
|
||||
|
||||
@require_torch
@require_sentencepiece
@require_tokenizers
class BortIntegrationTest(unittest.TestCase):
    """Slow integration check of the PyTorch `amazon/bort` checkpoint against stored reference values."""

    @slow
    def test_output_embeds_base_model(self):
        """Run one fixed sentence through the base model and compare shape and a 3x3 slice of the output."""
        # Token ids for: Schloß Nymphenburg in Munich is really nice!
        token_ids = [[0, 18077, 4082, 7804, 8606, 6195, 2457, 3321, 11, 10489, 16, 269, 2579, 328, 2]]
        # Reference values for the first three positions / first three hidden dimensions.
        reference_values = [[[-0.0349, 0.0436, -1.8654], [-0.6964, 0.0835, -1.7393], [-0.9819, 0.2956, -0.2868]]]

        bort = AutoModel.from_pretrained("amazon/bort")
        bort.to(torch_device)

        input_ids = torch.tensor(token_ids, device=torch_device, dtype=torch.long)
        last_hidden_state = bort(input_ids)["last_hidden_state"]

        self.assertEqual(last_hidden_state.shape, torch.Size((1, 15, 1024)))

        # compare the actual values for a slice.
        reference = torch.tensor(reference_values, device=torch_device, dtype=torch.float)
        observed = last_hidden_state[:, :3, :3]
        self.assertTrue(torch.allclose(observed, reference, atol=1e-4))
|
||||
@@ -1,53 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers import is_tf_available
|
||||
from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
|
||||
|
||||
|
||||
if is_tf_available():
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
from transformers import TFAutoModel
|
||||
|
||||
|
||||
@require_tf
@require_sentencepiece
@require_tokenizers
class TFBortIntegrationTest(unittest.TestCase):
    """Slow integration check of the TensorFlow `amazon/bort` checkpoint against stored reference values."""

    @slow
    def test_output_embeds_base_model(self):
        """Run one fixed sentence through the base model and compare shape and a 3x3 slice of the output."""
        # Token ids for: Schloß Nymphenburg in Munich is really nice!
        token_ids = [[0, 18077, 4082, 7804, 8606, 6195, 2457, 3321, 11, 10489, 16, 269, 2579, 328, 2]]
        # Reference values for the first three positions / first three hidden dimensions.
        reference_values = [[[-0.0349, 0.0436, -1.8654], [-0.6964, 0.0835, -1.7393], [-0.9819, 0.2956, -0.2868]]]

        bort = TFAutoModel.from_pretrained("amazon/bort")

        input_ids = tf.convert_to_tensor(token_ids, dtype=tf.int32)
        last_hidden_state = bort(input_ids)["last_hidden_state"]

        self.assertEqual(last_hidden_state.shape, tf.TensorShape((1, 15, 1024)))

        # compare the actual values for a slice.
        reference = tf.convert_to_tensor(reference_values, dtype=tf.float32)
        observed = last_hidden_state[:, :3, :3].numpy()
        self.assertTrue(np.allclose(observed, reference.numpy(), atol=1e-4))
|
||||
@@ -1,311 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import itertools
|
||||
import random
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers import MCTCTFeatureExtractor
|
||||
from transformers.testing_utils import require_torch
|
||||
|
||||
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
|
||||
|
||||
|
||||
# Module-wide random source used by `floats_list` when the caller does not pass one.
global_rng = random.Random()


def floats_list(shape, scale=1.0, rng=None, name=None):
    """Build a nested list of random floats.

    Args:
        shape: Indexable with at least two entries; `shape[0]` is the number of
            rows and `shape[1]` the number of values per row.
        scale: Factor every drawn value is multiplied by.
        rng: Object exposing `random()`; the module-level `global_rng` is used
            when this is `None`.
        name: Unused by this function.

    Returns:
        A list of `shape[0]` lists, each holding `shape[1]` floats equal to
        `rng.random() * scale`.
    """
    source = global_rng if rng is None else rng
    # Rows are filled in order, so the sequence of draws matches a row-major double loop.
    return [[source.random() * scale for _ in range(shape[1])] for _row in range(shape[0])]
|
||||
|
||||
|
||||
@require_torch
class MCTCTFeatureExtractionTester(unittest.TestCase):
    """Holds the feature-extractor settings and builds dummy inputs for the MCTCT feature-extraction tests.

    NOTE(review): the class derives from `unittest.TestCase` but its `__init__` takes a `parent` argument and
    does not call the base initializer; it is used as a plain helper object by the test class.
    """

    def __init__(
        self,
        parent,
        batch_size=7,
        min_seq_length=400,
        max_seq_length=2000,
        feature_size=24,
        num_mel_bins=24,
        padding_value=0.0,
        sampling_rate=16_000,
        return_attention_mask=True,
        do_normalize=True,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.min_seq_length = min_seq_length
        self.max_seq_length = max_seq_length
        # Step between consecutive input lengths. The divisor is clamped to 1 so that
        # `batch_size=1` does not raise ZeroDivisionError.
        self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // max(self.batch_size - 1, 1)
        self.feature_size = feature_size
        self.num_mel_bins = num_mel_bins
        self.padding_value = padding_value
        self.sampling_rate = sampling_rate
        self.return_attention_mask = return_attention_mask
        self.do_normalize = do_normalize

    def prepare_feat_extract_dict(self):
        """Return the stored settings as keyword arguments for the feature extractor constructor."""
        return {
            "feature_size": self.feature_size,
            "num_mel_bins": self.num_mel_bins,
            "padding_value": self.padding_value,
            "sampling_rate": self.sampling_rate,
            "return_attention_mask": self.return_attention_mask,
            "do_normalize": self.do_normalize,
        }

    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
        """Create a batch of random inputs.

        Args:
            equal_length: When true, every input has `max_seq_length` rows; otherwise lengths start at
                `min_seq_length` and grow by `seq_length_diff`.
            numpify: When true, each input is converted to a numpy array.

        Returns:
            A list of inputs, each a `(length, feature_size)` nested list (or numpy array).
        """
        if equal_length:
            speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
        else:
            # make sure that inputs increase in size
            speech_inputs = [
                floats_list((x, self.feature_size))
                for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
            ]
        if numpify:
            speech_inputs = [np.asarray(x) for x in speech_inputs]
        return speech_inputs
|
||||
|
||||
|
||||
@require_torch
class MCTCTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
    """Tests for `MCTCTFeatureExtractor`: call semantics, normalization, padding/truncation and dtype handling."""

    feature_extraction_class = MCTCTFeatureExtractor

    def setUp(self):
        self.feat_extract_tester = MCTCTFeatureExtractionTester(self)

    def _check_zero_mean_unit_variance(self, input_vector):
        """Assert that `input_vector` has mean within 1e-3 of 0 and variance within 1e-3 of 1."""
        # The mean is compared by absolute value so that a strongly negative mean cannot pass.
        self.assertTrue(np.all(np.abs(np.mean(input_vector)) < 1e-3))
        self.assertTrue(np.all(np.abs(np.var(input_vector) - 1) < 1e-3))

    def test_call(self):
        # Tests that list and numpy inputs, batched or not, produce the same features
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        # create three inputs of length 800, 1000, and 1200
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]

        # Test feature size
        input_features = feature_extractor(np_speech_inputs, padding=True, return_tensors="np").input_features
        self.assertTrue(input_features.ndim == 3)
        self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size)

        # Test not batched input
        encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
        encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))

        # Test batched
        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

        # Test 2-D numpy arrays are batched.
        speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
        np_speech_inputs = np.asarray(speech_inputs)
        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

    def test_cepstral_mean_and_variance_normalization(self):
        """Check normalization of the unpadded part of each sample for every padding strategy."""
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]

        paddings = ["longest", "max_length", "do_not_pad"]
        max_lengths = [None, 16, None]
        for max_length, padding in zip(max_lengths, paddings):
            inputs = feature_extractor(
                speech_inputs,
                padding=padding,
                max_length=max_length,
                return_attention_mask=True,
                truncation=max_length is not None,  # reference to #16419
            )
            input_features = inputs.input_features
            attention_mask = inputs.attention_mask
            # Number of valid (non-padded) frames per sample.
            fbank_feat_lengths = [np.sum(x) for x in attention_mask]
            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]])
            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]])
            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]])

    def test_cepstral_mean_and_variance_normalization_np(self):
        """Same as above with numpy outputs, additionally checking that the padded tail sums to ~0."""
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]

        paddings = ["longest", "max_length", "do_not_pad"]
        max_lengths = [None, 16, None]
        for max_length, padding in zip(max_lengths, paddings):
            inputs = feature_extractor(
                speech_inputs,
                max_length=max_length,
                padding=padding,
                return_tensors="np",
                return_attention_mask=True,
                truncation=max_length is not None,
            )
            input_features = inputs.input_features
            attention_mask = inputs.attention_mask
            # Number of valid (non-padded) frames per sample.
            fbank_feat_lengths = [np.sum(x) for x in attention_mask]

            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]])
            self.assertTrue(input_features[0][fbank_feat_lengths[0] :].sum() < 1e-6)
            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]])
            # Check the padded tail of sample 1 itself (previously sample 0 was indexed here by mistake).
            self.assertTrue(input_features[1][fbank_feat_lengths[1] :].sum() < 1e-6)
            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]])

    def test_cepstral_mean_and_variance_normalization_trunc_max_length(self):
        """Check normalization when every sample is truncated to `max_length=4` with `max_length` padding."""
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]
        inputs = feature_extractor(
            speech_inputs,
            padding="max_length",
            max_length=4,
            truncation=True,
            return_tensors="np",
            return_attention_mask=True,
        )
        input_features = inputs.input_features
        attention_mask = inputs.attention_mask
        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)

        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
        self._check_zero_mean_unit_variance(input_features[1])
        self._check_zero_mean_unit_variance(input_features[2])

    def test_cepstral_mean_and_variance_normalization_trunc_longest(self):
        """Check normalization and output shape for `longest` padding combined with truncation."""
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]
        inputs = feature_extractor(
            speech_inputs,
            padding="longest",
            max_length=4,
            truncation=True,
            return_tensors="np",
            return_attention_mask=True,
        )
        input_features = inputs.input_features
        attention_mask = inputs.attention_mask
        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)

        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
        self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
        self._check_zero_mean_unit_variance(input_features[2])

        # make sure that if max_length < longest -> then pad to max_length
        self.assertEqual(input_features.shape, (3, 4, 24))

        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]
        inputs = feature_extractor(
            speech_inputs,
            padding="longest",
            max_length=16,
            truncation=True,
            return_tensors="np",
            return_attention_mask=True,
        )
        input_features = inputs.input_features
        attention_mask = inputs.attention_mask
        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)

        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
        self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
        self._check_zero_mean_unit_variance(input_features[2])

        # make sure that if max_length < longest -> then pad to max_length
        self.assertEqual(input_features.shape, (3, 16, 24))

    def test_double_precision_pad(self):
        """`pad` must return float32 features for float64 input, for both numpy and torch outputs."""
        import torch

        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
        py_speech_inputs = np_speech_inputs.tolist()

        for inputs in [py_speech_inputs, np_speech_inputs]:
            np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np")
            self.assertTrue(np_processed.input_features.dtype == np.float32)
            pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
            self.assertTrue(pt_processed.input_features.dtype == torch.float32)

    def test_different_window(self):
        """Same dtype checks as `test_double_precision_pad`, with `win_function="hann_window"`."""
        import torch

        init_dict = self.feat_extract_tester.prepare_feat_extract_dict()
        init_dict["win_function"] = "hann_window"

        feature_extractor = self.feature_extraction_class(**init_dict)
        np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
        py_speech_inputs = np_speech_inputs.tolist()

        for inputs in [py_speech_inputs, np_speech_inputs]:
            np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np")
            self.assertTrue(np_processed.input_features.dtype == np.float32)
            pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
            self.assertTrue(pt_processed.input_features.dtype == torch.float32)

    def _load_datasamples(self, num_samples):
        """Return the audio arrays of the first `num_samples` items (sorted by id) of the dummy LibriSpeech set."""
        from datasets import load_dataset

        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        # automatic decoding with librispeech
        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]

        return [x["array"] for x in speech_samples]

    def test_integration(self):
        """Compare frames 100..102 of a real audio sample against stored reference features."""
        # fmt: off
        expected = np.array([
            [
                1.1280,  1.1319,  1.2744,  1.4369,  1.4328,  1.3671,  1.2889,  1.3046,
                1.4419,  0.8387,  0.2995,  0.0404,  0.1068,  0.0472,  0.3728,  1.3356,
                1.4491,  0.4770,  0.3997,  0.2776,  0.3184, -0.1243, -0.1170, -0.0828
            ],
            [
                1.0826,  1.0565,  1.2110,  1.3886,  1.3416,  1.2009,  1.1894,  1.2707,
                1.5153,  0.7005,  0.4916,  0.4017,  0.3743,  0.1935,  0.4228,  1.1084,
                0.9768,  0.0608,  0.2044,  0.1723,  0.0433, -0.2360, -0.2478, -0.2643
            ],
            [
                1.0590,  0.9923,  1.1185,  1.3309,  1.1971,  1.0067,  1.0080,  1.2036,
                1.5397,  1.0383,  0.7672,  0.7551,  0.4878,  0.8771,  0.7565,  0.8775,
                0.9042,  0.4595,  0.6157,  0.4954,  0.1857,  0.0307,  0.0199,  0.1033
            ],
        ])
        # fmt: on

        input_speech = self._load_datasamples(1)
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        input_features = feature_extractor(input_speech, sampling_rate=16000, return_tensors="pt").input_features
        self.assertTrue(np.allclose(input_features[0, 100:103], expected, atol=1e-4))
|
||||
@@ -1,651 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Testing suite for the PyTorch MCTCT model. """
|
||||
|
||||
import inspect
|
||||
import math
|
||||
import unittest
|
||||
|
||||
from datasets import load_dataset
|
||||
|
||||
from transformers import MCTCTConfig, is_torch_available
|
||||
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
|
||||
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
|
||||
from ...test_pipeline_mixin import PipelineTesterMixin
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers import MCTCTForCTC, MCTCTModel, MCTCTProcessor
|
||||
|
||||
|
||||
class MCTCTModelTester:
    """Builds small MCTCT configurations and dummy inputs, and hosts the shared model checks.

    `parent` is the test case whose assertion methods (`assertEqual`, `assertTrue`, ...) are used by the
    `create_and_check_*` and `check_*` helpers. All other constructor arguments are stored as attributes and
    forwarded to `MCTCTConfig` by `get_config`.
    """

    def __init__(
        self,
        parent,
        batch_size=10,
        seq_length=40,  # speech is longer
        is_training=False,
        vocab_size=32,
        hidden_size=128 * 4,
        num_hidden_layers=4,
        intermediate_size=20,
        num_attention_heads=4,
        attention_head_dim=128,
        max_position_embeddings=920,
        layer_norm_eps=1e-5,
        layerdrop=0.3,
        hidden_act="relu",
        initializer_range=0.02,
        hidden_dropout_prob=0.3,
        attention_probs_dropout_prob=0.3,
        conv_glu_dim=1,
        conv_dropout=0.3,
        num_conv_layers=1,
        conv_kernel=(7,),
        conv_stride=(3,),
        input_feat_per_channel=80,
        input_channels=1,
        conv_channels=None,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length  # speech is longer
        self.is_training = is_training

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads

        self.attention_head_dim = attention_head_dim
        self.max_position_embeddings = max_position_embeddings

        self.layer_norm_eps = layer_norm_eps
        self.layerdrop = layerdrop
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob

        self.conv_glu_dim = conv_glu_dim
        self.conv_dropout = conv_dropout
        self.num_conv_layers = num_conv_layers
        self.conv_kernel = conv_kernel
        self.conv_stride = conv_stride
        self.input_feat_per_channel = input_feat_per_channel
        self.input_channels = input_channels
        self.conv_channels = conv_channels

        # Expected sequence length after the convolutional layers: for each layer apply the
        # 1D convolution output-length formula with padding = kernel_size // 2 and dilation = 1.
        output_seq_length = self.seq_length
        dilation = 1
        for _, kernel_sz, stride in zip(range(self.num_conv_layers), self.conv_kernel, self.conv_stride):
            padding = kernel_sz // 2
            output_seq_length = output_seq_length + 2 * padding - dilation * (kernel_sz - 1) - 1
            output_seq_length = torch.div(output_seq_length, stride, rounding_mode="trunc") + 1

        self.output_seq_length = int(math.ceil(output_seq_length))
        self.encoder_seq_length = self.output_seq_length

    def prepare_config_and_inputs(self):
        """Return `(config, input_features, attention_mask)` for one dummy batch."""
        # presumably the second argument of `floats_tensor` is a scale factor — verify against its definition
        input_features = floats_tensor(
            [self.batch_size, self.seq_length, self.input_feat_per_channel], self.vocab_size
        )
        attention_mask = torch.ones([self.batch_size, self.seq_length], dtype=torch.long, device=torch_device)

        config = self.get_config()

        return config, input_features, attention_mask

    def get_config(self):
        """Build an `MCTCTConfig` from the stored hyper-parameters."""
        return MCTCTConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            intermediate_size=self.intermediate_size,
            num_attention_heads=self.num_attention_heads,
            attention_head_dim=self.attention_head_dim,
            max_position_embeddings=self.max_position_embeddings,
            layer_norm_eps=self.layer_norm_eps,
            layerdrop=self.layerdrop,
            hidden_act=self.hidden_act,
            initializer_range=self.initializer_range,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            conv_glu_dim=self.conv_glu_dim,
            conv_dropout=self.conv_dropout,
            num_conv_layers=self.num_conv_layers,
            conv_kernel=self.conv_kernel,
            conv_stride=self.conv_stride,
            input_feat_per_channel=self.input_feat_per_channel,
            input_channels=self.input_channels,
            conv_channels=self.conv_channels,
        )

    def create_and_check_model(self, config, input_features, attention_mask):
        """Run `MCTCTModel` in eval mode and check the shape of `last_hidden_state`."""
        model = MCTCTModel(config=config)
        model.to(torch_device)
        model.eval()
        result = model(input_features, attention_mask=attention_mask)

        self.parent.assertEqual(
            result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size)
        )

    def create_and_check_model_for_ctc(self, config, input_features, attention_mask):
        """Run `MCTCTForCTC` in eval mode and check the shape of the logits."""
        config.add_adapter = True
        config.output_hidden_size = 2 * config.hidden_size
        model = MCTCTForCTC(config=config)
        model.to(torch_device)
        model.eval()
        result = model(input_features, attention_mask=attention_mask)
        # This tester never defines `adapter_output_seq_length` (reading it raised AttributeError),
        # so the logits are checked against the encoder output length computed in `__init__`.
        # NOTE(review): assumes the `add_adapter` flag does not shorten the MCTCT output sequence — confirm.
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.output_seq_length, self.vocab_size))

    def create_and_check_batch_inference(self, config, input_features, *args):
        """Compare batched outputs against per-sample outputs of `MCTCTModel`."""
        # test does not pass for models making use of `group_norm`
        # check: https://github.com/pytorch/fairseq/issues/3227
        model = MCTCTModel(config=config)
        model.to(torch_device)
        model.eval()

        input_features = input_features[:3]
        attention_mask = torch.ones(input_features.shape[:-1], device=torch_device, dtype=torch.bool)

        # NOTE(review): lengths are derived from the last dimension here, whereas `check_ctc_loss` uses
        # dimension -2 for the same purpose — confirm which one is intended.
        input_lengths = [input_features.shape[-1] // i for i in [2, 2, 1]]

        # pad input
        for i in range(len(input_lengths)):
            input_features[i, input_lengths[i] :] = 0.0
            attention_mask[i, input_lengths[i] :] = 0.0

        batch_outputs = model(input_features, attention_mask=attention_mask).last_hidden_state

        for i in range(input_features.shape[0]):
            input_slice = input_features[i : i + 1, : input_lengths[i]]
            output = model(input_slice).last_hidden_state

            batch_output = batch_outputs[i : i + 1, : output.shape[1]]
            self.parent.assertTrue(torch.allclose(output, batch_output, atol=1e-3))

    def check_ctc_loss(self, config, input_features, *args):
        """Check that `MCTCTForCTC` returns a float loss for both "sum" and "mean" reductions."""
        model = MCTCTForCTC(config=config)
        model.to(torch_device)

        # make sure that dropout is disabled
        model.eval()

        input_features = input_features[:3]

        # input_features is a 2D window for each sequence
        attention_mask = torch.ones(input_features.shape[:-1], device=torch_device, dtype=torch.long)

        # -2 since input_features is a 2D window for each sequence in batch
        input_lengths = [input_features.shape[-2] // i for i in [2, 2, 1]]
        max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
        labels = ids_tensor((input_features.shape[0], min(max_length_labels) - 1), model.config.vocab_size)
        # pad input
        for i in range(len(input_lengths)):
            input_features[i, input_lengths[i] :] = 0.0
            attention_mask[i, input_lengths[i] :] = 0

        model.config.ctc_loss_reduction = "sum"
        sum_loss = model(input_features, attention_mask=attention_mask, labels=labels).loss.item()

        model.config.ctc_loss_reduction = "mean"
        mean_loss = model(input_features, attention_mask=attention_mask, labels=labels).loss.item()

        self.parent.assertTrue(isinstance(sum_loss, float))
        self.parent.assertTrue(isinstance(mean_loss, float))

    def check_ctc_training(self, config, input_features, *args):
        """Check that a training-mode forward pass yields a finite loss that can be back-propagated."""
        config.ctc_zero_infinity = True
        model = MCTCTForCTC(config=config)
        model.to(torch_device)
        model.train()

        input_features = input_features[:3]

        input_lengths = [input_features.shape[-2] // i for i in [2, 2, 1]]
        max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
        labels = ids_tensor((input_features.shape[0], max(max_length_labels) - 1), model.config.vocab_size)

        # pad input
        for i in range(len(input_lengths)):
            input_features[i, input_lengths[i] :] = 0.0

            if max_length_labels[i] < labels.shape[-1]:
                # it's important that we make sure that target lengths are at least
                # one shorter than logit lengths to prevent -inf
                labels[i, max_length_labels[i] - 1 :] = -100

        loss = model(input_features, labels=labels).loss
        self.parent.assertFalse(torch.isinf(loss).item())

        loss.backward()

    def check_labels_out_of_vocab(self, config, input_features, *args):
        """Check that label ids drawn from beyond the vocabulary make the model raise `ValueError`."""
        model = MCTCTForCTC(config)
        model.to(torch_device)
        model.train()

        input_features = input_features[:3]

        input_lengths = [input_features.shape[-1] // i for i in [4, 2, 1]]
        max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
        labels = ids_tensor((input_features.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100)

        with self.parent.assertRaises(ValueError):
            model(input_features, labels=labels)

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` in the form expected by the common model tests."""
        config, input_features, attention_mask = self.prepare_config_and_inputs()
        inputs_dict = {"input_features": input_features, "attention_mask": attention_mask}
        return config, inputs_dict
|
||||
|
||||
|
||||
@require_torch
|
||||
class MCTCTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (MCTCTForCTC, MCTCTModel) if is_torch_available() else ()
|
||||
pipeline_model_mapping = (
|
||||
{"automatic-speech-recognition": MCTCTForCTC, "feature-extraction": MCTCTModel} if is_torch_available() else {}
|
||||
)
|
||||
test_pruning = False
|
||||
test_headmasking = False
|
||||
test_torchscript = False
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = MCTCTModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=MCTCTConfig, hidden_size=37)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||
|
||||
def test_ctc_loss_inference(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.check_ctc_loss(*config_and_inputs)
|
||||
|
||||
def test_ctc_train(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.check_ctc_training(*config_and_inputs)
|
||||
|
||||
def test_labels_out_of_vocab(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.check_labels_out_of_vocab(*config_and_inputs)
|
||||
|
||||
# MCTCT has no inputs_embeds
|
||||
def test_inputs_embeds(self):
|
||||
pass
|
||||
|
||||
# `input_ids` is renamed to `input_features`
|
||||
def test_forward_signature(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
signature = inspect.signature(model.forward)
|
||||
# signature.parameters is an OrderedDict => so arg_names order is deterministic
|
||||
arg_names = [*signature.parameters.keys()]
|
||||
|
||||
expected_arg_names = [
|
||||
"input_features",
|
||||
"attention_mask",
|
||||
"head_mask",
|
||||
"output_attentions",
|
||||
"output_hidden_states",
|
||||
"return_dict",
|
||||
]
|
||||
self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
|
||||
|
||||
# MCTCT cannot resize token embeddings
|
||||
# since it has no tokens embeddings
|
||||
def test_resize_tokens_embeddings(self):
|
||||
pass
|
||||
|
||||
# MCTCT has no inputs_embeds
|
||||
def test_model_common_attributes(self):
|
||||
pass
|
||||
|
||||
def test_retain_grad_hidden_states_attentions(self):
    """Check that a backward pass leaves gradients on the first hidden state and attention tensor.

    The config is set to return hidden states and attentions with `layerdrop`
    at 0.0. The first model class is run with labels attached, and
    `backward` is called on the first element of the flattened first output.
    """
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    config.output_hidden_states = True
    config.output_attentions = True
    config.layerdrop = 0.0

    # no need to test all models as different heads yield the same functionality
    model = self.all_model_classes[0](config)
    model.to(torch_device)

    features = inputs_dict["input_features"]
    num_rows, row_length = features.shape[0], features.shape[1]

    # one length entry per row, each equal to the size of dim 1
    input_lengths = torch.tensor([row_length] * num_rows, dtype=torch.long, device=torch_device)
    output_lengths = model._get_feat_extract_output_lengths(input_lengths)

    labels = ids_tensor((num_rows, output_lengths[0] - 2), self.model_tester.vocab_size)
    inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"])
    inputs_dict["labels"] = labels

    outputs = model(**inputs_dict)
    first_output = outputs[0]

    # Encoder-/Decoder-only models
    first_hidden_state = outputs.hidden_states[0]
    first_attention = outputs.attentions[0]
    for tracked in (first_hidden_state, first_attention):
        tracked.retain_grad()

    first_output.flatten()[0].backward(retain_graph=True)

    self.assertIsNotNone(first_hidden_state.grad)
    self.assertIsNotNone(first_attention.grad)
|
||||
|
||||
def test_initialization(self):
    """Check the rounded mean of every trainable parameter after a zero-init config.

    Parameters whose name contains one of the `uniform_init_parms` markers
    must have a mean within [-1.0, 1.0]; every other trainable parameter must
    have a mean of exactly 0.0 or 1.0 (after rounding to 9 decimals).
    """
    config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    configs_no_init = _config_zero_init(config)

    uniform_init_parms = [
        "conv.weight",
        "masked_spec_embed",
        "codevectors",
        "quantizer.weight_proj.weight",
        "project_hid.weight",
        "project_hid.bias",
        "project_q.weight",
        "project_q.bias",
        "feature_projection.projection.weight",
        "feature_projection.projection.bias",
        "objective.weight",
    ]

    for model_class in self.all_model_classes:
        model = model_class(config=configs_no_init)
        for name, param in model.named_parameters():
            if not param.requires_grad:
                continue

            rounded_mean = ((param.data.mean() * 1e9).round() / 1e9).item()
            failure_msg = f"Parameter {name} of model {model_class} seems not properly initialized"

            if any(marker in name for marker in uniform_init_parms):
                self.assertTrue(-1.0 <= rounded_mean <= 1.0, msg=failure_msg)
            else:
                self.assertIn(rounded_mean, [0.0, 1.0], msg=failure_msg)
|
||||
|
||||
# overwrite from test_modeling_common
|
||||
def _mock_init_weights(self, module):
|
||||
if hasattr(module, "weight") and module.weight is not None:
|
||||
module.weight.data.fill_(3)
|
||||
if hasattr(module, "weight_g") and module.weight_g is not None:
|
||||
module.weight_g.data.fill_(3)
|
||||
if hasattr(module, "weight_v") and module.weight_v is not None:
|
||||
module.weight_v.data.fill_(3)
|
||||
if hasattr(module, "bias") and module.bias is not None:
|
||||
module.bias.data.fill_(3)
|
||||
if hasattr(module, "codevectors") and module.codevectors is not None:
|
||||
module.codevectors.data.fill_(3)
|
||||
if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None:
|
||||
module.masked_spec_embed.data.fill_(3)
|
||||
|
||||
@slow
def test_model_from_pretrained(self):
    """Load `speechbrain/m-ctc-t-large` with `MCTCTModel.from_pretrained` and check the result is not None."""
    checkpoint = "speechbrain/m-ctc-t-large"
    self.assertIsNotNone(MCTCTModel.from_pretrained(checkpoint))
|
||||
|
||||
|
||||
@require_torch
class MCTCTRobustModelTest(ModelTesterMixin, unittest.TestCase):
    """Common-model test suite for `MCTCTForCTC` and `MCTCTModel`, driven by `MCTCTModelTester`.

    The `test_pruning`, `test_headmasking` and `test_torchscript` flags are set
    to False. Overrides that do not apply to MCTCT carry an explicit
    `unittest.skip` reason so they are reported as skipped, not as passing.
    """

    all_model_classes = (MCTCTForCTC, MCTCTModel) if is_torch_available() else ()
    test_pruning = False
    test_headmasking = False
    test_torchscript = False

    def setUp(self):
        """Create the model tester and the config tester used by the tests below."""
        self.model_tester = MCTCTModelTester(self)
        self.config_tester = ConfigTester(self, config_class=MCTCTConfig, hidden_size=37)

    def test_config(self):
        """Run the common configuration checks exposed by `self.config_tester`."""
        self.config_tester.run_common_tests()

    def test_model(self):
        """Prepare a config and inputs, then pass them to `create_and_check_model`."""
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_batched_inference(self):
        """Prepare a config and inputs, then pass them to `create_and_check_batch_inference`."""
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_batch_inference(*config_and_inputs)

    def test_ctc_loss_inference(self):
        """Prepare a config and inputs, then pass them to `check_ctc_loss`."""
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.check_ctc_loss(*config_and_inputs)

    def test_ctc_train(self):
        """Prepare a config and inputs, then pass them to `check_ctc_training`."""
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.check_ctc_training(*config_and_inputs)

    def test_labels_out_of_vocab(self):
        """Prepare a config and inputs, then pass them to `check_labels_out_of_vocab`."""
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.check_labels_out_of_vocab(*config_and_inputs)

    @unittest.skip(reason="MCTCT has no inputs_embeds")
    def test_inputs_embeds(self):
        pass

    @unittest.skip(reason="MCTCT's forward takes `input_features` instead of `input_ids`")
    def test_forward_signature(self):
        pass

    @unittest.skip(reason="MCTCT has no token embeddings to resize")
    def test_resize_tokens_embeddings(self):
        pass

    @unittest.skip(reason="MCTCT has no inputs_embeds, so `get_input_embeddings` is not implemented")
    def test_model_common_attributes(self):
        pass

    def test_retain_grad_hidden_states_attentions(self):
        """Check that a backward pass leaves gradients on the first hidden state and attention tensor."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.output_hidden_states = True
        config.output_attentions = True

        # no need to test all models as different heads yield the same functionality
        model_class = self.all_model_classes[0]
        model = model_class(config)
        model.to(torch_device)

        # set layer drop to 0
        model.config.layerdrop = 0.0

        input_features = inputs_dict["input_features"]
        num_rows, row_length = input_features.shape[0], input_features.shape[1]

        # one length entry per row, each equal to the size of dim 1
        input_lengths = torch.tensor([row_length] * num_rows, dtype=torch.long, device=torch_device)
        output_lengths = model._get_feat_extract_output_lengths(input_lengths)

        labels = ids_tensor((num_rows, output_lengths[0] - 2), self.model_tester.vocab_size)
        inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"])
        inputs_dict["labels"] = labels

        outputs = model(**inputs_dict)
        output = outputs[0]

        # Encoder-/Decoder-only models
        hidden_states = outputs.hidden_states[0]
        attentions = outputs.attentions[0]

        hidden_states.retain_grad()
        attentions.retain_grad()

        output.flatten()[0].backward(retain_graph=True)

        self.assertIsNotNone(hidden_states.grad)
        self.assertIsNotNone(attentions.grad)

    def test_initialization(self):
        """Check the rounded mean of every trainable parameter after a zero-init config.

        Parameters whose name contains one of the `uniform_init_parms` markers
        must have a mean within [-1.0, 1.0]; every other trainable parameter
        must have a mean of exactly 0.0 or 1.0 (after rounding to 9 decimals).
        """
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
        configs_no_init = _config_zero_init(config)

        # the marker list does not depend on the model or parameter, so build it once
        uniform_init_parms = [
            "conv.weight",
            "masked_spec_embed",
            "codevectors",
            "quantizer.weight_proj.weight",
            "project_hid.weight",
            "project_hid.bias",
            "project_q.weight",
            "project_q.bias",
            "feature_projection.projection.weight",
            "feature_projection.projection.bias",
            "objective.weight",
        ]

        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            for name, param in model.named_parameters():
                if not param.requires_grad:
                    continue

                rounded_mean = ((param.data.mean() * 1e9).round() / 1e9).item()
                failure_msg = f"Parameter {name} of model {model_class} seems not properly initialized"

                if any(x in name for x in uniform_init_parms):
                    self.assertTrue(-1.0 <= rounded_mean <= 1.0, msg=failure_msg)
                else:
                    self.assertIn(rounded_mean, [0.0, 1.0], msg=failure_msg)

    # overwrite from test_modeling_common
    def _mock_init_weights(self, module):
        """Fill each known parameter attribute of `module` with the constant 3.

        Attributes that are missing or set to None are left alone.
        """
        for attr_name in ("weight", "weight_g", "weight_v", "bias", "codevectors", "masked_spec_embed"):
            tensor = getattr(module, attr_name, None)
            if tensor is not None:
                tensor.data.fill_(3)

    @unittest.skip(reason="Feed forward chunking is not implemented")
    def test_feed_forward_chunking(self):
        pass

    @slow
    def test_model_from_pretrained(self):
        """Load `speechbrain/m-ctc-t-large` with `MCTCTModel.from_pretrained` and check the result is not None."""
        model = MCTCTModel.from_pretrained("speechbrain/m-ctc-t-large")
        self.assertIsNotNone(model)
|
||||
|
||||
|
||||
@require_torch
@require_soundfile
@slow
class MCTCTModelIntegrationTest(unittest.TestCase):
    """Integration tests comparing `speechbrain/m-ctc-t-large` transcriptions against fixed strings."""

    def _load_datasamples(self, num_samples):
        """Return the audio arrays of up to `num_samples` rows of the dummy LibriSpeech validation split.

        Rows are sorted by `id` and kept only when their id is
        `1272-141231-000{i}` for some `i` below `num_samples`.
        """
        dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        wanted_ids = [f"1272-141231-000{i}" for i in range(num_samples)]

        # automatic decoding with librispeech
        selected_rows = dataset.sort("id").filter(lambda row: row["id"] in wanted_ids)
        audio_entries = selected_rows[:num_samples]["audio"]

        return [entry["array"] for entry in audio_entries]

    def test_inference_ctc_normal(self):
        """Transcribe one sample without an attention mask and compare to the expected text."""
        model = MCTCTForCTC.from_pretrained("speechbrain/m-ctc-t-large")
        model.to(torch_device)
        processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large", do_lower_case=True)
        speech = self._load_datasamples(1)

        encoded = processor(speech, return_tensors="pt")
        features = encoded.input_features.to(torch_device)

        with torch.no_grad():
            logits = model(features).logits

        transcriptions = processor.batch_decode(torch.argmax(logits, dim=-1))

        self.assertListEqual(transcriptions, ["a man said to the universe, sir, i exist."])

    def test_inference_ctc_normal_batched(self):
        """Transcribe two padded samples with an attention mask and compare to the expected texts."""
        model = MCTCTForCTC.from_pretrained("speechbrain/m-ctc-t-large")
        model.to(torch_device)
        processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large", do_lower_case=True)

        speech = self._load_datasamples(2)
        encoded = processor(speech, return_tensors="pt", padding=True)

        features = encoded.input_features.to(torch_device)
        mask = encoded.attention_mask.to(torch_device)

        with torch.no_grad():
            logits = model(features, attention_mask=mask).logits

        transcriptions = processor.batch_decode(torch.argmax(logits, dim=-1))

        expected = [
            "a man said to the universe, sir, i exist.",
            '"sweat-covered brion\'s body, trickling into the tight-lowing clossa was the only germent huor."',
        ]
        self.assertListEqual(transcriptions, expected)

    def test_inference_ctc_robust_batched(self):
        """Transcribe four padded samples with an explicit attention mask and compare to the expected texts."""
        model = MCTCTForCTC.from_pretrained("speechbrain/m-ctc-t-large").to(torch_device)
        processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large", do_lower_case=True)

        speech = self._load_datasamples(4)
        encoded = processor(speech, return_tensors="pt", padding=True, return_attention_mask=True)

        features = encoded.input_features.to(torch_device)
        mask = encoded.attention_mask.to(torch_device)

        with torch.no_grad():
            logits = model(features, attention_mask=mask).logits

        transcriptions = processor.batch_decode(torch.argmax(logits, dim=-1))

        expected = [
            "a man said to the universe, sir, i exist.",
            '"sweat-covered brion\'s body, trickling into the tight-lowing clossa was the only germent huor." "',
            "\"the cadona's chest still-dripping bloodthe acofis overstrained eyes, even the soring arena around him"
            " with thousands of spectators retrivialities not worth-thinking about.",
            "his instant panic was followed by a small sharp blow high on his chestr.",
        ]
        self.assertListEqual(transcriptions, expected)
|
||||
@@ -1,158 +0,0 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import MCTCTProcessor, is_speech_available, is_torch_available
|
||||
from transformers.file_utils import FEATURE_EXTRACTOR_NAME
|
||||
from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES, Wav2Vec2CTCTokenizer
|
||||
from transformers.testing_utils import require_torch, require_torchaudio
|
||||
|
||||
|
||||
if is_speech_available() and is_torch_available():
|
||||
from transformers import MCTCTFeatureExtractor
|
||||
|
||||
from .test_feature_extraction_mctct import floats_list
|
||||
|
||||
|
||||
@require_torch
@require_torchaudio
class MCTCTProcessorTest(unittest.TestCase):
    """Tests for `MCTCTProcessor` built from a temporary vocab file and feature-extractor config."""

    def setUp(self):
        """Write a vocab file and a feature-extractor config into a fresh temporary directory."""
        vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ")
        vocab_tokens = {token: index for index, token in enumerate(vocab)}

        self.add_kwargs_tokens_map = {
            "pad_token": "<pad>",
            "unk_token": "<unk>",
            "bos_token": "<s>",
            "eos_token": "</s>",
        }
        feature_extractor_map = {
            "feature_size": 1,
            "padding_value": 0.0,
            "sampling_rate": 16000,
            "return_attention_mask": False,
            "do_normalize": True,
        }

        self.tmpdirname = tempfile.mkdtemp()
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)

        # each file holds one JSON document followed by a newline
        for path, payload in (
            (self.vocab_file, vocab_tokens),
            (self.feature_extraction_file, feature_extractor_map),
        ):
            with open(path, "w", encoding="utf-8") as fp:
                fp.write(json.dumps(payload) + "\n")

    def get_tokenizer(self, **kwargs_init):
        """Load a `Wav2Vec2CTCTokenizer` from the temp dir; `kwargs_init` override the default special tokens."""
        merged_kwargs = {**self.add_kwargs_tokens_map, **kwargs_init}
        return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **merged_kwargs)

    def get_feature_extractor(self, **kwargs):
        """Load an `MCTCTFeatureExtractor` from the temp dir, forwarding `kwargs`."""
        return MCTCTFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)

    def tearDown(self):
        """Remove the temporary directory created in `setUp`."""
        shutil.rmtree(self.tmpdirname)

    def test_save_load_pretrained_default(self):
        """Save a processor and reload it; tokenizer vocab and feature-extractor JSON must match."""
        tokenizer = self.get_tokenizer()
        feature_extractor = self.get_feature_extractor()

        MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor).save_pretrained(self.tmpdirname)
        reloaded = MCTCTProcessor.from_pretrained(self.tmpdirname)

        self.assertEqual(reloaded.tokenizer.get_vocab(), tokenizer.get_vocab())
        self.assertIsInstance(reloaded.tokenizer, Wav2Vec2CTCTokenizer)

        self.assertEqual(reloaded.feature_extractor.to_json_string(), feature_extractor.to_json_string())
        self.assertIsInstance(reloaded.feature_extractor, MCTCTFeatureExtractor)

    def test_save_load_pretrained_additional_features(self):
        """Reload a saved processor with extra kwargs and compare it to components loaded with the same kwargs."""
        saved = MCTCTProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
        saved.save_pretrained(self.tmpdirname)

        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
        feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)

        reloaded = MCTCTProcessor.from_pretrained(
            self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
        )

        self.assertEqual(reloaded.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(reloaded.tokenizer, Wav2Vec2CTCTokenizer)

        self.assertEqual(reloaded.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
        self.assertIsInstance(reloaded.feature_extractor, MCTCTFeatureExtractor)

    def test_feature_extractor(self):
        """Calling the processor on raw speech must match the feature extractor's output sums (delta 1e-2)."""
        feature_extractor = self.get_feature_extractor()
        tokenizer = self.get_tokenizer()
        processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        raw_speech = floats_list((3, 1000))

        from_extractor = feature_extractor(raw_speech, return_tensors="np")
        from_processor = processor(raw_speech, return_tensors="np")

        for key in from_extractor.keys():
            self.assertAlmostEqual(from_extractor[key].sum(), from_processor[key].sum(), delta=1e-2)

    def test_tokenizer(self):
        """Calling the processor with `text=` must return the same lists as the tokenizer."""
        feature_extractor = self.get_feature_extractor()
        tokenizer = self.get_tokenizer()
        processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        input_str = "This is a test string"

        from_processor = processor(text=input_str)
        from_tokenizer = tokenizer(input_str)

        for key in from_tokenizer.keys():
            self.assertListEqual(from_tokenizer[key], from_processor[key])

    def test_tokenizer_decode(self):
        """`processor.batch_decode` must return the same result as `tokenizer.batch_decode`."""
        feature_extractor = self.get_feature_extractor()
        tokenizer = self.get_tokenizer()
        processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]

        decoded_by_processor = processor.batch_decode(predicted_ids)
        decoded_by_tokenizer = tokenizer.batch_decode(predicted_ids)

        self.assertListEqual(decoded_by_tokenizer, decoded_by_processor)

    def test_model_input_names(self):
        """The processor must expose the same `model_input_names` as its feature extractor."""
        feature_extractor = self.get_feature_extractor()
        tokenizer = self.get_tokenizer()
        processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        self.assertListEqual(
            processor.model_input_names,
            feature_extractor.model_input_names,
            msg="`processor` and `feature_extractor` model input names do not match",
        )
|
||||
@@ -1,381 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Testing suite for the RetriBERT tokenizer. """
|
||||
|
||||
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from transformers import RetriBertTokenizer, RetriBertTokenizerFast
|
||||
from transformers.models.bert.tokenization_bert import (
|
||||
VOCAB_FILES_NAMES,
|
||||
BasicTokenizer,
|
||||
WordpieceTokenizer,
|
||||
_is_control,
|
||||
_is_punctuation,
|
||||
_is_whitespace,
|
||||
)
|
||||
from transformers.testing_utils import require_tokenizers, require_torch, slow
|
||||
|
||||
from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english, merge_model_tokenizer_mappings
|
||||
|
||||
|
||||
# Copied from transformers.tests.bert.test_tokenization_bert.py with Bert->RetriBert
|
||||
@require_tokenizers
|
||||
class RetriBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer_class = RetriBertTokenizer
|
||||
test_slow_tokenizer = True
|
||||
rust_tokenizer_class = RetriBertTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
space_between_special_tokens = True
|
||||
from_pretrained_filter = filter_non_english
|
||||
|
||||
def setUp(self):
    """Run the parent `setUp`, then write a small vocabulary file into `self.tmpdirname`."""
    super().setUp()

    vocab_tokens = [
        "[UNK]",
        "[CLS]",
        "[SEP]",
        "[PAD]",
        "[MASK]",
        "want",
        "##want",
        "##ed",
        "wa",
        "un",
        "runn",
        "##ing",
        ",",
        "low",
        "lowest",
    ]
    self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])

    # one token per line, each line terminated by a newline
    vocab_text = "".join(f"{token}\n" for token in vocab_tokens)
    with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
        vocab_writer.write(vocab_text)
|
||||
|
||||
def get_input_output_texts(self, tokenizer):
    """Return a fixed (input text, output text) pair; the `tokenizer` argument is not used."""
    return "UNwant\u00E9d,running", "unwanted, running"
|
||||
|
||||
def test_full_tokenizer(self):
    """Tokenize a sample with the slow tokenizer built from the vocab file and check tokens and ids."""
    tokenizer = self.tokenizer_class(self.vocab_file)

    tokens = tokenizer.tokenize("UNwant\u00E9d,running")
    expected_tokens = ["un", "##want", "##ed", ",", "runn", "##ing"]
    self.assertListEqual(tokens, expected_tokens)

    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    self.assertListEqual(token_ids, [9, 6, 7, 12, 10, 11])
|
||||
|
||||
def test_rust_and_python_full_tokenizers(self):
    """Compare slow and rust tokenizers on one sample, with default settings and with lower casing.

    Each round compares `tokenize`, `encode` without special tokens, and then
    `encode` with a rust tokenizer freshly obtained with default arguments.
    """
    if not self.test_rust_tokenizer:
        return

    sequence = "UNwant\u00E9d,running"

    # first round: default construction; second round: with lower casing
    for init_kwargs in ({}, {"do_lower_case": True}):
        slow_tokenizer = self.get_tokenizer(**init_kwargs)
        fast_tokenizer = self.get_rust_tokenizer(**init_kwargs)

        slow_tokens = slow_tokenizer.tokenize(sequence)
        fast_tokens = fast_tokenizer.tokenize(sequence)
        self.assertListEqual(slow_tokens, fast_tokens)

        slow_ids = slow_tokenizer.encode(sequence, add_special_tokens=False)
        fast_ids = fast_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(slow_ids, fast_ids)

        fast_tokenizer = self.get_rust_tokenizer()
        slow_ids = slow_tokenizer.encode(sequence)
        fast_ids = fast_tokenizer.encode(sequence)
        self.assertListEqual(slow_ids, fast_ids)
|
||||
|
||||
def test_chinese(self):
    """A default `BasicTokenizer` must split the two CJK characters in the sample into separate tokens."""
    tokens = BasicTokenizer().tokenize("ah\u535A\u63A8zz")
    self.assertListEqual(tokens, ["ah", "\u535A", "\u63A8", "zz"])
|
||||
|
||||
def test_basic_tokenizer_lower(self):
    """Check `BasicTokenizer(do_lower_case=True)` on a mixed-case sentence and an accented word."""
    tokenizer = BasicTokenizer(do_lower_case=True)

    sentence_tokens = tokenizer.tokenize(" \tHeLLo!how \n Are yoU? ")
    self.assertListEqual(sentence_tokens, ["hello", "!", "how", "are", "you", "?"])

    accented_tokens = tokenizer.tokenize("H\u00E9llo")
    self.assertListEqual(accented_tokens, ["hello"])
|
||||
|
||||
def test_basic_tokenizer_lower_strip_accents_false(self):
    """Check `BasicTokenizer(do_lower_case=True, strip_accents=False)`: lower-cased, accents kept."""
    tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)

    sentence_tokens = tokenizer.tokenize(" \tHäLLo!how \n Are yoU? ")
    self.assertListEqual(sentence_tokens, ["hällo", "!", "how", "are", "you", "?"])

    accented_tokens = tokenizer.tokenize("H\u00E9llo")
    self.assertListEqual(accented_tokens, ["h\u00E9llo"])
|
||||
|
||||
def test_basic_tokenizer_lower_strip_accents_true(self):
    """Check `BasicTokenizer(do_lower_case=True, strip_accents=True)`: lower-cased, accents removed."""
    tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True)

    sentence_tokens = tokenizer.tokenize(" \tHäLLo!how \n Are yoU? ")
    self.assertListEqual(sentence_tokens, ["hallo", "!", "how", "are", "you", "?"])

    accented_tokens = tokenizer.tokenize("H\u00E9llo")
    self.assertListEqual(accented_tokens, ["hello"])
|
||||
|
||||
def test_basic_tokenizer_lower_strip_accents_default(self):
    """Check `BasicTokenizer(do_lower_case=True)` with `strip_accents` unset: accents are removed."""
    tokenizer = BasicTokenizer(do_lower_case=True)

    sentence_tokens = tokenizer.tokenize(" \tHäLLo!how \n Are yoU? ")
    self.assertListEqual(sentence_tokens, ["hallo", "!", "how", "are", "you", "?"])

    accented_tokens = tokenizer.tokenize("H\u00E9llo")
    self.assertListEqual(accented_tokens, ["hello"])
|
||||
|
||||
def test_basic_tokenizer_no_lower(self):
    """Check `BasicTokenizer(do_lower_case=False)`: the original casing is kept."""
    tokenizer = BasicTokenizer(do_lower_case=False)

    sentence_tokens = tokenizer.tokenize(" \tHeLLo!how \n Are yoU? ")
    self.assertListEqual(sentence_tokens, ["HeLLo", "!", "how", "Are", "yoU", "?"])
|
||||
|
||||
def test_basic_tokenizer_no_lower_strip_accents_false(self):
    """Check `BasicTokenizer(do_lower_case=False, strip_accents=False)`: casing and accents are kept."""
    tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)

    sentence_tokens = tokenizer.tokenize(" \tHäLLo!how \n Are yoU? ")
    self.assertListEqual(sentence_tokens, ["HäLLo", "!", "how", "Are", "yoU", "?"])
|
||||
|
||||
def test_basic_tokenizer_no_lower_strip_accents_true(self):
    """Check `BasicTokenizer(do_lower_case=False, strip_accents=True)`: casing kept, accents removed."""
    tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)

    sentence_tokens = tokenizer.tokenize(" \tHäLLo!how \n Are yoU? ")
    self.assertListEqual(sentence_tokens, ["HaLLo", "!", "how", "Are", "yoU", "?"])
|
||||
|
||||
def test_basic_tokenizer_respects_never_split_tokens(self):
    """A token listed in `never_split` must come out of `BasicTokenizer` as a single piece."""
    tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])

    sentence_tokens = tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]")
    self.assertListEqual(sentence_tokens, ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"])
|
||||
|
||||
def test_wordpiece_tokenizer(self):
    """Check `WordpieceTokenizer` on empty input, a fully covered input, and one with an unknown word."""
    vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
    vocab = {token: index for index, token in enumerate(vocab_tokens)}
    tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")

    cases = (
        ("", []),
        ("unwanted running", ["un", "##want", "##ed", "runn", "##ing"]),
        ("unwantedX running", ["[UNK]", "runn", "##ing"]),
    )
    for text, expected_tokens in cases:
        self.assertListEqual(tokenizer.tokenize(text), expected_tokens)
|
||||
|
||||
def test_is_whitespace(self):
    """`_is_whitespace` must accept space, tab, CR, LF and U+00A0, and reject "A" and "-"."""
    for char in (" ", "\t", "\r", "\n", "\u00A0"):
        self.assertTrue(_is_whitespace(char))

    for char in ("A", "-"):
        self.assertFalse(_is_whitespace(char))
|
||||
|
||||
def test_is_control(self):
    """`_is_control` must accept U+0005 and reject "A", space, tab and CR."""
    self.assertTrue(_is_control("\u0005"))

    for char in ("A", " ", "\t", "\r"):
        self.assertFalse(_is_control(char))
|
||||
|
||||
def test_is_punctuation(self):
    """`_is_punctuation` must accept "-", "$", "`" and ".", and reject "A" and space."""
    for char in ("-", "$", "`", "."):
        self.assertTrue(_is_punctuation(char))

    for char in ("A", " "):
        self.assertFalse(_is_punctuation(char))
|
||||
|
||||
def test_clean_text(self):
    """Slow and rust tokenizers must both map "Test"/"test" to `[UNK]` and a soft hyphen to nothing."""
    slow_tokenizer = self.get_tokenizer()
    fast_tokenizer = self.get_rust_tokenizer()

    # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340
    samples = ["Test", "\xad", "test"]
    expected = [["[UNK]"], [], ["[UNK]"]]

    for candidate in (slow_tokenizer, fast_tokenizer):
        self.assertListEqual([candidate.tokenize(text) for text in samples], expected)
|
||||
|
||||
@slow
def test_sequence_builders(self):
    """Check `build_inputs_with_special_tokens` for a single sequence and for a pair.

    The pretrained `yjernite/retribert-base-uncased` tokenizer must wrap a
    single sequence as `[101] + ids + [102]` and a pair as
    `[101] + ids_a + [102] + ids_b + [102]`.
    """
    tokenizer = self.tokenizer_class.from_pretrained("yjernite/retribert-base-uncased")

    text = tokenizer.encode("sequence builders", add_special_tokens=False)
    text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

    encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
    encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

    # unittest assertions are not stripped under `python -O` and report a diff on failure
    self.assertEqual(encoded_sentence, [101] + text + [102])
    self.assertEqual(encoded_pair, [101] + text + [102] + text_2 + [102])
|
||||
|
||||
def test_offsets_with_special_characters(self):
    """Check tokens and offset mappings of the rust tokenizer on a sentence with an accent and a mask token.

    The expected (offset, token) pairs depend on whether the loaded tokenizer
    reports `do_lower_case`.
    """
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
            encoding = tokenizer_r.encode_plus(
                sentence,
                return_attention_mask=False,
                return_token_type_ids=False,
                return_offsets_mapping=True,
                add_special_tokens=True,
            )

            # a tokenizer without the attribute is treated as case-preserving
            if getattr(tokenizer_r, "do_lower_case", False):
                expected_results = [
                    ((0, 0), tokenizer_r.cls_token),
                    ((0, 1), "a"),
                    ((1, 2), ","),
                    ((3, 8), "naive"),
                    ((9, 15), tokenizer_r.mask_token),
                    ((16, 21), "allen"),
                    ((21, 23), "##nl"),
                    ((23, 24), "##p"),
                    ((25, 33), "sentence"),
                    ((33, 34), "."),
                    ((0, 0), tokenizer_r.sep_token),
                ]
            else:
                expected_results = [
                    ((0, 0), tokenizer_r.cls_token),
                    ((0, 1), "A"),
                    ((1, 2), ","),
                    ((3, 5), "na"),
                    ((5, 6), "##ï"),
                    ((6, 8), "##ve"),
                    ((9, 15), tokenizer_r.mask_token),
                    ((16, 21), "Allen"),
                    ((21, 23), "##NL"),
                    ((23, 24), "##P"),
                    ((25, 33), "sentence"),
                    ((33, 34), "."),
                    ((0, 0), tokenizer_r.sep_token),
                ]

            expected_tokens = [token for _, token in expected_results]
            expected_offsets = [offsets for offsets, _ in expected_results]

            self.assertEqual(expected_tokens, tokenizer_r.convert_ids_to_tokens(encoding["input_ids"]))
            self.assertEqual(expected_offsets, encoding["offset_mapping"])
|
||||
|
||||
def test_change_tokenize_chinese_chars(self):
    """Check the effect of `tokenize_chinese_chars` on slow and fast tokenizers.

    With the option enabled every Chinese character is expected to come back
    as its own token without a "##" prefix. With the option disabled only the
    first character is expected without the prefix; the following ones are
    expected as "##"-prefixed continuation pieces.
    """
    list_of_commun_chinese_char = ["的", "人", "有"]
    text_with_chinese_char = "".join(list_of_commun_chinese_char)

    # it is expected that only the first Chinese character is not preceded by "##".
    expected_tokens_unsplit = [
        f"##{token}" if idx != 0 else token for idx, token in enumerate(list_of_commun_chinese_char)
    ]
    # (option value, expected tokens) - the enabled case runs first, as before.
    scenarios = (
        (True, list_of_commun_chinese_char),
        (False, expected_tokens_unsplit),
    )

    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            for tokenize_chinese_chars, expected_tokens in scenarios:
                # Work on a copy: `kwargs` belongs to `self.tokenizers_list`, and writing the
                # option into it would leak the setting out of this test.
                scenario_kwargs = {**kwargs, "tokenize_chinese_chars": tokenize_chinese_chars}
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **scenario_kwargs)
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **scenario_kwargs)

                ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
                ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)

                tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p)
                tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r)

                self.assertListEqual(tokens_without_spe_char_p, expected_tokens)
                self.assertListEqual(tokens_without_spe_char_r, expected_tokens)
|
||||
|
||||
# RetriBertModel doesn't define `get_input_embeddings` and its forward method doesn't take only the output of the tokenizer as input
|
||||
@require_torch
@slow
def test_torch_encode_plus_sent_to_model(self):
    """Feed single and batched tokenizer output to the matching model.

    For every tokenizer that has an entry in the merged model/tokenizer
    mapping, a model is built from the default config and the encoded inputs
    are passed to `model.embed_questions`. Tokenizers without a mapping entry,
    encoder-decoder configs and configs without a pad token id are skipped.
    """
    import torch

    from transformers import MODEL_MAPPING, TOKENIZER_MAPPING

    MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING)

    for tokenizer in self.get_tokenizers(do_lower_case=False):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
                # Skip only this tokenizer; a `return` here would silently drop
                # every tokenizer that comes after it.
                continue

            config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
            config = config_class()

            if config.is_encoder_decoder or config.pad_token_id is None:
                continue

            model = model_class(config)

            # The following test is different from the common's one
            self.assertGreaterEqual(model.bert_query.get_input_embeddings().weight.shape[0], len(tokenizer))

            # Build sequence
            first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
            sequence = " ".join(first_ten_tokens)
            encoded_sequence = tokenizer.encode_plus(sequence, return_tensors="pt")

            # Ensure that the BatchEncoding.to() method works.
            encoded_sequence.to(model.device)

            batch_encoded_sequence = tokenizer.batch_encode_plus([sequence, sequence], return_tensors="pt")

            # This should not fail
            with torch.no_grad():  # saves some time
                # The following lines are different from the common's ones
                model.embed_questions(**encoded_sequence)
                model.embed_questions(**batch_encoded_sequence)
|
||||
@@ -1,904 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from typing import List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from transformers import AddedToken, TapexTokenizer
|
||||
from transformers.models.tapex.tokenization_tapex import VOCAB_FILES_NAMES
|
||||
from transformers.testing_utils import is_pt_tf_cross_test, require_pandas, slow
|
||||
|
||||
from ...test_tokenization_common import TokenizerTesterMixin
|
||||
|
||||
|
||||
@require_pandas
|
||||
class TapexTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer_class = TapexTokenizer
|
||||
test_rust_tokenizer = False
|
||||
from_pretrained_kwargs = {"cls_token": "<s>"}
|
||||
test_seq2seq = False
|
||||
|
||||
def setUp(self):
    """Write a tiny BPE vocabulary file and merges file into the temporary directory."""
    super().setUp()

    # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
    # fmt: off
    vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "\u0120", "\u0120l", "\u0120n", "\u0120lo", "\u0120low", "er", "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]  # noqa: E231
    # fmt: on
    token_to_id = {token: index for index, token in enumerate(vocab)}
    merge_lines = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]

    self.special_tokens_map = {"unk_token": "<unk>"}
    self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
    self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])

    vocab_payload = json.dumps(token_to_id) + "\n"
    with open(self.vocab_file, "w", encoding="utf-8") as vocab_handle:
        vocab_handle.write(vocab_payload)

    merges_payload = "\n".join(merge_lines)
    with open(self.merges_file, "w", encoding="utf-8") as merges_handle:
        merges_handle.write(merges_payload)
|
||||
|
||||
def get_table(self, tokenizer, length=5):
    """Build a one-column DataFrame from decoded vocabulary entries.

    Every id in `range(len(tokenizer))` is decoded individually. The first
    decoded string becomes the column name and the strings at positions
    `1 .. length - 1` become the rows. `length == 0` yields an empty frame.
    """
    decoded = [tokenizer.decode([token_id], clean_up_tokenization_spaces=False) for token_id in range(len(tokenizer))]

    if length == 0:
        return pd.DataFrame.from_dict({})

    header = decoded[0]
    rows = [decoded[position] for position in range(1, length)]
    return pd.DataFrame.from_dict({header: rows})
|
||||
|
||||
def get_table_and_query(self, tokenizer, length=5):
    """Return a `(table, query)` pair built from decoded vocabulary entries.

    The table comes from `get_table` with `length - 3`; the query is the first
    three decoded vocabulary strings joined by single spaces.
    """
    decoded = [tokenizer.decode([token_id], clean_up_tokenization_spaces=False) for token_id in range(len(tokenizer))]
    table = self.get_table(tokenizer, length=length - 3)
    return table, " ".join(decoded[:3])
|
||||
|
||||
def get_clean_sequence(
    self,
    tokenizer,
    with_prefix_space=False,
    max_length=20,
    min_length=5,
    empty_table: bool = False,
    add_special_tokens: bool = True,
    return_table_and_query: bool = False,
):
    """Encode a small table/query pair and return its decoded text and ids.

    With `empty_table` the table is empty and the query is made of the first
    `min_length` decoded vocabulary strings; otherwise the table has one column
    built from the vocabulary and the query uses the first three strings.

    Returns `(output_txt, output_ids)`, extended with `(table, query)` when
    `return_table_and_query` is set.

    Raises:
        ValueError: if the number of ids falls outside `[min_length, max_length]`.
    """
    decoded = [tokenizer.decode([token_id], clean_up_tokenization_spaces=False) for token_id in range(len(tokenizer))]

    if not empty_table:
        cells = [decoded[position] for position in range(1, min_length - 3)]
        table = pd.DataFrame.from_dict({decoded[0]: cells})
        query_parts = decoded[:3]
    else:
        table = pd.DataFrame.from_dict({})
        query_parts = decoded[:min_length]
    query = " ".join(query_parts)

    output_ids = tokenizer.encode(table, query, add_special_tokens=add_special_tokens)
    output_txt = tokenizer.decode(output_ids)

    id_count = len(output_ids)
    if id_count < min_length:
        raise ValueError("Update the code to generate the sequences so that they are larger")
    if id_count > max_length:
        raise ValueError("Update the code to generate the sequences so that they are smaller")

    result = (output_txt, output_ids)
    if return_table_and_query:
        result += (table, query)
    return result
|
||||
|
||||
def get_tokenizer(self, **kwargs):
    """Load the tokenizer from the temporary directory.

    Entries of `self.special_tokens_map` take precedence over caller-supplied
    keyword arguments with the same name.
    """
    load_kwargs = {**kwargs, **self.special_tokens_map}
    return self.tokenizer_class.from_pretrained(self.tmpdirname, **load_kwargs)
|
||||
|
||||
def get_input_output_texts(self, tokenizer):
    """Return the sample input text and the text expected back; both are the same string."""
    sample = "lower newer"
    return sample, sample
|
||||
|
||||
def test_full_tokenizer_roberta(self):
    """Tokenize a short text with the fixture vocabulary and map the pieces to ids."""
    tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map)

    expected_pieces = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"]
    pieces = tokenizer.tokenize("lower newer")
    self.assertListEqual(pieces, expected_pieces)

    # The unknown token is appended so that its id is part of the comparison.
    expected_ids = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
    pieces_with_unk = pieces + [tokenizer.unk_token]
    self.assertListEqual(tokenizer.convert_tokens_to_ids(pieces_with_unk), expected_ids)
|
||||
|
||||
def roberta_dict_integration_testing(self):
    """Compare the encodings of two sample sentences with hard-coded id lists.

    NOTE(review): the name has no `test_` prefix, so default test discovery
    presumably never runs this method - confirm whether that is intended.
    """
    tokenizer = self.get_tokenizer()

    expectations = (
        ("Hello world!", [0, 31414, 232, 328, 2]),
        ("Hello world! cécé herlolip 418", [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]),
    )
    for text, expected_ids in expectations:
        self.assertListEqual(tokenizer.encode(text, add_special_tokens=False), expected_ids)
|
||||
|
||||
def test_add_tokens_tokenizer(self):
    """Check size bookkeeping and encoding after adding regular and special tokens.

    `vocab_size` must stay unchanged while `len(tokenizer)` grows by the number
    of added tokens, and the added tokens must be encoded to ids above the
    base vocabulary.
    """
    for tokenizer in self.get_tokenizers(do_lower_case=False):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            empty_table = self.get_table(tokenizer, length=0)
            base_vocab_size = tokenizer.vocab_size
            base_total = len(tokenizer)

            self.assertNotEqual(base_vocab_size, 0)

            # We usually have added tokens from the start in tests because our vocab fixtures are
            # smaller than the original vocabs, so the two sizes above are deliberately not compared.

            plain_tokens = ["aaaaa bbbbbb", "cccccccccdddddddd"]
            plain_added = tokenizer.add_tokens(plain_tokens)
            vocab_size_after_plain = tokenizer.vocab_size
            total_after_plain = len(tokenizer)

            self.assertNotEqual(vocab_size_after_plain, 0)
            self.assertEqual(base_vocab_size, vocab_size_after_plain)
            self.assertEqual(plain_added, len(plain_tokens))
            self.assertEqual(total_after_plain, base_total + len(plain_tokens))

            ids = tokenizer.encode(empty_table, "aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)

            self.assertGreaterEqual(len(ids), 4)
            self.assertGreater(ids[0], tokenizer.vocab_size - 1)
            self.assertGreater(ids[-2], tokenizer.vocab_size - 1)

            special_tokens = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
            special_added = tokenizer.add_special_tokens(special_tokens)
            vocab_size_after_special = tokenizer.vocab_size
            total_after_special = len(tokenizer)

            self.assertNotEqual(vocab_size_after_special, 0)
            self.assertEqual(base_vocab_size, vocab_size_after_special)
            self.assertEqual(special_added, len(special_tokens))
            self.assertEqual(total_after_special, total_after_plain + len(special_tokens))

            ids = tokenizer.encode(
                empty_table,
                ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
                add_special_tokens=False,
            )

            self.assertGreaterEqual(len(ids), 6)
            # The new eos token leads the sequence and the new pad token sits second to last.
            self.assertGreater(ids[0], tokenizer.vocab_size - 1)
            self.assertGreater(ids[0], ids[1])
            self.assertGreater(ids[-2], tokenizer.vocab_size - 1)
            self.assertGreater(ids[-2], ids[-3])
            self.assertEqual(ids[0], tokenizer.eos_token_id)
            self.assertEqual(ids[-2], tokenizer.pad_token_id)
|
||||
|
||||
def test_token_type_ids(self):
    """Check that token type ids match the input ids in length and contain 0."""
    for tokenizer in self.get_tokenizers():
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            table = self.get_table(tokenizer, length=0)
            query = "Test this method."

            # We want to have sequence 0 and sequence 1 are tagged
            # respectively with 0 and 1 token_ids
            # (regardless of whether the model use token type ids)
            # We use this assumption in the QA pipeline among other place
            encoding = tokenizer(table, query, return_token_type_ids=True)
            type_ids = encoding["token_type_ids"]

            # Assert that the token type IDs have the same length as the input IDs
            self.assertEqual(len(type_ids), len(encoding["input_ids"]))
            self.assertIn(0, type_ids)
|
||||
|
||||
def test_add_special_tokens(self):
    """Check that a newly added special token encodes to one id and is skipped on decode.

    The token is registered as `cls_token`, encoded on its own without special
    tokens, and then decoded with `skip_special_tokens=True`; the decoded text
    must not contain it.
    """
    tokenizers: List[TapexTokenizer] = self.get_tokenizers(do_lower_case=False)
    for tokenizer in tokenizers:
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            input_table = self.get_table(tokenizer, length=0)

            special_token = "[SPECIAL_TOKEN]"

            tokenizer.add_special_tokens({"cls_token": special_token})
            encoded_special_token = tokenizer.encode(input_table, special_token, add_special_tokens=False)
            self.assertEqual(len(encoded_special_token), 1)

            decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True)
            # assertNotIn reports both operands on failure, unlike assertTrue(x not in y).
            self.assertNotIn(special_token, decoded)
|
||||
|
||||
def test_batch_encode_plus_overflowing_tokens(self):
    """Run `batch_encode_plus` with truncation, padding and overflow reporting.

    There is no assertion on the result: the test only requires the call to
    complete without raising.
    """
    for tokenizer in self.get_tokenizers(do_lower_case=False):
        table = self.get_table(tokenizer, length=10)
        queries = ["Testing the prepare_for_model method.", "Test"]

        # Padding is requested below, so make sure a pad token exists.
        if tokenizer.pad_token is None:
            tokenizer.add_special_tokens({"pad_token": "[PAD]"})

        tokenizer.batch_encode_plus(
            table,
            queries,
            return_overflowing_tokens=True,
            truncation=True,
            padding=True,
            max_length=3,
        )
|
||||
|
||||
@is_pt_tf_cross_test
def test_batch_encode_plus_tensors(self):
    """Check tensor output of `batch_encode_plus` for PyTorch and TensorFlow.

    Unpadded sequences of different lengths must raise `ValueError` when
    tensors are requested. If the tokenizer has no pad token id, padded tensor
    requests must raise as well; otherwise the PyTorch, TensorFlow and plain
    list encodings must all hold the same values.
    """
    tokenizers = self.get_tokenizers(do_lower_case=False)
    for tokenizer in tokenizers:
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            sequences = [
                "Testing batch encode plus",
                "Testing batch encode plus with different sequence lengths",
                "Testing batch encode plus with different sequence lengths correctly pads",
            ]

            table = self.get_table(tokenizer, length=0)

            # A Tensor cannot be build by sequences which are not the same size
            self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="pt")
            self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="tf")

            if tokenizer.pad_token_id is None:
                self.assertRaises(
                    ValueError,
                    tokenizer.batch_encode_plus,
                    table,
                    sequences,
                    padding=True,
                    return_tensors="pt",
                )
                self.assertRaises(
                    ValueError,
                    tokenizer.batch_encode_plus,
                    table,
                    sequences,
                    padding="longest",
                    return_tensors="tf",
                )
            else:
                pytorch_tensor = tokenizer.batch_encode_plus(table, sequences, padding=True, return_tensors="pt")
                tensorflow_tensor = tokenizer.batch_encode_plus(
                    table, sequences, padding="longest", return_tensors="tf"
                )
                encoded_sequences = tokenizer.batch_encode_plus(table, sequences, padding=True)

                for key in encoded_sequences.keys():
                    pytorch_value = pytorch_tensor[key].tolist()
                    tensorflow_value = tensorflow_tensor[key].numpy().tolist()
                    encoded_value = encoded_sequences[key]

                    # The third positional argument of assertEqual is the failure message,
                    # so each framework is compared with the plain encoding separately.
                    self.assertEqual(pytorch_value, encoded_value)
                    self.assertEqual(tensorflow_value, encoded_value)
|
||||
|
||||
def test_call(self):
    """Check that calling the tokenizer matches `encode_plus` / `batch_encode_plus`."""
    # Tests that all call wrap to encode_plus and batch_encode_plus
    for tokenizer in self.get_tokenizers(do_lower_case=False):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            sequences = [
                "Testing batch encode plus",
                "Testing batch encode plus with different sequence lengths",
                "Testing batch encode plus with different sequence lengths correctly pads",
            ]

            # (table length, query input, explicit method the call must agree with)
            cases = (
                (0, sequences[0], tokenizer.encode_plus),  # not batched
                (10, sequences[1], tokenizer.encode_plus),  # not batched, non-empty table
                (0, sequences, tokenizer.batch_encode_plus),  # batched
            )
            for table_length, query, explicit_method in cases:
                table = self.get_table(tokenizer, length=table_length)
                via_method = explicit_method(table, query)
                via_call = tokenizer(table, query)
                self.assertEqual(via_method, via_call)
|
||||
|
||||
def test_internal_consistency(self):
    """Check that tokenize, id conversion, encode and decode agree with each other."""
    for tokenizer in self.get_tokenizers():
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            empty_table = self.get_table(tokenizer, length=0)
            input_text, output_text = self.get_input_output_texts(tokenizer)

            # tokenize + convert must give the same ids as encode without special tokens
            pieces = tokenizer.tokenize(input_text)
            ids_from_pieces = tokenizer.convert_tokens_to_ids(pieces)
            ids_from_encode = tokenizer.encode(empty_table, input_text, add_special_tokens=False)
            self.assertListEqual(ids_from_pieces, ids_from_encode)

            round_trip_pieces = tokenizer.convert_ids_to_tokens(ids_from_pieces)
            self.assertNotEqual(len(round_trip_pieces), 0)

            decoded_text = tokenizer.decode(ids_from_pieces)
            self.assertIsInstance(decoded_text, str)
            self.assertEqual(decoded_text, output_text)
|
||||
|
||||
def test_save_and_load_tokenizer(self):
    """Check that a saved and reloaded tokenizer encodes identically.

    The tokenizer is saved to a temporary directory and loaded back; the ids
    for a sample text and the full vocabulary must be the same before and
    after. The directory is removed even when an assertion fails.
    """
    # safety check on max_len default value so we are sure the test works
    tokenizers = self.get_tokenizers()
    for tokenizer in tokenizers:
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            self.assertNotEqual(tokenizer.model_max_length, 42)

    # Now let's start the test
    tokenizers = self.get_tokenizers()
    for tokenizer in tokenizers:
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            table = self.get_table(tokenizer, length=0)

            sample_text = " He is very happy, UNwant\u00E9d,running"
            before_tokens = tokenizer.encode(table, sample_text, add_special_tokens=False)
            before_vocab = tokenizer.get_vocab()

            # Isolate this from the other tests because we save additional tokens/etc.
            # The context manager cleans the directory up on every exit path.
            with tempfile.TemporaryDirectory() as tmpdirname:
                tokenizer.save_pretrained(tmpdirname)

                after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
                after_tokens = after_tokenizer.encode(table, sample_text, add_special_tokens=False)
                after_vocab = after_tokenizer.get_vocab()
                self.assertListEqual(before_tokens, after_tokens)
                self.assertDictEqual(before_vocab, after_vocab)
|
||||
|
||||
def test_number_of_added_tokens(self):
    """Check that encoding with special tokens adds exactly two ids."""
    for tokenizer in self.get_tokenizers(do_lower_case=False):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            table, query = self.get_table_and_query(tokenizer)

            bare_ids = tokenizer.encode(table, query, add_special_tokens=False)
            ids_with_special = tokenizer.encode(table, query, add_special_tokens=True)

            added = len(ids_with_special) - len(bare_ids)
            self.assertEqual(2, added)
|
||||
|
||||
@unittest.skip("TAPEX cannot handle `prepare_for_model` without passing by `encode_plus` or `batch_encode_plus`")
def test_prepare_for_model(self):
    """Skipped unconditionally; see the reason given to the skip decorator."""
|
||||
|
||||
@unittest.skip("TAPEX tokenizer does not support pairs.")
def test_maximum_encoding_length_pair_input(self):
    """Skipped unconditionally; see the reason given to the skip decorator."""
|
||||
|
||||
@unittest.skip("TAPEX tokenizer does not support pairs.")
def test_maximum_encoding_length_single_input(self):
    """Skipped unconditionally.

    NOTE(review): the skip reason mentions pairs although this is the
    single-input variant - confirm the reason text is the intended one.
    """
|
||||
|
||||
@unittest.skip("Not implemented")
def test_right_and_left_truncation(self):
    """Skipped unconditionally; see the reason given to the skip decorator."""
|
||||
|
||||
def test_encode_decode_with_spaces(self):
    """Check spacing between added tokens after an encode/decode round trip.

    Depending on `self.space_between_special_tokens`, the decoded text is
    expected with or without spaces between the added tokens; a lower-cased
    variant of the expectation is accepted as well.
    """
    for tokenizer in self.get_tokenizers(do_lower_case=False):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            table = self.get_table(tokenizer, length=0)

            added_tokens = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)]
            tokenizer.add_tokens(added_tokens)

            raw_text = "[ABC][DEF][ABC][DEF]"
            expected_text = "[ABC] [DEF] [ABC] [DEF]" if self.space_between_special_tokens else raw_text

            ids = tokenizer.encode(table, raw_text, add_special_tokens=False)
            round_trip = tokenizer.decode(ids, spaces_between_special_tokens=self.space_between_special_tokens)
            self.assertIn(round_trip, [expected_text, expected_text.lower()])
|
||||
|
||||
def test_tokenize_special_tokens(self):
    """Check that `tokenize` keeps special tokens as single, unchanged tokens."""
    for tokenizer in self.get_tokenizers(fast=True, do_lower_case=True):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            first_special = "[SPECIAL_TOKEN_1]"
            second_special = "[SPECIAL_TOKEN_2]"

            # TODO:
            # Can we combine `unique_no_split_tokens` and `all_special_tokens`(and properties related to it)
            # with one variable(property) for a better maintainability?

            # `add_tokens` method stores special tokens only in `tokenizer.unique_no_split_tokens`. (in tokenization_utils.py)
            tokenizer.add_tokens([first_special], special_tokens=True)
            # `add_special_tokens` method stores special tokens in `tokenizer.additional_special_tokens`,
            # which also occur in `tokenizer.all_special_tokens`. (in tokenization_utils_base.py)
            tokenizer.add_special_tokens({"additional_special_tokens": [second_special]})

            first_pieces = tokenizer.tokenize(first_special)
            second_pieces = tokenizer.tokenize(second_special)

            self.assertEqual(len(first_pieces), 1)
            self.assertEqual(len(second_pieces), 1)
            self.assertEqual(first_pieces[0], first_special)
            self.assertEqual(second_pieces[0], second_special)
|
||||
|
||||
def test_special_tokens_mask(self):
    """Check that dropping masked positions recovers the encoding without special tokens."""
    for tokenizer in self.get_tokenizers(do_lower_case=False):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            table = self.get_table(tokenizer, length=0)
            query = "Encode this."

            # Testing single inputs
            bare_ids = tokenizer.encode(table, query, add_special_tokens=False)
            encoding = tokenizer.encode_plus(
                table,
                query,
                add_special_tokens=True,
                return_special_tokens_mask=True,
            )
            ids_with_special = encoding["input_ids"]
            mask = encoding["special_tokens_mask"]
            self.assertEqual(len(mask), len(ids_with_special))

            # Keep only the ids whose mask entry is falsy.
            kept_ids = [token_id for token_id, is_special in zip(ids_with_special, mask) if not is_special]
            self.assertEqual(bare_ids, kept_ids)
|
||||
|
||||
def test_padding_to_max_length(self):
    """Backward-compatibility check for the `pad_to_max_length` flag.

    Kept until `pad_to_max_length` is removed: with a `max_length` the sequence
    must be right-padded up to it; without one the flag must change nothing.
    """
    for tokenizer in self.get_tokenizers(do_lower_case=False):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            table = self.get_table(tokenizer)
            query = "Sequence"
            pad_count = 10

            # check correct behaviour if no pad_token_id exists and add it eventually
            self._check_no_pad_token_padding(tokenizer, query)

            pad_id = tokenizer.pad_token_id

            # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
            tokenizer.padding_side = "right"
            unpadded = tokenizer.encode(table, query)
            target_length = len(unpadded) + pad_count
            padded = tokenizer.encode(
                table,
                query,
                max_length=target_length,
                pad_to_max_length=True,
            )
            self.assertEqual(target_length, len(padded))
            self.assertListEqual(unpadded + [pad_id] * pad_count, padded)

            # Check that nothing is done when a maximum length is not specified
            unpadded = tokenizer.encode(table, query)

            tokenizer.padding_side = "right"
            flag_only = tokenizer.encode(table, query, pad_to_max_length=True)
            self.assertEqual(len(unpadded), len(flag_only))
            self.assertListEqual(unpadded, flag_only)
|
||||
|
||||
def test_padding_to_multiple_of(self):
    """Check `pad_to_multiple_of=8` with and without padding enabled.

    With `padding=True` every field of the encoding must have a length that is
    a multiple of 8, also when truncation is on. Without a padding argument
    the lengths are expected not to be multiples of 8. Tokenizers without a
    pad token are skipped.
    """
    tokenizers = self.get_tokenizers()
    for tokenizer in tokenizers:
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            table = self.get_table(tokenizer, length=0)
            if tokenizer.pad_token is None:
                self.skipTest("No padding token.")
            else:
                empty_tokens = tokenizer(table, padding=True, pad_to_multiple_of=8)
                normal_tokens = tokenizer(table, "This is a sample input", padding=True, pad_to_multiple_of=8)
                for key, value in empty_tokens.items():
                    self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
                for key, value in normal_tokens.items():
                    self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                # No padding argument here, so the lengths must stay as they are.
                normal_tokens = tokenizer(table, "This", pad_to_multiple_of=8)
                for key, value in normal_tokens.items():
                    self.assertNotEqual(
                        len(value) % 8,
                        0,
                        f"BatchEncoding.{key} is a multiple of 8 although no padding was requested",
                    )

                # Should also work with truncation
                normal_tokens = tokenizer(table, "This", padding=True, truncation=True, pad_to_multiple_of=8)
                for key, value in normal_tokens.items():
                    self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
|
||||
|
||||
def test_right_and_left_padding(self):
    """Check `encode` padding on both sides and the padding modes that must be no-ops.

    With `padding="max_length"` the pad ids must be appended (right side) or
    prepended (left side). For a single sequence, `padding=True`, "longest",
    no padding argument and `padding=False` must all leave the ids unchanged.
    """
    for tokenizer in self.get_tokenizers(do_lower_case=False):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            table = self.get_table(tokenizer, length=0)
            query = "Sequence"
            pad_count = 10

            # check correct behaviour if no pad_token_id exists and add it eventually
            self._check_no_pad_token_padding(tokenizer, query)

            pad_id = tokenizer.pad_token_id

            # RIGHT then LEFT PADDING - Check that it correctly pads when a maximum length is
            # specified along with the padding flag set to "max_length"
            for side in ("right", "left"):
                tokenizer.padding_side = side
                unpadded = tokenizer.encode(table, query)
                target_length = len(unpadded) + pad_count
                padded = tokenizer.encode(table, query, max_length=target_length, padding="max_length")

                pad_block = [pad_id] * pad_count
                expected = unpadded + pad_block if side == "right" else pad_block + unpadded
                self.assertEqual(target_length, len(padded))
                self.assertListEqual(expected, padded)

            # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding'
            unpadded = tokenizer.encode(table, query)
            no_op_calls = (
                ("right", {"padding": True}),
                ("left", {"padding": "longest"}),
                ("right", {}),
                ("left", {"padding": False}),
            )
            for side, padding_kwargs in no_op_calls:
                tokenizer.padding_side = side
                result = tokenizer.encode(table, query, **padding_kwargs)
                self.assertEqual(len(unpadded), len(result))
                self.assertListEqual(unpadded, result)
|
||||
|
||||
def test_encode_plus_with_padding(self):
    """Check `encode_plus` padding of a table/query pair for every tokenizer under test.

    The pair is first encoded without any padding argument to obtain a reference.
    It is then re-encoded with ``padding=True`` ('longest') and ``padding=False``,
    which must both leave the reference untouched, and with
    ``padding="max_length"`` on the right and on the left, which must add exactly
    ``padding_size`` padding entries on the requested side of the input ids,
    special tokens mask, token type ids and attention mask.
    """
    tokenizers = self.get_tokenizers(do_lower_case=False)
    for tokenizer in tokenizers:
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            table = self.get_table(tokenizer, length=0)
            sequence = "Sequence"

            # check correct behaviour if no pad_token_id exists and add it eventually
            self._check_no_pad_token_padding(tokenizer, sequence)

            padding_size = 10
            padding_idx = tokenizer.pad_token_id
            token_type_padding_idx = tokenizer.pad_token_type_id

            encoded_sequence = tokenizer.encode_plus(table, sequence, return_special_tokens_mask=True)
            input_ids = encoded_sequence["input_ids"]
            special_tokens_mask = encoded_sequence["special_tokens_mask"]
            sequence_length = len(input_ids)

            # Test 'longest' and 'no_padding' don't do anything
            tokenizer.padding_side = "right"

            # `padding=True` is the 'longest' strategy and `padding=False` is 'no_padding';
            # with a single example neither of them may change the encoding.
            for padding in (True, False):
                not_padded_sequence = tokenizer.encode_plus(
                    table,
                    sequence,
                    padding=padding,
                    return_special_tokens_mask=True,
                )
                not_padded_input_ids = not_padded_sequence["input_ids"]

                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                not_padded_sequence_length = len(not_padded_input_ids)

                self.assertEqual(sequence_length, not_padded_sequence_length)
                self.assertListEqual(input_ids, not_padded_input_ids)
                self.assertListEqual(special_tokens_mask, not_padded_special_tokens_mask)

            # Test right padding
            tokenizer.padding_side = "right"

            right_padded_sequence = tokenizer.encode_plus(
                table,
                sequence,
                max_length=sequence_length + padding_size,
                padding="max_length",
                return_special_tokens_mask=True,
            )
            right_padded_input_ids = right_padded_sequence["input_ids"]

            right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
            right_padded_sequence_length = len(right_padded_input_ids)

            self.assertEqual(sequence_length + padding_size, right_padded_sequence_length)
            self.assertListEqual(input_ids + [padding_idx] * padding_size, right_padded_input_ids)
            # padding positions are flagged as special tokens (mask value 1)
            self.assertListEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask)

            # Test left padding
            tokenizer.padding_side = "left"
            left_padded_sequence = tokenizer.encode_plus(
                table,
                sequence,
                max_length=sequence_length + padding_size,
                padding="max_length",
                return_special_tokens_mask=True,
            )
            left_padded_input_ids = left_padded_sequence["input_ids"]
            left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
            left_padded_sequence_length = len(left_padded_input_ids)

            self.assertEqual(sequence_length + padding_size, left_padded_sequence_length)
            self.assertListEqual([padding_idx] * padding_size + input_ids, left_padded_input_ids)
            self.assertListEqual([1] * padding_size + special_tokens_mask, left_padded_special_tokens_mask)

            if "token_type_ids" in tokenizer.model_input_names:
                token_type_ids = encoded_sequence["token_type_ids"]
                left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
                right_padded_token_type_ids = right_padded_sequence["token_type_ids"]

                # NOTE(review): each padding entry is a row of 7 token type ids - presumably
                # inherited from a tokenizer with 7 token type channels; confirm it fits this one.
                token_type_padding = [[token_type_padding_idx] * 7] * padding_size

                # Both lists are passed as separate arguments; wrapping them in one tuple
                # would make assertListEqual fail with a TypeError instead of comparing.
                self.assertListEqual(token_type_ids + token_type_padding, right_padded_token_type_ids)
                self.assertListEqual(token_type_padding + token_type_ids, left_padded_token_type_ids)

            if "attention_mask" in tokenizer.model_input_names:
                attention_mask = encoded_sequence["attention_mask"]
                right_padded_attention_mask = right_padded_sequence["attention_mask"]
                left_padded_attention_mask = left_padded_sequence["attention_mask"]

                # padding positions must not be attended to (mask value 0)
                self.assertListEqual(attention_mask + [0] * padding_size, right_padded_attention_mask)
                self.assertListEqual([0] * padding_size + attention_mask, left_padded_attention_mask)
|
||||
|
||||
def test_batch_encode_plus_padding(self):
    """Check that `batch_encode_plus` and `encode_plus` pad to `max_length` identically.

    The comparison is run twice over freshly created tokenizers: once with their
    default padding side and once after switching `padding_side` to "left".
    """
    # Test that padded sequences are equivalent between batch_encode_plus and encode_plus

    # Right padding tests
    tokenizers = self.get_tokenizers(do_lower_case=False)
    for tokenizer in tokenizers:
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            table = self.get_table(tokenizer, length=0)
            sequences = [
                "Testing batch encode plus",
                "Testing batch encode plus with different sequence lengths",
                "Testing batch encode plus with different sequence lengths correctly pads",
            ]

            max_length = 100

            # check correct behaviour if no pad_token_id exists and add it eventually
            self._check_no_pad_token_padding(tokenizer, sequences)

            encoded_sequences = [
                tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length")
                for sequence in sequences
            ]
            encoded_sequences_batch = tokenizer.batch_encode_plus(
                table, sequences, max_length=max_length, padding="max_length"
            )
            self.assertListEqual(
                encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
            )

    # Left padding tests
    tokenizers = self.get_tokenizers(do_lower_case=False)
    for tokenizer in tokenizers:
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            tokenizer.padding_side = "left"
            # Build the table for this tokenizer instead of relying on the value left
            # over from the last iteration of the right-padding loop.
            table = self.get_table(tokenizer, length=0)
            sequences = [
                "Testing batch encode plus",
                "Testing batch encode plus with different sequence lengths",
                "Testing batch encode plus with different sequence lengths correctly pads",
            ]

            max_length = 100

            # check correct behaviour if no pad_token_id exists and add it eventually
            self._check_no_pad_token_padding(tokenizer, sequences)

            encoded_sequences = [
                tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length")
                for sequence in sequences
            ]
            encoded_sequences_batch = tokenizer.batch_encode_plus(
                table, sequences, max_length=max_length, padding="max_length"
            )
            self.assertListEqual(
                encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
            )
|
||||
|
||||
def test_batch_encode_plus_batch_sequence_length(self):
    """Compare `batch_encode_plus` with per-sequence `encode_plus`, unpadded and padded.

    Also checks that passing a larger `max_length` changes nothing when the
    padding strategy is 'longest' or when padding is disabled.
    """
    # Tests that all encoded values have the correct size
    for tokenizer in self.get_tokenizers(do_lower_case=False):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            table = self.get_table(tokenizer, length=0)
            sequences = [
                "Testing batch encode plus",
                "Testing batch encode plus with different sequence lengths",
                "Testing batch encode plus with different sequence lengths correctly pads",
            ]

            single_encodings = [tokenizer.encode_plus(table, text) for text in sequences]
            batch_encoding = tokenizer.batch_encode_plus(table, sequences, padding=False)
            self.assertListEqual(
                single_encodings, self.convert_batch_encode_plus_format_to_encode_plus(batch_encoding)
            )

            # length of the longest unpadded encoding
            maximum_length = max(len(encoding["input_ids"]) for encoding in single_encodings)

            # check correct behaviour if no pad_token_id exists and add it eventually
            self._check_no_pad_token_padding(tokenizer, sequences)

            single_encodings_padded = [
                tokenizer.encode_plus(table, text, max_length=maximum_length, padding="max_length")
                for text in sequences
            ]

            batch_encoding_padded = tokenizer.batch_encode_plus(table, sequences, padding=True)
            self.assertListEqual(
                single_encodings_padded,
                self.convert_batch_encode_plus_format_to_encode_plus(batch_encoding_padded),
            )

            # check 'longest' is insensitive to a max length
            without_max_length = tokenizer.batch_encode_plus(table, sequences, padding=True)
            with_max_length = tokenizer.batch_encode_plus(
                table, sequences, max_length=maximum_length + 10, padding="longest"
            )
            for key in without_max_length.keys():
                self.assertListEqual(without_max_length[key], with_max_length[key])

            # check 'no_padding' is insensitive to a max length
            without_max_length = tokenizer.batch_encode_plus(table, sequences, padding=False)
            with_max_length = tokenizer.batch_encode_plus(
                table, sequences, max_length=maximum_length + 10, padding=False
            )
            for key in without_max_length.keys():
                self.assertListEqual(without_max_length[key], with_max_length[key])
|
||||
|
||||
def test_special_tokens_mask_input_pairs(self):
    """Check that the special tokens mask of a table/query pair flags exactly the special tokens.

    The ids kept after dropping every position flagged in the mask must equal the
    concatenation of the query encoded alone (with an empty table) and the table
    encoded alone (with an empty query), both without special tokens.
    """
    tokenizers = self.get_tokenizers(do_lower_case=False)
    for tokenizer in tokenizers:
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            sequence_0 = "Encode this."
            empty_table = self.get_table(tokenizer, length=0)
            table = self.get_table(tokenizer, length=10)
            encoded_sequence = tokenizer.encode(empty_table, sequence_0, add_special_tokens=False)
            number_of_tokens = len(encoded_sequence)
            encoded_sequence += tokenizer.encode(table, "", add_special_tokens=False)
            encoded_sequence_dict = tokenizer.encode_plus(
                table,
                sequence_0,
                add_special_tokens=True,
                return_special_tokens_mask=True,
            )
            encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
            special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
            self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

            # Keep positions aligned with the input ids for now: special tokens become None
            # so that the index-based deletion below still points at the right entry.
            filtered_sequence = [
                (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
            ]
            # NOTE: as TAPEX adds a space between a table and a sequence, we need to remove it
            # in order to have equivalent results with encoding an empty table or empty sequence
            del filtered_sequence[number_of_tokens + 1]
            filtered_sequence = [x for x in filtered_sequence if x is not None]
            # No debugging prints here: on failure assertEqual already reports both lists.
            self.assertEqual(encoded_sequence, filtered_sequence)
|
||||
|
||||
@slow
def test_full_tokenizer(self):
    """Encode a small table and a question with a pretrained tokenizer and compare the input ids."""
    header = ["Year", "City", "Country", "Nations"]
    rows = [
        [1896, "Athens", "Greece", 14],
        [1900, "Paris", "France", 24],
        [1904, "St. Louis", "USA", 12],
        [2004, "Athens", "Greece", 201],
        [2008, "Beijing", "China", 204],
        [2012, "London", "UK", 204],
    ]
    table_dict = {"header": header, "rows": rows}
    question = "Greece held its last Summer Olympics in 2004"

    table = pd.DataFrame.from_dict(table_dict["rows"])
    table.columns = table_dict["header"]

    tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
    encoding = tokenizer(table, question)

    # fmt: off
    expected_input_ids = [0, 821, 5314, 1755, 547, 63, 94, 1035, 1021, 31434, 2857, 11, 4482, 11311, 4832, 76, 1721, 343, 1721, 247, 1721, 3949, 3236, 112, 4832, 42773, 1721, 23, 27859, 1721, 821, 5314, 1755, 1721, 501, 3236, 132, 4832, 23137, 1721, 2242, 354, 1721, 6664, 2389, 1721, 706, 3236, 155, 4832, 42224, 1721, 1690, 4, 26120, 354, 1721, 201, 102, 1721, 316, 3236, 204, 4832, 4482, 1721, 23, 27859, 1721, 821, 5314, 1755, 1721, 21458, 3236, 195, 4832, 2266, 1721, 28, 40049, 1721, 1855, 1243, 1721, 28325, 3236, 231, 4832, 1125, 1721, 784, 24639, 1721, 1717, 330, 1721, 28325, 2]
    # fmt: on

    self.assertListEqual(encoding.input_ids, expected_input_ids)
|
||||
|
||||
def test_tokenizer_as_target(self):
    """Encode an answer string through the `answer` keyword and compare the resulting input ids."""
    # by default the tokenizer do_lower_case
    tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
    expected_ids = [0, 90, 5776, 1178, 16, 10, 205, 1421, 328, 2]

    encoded_answer = tokenizer(answer="tapex is a good model!")

    self.assertListEqual(encoded_answer.input_ids, expected_ids)
|
||||
|
||||
@slow
def test_tokenizer_lower_case(self):
    """Compare tokenizers loaded with `do_lower_case=False` and `do_lower_case=True`.

    Mixed-case text must produce different ids for the two tokenizers, while the
    cased tokenizer fed already-lowercased text must match the uncased tokenizer
    fed the original text. Covers single answers, batched answers and a
    table/query input.
    """
    checkpoint = "microsoft/tapex-base"
    cased_tokenizer = TapexTokenizer.from_pretrained(checkpoint, do_lower_case=False)
    uncased_tokenizer = TapexTokenizer.from_pretrained(checkpoint, do_lower_case=True)
    answer_text = "Beijing, London, Paris"
    answer_text_lower = "beijing, london, paris"

    # single answer
    cased_ids = cased_tokenizer(answer=answer_text).input_ids
    uncased_ids = uncased_tokenizer(answer=answer_text).input_ids
    self.assertNotEqual(cased_ids, uncased_ids)

    cased_ids_from_lower = cased_tokenizer(answer=answer_text_lower).input_ids
    uncased_ids = uncased_tokenizer(answer=answer_text).input_ids
    self.assertEqual(cased_ids_from_lower, uncased_ids)

    # batched encoding assert
    cased_batch_ids = cased_tokenizer(answer=[answer_text]).input_ids
    uncased_batch_ids = uncased_tokenizer(answer=[answer_text]).input_ids
    self.assertNotEqual(cased_batch_ids, uncased_batch_ids)

    cased_batch_ids_from_lower = cased_tokenizer(answer=[answer_text_lower]).input_ids
    uncased_batch_ids = uncased_tokenizer(answer=[answer_text]).input_ids
    self.assertEqual(cased_batch_ids_from_lower, uncased_batch_ids)

    # test input encoding lowercase
    question = "Greece held its last Summer Olympics in 2004"
    header = ["Year", "City", "Country", "Nations"]
    rows = [
        [1896, "Athens", "Greece", 14],
        [1900, "Paris", "France", 24],
        [1904, "St. Louis", "USA", 12],
        [2004, "Athens", "Greece", 201],
        [2008, "Beijing", "China", 204],
        [2012, "London", "UK", 204],
    ]
    table_dict = {"header": header, "rows": rows}
    table = pd.DataFrame.from_dict(table_dict["rows"])
    table.columns = table_dict["header"]

    cased_table_ids = cased_tokenizer(table=table, query=question).input_ids
    uncased_table_ids = uncased_tokenizer(table=table, query=question).input_ids
    self.assertNotEqual(cased_table_ids, uncased_table_ids)
|
||||
@@ -1,276 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Testing suite for the PyTorch TrajectoryTransformer model. """
|
||||
|
||||
|
||||
import inspect
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers import TrajectoryTransformerConfig, is_torch_available
|
||||
from transformers.testing_utils import require_torch, slow, torch_device
|
||||
|
||||
from ...generation.test_utils import GenerationTesterMixin
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, random_attention_mask
|
||||
from ...test_pipeline_mixin import PipelineTesterMixin
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers import TrajectoryTransformerModel
|
||||
from transformers.models.trajectory_transformer.modeling_trajectory_transformer import (
|
||||
TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
)
|
||||
|
||||
|
||||
class TrajectoryTransformerModelTester:
    """Builds configs and random inputs for the TrajectoryTransformer model tests."""

    def __init__(self, parent, batch_size=13, n_embd=128, action_dim=6, observation_dim=17, is_training=True):
        self.parent = parent
        self.batch_size = batch_size
        self.n_embd = n_embd
        self.action_dim = action_dim
        self.observation_dim = observation_dim
        self.is_training = is_training
        # one position per action and observation dimension, plus one extra
        self.seq_length = self.action_dim + self.observation_dim + 1

    def _random_permutations(self):
        """Return a LongTensor on `torch_device` holding one permutation of `range(seq_length)` per batch item."""
        rows = [np.random.permutation(self.seq_length) for _ in range(self.batch_size)]
        return torch.LongTensor(rows).to(torch_device)

    def prepare_config_and_inputs(self):
        """Return `(config, trajectories, attention_mask, targets)` with randomly generated inputs."""
        trajectories = self._random_permutations()
        mask_shape = (self.batch_size, self.seq_length)
        attention_mask = random_attention_mask(mask_shape).to(torch_device)
        targets = self._random_permutations()

        return self.get_config(), trajectories, attention_mask, targets

    def get_config(self):
        """Build a `TrajectoryTransformerConfig` from the tester's dimensions."""
        config_kwargs = {
            "batch_size": self.batch_size,
            "n_embd": self.n_embd,
            "action_dim": self.action_dim,
            "observation_dim": self.observation_dim,
        }
        return TrajectoryTransformerConfig(**config_kwargs)

    def create_and_check_model(self, config, input_dict):
        """Run the model in eval mode and check the shape of its last hidden state."""
        model = TrajectoryTransformerModel(config=config)
        model.to(torch_device)
        model.eval()

        trajectories = input_dict["trajectories"]

        # First forward pass uses the attention mask; its output is not inspected.
        model(trajectories=trajectories, attention_mask=input_dict["attention_mask"])
        result = model(
            trajectories=trajectories,
            output_hidden_states=True,
            output_attentions=True,
            use_cache=True,
            return_dict=True,
        )

        expected_shape = (self.batch_size, self.seq_length, self.n_embd)
        self.parent.assertEqual(result.hidden_states[-1].shape, expected_shape)

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` in the form the common tests expect."""
        config, trajectories, attention_mask, targets = self.prepare_config_and_inputs()
        inputs_dict = {
            "trajectories": trajectories,
            "attention_mask": attention_mask,
            "targets": targets,
        }
        return config, inputs_dict
|
||||
|
||||
|
||||
@require_torch
class TrajectoryTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
    """Model tests for `TrajectoryTransformerModel`, whose main input is `trajectories`."""

    all_model_classes = (TrajectoryTransformerModel,) if is_torch_available() else ()
    pipeline_model_mapping = {"feature-extraction": TrajectoryTransformerModel} if is_torch_available() else {}

    # Disabled: a failing test from GenerationTesterMixin, as the model does not use inputs_ids
    test_generate_without_input_ids = False

    # Disabled: failing tests from ModelTesterMixin, as the model does not implement these features
    test_pruning = False
    test_resize_embeddings = False
    test_head_masking = False
    test_attention_outputs = False
    test_hidden_states_output = False
    test_inputs_embeds = False
    test_model_common_attributes = False
    test_torchscript = False

    def setUp(self):
        self.model_tester = TrajectoryTransformerModelTester(self)
        self.config_tester = ConfigTester(self, config_class=TrajectoryTransformerConfig, n_embd=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        self.model_tester.create_and_check_model(config, inputs_dict)

    def test_conditional_model(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        self.model_tester.create_and_check_model(config, inputs_dict)

    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            # signature.parameters is an OrderedDict => so arg_names order is deterministic
            arg_names = list(inspect.signature(model.forward).parameters.keys())

            self.assertListEqual(arg_names[:1], ["trajectories"])

    # Input is 'trajectories' not 'input_ids'
    def test_model_main_input_name(self):
        forward_parameters = inspect.signature(TrajectoryTransformerModel.forward).parameters
        # The main input is the name of the argument after `self`
        observed_main_input_name = list(forward_parameters.keys())[1]
        self.assertEqual(TrajectoryTransformerModel.main_input_name, observed_main_input_name)

    def test_retain_grad_hidden_states_attentions(self):
        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.output_hidden_states = True
        config.output_attentions = self.has_attentions

        model = TrajectoryTransformerModel(config)
        model.to(torch_device)

        outputs = model(
            trajectories=input_dict["trajectories"],
            attention_mask=input_dict["attention_mask"],
            targets=input_dict["targets"],
            output_hidden_states=True,
            output_attentions=True,
            use_cache=True,
            return_dict=True,
        )

        output = outputs[0]

        # Ask autograd to keep gradients on these intermediate tensors.
        hidden_states = outputs.hidden_states[0]
        hidden_states.retain_grad()

        if self.has_attentions:
            attentions = outputs.attentions[0]
            attentions.retain_grad()

        output.flatten()[0].backward(retain_graph=True)

        self.assertIsNotNone(hidden_states.grad)

        if self.has_attentions:
            self.assertIsNotNone(attentions.grad)

    def test_training(self):
        if not self.model_tester.is_training:
            return

        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()

        model = TrajectoryTransformerModel(config)
        model.to(torch_device)
        model.train()
        outputs = model(
            trajectories=input_dict["trajectories"],
            attention_mask=input_dict["attention_mask"],
            targets=input_dict["targets"],
            output_hidden_states=True,
            output_attentions=True,
            use_cache=True,
            return_dict=True,
        )
        outputs.loss.backward()

    def test_training_gradient_checkpointing(self):
        if not self.model_tester.is_training:
            return

        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()

        model = TrajectoryTransformerModel(config)
        model.gradient_checkpointing_enable()
        model.to(torch_device)
        model.train()
        outputs = model(
            trajectories=input_dict["trajectories"],
            attention_mask=input_dict["attention_mask"],
            targets=input_dict["targets"],
            output_hidden_states=True,
            output_attentions=True,
            use_cache=False,
            return_dict=True,
        )
        outputs.loss.backward()

    def test_initialization(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        configs_no_init = _config_zero_init(config)
        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            for name, param in model.named_parameters():
                if not param.requires_grad:
                    continue
                # mean rounded to 9 decimals must be exactly 0.0 or 1.0
                rounded_mean = ((param.data.mean() * 1e9).round() / 1e9).item()
                self.assertIn(
                    rounded_mean,
                    [0.0, 1.0],
                    msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                )

    @slow
    def test_model_from_pretrained(self):
        # only the first checkpoint of the archive list is loaded
        for model_name in TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = TrajectoryTransformerModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
|
||||
|
||||
|
||||
@require_torch
class TrajectoryTransformerModelIntegrationTest(unittest.TestCase):
    """Integration test running a pretrained TrajectoryTransformer checkpoint."""

    @slow
    def test_prediction(self):
        """Compare the logits shape and a 3x3 slice of the logits with reference values."""
        checkpoint = "CarlCochet/trajectory-transformer-halfcheetah-medium-v2"
        batch_size = 1

        config = TrajectoryTransformerConfig.from_pretrained(checkpoint)
        model = TrajectoryTransformerModel.from_pretrained(checkpoint, config=config)
        model.to(torch_device)
        model.eval()

        seq_length = model.config.action_dim + model.config.observation_dim + 1

        trajectory = [3, 19, 20, 22, 9, 7, 23, 10, 18, 14, 13, 4, 17, 11, 5, 6, 15, 21, 2, 8, 1, 0, 12, 16]
        trajectories = torch.LongTensor([trajectory]).to(torch_device)
        outputs = model(
            trajectories=trajectories,
            output_hidden_states=True,
            output_attentions=True,
            use_cache=True,
            return_dict=True,
        )

        logits = outputs.logits

        expected_shape = torch.Size((batch_size, seq_length, model.config.vocab_size + 1))
        expected_slice = torch.tensor(
            [[[-0.7193, -0.2532, -0.0898], [1.9429, 2.0434, 2.3975], [-3.3651, -2.8744, -2.4532]]]
        ).to(torch_device)
        logits_slice = logits[:, :3, :3]

        self.assertEqual(logits.shape, expected_shape)
        self.assertTrue(torch.allclose(logits_slice, expected_slice, atol=1e-4))
|
||||
@@ -1,278 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Testing suite for the PyTorch Van model. """
|
||||
|
||||
|
||||
import inspect
|
||||
import math
|
||||
import unittest
|
||||
|
||||
from transformers import VanConfig
|
||||
from transformers.testing_utils import require_scipy, require_torch, require_vision, slow, torch_device
|
||||
from transformers.utils import cached_property, is_scipy_available, is_torch_available, is_vision_available
|
||||
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
|
||||
from ...test_pipeline_mixin import PipelineTesterMixin
|
||||
|
||||
|
||||
if is_scipy_available():
|
||||
from scipy import stats
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from transformers import VanForImageClassification, VanModel
|
||||
from transformers.models.van.modeling_van import VAN_PRETRAINED_MODEL_ARCHIVE_LIST
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
|
||||
class VanModelTester:
    """Builds configs and random inputs for the Van model tests.

    Args:
        parent: test case whose assertion methods are used by the `create_and_check_*` helpers.
        batch_size: number of images per generated batch.
        image_size: height and width of the generated square images.
        num_channels: number of channels of the generated images.
        hidden_sizes: forwarded to `VanConfig`; defaults to `[16, 32, 64, 128]`.
        depths: forwarded to `VanConfig`; defaults to `[1, 1, 1, 1]`.
        is_training: stored on the tester as `is_training`.
        use_labels: whether `prepare_config_and_inputs` generates labels.
        num_labels: number of classes for the generated labels and the config.
        scope: accepted for signature compatibility; not used by this class.
    """

    def __init__(
        self,
        parent,
        batch_size=2,
        image_size=224,
        num_channels=3,
        hidden_sizes=None,
        depths=None,
        is_training=True,
        use_labels=True,
        num_labels=3,
        scope=None,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.image_size = image_size
        self.num_channels = num_channels
        # Fresh lists per instance: list literals used as default arguments would be
        # shared by every tester and could be mutated across tests.
        self.hidden_sizes = [16, 32, 64, 128] if hidden_sizes is None else hidden_sizes
        self.depths = [1, 1, 1, 1] if depths is None else depths
        self.is_training = is_training
        self.use_labels = use_labels
        self.num_labels = num_labels
        self.type_sequence_label_size = num_labels

    def prepare_config_and_inputs(self):
        """Return `(config, pixel_values, labels)`; `labels` is None when `use_labels` is False."""
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])

        labels = None
        if self.use_labels:
            labels = ids_tensor([self.batch_size], self.num_labels)

        config = self.get_config()

        return config, pixel_values, labels

    def get_config(self):
        """Build a `VanConfig` from the tester's attributes."""
        return VanConfig(
            num_channels=self.num_channels,
            hidden_sizes=self.hidden_sizes,
            depths=self.depths,
            num_labels=self.num_labels,
            is_decoder=False,
        )

    def create_and_check_model(self, config, pixel_values, labels):
        """Run `VanModel` in eval mode and check the shape of its last hidden state."""
        model = VanModel(config=config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values)
        # expected last hidden states: B, C, H // 32, W // 32
        self.parent.assertEqual(
            result.last_hidden_state.shape,
            (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32),
        )

    def create_and_check_for_image_classification(self, config, pixel_values, labels):
        """Run `VanForImageClassification` in eval mode and check the shape of its logits."""
        model = VanForImageClassification(config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values, labels=labels)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` with only `pixel_values` in the inputs dict."""
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values, labels = config_and_inputs
        inputs_dict = {"pixel_values": pixel_values}
        return config, inputs_dict
|
||||
|
||||
|
||||
@require_torch
class VanModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    """
    Model tests for Van.

    Some of the tests coming from test_modeling_common.py are overwritten or skipped here, because Van does not use
    input_ids, inputs_embeds, attention_mask and seq_length.
    """

    all_model_classes = () if not is_torch_available() else (VanModel, VanForImageClassification)
    pipeline_model_mapping = (
        {"feature-extraction": VanModel, "image-classification": VanForImageClassification}
        if is_torch_available()
        else {}
    )

    # Feature flags (presumably consumed by the common test mixins -- verify there).
    test_pruning = False
    test_resize_embeddings = False
    test_head_masking = False
    has_attentions = False

    def setUp(self):
        """Create the model tester and the config tester used by the tests below."""
        self.model_tester = VanModelTester(self)
        self.config_tester = ConfigTester(self, config_class=VanConfig, has_text_modality=False, hidden_size=37)

    def test_config(self):
        """Run the individual config checks one after the other."""
        self.create_and_test_config_common_properties()
        tester = self.config_tester
        tester.create_and_test_config_to_json_string()
        tester.create_and_test_config_to_json_file()
        tester.create_and_test_config_from_and_save_pretrained()
        tester.create_and_test_config_with_num_labels()
        tester.check_config_can_be_init_without_params()
        tester.check_config_arguments_init()

    def create_and_test_config_common_properties(self):
        """Intentionally a no-op: nothing is checked here for Van."""
        return

    @unittest.skip(reason="Van does not use inputs_embeds")
    def test_inputs_embeds(self):
        """Skipped, see the decorator's reason."""

    @unittest.skip(reason="Van does not support input and output embeddings")
    def test_model_common_attributes(self):
        """Skipped, see the decorator's reason."""

    def test_forward_signature(self):
        """Check that the first argument of every model's ``forward`` is ``pixel_values``."""
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
        expected_arg_names = ["pixel_values"]

        for model_class in self.all_model_classes:
            forward_params = inspect.signature(model_class(config).forward).parameters
            # the parameters mapping is ordered, so the argument order is deterministic
            leading_arg_names = list(forward_params)[:1]
            self.assertListEqual(leading_arg_names, expected_arg_names)

    def test_model(self):
        """Build config and inputs, then delegate to the model tester's base-model check."""
        self.model_tester.create_and_check_model(*self.model_tester.prepare_config_and_inputs())

    @require_scipy
    def test_initialization(self):
        """Check the initial weights of normalization and convolution layers."""
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
        zero_init_config = _config_zero_init(config)
        norm_layer_types = (nn.BatchNorm2d, nn.GroupNorm, nn.LayerNorm)

        for model_class in self.all_model_classes:
            model = model_class(config=zero_init_config)
            for name, module in model.named_modules():
                if isinstance(module, norm_layer_types):
                    # normalization layers are expected to have weight == 1 and bias == 0
                    for parameter, expected_value in ((module.weight, 1), (module.bias, 0)):
                        self.assertTrue(
                            torch.all(parameter == expected_value),
                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                        )
                elif isinstance(module, nn.Conv2d):
                    kernel_height, kernel_width = module.kernel_size[0], module.kernel_size[1]
                    fan_out = (kernel_height * kernel_width * module.out_channels) // module.groups
                    std = math.sqrt(2.0 / fan_out)
                    # divide by std -> mean = 0, std = 1
                    normalized_weights = module.weight.data.cpu().flatten().numpy() / std
                    anderson_result = stats.anderson(normalized_weights)
                    # NOTE(review): the Anderson-Darling statistic is compared against 0.05 here;
                    # confirm that this threshold is the intended check.
                    self.assertTrue(anderson_result.statistic > 0.05)

    def test_hidden_states_output(self):
        """Check number and spatial size of the hidden states, requested via inputs and via config."""

        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            if config.is_encoder_decoder:
                hidden_states = outputs.encoder_hidden_states
            else:
                hidden_states = outputs.hidden_states

            # van has no embeddings, so there is one hidden state per stage
            self.assertEqual(len(hidden_states), len(self.model_tester.hidden_sizes))

            # Van's feature maps are of shape (batch_size, num_channels, height, width)
            expected_side = self.model_tester.image_size // 4
            self.assertListEqual(list(hidden_states[0].shape[-2:]), [expected_side, expected_side])

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            # first request the hidden states through the forward kwargs ...
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)

            # ... then check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True
            check_hidden_states_output(inputs_dict, config, model_class)

    def test_for_image_classification(self):
        """Build config and inputs, then delegate to the model tester's classification check."""
        self.model_tester.create_and_check_for_image_classification(*self.model_tester.prepare_config_and_inputs())

    @slow
    def test_model_from_pretrained(self):
        """Load the first listed pretrained checkpoint and check that a model is returned."""
        for model_name in VAN_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            self.assertIsNotNone(VanModel.from_pretrained(model_name))
|
||||
|
||||
|
||||
# We will verify our results on an image of cute cats
|
||||
def prepare_img():
    """Open the COCO sample image from the test fixtures and return it."""
    return Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
||||
|
||||
|
||||
@require_torch
@require_vision
class VanModelIntegrationTest(unittest.TestCase):
    """Integration test running the first listed pretrained Van checkpoint on a sample image."""

    @cached_property
    def default_image_processor(self):
        """Image processor loaded from the first entry of ``VAN_PRETRAINED_MODEL_ARCHIVE_LIST``."""
        checkpoint = VAN_PRETRAINED_MODEL_ARCHIVE_LIST[0]
        return AutoImageProcessor.from_pretrained(checkpoint)

    @slow
    def test_inference_image_classification_head(self):
        """Compare the logits' shape and first three values with stored reference numbers."""
        checkpoint = VAN_PRETRAINED_MODEL_ARCHIVE_LIST[0]
        model = VanForImageClassification.from_pretrained(checkpoint).to(torch_device)

        image_processor = self.default_image_processor
        inputs = image_processor(images=prepare_img(), return_tensors="pt").to(torch_device)

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs)

        # verify the logits
        logits = outputs.logits
        self.assertEqual(logits.shape, torch.Size((1, 1000)))

        expected_slice = torch.tensor([0.1029, -0.0904, -0.6365]).to(torch_device)
        is_close = torch.allclose(logits[0, :3], expected_slice, atol=1e-4)
        self.assertTrue(is_close)
|
||||
Reference in New Issue
Block a user