diff --git a/src/transformers/models/splinter/tokenization_splinter.py b/src/transformers/models/splinter/tokenization_splinter.py index 2859497ba8..ffa135556a 100644 --- a/src/transformers/models/splinter/tokenization_splinter.py +++ b/src/transformers/models/splinter/tokenization_splinter.py @@ -137,6 +137,7 @@ class SplinterTokenizer(PreTrainedTokenizer): pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, + question_token=question_token, tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, **kwargs, diff --git a/tests/models/splinter/test_tokenization_splinter.py b/tests/models/splinter/test_tokenization_splinter.py new file mode 100644 index 0000000000..4c6d295e8a --- /dev/null +++ b/tests/models/splinter/test_tokenization_splinter.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +from tests.test_tokenization_common import TokenizerTesterMixin +from transformers import SplinterTokenizerFast, is_tf_available, is_torch_available +from transformers.models.splinter import SplinterTokenizer +from transformers.testing_utils import get_tests_dir, slow + + +SAMPLE_VOCAB = get_tests_dir("fixtures/vocab.txt") + + +if is_torch_available(): + FRAMEWORK = "pt" +elif is_tf_available(): + FRAMEWORK = "tf" +else: + FRAMEWORK = "jax" + + +class SplinterTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = SplinterTokenizer + rust_tokenizer_class = SplinterTokenizerFast + space_between_special_tokens = False + test_rust_tokenizer = False + test_sentencepiece_ignore_case = False + pre_trained_model_path = "tau/splinter-base" + + # Copied from transformers.models.siglip.SiglipTokenizationTest.setUp + def setUp(self): + super().setUp() + tokenizer = SplinterTokenizer(SAMPLE_VOCAB) + tokenizer.vocab["[UNK]"] = len(tokenizer.vocab) + tokenizer.vocab["[QUESTION]"] = len(tokenizer.vocab) + tokenizer.vocab["."] = len(tokenizer.vocab) + tokenizer.add_tokens("this is a test thou shall not determine rigor truly".split()) + tokenizer.save_pretrained(self.tmpdirname) + + def get_tokenizer(self, **kwargs) -> SplinterTokenizer: + return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + + def get_rust_tokenizer(self, **kwargs) -> SplinterTokenizerFast: + return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + + # Copied from transformers.models.siglip.SiglipTokenizationTest.test_get_vocab + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + self.assertEqual(vocab_keys[0], "[PAD]") + self.assertEqual(vocab_keys[1], "[SEP]") + self.assertEqual(vocab_keys[2], "[MASK]") + + # Copied from transformers.models.siglip.SiglipTokenizationTest.test_convert_token_and_id + def test_convert_token_and_id(self): + token = "[PAD]" + token_id = 0 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_question_token_id(self): + tokenizer = self.get_tokenizer() + self.assertEqual(tokenizer.question_token_id, tokenizer.convert_tokens_to_ids(tokenizer.question_token)) + + # Copied from transformers.models.siglip.SiglipTokenizationTest.test_full_tokenizer + def test_full_tokenizer(self): + tokenizer = self.get_tokenizer() + test_str = "This is a test" + + unk_token = tokenizer.unk_token + unk_token_id = tokenizer._convert_token_to_id_with_added_voc(unk_token) + + expected_tokens = test_str.lower().split() + tokenizer.add_tokens(expected_tokens) + tokens = tokenizer.tokenize(test_str) + self.assertListEqual(tokens, expected_tokens) + + # test with out of vocabulary string + tokens = tokenizer.tokenize(test_str + " oov") + self.assertListEqual(tokens, expected_tokens + [unk_token]) + + expected_token_ids = [13, 14, 15, 16, unk_token_id] + token_ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual(token_ids, expected_token_ids) + + tokenizer = self.get_tokenizer(basic_tokenize=False) + expected_token_ids = [13, 14, 15, 16, unk_token_id] + token_ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual(token_ids, expected_token_ids) + + # Copied from transformers.models.siglip.SiglipTokenizationTest.test_rust_and_python_full_tokenizers + def test_rust_and_python_full_tokenizers(self): + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "I need to test this rigor" + tokens = tokenizer.tokenize(sequence, add_special_tokens=False) + rust_tokens = rust_tokenizer.tokenize(sequence, add_special_tokens=False) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + # Copied from transformers.models.siglip.SiglipTokenizationTest.test_max_length + def test_max_length(self): + max_length = 20 + tokenizer = self.get_tokenizer() + texts = ["this is a test", "I have pizza for lunch"] + tokenized = tokenizer( + text_target=texts, + max_length=max_length, + padding="max_length", + truncation=True, + return_tensors=FRAMEWORK, + ) + self.assertEqual(len(tokenized["input_ids"]), len(texts)) + self.assertEqual(len(tokenized["input_ids"][0]), max_length) + self.assertEqual(len(tokenized["input_ids"][1]), max_length) + self.assertEqual(len(tokenized["attention_mask"][0]), max_length) + self.assertEqual(len(tokenized["attention_mask"][1]), max_length) + self.assertEqual(len(tokenized["token_type_ids"][0]), max_length) + self.assertEqual(len(tokenized["token_type_ids"][1]), max_length) + + # Copied from transformers.models.siglip.SiglipTokenizationTest.test_tokenizer_integration + # fmt:skip + @slow + def test_tokenizer_integration(self): + tokenizer = SplinterTokenizer.from_pretrained("tau/splinter-base", max_length=10) + texts = [ + "The cat sat on the windowsill, watching birds in the garden.", + "She baked a delicious cake for her sister's birthday party.", + "The sun set over the horizon, painting the sky with vibrant colors.", + ] + # fmt:off + expected_token_id_list = [ + [101, 1109, 5855, 2068, 1113, 1103, 3751, 7956, 117, 2903, 4939, 1107, 1103, 4605, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1153, 19983, 170, 13108, 10851, 1111, 1123, 2104, 112, 188, 5913, 1710, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1109, 3336, 1383, 1166, 1103, 11385, 117, 3504, 1103, 3901, 1114, 18652, 5769, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ] + # fmt:on + for text, expected_token_ids in zip(texts, expected_token_id_list): + input_ids = tokenizer(text, padding="max_length").input_ids + self.assertListEqual(input_ids, expected_token_ids) + + def test_special_tokens_mask_input_pairs(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequence_0 = "Encode this." + sequence_1 = "This one too please." + encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + encoded_sequence += tokenizer.encode(sequence_1, add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + sequence_0, + sequence_1, + add_special_tokens=True, + return_special_tokens_mask=True, + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + # splinter tokenizer always add cls, question_suffix, and 2 separators + # while in special_token_mask it does not seems to do that + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special) - 2)