From 7223844df9738719ee335428a326cd712f506806 Mon Sep 17 00:00:00 2001 From: SaulLu <55560583+SaulLu@users.noreply.github.com> Date: Mon, 23 Aug 2021 14:35:18 +0200 Subject: [PATCH] Change how "additional_special_tokens" argument in the ".from_pretrained" method of the tokenizer is taken into account (#13056) * add test * add change in PretrainedTokenizerBase * change Luke * deactivate * add the possibility to add additional special tokens for M2M100 * format * add special test for canine * proposed changes for mbart * proposed changes for mbart50 * proposed changes for byt5 * proposed changes for canine * proposed changes for t5 * test fast and slow * remove comment * remove comment * add fast version for all tests * replace break by continue * add more comments * add check to avoid duplicates * remove comment * format * proposed change for wave2vec2 * reverse changes mbart * uncomment * format --- .../models/luke/tokenization_luke.py | 4 +- .../models/m2m_100/tokenization_m2m_100.py | 12 +++- .../models/mbart50/tokenization_mbart50.py | 6 +- .../mbart50/tokenization_mbart50_fast.py | 6 +- src/transformers/tokenization_utils_base.py | 6 ++ tests/test_processor_wav2vec2.py | 5 +- tests/test_tokenization_byt5.py | 70 ++++++++++++++++++- tests/test_tokenization_canine.py | 59 ++++++++++++++++ tests/test_tokenization_common.py | 59 ++++++++++++++++ tests/test_tokenization_t5.py | 69 +++++++++++++++++- 10 files changed, 285 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py index 3fe2665dc5..c5739317f0 100644 --- a/src/transformers/models/luke/tokenization_luke.py +++ b/src/transformers/models/luke/tokenization_luke.py @@ -210,8 +210,8 @@ class LukeTokenizer(RobertaTokenizer): if isinstance(entity_token_2, str) else entity_token_2 ) - kwargs["additional_special_tokens"] = [entity_token_1, entity_token_2] - kwargs["additional_special_tokens"] += kwargs.get("additional_special_tokens", []) + kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) + kwargs["additional_special_tokens"] += [entity_token_1, entity_token_2] super().__init__( vocab_file=vocab_file, diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py index 3e9f921d03..f27d4fbd3f 100644 --- a/src/transformers/models/m2m_100/tokenization_m2m_100.py +++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py @@ -137,6 +137,15 @@ class M2M100Tokenizer(PreTrainedTokenizer): ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in FAIRSEQ_LANGUAGE_CODES} + + kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) + kwargs["additional_special_tokens"] += [ + self.get_lang_token(lang_code) + for lang_code in FAIRSEQ_LANGUAGE_CODES + if self.get_lang_token(lang_code) not in kwargs["additional_special_tokens"] + ] + super().__init__( src_lang=src_lang, tgt_lang=tgt_lang, @@ -157,14 +166,11 @@ class M2M100Tokenizer(PreTrainedTokenizer): self.encoder_size = len(self.encoder) - self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in FAIRSEQ_LANGUAGE_CODES} - self.lang_token_to_id = { self.get_lang_token(lang_code): self.encoder_size + i for i, lang_code in enumerate(FAIRSEQ_LANGUAGE_CODES) } self.lang_code_to_id = {lang_code: self.encoder_size + i for i, lang_code in enumerate(FAIRSEQ_LANGUAGE_CODES)} self.id_to_lang_token = {v: k for k, v in self.lang_token_to_id.items()} - self._additional_special_tokens = list(self.lang_token_to_id.keys()) self._src_lang = src_lang if src_lang is not None else "en" self.tgt_lang = tgt_lang diff --git a/src/transformers/models/mbart50/tokenization_mbart50.py b/src/transformers/models/mbart50/tokenization_mbart50.py index 47d45211d3..15a9551936 100644 --- a/src/transformers/models/mbart50/tokenization_mbart50.py +++ b/src/transformers/models/mbart50/tokenization_mbart50.py @@ -130,6 +130,11 @@ class MBart50Tokenizer(PreTrainedTokenizer): self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) + kwargs["additional_special_tokens"] += [ + code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"] + ] + super().__init__( src_lang=src_lang, tgt_lang=tgt_lang, @@ -168,7 +173,6 @@ class MBart50Tokenizer(PreTrainedTokenizer): self.fairseq_tokens_to_ids.update(self.lang_code_to_id) self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} - self._additional_special_tokens = list(self.lang_code_to_id.keys()) self._src_lang = src_lang if src_lang is not None else "en_XX" self.cur_lang_code_id = self.lang_code_to_id[self._src_lang] diff --git a/src/transformers/models/mbart50/tokenization_mbart50_fast.py b/src/transformers/models/mbart50/tokenization_mbart50_fast.py index f0b0770ce2..01980769e5 100644 --- a/src/transformers/models/mbart50/tokenization_mbart50_fast.py +++ b/src/transformers/models/mbart50/tokenization_mbart50_fast.py @@ -125,6 +125,11 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast): # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) + kwargs["additional_special_tokens"] += [ + code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"] + ] + super().__init__( vocab_file, src_lang=src_lang, @@ -141,7 +146,6 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast): self.vocab_file = vocab_file - self.add_special_tokens({"additional_special_tokens": FAIRSEQ_LANGUAGE_CODES}) self.lang_code_to_id = { lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES } diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 762c966047..5bd7c06540 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1862,6 +1862,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: special_tokens_map = json.load(special_tokens_map_handle) for key, value in special_tokens_map.items(): + if key in kwargs and kwargs[key]: + # This value has already been redefined by the kwargs + # We keep this new value and ignore the one stored in the special_tokens_map_file + + continue + if isinstance(value, dict): value = AddedToken(**value) elif isinstance(value, list): diff --git a/tests/test_processor_wav2vec2.py b/tests/test_processor_wav2vec2.py index 7d30b06934..924d64bee9 100644 --- a/tests/test_processor_wav2vec2.py +++ b/tests/test_processor_wav2vec2.py @@ -53,8 +53,9 @@ class Wav2Vec2ProcessorTest(unittest.TestCase): with open(self.feature_extraction_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(feature_extractor_map) + "\n") - def get_tokenizer(self, **kwargs): - kwargs.update(self.add_kwargs_tokens_map) + def get_tokenizer(self, **kwargs_init): + kwargs = self.add_kwargs_tokens_map.copy() + kwargs.update(kwargs_init) return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_feature_extractor(self, **kwargs): diff --git a/tests/test_tokenization_byt5.py b/tests/test_tokenization_byt5.py index 79c2f0005c..d8d4795917 100644 --- a/tests/test_tokenization_byt5.py +++ b/tests/test_tokenization_byt5.py @@ -13,11 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json +import os import shutil import tempfile import unittest -from transformers import BatchEncoding, ByT5Tokenizer +from transformers import AddedToken, BatchEncoding, ByT5Tokenizer from transformers.file_utils import cached_property, is_tf_available, is_torch_available from .test_tokenization_common import TokenizerTesterMixin @@ -161,6 +163,72 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): shutil.rmtree(tmpdirname) + # There is a conflict between the default value of extra_ids and adding a new special token through additional_special_tokens + # We need to add the extra_ids in the list of the arg additional_special_tokens + def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self): + tokenizer_list = [] + if self.test_slow_tokenizer: + tokenizer_list.append((self.tokenizer_class, self.get_tokenizer())) + + if self.test_rust_tokenizer: + tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer())) + + for tokenizer_class, tokenizer_utils in tokenizer_list: + with tempfile.TemporaryDirectory() as tmp_dir: + tokenizer_utils.save_pretrained(tmp_dir) + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file: + special_tokens_map = json.load(json_file) + + with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file: + tokenizer_config = json.load(json_file) + + added_tokens_extra_ids = [f"" for i in range(125)] + + special_tokens_map["additional_special_tokens"] = added_tokens_extra_ids + [ + "an_additional_special_token" + ] + tokenizer_config["additional_special_tokens"] = added_tokens_extra_ids + [ + "an_additional_special_token" + ] + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile: + json.dump(special_tokens_map, outfile) + with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile: + json.dump(tokenizer_config, outfile) + + # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes + # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and + # "special_tokens_map.json" files + tokenizer_without_change_in_init = tokenizer_class.from_pretrained( + tmp_dir, + ) + self.assertIn( + "an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens + ) + # self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab + self.assertEqual( + ["an_additional_special_token"], + tokenizer_without_change_in_init.convert_ids_to_tokens( + tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"]) + ), + ) + + # Now we test that we can change the value of additional_special_tokens in the from_pretrained + new_added_tokens = added_tokens_extra_ids + [AddedToken("a_new_additional_special_token", lstrip=True)] + tokenizer = tokenizer_class.from_pretrained( + tmp_dir, + additional_special_tokens=new_added_tokens, + ) + + self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens) + self.assertEqual( + ["a_new_additional_special_token"], + tokenizer.convert_ids_to_tokens( + tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"]) + ), + ) + # tokenizer can be instantiated without any pretrained files, so no need for pretrained tokenizer list def test_pretrained_model_lists(self): pass diff --git a/tests/test_tokenization_canine.py b/tests/test_tokenization_canine.py index 9f95b75f62..a0c5a09727 100644 --- a/tests/test_tokenization_canine.py +++ b/tests/test_tokenization_canine.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json +import os import shutil import tempfile import unittest @@ -175,6 +177,63 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer.save_pretrained(tmp_dir_name) tokenizer.from_pretrained(tmp_dir_name) + def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self): + tokenizer_list = [] + if self.test_slow_tokenizer: + tokenizer_list.append((self.tokenizer_class, self.get_tokenizer())) + + if self.test_rust_tokenizer: + tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer())) + + for tokenizer_class, tokenizer_utils in tokenizer_list: + with tempfile.TemporaryDirectory() as tmp_dir: + tokenizer_utils.save_pretrained(tmp_dir) + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file: + special_tokens_map = json.load(json_file) + + with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file: + tokenizer_config = json.load(json_file) + + # a special token for Canine can be defined as follows: + NEW_TOKEN = 0xE006 + new_token_1 = chr(NEW_TOKEN) + + special_tokens_map["additional_special_tokens"] = [new_token_1] + tokenizer_config["additional_special_tokens"] = [new_token_1] + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile: + json.dump(special_tokens_map, outfile) + with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile: + json.dump(tokenizer_config, outfile) + + # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes + # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and + # "special_tokens_map.json" files + tokenizer_without_change_in_init = tokenizer_class.from_pretrained(tmp_dir, extra_ids=0) + self.assertIn(new_token_1, tokenizer_without_change_in_init.additional_special_tokens) + # self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab + self.assertEqual( + [new_token_1], + tokenizer_without_change_in_init.convert_ids_to_tokens( + tokenizer_without_change_in_init.convert_tokens_to_ids([new_token_1]) + ), + ) + + NEW_TOKEN = 0xE007 + new_token_2 = chr(NEW_TOKEN) + # Now we test that we can change the value of additional_special_tokens in the from_pretrained + new_added_tokens = [AddedToken(new_token_2, lstrip=True)] + tokenizer = tokenizer_class.from_pretrained( + tmp_dir, additional_special_tokens=new_added_tokens, extra_ids=0 + ) + + self.assertIn(new_token_2, tokenizer.additional_special_tokens) + # self.assertIn(new_token_2,tokenizer.get_vocab()) # ByT5Tokenization no vocab + self.assertEqual( + [new_token_2], tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids([new_token_2])) + ) + @require_tokenizers def test_encode_decode_with_spaces(self): tokenizers = self.get_tokenizers(do_lower_case=False) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 7e9dd887ee..443fc402cf 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -16,6 +16,7 @@ import inspect import itertools +import json import os import pickle import re @@ -3161,6 +3162,64 @@ class TokenizerTesterMixin: self.assertTrue(special_token_id in p_output) self.assertTrue(special_token_id in cr_output) + def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self): + tokenizer_list = [] + if self.test_slow_tokenizer: + tokenizer_list.append((self.tokenizer_class, self.get_tokenizer())) + + if self.test_rust_tokenizer: + tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer())) + + for tokenizer_class, tokenizer_utils in tokenizer_list: + with tempfile.TemporaryDirectory() as tmp_dir: + tokenizer_utils.save_pretrained(tmp_dir) + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file: + special_tokens_map = json.load(json_file) + + with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file: + tokenizer_config = json.load(json_file) + + special_tokens_map["additional_special_tokens"] = ["an_additional_special_token"] + tokenizer_config["additional_special_tokens"] = ["an_additional_special_token"] + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile: + json.dump(special_tokens_map, outfile) + with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile: + json.dump(tokenizer_config, outfile) + + # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes + # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and + # "special_tokens_map.json" files + tokenizer_without_change_in_init = tokenizer_class.from_pretrained( + tmp_dir, + ) + self.assertIn( + "an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens + ) + self.assertIn("an_additional_special_token", tokenizer_without_change_in_init.get_vocab()) + self.assertEqual( + ["an_additional_special_token"], + tokenizer_without_change_in_init.convert_ids_to_tokens( + tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"]) + ), + ) + + # Now we test that we can change the value of additional_special_tokens in the from_pretrained + new_added_tokens = [AddedToken("a_new_additional_special_token", lstrip=True)] + tokenizer = tokenizer_class.from_pretrained( + tmp_dir, + additional_special_tokens=new_added_tokens, + ) + + self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens) + self.assertEqual( + ["a_new_additional_special_token"], + tokenizer.convert_ids_to_tokens( + tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"]) + ), + ) + def test_training_new_tokenizer(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: diff --git a/tests/test_tokenization_t5.py b/tests/test_tokenization_t5.py index 89557387b6..4025801bf2 100644 --- a/tests/test_tokenization_t5.py +++ b/tests/test_tokenization_t5.py @@ -12,7 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import json +import os +import tempfile import unittest from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast @@ -294,6 +296,71 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_token_id in r_output) self.assertTrue(special_token_id in cr_output) + def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self): + tokenizer_list = [] + if self.test_slow_tokenizer: + tokenizer_list.append((self.tokenizer_class, self.get_tokenizer())) + + if self.test_rust_tokenizer: + tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer())) + + for tokenizer_class, tokenizer_utils in tokenizer_list: + + with tempfile.TemporaryDirectory() as tmp_dir: + tokenizer_utils.save_pretrained(tmp_dir) + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file: + special_tokens_map = json.load(json_file) + + with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file: + tokenizer_config = json.load(json_file) + + added_tokens_extra_ids = [f"" for i in range(100)] + + special_tokens_map["additional_special_tokens"] = added_tokens_extra_ids + [ + "an_additional_special_token" + ] + tokenizer_config["additional_special_tokens"] = added_tokens_extra_ids + [ + "an_additional_special_token" + ] + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile: + json.dump(special_tokens_map, outfile) + with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile: + json.dump(tokenizer_config, outfile) + + # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes + # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and + # "special_tokens_map.json" files + tokenizer_without_change_in_init = tokenizer_class.from_pretrained( + tmp_dir, + ) + self.assertIn( + "an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens + ) + # self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab + self.assertEqual( + ["an_additional_special_token"], + tokenizer_without_change_in_init.convert_ids_to_tokens( + tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"]) + ), + ) + + # Now we test that we can change the value of additional_special_tokens in the from_pretrained + new_added_tokens = added_tokens_extra_ids + [AddedToken("a_new_additional_special_token", lstrip=True)] + tokenizer = tokenizer_class.from_pretrained( + tmp_dir, + additional_special_tokens=new_added_tokens, + ) + + self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens) + self.assertEqual( + ["a_new_additional_special_token"], + tokenizer.convert_ids_to_tokens( + tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"]) + ), + ) + @slow def test_tokenizer_integration(self): # fmt: off