Change how "additional_special_tokens" argument in the ".from_pretrained" method of the tokenizer is taken into account (#13056)
* add test * add change in PretrainedTokenizerBase * change Luke * deactivate * add the possibility to add additional special tokens for M2M100 * format * add special test for canine * proposed changes for mbart * proposed changes for mbart50 * proposed changes for byt5 * proposed changes for canine * proposed changes for t5 * test fast and slow * remove comment * remove comment * add fast version for all tests * replace break by continue * add more comments * add check to avoid duplicates * remove comment * format * proposed change for wave2vec2 * reverse changes mbart * uncomment * format
This commit is contained in:
@@ -210,8 +210,8 @@ class LukeTokenizer(RobertaTokenizer):
|
||||
if isinstance(entity_token_2, str)
|
||||
else entity_token_2
|
||||
)
|
||||
kwargs["additional_special_tokens"] = [entity_token_1, entity_token_2]
|
||||
kwargs["additional_special_tokens"] += kwargs.get("additional_special_tokens", [])
|
||||
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
|
||||
kwargs["additional_special_tokens"] += [entity_token_1, entity_token_2]
|
||||
|
||||
super().__init__(
|
||||
vocab_file=vocab_file,
|
||||
|
||||
@@ -137,6 +137,15 @@ class M2M100Tokenizer(PreTrainedTokenizer):
|
||||
) -> None:
|
||||
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
|
||||
self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in FAIRSEQ_LANGUAGE_CODES}
|
||||
|
||||
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
|
||||
kwargs["additional_special_tokens"] += [
|
||||
self.get_lang_token(lang_code)
|
||||
for lang_code in FAIRSEQ_LANGUAGE_CODES
|
||||
if self.get_lang_token(lang_code) not in kwargs["additional_special_tokens"]
|
||||
]
|
||||
|
||||
super().__init__(
|
||||
src_lang=src_lang,
|
||||
tgt_lang=tgt_lang,
|
||||
@@ -157,14 +166,11 @@ class M2M100Tokenizer(PreTrainedTokenizer):
|
||||
|
||||
self.encoder_size = len(self.encoder)
|
||||
|
||||
self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in FAIRSEQ_LANGUAGE_CODES}
|
||||
|
||||
self.lang_token_to_id = {
|
||||
self.get_lang_token(lang_code): self.encoder_size + i for i, lang_code in enumerate(FAIRSEQ_LANGUAGE_CODES)
|
||||
}
|
||||
self.lang_code_to_id = {lang_code: self.encoder_size + i for i, lang_code in enumerate(FAIRSEQ_LANGUAGE_CODES)}
|
||||
self.id_to_lang_token = {v: k for k, v in self.lang_token_to_id.items()}
|
||||
self._additional_special_tokens = list(self.lang_token_to_id.keys())
|
||||
|
||||
self._src_lang = src_lang if src_lang is not None else "en"
|
||||
self.tgt_lang = tgt_lang
|
||||
|
||||
@@ -130,6 +130,11 @@ class MBart50Tokenizer(PreTrainedTokenizer):
|
||||
|
||||
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
|
||||
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
|
||||
kwargs["additional_special_tokens"] += [
|
||||
code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
|
||||
]
|
||||
|
||||
super().__init__(
|
||||
src_lang=src_lang,
|
||||
tgt_lang=tgt_lang,
|
||||
@@ -168,7 +173,6 @@ class MBart50Tokenizer(PreTrainedTokenizer):
|
||||
|
||||
self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
|
||||
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
|
||||
self._additional_special_tokens = list(self.lang_code_to_id.keys())
|
||||
|
||||
self._src_lang = src_lang if src_lang is not None else "en_XX"
|
||||
self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
|
||||
|
||||
@@ -125,6 +125,11 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
|
||||
# Mask token behave like a normal word, i.e. include the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
|
||||
kwargs["additional_special_tokens"] += [
|
||||
code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
|
||||
]
|
||||
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
src_lang=src_lang,
|
||||
@@ -141,7 +146,6 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
|
||||
|
||||
self.vocab_file = vocab_file
|
||||
|
||||
self.add_special_tokens({"additional_special_tokens": FAIRSEQ_LANGUAGE_CODES})
|
||||
self.lang_code_to_id = {
|
||||
lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
|
||||
}
|
||||
|
||||
@@ -1862,6 +1862,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
|
||||
special_tokens_map = json.load(special_tokens_map_handle)
|
||||
for key, value in special_tokens_map.items():
|
||||
if key in kwargs and kwargs[key]:
|
||||
# This value has already been redefined by the kwargs
|
||||
# We keep this new value and ignore the one stored in the special_tokens_map_file
|
||||
|
||||
continue
|
||||
|
||||
if isinstance(value, dict):
|
||||
value = AddedToken(**value)
|
||||
elif isinstance(value, list):
|
||||
|
||||
@@ -53,8 +53,9 @@ class Wav2Vec2ProcessorTest(unittest.TestCase):
|
||||
with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
|
||||
fp.write(json.dumps(feature_extractor_map) + "\n")
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
kwargs.update(self.add_kwargs_tokens_map)
|
||||
def get_tokenizer(self, **kwargs_init):
|
||||
kwargs = self.add_kwargs_tokens_map.copy()
|
||||
kwargs.update(kwargs_init)
|
||||
return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_feature_extractor(self, **kwargs):
|
||||
|
||||
@@ -13,11 +13,13 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import BatchEncoding, ByT5Tokenizer
|
||||
from transformers import AddedToken, BatchEncoding, ByT5Tokenizer
|
||||
from transformers.file_utils import cached_property, is_tf_available, is_torch_available
|
||||
|
||||
from .test_tokenization_common import TokenizerTesterMixin
|
||||
@@ -161,6 +163,72 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
shutil.rmtree(tmpdirname)
|
||||
|
||||
# There is a conflict between the default value of extra_ids and adding a new special token through additional_special_tokens
|
||||
# We need to add the extra_ids in the list of the arg additional_special_tokens
|
||||
def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
|
||||
tokenizer_list = []
|
||||
if self.test_slow_tokenizer:
|
||||
tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
|
||||
|
||||
if self.test_rust_tokenizer:
|
||||
tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
|
||||
|
||||
for tokenizer_class, tokenizer_utils in tokenizer_list:
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
tokenizer_utils.save_pretrained(tmp_dir)
|
||||
|
||||
with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
|
||||
special_tokens_map = json.load(json_file)
|
||||
|
||||
with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
|
||||
tokenizer_config = json.load(json_file)
|
||||
|
||||
added_tokens_extra_ids = [f"<extra_id_{i}>" for i in range(125)]
|
||||
|
||||
special_tokens_map["additional_special_tokens"] = added_tokens_extra_ids + [
|
||||
"an_additional_special_token"
|
||||
]
|
||||
tokenizer_config["additional_special_tokens"] = added_tokens_extra_ids + [
|
||||
"an_additional_special_token"
|
||||
]
|
||||
|
||||
with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
|
||||
json.dump(special_tokens_map, outfile)
|
||||
with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
|
||||
json.dump(tokenizer_config, outfile)
|
||||
|
||||
# the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
|
||||
# into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
|
||||
# "special_tokens_map.json" files
|
||||
tokenizer_without_change_in_init = tokenizer_class.from_pretrained(
|
||||
tmp_dir,
|
||||
)
|
||||
self.assertIn(
|
||||
"an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens
|
||||
)
|
||||
# self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab
|
||||
self.assertEqual(
|
||||
["an_additional_special_token"],
|
||||
tokenizer_without_change_in_init.convert_ids_to_tokens(
|
||||
tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"])
|
||||
),
|
||||
)
|
||||
|
||||
# Now we test that we can change the value of additional_special_tokens in the from_pretrained
|
||||
new_added_tokens = added_tokens_extra_ids + [AddedToken("a_new_additional_special_token", lstrip=True)]
|
||||
tokenizer = tokenizer_class.from_pretrained(
|
||||
tmp_dir,
|
||||
additional_special_tokens=new_added_tokens,
|
||||
)
|
||||
|
||||
self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens)
|
||||
self.assertEqual(
|
||||
["a_new_additional_special_token"],
|
||||
tokenizer.convert_ids_to_tokens(
|
||||
tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"])
|
||||
),
|
||||
)
|
||||
|
||||
# tokenizer can be instantiated without any pretrained files, so no need for pretrained tokenizer list
|
||||
def test_pretrained_model_lists(self):
|
||||
pass
|
||||
|
||||
@@ -13,6 +13,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
@@ -175,6 +177,63 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer.save_pretrained(tmp_dir_name)
|
||||
tokenizer.from_pretrained(tmp_dir_name)
|
||||
|
||||
def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
|
||||
tokenizer_list = []
|
||||
if self.test_slow_tokenizer:
|
||||
tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
|
||||
|
||||
if self.test_rust_tokenizer:
|
||||
tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
|
||||
|
||||
for tokenizer_class, tokenizer_utils in tokenizer_list:
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
tokenizer_utils.save_pretrained(tmp_dir)
|
||||
|
||||
with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
|
||||
special_tokens_map = json.load(json_file)
|
||||
|
||||
with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
|
||||
tokenizer_config = json.load(json_file)
|
||||
|
||||
# a special token for Canine can be defined as follows:
|
||||
NEW_TOKEN = 0xE006
|
||||
new_token_1 = chr(NEW_TOKEN)
|
||||
|
||||
special_tokens_map["additional_special_tokens"] = [new_token_1]
|
||||
tokenizer_config["additional_special_tokens"] = [new_token_1]
|
||||
|
||||
with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
|
||||
json.dump(special_tokens_map, outfile)
|
||||
with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
|
||||
json.dump(tokenizer_config, outfile)
|
||||
|
||||
# the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
|
||||
# into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
|
||||
# "special_tokens_map.json" files
|
||||
tokenizer_without_change_in_init = tokenizer_class.from_pretrained(tmp_dir, extra_ids=0)
|
||||
self.assertIn(new_token_1, tokenizer_without_change_in_init.additional_special_tokens)
|
||||
# self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab
|
||||
self.assertEqual(
|
||||
[new_token_1],
|
||||
tokenizer_without_change_in_init.convert_ids_to_tokens(
|
||||
tokenizer_without_change_in_init.convert_tokens_to_ids([new_token_1])
|
||||
),
|
||||
)
|
||||
|
||||
NEW_TOKEN = 0xE007
|
||||
new_token_2 = chr(NEW_TOKEN)
|
||||
# Now we test that we can change the value of additional_special_tokens in the from_pretrained
|
||||
new_added_tokens = [AddedToken(new_token_2, lstrip=True)]
|
||||
tokenizer = tokenizer_class.from_pretrained(
|
||||
tmp_dir, additional_special_tokens=new_added_tokens, extra_ids=0
|
||||
)
|
||||
|
||||
self.assertIn(new_token_2, tokenizer.additional_special_tokens)
|
||||
# self.assertIn(new_token_2,tokenizer.get_vocab()) # ByT5Tokenization no vocab
|
||||
self.assertEqual(
|
||||
[new_token_2], tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids([new_token_2]))
|
||||
)
|
||||
|
||||
@require_tokenizers
|
||||
def test_encode_decode_with_spaces(self):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
|
||||
import inspect
|
||||
import itertools
|
||||
import json
|
||||
import os
|
||||
import pickle
|
||||
import re
|
||||
@@ -3161,6 +3162,64 @@ class TokenizerTesterMixin:
|
||||
self.assertTrue(special_token_id in p_output)
|
||||
self.assertTrue(special_token_id in cr_output)
|
||||
|
||||
def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
|
||||
tokenizer_list = []
|
||||
if self.test_slow_tokenizer:
|
||||
tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
|
||||
|
||||
if self.test_rust_tokenizer:
|
||||
tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
|
||||
|
||||
for tokenizer_class, tokenizer_utils in tokenizer_list:
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
tokenizer_utils.save_pretrained(tmp_dir)
|
||||
|
||||
with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
|
||||
special_tokens_map = json.load(json_file)
|
||||
|
||||
with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
|
||||
tokenizer_config = json.load(json_file)
|
||||
|
||||
special_tokens_map["additional_special_tokens"] = ["an_additional_special_token"]
|
||||
tokenizer_config["additional_special_tokens"] = ["an_additional_special_token"]
|
||||
|
||||
with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
|
||||
json.dump(special_tokens_map, outfile)
|
||||
with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
|
||||
json.dump(tokenizer_config, outfile)
|
||||
|
||||
# the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
|
||||
# into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
|
||||
# "special_tokens_map.json" files
|
||||
tokenizer_without_change_in_init = tokenizer_class.from_pretrained(
|
||||
tmp_dir,
|
||||
)
|
||||
self.assertIn(
|
||||
"an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens
|
||||
)
|
||||
self.assertIn("an_additional_special_token", tokenizer_without_change_in_init.get_vocab())
|
||||
self.assertEqual(
|
||||
["an_additional_special_token"],
|
||||
tokenizer_without_change_in_init.convert_ids_to_tokens(
|
||||
tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"])
|
||||
),
|
||||
)
|
||||
|
||||
# Now we test that we can change the value of additional_special_tokens in the from_pretrained
|
||||
new_added_tokens = [AddedToken("a_new_additional_special_token", lstrip=True)]
|
||||
tokenizer = tokenizer_class.from_pretrained(
|
||||
tmp_dir,
|
||||
additional_special_tokens=new_added_tokens,
|
||||
)
|
||||
|
||||
self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens)
|
||||
self.assertEqual(
|
||||
["a_new_additional_special_token"],
|
||||
tokenizer.convert_ids_to_tokens(
|
||||
tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"])
|
||||
),
|
||||
)
|
||||
|
||||
def test_training_new_tokenizer(self):
|
||||
# This feature only exists for fast tokenizers
|
||||
if not self.test_rust_tokenizer:
|
||||
|
||||
@@ -12,7 +12,9 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast
|
||||
@@ -294,6 +296,71 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(special_token_id in r_output)
|
||||
self.assertTrue(special_token_id in cr_output)
|
||||
|
||||
def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
|
||||
tokenizer_list = []
|
||||
if self.test_slow_tokenizer:
|
||||
tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
|
||||
|
||||
if self.test_rust_tokenizer:
|
||||
tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
|
||||
|
||||
for tokenizer_class, tokenizer_utils in tokenizer_list:
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
tokenizer_utils.save_pretrained(tmp_dir)
|
||||
|
||||
with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
|
||||
special_tokens_map = json.load(json_file)
|
||||
|
||||
with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
|
||||
tokenizer_config = json.load(json_file)
|
||||
|
||||
added_tokens_extra_ids = [f"<extra_id_{i}>" for i in range(100)]
|
||||
|
||||
special_tokens_map["additional_special_tokens"] = added_tokens_extra_ids + [
|
||||
"an_additional_special_token"
|
||||
]
|
||||
tokenizer_config["additional_special_tokens"] = added_tokens_extra_ids + [
|
||||
"an_additional_special_token"
|
||||
]
|
||||
|
||||
with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
|
||||
json.dump(special_tokens_map, outfile)
|
||||
with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
|
||||
json.dump(tokenizer_config, outfile)
|
||||
|
||||
# the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
|
||||
# into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
|
||||
# "special_tokens_map.json" files
|
||||
tokenizer_without_change_in_init = tokenizer_class.from_pretrained(
|
||||
tmp_dir,
|
||||
)
|
||||
self.assertIn(
|
||||
"an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens
|
||||
)
|
||||
# self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab
|
||||
self.assertEqual(
|
||||
["an_additional_special_token"],
|
||||
tokenizer_without_change_in_init.convert_ids_to_tokens(
|
||||
tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"])
|
||||
),
|
||||
)
|
||||
|
||||
# Now we test that we can change the value of additional_special_tokens in the from_pretrained
|
||||
new_added_tokens = added_tokens_extra_ids + [AddedToken("a_new_additional_special_token", lstrip=True)]
|
||||
tokenizer = tokenizer_class.from_pretrained(
|
||||
tmp_dir,
|
||||
additional_special_tokens=new_added_tokens,
|
||||
)
|
||||
|
||||
self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens)
|
||||
self.assertEqual(
|
||||
["a_new_additional_special_token"],
|
||||
tokenizer.convert_ids_to_tokens(
|
||||
tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"])
|
||||
),
|
||||
)
|
||||
|
||||
@slow
|
||||
def test_tokenizer_integration(self):
|
||||
# fmt: off
|
||||
|
||||
Reference in New Issue
Block a user