Change how "additional_special_tokens" argument in the ".from_pretrained" method of the tokenizer is taken into account (#13056)
* add test * add change in PretrainedTokenizerBase * change Luke * deactivate * add the possibility to add additional special tokens for M2M100 * format * add special test for canine * proposed changes for mbart * proposed changes for mbart50 * proposed changes for byt5 * proposed changes for canine * proposed changes for t5 * test fast and slow * remove comment * remove comment * add fast version for all tests * replace break by continue * add more comments * add check to avoid duplicates * remove comment * format * proposed change for wave2vec2 * reverse changes mbart * uncomment * format
This commit is contained in:
@@ -13,6 +13,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
@@ -175,6 +177,63 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer.save_pretrained(tmp_dir_name)
|
||||
tokenizer.from_pretrained(tmp_dir_name)
|
||||
|
||||
def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
|
||||
tokenizer_list = []
|
||||
if self.test_slow_tokenizer:
|
||||
tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
|
||||
|
||||
if self.test_rust_tokenizer:
|
||||
tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
|
||||
|
||||
for tokenizer_class, tokenizer_utils in tokenizer_list:
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
tokenizer_utils.save_pretrained(tmp_dir)
|
||||
|
||||
with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
|
||||
special_tokens_map = json.load(json_file)
|
||||
|
||||
with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
|
||||
tokenizer_config = json.load(json_file)
|
||||
|
||||
# a special token for Canine can be defined as follows:
|
||||
NEW_TOKEN = 0xE006
|
||||
new_token_1 = chr(NEW_TOKEN)
|
||||
|
||||
special_tokens_map["additional_special_tokens"] = [new_token_1]
|
||||
tokenizer_config["additional_special_tokens"] = [new_token_1]
|
||||
|
||||
with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
|
||||
json.dump(special_tokens_map, outfile)
|
||||
with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
|
||||
json.dump(tokenizer_config, outfile)
|
||||
|
||||
# the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
|
||||
# into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
|
||||
# "special_tokens_map.json" files
|
||||
tokenizer_without_change_in_init = tokenizer_class.from_pretrained(tmp_dir, extra_ids=0)
|
||||
self.assertIn(new_token_1, tokenizer_without_change_in_init.additional_special_tokens)
|
||||
# self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab
|
||||
self.assertEqual(
|
||||
[new_token_1],
|
||||
tokenizer_without_change_in_init.convert_ids_to_tokens(
|
||||
tokenizer_without_change_in_init.convert_tokens_to_ids([new_token_1])
|
||||
),
|
||||
)
|
||||
|
||||
NEW_TOKEN = 0xE007
|
||||
new_token_2 = chr(NEW_TOKEN)
|
||||
# Now we test that we can change the value of additional_special_tokens in the from_pretrained
|
||||
new_added_tokens = [AddedToken(new_token_2, lstrip=True)]
|
||||
tokenizer = tokenizer_class.from_pretrained(
|
||||
tmp_dir, additional_special_tokens=new_added_tokens, extra_ids=0
|
||||
)
|
||||
|
||||
self.assertIn(new_token_2, tokenizer.additional_special_tokens)
|
||||
# self.assertIn(new_token_2,tokenizer.get_vocab()) # ByT5Tokenization no vocab
|
||||
self.assertEqual(
|
||||
[new_token_2], tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids([new_token_2]))
|
||||
)
|
||||
|
||||
@require_tokenizers
|
||||
def test_encode_decode_with_spaces(self):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
|
||||
Reference in New Issue
Block a user