Change how "additional_special_tokens" argument in the ".from_pretrained" method of the tokenizer is taken into account (#13056)

* add test

* add change in PretrainedTokenizerBase

* change Luke

* deactivate

* add the possibility to add additional special tokens for M2M100

* format

* add special test for canine

* proposed changes for mbart

* proposed changes for mbart50

* proposed changes for byt5

* proposed changes for canine

* proposed changes for t5

* test fast and slow

* remove comment

* remove comment

* add fast version for all tests

* replace break by continue

* add more comments

* add check to avoid duplicates

* remove comment

* format

* proposed change for wave2vec2

* reverse changes mbart

* uncomment

* format
This commit is contained in:
SaulLu
2021-08-23 14:35:18 +02:00
committed by GitHub
parent b13c6c18d0
commit 7223844df9
10 changed files with 285 additions and 11 deletions

View File

@@ -13,6 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import tempfile
import unittest
@@ -175,6 +177,63 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.save_pretrained(tmp_dir_name)
tokenizer.from_pretrained(tmp_dir_name)
def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
tokenizer_list = []
if self.test_slow_tokenizer:
tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
if self.test_rust_tokenizer:
tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
for tokenizer_class, tokenizer_utils in tokenizer_list:
with tempfile.TemporaryDirectory() as tmp_dir:
tokenizer_utils.save_pretrained(tmp_dir)
with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
special_tokens_map = json.load(json_file)
with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
tokenizer_config = json.load(json_file)
# a special token for Canine can be defined as follows:
NEW_TOKEN = 0xE006
new_token_1 = chr(NEW_TOKEN)
special_tokens_map["additional_special_tokens"] = [new_token_1]
tokenizer_config["additional_special_tokens"] = [new_token_1]
with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
json.dump(special_tokens_map, outfile)
with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
json.dump(tokenizer_config, outfile)
# the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
# into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
# "special_tokens_map.json" files
tokenizer_without_change_in_init = tokenizer_class.from_pretrained(tmp_dir, extra_ids=0)
self.assertIn(new_token_1, tokenizer_without_change_in_init.additional_special_tokens)
# self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab
self.assertEqual(
[new_token_1],
tokenizer_without_change_in_init.convert_ids_to_tokens(
tokenizer_without_change_in_init.convert_tokens_to_ids([new_token_1])
),
)
NEW_TOKEN = 0xE007
new_token_2 = chr(NEW_TOKEN)
# Now we test that we can change the value of additional_special_tokens in the from_pretrained
new_added_tokens = [AddedToken(new_token_2, lstrip=True)]
tokenizer = tokenizer_class.from_pretrained(
tmp_dir, additional_special_tokens=new_added_tokens, extra_ids=0
)
self.assertIn(new_token_2, tokenizer.additional_special_tokens)
# self.assertIn(new_token_2,tokenizer.get_vocab()) # ByT5Tokenization no vocab
self.assertEqual(
[new_token_2], tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids([new_token_2]))
)
@require_tokenizers
def test_encode_decode_with_spaces(self):
tokenizers = self.get_tokenizers(do_lower_case=False)