Change how the "additional_special_tokens" argument of the tokenizer's ".from_pretrained" method is taken into account (#13056)

* add test
* add change in PretrainedTokenizerBase
* change Luke
* deactivate
* add the possibility to add additional special tokens for M2M100
* format
* add special test for canine
* proposed changes for mbart
* proposed changes for mbart50
* proposed changes for byt5
* proposed changes for canine
* proposed changes for t5
* test fast and slow
* remove comment
* remove comment
* add fast version for all tests
* replace break by continue
* add more comments
* add check to avoid duplicates
* remove comment
* format
* proposed change for wav2vec2
* reverse changes mbart
* uncomment
* format
2021-08-23 14:35:18 +02:00
parent b13c6c18d0
commit 7223844df9
10 changed files with 285 additions and 11 deletions
--- a/tests/test_processor_wav2vec2.py
+++ b/tests/test_processor_wav2vec2.py
@@ -53,8 +53,9 @@ class Wav2Vec2ProcessorTest(unittest.TestCase):
        with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(feature_extractor_map) + "\n")

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.add_kwargs_tokens_map)
+    def get_tokenizer(self, **kwargs_init):
+        kwargs = self.add_kwargs_tokens_map.copy()
+        kwargs.update(kwargs_init)
        return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_feature_extractor(self, **kwargs):