Fix add_special_tokens on fast tokenizers (#4531)

2020-05-28 10:54:45 -04:00
parent e444648a30
commit 5e737018e1
2 changed files with 10 additions and 4 deletions
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -2400,15 +2400,20 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):

    def add_special_tokens(self, special_tokens_dict: dict) -> int:
        # Map special tokens to class attributes (self.pad_token...)
-        num_added_tokens = super().add_special_tokens(special_tokens_dict)
+        super().add_special_tokens(special_tokens_dict)

        # For the backend tokenizer, the only specificities of special tokens are that
        #    - they will never be processed by the model, and
        #    - they will be removed while decoding.
        # But they are not mapped to special attributes in the backend so we can just
        # send a list.
-        tokens = flatten(special_tokens_dict.values())
-        self._tokenizer.add_special_tokens(tokens)
+        tokens = []
+        for token in special_tokens_dict.values():
+            if isinstance(token, list):
+                tokens += token
+            else:
+                tokens += [token]
+        num_added_tokens = self._tokenizer.add_special_tokens(tokens)

        return num_added_tokens