From b262577d17efe0beea88d2bb4f78f1c4a25f2636 Mon Sep 17 00:00:00 2001 From: vitaliyradchenko Date: Wed, 25 Dec 2019 18:31:35 +0200 Subject: [PATCH] add special tokens to unique_added_tokens_encoder --- src/transformers/tokenization_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 5b3e795448..6559cd0845 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -469,6 +469,9 @@ class PreTrainedTokenizer(object): tokenizer.init_inputs = init_inputs tokenizer.init_kwargs = init_kwargs + # update unique_added_tokens_encoder with special tokens for correct tokenization + tokenizer.unique_added_tokens_encoder.update(set(tokenizer.all_special_tokens)) + # Add supplementary tokens. if added_tokens_file is not None: with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: @@ -476,7 +479,7 @@ class PreTrainedTokenizer(object): added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} tokenizer.added_tokens_encoder.update(added_tok_encoder) tokenizer.added_tokens_decoder.update(added_tok_decoder) - tokenizer.unique_added_tokens_encoder.update(set(tokenizer.added_tokens_encoder.keys()).union(set(tokenizer.all_special_tokens))) + tokenizer.unique_added_tokens_encoder.update(set(tokenizer.added_tokens_encoder.keys())) return tokenizer