From 0477b307c7501ea76e01b03cb387a2312db752b3 Mon Sep 17 00:00:00 2001
From: Julien Chaumond
Date: Fri, 15 Nov 2019 23:54:11 -0500
Subject: [PATCH] [camembert] tokenizer: use additional_special_tokens

---
 transformers/tokenization_camembert.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/transformers/tokenization_camembert.py b/transformers/tokenization_camembert.py
index ae1b322941..41d3d74cff 100644
--- a/transformers/tokenization_camembert.py
+++ b/transformers/tokenization_camembert.py
@@ -45,10 +45,12 @@ class CamembertTokenizer(PreTrainedTokenizer):
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
     def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
-                 cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
+                 cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
+                 additional_special_tokens=['<s>NOTUSED', '</s>NOTUSED'], **kwargs):
         super(CamembertTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
                                                  sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
-                                                 mask_token=mask_token, **kwargs)
+                                                 mask_token=mask_token, additional_special_tokens=additional_special_tokens,
+                                                 **kwargs)
         self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
         self.sp_model = spm.SentencePieceProcessor()