Make max_len_single_sentence and max_len_sentences_pair plain attributes so they can be modified

This commit is contained in:
thomwolf
2019-08-23 22:07:26 +02:00
parent ab7bd5ef98
commit 3bcbebd440
8 changed files with 26 additions and 40 deletions

View File

@@ -77,6 +77,9 @@ class RobertaTokenizer(PreTrainedTokenizer):
sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
mask_token=mask_token, **kwargs)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
@@ -160,14 +163,6 @@ class RobertaTokenizer(PreTrainedTokenizer):
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
return text
# Read-only property: token budget for a single-sentence input, computed on
# every access as max_len minus 2 (the 2 accounts for special tokens).
# NOTE(review): the constructor lines shown in this diff assign an instance
# attribute with this same name; a property without a setter would reject that
# assignment, so this definition is presumably the side the commit removes —
# confirm against the full file.
@property
def max_len_single_sentence(self):
return self.max_len - 2 # take into account special tokens
# Read-only property: token budget for a sentence-pair input, computed on
# every access as max_len minus 4 (the 4 accounts for special tokens).
# NOTE(review): the constructor lines shown in this diff assign an instance
# attribute with this same name; a property without a setter would reject that
# assignment, so this definition is presumably the side the commit removes —
# confirm against the full file.
@property
def max_len_sentences_pair(self):
return self.max_len - 4 # take into account special tokens
def add_special_tokens_single_sentence(self, token_ids):
"""
Adds special tokens to a sequence for sequence classification tasks.