adding max_lengths for single sentences and sentences pairs

This commit is contained in:
thomwolf
2019-08-23 17:31:11 +02:00
parent 90dcd8c05d
commit 47d6853439
5 changed files with 40 additions and 0 deletions

View File

@@ -139,6 +139,14 @@ class BertTokenizer(PreTrainedTokenizer):
tokenize_chinese_chars=tokenize_chinese_chars) tokenize_chinese_chars=tokenize_chinese_chars)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
@property
def max_len_single_sentence(self):
return self.max_len - 2 # take into account special tokens
@property
def max_len_sentences_pair(self):
return self.max_len - 3 # take into account special tokens
@property @property
def vocab_size(self): def vocab_size(self):
return len(self.vocab) return len(self.vocab)

View File

@@ -160,6 +160,14 @@ class RobertaTokenizer(PreTrainedTokenizer):
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
return text return text
@property
def max_len_single_sentence(self):
return self.max_len - 2 # take into account special tokens
@property
def max_len_sentences_pair(self):
return self.max_len - 4 # take into account special tokens
def add_special_tokens_single_sentence(self, token_ids): def add_special_tokens_single_sentence(self, token_ids):
""" """
Adds special tokens to a sequence for sequence classification tasks. Adds special tokens to a sequence for sequence classification tasks.

View File

@@ -67,6 +67,14 @@ class PreTrainedTokenizer(object):
"pad_token", "cls_token", "mask_token", "pad_token", "cls_token", "mask_token",
"additional_special_tokens"] "additional_special_tokens"]
@property
def max_len_single_sentence(self):
return self.max_len # Default to max_len but can be smaller in specific tokenizers to take into account special tokens
@property
def max_len_sentences_pair(self):
return self.max_len # Default to max_len but can be smaller in specific tokenizers to take into account special tokens
@property @property
def bos_token(self): def bos_token(self):
""" Beginning of sentence token (string). Log an error if used while not having been set. """ """ Beginning of sentence token (string). Log an error if used while not having been set. """

View File

@@ -215,6 +215,14 @@ class XLMTokenizer(PreTrainedTokenizer):
out_string = ''.join(tokens).replace('</w>', ' ').strip() out_string = ''.join(tokens).replace('</w>', ' ').strip()
return out_string return out_string
@property
def max_len_single_sentence(self):
return self.max_len - 2 # take into account special tokens
@property
def max_len_sentences_pair(self):
return self.max_len - 3 # take into account special tokens
def add_special_tokens_single_sentence(self, token_ids): def add_special_tokens_single_sentence(self, token_ids):
""" """
Adds special tokens to a sequence for sequence classification tasks. Adds special tokens to a sequence for sequence classification tasks.

View File

@@ -177,6 +177,14 @@ class XLNetTokenizer(PreTrainedTokenizer):
out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
return out_string return out_string
@property
def max_len_single_sentence(self):
return self.max_len - 2 # take into account special tokens
@property
def max_len_sentences_pair(self):
return self.max_len - 3 # take into account special tokens
def add_special_tokens_single_sentence(self, token_ids): def add_special_tokens_single_sentence(self, token_ids):
""" """
Adds special tokens to a sequence pair for sequence classification tasks. Adds special tokens to a sequence pair for sequence classification tasks.