adding max_lengths for single sentences and sentence pairs
This commit is contained in:
@@ -139,6 +139,14 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
tokenize_chinese_chars=tokenize_chinese_chars)
|
tokenize_chinese_chars=tokenize_chinese_chars)
|
||||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
|
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def max_len_single_sentence(self):
|
||||||
|
return self.max_len - 2 # take into account special tokens
|
||||||
|
|
||||||
|
@property
|
||||||
|
def max_len_sentences_pair(self):
|
||||||
|
return self.max_len - 3 # take into account special tokens
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vocab_size(self):
|
def vocab_size(self):
|
||||||
return len(self.vocab)
|
return len(self.vocab)
|
||||||
|
|||||||
@@ -160,6 +160,14 @@ class RobertaTokenizer(PreTrainedTokenizer):
|
|||||||
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
|
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
@property
|
||||||
|
def max_len_single_sentence(self):
|
||||||
|
return self.max_len - 2 # take into account special tokens
|
||||||
|
|
||||||
|
@property
|
||||||
|
def max_len_sentences_pair(self):
|
||||||
|
return self.max_len - 4 # take into account special tokens
|
||||||
|
|
||||||
def add_special_tokens_single_sentence(self, token_ids):
|
def add_special_tokens_single_sentence(self, token_ids):
|
||||||
"""
|
"""
|
||||||
Adds special tokens to a sequence for sequence classification tasks.
|
Adds special tokens to a sequence for sequence classification tasks.
|
||||||
|
|||||||
@@ -67,6 +67,14 @@ class PreTrainedTokenizer(object):
|
|||||||
"pad_token", "cls_token", "mask_token",
|
"pad_token", "cls_token", "mask_token",
|
||||||
"additional_special_tokens"]
|
"additional_special_tokens"]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def max_len_single_sentence(self):
|
||||||
|
return self.max_len # Default to max_len but can be smaller in specific tokenizers to take into account special tokens
|
||||||
|
|
||||||
|
@property
|
||||||
|
def max_len_sentences_pair(self):
|
||||||
|
return self.max_len # Default to max_len but can be smaller in specific tokenizers to take into account special tokens
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def bos_token(self):
|
def bos_token(self):
|
||||||
""" Beginning of sentence token (string). Log an error if used while not having been set. """
|
""" Beginning of sentence token (string). Log an error if used while not having been set. """
|
||||||
|
|||||||
@@ -215,6 +215,14 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
out_string = ''.join(tokens).replace('</w>', ' ').strip()
|
out_string = ''.join(tokens).replace('</w>', ' ').strip()
|
||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
|
@property
|
||||||
|
def max_len_single_sentence(self):
|
||||||
|
return self.max_len - 2 # take into account special tokens
|
||||||
|
|
||||||
|
@property
|
||||||
|
def max_len_sentences_pair(self):
|
||||||
|
return self.max_len - 3 # take into account special tokens
|
||||||
|
|
||||||
def add_special_tokens_single_sentence(self, token_ids):
|
def add_special_tokens_single_sentence(self, token_ids):
|
||||||
"""
|
"""
|
||||||
Adds special tokens to a sequence for sequence classification tasks.
|
Adds special tokens to a sequence for sequence classification tasks.
|
||||||
|
|||||||
@@ -177,6 +177,14 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
|
out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
|
||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
|
@property
|
||||||
|
def max_len_single_sentence(self):
|
||||||
|
return self.max_len - 2 # take into account special tokens
|
||||||
|
|
||||||
|
@property
|
||||||
|
def max_len_sentences_pair(self):
|
||||||
|
return self.max_len - 3 # take into account special tokens
|
||||||
|
|
||||||
def add_special_tokens_single_sentence(self, token_ids):
|
def add_special_tokens_single_sentence(self, token_ids):
|
||||||
"""
|
"""
|
||||||
Adds special tokens to a sequence for sequence classification tasks.
|
Adds special tokens to a sequence for sequence classification tasks.
|
||||||
|
|||||||
Reference in New Issue
Block a user