Merge branch 'master' into xlm-tokenization

This commit is contained in:
Thomas Wolf
2019-08-30 17:15:16 +02:00
committed by GitHub
41 changed files with 3091 additions and 67 deletions

View File

@@ -143,6 +143,9 @@ class BertTokenizer(PreTrainedTokenizer):
super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
pad_token=pad_token, cls_token=cls_token,
mask_token=mask_token, **kwargs)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "