Sequence special token handling for BERT and RoBERTa
This commit is contained in:
@@ -71,9 +71,15 @@ class TextDataset(Dataset):
|
|||||||
text = f.read()
|
text = f.read()
|
||||||
|
|
||||||
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
|
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
|
||||||
|
|
||||||
|
tokenized_text = tokenizer.add_special_tokens_single_sentence(tokenized_text)
|
||||||
while len(tokenized_text) >= block_size: # Truncate in block of block_size
|
while len(tokenized_text) >= block_size: # Truncate in block of block_size
|
||||||
self.examples.append(tokenized_text[:block_size])
|
if isinstance(tokenizer, (BertTokenizer, RobertaTokenizer)):
|
||||||
tokenized_text = tokenized_text[block_size:]
|
self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size - 2]))
|
||||||
|
tokenized_text = tokenized_text[block_size - 2:]
|
||||||
|
else:
|
||||||
|
self.examples.append(tokenized_text[:block_size])
|
||||||
|
tokenized_text = tokenized_text[block_size:]
|
||||||
# Note that we are losing the last truncated example here for the sake of simplicity (no padding)
|
# Note that we are losing the last truncated example here for the sake of simplicity (no padding)
|
||||||
# If your dataset is small, first you should look for a bigger one :-) and second you
|
# If your dataset is small, first you should look for a bigger one :-) and second you
|
||||||
# can change this behavior by adding (model specific) padding.
|
# can change this behavior by adding (model specific) padding.
|
||||||
|
|||||||
Reference in New Issue
Block a user