From 2d042274ac9ee6cd03aabcb861126937a29feb1a Mon Sep 17 00:00:00 2001 From: Lysandre Date: Tue, 20 Aug 2019 14:15:28 -0400 Subject: [PATCH] Sequence special token handling for BERT and RoBERTa --- examples/run_lm_finetuning.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index bd7047a587..c69d4db53b 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -71,9 +71,15 @@ class TextDataset(Dataset): text = f.read() tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) + + tokenized_text = tokenizer.add_special_tokens_single_sentence(tokenized_text) while len(tokenized_text) >= block_size: # Truncate in block of block_size - self.examples.append(tokenized_text[:block_size]) - tokenized_text = tokenized_text[block_size:] + if isinstance(tokenizer, (BertTokenizer, RobertaTokenizer)): + self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size - 2])) + tokenized_text = tokenized_text[block_size - 2:] + else: + self.examples.append(tokenized_text[:block_size]) + tokenized_text = tokenized_text[block_size:] # Note that we are loosing the last truncated example here for the sake of simplicity (no padding) # If your dataset is small, first you should loook for a bigger one :-) and second you # can change this behavior by adding (model specific) padding.