Sentence -> Sequence. Removed output_mask from the special token addition methods.

This commit is contained in:
LysandreJik
2019-09-19 09:55:36 +02:00
parent 8cba057260
commit bf503158c5
13 changed files with 49 additions and 76 deletions

View File

@@ -75,7 +75,7 @@ class TextDataset(Dataset):
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
while len(tokenized_text) >= block_size: # Truncate in block of block_size
self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size]))
self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[:block_size]))
tokenized_text = tokenized_text[block_size:]
# Note that we are losing the last truncated example here for the sake of simplicity (no padding)
# If your dataset is small, first you should look for a bigger one :-) and second you