update encode_plus - add truncation strategies
This commit is contained in:
@@ -75,7 +75,7 @@ class TextDataset(Dataset):
|
||||
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
|
||||
|
||||
for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
|
||||
self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[i:i+block_size]))
|
||||
self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
|
||||
# Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
|
||||
# If your dataset is small, first you should loook for a bigger one :-) and second you
|
||||
# can change this behavior by adding (model specific) padding.
|
||||
@@ -109,10 +109,8 @@ def mask_tokens(inputs, tokenizer, args):
|
||||
labels = inputs.clone()
|
||||
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
|
||||
probability_matrix = torch.full(labels.shape, args.mlm_probability)
|
||||
probability_matrix *= torch.tensor(
|
||||
[tokenizer.get_special_tokens_mask(val, special_tokens_present=True) for val in labels.tolist()],
|
||||
dtype=torch.float
|
||||
)
|
||||
special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
|
||||
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
|
||||
masked_indices = torch.bernoulli(probability_matrix).bool()
|
||||
labels[~masked_indices] = -1 # We only compute loss on masked tokens
|
||||
|
||||
|
||||
Reference in New Issue
Block a user