From 5ed50a93fb4fc4ed554a54835060ab4721123d07 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 30 Sep 2019 14:14:27 -0400 Subject: [PATCH] LM finetuning won't mask special tokens anymore --- examples/run_lm_finetuning.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index a91deebb6c..024b254b56 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -108,7 +108,12 @@ def mask_tokens(inputs, tokenizer, args): """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ labels = inputs.clone() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) - masked_indices = torch.bernoulli(torch.full(labels.shape, args.mlm_probability)).bool() + probability_matrix = torch.full(labels.shape, args.mlm_probability) + probability_matrix *= torch.tensor( + [tokenizer.get_sequence_ids(val, special_tokens_present=True) for val in labels.tolist()], + dtype=torch.float + ) + masked_indices = torch.bernoulli(probability_matrix).bool() labels[~masked_indices] = -1 # We only compute loss on masked tokens # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])