# Patch summary: make pad-token handling in examples/run_lm_finetuning.py
# conditional on the tokenizer actually having a pad token.
#
# Hunk 1 (mask_tokens): the padding mask (labels.eq(tokenizer.pad_token_id))
#   is now built and applied to probability_matrix only when
#   tokenizer._pad_token is not None; previously it was applied unconditionally.
# Hunk 2 (collate inside train) and hunk 3 (collate inside evaluate): when
#   tokenizer._pad_token is None, pad_sequence is called without a
#   padding_value argument (so its default is used); otherwise
#   padding_value=tokenizer.pad_token_id is passed as before.
#
# NOTE(review): the checks read the private attribute `_pad_token` rather than
#   a public accessor -- presumably deliberate; confirm against the tokenizer API.
# NOTE(review): this patch arrived with its line breaks collapsed. The line
#   structure and indentation below were reconstructed from the hunk line
#   counts and Python syntax -- verify whitespace against the target file
#   before applying.
diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index e028900d30..34808d5331 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -202,8 +202,9 @@ def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> T
         tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
     ]
     probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
-    padding_mask = labels.eq(tokenizer.pad_token_id)
-    probability_matrix.masked_fill_(padding_mask, value=0.0)
+    if tokenizer._pad_token is not None:
+        padding_mask = labels.eq(tokenizer.pad_token_id)
+        probability_matrix.masked_fill_(padding_mask, value=0.0)
     masked_indices = torch.bernoulli(probability_matrix).bool()
     labels[~masked_indices] = -100  # We only compute loss on masked tokens
 
@@ -228,6 +229,8 @@ def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedToke
     args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
 
     def collate(examples: List[torch.Tensor]):
+        if tokenizer._pad_token is None:
+            return pad_sequence(examples, batch_first=True)
         return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
 
     train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
@@ -421,6 +424,8 @@ def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefi
     # Note that DistributedSampler samples randomly
 
     def collate(examples: List[torch.Tensor]):
+        if tokenizer._pad_token is None:
+            return pad_sequence(examples, batch_first=True)
         return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
 
     eval_sampler = SequentialSampler(eval_dataset)