From b915ba9dfe51db8161db5bc599df3944646b2b72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Thu, 17 Oct 2019 17:44:20 +0200 Subject: [PATCH] pad sequence with 0, mask with -1 --- examples/run_seq2seq_finetuning.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py index 38dcb2d005..1f21cff82c 100644 --- a/examples/run_seq2seq_finetuning.py +++ b/examples/run_seq2seq_finetuning.py @@ -58,7 +58,7 @@ class TextDataset(Dataset): [2] https://github.com/abisee/cnn-dailymail/ """ - def __init__(self, tokenizer, prefix='train', data_dir="", block_size=512): + def __init__(self, tokenizer, prefix="train", data_dir="", block_size=512): assert os.path.isdir(data_dir) # Load features that have already been computed if present @@ -165,7 +165,12 @@ def _fit_to_block_size(sequence, block_size): if len(sequence) > block_size: return sequence[:block_size] else: - return sequence.extend([-1] * (block_size - len(sequence))) + return sequence.extend([0] * (block_size - len(sequence))) + + +def mask_padding_tokens(sequence): + """ Replace the padding token with -1 values """ + return [s if s != 0 else -1 for s in sequence] def load_and_cache_examples(args, tokenizer): @@ -219,11 +224,8 @@ def train(args, train_dataset, model, tokenizer): logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) - logger.info( - " Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size - ) - logger.info( - " Total train batch size (w. parallel, distributed & accumulation) = %d", + logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) + logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), @@ -242,7 +244,7 @@ def train(args, train_dataset, model, tokenizer): source = ([s for s, _ in batch]).to(args.device) target = ([t for _, t in batch]).to(args.device) model.train() - outputs = model(source, target) + outputs = model(source, target, decoder_lm_labels=mask_padding_tokens(target)) loss = outputs[0] loss.backward()