prepare_seq2seq_batch makes labels/ decoder_input_ids made later. (#6654)

* broken test * batch parity * tests pass * boom boom * boom boom * split out bart tokenizer tests * fix tests * boom boom * Fixed dataset bug * Fix marian * Undo extra * Get marian working * Fix t5 tok tests * Test passing * Cleanup * better assert msg * require torch * Fix mbart tests * undo extra decoder_attn_mask change * Fix import * pegasus tokenizer can ignore src_lang kwargs * unused kwarg test cov * boom boom * add todo for pegasus issue * cover one word translation edge case * Cleanup * doc
2020-08-28 11:15:17 -04:00
parent cb276b41de
commit 9336086ab5
20 changed files with 429 additions and 290 deletions
--- a/examples/seq2seq/README.md
+++ b/examples/seq2seq/README.md
@@ -71,8 +71,8 @@ Summarization Tips:
 (It rarely makes sense to start from `bart-large` unless you are a researching finetuning methods).

 **Update 2018-07-18**
-Datasets: `Seq2SeqDataset` should be used for all tokenizers without a `prepare_seq2seq_batch` method. For those who do (like Marian, MBart), `TranslationDataset` should be used.**
-A new dataset is needed to support multilingual tasks.
+Datasets: `LegacySeq2SeqDataset` will be used for all tokenizers without a `prepare_seq2seq_batch` method. Otherwise, `Seq2SeqDataset` will be used.
+Future work/help wanted: A new dataset to support multilingual tasks.


 ### Command Line Options
@@ -106,7 +106,7 @@ The following command should work on a 16GB GPU:
    --train_batch_size=1 \
    --eval_batch_size=1 \
    --output_dir=xsum_results \
-    --num_train_epochs 1 \
+    --num_train_epochs 6 \
    --model_name_or_path facebook/bart-large
 ```

--- a/examples/seq2seq/distillation.py
+++ b/examples/seq2seq/distillation.py
@@ -1,6 +1,7 @@
 import argparse
 import gc
 import os
+import warnings
 from pathlib import Path
 from typing import List

@@ -11,6 +12,7 @@ from torch.nn import functional as F

 from lightning_base import generic_train
 from transformers import AutoModelForSeq2SeqLM, MBartTokenizer, T5Config, T5ForConditionalGeneration
+from transformers.modeling_bart import shift_tokens_right


 try:
@@ -22,6 +24,7 @@ try:
        assert_all_frozen,
        calculate_bleu,
        freeze_params,
+        label_smoothed_nll_loss,
        pickle_load,
        use_task_specific_params,
    )
@@ -34,12 +37,15 @@ except ImportError:
        assert_all_frozen,
        calculate_bleu,
        freeze_params,
+        label_smoothed_nll_loss,
        pickle_load,
        use_task_specific_params,
    )


 class BartSummarizationDistiller(SummarizationModule):
+    """Supports Bart, Pegasus and other models that inherit from Bart."""
+
    loss_names = ["loss", "ce_loss", "mlm_loss", "enc_mse_loss", "hid_loss_enc", "hid_loss_dec"]

    def __init__(self, hparams):
@@ -160,22 +166,32 @@ class BartSummarizationDistiller(SummarizationModule):
    def _step(self, batch):
        # assert is_frozen(self.teacher)
        pad_token_id = self.tokenizer.pad_token_id
-        input_ids, src_mask, y = batch["input_ids"], batch["attention_mask"], batch["decoder_input_ids"]
-        decoder_input_ids = y[:, :-1].contiguous()
-        labels = y[:, 1:].clone()
-        labels[y[:, 1:] == pad_token_id] = -100
+        input_ids, src_mask, tgt_ids = batch["input_ids"], batch["attention_mask"], batch["labels"]
+        decoder_input_ids = shift_tokens_right(tgt_ids, pad_token_id)
        # noinspection PyCallingNonCallable
-        sloss, slogits, dec_hidden, enc_outputs, enc_hidden_state = self(
+        lm_logits, dec_hidden, enc_outputs, enc_hidden_state = self(
            input_ids,
            attention_mask=src_mask,
            decoder_input_ids=decoder_input_ids,
-            labels=labels,
            output_hidden_states=True,
            output_attentions=False,
-        )
+            use_cache=False,
+        )  # TODO(@sshleifer): return_dict=True cleanup
+
+        # Same cross entropy vs. label smoothing logic as finetune.py
+        assert lm_logits.shape[-1] == self.model.config.vocab_size
+        if self.hparams.label_smoothing == 0:
+            # Same behavior as modeling_bart.py, besides ignoring pad_token_id
+            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
+            student_lm_loss = loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1))
+        else:
+            lprobs = torch.nn.functional.log_softmax(lm_logits, dim=-1)
+            student_lm_loss, _ = label_smoothed_nll_loss(
+                lprobs, tgt_ids, self.hparams.label_smoothing, ignore_index=pad_token_id
+            )

        def zero_tensor():
-            return torch.tensor(0.0).type_as(sloss)
+            return torch.tensor(0.0).type_as(student_lm_loss)

        loss_encoder, hid_loss_enc, hid_loss_dec = zero_tensor(), zero_tensor(), zero_tensor()
        if self.different_encoder:
@@ -199,29 +215,26 @@ class BartSummarizationDistiller(SummarizationModule):
                attention_mask=src_mask,
                encoder_outputs=teacher_enc_outputs,
                decoder_input_ids=decoder_input_ids,
-                lm_labels=labels,
+                lm_labels=tgt_ids,
                output_hidden_states=True,
            )
        dec_mask = decoder_input_ids.ne(pad_token_id)
-        loss_ce, s_logits_slct, t_logits_slct = self.calc_ce_loss(dec_mask, slogits, tlogits)
+        loss_ce, s_logits_slct, t_logits_slct = self.calc_ce_loss(dec_mask, lm_logits, tlogits)
        if self.alpha_hid > 0:
            hid_loss_dec = self.calc_hidden_loss(dec_mask, dec_hidden, tdec_hidden, self.hparams.d_layer_to_copy)

        blended_loss = (
            self.alpha_ce * loss_ce
-            + self.alpha_mlm * sloss
+            + self.alpha_mlm * student_lm_loss
            + self.hparams.alpha_encoder_loss * loss_encoder
            + self.hparams.alpha_hid * (hid_loss_enc + hid_loss_dec)
        )
-        return blended_loss, loss_ce, sloss, loss_encoder, hid_loss_enc, hid_loss_dec
+        return blended_loss, loss_ce, student_lm_loss, loss_encoder, hid_loss_enc, hid_loss_dec

    def calc_hidden_loss(self, attention_mask, hidden_states, hidden_states_T, matches):
-        assert not isinstance(
-            hidden_states, torch.Tensor
-        ), f"expected list or tuple for hidden_states, got tensor of shape {hidden_states.shape}"
-        assert not isinstance(
-            hidden_states_T, torch.Tensor
-        ), f"expected list or tuple for hidden_states_T, got tensor of shape {hidden_states_T.shape}"
+        msg = "expected list or tuple for hidden_states, got tensor of shape: "
+        assert not isinstance(hidden_states, torch.Tensor), f"{msg}{hidden_states.shape}"
+        assert not isinstance(hidden_states_T, torch.Tensor), f"{msg}{hidden_states_T.shape}"
        mask = attention_mask.to(hidden_states[0])
        valid_count = mask.sum() * hidden_states[0].size(-1)
        hidden_losses = [
@@ -233,7 +246,7 @@ class BartSummarizationDistiller(SummarizationModule):


 def add_distill_args(parser):
-    parser.add_argument("--teacher", default="facebook/bart-large-cnn", type=str)
+    parser.add_argument("--teacher", type=str)
    parser.add_argument("--alpha_ce", default=0.8, type=float)
    parser.add_argument("--alpha_mlm", default=0.2, type=float)
    parser.add_argument("--alpha_encoder_loss", default=0.0, type=float)
@@ -245,8 +258,9 @@ def add_distill_args(parser):


 class BartTranslationDistiller(BartSummarizationDistiller):
+    """Supports Mbart, Marian, other models that inherit from Bart."""
+
    mode = "translation"
-    loss_names = ["loss"]
    metric_names = ["bleu"]
    val_metric = "bleu"

@@ -368,7 +382,7 @@ class T5SummarizationDistiller(BartSummarizationDistiller):
                attention_mask=source_mask,
                encoder_outputs=teacher_enc_outputs,
                decoder_input_ids=decoder_input_ids,
-                lm_labels=labels,
+                labels=labels,
                output_hidden_states=True,
                use_cache=False,
            )
@@ -402,6 +416,7 @@ def create_module(args):


 def evaluate_checkpoint(ckpt_path: Path, dest_dir=None):
+    # TODO(SS): DELETE?
    exp_dir = ckpt_path.parent
    if dest_dir is None:
        dest_dir = exp_dir
@@ -424,33 +439,40 @@ def evaluate_checkpoint(ckpt_path: Path, dest_dir=None):
    trainer.test(model)


-def get_layers_to_copy(n_to_get, tot):
-    all_layers = list(range(tot))
-    if tot == 12:  # Alternating for special cases
-        layers_to_copy = {  # maps  num layers in student -> which teacher layers to copy
-            1: [0],
-            2: [0, 6],
-            3: [0, 6, 11],
-            4: [0, 4, 8, 11],
-            6: [0, 2, 4, 7, 9, 11],
-            9: [0, 1, 2, 4, 5, 7, 9, 10, 11],
-            12: all_layers,
-        }
-        return layers_to_copy[n_to_get]
-    elif tot == 16:
-        layers_to_copy = {  # maps  num layers in student -> which teacher layers to copy
-            1: [0],
-            2: [0, 8],
-            3: [0, 8, 15],
-            4: [0, 5, 10, 15],
-            6: [0, 3, 6, 9, 12, 15],
-            8: [0, 2, 4, 6, 8, 10, 12, 15],
-            9: [0, 1, 3, 5, 7, 9, 11, 13, 15],
-            16: all_layers,
-        }
-        return layers_to_copy[n_to_get]
-    else:
-        return all_layers[:n_to_get]  # TODO: better version on theseus-bart branch
+LAYERS_TO_COPY = {
+    # maps  num layers in student -> which teacher layers to copy.
+    # 12: bart, 16: pegasus, 6: marian/Helsinki-NLP
+    12: {
+        1: [0],
+        2: [0, 6],
+        3: [0, 6, 11],
+        4: [0, 4, 8, 11],
+        6: [0, 2, 4, 7, 9, 11],
+        9: [0, 1, 2, 4, 5, 7, 9, 10, 11],
+        12: list(range(12)),
+    },
+    16: {  # maps  num layers in student -> which teacher layers to copy
+        1: [0],
+        2: [0, 8],
+        3: [0, 8, 15],
+        4: [0, 5, 10, 15],
+        6: [0, 3, 6, 9, 12, 15],
+        8: [0, 2, 4, 6, 8, 10, 12, 15],
+        9: [0, 1, 3, 5, 7, 9, 11, 13, 15],
+        16: list(range(16)),
+    },
+    6: {1: [0], 2: [0, 5], 3: [0, 2, 5], 4: [0, 1, 3, 5], 6: list(range(6))},
+}
+
+
+def get_layers_to_copy(n_student, n_teacher):
+    try:
+        return LAYERS_TO_COPY[n_teacher][n_student]
+    except KeyError:
+        warnings.warn(
+            f"no hardcoded layers to copy for teacher {n_teacher} -> student {n_student}, defaulting to first {n_student}"
+        )
+        return list(range(n_student))


 def distill_main(args):
--- a/examples/seq2seq/finetune.py
+++ b/examples/seq2seq/finetune.py
@@ -13,15 +13,16 @@ import torch
 from torch.utils.data import DataLoader

 from lightning_base import BaseTransformer, add_generic_args, generic_train
-from transformers import MarianTokenizer, MBartTokenizer, T5ForConditionalGeneration
+from transformers import MBartTokenizer, T5ForConditionalGeneration
+from transformers.modeling_bart import shift_tokens_right


 try:
    from .callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback
    from .utils import (
        ROUGE_KEYS,
+        LegacySeq2SeqDataset,
        Seq2SeqDataset,
-        TranslationDataset,
        assert_all_frozen,
        calculate_bleu,
        calculate_rouge,
@@ -39,8 +40,8 @@ except ImportError:
    from callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback
    from utils import (
        ROUGE_KEYS,
+        LegacySeq2SeqDataset,
        Seq2SeqDataset,
-        TranslationDataset,
        assert_all_frozen,
        calculate_bleu,
        calculate_rouge,
@@ -102,14 +103,13 @@ class SummarizationModule(BaseTransformer):

        self.hparams.git_sha = get_git_info()["repo_sha"]
        self.num_workers = hparams.num_workers
-        self.decoder_start_token_id = None
+        self.decoder_start_token_id = None  # default to config
        if self.model.config.decoder_start_token_id is None and isinstance(self.tokenizer, MBartTokenizer):
            self.decoder_start_token_id = self.tokenizer.lang_code_to_id[hparams.tgt_lang]
            self.model.config.decoder_start_token_id = self.decoder_start_token_id
-        if isinstance(self.tokenizer, MBartTokenizer) or isinstance(self.tokenizer, MarianTokenizer):
-            self.dataset_class = TranslationDataset
-        else:
-            self.dataset_class = Seq2SeqDataset
+        self.dataset_class = (
+            Seq2SeqDataset if hasattr(self.tokenizer, "prepare_seq2seq_batch") else LegacySeq2SeqDataset
+        )

    def freeze_embeds(self):
        """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5."""
@@ -134,27 +134,25 @@ class SummarizationModule(BaseTransformer):

    def _step(self, batch: dict) -> Tuple:
        pad_token_id = self.tokenizer.pad_token_id
-        source_ids, source_mask, target_ids = batch["input_ids"], batch["attention_mask"], batch["decoder_input_ids"]
-
+        src_ids, src_mask = batch["input_ids"], batch["attention_mask"]
+        tgt_ids = batch["labels"]
        if isinstance(self.model, T5ForConditionalGeneration):
-            decoder_input_ids = self.model._shift_right(target_ids)
-            lm_labels = target_ids
+            decoder_input_ids = self.model._shift_right(tgt_ids)
        else:
-            decoder_input_ids = target_ids[:, :-1].contiguous()  # Why this line?
-            lm_labels = target_ids[:, 1:].clone()  # why clone?
-
-        outputs = self(source_ids, attention_mask=source_mask, decoder_input_ids=decoder_input_ids, use_cache=False)
+            decoder_input_ids = shift_tokens_right(tgt_ids, pad_token_id)

+        outputs = self(src_ids, attention_mask=src_mask, decoder_input_ids=decoder_input_ids, use_cache=False)
+        lm_logits = outputs[0]
        if self.hparams.label_smoothing == 0:
-            # Same behavior as modeling_bart.py
+            # Same behavior as modeling_bart.py, besides ignoring pad_token_id
            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
-            lm_logits = outputs[0]
+
            assert lm_logits.shape[-1] == self.model.config.vocab_size
-            loss = loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), lm_labels.view(-1))
+            loss = loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1))
        else:
-            lprobs = torch.nn.functional.log_softmax(outputs[0], dim=-1)
+            lprobs = torch.nn.functional.log_softmax(lm_logits, dim=-1)
            loss, nll_loss = label_smoothed_nll_loss(
-                lprobs, lm_labels, self.hparams.label_smoothing, ignore_index=pad_token_id
+                lprobs, tgt_ids, self.hparams.label_smoothing, ignore_index=pad_token_id
            )
        return (loss,)

@@ -167,7 +165,7 @@ class SummarizationModule(BaseTransformer):

        logs = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
        # tokens per batch
-        logs["tpb"] = batch["input_ids"].ne(self.pad).sum() + batch["decoder_input_ids"].ne(self.pad).sum()
+        logs["tpb"] = batch["input_ids"].ne(self.pad).sum() + batch["labels"].ne(self.pad).sum()
        return {"loss": loss_tensors[0], "log": logs}

    def validation_step(self, batch, batch_idx) -> Dict:
@@ -204,7 +202,7 @@ class SummarizationModule(BaseTransformer):
        )
        gen_time = (time.time() - t0) / batch["input_ids"].shape[0]
        preds: List[str] = self.ids_to_clean_text(generated_ids)
-        target: List[str] = self.ids_to_clean_text(batch["decoder_input_ids"])
+        target: List[str] = self.ids_to_clean_text(batch["labels"])
        loss_tensors = self._step(batch)
        base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
        rouge: Dict = self.calc_generative_metrics(preds, target)
--- a/examples/seq2seq/run_eval.py
+++ b/examples/seq2seq/run_eval.py
@@ -132,4 +132,6 @@ def run_generate():


 if __name__ == "__main__":
+    # Usage for MT:
+    # python run_eval.py MODEL_NAME $DATA_DIR/test.source $save_dir/test_translations.txt --reference_path $DATA_DIR/test.target --score_path $save_dir/test_bleu.json  --task translation $@
    run_generate()
--- a/examples/seq2seq/test_seq2seq_examples.py
+++ b/examples/seq2seq/test_seq2seq_examples.py
@@ -10,18 +10,18 @@ from unittest.mock import patch
 import pytest
 import pytorch_lightning as pl
 import torch
-from pytest import param
 from torch.utils.data import DataLoader

 import lightning_base
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+from transformers.modeling_bart import shift_tokens_right
 from transformers.testing_utils import CaptureStderr, CaptureStdout, require_multigpu

 from .distillation import distill_main, evaluate_checkpoint
 from .finetune import SummarizationModule, main
 from .pack_dataset import pack_data_dir
 from .run_eval import generate_summaries_or_translations, run_generate
-from .utils import Seq2SeqDataset, TranslationDataset, label_smoothed_nll_loss, lmap, load_json
+from .utils import LegacySeq2SeqDataset, Seq2SeqDataset, label_smoothed_nll_loss, lmap, load_json


 logging.basicConfig(level=logging.DEBUG)
@@ -452,18 +452,27 @@ def test_pack_dataset():
    assert orig_paths == new_paths


-@pytest.mark.parametrize(["tok_name"], [pytest.param(MBART_TINY), pytest.param(MARIAN_TINY)])
-def test_mbart_dataset_truncation(tok_name):
+@pytest.mark.parametrize(
+    ["tok_name"],
+    [
+        pytest.param(MBART_TINY),
+        pytest.param(MARIAN_TINY),
+        pytest.param(T5_TINY),
+        pytest.param(BART_TINY),
+        pytest.param("google/pegasus-xsum"),
+    ],
+)
+def test_seq2seq_dataset_truncation(tok_name):
    tokenizer = AutoTokenizer.from_pretrained(tok_name)
    tmp_dir = make_test_data_dir()
    max_len_source = max(len(tokenizer.encode(a)) for a in ARTICLES)
    max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
    max_src_len = 4
    max_tgt_len = 8
-    assert max_len_target > max_src_len  # Truncated
-    assert max_len_source > max_src_len
-    src_lang, tgt_lang = "ro_RO", "de_DE"  # NOT WHAT IT WAS TRAINED ON
-    train_dataset = TranslationDataset(
+    assert max_len_target > max_src_len  # Will be truncated
+    assert max_len_source > max_src_len  # Will be truncated
+    src_lang, tgt_lang = "ro_RO", "de_DE"  # ignored for all but mbart, but never causes error.
+    train_dataset = Seq2SeqDataset(
        tokenizer,
        data_dir=tmp_dir,
        type_path="train",
@@ -479,10 +488,11 @@ def test_mbart_dataset_truncation(tok_name):
        # show that articles were trimmed.
        assert batch["input_ids"].shape[1] == max_src_len
        # show that targets are the same len
-        assert batch["decoder_input_ids"].shape[1] == max_tgt_len
-        if tok_name == MARIAN_TINY:
+        assert batch["labels"].shape[1] == max_tgt_len
+        if tok_name != MBART_TINY:
            continue
        # check language codes in correct place
+        batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], tokenizer.pad_token_id)
        assert batch["decoder_input_ids"][0, 0].item() == tokenizer.lang_code_to_id[tgt_lang]
        assert batch["decoder_input_ids"][0, -1].item() == tokenizer.eos_token_id
        assert batch["input_ids"][0, -2].item() == tokenizer.eos_token_id
@@ -491,14 +501,14 @@ def test_mbart_dataset_truncation(tok_name):
        break  # No need to test every batch


-@pytest.mark.parametrize(["tok"], [pytest.param(T5_TINY), pytest.param(BART_TINY), param(MARIAN_TINY)])
-def test_summarization_dataset_truncation(tok):
+@pytest.mark.parametrize(["tok"], [pytest.param(BART_TINY), pytest.param("bert-base-cased")])
+def test_legacy_dataset_truncation(tok):
    tokenizer = AutoTokenizer.from_pretrained(tok)
    tmp_dir = make_test_data_dir()
    max_len_source = max(len(tokenizer.encode(a)) for a in ARTICLES)
    max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
    trunc_target = 4
-    train_dataset = Seq2SeqDataset(
+    train_dataset = LegacySeq2SeqDataset(
        tokenizer,
        data_dir=tmp_dir,
        type_path="train",
@@ -512,6 +522,6 @@ def test_summarization_dataset_truncation(tok):
        assert batch["input_ids"].shape[1] == max_len_source
        assert 20 >= batch["input_ids"].shape[1]  # trimmed significantly
        # show that targets were truncated
-        assert batch["decoder_input_ids"].shape[1] == trunc_target  # Truncated
+        assert batch["labels"].shape[1] == trunc_target  # Truncated
        assert max_len_target > trunc_target  # Truncated
        break  # No need to test every batch
--- a/examples/seq2seq/utils.py
+++ b/examples/seq2seq/utils.py
@@ -3,7 +3,6 @@ import json
 import linecache
 import os
 import pickle
-import warnings
 from logging import getLogger
 from pathlib import Path
 from typing import Callable, Dict, Iterable, List
@@ -41,6 +40,7 @@ def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=-100):


 def encode_line(tokenizer, line, max_length, pad_to_max_length=True, return_tensors="pt"):
+    """Only used by LegacyDataset"""
    extra_kw = {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {}
    return tokenizer(
        [line],
@@ -75,7 +75,7 @@ def trim_batch(
        return (input_ids[:, keep_column_mask], attention_mask[:, keep_column_mask])


-class Seq2SeqDataset(Dataset):
+class AbstractSeq2SeqDataset(Dataset):
    def __init__(
        self,
        tokenizer,
@@ -102,11 +102,28 @@ class Seq2SeqDataset(Dataset):
        self.pad_token_id = self.tokenizer.pad_token_id
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
+        self.add_prefix_space = isinstance(self.tokenizer, BartTokenizer)

    def __len__(self):
        return len(self.src_lens)

+    @staticmethod
+    def get_char_lens(data_file):
+        return [len(x) for x in Path(data_file).open().readlines()]
+
+    def make_sortish_sampler(self, batch_size):
+        return SortishSampler(self.src_lens, batch_size)
+
+    def __getitem__(self, item):
+        raise NotImplementedError("You must implement this")
+
+    def collate_fn(self, batch):
+        raise NotImplementedError("You must implement this")
+
+
+class LegacySeq2SeqDataset(AbstractSeq2SeqDataset):
    def __getitem__(self, index) -> Dict[str, torch.Tensor]:
+        """Call tokenizer on src and tgt_lines"""
        index = index + 1  # linecache starts at 1
        source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
        tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n")
@@ -121,42 +138,27 @@ class Seq2SeqDataset(Dataset):
        return {
            "input_ids": source_ids,
            "attention_mask": src_mask,
-            "decoder_input_ids": target_ids,
+            "labels": target_ids,
        }

-    @staticmethod
-    def get_char_lens(data_file):
-        return [len(x) for x in Path(data_file).open().readlines()]
-
    def collate_fn(self, batch) -> Dict[str, torch.Tensor]:
        input_ids = torch.stack([x["input_ids"] for x in batch])
        masks = torch.stack([x["attention_mask"] for x in batch])
-        target_ids = torch.stack([x["decoder_input_ids"] for x in batch])
+        target_ids = torch.stack([x["labels"] for x in batch])
        pad_token_id = self.pad_token_id
        y = trim_batch(target_ids, pad_token_id)
        source_ids, source_mask = trim_batch(input_ids, pad_token_id, attention_mask=masks)
        batch = {
            "input_ids": source_ids,
            "attention_mask": source_mask,
-            "decoder_input_ids": y,
+            "labels": y,
        }
        return batch

-    def make_sortish_sampler(self, batch_size):
-        return SortishSampler(self.src_lens, batch_size)

-
-class TranslationDataset(Seq2SeqDataset):
+class Seq2SeqDataset(AbstractSeq2SeqDataset):
    """A dataset that calls prepare_seq2seq_batch."""

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        if self.max_source_length != self.max_target_length:
-            warnings.warn(
-                f"Mbart is using sequence lengths {self.max_source_length}, {self.max_target_length}. "
-                f"Imbalanced sequence lengths may be undesired for translation tasks"
-            )
-
    def __getitem__(self, index) -> Dict[str, str]:
        index = index + 1  # linecache starts at 1
        source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
@@ -169,6 +171,7 @@ class TranslationDataset(Seq2SeqDataset):
        }

    def collate_fn(self, batch) -> Dict[str, torch.Tensor]:
+        """Call prepare_seq2seq_batch."""
        batch_encoding = self.tokenizer.prepare_seq2seq_batch(
            [x["src_texts"] for x in batch],
            src_lang=self.src_lang,
@@ -176,6 +179,8 @@ class TranslationDataset(Seq2SeqDataset):
            tgt_lang=self.tgt_lang,
            max_length=self.max_source_length,
            max_target_length=self.max_target_length,
+            return_tensors="pt",
+            add_prefix_space=self.add_prefix_space,
        )
        return batch_encoding.data

@@ -276,7 +281,11 @@ def calculate_rouge(output_lns: List[str], reference_lns: List[str], use_stemmer
    return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}


+# Utilities for freezing parameters and checking whether they are frozen
+
+
 def freeze_params(model: nn.Module):
+    """Set requires_grad=False for each of model.parameters()"""
    for par in model.parameters():
        par.requires_grad = False