From 411c582109a5b62a5a8e42969ba08af4c4c66e82 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Fri, 22 Jan 2021 10:03:57 -0500
Subject: [PATCH] Fixes to run_seq2seq and instructions (#9734)

* Fixes to run_seq2seq and instructions

* Add more defaults for summarization
---
 examples/seq2seq/README.md      | 88 ++++++++++++++++++++++++++++++++-
 examples/seq2seq/run_seq2seq.py | 29 ++++++++++-
 2 files changed, 113 insertions(+), 4 deletions(-)

diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md
index 5805e8a19c..00b7394ba8 100644
--- a/examples/seq2seq/README.md
+++ b/examples/seq2seq/README.md
@@ -22,14 +22,98 @@ For deprecated `bertabs` instructions, see [`bertabs/README.md`](https://github.
 
 ### Supported Architectures
 
-- `BartForConditionalGeneration` (and anything that inherits from it)
+- `BartForConditionalGeneration`
 - `MarianMTModel`
 - `PegasusForConditionalGeneration`
 - `MBartForConditionalGeneration`
 - `FSMTForConditionalGeneration`
 - `T5ForConditionalGeneration`
 
-## Datasets
+This directory is in a bit of messy state and is undergoing some cleaning, please bare with us in the meantime :-) Here are the instructions to use the new and old scripts for fine-tuning sequence-to-sequence models.
+
+## New script
+
+The new script for fine-tuning a model on a summarization or translation task is `run_seq2seq.py`. It is a lightweight example of how to download and preprocess a dataset from the [🤗 Datasets](https://github.com/huggingface/datasets) library or use your own files (json or csv), then fine-tune one of the architectures above on it.
+
+Here is an example on a summarization task:
+```bash
+python examples/seq2seq/run_seq2seq.py \
+    --model_name_or_path t5-small \
+    --do_train \
+    --do_eval \
+    --task summarization \
+    --dataset_name xsum \
+    --output_dir ~/tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --predict_with_generate
+```
+
+And here is how you would use it on your own files (replace `path_to_csv_or_json_file`, `text_column_name` and `summary_column_name` by the relevant values):
+```bash
+python examples/seq2seq/run_seq2seq.py \
+    -model_name_or_path t5-small \
+    --do_train \
+    --do_eval \
+    --task summarization \
+    --train_file path_to_csv_or_json_file \
+    --validation_file path_to_csv_or_json_file \
+    --output_dir ~/tmp/tst-summarization \
+    --overwrite_output_dir \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --predict_with_generate \
+    --text_column text_column_name \
+    --summary_column summary_column_name 
+```
+The training and validation files should have a column for the inputs texts and a column for the summaries.
+
+Here is an example of a translation fine-tuning:
+```bash
+python examples/seq2seq/run_seq2seq.py \
+    --model_name_or_path sshleifer/student_marian_en_ro_6_1 \
+    --do_train \
+    --do_eval \
+    --task translation_en_to_ro \
+    --dataset_name wmt16 \
+    --dataset_config_name ro-en \
+    --source_lang en-XX \
+    --target_lang ro-RO\
+    --output_dir ~/tmp/tst-translation \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --predict_with_generate
+```
+
+And here is how you would use it on your own files (replace `path_to_json_file`, by the relevant values):
+```bash
+python examples/seq2seq/run_seq2seq.py \
+    --model_name_or_path sshleifer/student_marian_en_ro_6_1 \
+    --do_train \
+    --do_eval \
+    --task translation_en_to_ro \
+    --dataset_name wmt16 \
+    --dataset_config_name ro-en \
+    --source_lang en-XX \
+    --target_lang ro-RO\
+    --train_file path_to_json_file \
+    --validation_file path_to_json_file \
+    --output_dir ~/tmp/tst-translation \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --predict_with_generate
+```
+Here the files are expected to be JSON files, with each input being a dictionary with a key `"translation"` containing one key per language (here `"en"` and `"ro"`).
+
+## Old script
+
+The new script is very new and hasn't been widely tested yet. It also misses a few functionality offered by the old
+script, which is why we are leaving the old script here for now.
+
+### Downlowd the Datasets
 
 #### XSUM
 
diff --git a/examples/seq2seq/run_seq2seq.py b/examples/seq2seq/run_seq2seq.py
index 4968670096..b5a04fdb73 100644
--- a/examples/seq2seq/run_seq2seq.py
+++ b/examples/seq2seq/run_seq2seq.py
@@ -136,10 +136,10 @@ class DataTrainingArguments:
         },
     )
     val_max_target_length: Optional[int] = field(
-        default=142,
+        default=None,
         metadata={
             "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer "
-            "than this will be truncated, sequences shorter will be padded. "
+            "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
             "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
             "during ``evaluate`` and ``predict``."
         },
@@ -175,6 +175,9 @@ class DataTrainingArguments:
             "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
         },
     )
+    source_prefix: Optional[str] = field(
+        default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
+    )
 
     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -190,11 +193,22 @@ class DataTrainingArguments:
             raise ValueError(
                 "`task` should be summarization, summarization_{dataset}, translation or translation_{xx}_to_{yy}."
             )
+        if self.val_max_target_length is None:
+            self.val_max_target_length = self.max_target_length
 
 
 summarization_name_mapping = {
+    "amazon_reviews_multi": ("review_body", "review_title"),
+    "big_patent": ("description", "abstract"),
     "cnn_dailymail": ("article", "highlights"),
+    "orange_sum": ("text", "summary"),
+    "pn_summary": ("article", "summary"),
+    "psc": ("extract_text", "summary_text"),
+    "samsum": ("dialogue", "summary"),
+    "thaisum": ("body", "summary"),
+    "xglue": ("news_body", "news_title"),
     "xsum": ("document", "summary"),
+    "wiki_summary": ("article", "highlights"),
 }
 
 
@@ -302,6 +316,16 @@ def main():
     if model.config.decoder_start_token_id is None:
         raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
 
+    # Get the default prefix if None is passed.
+    if data_args.source_prefix is None:
+        task_specific_params = model.config.task_specific_params
+        if task_specific_params is not None:
+            prefix = task_specific_params.get("prefix", "")
+        else:
+            prefix = ""
+    else:
+        prefix = data_args.source_prefix
+
     # Preprocessing the datasets.
     # We need to tokenize inputs and targets.
     if training_args.do_train:
@@ -362,6 +386,7 @@ def main():
         else:
             inputs = examples[text_column]
             targets = examples[summary_column]
+        inputs = [prefix + inp for inp in inputs]
         model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
 
         # Setup the tokenizer for targets