From e2bb9abb6ad20e695686dc410ebd398a30cc1943 Mon Sep 17 00:00:00 2001
From: Sam Shleifer <sshleifer@gmail.com>
Date: Wed, 7 Oct 2020 11:20:44 -0400
Subject: [PATCH] [s2s] release pseudolabel links and instructions (#7639)

---
 examples/seq2seq/make_student.py              | 10 +++++---
 examples/seq2seq/precomputed_pseudo_labels.md | 24 +++++++++++++++++++
 2 files changed, 31 insertions(+), 3 deletions(-)
 create mode 100644 examples/seq2seq/precomputed_pseudo_labels.md

diff --git a/examples/seq2seq/make_student.py b/examples/seq2seq/make_student.py
index 32328363fe..2ccff5efde 100644
--- a/examples/seq2seq/make_student.py
+++ b/examples/seq2seq/make_student.py
@@ -13,7 +13,7 @@ logger = logging.get_logger(__name__)
 
 
 def copy_layers(src_layers: nn.ModuleList, dest_layers: nn.ModuleList, layers_to_copy: List[int]) -> None:
-    layers_to_copy = nn.ModuleList([l for i, l in enumerate(src_layers) if i in layers_to_copy])
+    layers_to_copy = nn.ModuleList([src_layers[i] for i in layers_to_copy])
     assert len(dest_layers) == len(layers_to_copy), f"{len(dest_layers)} != {len(layers_to_copy)}"
     dest_layers.load_state_dict(layers_to_copy.state_dict())
 
@@ -81,6 +81,8 @@ def create_student_by_copying_alternating_layers(
     e: Union[int, None] = None,
     d: Union[int, None] = None,
     copy_first_teacher_layers=False,
+    e_layers_to_copy=None,
+    d_layers_to_copy=None,
     **extra_config_kwargs
 ) -> Tuple[PreTrainedModel, List[int], List[int]]:
     """Make a student by copying alternating layers from a teacher, save it to save_path.
@@ -142,8 +144,10 @@ def create_student_by_copying_alternating_layers(
         return student, e_layers_to_copy, d_layers_to_copy
 
     # Decide which layers of the teacher to copy. Not exactly alternating -- we try to keep first and last layer.
-    e_layers_to_copy: List[int] = pick_layers_to_copy(e, teacher_e)
-    d_layers_to_copy: List[int] = pick_layers_to_copy(d, teacher_d)
+    if e_layers_to_copy is None:
+        e_layers_to_copy: List[int] = pick_layers_to_copy(e, teacher_e)
+    if d_layers_to_copy is None:
+        d_layers_to_copy: List[int] = pick_layers_to_copy(d, teacher_d)
 
     try:
         copy_layers(teacher.model.encoder.layers, student.model.encoder.layers, e_layers_to_copy)
diff --git a/examples/seq2seq/precomputed_pseudo_labels.md b/examples/seq2seq/precomputed_pseudo_labels.md
new file mode 100644
index 0000000000..aa1b786ab4
--- /dev/null
+++ b/examples/seq2seq/precomputed_pseudo_labels.md
@@ -0,0 +1,24 @@
+### Precomputed pseudolabels
++ Decompress with `tar -xzvf`. The produced directory name may differ from the filename.
+
+| Dataset | Model                       | Rouge Scores       | Notes                                                                                                       | Link                                                                                   |
+|---------|-----------------------------|--------------------|-------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------|
+| XSUM    | facebook/bart-large-xsum    | 49.8/28.0/42.5     |                                                                                                             | [download](https://s3.amazonaws.com/datasets.huggingface.co/pseudo/xsum/bart_xsum_pl.tgz)          |
+| XSUM    | google/pegasus-xsum         | 53.3/32.7/46.5     |                                                                                                             | [download](https://s3.amazonaws.com/datasets.huggingface.co/pseudo/xsum/pegasus_xsum.tgz)          |
+| XSUM    | facebook/bart-large-xsum    | ?                  | Bart pseudolabels filtered to those with Rouge2 > 10.0 w GT                                                 | [download](https://s3.amazonaws.com/datasets.huggingface.co/pseudo/xsum/xsum_pl2_bart.tgz)         |
+|         |                             |                    |                                                                                                             | [download](https://s3.amazonaws.com/datasets.huggingface.co/pseudo/xsum/pegasus_xsum_on_cnn.tgz)   |
+| CNN/DM  | sshleifer/pegasus-cnn-ft-v2 | 47.316/26.65/44.56 | do not worry about the fact that train.source is one line shorter.                                          | [download](https://s3.amazonaws.com/datasets.huggingface.co/pseudo/cnn_dm/pegasus_cnn_cnn_pls.tgz) |
+| CNN/DM  | facebook/bart-large-cnn     |                    | 5K (2%) are missing, there should be 282173                                                                 | [download](https://s3.amazonaws.com/datasets.huggingface.co/pseudo/cnn_dm/cnn_bart_pl.tgz)         |
+| CNN/DM  | google/pegasus-xsum         | 21.5/6.76/25       | extra labels for xsum distillation. Used max_source_length=512 (and all other pegasus-xsum configuration).  | [download](https://s3.amazonaws.com/datasets.huggingface.co/pseudo/cnn_dm/pegasus_xsum_on_cnn.tgz) |
+| EN-RO   | Helsinki-NLP/opus-mt-en-ro  |       |  | [download](https://s3.amazonaws.com/datasets.huggingface.co/pseudo/wmt_en_ro/opus_mt_en_ro.tgz) |
+| EN-RO   | facebook/mbart-large-en-ro  |       |  | [download](https://s3.amazonaws.com/datasets.huggingface.co/pseudo/wmt_en_ro/mbart_large_en_ro.tgz) |
+
+### Generating Pseudolabels
++ These commands take a while to run. For example, pegasus_cnn_cnn_pls.tgz took 8 hours on 8 GPUs.
++ Pegasus does not work in fp16 :(. Bart, mBART and Marian do.
+
+```
+python -m torch.distributed.launch --nproc_per_node=8 run_distributed_eval.py \
+    --model_name facebook/bart-large-xsum --save_dir bart_xsum_pl --data_dir xsum \
+    --fp16 --bs 32 --sync_timeout 60000 --max_source_length 1024
+```