PegasusForConditionalGeneration (torch version) (#6340)

Co-authored-by: Jingqing Zhang <jingqing.zhang15@imperial.ac.uk>
2020-08-11 14:31:23 -04:00
parent f6cb0f806e
commit 66fa8ceaea
20 changed files with 860 additions and 20 deletions
--- a/examples/seq2seq/distillation.py
+++ b/examples/seq2seq/distillation.py
@@ -413,6 +413,18 @@ def get_layers_to_copy(n_to_get, tot):
            12: all_layers,
        }
        return layers_to_copy[n_to_get]
+    elif tot == 16:
+        layers_to_copy = {  # maps  num layers in student -> which teacher layers to copy
+            1: [0],
+            2: [0, 8],
+            3: [0, 8, 15],
+            4: [0, 5, 10, 15],
+            6: [0, 3, 6, 9, 12, 15],
+            8: [0, 2, 4, 6, 8, 10, 12, 15],
+            9: [0, 1, 3, 5, 7, 9, 11, 13, 15],
+            16: all_layers,
+        }
+        return layers_to_copy[n_to_get]
    else:
        return all_layers[:n_to_get]  # TODO: better version on theseus-bart branch

--- a/examples/seq2seq/finetune_pegasus_xsum.sh
+++ b/examples/seq2seq/finetune_pegasus_xsum.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+export PYTHONPATH="../":"${PYTHONPATH}"
+
+# Hyperparameters from appendix C of the Pegasus paper https://arxiv.org/abs/1912.08777; extra CLI args are forwarded to finetune.py.
+# Set --gradient_accumulation_steps so that the effective batch size is 256 (2*128, 4*64, 8*32, 16*16)
+python finetune.py \
+    --learning_rate=1e-4 \
+    --do_train \
+    --do_predict \
+    --n_val 1000 \
+    --val_check_interval 0.25 \
+    --max_source_length 512 --max_target_length 56 \
+    --freeze_embeds --label_smoothing 0.1 \
+    "$@"