[examples] SummarizationModule improvements (#4951)

2020-06-17 13:51:34 -04:00
parent cd40f6564e
commit 043f9f51f9
15 changed files with 1465 additions and 348 deletions
--- a/examples/summarization/test_summarization_examples.py
+++ b/examples/summarization/test_summarization_examples.py
@@ -7,28 +7,40 @@ import unittest
 from pathlib import Path
 from unittest.mock import patch

+import torch
 from torch.utils.data import DataLoader

 from transformers import BartTokenizer

-from .evaluate_cnn import run_generate
+from .distillation import distill_main, evaluate_checkpoint
 from .finetune import main
-from .utils import SummarizationDataset
+from .run_eval import generate_summaries, run_generate
+from .utils import SummarizationDataset, lmap, pickle_load


 logging.basicConfig(level=logging.DEBUG)

 logger = logging.getLogger()
-
-DEFAULT_ARGS = {
+FP16_EVER = False
+CHEAP_ARGS = {
+    "logger": "default",
+    "alpha_hid": 0,
+    "freeze_embeds": True,
+    "enc_only": False,
+    "tgt_suffix": "",
+    "resume_from_checkpoint": None,
+    "sortish_sampler": True,
+    "student_decoder_layers": 1,
+    "val_check_interval": 1.0,
    "output_dir": "",
    "fp16": False,
+    "no_teacher": False,
    "fp16_opt_level": "O1",
-    "n_gpu": 1,
+    "gpus": 1 if torch.cuda.is_available() else 0,
    "n_tpu_cores": 0,
    "max_grad_norm": 1.0,
    "do_train": True,
-    "do_predict": False,
+    "do_predict": True,
    "gradient_accumulation_steps": 1,
    "server_ip": "",
    "server_port": "",
@@ -36,7 +48,7 @@ DEFAULT_ARGS = {
    "model_type": "bart",
    "model_name_or_path": "sshleifer/bart-tiny-random",
    "config_name": "",
-    "tokenizer_name": "",
+    "tokenizer_name": "facebook/bart-large",
    "cache_dir": "",
    "do_lower_case": False,
    "learning_rate": 3e-05,
@@ -48,6 +60,17 @@ DEFAULT_ARGS = {
    "eval_batch_size": 2,
    "max_source_length": 12,
    "max_target_length": 12,
+    "val_max_target_length": 12,
+    "test_max_target_length": 12,
+    "fast_dev_run": False,
+    "no_cache": False,
+    "n_train": -1,
+    "n_val": -1,
+    "n_test": -1,
+    "student_encoder_layers": 1,
+    "alpha_loss_encoder": 0.0,
+    "freeze_encoder": False,
+    "auto_scale_batch_size": False,
 }


@@ -56,6 +79,9 @@ def _dump_articles(path: Path, articles: list):
        f.write("\n".join(articles))


+BDIR = Path("~/transformers_fork/examples/summarization/bart/").absolute()
+
+
 def make_test_data_dir():
    tmp_dir = Path(tempfile.gettempdir())
    articles = [" Sam ate lunch today", "Sams lunch ingredients"]
@@ -66,6 +92,169 @@ def make_test_data_dir():
    return tmp_dir


+@unittest.skip("These wont' pass until hidden_states kwarg is merged.")
+class TestSummarizationDistiller(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        logging.disable(logging.CRITICAL)  # remove noisy download output from tracebacks
+        return cls
+
+    @unittest.skipUnless(torch.cuda.device_count() > 1, "skipping multiGPU test")
+    def test_bdc_multigpu(self):
+        updates = dict(
+            student_encoder_layers=2,
+            student_decoder_layers=1,
+            no_teacher=True,
+            freeze_encoder=True,
+            gpus=2,
+            sortish_sampler=False,
+        )
+        self._bart_distiller_cli(updates)
+
+    @unittest.skipUnless(torch.cuda.is_available(), "skipping fp16 test")
+    def test_bdc_fp16(self):
+        updates = dict(
+            student_encoder_layers=2,
+            student_decoder_layers=1,
+            alpha_hid=3.0,
+            freeze_encoder=True,
+            gpus=1,
+            fp16=FP16_EVER,
+            fp16_opt_level="O1",
+        )
+        self._bart_distiller_cli(updates)
+
+    @unittest.skipUnless(torch.cuda.is_available(), "skipping fp16 test")
+    def test_bdc_t5_eval_fp16(self):
+        updates = dict(
+            fp16=FP16_EVER,
+            gpus=1,
+            model_type="t5",
+            model_name_or_path="patrickvonplaten/t5-tiny-random",
+            do_train=False,
+            do_predict=True,
+            tokenizer_name=None,
+            no_teacher=True,
+        )
+        self._bart_distiller_cli(updates, check_contents=False)
+
+    @unittest.skipUnless(torch.cuda.is_available(), "skipping fp16 test")
+    def test_bdc_t5_train_fp16(self):
+        updates = dict(
+            fp16=FP16_EVER,
+            gpus=1,
+            model_type="t5",
+            model_name_or_path="patrickvonplaten/t5-tiny-random",
+            do_train=True,
+            do_predict=True,
+            tokenizer_name="patrickvonplaten/t5-tiny-random",
+            no_teacher=True,
+        )
+        self._bart_distiller_cli(updates)
+
+    def test_bdc_no_teacher(self):
+        updates = dict(student_encoder_layers=2, student_decoder_layers=1, no_teacher=True,)
+        self._bart_distiller_cli(updates)
+
+    def test_bdc_yes_teacher(self):
+        updates = dict(student_encoder_layers=2, student_decoder_layers=1,)
+        self._bart_distiller_cli(updates)
+
+    def test_bdc_checkpointing(self):
+
+        updates = dict(
+            student_encoder_layers=2,
+            student_decoder_layers=1,
+            num_train_epochs=4,
+            val_check_interval=0.25,
+            alpha_hid=2.0,
+        )
+        model = self._bart_distiller_cli(updates, check_contents=False)
+
+        ckpts = list(Path(model.output_dir).glob("*.ckpt"))
+        self.assertEqual(1, len(ckpts))
+        transformer_ckpts = list(Path(model.output_dir).glob("**/*.bin"))
+        self.assertEqual(len(transformer_ckpts), len(ckpts))
+        new_transformer_ckpts = list(Path(model.output_dir).glob("**/*.bin"))
+        self.assertEqual(len(new_transformer_ckpts), 1)
+        examples = lmap(str.strip, model.hparams.data_dir.joinpath("test.source").open().readlines())
+        out_path = tempfile.mktemp()
+        generate_summaries(examples, out_path, new_transformer_ckpts[0].parent)
+        self.assertTrue(Path(out_path).exists())
+
+        evaluate_checkpoint(ckpts[0], dest_dir=Path(tempfile.mkdtemp()))
+
+    def test_bdc_t5(self):
+        updates = dict(
+            student_encoder_layers=1,
+            student_decoder_layers=1,
+            alpha_hid=2.0,
+            teacher="patrickvonplaten/t5-tiny-random",
+            model_type="t5",
+            model_name_or_path="patrickvonplaten/t5-tiny-random",
+            tokenizer_name="patrickvonplaten/t5-tiny-random",
+        )
+        self._bart_distiller_cli(updates)
+
+    def test_bdc_t5_eval(self):
+        updates = dict(
+            model_type="t5",
+            model_name_or_path="patrickvonplaten/t5-tiny-random",
+            do_train=False,
+            do_predict=True,
+            tokenizer_name="patrickvonplaten/t5-tiny-random",
+            no_teacher=True,
+        )
+        self._bart_distiller_cli(updates, check_contents=False)
+
+    def _bart_distiller_cli(self, updates, check_contents=True):
+        default_updates = dict(
+            model_type="bart",
+            train_batch_size=1,
+            eval_batch_size=2,
+            num_train_epochs=2,
+            alpha_mlm=0.2,
+            alpha_ce=0.8,
+            do_predict=True,
+            gpus=1 if torch.cuda.is_available() else 0,
+            model_name_or_path="sshleifer/tinier_bart",
+            teacher=CHEAP_ARGS["model_name_or_path"],
+            val_check_interval=0.5,
+            alpha_encoder_loss=0.4,
+        )
+        default_updates.update(updates)
+        args_d: dict = CHEAP_ARGS.copy()
+        tmp_dir = make_test_data_dir()
+        output_dir = tempfile.mkdtemp(prefix="output_")
+
+        args_d.update(data_dir=tmp_dir, output_dir=output_dir, **default_updates)
+        model = distill_main(argparse.Namespace(**args_d))
+        if not check_contents:
+            return model
+        contents = os.listdir(output_dir)
+        ckpt_name = "val_avg_rouge2=0.0000-step_count=2.ckpt"  # "val_avg_rouge2=0.0000-epoch=1.ckpt"  # "epoch=1-val_avg_rouge2=0.0000.ckpt"
+        contents = {os.path.basename(p) for p in contents}
+        self.assertIn(ckpt_name, contents)
+        self.assertIn("metrics.pkl", contents)
+        self.assertIn("test_generations.txt", contents)
+        self.assertIn("val_generations_1.txt", contents)
+        self.assertIn("val_1_results.txt", contents)
+        self.assertIn("test_results.txt", contents)
+        # self.assertEqual(len(contents), 15)
+
+        metrics = pickle_load(Path(output_dir) / "metrics.pkl")
+        import pandas as pd
+
+        val_df = pd.DataFrame(metrics["val"])
+        train_df = pd.DataFrame(metrics["train"])
+        test_df = pd.DataFrame(metrics["test"])
+        desired_n_evals = args_d["num_train_epochs"] * 2 + 1
+        self.assertEqual(val_df.shape[0], desired_n_evals)  #
+        self.assertEqual(test_df.shape[1], val_df.shape[1])
+        self.assertEqual(train_df.shape[0], 0)
+        return model
+
+
 class TestBartExamples(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
@@ -79,49 +268,31 @@ class TestBartExamples(unittest.TestCase):
        output_file_name = Path(tempfile.gettempdir()) / "utest_output_bart_sum.hypo"
        articles = [" New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County."]
        _dump_articles(tmp, articles)
-        testargs = ["evaluate_cnn.py", str(tmp), str(output_file_name), "sshleifer/bart-tiny-random"]
+        testargs = ["run_eval.py", str(tmp), str(output_file_name), "sshleifer/bart-tiny-random"]
        with patch.object(sys, "argv", testargs):
            run_generate()
            self.assertTrue(Path(output_file_name).exists())
            os.remove(Path(output_file_name))

-    def test_bart_run_sum_cli(self):
-        args_d: dict = DEFAULT_ARGS.copy()
-        tmp_dir = make_test_data_dir()
-        output_dir = tempfile.mkdtemp(prefix="output_")
-        args_d.update(
-            data_dir=tmp_dir, model_type="bart", train_batch_size=2, eval_batch_size=2, n_gpu=0, output_dir=output_dir,
-        )
-        main(argparse.Namespace(**args_d))
-        args_d.update({"do_train": False, "do_predict": True})
-
-        main(argparse.Namespace(**args_d))
-        contents = os.listdir(output_dir)
-        expected_contents = {
-            "checkpointepoch=0.ckpt",
-            "test_results.txt",
-        }
-        created_files = {os.path.basename(p) for p in contents}
-        self.assertSetEqual(expected_contents, created_files)
-
    def test_t5_run_sum_cli(self):
-        args_d: dict = DEFAULT_ARGS.copy()
+        args_d: dict = CHEAP_ARGS.copy()
+
        tmp_dir = make_test_data_dir()
        output_dir = tempfile.mkdtemp(prefix="output_")
        args_d.update(
            data_dir=tmp_dir,
            model_type="t5",
            model_name_or_path="patrickvonplaten/t5-tiny-random",
+            tokenizer_name=None,  # "patrickvonplaten/t5-tiny-random",
            train_batch_size=2,
            eval_batch_size=2,
-            n_gpu=0,
+            gpus=0,
            output_dir=output_dir,
            do_predict=True,
        )
-        main(argparse.Namespace(**args_d))
-
-        # args_d.update({"do_train": False, "do_predict": True})
-        # main(argparse.Namespace(**args_d))
+        assert "n_train" in args_d
+        args = argparse.Namespace(**args_d)
+        main(args)

    def test_bart_summarization_dataset(self):
        tmp_dir = Path(tempfile.gettempdir())
@@ -138,42 +309,16 @@ class TestBartExamples(unittest.TestCase):
        )
        dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=train_dataset.collate_fn)
        for batch in dataloader:
-            self.assertEqual(batch["source_mask"].shape, batch["source_ids"].shape)
+            self.assertEqual(batch["attention_mask"].shape, batch["input_ids"].shape)
            # show that articles were trimmed.
-            self.assertEqual(batch["source_ids"].shape[1], max_len_source)
-            self.assertGreater(20, batch["source_ids"].shape[1])  # trimmed significantly
+            self.assertEqual(batch["input_ids"].shape[1], max_len_source)
+            self.assertGreater(20, batch["input_ids"].shape[1])  # trimmed significantly

            # show that targets were truncated
-            self.assertEqual(batch["target_ids"].shape[1], trunc_target)  # Truncated
+            self.assertEqual(batch["decoder_input_ids"].shape[1], trunc_target)  # Truncated
            self.assertGreater(max_len_target, trunc_target)  # Truncated


-class TestT5Examples(unittest.TestCase):
-    def test_t5_cli(self):
-        output_file_name = "output_t5_sum.txt"
-        score_file_name = "score_t5_sum.txt"
-        articles = ["New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County."]
-        stream_handler = logging.StreamHandler(sys.stdout)
-        logger.addHandler(stream_handler)
-        tmp = Path(tempfile.gettempdir()) / "utest_generations_t5_sum.hypo"
-        with tmp.open("w", encoding="utf-8") as f:
-            f.write("\n".join(articles))
-
-        output_file_name = Path(tempfile.gettempdir()) / "utest_output_t5_sum.hypo"
-        score_file_name = Path(tempfile.gettempdir()) / "utest_score_t5_sum.hypo"
-
-        testargs = [
-            "evaluate_cnn.py",
-            str(tmp),
-            str(output_file_name),
-            "patrickvonplaten/t5-tiny-random",
-            "--reference_path",
-            str(tmp),
-            "--score_path",
-            str(score_file_name),
-        ]
-
-        with patch.object(sys, "argv", testargs):
-            run_generate()
-            self.assertTrue(Path(output_file_name).exists())
-            self.assertTrue(Path(score_file_name).exists())
+def list_to_text_file(lst, path):
+    dest = Path(path)
+    dest.open("w+").writelines(lst)