From ccb6947dc1a5ddc9e1e6c3dc7f010385ed92c2b6 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Jul 2019 17:39:47 +0200
Subject: [PATCH] optimization tests

---
 examples/run_glue.py                          | 29 +++++---
 examples/test_examples.py                     | 15 +++-
 pytorch_transformers/optimization.py          | 23 +++---
 .../tests/optimization_test.py                | 73 +++++++++++++------
 4 files changed, 91 insertions(+), 49 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index aaf9a9876c..c3dffb4fdb 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -96,8 +96,10 @@ def train(args, train_dataset, model, tokenizer):
     global_step = 0
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
-    for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
-        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
             model.train()
             batch = tuple(t.to(args.device) for t in batch)
             inputs = {'input_ids':      batch[0],
@@ -129,7 +131,7 @@ def train(args, train_dataset, model, tokenizer):
 
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
-                    if args.local_rank == -1:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when running on a single GPU; otherwise metrics may not average well
                         results = evaluate(args, model, tokenizer)
                         for key, value in results.items():
                             tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
@@ -148,8 +150,10 @@ def train(args, train_dataset, model, tokenizer):
                     logger.info("Saving model checkpoint to %s", output_dir)
 
             if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
                 break
         if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
             break
 
     return global_step, tr_loss / global_step
@@ -164,11 +168,10 @@ def evaluate(args, model, tokenizer, prefix=""):
     for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
         eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
 
-        """ Evaluate the model """
         if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
             os.makedirs(eval_output_dir)
 
-        args.eval_batch_size = args.per_gpu_eval_batch_size * args.n_gpu
+        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
         # Note that DistributedSampler samples randomly
         eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
         eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
@@ -177,7 +180,7 @@ def evaluate(args, model, tokenizer, prefix=""):
         logger.info("***** Running evaluation {} *****".format(prefix))
         logger.info("  Num examples = %d", len(eval_dataset))
         logger.info("  Batch size = %d", args.eval_batch_size)
-        eval_loss = 0
+        eval_loss = 0.0
         nb_eval_steps = 0
         preds = None
         out_label_ids = None
@@ -287,6 +290,8 @@ def main():
                         help="Whether to run training.")
     parser.add_argument("--do_eval", action='store_true',
                         help="Whether to run eval on the dev set.")
+    parser.add_argument("--evaluate_during_training", action='store_true',
+                        help="Run evaluation during training at each logging step.")
     parser.add_argument("--do_lower_case", action='store_true',
                         help="Set this flag if you are using an uncased model.")
 
@@ -364,7 +369,7 @@ def main():
                         datefmt = '%m/%d/%Y %H:%M:%S',
                         level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
     logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
 
     # Setup seeds
     random.seed(args.seed)
@@ -409,6 +414,8 @@ def main():
     elif args.n_gpu > 1:
         model = torch.nn.DataParallel(model)
 
+    logger.info("Training/evaluation parameters %s", args)
+
     # Training
     if args.do_train:
         train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
@@ -438,22 +445,22 @@ def main():
         model.to(args.device)
 
     # Evaluation
+    results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
-        checkpoints = [args.output_dir + './' + WEIGHTS_NAME]
+        checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
             checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
             logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        results = {}
         for checkpoint in checkpoints:
-            global_step = int(checkpoint.split('-')[-1])
+            global_step = checkpoint.split('-')[-1]
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
             result = evaluate(args, model, tokenizer, prefix=global_step)
             result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
             results.update(result)
 
-        return results
+    return results
 
 
 if __name__ == "__main__":
diff --git a/examples/test_examples.py b/examples/test_examples.py
index 56c30efae4..dec59358b8 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -45,9 +45,18 @@ class ExamplesTests(unittest.TestCase):
         stream_handler = logging.StreamHandler(sys.stdout)
         logger.addHandler(stream_handler)
 
-        testargs = ["run_glue.py", "--data_dir=./examples/tests_samples/MRPC/",
-                    "--task_name=mrpc", "--do_train", "--do_eval", "--output_dir=./examples/tests_samples/temp_dir",
-                    "--train_batch_size=4", "--eval_batch_size=2", "--num_train_epochs=2.0", "--overwrite_output_dir"]
+        testargs = ["run_glue.py",
+                    "--data_dir=./examples/tests_samples/MRPC/",
+                    "--task_name=mrpc",
+                    "--do_train",
+                    "--do_eval",
+                    "--output_dir=./examples/tests_samples/temp_dir",
+                    "--per_gpu_train_batch_size=2",
+                    "--per_gpu_eval_batch_size=1",
+                    "--learning_rate=1e-4",
+                    "--max_steps=10",
+                    "--warmup_steps=2",
+                    "--overwrite_output_dir"]
         model_name = "--model_name=bert-base-uncased"
         with patch.object(sys, 'argv', testargs + [model_name]):
             result = run_glue.main()
diff --git a/pytorch_transformers/optimization.py b/pytorch_transformers/optimization.py
index c78818dd74..8d224f1294 100644
--- a/pytorch_transformers/optimization.py
+++ b/pytorch_transformers/optimization.py
@@ -25,7 +25,7 @@ logger = logging.getLogger(__name__)
 
 class ConstantLRSchedule(LambdaLR):
     def __init__(self, optimizer, last_epoch=-1):
-        super(ConstantLRSchedule, self).__init__(optimizer, lambda x: x, last_epoch=last_epoch)
+        super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch)
 
 class WarmupCosineSchedule(LambdaLR):
     """
@@ -42,10 +42,10 @@ class WarmupCosineSchedule(LambdaLR):
 
         def lr_lambda(step):
             if step < warmup_steps:
-                return step / max(1, warmup_steps)
+                return float(step) / float(max(1.0, warmup_steps))
             else:
-                progress = (step - warmup_steps) / max(1, t_total - warmup_steps)   # progress after warmup
-                return 0.5 * (1. + math.cos(math.pi * cycles * 2 * progress))
+                progress = float(step - warmup_steps) / float(max(1, t_total - warmup_steps))   # progress after warmup
+                return 0.5 * (1. + math.cos(math.pi * float(cycles) * 2.0 * progress))
 
         super(WarmupCosineSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
@@ -59,11 +59,12 @@ class WarmupCosineWithHardRestartsSchedule(LambdaLR):
 
         def lr_lambda(step):
             if step < warmup_steps:
-                return step / max(1, warmup_steps)
+                return float(step) / float(max(1, warmup_steps))
             else:
-                progress = (step - warmup_steps) / max(1, t_total - warmup_steps)   # progress after warmup
-                ret = 0.5 * (1. + math.cos(math.pi * ((cycles * progress) % 1)))
-                return ret
+                progress = float(step - warmup_steps) / float(max(1, t_total - warmup_steps))   # progress after warmup
+                if progress >= 1.0:
+                    return 0.0
+                return 0.5 * (1. + math.cos(math.pi * ((float(cycles) * progress) % 1.0)))
 
         super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
@@ -77,7 +78,7 @@ class WarmupConstantSchedule(LambdaLR):
 
         def lr_lambda(step):
             if step < warmup_steps:
-                return step / warmup_steps
+                return float(step) / float(max(1.0, warmup_steps))
             return 1.
 
         super(WarmupConstantSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
@@ -92,8 +93,8 @@ class WarmupLinearSchedule(LambdaLR):
 
         def lr_lambda(step):
             if step < warmup_steps:
-                return step / max(1, warmup_steps)
-            return (t_total - step) / max(1, t_total - warmup_steps)
+                return float(step) / float(max(1, warmup_steps))
+            return float(t_total - step) / float(max(1.0, t_total - warmup_steps))
 
         super(WarmupLinearSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
diff --git a/pytorch_transformers/tests/optimization_test.py b/pytorch_transformers/tests/optimization_test.py
index 7a9c93048d..ef1a1b1d50 100644
--- a/pytorch_transformers/tests/optimization_test.py
+++ b/pytorch_transformers/tests/optimization_test.py
@@ -26,6 +26,13 @@ from pytorch_transformers import (AdamW, ConstantLRSchedule, WarmupConstantSched
 import numpy as np
 
 
+def unwrap_schedule(scheduler, num_steps=10):
+    lrs = []
+    for _ in range(num_steps):
+        scheduler.step()
+        lrs.append(scheduler.get_lr())
+    return lrs
+
 class OptimizationTest(unittest.TestCase):
 
     def assertListAlmostEqual(self, list1, list2, tol):
@@ -38,9 +45,7 @@ class OptimizationTest(unittest.TestCase):
         target = torch.tensor([0.4, 0.2, -0.5])
         criterion = torch.nn.MSELoss()
         # No warmup, constant schedule, no gradient clipping
-        optimizer = AdamW(params=[w], lr=2e-1,
-                                          weight_decay=0.0,
-                                          max_grad_norm=-1)
+        optimizer = AdamW(params=[w], lr=2e-1, weight_decay=0.0)
         for _ in range(100):
             loss = criterion(w, target)
             loss.backward()
@@ -51,29 +56,49 @@ class OptimizationTest(unittest.TestCase):
 
 
 class ScheduleInitTest(unittest.TestCase):
-    def test_sched_init(self):
-        m = torch.nn.Linear(50, 50)
-        optim = AdamW(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)
-        self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
-        optim = AdamW(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none")
-        self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
-        optim = AdamW(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
-        self.assertTrue(isinstance(optim.param_groups[0]["schedule"], WarmupLinearSchedule))
-        # shouldn't fail
+    m = torch.nn.Linear(50, 50)
+    optimizer = AdamW(m.parameters(), lr=10.)
+    num_steps = 10
 
+    def assertListAlmostEqual(self, list1, list2, tol):
+        self.assertEqual(len(list1), len(list2))
+        for a, b in zip(list1, list2):
+            self.assertAlmostEqual(a, b, delta=tol)
 
-class WarmupCosineWithRestartsTest(unittest.TestCase):
-    def test_it(self):
-        m = WarmupCosineWithWarmupRestartsSchedule(warmup=0.05, t_total=1000., cycles=5)
-        x = np.arange(0, 1000)
-        y = [m.get_lr(xe) for xe in x]
-        y = np.asarray(y)
-        expected_zeros = y[[0, 200, 400, 600, 800]]
-        print(expected_zeros)
-        expected_ones = y[[50, 250, 450, 650, 850]]
-        print(expected_ones)
-        self.assertTrue(np.allclose(expected_ones, 1))
-        self.assertTrue(np.allclose(expected_zeros, 0))
+    def test_constant_scheduler(self):
+        scheduler = ConstantLRSchedule(self.optimizer)
+        lrs = unwrap_schedule(scheduler, self.num_steps)
+        expected_learning_rates = [10.] * self.num_steps
+        self.assertEqual(len(lrs[0]), 1)
+        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
+
+    def test_warmup_constant_scheduler(self):
+        scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4)
+        lrs = unwrap_schedule(scheduler, self.num_steps)
+        expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0]
+        self.assertEqual(len(lrs[0]), 1)
+        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
+
+    def test_warmup_linear_scheduler(self):
+        scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
+        lrs = unwrap_schedule(scheduler, self.num_steps)
+        expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0]
+        self.assertEqual(len(lrs[0]), 1)
+        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
+
+    def test_warmup_cosine_scheduler(self):
+        scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10)
+        lrs = unwrap_schedule(scheduler, self.num_steps)
+        expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0]
+        self.assertEqual(len(lrs[0]), 1)
+        self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)
+
+    def test_warmup_cosine_hard_restart_scheduler(self):
+        scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10)
+        lrs = unwrap_schedule(scheduler, self.num_steps)
+        expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0]
+        self.assertEqual(len(lrs[0]), 1)
+        self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)
 
 
 if __name__ == "__main__":