diff --git a/README.md b/README.md
index fde35d23ea..b348fde28c 100644
--- a/README.md
+++ b/README.md
@@ -984,7 +984,10 @@ The optimizer accepts the following arguments:
 - `warmup` : portion of `t_total` for the warmup, `-1`  means no warmup. Default : `-1`
 - `t_total` : total number of training steps for the learning
     rate schedule, `-1`  means constant learning rate. Default : `-1`
-- `schedule` : schedule to use for the warmup (see above). Default : `'warmup_linear'`
+- `schedule` : schedule to use for the warmup (see above).
+    Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).
+    If `None` or `'none'`, learning rate is always kept constant.
+    Default : `'warmup_linear'`
 - `b1` : Adams b1. Default : `0.9`
 - `b2` : Adams b2. Default : `0.999`
 - `e` : Adams epsilon. Default : `1e-6`
@@ -998,6 +1001,32 @@ The differences with `BertAdam` is that `OpenAIGPTAdam` compensate for bias as i
 
 `OpenAIGPTAdam` accepts the same arguments as `BertAdam`.
 
+#### Learning Rate Schedules
+The `.optimization` module also provides additional schedules in the form of schedule objects that inherit from `_LRSchedule`.
+All `_LRSchedule` subclasses accept `warmup` and `t_total` arguments at construction.
+When an `_LRSchedule` object is passed into `BertAdam` or `OpenAIAdam`, 
+the `warmup` and `t_total` arguments on the optimizer are ignored and the ones in the `_LRSchedule` object are used. 
+An overview of the implemented schedules:
+- `ConstantLR`: always returns learning rate 1.
+- `WarmupConstantSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
+    Keeps learning rate equal to 1. after warmup.
+    ![](docs/imgs/warmup_constant_schedule.png)
+- `WarmupLinearSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
+    Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
+    ![](docs/imgs/warmup_linear_schedule.png)
+- `WarmupCosineSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
+    Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
+    If `cycles` (default=0.5) differs from the default, the learning rate after warmup follows `cycles` full periods of the cosine function instead of a single decay.
+    ![](docs/imgs/warmup_cosine_schedule.png)
+- `WarmupCosineWithHardRestartsSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
+    After warmup, the learning rate follows a cosine decay from 1. to 0. that is repeated `cycles` (default=1.) times, with a hard restart to 1. at the start of each repetition.
+    ![](docs/imgs/warmup_cosine_hard_restarts_schedule.png)
+- `WarmupCosineWithWarmupRestartsSchedule`: All training progress is divided into `cycles` (default=1.) parts of equal length.
+    In every part the learning rate increases linearly from 0. to 1. over the first `warmup` fraction of the training steps,
+    and then decreases from 1. to 0. following a cosine curve.
+    Note that the total fraction of warmup steps over all cycles together is equal to `warmup` * `cycles`.
+    ![](docs/imgs/warmup_cosine_warm_restarts_schedule.png)
+
 ## Examples
 
 | Sub-section | Description |
diff --git a/docs/imgs/warmup_constant_schedule.png b/docs/imgs/warmup_constant_schedule.png
new file mode 100644
index 0000000000..e2448e9f2c
Binary files /dev/null and b/docs/imgs/warmup_constant_schedule.png differ
diff --git a/docs/imgs/warmup_cosine_hard_restarts_schedule.png b/docs/imgs/warmup_cosine_hard_restarts_schedule.png
new file mode 100644
index 0000000000..be73605b9c
Binary files /dev/null and b/docs/imgs/warmup_cosine_hard_restarts_schedule.png differ
diff --git a/docs/imgs/warmup_cosine_schedule.png b/docs/imgs/warmup_cosine_schedule.png
new file mode 100644
index 0000000000..6d27926ab1
Binary files /dev/null and b/docs/imgs/warmup_cosine_schedule.png differ
diff --git a/docs/imgs/warmup_cosine_warm_restarts_schedule.png b/docs/imgs/warmup_cosine_warm_restarts_schedule.png
new file mode 100644
index 0000000000..71b39bffd3
Binary files /dev/null and b/docs/imgs/warmup_cosine_warm_restarts_schedule.png differ
diff --git a/docs/imgs/warmup_linear_schedule.png b/docs/imgs/warmup_linear_schedule.png
new file mode 100644
index 0000000000..4e1af31025
Binary files /dev/null and b/docs/imgs/warmup_linear_schedule.png differ
diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index 7e88b1b61c..03856956ac 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -85,7 +85,9 @@ class ConstantLR(_LRSchedule):
 
 class WarmupCosineSchedule(_LRSchedule):
     """
-    Cosine learning rate schedule with linear warmup. Cosine after warmup is without restarts.
+    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
+    Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
+    If `cycles` (default=0.5) differs from the default, the learning rate after warmup follows `cycles` full periods of the cosine function instead of a single decay.
     """
     warn_t_total = True
     def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
@@ -108,7 +110,9 @@ class WarmupCosineSchedule(_LRSchedule):
 
 class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
     """
-    Cosine learning rate schedule with linear warmup and hard restarts (if cycles > 1).
+    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
+    After warmup, the learning rate follows a cosine decay from 1. to 0. that is repeated `cycles` (default=1.) times,
+    with a hard restart to 1. at the start of each repetition.
     """
     def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
         super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
@@ -125,9 +129,9 @@ class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
 
 class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):
     """
-    Cosine learning rate schedule with linear warmups and linear warmup restarts.
-    The same warmup rate is used for warmup restarts as for initial warmup.
-    The total effective fraction of warmup steps over all cycles is warmup * cycles!
+    All training progress is divided into `cycles` (default=1.) parts of equal length.
+    In every part the learning rate increases linearly from 0. to 1. over the first `warmup` fraction of the training steps,
+    and then decreases from 1. to 0. following a cosine curve.
     """
     def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
         assert(warmup * cycles < 1.)
@@ -146,7 +150,8 @@ class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedul
 
 class WarmupConstantSchedule(_LRSchedule):
     """
-    Applies linear warmup. After warmup always returns 1..
+    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
+    Keeps learning rate equal to 1. after warmup.
     """
     def get_lr_(self, progress):
         if progress < self.warmup:
@@ -156,7 +161,8 @@ class WarmupConstantSchedule(_LRSchedule):
 
 class WarmupLinearSchedule(_LRSchedule):
     """
-    Linear warmup. Linear decay after warmup.
+    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
+    Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
     """
     warn_t_total = True
     def get_lr_(self, progress):
@@ -182,8 +188,9 @@ class BertAdam(Optimizer):
         t_total: total number of training steps for the learning
             rate schedule, -1  means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1
         schedule: schedule to use for the warmup (see above).
-            Can be 'warmup_linear', 'warmup_constant', 'warmup_cosine', or a LRSchedule object.
-            Default: 'warmup_linear'
+            Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see above).
+            If `None` or `'none'`, learning rate is always kept constant.
+            Default: `'warmup_linear'`
         b1: Adams b1. Default: 0.9
         b2: Adams b2. Default: 0.999
         e: Adams epsilon. Default: 1e-6
@@ -208,8 +215,8 @@ class BertAdam(Optimizer):
             schedule = schedule_type(warmup=warmup, t_total=t_total)
         else:
             if warmup != -1 or t_total != -1:
-                logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided. "
-                               "Please specify custom warmup and t_total in LRSchedule object.")
+                logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
+                               "Please specify custom warmup and t_total in _LRSchedule object.")
         defaults = dict(lr=lr, schedule=schedule,
                         b1=b1, b2=b2, e=e, weight_decay=weight_decay,
                         max_grad_norm=max_grad_norm)
diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py
index 0cf0494e20..bff4ebe61f 100644
--- a/pytorch_pretrained_bert/optimization_openai.py
+++ b/pytorch_pretrained_bert/optimization_openai.py
@@ -48,8 +48,8 @@ class OpenAIAdam(Optimizer):
             schedule = schedule_type(warmup=warmup, t_total=t_total)
         else:
             if warmup != -1 or t_total != -1:
-                logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided. "
-                               "Please specify custom warmup and t_total in LRSchedule object.")
+                logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
+                               "Please specify custom warmup and t_total in _LRSchedule object.")
         defaults = dict(lr=lr, schedule=schedule,
                         b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2,
                         max_grad_norm=max_grad_norm)
diff --git a/tests/optimization_test.py b/tests/optimization_test.py
index f52aeb506b..bc12ff8a90 100644
--- a/tests/optimization_test.py
+++ b/tests/optimization_test.py
@@ -22,7 +22,8 @@ import torch
 
 from pytorch_pretrained_bert import BertAdam
 from pytorch_pretrained_bert import OpenAIAdam
-from pytorch_pretrained_bert.optimization import ConstantLR, WarmupLinearSchedule, WarmupCosineWithWarmupRestartsSchedule
+from pytorch_pretrained_bert.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \
+    WarmupCosineWithWarmupRestartsSchedule, WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule
 import numpy as np
 
 
@@ -86,6 +87,18 @@ class WarmupCosineWithRestartsTest(unittest.TestCase):
         self.assertTrue(np.allclose(expected_zeros, 0))
 
 
+class TestSchedulePlot(unittest.TestCase):
+    def test_plot_schedule(self):
+        """Render the warm-restarts schedule off-screen; never opens a GUI window."""
+        import matplotlib as mpl
+        mpl.use("Agg")  # non-interactive backend: the suite must not block or need a display
+        from matplotlib import pyplot as plt
+        m = WarmupCosineWithWarmupRestartsSchedule(warmup=.1, t_total=1000., cycles=3.)
+        y = np.asarray([m.get_lr(xe) for xe in np.arange(0, 1000)])
+        self.assertTrue(np.all((y >= 0.) & (y <= 1.)))  # multiplier stays within [0, 1]
+        fig = plt.figure(figsize=(9, 2))
+        plt.plot(y)
+        plt.close(fig)  # release the figure instead of plt.show(), which blocks the test run
 
 
 if __name__ == "__main__":