Merge pull request #1832 from huggingface/memory-leak-schedulers

replace LambdaLR scheduler wrappers by function
This commit is contained in:
Thomas Wolf
2019-11-14 22:10:31 +01:00
committed by GitHub
16 changed files with 96 additions and 119 deletions

View File

@@ -18,19 +18,17 @@ Schedules
Learning Rate Schedules
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: transformers.ConstantLRSchedule
:members:
.. autofunction:: transformers.get_constant_schedule
.. autoclass:: transformers.WarmupConstantSchedule
:members:
.. autofunction:: transformers.get_constant_schedule_with_warmup
.. image:: /imgs/warmup_constant_schedule.png
:target: /imgs/warmup_constant_schedule.png
:alt:
.. autoclass:: transformers.WarmupCosineSchedule
.. autofunction:: transformers.get_cosine_schedule_with_warmup
:members:
.. image:: /imgs/warmup_cosine_schedule.png
@@ -38,8 +36,7 @@ Learning Rate Schedules
:alt:
.. autoclass:: transformers.WarmupCosineWithHardRestartsSchedule
:members:
.. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup
.. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
:target: /imgs/warmup_cosine_hard_restarts_schedule.png
@@ -47,8 +44,7 @@ Learning Rate Schedules
.. autoclass:: transformers.WarmupLinearSchedule
:members:
.. autofunction:: transformers.get_linear_schedule_with_warmup
.. image:: /imgs/warmup_linear_schedule.png
:target: /imgs/warmup_linear_schedule.png

View File

@@ -84,12 +84,12 @@ Here is a conversion examples from `BertAdam` with a linear warmup and decay sch
# Parameters:
lr = 1e-3
max_grad_norm = 1.0
num_total_steps = 1000
num_training_steps = 1000
num_warmup_steps = 100
warmup_proportion = float(num_warmup_steps) / float(num_total_steps) # 0.1
warmup_proportion = float(num_warmup_steps) / float(num_training_steps) # 0.1
### Previously BertAdam optimizer was instantiated like this:
optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps)
optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, num_training_steps=num_training_steps)
### and used like this:
for batch in train_data:
loss = model(batch)
@@ -98,7 +98,7 @@ for batch in train_data:
### In Transformers, optimizer and schedules are splitted and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # PyTorch scheduler
### and used like this:
for batch in train_data:
loss = model(batch)