[Re-submit] Compute true loss Flax examples (#19504)

* Compute true loss

* fixup

* final

* final

* final

* Update examples/flax/language-modeling/run_bart_dlm_flax.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* jax.tree_map => jax.tree_util.tree_map

* Compute true loss

* final

* fixup

* final

* final

* Update examples/flax/language-modeling/run_bart_dlm_flax.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* jax.tree_map => jax.tree_util.tree_map

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
This commit is contained in:
Duong A. Nguyen
2022-10-13 17:33:36 +07:00
committed by GitHub
parent 0903fc80b5
commit 4212bb0d60
6 changed files with 82 additions and 62 deletions

View File

@@ -723,18 +723,25 @@ def main():
loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
# take average
loss = loss.sum() / label_mask.sum()
loss = loss.sum()
num_labels = label_mask.sum()
return loss
return loss, num_labels
grad_fn = jax.value_and_grad(loss_fn)
loss, grad = grad_fn(state.params)
grad = jax.lax.pmean(grad, "batch")
grad_fn = jax.value_and_grad(loss_fn, has_aux=True)
(loss, num_labels), grad = grad_fn(state.params)
num_labels = jax.lax.psum(num_labels, "batch")
# true loss = total loss / total samples
loss = jax.lax.psum(loss, "batch")
loss = jax.tree_util.tree_map(lambda x: x / num_labels, loss)
# true grad = total grad / total samples
grad = jax.lax.psum(grad, "batch")
grad = jax.tree_util.tree_map(lambda x: x / num_labels, grad)
new_state = state.apply_gradients(grads=grad)
metrics = jax.lax.pmean(
{"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch"
)
metrics = {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}
return new_state, metrics, new_dropout_rng