[Re-submit] Compute true loss Flax examples (#19504)
* Compute true loss * fixup * final * final * final * Update examples/flax/language-modeling/run_bart_dlm_flax.py Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> * jax.tree_map => jax.tree_util.tree_map * Compute true loss * final * fixup * final * final * Update examples/flax/language-modeling/run_bart_dlm_flax.py Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> * jax.tree_map => jax.tree_util.tree_map Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
This commit is contained in:
@@ -784,8 +784,9 @@ def main():
|
||||
|
||||
# ignore padded tokens from loss
|
||||
loss = loss * padding_mask
|
||||
loss = loss.sum() / padding_mask.sum()
|
||||
return loss
|
||||
loss = loss.sum()
|
||||
num_labels = padding_mask.sum()
|
||||
return loss, num_labels
|
||||
|
||||
# Define gradient update step fn
|
||||
def train_step(state, batch, label_smoothing_factor=0.0):
|
||||
@@ -794,29 +795,38 @@ def main():
|
||||
def compute_loss(params):
|
||||
labels = batch.pop("labels")
|
||||
logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
|
||||
loss = loss_fn(logits, labels, batch["decoder_attention_mask"], label_smoothing_factor)
|
||||
return loss
|
||||
loss, num_labels = loss_fn(logits, labels, batch["decoder_attention_mask"], label_smoothing_factor)
|
||||
return loss, num_labels
|
||||
|
||||
grad_fn = jax.value_and_grad(compute_loss)
|
||||
loss, grad = grad_fn(state.params)
|
||||
grad = jax.lax.pmean(grad, "batch")
|
||||
grad_fn = jax.value_and_grad(compute_loss, has_aux=True)
|
||||
(loss, num_labels), grad = grad_fn(state.params)
|
||||
num_labels = jax.lax.psum(num_labels, "batch")
|
||||
|
||||
# true loss = total loss / total samples
|
||||
loss = jax.lax.psum(loss, "batch")
|
||||
loss = jax.tree_util.tree_map(lambda x: x / num_labels, loss)
|
||||
|
||||
# true grad = total grad / total samples
|
||||
grad = jax.lax.psum(grad, "batch")
|
||||
grad = jax.tree_util.tree_map(lambda x: x / num_labels, grad)
|
||||
new_state = state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng)
|
||||
|
||||
metrics = {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}
|
||||
metrics = jax.lax.pmean(metrics, axis_name="batch")
|
||||
|
||||
return new_state, metrics
|
||||
|
||||
# Define eval fn
|
||||
def eval_step(params, batch, label_smoothing_factor=0.0):
|
||||
labels = batch.pop("labels")
|
||||
logits = model(**batch, params=params, train=False)[0]
|
||||
loss = loss_fn(logits, labels, batch["decoder_attention_mask"], label_smoothing_factor)
|
||||
|
||||
# summarize metrics
|
||||
loss, num_labels = loss_fn(logits, labels, batch["decoder_attention_mask"], label_smoothing_factor)
|
||||
num_labels = jax.lax.psum(num_labels, "batch")
|
||||
|
||||
# true loss = total loss / total samples
|
||||
loss = jax.lax.psum(loss, "batch")
|
||||
loss = jax.tree_util.tree_map(lambda x: x / num_labels, loss)
|
||||
|
||||
metrics = {"loss": loss}
|
||||
metrics = jax.lax.pmean(metrics, axis_name="batch")
|
||||
return metrics
|
||||
|
||||
# Define generation function
|
||||
|
||||
Reference in New Issue
Block a user