From ed57c979b996a11b99b61c90f8d97107d6180f52 Mon Sep 17 00:00:00 2001 From: fpgaminer Date: Tue, 28 Mar 2023 06:09:17 -0700 Subject: [PATCH] Fix bug in perplexity guide calculations and update perplexity numbers. Fixes #22348 (#22411) Fix bug in perplexity guide calculations and update perplexity numbers. --- docs/source/en/perplexity.mdx | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/source/en/perplexity.mdx b/docs/source/en/perplexity.mdx index 01f861c99c..76ffd33856 100644 --- a/docs/source/en/perplexity.mdx +++ b/docs/source/en/perplexity.mdx @@ -115,11 +115,10 @@ for begin_loc in tqdm(range(0, seq_len, stride)): with torch.no_grad(): outputs = model(input_ids, labels=target_ids) - # loss is calculated using CrossEntropyLoss which averages over input tokens. - # Multiply it with trg_len to get the summation instead of average. - # We will take average over all the tokens to get the true average - # in the last step of this example. - neg_log_likelihood = outputs.loss * trg_len + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. + neg_log_likelihood = outputs.loss nlls.append(neg_log_likelihood) @@ -127,14 +126,14 @@ for begin_loc in tqdm(range(0, seq_len, stride)): if end_loc == seq_len: break -ppl = torch.exp(torch.stack(nlls).sum() / end_loc) +ppl = torch.exp(torch.stack(nlls).mean()) ``` Running this with the stride length equal to the max input length is equivalent to the suboptimal, non-sliding-window strategy we discussed above. The smaller the stride, the more context the model will have in making each prediction, and the better the reported perplexity will typically be. -When we run the above with `stride = 1024`, i.e. no overlap, the resulting PPL is `19.64`, which is about the same +When we run the above with `stride = 1024`, i.e. no overlap, the resulting PPL is `19.44`, which is about the same as the `19.93` reported in the GPT-2 paper. By using `stride = 512` and thereby employing our striding window -strategy, this jumps down to `16.44`. This is not only a more favorable score, but is calculated in a way that is +strategy, this jumps down to `16.45`. This is not only a more favorable score, but is calculated in a way that is closer to the true autoregressive decomposition of a sequence likelihood.