Misc. fixes for Pytorch QA examples: (#16958)
1. Fixes evaluation errors popping up when you train/eval on squad v2 (one was newly encountered and one that was previously reported Running SQuAD 1.0 sample command raises IndexError #15401 but not completely fixed). 2. Removes boolean arguments that don't use store_true. Please, don't use these: *ANY non-empty string is being converted to True in this case and this clearly is not the desired behavior (and it creates a LOT of confusion). 3. All no-trainer test scripts are now saving metric values in the same way (with the right prefix eval_), which is consistent with the trainer-based versions. 4. Adds forgotten model.eval() in the no-trainer versions. This improved some results, but not everything (see the discussion in the end). Please, see the F1 scores and the discussion below.
This commit is contained in:
@@ -158,7 +158,7 @@ def postprocess_qa_predictions(
|
||||
"end_logit": end_logits[end_index],
|
||||
}
|
||||
)
|
||||
if version_2_with_negative:
|
||||
if version_2_with_negative and min_null_prediction is not None:
|
||||
# Add the minimum null prediction
|
||||
prelim_predictions.append(min_null_prediction)
|
||||
null_score = min_null_prediction["score"]
|
||||
@@ -167,7 +167,11 @@ def postprocess_qa_predictions(
|
||||
predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]
|
||||
|
||||
# Add back the minimum null prediction if it was removed because of its low score.
|
||||
if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions):
|
||||
if (
|
||||
version_2_with_negative
|
||||
and min_null_prediction is not None
|
||||
and not any(p["offsets"] == (0, 0) for p in predictions)
|
||||
):
|
||||
predictions.append(min_null_prediction)
|
||||
|
||||
# Use the offsets to gather the answer text in the original context.
|
||||
@@ -350,9 +354,12 @@ def postprocess_qa_predictions_with_beam_search(
|
||||
start_index >= len(offset_mapping)
|
||||
or end_index >= len(offset_mapping)
|
||||
or offset_mapping[start_index] is None
|
||||
or len(offset_mapping[start_index]) < 2
|
||||
or offset_mapping[end_index] is None
|
||||
or len(offset_mapping[end_index]) < 2
|
||||
):
|
||||
continue
|
||||
|
||||
# Don't consider answers with a length negative or > max_answer_length.
|
||||
if end_index < start_index or end_index - start_index + 1 > max_answer_length:
|
||||
continue
|
||||
@@ -381,7 +388,9 @@ def postprocess_qa_predictions_with_beam_search(
|
||||
# In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
|
||||
# failure.
|
||||
if len(predictions) == 0:
|
||||
predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6})
|
||||
# Without predictions min_null_score is going to be None and None will cause an exception later
|
||||
min_null_score = -2e-6
|
||||
predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": min_null_score})
|
||||
|
||||
# Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
|
||||
# the LogSumExp trick).
|
||||
|
||||
Reference in New Issue
Block a user