Add Flax example tests (#14599)

* add test for glue * add tests for clm * fix clm test * add summrization tests * more tests * fix few tests * add test for t5 mlm * fix t5 mlm test * fix tests for multi device * cleanup * ci job * fix metric file name * make t5 more robust
2021-12-06 10:48:58 +05:30
parent 803a8cd18f
commit c5bd732ac6
11 changed files with 553 additions and 6 deletions
--- a/examples/flax/question-answering/run_qa.py
+++ b/examples/flax/question-answering/run_qa.py
@@ -18,6 +18,7 @@ Fine-tuning the library models for question answering.
 """
 # You can also adapt this script on your own question answering task. Pointers for this are left as comments.

+import json
 import logging
 import os
 import random
@@ -911,6 +912,58 @@ def main():
        epochs.desc = f"Epoch ... {epoch + 1}/{num_epochs}"
    # endregion

+    # Eval after training
+    if training_args.do_eval:
+        eval_metrics = {}
+        all_start_logits = []
+        all_end_logits = []
+
+        eva_loader = eval_data_collator(eval_dataset, eval_batch_size)
+        for batch in tqdm(eva_loader, total=len(eval_dataset) // eval_batch_size, desc="Evaluating ...", position=2):
+            _ = batch.pop("example_id")
+            _ = batch.pop("offset_mapping")
+            predictions = p_eval_step(state, batch)
+            start_logits = np.array([pred for pred in chain(*predictions[0])])
+            end_logits = np.array([pred for pred in chain(*predictions[1])])
+            all_start_logits.append(start_logits)
+            all_end_logits.append(end_logits)
+
+        # evaluate also on leftover examples (not divisible by batch_size)
+        num_leftover_samples = len(eval_dataset) % eval_batch_size
+
+        # make sure leftover batch is evaluated on one device
+        if num_leftover_samples > 0 and jax.process_index() == 0:
+            # take leftover samples
+            batch = eval_dataset[-num_leftover_samples:]
+            batch = {k: np.array(v) for k, v in batch.items()}
+            _ = batch.pop("example_id")
+            _ = batch.pop("offset_mapping")
+
+            predictions = eval_step(unreplicate(state), batch)
+            start_logits = np.array([pred for pred in predictions[0]])
+            end_logits = np.array([pred for pred in predictions[1]])
+            all_start_logits.append(start_logits)
+            all_end_logits.append(end_logits)
+
+        max_len = max([x.shape[1] for x in all_start_logits])  # Get the max_length of the tensor
+
+        # concatenate the numpy array
+        start_logits_concat = create_and_fill_np_array(all_start_logits, eval_dataset, max_len)
+        end_logits_concat = create_and_fill_np_array(all_end_logits, eval_dataset, max_len)
+
+        # delete the list of numpy arrays
+        del all_start_logits
+        del all_end_logits
+        outputs_numpy = (start_logits_concat, end_logits_concat)
+        prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy)
+        eval_metrics = compute_metrics(prediction)
+
+        if jax.process_index() == 0:
+            eval_metrics = {f"eval_{metric_name}": value for metric_name, value in eval_metrics.items()}
+            path = os.path.join(training_args.output_dir, "eval_results.json")
+            with open(path, "w") as f:
+                json.dump(eval_metrics, f, indent=4, sort_keys=True)
+

 if __name__ == "__main__":
    main()