diff --git a/examples/run_glue.py b/examples/run_glue.py index e02e9b4294..bc5f0cf350 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -249,7 +249,7 @@ def evaluate(args, model, tokenizer, prefix=""): result = compute_metrics(eval_task, preds, out_label_ids) results.update(result) - output_eval_file = os.path.join(eval_output_dir, "eval_results.txt") + output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): @@ -490,9 +490,11 @@ def main(): logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" + model = model_class.from_pretrained(checkpoint) model.to(args.device) - result = evaluate(args, model, tokenizer, prefix=global_step) + result = evaluate(args, model, tokenizer, prefix=prefix) result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) results.update(result) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index a91deebb6c..c167703d7b 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -282,7 +282,7 @@ def evaluate(args, model, tokenizer, prefix=""): "perplexity": perplexity } - output_eval_file = os.path.join(eval_output_dir, "eval_results.txt") + output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): @@ -484,9 +484,11 @@ def main(): logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" + model = model_class.from_pretrained(checkpoint) model.to(args.device) - result = evaluate(args, model, tokenizer, prefix=global_step) + result = evaluate(args, model, tokenizer, prefix=prefix) result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) results.update(result) diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index 54f3a8a904..a983daad76 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -512,9 +512,11 @@ def main(): logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" + model = model_class.from_pretrained(checkpoint) model.to(args.device) - result = evaluate(args, model, tokenizer, prefix=global_step) + result = evaluate(args, model, tokenizer, prefix=prefix) result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) results.update(result) @@ -528,9 +530,11 @@ def main(): logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" + model = model_class.from_pretrained(checkpoint) model.to(args.device) - result = evaluate(args, model, tokenizer, prefix=global_step, test=True) + result = evaluate(args, model, tokenizer, prefix=prefix, test=True) result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) results.update(result) if best_steps: