[Examples] Replicates the new --log_level feature to all trainer-based pytorch (#12359)

* added log_level

* fix comment

* fixed log_level

* Trigger CI

* Unfied logging

* simplified args for log_level
This commit is contained in:
Bhadresh Savani
2021-06-25 22:58:42 +01:00
committed by GitHub
parent 64e6098094
commit 539ee456d4
13 changed files with 202 additions and 165 deletions

View File

@@ -24,6 +24,7 @@ import sys
from dataclasses import dataclass, field
from typing import Optional
import datasets
import nltk # Here to have a nice missing dependency error message early on
import numpy as np
from datasets import load_dataset, load_metric
@@ -260,16 +261,18 @@ def main():
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)
logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary:
logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
# Set the verbosity to info of the Transformers logger (on main process only):
if training_args.should_log:
transformers.utils.logging.set_verbosity_info()
logger.info(f"Training/evaluation parameters {training_args}")
if data_args.source_prefix is None and model_args.model_name_or_path in [
@@ -313,7 +316,9 @@ def main():
# download the dataset.
if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
raw_datasets = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
)
else:
data_files = {}
if data_args.train_file is not None:
@@ -325,7 +330,7 @@ def main():
if data_args.test_file is not None:
data_files["test"] = data_args.test_file
extension = data_args.test_file.split(".")[-1]
datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -366,11 +371,11 @@ def main():
# Preprocessing the datasets.
# We need to tokenize inputs and targets.
if training_args.do_train:
column_names = datasets["train"].column_names
column_names = raw_datasets["train"].column_names
elif training_args.do_eval:
column_names = datasets["validation"].column_names
column_names = raw_datasets["validation"].column_names
elif training_args.do_predict:
column_names = datasets["test"].column_names
column_names = raw_datasets["test"].column_names
else:
logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
return
@@ -425,9 +430,9 @@ def main():
return model_inputs
if training_args.do_train:
if "train" not in datasets:
if "train" not in raw_datasets:
raise ValueError("--do_train requires a train dataset")
train_dataset = datasets["train"]
train_dataset = raw_datasets["train"]
if data_args.max_train_samples is not None:
train_dataset = train_dataset.select(range(data_args.max_train_samples))
train_dataset = train_dataset.map(
@@ -441,9 +446,9 @@ def main():
if training_args.do_eval:
max_target_length = data_args.val_max_target_length
if "validation" not in datasets:
if "validation" not in raw_datasets:
raise ValueError("--do_eval requires a validation dataset")
eval_dataset = datasets["validation"]
eval_dataset = raw_datasets["validation"]
if data_args.max_eval_samples is not None:
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
eval_dataset = eval_dataset.map(
@@ -457,9 +462,9 @@ def main():
if training_args.do_predict:
max_target_length = data_args.val_max_target_length
if "test" not in datasets:
if "test" not in raw_datasets:
raise ValueError("--do_predict requires a test dataset")
predict_dataset = datasets["test"]
predict_dataset = raw_datasets["test"]
if data_args.max_predict_samples is not None:
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
predict_dataset = predict_dataset.map(