[Examples] Replicates the new --log_level feature to all trainer-based pytorch (#12359)

* added log_level

* fix comment

* fixed log_level

* Trigger CI

* Unified logging

* simplified args for log_level
This commit is contained in:
Bhadresh Savani
2021-06-25 22:58:42 +01:00
committed by GitHub
parent 64e6098094
commit 539ee456d4
13 changed files with 202 additions and 165 deletions

View File

@@ -28,6 +28,7 @@ import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional from typing import Optional
import datasets
from datasets import load_dataset from datasets import load_dataset
import transformers import transformers
@@ -203,18 +204,19 @@ def main():
datefmt="%m/%d/%Y %H:%M:%S", datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)], handlers=[logging.StreamHandler(sys.stdout)],
) )
logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary: # Log on each process the small summary:
logger.warning( logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
) )
# Set the verbosity to info of the Transformers logger (on main process only):
if training_args.should_log:
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint. # Detecting last checkpoint.
@@ -246,15 +248,17 @@ def main():
# download the dataset. # download the dataset.
if data_args.dataset_name is not None: if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(
if "validation" not in datasets.keys(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
datasets["validation"] = load_dataset( )
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
data_args.dataset_name, data_args.dataset_name,
data_args.dataset_config_name, data_args.dataset_config_name,
split=f"train[:{data_args.validation_split_percentage}%]", split=f"train[:{data_args.validation_split_percentage}%]",
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
) )
datasets["train"] = load_dataset( raw_datasets["train"] = load_dataset(
data_args.dataset_name, data_args.dataset_name,
data_args.dataset_config_name, data_args.dataset_config_name,
split=f"train[{data_args.validation_split_percentage}%:]", split=f"train[{data_args.validation_split_percentage}%:]",
@@ -273,7 +277,7 @@ def main():
) )
if extension == "txt": if extension == "txt":
extension = "text" extension = "text"
datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html. # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -334,9 +338,9 @@ def main():
# Preprocessing the datasets. # Preprocessing the datasets.
# First we tokenize all the texts. # First we tokenize all the texts.
if training_args.do_train: if training_args.do_train:
column_names = datasets["train"].column_names column_names = raw_datasets["train"].column_names
else: else:
column_names = datasets["validation"].column_names column_names = raw_datasets["validation"].column_names
text_column_name = "text" if "text" in column_names else column_names[0] text_column_name = "text" if "text" in column_names else column_names[0]
# since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
@@ -352,7 +356,7 @@ def main():
) )
return output return output
tokenized_datasets = datasets.map( tokenized_datasets = raw_datasets.map(
tokenize_function, tokenize_function,
batched=True, batched=True,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,

View File

@@ -28,6 +28,7 @@ import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional from typing import Optional
import datasets
from datasets import load_dataset from datasets import load_dataset
import transformers import transformers
@@ -212,7 +213,13 @@ def main():
datefmt="%m/%d/%Y %H:%M:%S", datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)], handlers=[logging.StreamHandler(sys.stdout)],
) )
logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary: # Log on each process the small summary:
logger.warning( logger.warning(
@@ -220,10 +227,6 @@ def main():
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
) )
# Set the verbosity to info of the Transformers logger (on main process only): # Set the verbosity to info of the Transformers logger (on main process only):
if training_args.should_log:
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint. # Detecting last checkpoint.
@@ -255,15 +258,17 @@ def main():
# download the dataset. # download the dataset.
if data_args.dataset_name is not None: if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(
if "validation" not in datasets.keys(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
datasets["validation"] = load_dataset( )
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
data_args.dataset_name, data_args.dataset_name,
data_args.dataset_config_name, data_args.dataset_config_name,
split=f"train[:{data_args.validation_split_percentage}%]", split=f"train[:{data_args.validation_split_percentage}%]",
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
) )
datasets["train"] = load_dataset( raw_datasets["train"] = load_dataset(
data_args.dataset_name, data_args.dataset_name,
data_args.dataset_config_name, data_args.dataset_config_name,
split=f"train[{data_args.validation_split_percentage}%:]", split=f"train[{data_args.validation_split_percentage}%:]",
@@ -278,7 +283,7 @@ def main():
extension = data_args.train_file.split(".")[-1] extension = data_args.train_file.split(".")[-1]
if extension == "txt": if extension == "txt":
extension = "text" extension = "text"
datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html. # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -337,9 +342,9 @@ def main():
# Preprocessing the datasets. # Preprocessing the datasets.
# First we tokenize all the texts. # First we tokenize all the texts.
if training_args.do_train: if training_args.do_train:
column_names = datasets["train"].column_names column_names = raw_datasets["train"].column_names
else: else:
column_names = datasets["validation"].column_names column_names = raw_datasets["validation"].column_names
text_column_name = "text" if "text" in column_names else column_names[0] text_column_name = "text" if "text" in column_names else column_names[0]
if data_args.max_seq_length is None: if data_args.max_seq_length is None:
@@ -377,7 +382,7 @@ def main():
return_special_tokens_mask=True, return_special_tokens_mask=True,
) )
tokenized_datasets = datasets.map( tokenized_datasets = raw_datasets.map(
tokenize_function, tokenize_function,
batched=True, batched=True,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
@@ -392,7 +397,7 @@ def main():
def tokenize_function(examples): def tokenize_function(examples):
return tokenizer(examples[text_column_name], return_special_tokens_mask=True) return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
tokenized_datasets = datasets.map( tokenized_datasets = raw_datasets.map(
tokenize_function, tokenize_function,
batched=True, batched=True,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,

View File

@@ -25,6 +25,7 @@ import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional from typing import Optional
import datasets
from datasets import load_dataset from datasets import load_dataset
import transformers import transformers
@@ -209,18 +210,19 @@ def main():
datefmt="%m/%d/%Y %H:%M:%S", datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)], handlers=[logging.StreamHandler(sys.stdout)],
) )
logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary: # Log on each process the small summary:
logger.warning( logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
) )
# Set the verbosity to info of the Transformers logger (on main process only):
if training_args.should_log:
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint. # Detecting last checkpoint.
@@ -252,15 +254,17 @@ def main():
# download the dataset. # download the dataset.
if data_args.dataset_name is not None: if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(
if "validation" not in datasets.keys(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
datasets["validation"] = load_dataset( )
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
data_args.dataset_name, data_args.dataset_name,
data_args.dataset_config_name, data_args.dataset_config_name,
split=f"train[:{data_args.validation_split_percentage}%]", split=f"train[:{data_args.validation_split_percentage}%]",
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,
) )
datasets["train"] = load_dataset( raw_datasets["train"] = load_dataset(
data_args.dataset_name, data_args.dataset_name,
data_args.dataset_config_name, data_args.dataset_config_name,
split=f"train[{data_args.validation_split_percentage}%:]", split=f"train[{data_args.validation_split_percentage}%:]",
@@ -275,7 +279,7 @@ def main():
extension = data_args.train_file.split(".")[-1] extension = data_args.train_file.split(".")[-1]
if extension == "txt": if extension == "txt":
extension = "text" extension = "text"
datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html. # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -334,9 +338,9 @@ def main():
# Preprocessing the datasets. # Preprocessing the datasets.
# First we tokenize all the texts. # First we tokenize all the texts.
if training_args.do_train: if training_args.do_train:
column_names = datasets["train"].column_names column_names = raw_datasets["train"].column_names
else: else:
column_names = datasets["validation"].column_names column_names = raw_datasets["validation"].column_names
text_column_name = "text" if "text" in column_names else column_names[0] text_column_name = "text" if "text" in column_names else column_names[0]
if data_args.max_seq_length > tokenizer.model_max_length: if data_args.max_seq_length > tokenizer.model_max_length:
@@ -355,7 +359,7 @@ def main():
examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()] examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length) return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)
tokenized_datasets = datasets.map( tokenized_datasets = raw_datasets.map(
tokenize_function, tokenize_function,
batched=True, batched=True,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,
@@ -368,7 +372,7 @@ def main():
def tokenize_function(examples): def tokenize_function(examples):
return tokenizer(examples[text_column_name]) return tokenizer(examples[text_column_name])
tokenized_datasets = datasets.map( tokenized_datasets = raw_datasets.map(
tokenize_function, tokenize_function,
batched=True, batched=True,
num_proc=data_args.preprocessing_num_workers, num_proc=data_args.preprocessing_num_workers,

View File

@@ -24,6 +24,7 @@ import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional, Union from typing import Optional, Union
import datasets
import numpy as np import numpy as np
import torch import torch
from datasets import load_dataset from datasets import load_dataset
@@ -220,18 +221,18 @@ def main():
datefmt="%m/%d/%Y %H:%M:%S", datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)], handlers=[logging.StreamHandler(sys.stdout)],
) )
logger.setLevel(logging.INFO if training_args.should_log else logging.WARN) log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary: # Log on each process the small summary:
logger.warning( logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
) )
# Set the verbosity to info of the Transformers logger (on main process only):
if training_args.should_log:
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint. # Detecting last checkpoint.
@@ -268,10 +269,10 @@ def main():
if data_args.validation_file is not None: if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1] extension = data_args.train_file.split(".")[-1]
datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
else: else:
# Downloading and loading the swag dataset from the hub. # Downloading and loading the swag dataset from the hub.
datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir) raw_datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html. # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -347,9 +348,9 @@ def main():
return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
if training_args.do_train: if training_args.do_train:
if "train" not in datasets: if "train" not in raw_datasets:
raise ValueError("--do_train requires a train dataset") raise ValueError("--do_train requires a train dataset")
train_dataset = datasets["train"] train_dataset = raw_datasets["train"]
if data_args.max_train_samples is not None: if data_args.max_train_samples is not None:
train_dataset = train_dataset.select(range(data_args.max_train_samples)) train_dataset = train_dataset.select(range(data_args.max_train_samples))
train_dataset = train_dataset.map( train_dataset = train_dataset.map(
@@ -360,9 +361,9 @@ def main():
) )
if training_args.do_eval: if training_args.do_eval:
if "validation" not in datasets: if "validation" not in raw_datasets:
raise ValueError("--do_eval requires a validation dataset") raise ValueError("--do_eval requires a validation dataset")
eval_dataset = datasets["validation"] eval_dataset = raw_datasets["validation"]
if data_args.max_eval_samples is not None: if data_args.max_eval_samples is not None:
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
eval_dataset = eval_dataset.map( eval_dataset = eval_dataset.map(

View File

@@ -24,6 +24,7 @@ import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional from typing import Optional
import datasets
from datasets import load_dataset, load_metric from datasets import load_dataset, load_metric
import transformers import transformers
@@ -216,18 +217,19 @@ def main():
datefmt="%m/%d/%Y %H:%M:%S", datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)], handlers=[logging.StreamHandler(sys.stdout)],
) )
logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary: # Log on each process the small summary:
logger.warning( logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
) )
# Set the verbosity to info of the Transformers logger (on main process only):
if training_args.should_log:
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint. # Detecting last checkpoint.
@@ -259,7 +261,9 @@ def main():
# download the dataset. # download the dataset.
if data_args.dataset_name is not None: if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
)
else: else:
data_files = {} data_files = {}
if data_args.train_file is not None: if data_args.train_file is not None:
@@ -272,7 +276,7 @@ def main():
if data_args.test_file is not None: if data_args.test_file is not None:
data_files["test"] = data_args.test_file data_files["test"] = data_args.test_file
extension = data_args.test_file.split(".")[-1] extension = data_args.test_file.split(".")[-1]
datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html. # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -314,11 +318,11 @@ def main():
# Preprocessing the datasets. # Preprocessing the datasets.
# Preprocessing is slightly different for training and evaluation. # Preprocessing is slightly different for training and evaluation.
if training_args.do_train: if training_args.do_train:
column_names = datasets["train"].column_names column_names = raw_datasets["train"].column_names
elif training_args.do_eval: elif training_args.do_eval:
column_names = datasets["validation"].column_names column_names = raw_datasets["validation"].column_names
else: else:
column_names = datasets["test"].column_names column_names = raw_datasets["test"].column_names
question_column_name = "question" if "question" in column_names else column_names[0] question_column_name = "question" if "question" in column_names else column_names[0]
context_column_name = "context" if "context" in column_names else column_names[1] context_column_name = "context" if "context" in column_names else column_names[1]
answer_column_name = "answers" if "answers" in column_names else column_names[2] answer_column_name = "answers" if "answers" in column_names else column_names[2]
@@ -407,9 +411,9 @@ def main():
return tokenized_examples return tokenized_examples
if training_args.do_train: if training_args.do_train:
if "train" not in datasets: if "train" not in raw_datasets:
raise ValueError("--do_train requires a train dataset") raise ValueError("--do_train requires a train dataset")
train_dataset = datasets["train"] train_dataset = raw_datasets["train"]
if data_args.max_train_samples is not None: if data_args.max_train_samples is not None:
# We will select sample from whole data if argument is specified # We will select sample from whole data if argument is specified
train_dataset = train_dataset.select(range(data_args.max_train_samples)) train_dataset = train_dataset.select(range(data_args.max_train_samples))
@@ -469,9 +473,9 @@ def main():
return tokenized_examples return tokenized_examples
if training_args.do_eval: if training_args.do_eval:
if "validation" not in datasets: if "validation" not in raw_datasets:
raise ValueError("--do_eval requires a validation dataset") raise ValueError("--do_eval requires a validation dataset")
eval_examples = datasets["validation"] eval_examples = raw_datasets["validation"]
if data_args.max_eval_samples is not None: if data_args.max_eval_samples is not None:
# We will select sample from whole data # We will select sample from whole data
eval_examples = eval_examples.select(range(data_args.max_eval_samples)) eval_examples = eval_examples.select(range(data_args.max_eval_samples))
@@ -489,9 +493,9 @@ def main():
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
if training_args.do_predict: if training_args.do_predict:
if "test" not in datasets: if "test" not in raw_datasets:
raise ValueError("--do_predict requires a test dataset") raise ValueError("--do_predict requires a test dataset")
predict_examples = datasets["test"] predict_examples = raw_datasets["test"]
if data_args.max_predict_samples is not None: if data_args.max_predict_samples is not None:
# We will select sample from whole data # We will select sample from whole data
predict_examples = predict_examples.select(range(data_args.max_predict_samples)) predict_examples = predict_examples.select(range(data_args.max_predict_samples))
@@ -529,7 +533,7 @@ def main():
max_answer_length=data_args.max_answer_length, max_answer_length=data_args.max_answer_length,
null_score_diff_threshold=data_args.null_score_diff_threshold, null_score_diff_threshold=data_args.null_score_diff_threshold,
output_dir=training_args.output_dir, output_dir=training_args.output_dir,
is_world_process_zero=trainer.is_world_process_zero(), log_level=log_level,
prefix=stage, prefix=stage,
) )
# Format the result to the format the metric expects. # Format the result to the format the metric expects.

View File

@@ -24,6 +24,7 @@ import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional from typing import Optional
import datasets
from datasets import load_dataset, load_metric from datasets import load_dataset, load_metric
import transformers import transformers
@@ -215,18 +216,18 @@ def main():
datefmt="%m/%d/%Y %H:%M:%S", datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)], handlers=[logging.StreamHandler(sys.stdout)],
) )
logger.setLevel(logging.INFO if training_args.should_log else logging.WARN) log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary: # Log on each process the small summary:
logger.warning( logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
) )
# Set the verbosity to info of the Transformers logger (on main process only):
if training_args.should_log:
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint. # Detecting last checkpoint.
@@ -258,7 +259,9 @@ def main():
# download the dataset. # download the dataset.
if data_args.dataset_name is not None: if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
)
else: else:
data_files = {} data_files = {}
if data_args.train_file is not None: if data_args.train_file is not None:
@@ -270,7 +273,7 @@ def main():
if data_args.test_file is not None: if data_args.test_file is not None:
data_files["test"] = data_args.test_file data_files["test"] = data_args.test_file
extension = data_args.test_file.split(".")[-1] extension = data_args.test_file.split(".")[-1]
datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html. # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -303,11 +306,11 @@ def main():
# Preprocessing the datasets. # Preprocessing the datasets.
# Preprocessing is slightly different for training and evaluation. # Preprocessing is slightly different for training and evaluation.
if training_args.do_train: if training_args.do_train:
column_names = datasets["train"].column_names column_names = raw_datasets["train"].column_names
elif training_args.do_eval: elif training_args.do_eval:
column_names = datasets["validation"].column_names column_names = raw_datasets["validation"].column_names
else: else:
column_names = datasets["test"].column_names column_names = raw_datasets["test"].column_names
question_column_name = "question" if "question" in column_names else column_names[0] question_column_name = "question" if "question" in column_names else column_names[0]
context_column_name = "context" if "context" in column_names else column_names[1] context_column_name = "context" if "context" in column_names else column_names[1]
answer_column_name = "answers" if "answers" in column_names else column_names[2] answer_column_name = "answers" if "answers" in column_names else column_names[2]
@@ -419,9 +422,9 @@ def main():
return tokenized_examples return tokenized_examples
if training_args.do_train: if training_args.do_train:
if "train" not in datasets: if "train" not in raw_datasets:
raise ValueError("--do_train requires a train dataset") raise ValueError("--do_train requires a train dataset")
train_dataset = datasets["train"] train_dataset = raw_datasets["train"]
if data_args.max_train_samples is not None: if data_args.max_train_samples is not None:
# Select samples from Dataset, This will help to decrease processing time # Select samples from Dataset, This will help to decrease processing time
train_dataset = train_dataset.select(range(data_args.max_train_samples)) train_dataset = train_dataset.select(range(data_args.max_train_samples))
@@ -505,9 +508,9 @@ def main():
return tokenized_examples return tokenized_examples
if training_args.do_eval: if training_args.do_eval:
if "validation" not in datasets: if "validation" not in raw_datasets:
raise ValueError("--do_eval requires a validation dataset") raise ValueError("--do_eval requires a validation dataset")
eval_examples = datasets["validation"] eval_examples = raw_datasets["validation"]
if data_args.max_eval_samples is not None: if data_args.max_eval_samples is not None:
# Selecting Eval Samples from Dataset # Selecting Eval Samples from Dataset
eval_examples = eval_examples.select(range(data_args.max_eval_samples)) eval_examples = eval_examples.select(range(data_args.max_eval_samples))
@@ -525,9 +528,9 @@ def main():
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
if training_args.do_predict: if training_args.do_predict:
if "test" not in datasets: if "test" not in raw_datasets:
raise ValueError("--do_predict requires a test dataset") raise ValueError("--do_predict requires a test dataset")
predict_examples = datasets["test"] predict_examples = raw_datasets["test"]
if data_args.max_predict_samples is not None: if data_args.max_predict_samples is not None:
# We will select samples from the whole data # We will select samples from the whole data
predict_examples = predict_examples.select(range(data_args.max_predict_samples)) predict_examples = predict_examples.select(range(data_args.max_predict_samples))
@@ -566,7 +569,7 @@ def main():
start_n_top=model.config.start_n_top, start_n_top=model.config.start_n_top,
end_n_top=model.config.end_n_top, end_n_top=model.config.end_n_top,
output_dir=training_args.output_dir, output_dir=training_args.output_dir,
is_world_process_zero=trainer.is_world_process_zero(), log_level=log_level,
prefix=stage, prefix=stage,
) )
# Format the result to the format the metric expects. # Format the result to the format the metric expects.

View File

@@ -38,7 +38,7 @@ def postprocess_qa_predictions(
null_score_diff_threshold: float = 0.0, null_score_diff_threshold: float = 0.0,
output_dir: Optional[str] = None, output_dir: Optional[str] = None,
prefix: Optional[str] = None, prefix: Optional[str] = None,
is_world_process_zero: bool = True, log_level: Optional[int] = logging.WARNING,
): ):
""" """
Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
@@ -70,8 +70,8 @@ def postprocess_qa_predictions(
answers, are saved in `output_dir`. answers, are saved in `output_dir`.
prefix (:obj:`str`, `optional`): prefix (:obj:`str`, `optional`):
If provided, the dictionaries mentioned above are saved with `prefix` added to their names. If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`): log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
Whether this process is the main process or not (used to determine if logging/saves should be done). ``logging`` log level (e.g., ``logging.WARNING``)
""" """
assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)." assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)."
all_start_logits, all_end_logits = predictions all_start_logits, all_end_logits = predictions
@@ -91,7 +91,7 @@ def postprocess_qa_predictions(
scores_diff_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict()
# Logging. # Logging.
logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN) logger.setLevel(log_level)
logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
# Let's loop over all the examples! # Let's loop over all the examples!
@@ -250,7 +250,7 @@ def postprocess_qa_predictions_with_beam_search(
end_n_top: int = 5, end_n_top: int = 5,
output_dir: Optional[str] = None, output_dir: Optional[str] = None,
prefix: Optional[str] = None, prefix: Optional[str] = None,
is_world_process_zero: bool = True, log_level: Optional[int] = logging.WARNING,
): ):
""" """
Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the
@@ -280,8 +280,8 @@ def postprocess_qa_predictions_with_beam_search(
answers, are saved in `output_dir`. answers, are saved in `output_dir`.
prefix (:obj:`str`, `optional`): prefix (:obj:`str`, `optional`):
If provided, the dictionaries mentioned above are saved with `prefix` added to their names. If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`): log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
Whether this process is the main process or not (used to determine if logging/saves should be done). ``logging`` log level (e.g., ``logging.WARNING``)
""" """
assert len(predictions) == 5, "`predictions` should be a tuple with five elements." assert len(predictions) == 5, "`predictions` should be a tuple with five elements."
start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions
@@ -302,7 +302,7 @@ def postprocess_qa_predictions_with_beam_search(
scores_diff_json = collections.OrderedDict() if version_2_with_negative else None scores_diff_json = collections.OrderedDict() if version_2_with_negative else None
# Logging. # Logging.
logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN) logger.setLevel(log_level)
logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
# Let's loop over all the examples! # Let's loop over all the examples!

View File

@@ -24,6 +24,7 @@ import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional from typing import Optional
import datasets
import nltk # Here to have a nice missing dependency error message early on import nltk # Here to have a nice missing dependency error message early on
import numpy as np import numpy as np
from datasets import load_dataset, load_metric from datasets import load_dataset, load_metric
@@ -260,16 +261,18 @@ def main():
datefmt="%m/%d/%Y %H:%M:%S", datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)], handlers=[logging.StreamHandler(sys.stdout)],
) )
logger.setLevel(logging.INFO if training_args.should_log else logging.WARN) log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary: # Log on each process the small summary:
logger.warning( logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
) )
# Set the verbosity to info of the Transformers logger (on main process only):
if training_args.should_log:
transformers.utils.logging.set_verbosity_info()
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
if data_args.source_prefix is None and model_args.model_name_or_path in [ if data_args.source_prefix is None and model_args.model_name_or_path in [
@@ -313,7 +316,9 @@ def main():
# download the dataset. # download the dataset.
if data_args.dataset_name is not None: if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
)
else: else:
data_files = {} data_files = {}
if data_args.train_file is not None: if data_args.train_file is not None:
@@ -325,7 +330,7 @@ def main():
if data_args.test_file is not None: if data_args.test_file is not None:
data_files["test"] = data_args.test_file data_files["test"] = data_args.test_file
extension = data_args.test_file.split(".")[-1] extension = data_args.test_file.split(".")[-1]
datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html. # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -366,11 +371,11 @@ def main():
# Preprocessing the datasets. # Preprocessing the datasets.
# We need to tokenize inputs and targets. # We need to tokenize inputs and targets.
if training_args.do_train: if training_args.do_train:
column_names = datasets["train"].column_names column_names = raw_datasets["train"].column_names
elif training_args.do_eval: elif training_args.do_eval:
column_names = datasets["validation"].column_names column_names = raw_datasets["validation"].column_names
elif training_args.do_predict: elif training_args.do_predict:
column_names = datasets["test"].column_names column_names = raw_datasets["test"].column_names
else: else:
logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
return return
@@ -425,9 +430,9 @@ def main():
return model_inputs return model_inputs
if training_args.do_train: if training_args.do_train:
if "train" not in datasets: if "train" not in raw_datasets:
raise ValueError("--do_train requires a train dataset") raise ValueError("--do_train requires a train dataset")
train_dataset = datasets["train"] train_dataset = raw_datasets["train"]
if data_args.max_train_samples is not None: if data_args.max_train_samples is not None:
train_dataset = train_dataset.select(range(data_args.max_train_samples)) train_dataset = train_dataset.select(range(data_args.max_train_samples))
train_dataset = train_dataset.map( train_dataset = train_dataset.map(
@@ -441,9 +446,9 @@ def main():
if training_args.do_eval: if training_args.do_eval:
max_target_length = data_args.val_max_target_length max_target_length = data_args.val_max_target_length
if "validation" not in datasets: if "validation" not in raw_datasets:
raise ValueError("--do_eval requires a validation dataset") raise ValueError("--do_eval requires a validation dataset")
eval_dataset = datasets["validation"] eval_dataset = raw_datasets["validation"]
if data_args.max_eval_samples is not None: if data_args.max_eval_samples is not None:
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
eval_dataset = eval_dataset.map( eval_dataset = eval_dataset.map(
@@ -457,9 +462,9 @@ def main():
if training_args.do_predict: if training_args.do_predict:
max_target_length = data_args.val_max_target_length max_target_length = data_args.val_max_target_length
if "test" not in datasets: if "test" not in raw_datasets:
raise ValueError("--do_predict requires a test dataset") raise ValueError("--do_predict requires a test dataset")
predict_dataset = datasets["test"] predict_dataset = raw_datasets["test"]
if data_args.max_predict_samples is not None: if data_args.max_predict_samples is not None:
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
predict_dataset = predict_dataset.map( predict_dataset = predict_dataset.map(

View File

@@ -23,6 +23,7 @@ import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional from typing import Optional
import datasets
import numpy as np import numpy as np
from datasets import load_dataset, load_metric from datasets import load_dataset, load_metric
@@ -204,18 +205,19 @@ def main():
datefmt="%m/%d/%Y %H:%M:%S", datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)], handlers=[logging.StreamHandler(sys.stdout)],
) )
logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary: # Log on each process the small summary:
logger.warning( logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
) )
# Set the verbosity to info of the Transformers logger (on main process only):
if training_args.should_log:
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint. # Detecting last checkpoint.
@@ -250,10 +252,12 @@ def main():
# download the dataset. # download the dataset.
if data_args.task_name is not None: if data_args.task_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir) raw_datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir)
elif data_args.dataset_name is not None: elif data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
)
else: else:
# Loading a dataset from your local files. # Loading a dataset from your local files.
# CSV/JSON training and evaluation files are needed. # CSV/JSON training and evaluation files are needed.
@@ -277,10 +281,10 @@ def main():
if data_args.train_file.endswith(".csv"): if data_args.train_file.endswith(".csv"):
# Loading a dataset from local csv files # Loading a dataset from local csv files
datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) raw_datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir)
else: else:
# Loading a dataset from local json files # Loading a dataset from local json files
datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset at # See more about loading any type of standard or custom dataset at
# https://huggingface.co/docs/datasets/loading_datasets.html. # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -288,19 +292,19 @@ def main():
if data_args.task_name is not None: if data_args.task_name is not None:
is_regression = data_args.task_name == "stsb" is_regression = data_args.task_name == "stsb"
if not is_regression: if not is_regression:
label_list = datasets["train"].features["label"].names label_list = raw_datasets["train"].features["label"].names
num_labels = len(label_list) num_labels = len(label_list)
else: else:
num_labels = 1 num_labels = 1
else: else:
# Trying to have good defaults here, don't hesitate to tweak to your needs. # Trying to have good defaults here, don't hesitate to tweak to your needs.
is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"] is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"]
if is_regression: if is_regression:
num_labels = 1 num_labels = 1
else: else:
# A useful fast method: # A useful fast method:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
label_list = datasets["train"].unique("label") label_list = raw_datasets["train"].unique("label")
label_list.sort() # Let's sort it for determinism label_list.sort() # Let's sort it for determinism
num_labels = len(label_list) num_labels = len(label_list)
@@ -332,12 +336,12 @@ def main():
use_auth_token=True if model_args.use_auth_token else None, use_auth_token=True if model_args.use_auth_token else None,
) )
# Preprocessing the datasets # Preprocessing the raw_datasets
if data_args.task_name is not None: if data_args.task_name is not None:
sentence1_key, sentence2_key = task_to_keys[data_args.task_name] sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
else: else:
# Again, we try to have some nice defaults but don't hesitate to tweak to your use case. # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
non_label_column_names = [name for name in datasets["train"].column_names if name != "label"] non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"]
if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
sentence1_key, sentence2_key = "sentence1", "sentence2" sentence1_key, sentence2_key = "sentence1", "sentence2"
else: else:
@@ -396,30 +400,30 @@ def main():
result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
return result return result
datasets = datasets.map( raw_datasets = raw_datasets.map(
preprocess_function, preprocess_function,
batched=True, batched=True,
load_from_cache_file=not data_args.overwrite_cache, load_from_cache_file=not data_args.overwrite_cache,
desc="Running tokenizer on dataset", desc="Running tokenizer on dataset",
) )
if training_args.do_train: if training_args.do_train:
if "train" not in datasets: if "train" not in raw_datasets:
raise ValueError("--do_train requires a train dataset") raise ValueError("--do_train requires a train dataset")
train_dataset = datasets["train"] train_dataset = raw_datasets["train"]
if data_args.max_train_samples is not None: if data_args.max_train_samples is not None:
train_dataset = train_dataset.select(range(data_args.max_train_samples)) train_dataset = train_dataset.select(range(data_args.max_train_samples))
if training_args.do_eval: if training_args.do_eval:
if "validation" not in datasets and "validation_matched" not in datasets: if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
raise ValueError("--do_eval requires a validation dataset") raise ValueError("--do_eval requires a validation dataset")
eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
if data_args.max_eval_samples is not None: if data_args.max_eval_samples is not None:
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None: if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
if "test" not in datasets and "test_matched" not in datasets: if "test" not in raw_datasets and "test_matched" not in raw_datasets:
raise ValueError("--do_predict requires a test dataset") raise ValueError("--do_predict requires a test dataset")
predict_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"] predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"]
if data_args.max_predict_samples is not None: if data_args.max_predict_samples is not None:
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
@@ -497,7 +501,7 @@ def main():
eval_datasets = [eval_dataset] eval_datasets = [eval_dataset]
if data_args.task_name == "mnli": if data_args.task_name == "mnli":
tasks.append("mnli-mm") tasks.append("mnli-mm")
eval_datasets.append(datasets["validation_mismatched"]) eval_datasets.append(raw_datasets["validation_mismatched"])
for eval_dataset, task in zip(eval_datasets, tasks): for eval_dataset, task in zip(eval_datasets, tasks):
metrics = trainer.evaluate(eval_dataset=eval_dataset) metrics = trainer.evaluate(eval_dataset=eval_dataset)
@@ -518,7 +522,7 @@ def main():
predict_datasets = [predict_dataset] predict_datasets = [predict_dataset]
if data_args.task_name == "mnli": if data_args.task_name == "mnli":
tasks.append("mnli-mm") tasks.append("mnli-mm")
predict_datasets.append(datasets["test_mismatched"]) predict_datasets.append(raw_datasets["test_mismatched"])
for predict_dataset, task in zip(predict_datasets, tasks): for predict_dataset, task in zip(predict_datasets, tasks):
# Removing the `label` columns because it contains -1 and Trainer won't like that. # Removing the `label` columns because it contains -1 and Trainer won't like that.

View File

@@ -24,6 +24,7 @@ import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional from typing import Optional
import datasets
import numpy as np import numpy as np
from datasets import load_dataset, load_metric from datasets import load_dataset, load_metric
@@ -174,19 +175,19 @@ def main():
datefmt="%m/%d/%Y %H:%M:%S", datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)], handlers=[logging.StreamHandler(sys.stdout)],
) )
logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary: # Log on each process the small summary:
logger.warning( logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
) )
# Set the verbosity to info of the Transformers logger (on main process only):
if training_args.should_log:
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint. # Detecting last checkpoint.

View File

@@ -25,6 +25,7 @@ import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional from typing import Optional
import datasets
import numpy as np import numpy as np
from datasets import ClassLabel, load_dataset, load_metric from datasets import ClassLabel, load_dataset, load_metric
@@ -195,18 +196,19 @@ def main():
datefmt="%m/%d/%Y %H:%M:%S", datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)], handlers=[logging.StreamHandler(sys.stdout)],
) )
logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary: # Log on each process the small summary:
logger.warning( logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
) )
# Set the verbosity to info of the Transformers logger (on main process only):
if training_args.should_log:
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint. # Detecting last checkpoint.
@@ -238,7 +240,9 @@ def main():
# download the dataset. # download the dataset.
if data_args.dataset_name is not None: if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
)
else: else:
data_files = {} data_files = {}
if data_args.train_file is not None: if data_args.train_file is not None:
@@ -248,16 +252,16 @@ def main():
if data_args.test_file is not None: if data_args.test_file is not None:
data_files["test"] = data_args.test_file data_files["test"] = data_args.test_file
extension = data_args.train_file.split(".")[-1] extension = data_args.train_file.split(".")[-1]
datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html. # https://huggingface.co/docs/datasets/loading_datasets.html.
if training_args.do_train: if training_args.do_train:
column_names = datasets["train"].column_names column_names = raw_datasets["train"].column_names
features = datasets["train"].features features = raw_datasets["train"].features
else: else:
column_names = datasets["validation"].column_names column_names = raw_datasets["validation"].column_names
features = datasets["validation"].features features = raw_datasets["validation"].features
if data_args.text_column_name is not None: if data_args.text_column_name is not None:
text_column_name = data_args.text_column_name text_column_name = data_args.text_column_name
@@ -288,7 +292,7 @@ def main():
# No need to convert the labels since they are already ints. # No need to convert the labels since they are already ints.
label_to_id = {i: i for i in range(len(label_list))} label_to_id = {i: i for i in range(len(label_list))}
else: else:
label_list = get_label_list(datasets["train"][label_column_name]) label_list = get_label_list(raw_datasets["train"][label_column_name])
label_to_id = {l: i for i, l in enumerate(label_list)} label_to_id = {l: i for i, l in enumerate(label_list)}
num_labels = len(label_list) num_labels = len(label_list)
@@ -381,9 +385,9 @@ def main():
return tokenized_inputs return tokenized_inputs
if training_args.do_train: if training_args.do_train:
if "train" not in datasets: if "train" not in raw_datasets:
raise ValueError("--do_train requires a train dataset") raise ValueError("--do_train requires a train dataset")
train_dataset = datasets["train"] train_dataset = raw_datasets["train"]
if data_args.max_train_samples is not None: if data_args.max_train_samples is not None:
train_dataset = train_dataset.select(range(data_args.max_train_samples)) train_dataset = train_dataset.select(range(data_args.max_train_samples))
train_dataset = train_dataset.map( train_dataset = train_dataset.map(
@@ -395,9 +399,9 @@ def main():
) )
if training_args.do_eval: if training_args.do_eval:
if "validation" not in datasets: if "validation" not in raw_datasets:
raise ValueError("--do_eval requires a validation dataset") raise ValueError("--do_eval requires a validation dataset")
eval_dataset = datasets["validation"] eval_dataset = raw_datasets["validation"]
if data_args.max_eval_samples is not None: if data_args.max_eval_samples is not None:
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
eval_dataset = eval_dataset.map( eval_dataset = eval_dataset.map(
@@ -409,9 +413,9 @@ def main():
) )
if training_args.do_predict: if training_args.do_predict:
if "test" not in datasets: if "test" not in raw_datasets:
raise ValueError("--do_predict requires a test dataset") raise ValueError("--do_predict requires a test dataset")
predict_dataset = datasets["test"] predict_dataset = raw_datasets["test"]
if data_args.max_predict_samples is not None: if data_args.max_predict_samples is not None:
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
predict_dataset = predict_dataset.map( predict_dataset = predict_dataset.map(

View File

@@ -344,7 +344,7 @@ def main():
model.resize_token_embeddings(len(tokenizer)) model.resize_token_embeddings(len(tokenizer))
# Preprocessing the raw_datasets. # Preprocessing the datasets.
# First we tokenize all the texts. # First we tokenize all the texts.
padding = "max_length" if args.pad_to_max_length else False padding = "max_length" if args.pad_to_max_length else False

View File

@@ -250,6 +250,8 @@ def main():
logger.setLevel(log_level) logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level) datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary: # Log on each process the small summary:
logger.warning( logger.warning(