diff --git a/.gitignore b/.gitignore index d4d0027953..5a9bc779b8 100644 --- a/.gitignore +++ b/.gitignore @@ -130,6 +130,7 @@ proc_data # examples runs +/runs_old examples/runs # data diff --git a/README.md b/README.md index f949bc18b7..b2c8cafdab 100644 --- a/README.md +++ b/README.md @@ -306,8 +306,9 @@ setup your environment to run the examples. The library comprises several example scripts with SOTA performances for NLU and NLG tasks: -- `run_glue.py`: an example fine-tuning Bert, XLNet and XLM on nine different GLUE tasks (*sequence-level classification*) -- `run_squad.py`: an example fine-tuning Bert, XLNet and XLM on the question answering dataset SQuAD 2.0 (*token-level classification*) +- `run_glue.py`: an example fine-tuning sequence classification models on nine different GLUE tasks (*sequence-level classification*) +- `run_squad.py`: an example fine-tuning question answering models on the question answering dataset SQuAD 2.0 (*token-level classification*) +- `run_ner.py`: an example fine-tuning token classification models on named entity recognition (*token-level classification*) - `run_generation.py`: an example using GPT, GPT-2, CTRL, Transformer-XL and XLNet for conditional language generation - other model-specific examples (see the documentation). @@ -317,7 +318,7 @@ Here are three quick usage examples for these scripts: The [General Language Understanding Evaluation (GLUE) benchmark](https://gluebenchmark.com/) is a collection of nine sentence- or sentence-pair language understanding tasks for evaluating and analyzing natural language understanding systems. -Before running anyone of these GLUE tasks you should download the +Before running any of these GLUE tasks you should download the [GLUE data](https://gluebenchmark.com/tasks) by running [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) and unpack it to some directory `$GLUE_DIR`. 
@@ -333,7 +334,6 @@ export GLUE_DIR=/path/to/glue export TASK_NAME=MRPC python ./examples/run_glue.py \ - --model_type bert \ --model_name_or_path bert-base-uncased \ --task_name $TASK_NAME \ --do_train \ @@ -360,7 +360,6 @@ Parallel training is a simple way to use several GPUs (but is slower and less fl export GLUE_DIR=/path/to/glue python ./examples/run_glue.py \ - --model_type xlnet \ --model_name_or_path xlnet-large-cased \ --do_train \ --do_eval \ @@ -386,7 +385,6 @@ This example code fine-tunes the Bert Whole Word Masking model on the Microsoft ```bash python -m torch.distributed.launch --nproc_per_node 8 ./examples/run_glue.py \ - --model_type bert \ --model_name_or_path bert-large-uncased-whole-word-masking \ --task_name MRPC \ --do_train \ diff --git a/examples/README.md b/examples/README.md index 39614bd42e..6903926376 100644 --- a/examples/README.md +++ b/examples/README.md @@ -246,7 +246,6 @@ and unpack it to some directory `$GLUE_DIR`. export GLUE_DIR=/path/to/glue python run_glue.py \ - --model_type bert \ --model_name_or_path bert-base-cased \ --task_name MRPC \ --do_train \ @@ -272,7 +271,6 @@ Using Apex and 16 bit precision, the fine-tuning on MRPC only takes 27 seconds. 
export GLUE_DIR=/path/to/glue python run_glue.py \ - --model_type bert \ --model_name_or_path bert-base-cased \ --task_name MRPC \ --do_train \ @@ -296,7 +294,6 @@ export GLUE_DIR=/path/to/glue python -m torch.distributed.launch \ --nproc_per_node 8 run_glue.py \ - --model_type bert \ --model_name_or_path bert-base-cased \ --task_name MRPC \ --do_train \ @@ -329,7 +326,6 @@ export GLUE_DIR=/path/to/glue python -m torch.distributed.launch \ --nproc_per_node 8 run_glue.py \ - --model_type bert \ --model_name_or_path bert-base-cased \ --task_name mnli \ --do_train \ @@ -369,7 +365,6 @@ Download [swag](https://github.com/rowanz/swagaf/tree/master/data) data #training on 4 tesla V100(16GB) GPUS export SWAG_DIR=/path/to/swag_data_dir python ./examples/run_multiple_choice.py \ ---model_type roberta \ --task_name swag \ --model_name_or_path roberta-base \ --do_train \ diff --git a/examples/glue/run_pl.sh b/examples/glue/run_pl.sh index 23c5089f28..3801fcaec5 100755 --- a/examples/glue/run_pl.sh +++ b/examples/glue/run_pl.sh @@ -11,7 +11,6 @@ export DATA_DIR=./glue_data/MRPC/ export MAX_LENGTH=128 export LEARNING_RATE=2e-5 export BERT_MODEL=bert-base-cased -export MODEL_TYPE=bert export BATCH_SIZE=32 export NUM_EPOCHS=3 export SEED=2 @@ -25,7 +24,6 @@ mkdir -p $OUTPUT_DIR export PYTHONPATH="../":"${PYTHONPATH}" python3 run_pl_glue.py --data_dir $DATA_DIR \ ---model_type $MODEL_TYPE \ --task $TASK \ --model_name_or_path $BERT_MODEL \ --output_dir $OUTPUT_DIR \ diff --git a/examples/glue/run_pl_glue.py b/examples/glue/run_pl_glue.py index 0ed00821b3..d5cdbee33a 100644 --- a/examples/glue/run_pl_glue.py +++ b/examples/glue/run_pl_glue.py @@ -35,8 +35,8 @@ class GLUETransformer(BaseTransformer): def training_step(self, batch, batch_idx): inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} - if self.hparams.model_type != "distilbert": - inputs["token_type_ids"] = batch[2] if self.hparams.model_type in ["bert", "xlnet", "albert"] else None + if 
self.config.model_type != "distilbert": + inputs["token_type_ids"] = batch[2] if self.config.model_type in ["bert", "xlnet", "albert"] else None outputs = self(**inputs) loss = outputs[0] @@ -95,8 +95,8 @@ class GLUETransformer(BaseTransformer): def validation_step(self, batch, batch_idx): inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} - if self.hparams.model_type != "distilbert": - inputs["token_type_ids"] = batch[2] if self.hparams.model_type in ["bert", "xlnet", "albert"] else None + if self.config.model_type != "distilbert": + inputs["token_type_ids"] = batch[2] if self.config.model_type in ["bert", "xlnet", "albert"] else None outputs = self(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -179,7 +179,7 @@ if __name__ == "__main__": # If output_dir not provided, a folder will be generated in pwd if args.output_dir is None: - args.output_dir = os.path.join("./results", f"{args.task}_{args.model_type}_{time.strftime('%Y%m%d_%H%M%S')}",) + args.output_dir = os.path.join("./results", f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",) os.makedirs(args.output_dir) model = GLUETransformer(args) diff --git a/examples/ner/README.md b/examples/ner/README.md index 6eeb083de7..0391d5499b 100644 --- a/examples/ner/README.md +++ b/examples/ner/README.md @@ -64,7 +64,6 @@ To start training, just run: ```bash python3 run_ner.py --data_dir ./ \ ---model_type bert \ --labels ./labels.txt \ --model_name_or_path $BERT_MODEL \ --output_dir $OUTPUT_DIR \ @@ -125,7 +124,6 @@ To start training, just run: ```bash python3 run_tf_ner.py --data_dir ./ \ ---model_type bert \ --labels ./labels.txt \ --model_name_or_path $BERT_MODEL \ --output_dir $OUTPUT_DIR \ diff --git a/examples/ner/run.sh b/examples/ner/run.sh index 7d3e4a14ff..5691f95f57 100644 --- a/examples/ner/run.sh +++ b/examples/ner/run.sh @@ -4,7 +4,7 @@ curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attre | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp curl 
-L 'https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1' \ | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp - wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py" +wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py" export MAX_LENGTH=128 export BERT_MODEL=bert-base-multilingual-cased python3 preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt @@ -17,8 +17,8 @@ export NUM_EPOCHS=3 export SAVE_STEPS=750 export SEED=1 -python3 run_ner.py --data_dir ./ \ ---model_type bert \ +python3 run_ner.py \ +--data_dir . \ --labels ./labels.txt \ --model_name_or_path $BERT_MODEL \ --output_dir $OUTPUT_DIR \ diff --git a/examples/ner/run_ner.py b/examples/ner/run_ner.py index e280037ad8..ef22683b9a 100644 --- a/examples/ner/run_ner.py +++ b/examples/ner/run_ner.py @@ -16,656 +16,264 @@ """ Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert or Roberta). 
""" -import argparse -import glob import logging import os -import random +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Tuple import numpy as np -import torch from seqeval.metrics import f1_score, precision_score, recall_score -from torch.nn import CrossEntropyLoss -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset -from torch.utils.data.distributed import DistributedSampler -from tqdm import tqdm, trange +from torch import nn from transformers import ( - MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, - WEIGHTS_NAME, - AdamW, AutoConfig, AutoModelForTokenClassification, AutoTokenizer, - get_linear_schedule_with_warmup, + EvalPrediction, + HfArgumentParser, + Trainer, + TrainingArguments, + set_seed, ) -from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file - - -try: - from torch.utils.tensorboard import SummaryWriter -except ImportError: - from tensorboardX import SummaryWriter +from utils_ner import NerDataset, Split, get_labels logger = logging.getLogger(__name__) -MODEL_CONFIG_CLASSES = list(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in MODEL_CONFIG_CLASSES), ()) +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
+ """ -TOKENIZER_ARGS = ["do_lower_case", "strip_accents", "keep_accents", "use_fast"] + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."}) + # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script, + # or just modify its tokenizer_config.json. + cache_dir: Optional[str] = field( + default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + ) -def set_seed(args): - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - if args.n_gpu > 0: - torch.cuda.manual_seed_all(args.seed) +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ - -def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): - """ Train the model """ - if args.local_rank in [-1, 0]: - tb_writer = SummaryWriter() - - args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) - train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) - train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) - - if args.max_steps > 0: - t_total = args.max_steps - args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 - else: - t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs - - # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ["bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": args.weight_decay, + data_dir: str = field( + metadata={"help": "The input data dir. Should contain the .txt files for a CoNLL-2003-formatted task."} + ) + labels: Optional[str] = field( + metadata={"help": "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."} + ) + max_seq_length: int = field( + default=128, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." 
}, - {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) - - # Check if saved optimizer or scheduler states exist - if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( - os.path.join(args.model_name_or_path, "scheduler.pt") - ): - # Load in optimizer and scheduler states - optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) - scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) - - if args.fp16: - try: - from apex import amp - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") - model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) - - # multi-gpu training (should be after apex fp16 initialization) - if args.n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Distributed training (should be after apex fp16 initialization) - if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel( - model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True - ) - - # Train! - logger.info("***** Running training *****") - logger.info(" Num examples = %d", len(train_dataset)) - logger.info(" Num Epochs = %d", args.num_train_epochs) - logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info( - " Total train batch size (w. 
parallel, distributed & accumulation) = %d", - args.train_batch_size - * args.gradient_accumulation_steps - * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} ) - logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) - logger.info(" Total optimization steps = %d", t_total) - - global_step = 0 - epochs_trained = 0 - steps_trained_in_current_epoch = 0 - # Check if continuing training from a checkpoint - if os.path.exists(args.model_name_or_path): - # set global_step to gobal_step of last saved checkpoint from model path - try: - global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) - except ValueError: - global_step = 0 - epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) - steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) - - logger.info(" Continuing training from checkpoint, will skip to saved global_step") - logger.info(" Continuing training from epoch %d", epochs_trained) - logger.info(" Continuing training from global step %d", global_step) - logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) - - tr_loss, logging_loss = 0.0, 0.0 - model.zero_grad() - train_iterator = trange( - epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] - ) - set_seed(args) # Added here for reproductibility - for _ in train_iterator: - epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) - for step, batch in enumerate(epoch_iterator): - - # Skip past any already trained steps if resuming training - if steps_trained_in_current_epoch > 0: - steps_trained_in_current_epoch -= 1 - continue - - model.train() - batch = tuple(t.to(args.device) for t in batch) - inputs = 
{"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} - if args.model_type != "distilbert": - inputs["token_type_ids"] = ( - batch[2] if args.model_type in ["bert", "xlnet"] else None - ) # XLM and RoBERTa don"t use segment_ids - - outputs = model(**inputs) - loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) - - if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - - if args.fp16: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - - tr_loss += loss.item() - if (step + 1) % args.gradient_accumulation_steps == 0: - if args.fp16: - torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) - else: - torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) - - optimizer.step() - scheduler.step() # Update learning rate schedule - model.zero_grad() - global_step += 1 - - if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: - # Log metrics - if ( - args.local_rank == -1 and args.evaluate_during_training - ): # Only evaluate when single GPU otherwise metrics may not average well - results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev") - for key, value in results.items(): - tb_writer.add_scalar("eval_{}".format(key), value, global_step) - tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) - tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) - logging_loss = tr_loss - - if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: - # Save model checkpoint - output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - model_to_save = ( - model.module if 
hasattr(model, "module") else model - ) # Take care of distributed/parallel training - model_to_save.save_pretrained(output_dir) - tokenizer.save_pretrained(output_dir) - - torch.save(args, os.path.join(output_dir, "training_args.bin")) - logger.info("Saving model checkpoint to %s", output_dir) - - torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) - torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) - logger.info("Saving optimizer and scheduler states to %s", output_dir) - - if args.max_steps > 0 and global_step > args.max_steps: - epoch_iterator.close() - break - if args.max_steps > 0 and global_step > args.max_steps: - train_iterator.close() - break - - if args.local_rank in [-1, 0]: - tb_writer.close() - - return global_step, tr_loss / global_step - - -def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""): - eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode) - - args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) - - # multi-gpu evaluate - if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): - model = torch.nn.DataParallel(model) - - # Eval! 
- logger.info("***** Running evaluation %s *****", prefix) - logger.info(" Num examples = %d", len(eval_dataset)) - logger.info(" Batch size = %d", args.eval_batch_size) - eval_loss = 0.0 - nb_eval_steps = 0 - preds = None - out_label_ids = None - model.eval() - for batch in tqdm(eval_dataloader, desc="Evaluating"): - batch = tuple(t.to(args.device) for t in batch) - - with torch.no_grad(): - inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} - if args.model_type != "distilbert": - inputs["token_type_ids"] = ( - batch[2] if args.model_type in ["bert", "xlnet"] else None - ) # XLM and RoBERTa don"t use segment_ids - outputs = model(**inputs) - tmp_eval_loss, logits = outputs[:2] - - if args.n_gpu > 1: - tmp_eval_loss = tmp_eval_loss.mean() # mean() to average on multi-gpu parallel evaluating - - eval_loss += tmp_eval_loss.item() - nb_eval_steps += 1 - if preds is None: - preds = logits.detach().cpu().numpy() - out_label_ids = inputs["labels"].detach().cpu().numpy() - else: - preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) - - eval_loss = eval_loss / nb_eval_steps - preds = np.argmax(preds, axis=2) - - label_map = {i: label for i, label in enumerate(labels)} - - out_label_list = [[] for _ in range(out_label_ids.shape[0])] - preds_list = [[] for _ in range(out_label_ids.shape[0])] - - for i in range(out_label_ids.shape[0]): - for j in range(out_label_ids.shape[1]): - if out_label_ids[i, j] != pad_token_label_id: - out_label_list[i].append(label_map[out_label_ids[i][j]]) - preds_list[i].append(label_map[preds[i][j]]) - - results = { - "loss": eval_loss, - "precision": precision_score(out_label_list, preds_list), - "recall": recall_score(out_label_list, preds_list), - "f1": f1_score(out_label_list, preds_list), - } - - logger.info("***** Eval results %s *****", prefix) - for key in sorted(results.keys()): - logger.info(" %s = %s", 
key, str(results[key])) - - return results, preds_list - - -def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode): - if args.local_rank not in [-1, 0] and not evaluate: - torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - - # Load data features from cache or dataset file - cached_features_file = os.path.join( - args.data_dir, - "cached_{}_{}_{}".format( - mode, list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length) - ), - ) - if os.path.exists(cached_features_file) and not args.overwrite_cache: - logger.info("Loading features from cached file %s", cached_features_file) - features = torch.load(cached_features_file) - else: - logger.info("Creating features from dataset file at %s", args.data_dir) - examples = read_examples_from_file(args.data_dir, mode) - features = convert_examples_to_features( - examples, - labels, - args.max_seq_length, - tokenizer, - cls_token_at_end=bool(args.model_type in ["xlnet"]), - # xlnet has a cls token at the end - cls_token=tokenizer.cls_token, - cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0, - sep_token=tokenizer.sep_token, - sep_token_extra=bool(args.model_type in ["roberta"]), - # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 - pad_on_left=bool(args.model_type in ["xlnet"]), - # pad on the left for xlnet - pad_token=tokenizer.pad_token_id, - pad_token_segment_id=tokenizer.pad_token_type_id, - pad_token_label_id=pad_token_label_id, - ) - if args.local_rank in [-1, 0]: - logger.info("Saving features into cached file %s", cached_features_file) - torch.save(features, cached_features_file) - - if args.local_rank == 0 and not evaluate: - torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - - # Convert to Tensors and build dataset - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) - all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long) - - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - return dataset def main(): - parser = argparse.ArgumentParser() + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. - # Required parameters - parser.add_argument( - "--data_dir", - default=None, - type=str, - required=True, - help="The input data dir. 
Should contain the training files for the CoNLL-2003 NER task.", - ) - parser.add_argument( - "--model_type", - default=None, - type=str, - required=True, - help="Model type selected in the list: " + ", ".join(MODEL_TYPES), - ) - parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), - ) - parser.add_argument( - "--output_dir", - default=None, - type=str, - required=True, - help="The output directory where the model predictions and checkpoints will be written.", - ) - - # Other parameters - parser.add_argument( - "--labels", - default="", - type=str, - help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.", - ) - parser.add_argument( - "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" - ) - parser.add_argument( - "--tokenizer_name", - default="", - type=str, - help="Pretrained tokenizer name or path if not the same as model_name", - ) - parser.add_argument( - "--cache_dir", - default="", - type=str, - help="Where do you want to store the pre-trained models downloaded from s3", - ) - parser.add_argument( - "--max_seq_length", - default=128, - type=int, - help="The maximum total input sequence length after tokenization. 
Sequences longer " - "than this will be truncated, sequences shorter will be padded.", - ) - parser.add_argument("--do_train", action="store_true", help="Whether to run training.") - parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") - parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") - parser.add_argument( - "--evaluate_during_training", - action="store_true", - help="Whether to run evaluation during training at each logging step.", - ) - parser.add_argument( - "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." - ) - parser.add_argument( - "--keep_accents", action="store_const", const=True, help="Set this flag if model is trained with accents." - ) - parser.add_argument( - "--strip_accents", action="store_const", const=True, help="Set this flag if model is trained without accents." - ) - parser.add_argument("--use_fast", action="store_const", const=True, help="Set this flag to use fast tokenization.") - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") - parser.add_argument( - "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." 
- ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument( - "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." - ) - parser.add_argument( - "--max_steps", - default=-1, - type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.", - ) - parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") - - parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") - parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") - parser.add_argument( - "--eval_all_checkpoints", - action="store_true", - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", - ) - parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") - parser.add_argument( - "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" - ) - parser.add_argument( - "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" - ) - parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - - parser.add_argument( - "--fp16", - action="store_true", - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", - ) - 
parser.add_argument( - "--fp16_opt_level", - type=str, - default="O1", - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html", - ) - parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") - parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") - parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") - args = parser.parse_args() + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() if ( - os.path.exists(args.output_dir) - and os.listdir(args.output_dir) - and args.do_train - and not args.overwrite_output_dir + os.path.exists(training_args.output_dir) + and os.listdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir ): raise ValueError( - "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( - args.output_dir - ) + f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
) - # Setup distant debugging if needed - if args.server_ip and args.server_port: - # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script - import ptvsd - - print("Waiting for debugger attach") - ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) - ptvsd.wait_for_attach() - - # Setup CUDA, GPU & distributed training - if args.local_rank == -1 or args.no_cuda: - device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() - else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs - torch.cuda.set_device(args.local_rank) - device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend="nccl") - args.n_gpu = 1 - args.device = device - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, - device, - args.n_gpu, - bool(args.local_rank != -1), - args.fp16, + training_args.local_rank, + training_args.device, + training_args.n_gpu, + bool(training_args.local_rank != -1), + training_args.fp16, ) + logger.info("Training/evaluation parameters %s", training_args) # Set seed - set_seed(args) + set_seed(training_args.seed) # Prepare CONLL-2003 task - labels = get_labels(args.labels) + labels = get_labels(data_args.labels) + label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)} num_labels = len(labels) - # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later - pad_token_label_id = CrossEntropyLoss().ignore_index # Load 
pretrained model and tokenizer - if args.local_rank not in [-1, 0]: - torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. - args.model_type = args.model_type.lower() config = AutoConfig.from_pretrained( - args.config_name if args.config_name else args.model_name_or_path, + model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, - id2label={str(i): label for i, label in enumerate(labels)}, + id2label=label_map, label2id={label: i for i, label in enumerate(labels)}, - cache_dir=args.cache_dir if args.cache_dir else None, + cache_dir=model_args.cache_dir, ) - tokenizer_args = {k: v for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS} - logger.info("Tokenizer arguments: %s", tokenizer_args) tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - cache_dir=args.cache_dir if args.cache_dir else None, - **tokenizer_args, + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast, ) model = AutoModelForTokenClassification.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, - cache_dir=args.cache_dir if args.cache_dir else None, + cache_dir=model_args.cache_dir, ) - if args.local_rank == 0: - torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab + # Get datasets + train_dataset = ( + NerDataset( + data_dir=data_args.data_dir, + tokenizer=tokenizer, + labels=labels, + model_type=config.model_type, + 
max_seq_length=data_args.max_seq_length, + overwrite_cache=data_args.overwrite_cache, + mode=Split.train, + local_rank=training_args.local_rank, + ) + if training_args.do_train + else None + ) + eval_dataset = ( + NerDataset( + data_dir=data_args.data_dir, + tokenizer=tokenizer, + labels=labels, + model_type=config.model_type, + max_seq_length=data_args.max_seq_length, + overwrite_cache=data_args.overwrite_cache, + mode=Split.dev, + local_rank=training_args.local_rank, + ) + if training_args.do_eval + else None + ) - model.to(args.device) + def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]: + preds = np.argmax(predictions, axis=2) - logger.info("Training/evaluation parameters %s", args) + batch_size, seq_len = preds.shape + + out_label_list = [[] for _ in range(batch_size)] + preds_list = [[] for _ in range(batch_size)] + + for i in range(batch_size): + for j in range(seq_len): + if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index: + out_label_list[i].append(label_map[label_ids[i][j]]) + preds_list[i].append(label_map[preds[i][j]]) + + return preds_list, out_label_list + + def compute_metrics(p: EvalPrediction) -> Dict: + preds_list, out_label_list = align_predictions(p.predictions, p.label_ids) + return { + "precision": precision_score(out_label_list, preds_list), + "recall": recall_score(out_label_list, preds_list), + "f1": f1_score(out_label_list, preds_list), + } + + # Initialize our Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + compute_metrics=compute_metrics, + ) # Training - if args.do_train: - train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train") - global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id) - logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - - # Saving best-practices: if you use defaults names for 
the model, you can reload it using from_pretrained() - if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): - # Create output directory if needed - if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: - os.makedirs(args.output_dir) - - logger.info("Saving model checkpoint to %s", args.output_dir) - # Save a trained model, configuration and tokenizer using `save_pretrained()`. - # They can then be reloaded using `from_pretrained()` - model_to_save = ( - model.module if hasattr(model, "module") else model - ) # Take care of distributed/parallel training - model_to_save.save_pretrained(args.output_dir) - tokenizer.save_pretrained(args.output_dir) - - # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, "training_args.bin")) + if training_args.do_train: + trainer.train( + model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None + ) # Evaluation results = {} - if args.do_eval and args.local_rank in [-1, 0]: - tokenizer = AutoTokenizer.from_pretrained(args.output_dir, **tokenizer_args) - checkpoints = [args.output_dir] - if args.eval_all_checkpoints: - checkpoints = list( - os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) - ) - logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging - logger.info("Evaluate the following checkpoints: %s", checkpoints) - for checkpoint in checkpoints: - global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" - model = AutoModelForTokenClassification.from_pretrained(checkpoint) - model.to(args.device) - result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step) - if global_step: - result = {"{}_{}".format(global_step, k): v for k, v in result.items()} - results.update(result) - output_eval_file = os.path.join(args.output_dir, 
"eval_results.txt") - with open(output_eval_file, "w") as writer: - for key in sorted(results.keys()): - writer.write("{} = {}\n".format(key, str(results[key]))) + if training_args.do_eval and training_args.local_rank in [-1, 0]: + logger.info("*** Evaluate ***") - if args.do_predict and args.local_rank in [-1, 0]: - tokenizer = AutoTokenizer.from_pretrained(args.output_dir, **tokenizer_args) - model = AutoModelForTokenClassification.from_pretrained(args.output_dir) - model.to(args.device) - result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test") - # Save results - output_test_results_file = os.path.join(args.output_dir, "test_results.txt") + result = trainer.evaluate() + + output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results *****") + for key, value in result.items(): + logger.info(" %s = %s", key, value) + writer.write("%s = %s\n" % (key, value)) + + results.update(result) + + # Predict + if training_args.do_predict and training_args.local_rank in [-1, 0]: + test_dataset = NerDataset( + data_dir=data_args.data_dir, + tokenizer=tokenizer, + labels=labels, + model_type=config.model_type, + max_seq_length=data_args.max_seq_length, + overwrite_cache=data_args.overwrite_cache, + mode=Split.test, + local_rank=training_args.local_rank, + ) + + predictions, label_ids, metrics = trainer.predict(test_dataset) + preds_list, _ = align_predictions(predictions, label_ids) + + output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt") with open(output_test_results_file, "w") as writer: - for key in sorted(result.keys()): - writer.write("{} = {}\n".format(key, str(result[key]))) + for key, value in metrics.items(): + logger.info(" %s = %s", key, value) + writer.write("%s = %s\n" % (key, value)) + # Save predictions - output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt") + 
output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt") with open(output_test_predictions_file, "w") as writer: - with open(os.path.join(args.data_dir, "test.txt"), "r") as f: + with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f: example_id = 0 for line in f: if line.startswith("-DOCSTART-") or line == "" or line == "\n": writer.write(line) - if not predictions[example_id]: + if not preds_list[example_id]: example_id += 1 - elif predictions[example_id]: - output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n" + elif preds_list[example_id]: + output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n" writer.write(output_line) else: logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]) diff --git a/examples/ner/run_pl.sh b/examples/ner/run_pl.sh index bc774e3903..5a863be22d 100755 --- a/examples/ner/run_pl.sh +++ b/examples/ner/run_pl.sh @@ -11,7 +11,7 @@ curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attre | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1' \ | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp - wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py" +wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py" export MAX_LENGTH=128 export BERT_MODEL=bert-base-multilingual-cased python3 preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt diff --git a/examples/ner/run_pl_ner.py b/examples/ner/run_pl_ner.py index 6b84697891..24cecf7efc 100644 --- a/examples/ner/run_pl_ner.py +++ b/examples/ner/run_pl_ner.py @@ -27,7 +27,7 @@ class NERTransformer(BaseTransformer): self.labels = get_labels(hparams.labels) num_labels = len(self.labels) self.pad_token_label_id = CrossEntropyLoss().ignore_index - 
super(NERTransformer, self).__init__(hparams, num_labels, self.mode) + super().__init__(hparams, num_labels, self.mode) def forward(self, **inputs): return self.model(**inputs) @@ -35,10 +35,10 @@ class NERTransformer(BaseTransformer): def training_step(self, batch, batch_num): "Compute loss and log." inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} - if self.hparams.model_type != "distilbert": + if self.config.model_type != "distilbert": inputs["token_type_ids"] = ( - batch[2] if self.hparams.model_type in ["bert", "xlnet"] else None - ) # XLM and RoBERTa don"t use segment_ids + batch[2] if self.config.model_type in ["bert", "xlnet"] else None + ) # XLM and RoBERTa don"t use token_type_ids outputs = self(**inputs) loss = outputs[0] @@ -58,12 +58,12 @@ class NERTransformer(BaseTransformer): self.labels, args.max_seq_length, self.tokenizer, - cls_token_at_end=bool(args.model_type in ["xlnet"]), + cls_token_at_end=bool(self.config.model_type in ["xlnet"]), cls_token=self.tokenizer.cls_token, - cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0, + cls_token_segment_id=2 if self.config.model_type in ["xlnet"] else 0, sep_token=self.tokenizer.sep_token, - sep_token_extra=bool(args.model_type in ["roberta"]), - pad_on_left=bool(args.model_type in ["xlnet"]), + sep_token_extra=bool(self.config.model_type in ["roberta"]), + pad_on_left=bool(self.config.model_type in ["xlnet"]), pad_token=self.tokenizer.pad_token_id, pad_token_segment_id=self.tokenizer.pad_token_type_id, pad_token_label_id=self.pad_token_label_id, @@ -77,21 +77,25 @@ class NERTransformer(BaseTransformer): logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in features], 
dtype=torch.long) + all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + if features[0].token_type_ids is not None: + all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + else: + all_token_type_ids = torch.tensor([0 for f in features], dtype=torch.long) + # HACK(we will not use this anymore soon) all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long) return DataLoader( - TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids), batch_size=batch_size + TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_label_ids), batch_size=batch_size ) def validation_step(self, batch, batch_nb): "Compute validation" inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} - if self.hparams.model_type != "distilbert": + if self.config.model_type != "distilbert": inputs["token_type_ids"] = ( - batch[2] if self.hparams.model_type in ["bert", "xlnet"] else None - ) # XLM and RoBERTa don"t use segment_ids + batch[2] if self.config.model_type in ["bert", "xlnet"] else None + ) # XLM and RoBERTa don"t use token_type_ids outputs = self(**inputs) tmp_eval_loss, logits = outputs[:2] preds = logits.detach().cpu().numpy() diff --git a/examples/ner/run_tf_ner.py b/examples/ner/run_tf_ner.py index cc76989cd7..f09fe56f9f 100644 --- a/examples/ner/run_tf_ner.py +++ b/examples/ner/run_tf_ner.py @@ -9,6 +9,7 @@ import re import numpy as np import tensorflow as tf from absl import app, flags, logging +from fastprogress import master_bar, progress_bar from seqeval import metrics from transformers import ( @@ -17,34 +18,23 @@ from transformers import ( AutoConfig, AutoTokenizer, GradientAccumulator, + PreTrainedTokenizer, TFAutoModelForTokenClassification, create_optimizer, ) from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file -try: - from fastprogress import master_bar, progress_bar -except 
ImportError: - from fastprogress.fastprogress import master_bar, progress_bar - - MODEL_CONFIG_CLASSES = list(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()) MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in MODEL_CONFIG_CLASSES), (),) - flags.DEFINE_string( - "data_dir", None, "The input data dir. Should contain the .conll files (or other data files) " "for the task." + "data_dir", None, "The input data dir. Should contain the .conll files (or other data files) for the task." ) -flags.DEFINE_string("model_type", None, "Model type selected in the list: " + ", ".join(MODEL_TYPES)) - flags.DEFINE_string( - "model_name_or_path", - None, - "Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + "model_name_or_path", None, "Path to pretrained model or model identifier from huggingface.co/models", ) flags.DEFINE_string("output_dir", None, "The output directory where the model checkpoints will be written.") @@ -53,11 +43,11 @@ flags.DEFINE_string( "labels", "", "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used." 
) -flags.DEFINE_string("config_name", "", "Pretrained config name or path if not the same as model_name") +flags.DEFINE_string("config_name", None, "Pretrained config name or path if not the same as model_name") -flags.DEFINE_string("tokenizer_name", "", "Pretrained tokenizer name or path if not the same as model_name") +flags.DEFINE_string("tokenizer_name", None, "Pretrained tokenizer name or path if not the same as model_name") -flags.DEFINE_string("cache_dir", "", "Where do you want to store the pre-trained models downloaded from s3") +flags.DEFINE_string("cache_dir", None, "Where do you want to store the pre-trained models downloaded from s3") flags.DEFINE_integer( "max_seq_length", @@ -123,7 +113,7 @@ flags.DEFINE_boolean( "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) -flags.DEFINE_boolean("no_cuda", False, "Avoid using CUDA when available") +flags.DEFINE_boolean("no_cuda", False, "Avoid using CUDA even if it is available") flags.DEFINE_boolean("overwrite_output_dir", False, "Overwrite the content of the output directory") @@ -198,12 +188,10 @@ def train( @tf.function def train_step(train_features, train_labels): def step_fn(train_features, train_labels): - inputs = {"attention_mask": train_features["input_mask"], "training": True} + inputs = {"attention_mask": train_features["attention_mask"], "training": True} - if args["model_type"] != "distilbert": - inputs["token_type_ids"] = ( - train_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None - ) + if "token_type_ids" in train_features: + inputs["token_type_ids"] = train_features["token_type_ids"] with tf.GradientTape() as tape: logits = model(train_features["input_ids"], **inputs)[0] @@ -320,12 +308,10 @@ def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode) logging.info(" Batch size = %d", eval_batch_size) for eval_features, eval_labels in eval_iterator: - inputs = {"attention_mask": 
eval_features["input_mask"], "training": False} + inputs = {"attention_mask": eval_features["attention_mask"], "training": False} - if args["model_type"] != "distilbert": - inputs["token_type_ids"] = ( - eval_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None - ) + if "token_type_ids" in eval_features: + inputs["token_type_ids"] = eval_features["token_type_ids"] with strategy.scope(): logits = model(eval_features["input_ids"], **inputs)[0] @@ -356,20 +342,23 @@ def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode) return y_true, y_pred, loss.numpy() -def load_cache(cached_file, max_seq_length): +def load_cache(cached_file, tokenizer: PreTrainedTokenizer, max_seq_length): name_to_features = { "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), - "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), + "attention_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), "label_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), } + # TODO Find a cleaner way to do this. 
+ if "token_type_ids" in tokenizer.model_input_names: + name_to_features["token_type_ids"] = tf.io.FixedLenFeature([max_seq_length], tf.int64) def _decode_record(record): example = tf.io.parse_single_example(record, name_to_features) features = {} features["input_ids"] = example["input_ids"] - features["input_mask"] = example["input_mask"] - features["segment_ids"] = example["segment_ids"] + features["attention_mask"] = example["attention_mask"] + if "token_type_ids" in example: + features["token_type_ids"] = example["token_type_ids"] return features, example["label_ids"] @@ -393,8 +382,9 @@ def save_cache(features, cached_features_file): record_feature = collections.OrderedDict() record_feature["input_ids"] = create_int_feature(feature.input_ids) - record_feature["input_mask"] = create_int_feature(feature.input_mask) - record_feature["segment_ids"] = create_int_feature(feature.segment_ids) + record_feature["attention_mask"] = create_int_feature(feature.attention_mask) + if feature.token_type_ids is not None: + record_feature["token_type_ids"] = create_int_feature(feature.token_type_ids) record_feature["label_ids"] = create_int_feature(feature.label_ids) tf_example = tf.train.Example(features=tf.train.Features(feature=record_feature)) @@ -410,13 +400,11 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_s # Load data features from cache or dataset file cached_features_file = os.path.join( args["data_dir"], - "cached_{}_{}_{}.tf_record".format( - mode, list(filter(None, args["model_name_or_path"].split("/"))).pop(), str(args["max_seq_length"]) - ), + "cached_{}_{}_{}.tf_record".format(mode, tokenizer.__class__.__name__, str(args["max_seq_length"])), ) if os.path.exists(cached_features_file) and not args["overwrite_cache"]: logging.info("Loading features from cached file %s", cached_features_file) - dataset, size = load_cache(cached_features_file, args["max_seq_length"]) + dataset, size = load_cache(cached_features_file, tokenizer, 
args["max_seq_length"]) else: logging.info("Creating features from dataset file at %s", args["data_dir"]) examples = read_examples_from_file(args["data_dir"], mode) @@ -440,7 +428,7 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_s ) logging.info("Saving features into cached file %s", cached_features_file) save_cache(features, cached_features_file) - dataset, size = load_cache(cached_features_file, args["max_seq_length"]) + dataset, size = load_cache(cached_features_file, tokenizer, args["max_seq_length"]) if mode == "train": dataset = dataset.repeat() @@ -500,17 +488,18 @@ def main(_): config = AutoConfig.from_pretrained( args["config_name"] if args["config_name"] else args["model_name_or_path"], num_labels=num_labels, - cache_dir=args["cache_dir"] if args["cache_dir"] else None, + cache_dir=args["cache_dir"], ) logging.info("Training/evaluation parameters %s", args) + args["model_type"] = config.model_type # Training if args["do_train"]: tokenizer = AutoTokenizer.from_pretrained( args["tokenizer_name"] if args["tokenizer_name"] else args["model_name_or_path"], do_lower_case=args["do_lower_case"], - cache_dir=args["cache_dir"] if args["cache_dir"] else None, + cache_dir=args["cache_dir"], ) with strategy.scope(): @@ -518,7 +507,7 @@ def main(_): args["model_name_or_path"], from_pt=bool(".bin" in args["model_name_or_path"]), config=config, - cache_dir=args["cache_dir"] if args["cache_dir"] else None, + cache_dir=args["cache_dir"], ) train_batch_size = args["per_device_train_batch_size"] * args["n_device"] @@ -538,8 +527,7 @@ def main(_): pad_token_label_id, ) - if not os.path.exists(args["output_dir"]): - os.makedirs(args["output_dir"]) + os.makedirs(args["output_dir"], exist_ok=True) logging.info("Saving model to %s", args["output_dir"]) @@ -637,5 +625,4 @@ if __name__ == "__main__": flags.mark_flag_as_required("data_dir") flags.mark_flag_as_required("output_dir") flags.mark_flag_as_required("model_name_or_path") - 
flags.mark_flag_as_required("model_type") app.run(main) diff --git a/examples/ner/test_ner_examples.py b/examples/ner/test_ner_examples.py new file mode 100644 index 0000000000..336457698c --- /dev/null +++ b/examples/ner/test_ner_examples.py @@ -0,0 +1,33 @@ +import logging +import sys +import unittest +from unittest.mock import patch + +import run_ner + + +logging.basicConfig(level=logging.DEBUG) + +logger = logging.getLogger() + + +class ExamplesTests(unittest.TestCase): + def test_run_ner(self): + stream_handler = logging.StreamHandler(sys.stdout) + logger.addHandler(stream_handler) + + testargs = """ + --model_name distilbert-base-german-cased + --output_dir ./examples/tests_samples/temp_dir + --overwrite_output_dir + --data_dir ./examples/tests_samples/GermEval + --labels ./examples/tests_samples/GermEval/labels.txt + --max_seq_length 128 + --num_train_epochs 6 + --logging_steps 1 + --do_train + --do_eval + """.split() + with patch.object(sys, "argv", ["run.py"] + testargs): + result = run_ner.main() + self.assertLess(result["loss"], 1.5) diff --git a/examples/ner/utils_ner.py b/examples/ner/utils_ner.py index bda1b65a7c..d79d61af5a 100644 --- a/examples/ner/utils_ner.py +++ b/examples/ner/utils_ner.py @@ -18,40 +18,126 @@ import logging import os +from dataclasses import dataclass +from enum import Enum +from typing import List, Optional, Union + +import torch +from torch import nn +from torch.utils.data.dataset import Dataset + +from transformers import PreTrainedTokenizer, torch_distributed_zero_first logger = logging.getLogger(__name__) -class InputExample(object): - """A single training/test example for token classification.""" +@dataclass +class InputExample: + """ + A single training/test example for token classification. - def __init__(self, guid, words, labels): - """Constructs a InputExample. + Args: + guid: Unique id for the example. + words: list. The words of the sequence. + labels: (Optional) list. The labels for each word of the sequence. 
This should be + specified for train and dev examples, but not for test examples. + """ - Args: - guid: Unique id for the example. - words: list. The words of the sequence. - labels: (Optional) list. The labels for each word of the sequence. This should be - specified for train and dev examples, but not for test examples. - """ - self.guid = guid - self.words = words - self.labels = labels + guid: str + words: List[str] + labels: Optional[List[str]] -class InputFeatures(object): - """A single set of features of data.""" +@dataclass +class InputFeatures: + """ + A single set of features of data. + Property names are the same names as the corresponding inputs to a model. + """ - def __init__(self, input_ids, input_mask, segment_ids, label_ids): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_ids = label_ids + input_ids: List[int] + attention_mask: List[int] + token_type_ids: Optional[List[int]] = None + label_ids: Optional[List[int]] = None -def read_examples_from_file(data_dir, mode): - file_path = os.path.join(data_dir, "{}.txt".format(mode)) +class Split(Enum): + train = "train" + dev = "dev" + test = "test" + + +class NerDataset(Dataset): + """ + This will be superseded by a framework-agnostic approach + soon. + """ + + features: List[InputFeatures] + pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index + # Use cross entropy ignore_index as padding label id so that only + # real label ids contribute to the loss later. 
+ + def __init__( + self, + data_dir: str, + tokenizer: PreTrainedTokenizer, + labels: List[str], + model_type: str, + max_seq_length: Optional[int] = None, + overwrite_cache=False, + mode: Split = Split.train, + local_rank=-1, + ): + # Load data features from cache or dataset file + cached_features_file = os.path.join( + data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)), + ) + + with torch_distributed_zero_first(local_rank): + # Make sure only the first process in distributed training processes the dataset, + # and the others will use the cache. + + if os.path.exists(cached_features_file) and not overwrite_cache: + logger.info(f"Loading features from cached file {cached_features_file}") + self.features = torch.load(cached_features_file) + else: + logger.info(f"Creating features from dataset file at {data_dir}") + examples = read_examples_from_file(data_dir, mode) + # TODO clean up all this to leverage built-in features of tokenizers + self.features = convert_examples_to_features( + examples, + labels, + max_seq_length, + tokenizer, + cls_token_at_end=bool(model_type in ["xlnet"]), + # xlnet has a cls token at the end + cls_token=tokenizer.cls_token, + cls_token_segment_id=2 if model_type in ["xlnet"] else 0, + sep_token=tokenizer.sep_token, + sep_token_extra=bool(model_type in ["roberta"]), + # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 + pad_on_left=bool(tokenizer.padding_side == "left"), + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, + pad_token_label_id=self.pad_token_label_id, + ) + if local_rank in [-1, 0]: + logger.info(f"Saving features into cached file {cached_features_file}") + torch.save(self.features, cached_features_file) + + def __len__(self): + return len(self.features) + + def __getitem__(self, i) -> InputFeatures: + return self.features[i] + + +def read_examples_from_file(data_dir, mode: Union[Split, str]) -> List[InputExample]: + if isinstance(mode, Split): + mode = mode.value + file_path = os.path.join(data_dir, f"{mode}.txt") guid_index = 1 examples = [] with open(file_path, encoding="utf-8") as f: @@ -60,7 +146,7 @@ def read_examples_from_file(data_dir, mode): for line in f: if line.startswith("-DOCSTART-") or line == "" or line == "\n": if words: - examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels)) + examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels)) guid_index += 1 words = [] labels = [] @@ -73,15 +159,15 @@ def read_examples_from_file(data_dir, mode): # Examples could have no label for mode = "test" labels.append("O") if words: - examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels)) + examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels)) return examples def convert_examples_to_features( - examples, - label_list, - max_seq_length, - tokenizer, + examples: List[InputExample], + label_list: List[str], + max_seq_length: int, + tokenizer: PreTrainedTokenizer, cls_token_at_end=False, cls_token="[CLS]", cls_token_segment_id=1, @@ -93,19 +179,20 @@ def convert_examples_to_features( pad_token_label_id=-100, sequence_a_segment_id=0, mask_padding_with_zero=True, -): - """ Loads a data file into a list of 
`InputBatch`s +) -> List[InputFeatures]: + """ Loads a data file into a list of `InputFeatures` `cls_token_at_end` define the location of the CLS token: - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) """ + # TODO clean up all this to leverage built-in features of tokenizers label_map = {label: i for i, label in enumerate(label_list)} features = [] for (ex_index, example) in enumerate(examples): - if ex_index % 10000 == 0: + if ex_index % 10_000 == 0: logger.info("Writing example %d of %d", ex_index, len(examples)) tokens = [] @@ -120,7 +207,7 @@ def convert_examples_to_features( label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1)) # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. - special_tokens_count = tokenizer.num_added_tokens() + special_tokens_count = tokenizer.num_special_tokens_to_add() if len(tokens) > max_seq_length - special_tokens_count: tokens = tokens[: (max_seq_length - special_tokens_count)] label_ids = label_ids[: (max_seq_length - special_tokens_count)] @@ -193,13 +280,18 @@ def convert_examples_to_features( logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) logger.info("label_ids: %s", " ".join([str(x) for x in label_ids])) + if "token_type_ids" not in tokenizer.model_input_names: + segment_ids = None + features.append( - InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids) + InputFeatures( + input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, label_ids=label_ids + ) ) return features -def get_labels(path): +def get_labels(path: str) -> List[str]: if path: with open(path, "r") as f: labels = f.read().splitlines() diff --git a/examples/run_glue.py b/examples/run_glue.py index 3351dacb8d..c50a38963c 100644 --- 
a/examples/run_glue.py +++ b/examples/run_glue.py @@ -16,369 +16,33 @@ """ Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa, Albert, XLM-RoBERTa).""" -import argparse -import glob -import json +import dataclasses import logging import os -import random from dataclasses import dataclass, field -from typing import Optional +from typing import Dict, Optional import numpy as np -import torch -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset -from torch.utils.data.distributed import DistributedSampler -from tqdm import tqdm, trange from transformers import ( - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - WEIGHTS_NAME, - AdamW, AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, + EvalPrediction, + GlueDataset, + GlueDataTrainingArguments, HfArgumentParser, + Trainer, TrainingArguments, - get_linear_schedule_with_warmup, + glue_compute_metrics, + glue_output_modes, + glue_tasks_num_labels, + set_seed, ) -from transformers import glue_compute_metrics as compute_metrics -from transformers import glue_convert_examples_to_features as convert_examples_to_features -from transformers import glue_output_modes as output_modes -from transformers import glue_processors as processors - - -try: - from torch.utils.tensorboard import SummaryWriter -except ImportError: - from tensorboardX import SummaryWriter logger = logging.getLogger(__name__) -MODEL_CONFIG_CLASSES = list(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in MODEL_CONFIG_CLASSES), (),) - - -def set_seed(args): - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - if args.n_gpu > 0: - torch.cuda.manual_seed_all(args.seed) - - -def train(args, train_dataset, model, tokenizer): - """ Train the model """ - if args.local_rank in [-1, 0]: - tb_writer = 
SummaryWriter() - - args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) - train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) - train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) - - if args.max_steps > 0: - t_total = args.max_steps - args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 - else: - t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs - - # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ["bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": args.weight_decay, - }, - {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, - ] - - optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total - ) - - # Check if saved optimizer or scheduler states exist - if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( - os.path.join(args.model_name_or_path, "scheduler.pt") - ): - # Load in optimizer and scheduler states - optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) - scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) - - if args.fp16: - try: - from apex import amp - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") - model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) - - # multi-gpu training (should be after apex fp16 initialization) - if args.n_gpu > 1: - model = 
torch.nn.DataParallel(model) - - # Distributed training (should be after apex fp16 initialization) - if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel( - model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True, - ) - - # Train! - logger.info("***** Running training *****") - logger.info(" Num examples = %d", len(train_dataset)) - logger.info(" Num Epochs = %d", args.num_train_epochs) - logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info( - " Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size - * args.gradient_accumulation_steps - * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), - ) - logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) - logger.info(" Total optimization steps = %d", t_total) - - global_step = 0 - epochs_trained = 0 - steps_trained_in_current_epoch = 0 - # Check if continuing training from a checkpoint - if os.path.exists(args.model_name_or_path): - # set global_step to global_step of last saved checkpoint from model path - try: - global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) - except ValueError: - global_step = 0 - epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) - steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) - - logger.info(" Continuing training from checkpoint, will skip to saved global_step") - logger.info(" Continuing training from epoch %d", epochs_trained) - logger.info(" Continuing training from global step %d", global_step) - logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) - - tr_loss, logging_loss = 0.0, 0.0 - model.zero_grad() - train_iterator = trange( - epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0], - 
) - set_seed(args) # Added here for reproducibility - for _ in train_iterator: - epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) - for step, batch in enumerate(epoch_iterator): - - # Skip past any already trained steps if resuming training - if steps_trained_in_current_epoch > 0: - steps_trained_in_current_epoch -= 1 - continue - - model.train() - batch = tuple(t.to(args.device) for t in batch) - inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} - if args.model_type != "distilbert": - inputs["token_type_ids"] = ( - batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None - ) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids - outputs = model(**inputs) - loss = outputs[0] # model outputs are always tuple in transformers (see doc) - - if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - - if args.fp16: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - - tr_loss += loss.item() - if (step + 1) % args.gradient_accumulation_steps == 0 or ( - # last step in epoch but step is always smaller than gradient_accumulation_steps - len(epoch_iterator) <= args.gradient_accumulation_steps - and (step + 1) == len(epoch_iterator) - ): - if args.fp16: - torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) - else: - torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) - - optimizer.step() - scheduler.step() # Update learning rate schedule - model.zero_grad() - global_step += 1 - - if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: - logs = {} - if ( - args.local_rank == -1 and args.evaluate_during_training - ): # Only evaluate when single GPU otherwise metrics may not average well - results = 
evaluate(args, model, tokenizer) - for key, value in results.items(): - eval_key = "eval_{}".format(key) - logs[eval_key] = value - - loss_scalar = (tr_loss - logging_loss) / args.logging_steps - learning_rate_scalar = scheduler.get_lr()[0] - logs["learning_rate"] = learning_rate_scalar - logs["loss"] = loss_scalar - logging_loss = tr_loss - - for key, value in logs.items(): - tb_writer.add_scalar(key, value, global_step) - print(json.dumps({**logs, **{"step": global_step}})) - - if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: - # Save model checkpoint - output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - model_to_save = ( - model.module if hasattr(model, "module") else model - ) # Take care of distributed/parallel training - model_to_save.save_pretrained(output_dir) - tokenizer.save_pretrained(output_dir) - - torch.save(args, os.path.join(output_dir, "training_args.bin")) - logger.info("Saving model checkpoint to %s", output_dir) - - torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) - torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) - logger.info("Saving optimizer and scheduler states to %s", output_dir) - - if args.max_steps > 0 and global_step > args.max_steps: - epoch_iterator.close() - break - if args.max_steps > 0 and global_step > args.max_steps: - train_iterator.close() - break - - if args.local_rank in [-1, 0]: - tb_writer.close() - - return global_step, tr_loss / global_step - - -def evaluate(args, model, tokenizer, prefix=""): - # Loop to handle MNLI double evaluation (matched, mis-matched) - eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) - eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,) - - results = {} - for eval_task, eval_output_dir in 
zip(eval_task_names, eval_outputs_dirs): - eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True) - - if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: - os.makedirs(eval_output_dir) - - args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) - - # multi-gpu eval - if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): - model = torch.nn.DataParallel(model) - - # Eval! - logger.info("***** Running evaluation {} *****".format(prefix)) - logger.info(" Num examples = %d", len(eval_dataset)) - logger.info(" Batch size = %d", args.eval_batch_size) - eval_loss = 0.0 - nb_eval_steps = 0 - preds = None - out_label_ids = None - for batch in tqdm(eval_dataloader, desc="Evaluating"): - model.eval() - batch = tuple(t.to(args.device) for t in batch) - - with torch.no_grad(): - inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} - if args.model_type != "distilbert": - inputs["token_type_ids"] = ( - batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None - ) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids - outputs = model(**inputs) - tmp_eval_loss, logits = outputs[:2] - - eval_loss += tmp_eval_loss.mean().item() - nb_eval_steps += 1 - if preds is None: - preds = logits.detach().cpu().numpy() - out_label_ids = inputs["labels"].detach().cpu().numpy() - else: - preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) - - eval_loss = eval_loss / nb_eval_steps - if args.output_mode == "classification": - preds = np.argmax(preds, axis=1) - elif args.output_mode == "regression": - preds = np.squeeze(preds) - result = compute_metrics(eval_task, preds, 
out_label_ids) - results.update(result) - - output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results {} *****".format(prefix)) - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - - return results - - -def load_and_cache_examples(args, task, tokenizer, evaluate=False): - if args.local_rank not in [-1, 0] and not evaluate: - torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - - processor = processors[task]() - output_mode = output_modes[task] - # Load data features from cache or dataset file - cached_features_file = os.path.join( - args.data_dir, - "cached_{}_{}_{}_{}".format( - "dev" if evaluate else "train", - list(filter(None, args.model_name_or_path.split("/"))).pop(), - str(args.max_seq_length), - str(task), - ), - ) - if os.path.exists(cached_features_file) and not args.overwrite_cache: - logger.info("Loading features from cached file %s", cached_features_file) - features = torch.load(cached_features_file) - else: - logger.info("Creating features from dataset file at %s", args.data_dir) - label_list = processor.get_labels() - if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]: - # HACK(label indices are swapped in RoBERTa pretrained model) - label_list[1], label_list[2] = label_list[2], label_list[1] - examples = ( - processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) - ) - features = convert_examples_to_features( - examples, tokenizer, max_length=args.max_seq_length, label_list=label_list, output_mode=output_mode, - ) - if args.local_rank in [-1, 0]: - logger.info("Saving features into cached file %s", cached_features_file) - torch.save(features, cached_features_file) - - if args.local_rank == 0 and not 
evaluate: - torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - - # Convert to Tensors and build dataset - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) - all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) - if output_mode == "classification": - all_labels = torch.tensor([f.label for f in features], dtype=torch.long) - elif output_mode == "regression": - all_labels = torch.tensor([f.label for f in features], dtype=torch.float) - - dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) - return dataset - @dataclass class ModelArguments: @@ -387,9 +51,8 @@ class ModelArguments: """ model_name_or_path: str = field( - metadata={"help": "Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)} + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} ) - model_type: str = field(metadata={"help": "Model type selected in the list: " + ", ".join(MODEL_TYPES)}) config_name: Optional[str] = field( default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} ) @@ -397,162 +60,136 @@ class ModelArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} ) cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pre-trained models downloaded from s3"} - ) - - -@dataclass -class DataProcessingArguments: - task_name: str = field( - metadata={"help": "The name of the task to train selected in the list: " + ", ".join(processors.keys())} - ) - data_dir: str = field( - metadata={"help": "The input data dir. 
Should contain the .tsv files (or other data files) for the task."} - ) - max_seq_length: int = field( - default=128, - metadata={ - "help": "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} ) def main(): - parser = HfArgumentParser((ModelArguments, DataProcessingArguments, TrainingArguments)) - model_args, dataprocessing_args, training_args = parser.parse_args_into_dataclasses() + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. - # For now, let's merge all the sets of args into one, - # but soon, we'll keep distinct sets of args, with a cleaner separation of concerns. - args = argparse.Namespace(**vars(model_args), **vars(dataprocessing_args), **vars(training_args)) + parser = HfArgumentParser((ModelArguments, GlueDataTrainingArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() if ( - os.path.exists(args.output_dir) - and os.listdir(args.output_dir) - and args.do_train - and not args.overwrite_output_dir + os.path.exists(training_args.output_dir) + and os.listdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir ): raise ValueError( - f"Output directory ({args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." + f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
) - # Setup CUDA, GPU & distributed training - if args.local_rank == -1 or args.no_cuda: - device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() - else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs - torch.cuda.set_device(args.local_rank) - device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend="nccl") - args.n_gpu = 1 - args.device = device - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, - device, - args.n_gpu, - bool(args.local_rank != -1), - args.fp16, + training_args.local_rank, + training_args.device, + training_args.n_gpu, + bool(training_args.local_rank != -1), + training_args.fp16, ) + logger.info("Training/evaluation parameters %s", training_args) # Set seed - set_seed(args) + set_seed(training_args.seed) - # Prepare GLUE task - args.task_name = args.task_name.lower() - if args.task_name not in processors: - raise ValueError("Task not found: %s" % (args.task_name)) - processor = processors[args.task_name]() - args.output_mode = output_modes[args.task_name] - label_list = processor.get_labels() - num_labels = len(label_list) + try: + num_labels = glue_tasks_num_labels[data_args.task_name] + output_mode = glue_output_modes[data_args.task_name] + except KeyError: + raise ValueError("Task not found: %s" % (data_args.task_name)) # Load pretrained model and tokenizer - if args.local_rank not in [-1, 0]: - torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab + # + # 
Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. - args.model_type = args.model_type.lower() config = AutoConfig.from_pretrained( - args.config_name if args.config_name else args.model_name_or_path, + model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, - finetuning_task=args.task_name, - cache_dir=args.cache_dir, + finetuning_task=data_args.task_name, + cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir, + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, ) model = AutoModelForSequenceClassification.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, - cache_dir=args.cache_dir, + cache_dir=model_args.cache_dir, ) - if args.local_rank == 0: - torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab + # Get datasets + train_dataset = ( + GlueDataset(data_args, tokenizer=tokenizer, local_rank=training_args.local_rank) + if training_args.do_train + else None + ) + eval_dataset = ( + GlueDataset(data_args, tokenizer=tokenizer, local_rank=training_args.local_rank, evaluate=True) + if training_args.do_eval + else None + ) - model.to(args.device) + def compute_metrics(p: EvalPrediction) -> Dict: + if output_mode == "classification": + preds = np.argmax(p.predictions, axis=1) + elif output_mode == "regression": + preds = np.squeeze(p.predictions) + return glue_compute_metrics(data_args.task_name, preds, p.label_ids) - logger.info("Training/evaluation parameters %s", args) + # Initialize our Trainer + trainer = Trainer( + 
model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + compute_metrics=compute_metrics, + ) # Training - if args.do_train: - train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) - global_step, tr_loss = train(args, train_dataset, model, tokenizer) - logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - - # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() - if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): - # Create output directory if needed - if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: - os.makedirs(args.output_dir) - - logger.info("Saving model checkpoint to %s", args.output_dir) - # Save a trained model, configuration and tokenizer using `save_pretrained()`. - # They can then be reloaded using `from_pretrained()` - model_to_save = ( - model.module if hasattr(model, "module") else model - ) # Take care of distributed/parallel training - model_to_save.save_pretrained(args.output_dir) - tokenizer.save_pretrained(args.output_dir) - - # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, "training_args.bin")) - - # Load a trained model and vocabulary that you have fine-tuned - model = AutoModelForSequenceClassification.from_pretrained(args.output_dir) - tokenizer = AutoTokenizer.from_pretrained(args.output_dir) - model.to(args.device) + if training_args.do_train: + trainer.train( + model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None + ) + trainer.save_model() # Evaluation results = {} - if args.do_eval and args.local_rank in [-1, 0]: - tokenizer = AutoTokenizer.from_pretrained(args.output_dir) - checkpoints = [args.output_dir] - if args.eval_all_checkpoints: - checkpoints = list( - os.path.dirname(c) for c in 
sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) - ) - logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging - logger.info("Evaluate the following checkpoints: %s", checkpoints) - for checkpoint in checkpoints: - global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" + if training_args.do_eval and training_args.local_rank in [-1, 0]: + logger.info("*** Evaluate ***") + + # Loop to handle MNLI double evaluation (matched, mis-matched) + eval_datasets = [eval_dataset] + if data_args.task_name == "mnli": + mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm") + eval_datasets.append( + GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, local_rank=training_args.local_rank, evaluate=True) + ) + + for eval_dataset in eval_datasets: + result = trainer.evaluate(eval_dataset=eval_dataset) + + output_eval_file = os.path.join( + training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt" + ) + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results {} *****".format(eval_dataset.args.task_name)) + for key, value in result.items(): + logger.info(" %s = %s", key, value) + writer.write("%s = %s\n" % (key, value)) - model = AutoModelForSequenceClassification.from_pretrained(checkpoint) - model.to(args.device) - result = evaluate(args, model, tokenizer, prefix=prefix) - result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) return results diff --git a/examples/run_language_modeling.py b/examples/run_language_modeling.py index 859bc164c3..3030d2d79b 100644 --- a/examples/run_language_modeling.py +++ b/examples/run_language_modeling.py @@ -20,42 +20,29 @@ using a masked language modeling (MLM) loss. 
""" -import argparse -import glob import logging +import math import os -import pickle -import random -import re -import shutil -from typing import Dict, List, Tuple - -import numpy as np -import torch -from torch.nn.utils.rnn import pad_sequence -from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler -from torch.utils.data.distributed import DistributedSampler -from tqdm import tqdm, trange +from dataclasses import dataclass, field +from typing import Optional from transformers import ( + CONFIG_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, - WEIGHTS_NAME, - AdamW, AutoConfig, AutoModelWithLMHead, AutoTokenizer, - PreTrainedModel, + DataCollatorForLanguageModeling, + HfArgumentParser, + LineByLineTextDataset, PreTrainedTokenizer, - get_linear_schedule_with_warmup, + TextDataset, + Trainer, + TrainingArguments, + set_seed, ) -try: - from torch.utils.tensorboard import SummaryWriter -except ImportError: - from tensorboardX import SummaryWriter - - logger = logging.getLogger(__name__) @@ -63,722 +50,228 @@ MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) -class TextDataset(Dataset): - def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512): - assert os.path.isfile(file_path) +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 
+ """ - block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False) - - directory, filename = os.path.split(file_path) - cached_features_file = os.path.join( - directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + filename - ) - - if os.path.exists(cached_features_file) and not args.overwrite_cache: - logger.info("Loading features from cached file %s", cached_features_file) - with open(cached_features_file, "rb") as handle: - self.examples = pickle.load(handle) - else: - logger.info("Creating features from dataset file at %s", directory) - - self.examples = [] - with open(file_path, encoding="utf-8") as f: - text = f.read() - - tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) - - for i in range(0, len(tokenized_text) - block_size + 1, block_size): # Truncate in block of block_size - self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])) - # Note that we are loosing the last truncated example here for the sake of simplicity (no padding) - # If your dataset is small, first you should loook for a bigger one :-) and second you - # can change this behavior by adding (model specific) padding. - - logger.info("Saving features into cached file %s", cached_features_file) - with open(cached_features_file, "wb") as handle: - pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL) - - def __len__(self): - return len(self.examples) - - def __getitem__(self, item): - return torch.tensor(self.examples[item], dtype=torch.long) + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch." 
+ }, + ) + model_type: Optional[str] = field( + default=None, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + ) -class LineByLineTextDataset(Dataset): - def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512): - assert os.path.isfile(file_path) - # Here, we do not cache the features, operating under the assumption - # that we will soon use fast multithreaded tokenizers from the - # `tokenizers` repo everywhere =) - logger.info("Creating features from dataset file at %s", file_path) +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ - with open(file_path, encoding="utf-8") as f: - lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())] + train_data_file: Optional[str] = field( + default=None, metadata={"help": "The input training data file (a text file)."} + ) + eval_data_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + line_by_line: bool = field( + default=False, + metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."}, + ) - self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"] + mlm: bool = field( + default=False, metadata={"help": "Train with masked-language modeling loss instead of language modeling."} + ) + mlm_probability: float = field( + default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"} + ) - def __len__(self): - return len(self.examples) - - def __getitem__(self, i): - return torch.tensor(self.examples[i], dtype=torch.long) + block_size: int = field( + default=-1, + metadata={ + "help": "Optional input sequence length after tokenization." + "The training dataset will be truncated in block of this size for training." + "Default to the model max input length for single sentence inputs (take into account special tokens)." 
+ }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) -def load_and_cache_examples(args, tokenizer, evaluate=False): +def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False, local_rank=-1): file_path = args.eval_data_file if evaluate else args.train_data_file if args.line_by_line: - return LineByLineTextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size) - else: - return TextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size) - - -def set_seed(args): - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - if args.n_gpu > 0: - torch.cuda.manual_seed_all(args.seed) - - -def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]: - ordering_and_checkpoint_path = [] - - glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix))) - - for path in glob_checkpoints: - if use_mtime: - ordering_and_checkpoint_path.append((os.path.getmtime(path), path)) - else: - regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path) - if regex_match and regex_match.groups(): - ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path)) - - checkpoints_sorted = sorted(ordering_and_checkpoint_path) - checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted] - return checkpoints_sorted - - -def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None: - if not args.save_total_limit: - return - if args.save_total_limit <= 0: - return - - # Check if we should delete older checkpoint(s) - checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime) - if len(checkpoints_sorted) <= args.save_total_limit: - return - - number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit) - checkpoints_to_be_deleted = 
checkpoints_sorted[:number_of_checkpoints_to_delete] - for checkpoint in checkpoints_to_be_deleted: - logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint)) - shutil.rmtree(checkpoint) - - -def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]: - """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ - - if tokenizer.mask_token is None: - raise ValueError( - "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." + return LineByLineTextDataset( + tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, local_rank=local_rank ) - - labels = inputs.clone() - # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) - probability_matrix = torch.full(labels.shape, args.mlm_probability) - special_tokens_mask = [ - tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() - ] - probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) - if tokenizer._pad_token is not None: - padding_mask = labels.eq(tokenizer.pad_token_id) - probability_matrix.masked_fill_(padding_mask, value=0.0) - masked_indices = torch.bernoulli(probability_matrix).bool() - labels[~masked_indices] = -100 # We only compute loss on masked tokens - - # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) - indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices - inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token) - - # 10% of the time, we replace masked input tokens with random word - indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced - 
random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long) - inputs[indices_random] = random_words[indices_random] - - # The rest of the time (10% of the time) we keep the masked input tokens unchanged - return inputs, labels - - -def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: - """ Train the model """ - if args.local_rank in [-1, 0]: - tb_writer = SummaryWriter() - - args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) - - def collate(examples: List[torch.Tensor]): - if tokenizer._pad_token is None: - return pad_sequence(examples, batch_first=True) - return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) - - train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) - train_dataloader = DataLoader( - train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate - ) - - if args.max_steps > 0: - t_total = args.max_steps - args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: - t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs - - model = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training - model.resize_token_embeddings(len(tokenizer)) - - # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ["bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": args.weight_decay, - }, - {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=args.warmup_steps, 
num_training_steps=t_total - ) - - # Check if saved optimizer or scheduler states exist - if ( - args.model_name_or_path - and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) - and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt")) - ): - # Load in optimizer and scheduler states - optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) - scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) - - if args.fp16: - try: - from apex import amp - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") - model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) - - # multi-gpu training (should be after apex fp16 initialization) - if args.n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Distributed training (should be after apex fp16 initialization) - if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel( - model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + return TextDataset( + tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, local_rank=local_rank, ) - # Train! - logger.info("***** Running training *****") - logger.info(" Num examples = %d", len(train_dataset)) - logger.info(" Num Epochs = %d", args.num_train_epochs) - logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info( - " Total train batch size (w. 
parallel, distributed & accumulation) = %d", - args.train_batch_size - * args.gradient_accumulation_steps - * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), - ) - logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) - logger.info(" Total optimization steps = %d", t_total) - - global_step = 0 - epochs_trained = 0 - steps_trained_in_current_epoch = 0 - # Check if continuing training from a checkpoint - if args.model_name_or_path and os.path.exists(args.model_name_or_path): - try: - # set global_step to gobal_step of last saved checkpoint from model path - checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0] - global_step = int(checkpoint_suffix) - epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) - steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) - - logger.info(" Continuing training from checkpoint, will skip to saved global_step") - logger.info(" Continuing training from epoch %d", epochs_trained) - logger.info(" Continuing training from global step %d", global_step) - logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) - except ValueError: - logger.info(" Starting fine-tuning.") - - tr_loss, logging_loss = 0.0, 0.0 - - model.zero_grad() - train_iterator = trange( - epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] - ) - set_seed(args) # Added here for reproducibility - for epoch in train_iterator: - epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) - - if args.local_rank != -1: - train_sampler.set_epoch(epoch) - - for step, batch in enumerate(epoch_iterator): - - # Skip past any already trained steps if resuming training - if steps_trained_in_current_epoch > 0: - steps_trained_in_current_epoch -= 1 - continue - - inputs, labels = mask_tokens(batch, 
tokenizer, args) if args.mlm else (batch, batch) - inputs = inputs.to(args.device) - labels = labels.to(args.device) - model.train() - outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels) - loss = outputs[0] # model outputs are always tuple in transformers (see doc) - - if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - - if args.fp16: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - - tr_loss += loss.item() - if (step + 1) % args.gradient_accumulation_steps == 0: - if args.fp16: - torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) - else: - torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) - optimizer.step() - scheduler.step() # Update learning rate schedule - model.zero_grad() - global_step += 1 - - if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: - # Log metrics - if ( - args.local_rank == -1 and args.evaluate_during_training - ): # Only evaluate when single GPU otherwise metrics may not average well - results = evaluate(args, model, tokenizer) - for key, value in results.items(): - tb_writer.add_scalar("eval_{}".format(key), value, global_step) - tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) - tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) - logging_loss = tr_loss - - if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: - checkpoint_prefix = "checkpoint" - # Save model checkpoint - output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) - os.makedirs(output_dir, exist_ok=True) - model_to_save = ( - model.module if hasattr(model, "module") else model - ) # Take care of distributed/parallel training 
- model_to_save.save_pretrained(output_dir) - tokenizer.save_pretrained(output_dir) - - torch.save(args, os.path.join(output_dir, "training_args.bin")) - logger.info("Saving model checkpoint to %s", output_dir) - - _rotate_checkpoints(args, checkpoint_prefix) - - torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) - torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) - logger.info("Saving optimizer and scheduler states to %s", output_dir) - - if args.max_steps > 0 and global_step > args.max_steps: - epoch_iterator.close() - break - if args.max_steps > 0 and global_step > args.max_steps: - train_iterator.close() - break - - if args.local_rank in [-1, 0]: - tb_writer.close() - - return global_step, tr_loss / global_step - - -def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict: - # Loop to handle MNLI double evaluation (matched, mis-matched) - eval_output_dir = args.output_dir - - eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True) - - if args.local_rank in [-1, 0]: - os.makedirs(eval_output_dir, exist_ok=True) - - args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) - # Note that DistributedSampler samples randomly - - def collate(examples: List[torch.Tensor]): - if tokenizer._pad_token is None: - return pad_sequence(examples, batch_first=True) - return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) - - eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader( - eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate - ) - - # multi-gpu evaluate - if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): - model = torch.nn.DataParallel(model) - - # Eval! 
- logger.info("***** Running evaluation {} *****".format(prefix)) - logger.info(" Num examples = %d", len(eval_dataset)) - logger.info(" Batch size = %d", args.eval_batch_size) - eval_loss = 0.0 - nb_eval_steps = 0 - model.eval() - - for batch in tqdm(eval_dataloader, desc="Evaluating"): - inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) - inputs = inputs.to(args.device) - labels = labels.to(args.device) - - with torch.no_grad(): - outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels) - lm_loss = outputs[0] - eval_loss += lm_loss.mean().item() - nb_eval_steps += 1 - - eval_loss = eval_loss / nb_eval_steps - perplexity = torch.exp(torch.tensor(eval_loss)) - - result = {"perplexity": perplexity} - - output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results {} *****".format(prefix)) - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - - return result - def main(): - parser = argparse.ArgumentParser() + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. - # Required parameters - parser.add_argument( - "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)." 
- ) - parser.add_argument( - "--output_dir", - type=str, - required=True, - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument( - "--model_type", type=str, required=True, help="The model architecture to be trained or fine-tuned.", - ) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Other parameters - parser.add_argument( - "--eval_data_file", - default=None, - type=str, - help="An optional input evaluation data file to evaluate the perplexity on (a text file).", - ) - parser.add_argument( - "--line_by_line", - action="store_true", - help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.", - ) - parser.add_argument( - "--should_continue", action="store_true", help="Whether to continue from latest checkpoint in output_dir" - ) - parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.", - ) - - parser.add_argument( - "--mlm", action="store_true", help="Train with masked-language modeling loss instead of language modeling." - ) - parser.add_argument( - "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss" - ) - - parser.add_argument( - "--config_name", - default=None, - type=str, - help="Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.", - ) - parser.add_argument( - "--tokenizer_name", - default=None, - type=str, - help="Optional pretrained tokenizer name or path if not the same as model_name_or_path. 
If both are None, initialize a new tokenizer.", - ) - parser.add_argument( - "--cache_dir", - default=None, - type=str, - help="Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)", - ) - parser.add_argument( - "--block_size", - default=-1, - type=int, - help="Optional input sequence length after tokenization." - "The training dataset will be truncated in block of this size for training." - "Default to the model max input length for single sentence inputs (take into account special tokens).", - ) - parser.add_argument("--do_train", action="store_true", help="Whether to run training.") - parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") - parser.add_argument( - "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step." - ) - - parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.") - parser.add_argument( - "--per_gpu_eval_batch_size", default=4, type=int, help="Batch size per GPU/CPU for evaluation." - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument( - "--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform." - ) - parser.add_argument( - "--max_steps", - default=-1, - type=int, - help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.", - ) - parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") - - parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") - parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") - parser.add_argument( - "--save_total_limit", - type=int, - default=None, - help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default", - ) - parser.add_argument( - "--eval_all_checkpoints", - action="store_true", - help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number", - ) - parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") - parser.add_argument( - "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" - ) - parser.add_argument( - "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" - ) - parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - - parser.add_argument( - "--fp16", - action="store_true", - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", - ) - parser.add_argument( - "--fp16_opt_level", - type=str, - default="O1", - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
- "See details at https://nvidia.github.io/apex/amp.html", - ) - parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") - parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") - parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") - args = parser.parse_args() - - if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm: - raise ValueError( - "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm " - "flag (masked language modeling)." - ) - if args.eval_data_file is None and args.do_eval: + if data_args.eval_data_file is None and training_args.do_eval: raise ValueError( "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " "or remove the --do_eval argument." ) - if args.should_continue: - sorted_checkpoints = _sorted_checkpoints(args) - if len(sorted_checkpoints) == 0: - raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.") - else: - args.model_name_or_path = sorted_checkpoints[-1] if ( - os.path.exists(args.output_dir) - and os.listdir(args.output_dir) - and args.do_train - and not args.overwrite_output_dir - and not args.should_continue + os.path.exists(training_args.output_dir) + and os.listdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir ): raise ValueError( - "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( - args.output_dir - ) + f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
) - # Setup distant debugging if needed - if args.server_ip and args.server_port: - # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script - import ptvsd - - print("Waiting for debugger attach") - ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) - ptvsd.wait_for_attach() - - # Setup CUDA, GPU & distributed training - if args.local_rank == -1 or args.no_cuda: - device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() - else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs - torch.cuda.set_device(args.local_rank) - device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend="nccl") - args.n_gpu = 1 - args.device = device - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, - device, - args.n_gpu, - bool(args.local_rank != -1), - args.fp16, + training_args.local_rank, + training_args.device, + training_args.n_gpu, + bool(training_args.local_rank != -1), + training_args.fp16, ) + logger.info("Training/evaluation parameters %s", training_args) # Set seed - set_seed(args) + set_seed(training_args.seed) # Load pretrained model and tokenizer - if args.local_rank not in [-1, 0]: - torch.distributed.barrier() # Barrier to make sure only the first process in distributed training download model & vocab + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
- if args.config_name: - config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir) - elif args.model_name_or_path: - config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir) + if model_args.config_name: + config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) + elif model_args.model_name_or_path: + config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: - # When we release a pip version exposing CONFIG_MAPPING, - # we can do `config = CONFIG_MAPPING[args.model_type]()`. - raise ValueError( - "You are instantiating a new config instance from scratch. This is not supported, but you can do it from another script, save it," - "and load it from here, using --config_name" - ) + config = CONFIG_MAPPING[model_args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") - if args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir) - elif args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir) + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: raise ValueError( "You are instantiating a new tokenizer from scratch. 
This is not supported, but you can do it from another script, save it," "and load it from here, using --tokenizer_name" ) - if args.block_size <= 0: - args.block_size = tokenizer.max_len - # Our input block size will be the max possible for the model - else: - args.block_size = min(args.block_size, tokenizer.max_len) - - if args.model_name_or_path: + if model_args.model_name_or_path: model = AutoModelWithLMHead.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, - cache_dir=args.cache_dir, + cache_dir=model_args.cache_dir, ) else: logger.info("Training new model from scratch") model = AutoModelWithLMHead.from_config(config) - model.to(args.device) + model.resize_token_embeddings(len(tokenizer)) - if args.local_rank == 0: - torch.distributed.barrier() # End of barrier to make sure only the first process in distributed training download model & vocab + if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm: + raise ValueError( + "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm " + "flag (masked language modeling)." 
+ ) - logger.info("Training/evaluation parameters %s", args) + if data_args.block_size <= 0: + data_args.block_size = tokenizer.max_len + # Our input block size will be the max possible for the model + else: + data_args.block_size = min(data_args.block_size, tokenizer.max_len) + + # Get datasets + train_dataset = ( + get_dataset(data_args, tokenizer=tokenizer, local_rank=training_args.local_rank) + if training_args.do_train + else None + ) + eval_dataset = ( + get_dataset(data_args, tokenizer=tokenizer, local_rank=training_args.local_rank, evaluate=True) + if training_args.do_eval + else None + ) + data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability + ) + + # Initialize our Trainer + trainer = Trainer( + model=model, + args=training_args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + prediction_loss_only=True, + ) # Training - if args.do_train: - if args.local_rank not in [-1, 0]: - torch.distributed.barrier() # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache - - train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False) - - if args.local_rank == 0: - torch.distributed.barrier() - - global_step, tr_loss = train(args, train_dataset, model, tokenizer) - logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - - # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained() - if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): - # Create output directory if needed - if args.local_rank in [-1, 0]: - os.makedirs(args.output_dir, exist_ok=True) - - logger.info("Saving model checkpoint to %s", args.output_dir) - # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
- # They can then be reloaded using `from_pretrained()` - model_to_save = ( - model.module if hasattr(model, "module") else model - ) # Take care of distributed/parallel training - model_to_save.save_pretrained(args.output_dir) - tokenizer.save_pretrained(args.output_dir) - - # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, "training_args.bin")) - - # Load a trained model and vocabulary that you have fine-tuned - model = AutoModelWithLMHead.from_pretrained(args.output_dir) - tokenizer = AutoTokenizer.from_pretrained(args.output_dir) - model.to(args.device) + if training_args.do_train: + model_path = ( + model_args.model_name_or_path + if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path) + else None + ) + trainer.train(model_path=model_path) + trainer.save_model() # Evaluation results = {} - if args.do_eval and args.local_rank in [-1, 0]: - checkpoints = [args.output_dir] - if args.eval_all_checkpoints: - checkpoints = list( - os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) - ) - logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging - logger.info("Evaluate the following checkpoints: %s", checkpoints) - for checkpoint in checkpoints: - global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" + if training_args.do_eval and training_args.local_rank in [-1, 0]: + logger.info("*** Evaluate ***") - model = AutoModelWithLMHead.from_pretrained(checkpoint) - model.to(args.device) - result = evaluate(args, model, tokenizer, prefix=prefix) - result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) - results.update(result) + eval_output = trainer.evaluate() + + perplexity = math.exp(eval_output["loss"]) + result = {"perplexity": perplexity} + + 
output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt") + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + results.update(result) return results diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index 341733b8a0..100174ac6b 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -16,662 +16,203 @@ """ Finetuning the library models for multiple choice (Bert, Roberta, XLNet).""" -import argparse -import glob import logging import os -import random +from dataclasses import dataclass, field +from typing import Dict, Optional import numpy as np -import torch -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset -from torch.utils.data.distributed import DistributedSampler -from tqdm import tqdm, trange from transformers import ( - WEIGHTS_NAME, - AdamW, - BertConfig, - BertForMultipleChoice, - BertTokenizer, - RobertaConfig, - RobertaForMultipleChoice, - RobertaTokenizer, - XLNetConfig, - XLNetForMultipleChoice, - XLNetTokenizer, - get_linear_schedule_with_warmup, + AutoConfig, + AutoModelForMultipleChoice, + AutoTokenizer, + EvalPrediction, + HfArgumentParser, + Trainer, + TrainingArguments, + set_seed, ) -from utils_multiple_choice import convert_examples_to_features, processors - - -try: - from torch.utils.tensorboard import SummaryWriter -except ImportError: - from tensorboardX import SummaryWriter +from utils_multiple_choice import MultipleChoiceDataset, Split, processors logger = logging.getLogger(__name__) -ALL_MODELS = sum( - (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), () -) - -MODEL_CLASSES = { - "bert": (BertConfig, BertForMultipleChoice, BertTokenizer), - "xlnet": (XLNetConfig, XLNetForMultipleChoice, 
XLNetTokenizer), - "roberta": (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer), -} - - -def select_field(features, field): - return [[choice[field] for choice in feature.choices_features] for feature in features] - def simple_accuracy(preds, labels): return (preds == labels).mean() -def set_seed(args): - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - if args.n_gpu > 0: - torch.cuda.manual_seed_all(args.seed) +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + ) -def train(args, train_dataset, model, tokenizer): - """ Train the model """ - if args.local_rank in [-1, 0]: - tb_writer = SummaryWriter() +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ - args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) - train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) - train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) - - if args.max_steps > 0: - t_total = args.max_steps - args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 - else: - t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs - - # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ["bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": args.weight_decay, + task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(processors.keys())}) + data_dir: str = field(metadata={"help": "Should contain the data files for the task."}) + max_seq_length: int = field( + default=128, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." 
}, - {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) - if args.fp16: - try: - from apex import amp - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") - model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) - - # multi-gpu training (should be after apex fp16 initialization) - if args.n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Distributed training (should be after apex fp16 initialization) - if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel( - model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True - ) - - # Train! - logger.info("***** Running training *****") - logger.info(" Num examples = %d", len(train_dataset)) - logger.info(" Num Epochs = %d", args.num_train_epochs) - logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info( - " Total train batch size (w. 
parallel, distributed & accumulation) = %d", - args.train_batch_size - * args.gradient_accumulation_steps - * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} ) - logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) - logger.info(" Total optimization steps = %d", t_total) - - global_step = 0 - tr_loss, logging_loss = 0.0, 0.0 - best_dev_acc = 0.0 - best_steps = 0 - model.zero_grad() - train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) - set_seed(args) # Added here for reproductibility - for _ in train_iterator: - epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) - for step, batch in enumerate(epoch_iterator): - model.train() - batch = tuple(t.to(args.device) for t in batch) - inputs = { - "input_ids": batch[0], - "attention_mask": batch[1], - "token_type_ids": batch[2] - if args.model_type in ["bert", "xlnet"] - else None, # XLM don't use segment_ids - "labels": batch[3], - } - outputs = model(**inputs) - loss = outputs[0] # model outputs are always tuple in transformers (see doc) - - if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - - if args.fp16: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) - else: - loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) - - tr_loss += loss.item() - if (step + 1) % args.gradient_accumulation_steps == 0: - - optimizer.step() - scheduler.step() # Update learning rate schedule - model.zero_grad() - global_step += 1 - - if args.local_rank in [-1, 0] and args.logging_steps > 0 and 
global_step % args.logging_steps == 0: - # Log metrics - if ( - args.local_rank == -1 and args.evaluate_during_training - ): # Only evaluate when single GPU otherwise metrics may not average well - results = evaluate(args, model, tokenizer) - for key, value in results.items(): - tb_writer.add_scalar("eval_{}".format(key), value, global_step) - if results["eval_acc"] > best_dev_acc: - best_dev_acc = results["eval_acc"] - best_steps = global_step - if args.do_test: - results_test = evaluate(args, model, tokenizer, test=True) - for key, value in results_test.items(): - tb_writer.add_scalar("test_{}".format(key), value, global_step) - logger.info( - "test acc: %s, loss: %s, global steps: %s", - str(results_test["eval_acc"]), - str(results_test["eval_loss"]), - str(global_step), - ) - tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) - tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) - logger.info( - "Average loss: %s at global step: %s", - str((tr_loss - logging_loss) / args.logging_steps), - str(global_step), - ) - logging_loss = tr_loss - - if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: - # Save model checkpoint - output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - model_to_save = ( - model.module if hasattr(model, "module") else model - ) # Take care of distributed/parallel training - model_to_save.save_pretrained(output_dir) - tokenizer.save_vocabulary(output_dir) - torch.save(args, os.path.join(output_dir, "training_args.bin")) - logger.info("Saving model checkpoint to %s", output_dir) - - if args.max_steps > 0 and global_step > args.max_steps: - epoch_iterator.close() - break - if args.max_steps > 0 and global_step > args.max_steps: - train_iterator.close() - break - - if args.local_rank in [-1, 0]: - tb_writer.close() - - return global_step, tr_loss / global_step, 
best_steps - - -def evaluate(args, model, tokenizer, prefix="", test=False): - eval_task_names = (args.task_name,) - eval_outputs_dirs = (args.output_dir,) - - results = {} - for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): - eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=not test, test=test) - - if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: - os.makedirs(eval_output_dir) - - args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) - - # multi-gpu evaluate - if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): - model = torch.nn.DataParallel(model) - - # Eval! - logger.info("***** Running evaluation {} *****".format(prefix)) - logger.info(" Num examples = %d", len(eval_dataset)) - logger.info(" Batch size = %d", args.eval_batch_size) - eval_loss = 0.0 - nb_eval_steps = 0 - preds = None - out_label_ids = None - for batch in tqdm(eval_dataloader, desc="Evaluating"): - model.eval() - batch = tuple(t.to(args.device) for t in batch) - - with torch.no_grad(): - inputs = { - "input_ids": batch[0], - "attention_mask": batch[1], - "token_type_ids": batch[2] - if args.model_type in ["bert", "xlnet"] - else None, # XLM don't use segment_ids - "labels": batch[3], - } - outputs = model(**inputs) - tmp_eval_loss, logits = outputs[:2] - - eval_loss += tmp_eval_loss.mean().item() - nb_eval_steps += 1 - if preds is None: - preds = logits.detach().cpu().numpy() - out_label_ids = inputs["labels"].detach().cpu().numpy() - else: - preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) - - eval_loss = eval_loss / nb_eval_steps - preds = np.argmax(preds, axis=1) - acc = 
simple_accuracy(preds, out_label_ids) - result = {"eval_acc": acc, "eval_loss": eval_loss} - results.update(result) - - output_eval_file = os.path.join(eval_output_dir, "is_test_" + str(test).lower() + "_eval_results.txt") - - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results {} *****".format(str(prefix) + " is test:" + str(test))) - writer.write("model =%s\n" % str(args.model_name_or_path)) - writer.write( - "total batch size=%d\n" - % ( - args.per_gpu_train_batch_size - * args.gradient_accumulation_steps - * (torch.distributed.get_world_size() if args.local_rank != -1 else 1) - ) - ) - writer.write("train num epochs=%d\n" % args.num_train_epochs) - writer.write("fp16 =%s\n" % args.fp16) - writer.write("max seq length =%d\n" % args.max_seq_length) - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - return results - - -def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False): - if args.local_rank not in [-1, 0]: - torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - - processor = processors[task]() - # Load data features from cache or dataset file - if evaluate: - cached_mode = "dev" - elif test: - cached_mode = "test" - else: - cached_mode = "train" - assert not (evaluate and test) - cached_features_file = os.path.join( - args.data_dir, - "cached_{}_{}_{}_{}".format( - cached_mode, - list(filter(None, args.model_name_or_path.split("/"))).pop(), - str(args.max_seq_length), - str(task), - ), - ) - if os.path.exists(cached_features_file) and not args.overwrite_cache: - logger.info("Loading features from cached file %s", cached_features_file) - features = torch.load(cached_features_file) - else: - logger.info("Creating features from dataset file at %s", args.data_dir) - label_list = processor.get_labels() - if evaluate: - examples = 
processor.get_dev_examples(args.data_dir) - elif test: - examples = processor.get_test_examples(args.data_dir) - else: - examples = processor.get_train_examples(args.data_dir) - logger.info("Training number: %s", str(len(examples))) - features = convert_examples_to_features( - examples, - label_list, - args.max_seq_length, - tokenizer, - pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet - pad_token=tokenizer.pad_token_id, - pad_token_segment_id=tokenizer.pad_token_type_id, - ) - if args.local_rank in [-1, 0]: - logger.info("Saving features into cached file %s", cached_features_file) - torch.save(features, cached_features_file) - - if args.local_rank == 0: - torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - - # Convert to Tensors and build dataset - all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long) - all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long) - all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long) - all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long) - - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - return dataset def main(): - parser = argparse.ArgumentParser() + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. - # Required parameters - parser.add_argument( - "--data_dir", - default=None, - type=str, - required=True, - help="The input data dir. 
Should contain the .tsv files (or other data files) for the task.", - ) - parser.add_argument( - "--model_type", - default=None, - type=str, - required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), - ) - parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), - ) - parser.add_argument( - "--task_name", - default=None, - type=str, - required=True, - help="The name of the task to train selected in the list: " + ", ".join(processors.keys()), - ) - parser.add_argument( - "--output_dir", - default=None, - type=str, - required=True, - help="The output directory where the model predictions and checkpoints will be written.", - ) - - # Other parameters - parser.add_argument( - "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" - ) - parser.add_argument( - "--tokenizer_name", - default="", - type=str, - help="Pretrained tokenizer name or path if not the same as model_name", - ) - parser.add_argument( - "--cache_dir", - default="", - type=str, - help="Where do you want to store the pre-trained models downloaded from s3", - ) - parser.add_argument( - "--max_seq_length", - default=128, - type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.", - ) - parser.add_argument("--do_train", action="store_true", help="Whether to run training.") - parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") - parser.add_argument("--do_test", action="store_true", help="Whether to run test on the test set") - parser.add_argument( - "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step." 
- ) - parser.add_argument( - "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." - ) - - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") - parser.add_argument( - "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument( - "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." - ) - parser.add_argument( - "--max_steps", - default=-1, - type=int, - help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.", - ) - parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") - - parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") - parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") - parser.add_argument( - "--eval_all_checkpoints", - action="store_true", - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", - ) - parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") - parser.add_argument( - "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" - ) - parser.add_argument( - "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" - ) - parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - - parser.add_argument( - "--fp16", - action="store_true", - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", - ) - parser.add_argument( - "--fp16_opt_level", - type=str, - default="O1", - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
- "See details at https://nvidia.github.io/apex/amp.html", - ) - parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") - parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") - parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") - args = parser.parse_args() + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() if ( - os.path.exists(args.output_dir) - and os.listdir(args.output_dir) - and args.do_train - and not args.overwrite_output_dir + os.path.exists(training_args.output_dir) + and os.listdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir ): raise ValueError( - "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( - args.output_dir - ) + f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
) - # Setup distant debugging if needed - if args.server_ip and args.server_port: - # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script - import ptvsd - - print("Waiting for debugger attach") - ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) - ptvsd.wait_for_attach() - - # Setup CUDA, GPU & distributed training - if args.local_rank == -1 or args.no_cuda: - device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() - else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs - torch.cuda.set_device(args.local_rank) - device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend="nccl") - args.n_gpu = 1 - args.device = device - # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, - device, - args.n_gpu, - bool(args.local_rank != -1), - args.fp16, + training_args.local_rank, + training_args.device, + training_args.n_gpu, + bool(training_args.local_rank != -1), + training_args.fp16, ) + logger.info("Training/evaluation parameters %s", training_args) # Set seed - set_seed(args) + set_seed(training_args.seed) - # Prepare GLUE task - args.task_name = args.task_name.lower() - if args.task_name not in processors: - raise ValueError("Task not found: %s" % (args.task_name)) - processor = processors[args.task_name]() - label_list = processor.get_labels() - num_labels = len(label_list) + try: + processor = processors[data_args.task_name]() + label_list = processor.get_labels() + 
num_labels = len(label_list) + except KeyError: + raise ValueError("Task not found: %s" % (data_args.task_name)) # Load pretrained model and tokenizer - if args.local_rank not in [-1, 0]: - torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. - args.model_type = args.model_type.lower() - config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained( - args.config_name if args.config_name else args.model_name_or_path, + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, - finetuning_task=args.task_name, - cache_dir=args.cache_dir if args.cache_dir else None, + finetuning_task=data_args.task_name, + cache_dir=model_args.cache_dir, ) - tokenizer = tokenizer_class.from_pretrained( - args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None, + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, ) - model = model_class.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), + model = AutoModelForMultipleChoice.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, - cache_dir=args.cache_dir if args.cache_dir else None, + cache_dir=model_args.cache_dir, ) - if args.local_rank == 0: - torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab + # Get datasets + train_dataset = ( + MultipleChoiceDataset( + data_dir=data_args.data_dir, + 
tokenizer=tokenizer, + task=data_args.task_name, + max_seq_length=data_args.max_seq_length, + overwrite_cache=data_args.overwrite_cache, + mode=Split.train, + local_rank=training_args.local_rank, + ) + if training_args.do_train + else None + ) + eval_dataset = ( + MultipleChoiceDataset( + data_dir=data_args.data_dir, + tokenizer=tokenizer, + task=data_args.task_name, + max_seq_length=data_args.max_seq_length, + overwrite_cache=data_args.overwrite_cache, + mode=Split.dev, + local_rank=training_args.local_rank, + ) + if training_args.do_eval + else None + ) - model.to(args.device) + def compute_metrics(p: EvalPrediction) -> Dict: + preds = np.argmax(p.predictions, axis=1) + return {"acc": simple_accuracy(preds, p.label_ids)} - logger.info("Training/evaluation parameters %s", args) - best_steps = 0 + # Initialize our Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + compute_metrics=compute_metrics, + ) # Training - if args.do_train: - train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) - global_step, tr_loss, best_steps = train(args, train_dataset, model, tokenizer) - logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - - # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() - if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): - # Create output directory if needed - if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: - os.makedirs(args.output_dir) - - logger.info("Saving model checkpoint to %s", args.output_dir) - # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
- # They can then be reloaded using `from_pretrained()` - model_to_save = ( - model.module if hasattr(model, "module") else model - ) # Take care of distributed/parallel training - model_to_save.save_pretrained(args.output_dir) - tokenizer.save_pretrained(args.output_dir) - - # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, "training_args.bin")) - - # Load a trained model and vocabulary that you have fine-tuned - model = model_class.from_pretrained(args.output_dir) - tokenizer = tokenizer_class.from_pretrained(args.output_dir) - model.to(args.device) + if training_args.do_train: + trainer.train( + model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None + ) # Evaluation results = {} - if args.do_eval and args.local_rank in [-1, 0]: - if not args.do_train: - args.output_dir = args.model_name_or_path - checkpoints = [args.output_dir] - if args.eval_all_checkpoints: - checkpoints = list( - os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) - ) - logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging - logger.info("Evaluate the following checkpoints: %s", checkpoints) - for checkpoint in checkpoints: - global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" + if training_args.do_eval and training_args.local_rank in [-1, 0]: + logger.info("*** Evaluate ***") + + result = trainer.evaluate() + + output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results *****") + for key, value in result.items(): + logger.info(" %s = %s", key, value) + writer.write("%s = %s\n" % (key, value)) - model = model_class.from_pretrained(checkpoint) - model.to(args.device) - result = evaluate(args, 
model, tokenizer, prefix=prefix) - result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) - if args.do_test and args.local_rank in [-1, 0]: - if not args.do_train: - args.output_dir = args.model_name_or_path - checkpoints = [args.output_dir] - # if args.eval_all_checkpoints: # can not use this to do test!! - # checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) - # logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging - logger.info("Evaluate the following checkpoints: %s", checkpoints) - for checkpoint in checkpoints: - global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" - - model = model_class.from_pretrained(checkpoint) - model.to(args.device) - result = evaluate(args, model, tokenizer, prefix=prefix, test=True) - result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) - results.update(result) - if best_steps: - logger.info("best steps of eval acc is the following checkpoints: %s", best_steps) return results diff --git a/examples/summarization/bart/finetune.py b/examples/summarization/bart/finetune.py index 9e3d55b3e9..893188e76f 100644 --- a/examples/summarization/bart/finetune.py +++ b/examples/summarization/bart/finetune.py @@ -159,7 +159,7 @@ def main(args): # If output_dir not provided, a folder will be generated in pwd if not args.output_dir: - args.output_dir = os.path.join("./results", f"{args.task}_{args.model_type}_{time.strftime('%Y%m%d_%H%M%S')}",) + args.output_dir = os.path.join("./results", f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",) os.makedirs(args.output_dir) model = SummarizationTrainer(args) trainer = generic_train(model, args) diff --git a/examples/summarization/bart/run_train.sh b/examples/summarization/bart/run_train.sh index 3c2371e9cf..8ac009ef27 100755 --- 
a/examples/summarization/bart/run_train.sh +++ b/examples/summarization/bart/run_train.sh @@ -10,7 +10,6 @@ export PYTHONPATH="../../":"${PYTHONPATH}" python finetune.py \ --data_dir=./cnn-dailymail/cnn_dm \ ---model_type=bart \ --model_name_or_path=bart-large \ --learning_rate=3e-5 \ --train_batch_size=4 \ diff --git a/examples/test_examples.py b/examples/test_examples.py index 54d8de4bc3..5babf45eb2 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -22,6 +22,7 @@ from unittest.mock import patch import run_generation import run_glue +import run_language_modeling import run_squad @@ -56,13 +57,38 @@ class ExamplesTests(unittest.TestCase): "--warmup_steps=2", "--overwrite_output_dir", "--seed=42", + "--max_seq_length=128", ] - model_type, model_name = ("--model_type=bert", "--model_name_or_path=bert-base-uncased") - with patch.object(sys, "argv", testargs + [model_type, model_name]): + model_name = "--model_name_or_path=bert-base-uncased" + with patch.object(sys, "argv", testargs + [model_name]): result = run_glue.main() + del result["loss"] for value in result.values(): self.assertGreaterEqual(value, 0.75) + def test_run_language_modeling(self): + stream_handler = logging.StreamHandler(sys.stdout) + logger.addHandler(stream_handler) + + testargs = """ + run_language_modeling.py + --model_name_or_path distilroberta-base + --model_type roberta + --mlm + --line_by_line + --train_data_file ./tests/fixtures/sample_text.txt + --eval_data_file ./tests/fixtures/sample_text.txt + --output_dir ./tests/fixtures + --overwrite_output_dir + --do_train + --do_eval + --num_train_epochs=1 + --no_cuda + """.split() + with patch.object(sys, "argv", testargs): + result = run_language_modeling.main() + self.assertLess(result["perplexity"], 35) + def test_run_squad(self): stream_handler = logging.StreamHandler(sys.stdout) logger.addHandler(stream_handler) diff --git a/examples/tests_samples/.gitignore b/examples/tests_samples/.gitignore index 
c8ce21fe24..46ad771d45 100644 --- a/examples/tests_samples/.gitignore +++ b/examples/tests_samples/.gitignore @@ -1,6 +1,7 @@ *.* cache* temp* +!*.txt !*.tsv !*.json !.gitignore \ No newline at end of file diff --git a/examples/tests_samples/GermEval/dev.txt b/examples/tests_samples/GermEval/dev.txt new file mode 100644 index 0000000000..de00158230 --- /dev/null +++ b/examples/tests_samples/GermEval/dev.txt @@ -0,0 +1,202 @@ +Gleich O +darauf O +entwirft O +er O +seine O +Selbstdarstellung O +" O +Ecce B-OTH +homo I-OTH +" O +in O +enger O +Auseinandersetzung O +mit O +diesem O +Bild O +Jesu B-PER +. O + +1980 O +kam O +der O +Crown B-OTH +als O +Versuch O +von O +Toyota B-ORG +, O +sich O +in O +der O +Oberen O +Mittelklasse O +zu O +etablieren O +, O +auch O +nach O +Deutschland B-LOC +. O + +– O +4:26 O +# O +Sometime B-OTH +Ago/La I-OTH +Fiesta I-OTH +– O +23:18 O +Alle O +Stücke O +wurden O +von O +Corea B-PER +komponiert O +mit O +Ausnahme O +der O +einleitenden O +Improvisation O +zu O +Sometime B-OTH +Ago I-OTH +. O + +Bis O +2013 O +steigen O +die O +Mittel O +aus O +dem O +EU-Budget B-ORGpart +auf O +rund O +120 O +Millionen O +Euro B-OTH +. O + +Daraus O +entwickelte O +sich O +im O +Rokoko B-OTH +die O +Sitte O +des O +gemeinsamen O +Weinens O +im O +Theater O +, O +das O +die O +Standesgrenzen O +innerhalb O +des O +Publikums O +überbrücken O +sollte O +. O + +Die O +Spinne O +hatte O +sie O +mit O +Seidenfäden O +an O +ihrem O +Schwanz O +gefesselt O +und O +nach O +oben O +gezogen O +. O + +In O +Deutschland B-LOC +ist O +nach O +StGB O +eine O +Anwerbung O +für O +die O +Fremdenlegion O +strafbar O +. O + +Am O +Donnerstag O +wird O +sich O +zeigen O +, O +ob O +die O +Idee O +der O +DLR-Forscher B-ORGpart +funktioniert O +. O + +Der O +sechste O +Lauf O +der O +ADAC B-ORG +GT I-ORG +Mastersstand O +ganz O +klar O +im O +Mittelpunkt O +des O +Motorsport-Wochenendes O +auf O +dem O +Eurospeedway B-ORG +Lausitz I-ORG +. 
O + +Nach O +den O +schwächeren O +Vorgaben O +der O +Wall B-ORG +Street I-ORG +vom O +Vortag O +setzten O +die O +deutschen B-LOCderiv +Standardwerte O +ihren O +Konsolidierungskurs O +fort O +. O + +Kolb B-PER +war O +seit O +1986 O +im O +Turnverein O +als O +Leiter O +tätig O +, O +darunter O +elf O +Jahre O +als O +Hauptleiter O +in O +der O +Männerriege O +. O diff --git a/examples/tests_samples/GermEval/labels.txt b/examples/tests_samples/GermEval/labels.txt new file mode 100644 index 0000000000..a781cbd47e --- /dev/null +++ b/examples/tests_samples/GermEval/labels.txt @@ -0,0 +1,25 @@ +B-LOC +B-LOCderiv +B-LOCpart +B-ORG +B-ORGderiv +B-ORGpart +B-OTH +B-OTHderiv +B-OTHpart +B-PER +B-PERderiv +B-PERpart +I-LOC +I-LOCderiv +I-LOCpart +I-ORG +I-ORGderiv +I-ORGpart +I-OTH +I-OTHderiv +I-OTHpart +I-PER +I-PERderiv +I-PERpart +O diff --git a/examples/tests_samples/GermEval/train.txt b/examples/tests_samples/GermEval/train.txt new file mode 100644 index 0000000000..3d613ae1ee --- /dev/null +++ b/examples/tests_samples/GermEval/train.txt @@ -0,0 +1,200 @@ +Schartau B-PER +sagte O +dem O +" O +Tagesspiegel B-ORG +" O +vom O +Freitag O +, O +Fischer B-PER +sei O +" O +in O +einer O +Weise O +aufgetreten O +, O +die O +alles O +andere O +als O +überzeugend O +war O +" O +. O + +Firmengründer O +Wolf B-PER +Peter I-PER +Bree I-PER +arbeitete O +Anfang O +der O +siebziger O +Jahre O +als O +Möbelvertreter O +, O +als O +er O +einen O +fliegenden O +Händler O +aus O +dem O +Libanon B-LOC +traf O +. O + +Ob O +sie O +dabei O +nach O +dem O +Runden O +Tisch O +am O +23. O +April O +in O +Berlin B-LOC +durch O +ein O +pädagogisches O +Konzept O +unterstützt O +wird O +, O +ist O +allerdings O +zu O +bezweifeln O +. O + +Bayern B-ORG +München I-ORG +ist O +wieder O +alleiniger O +Top- O +Favorit O +auf O +den O +Gewinn O +der O +deutschen B-LOCderiv +Fußball-Meisterschaft O +. 
O + +Dabei O +hätte O +der O +tapfere O +Schlussmann O +allen O +Grund O +gehabt O +, O +sich O +viel O +früher O +aufzuregen O +. O + +ARD-Programmchef B-ORGpart +Günter B-PER +Struve I-PER +war O +wegen O +eines O +vierwöchigen O +Urlaubs O +für O +eine O +Stellungnahme O +nicht O +erreichbar O +. O + +Alternativ O +sollten O +sich O +die O +Restaurantbetreiber O +aus O +Sicht O +der O +Solingerin B-LOCderiv +zu O +längeren O +Öffnungszeiten O +verpflichten O +, O +um O +wartende O +Kunden O +aufzunehmen O +. O + +Die O +Deutsche B-ORG +Flugsicherung I-ORG +( O +DFS B-ORG +) O +beschloss O +ein O +Flugverbot O +für O +alle O +internationalen O +Flughäfen O +mit O +Ausnahme O +der O +beiden O +Berliner B-LOCderiv +Flughäfen O +bis O +2.00 O +Uhr O +nachts O +. O + +New O +Small O +Family O +mit O +E-Motor O +: O +Studie O +E-Up O +! O + +Eine O +Schwachstelle O +war O +beispielsweise O +der O +Spiegelkasten O +. O + +Denn O +durch O +den O +Einsatz O +moderner O +Fahrzeugtechnik O +( O +Dieseltriebwagen O +) O +und O +schalldämmender O +Fenster O +entsteht O +keine O +Einschränkung O +der O +Wohnqualität O +. O diff --git a/examples/tests_samples/STS-B/dev.tsv b/examples/tests_samples/STS-B/dev.tsv new file mode 100644 index 0000000000..8d689c2ccc --- /dev/null +++ b/examples/tests_samples/STS-B/dev.tsv @@ -0,0 +1,10 @@ +index genre filename year old_index source1 source2 sentence1 sentence2 score +0 main-captions MSRvid 2012test 0000 none none A man with a hard hat is dancing. A man wearing a hard hat is dancing. 5.000 +1 main-captions MSRvid 2012test 0002 none none A young child is riding a horse. A child is riding a horse. 4.750 +2 main-captions MSRvid 2012test 0003 none none A man is feeding a mouse to a snake. The man is feeding a mouse to the snake. 5.000 +3 main-captions MSRvid 2012test 0007 none none A woman is playing the guitar. A man is playing guitar. 2.400 +4 main-captions MSRvid 2012test 0008 none none A woman is playing the flute. 
A man is playing a flute. 2.750 +5 main-captions MSRvid 2012test 0010 none none A woman is cutting an onion. A man is cutting onions. 2.615 +6 main-captions MSRvid 2012test 0015 none none A man is erasing a chalk board. The man is erasing the chalk board. 5.000 +7 main-captions MSRvid 2012test 0023 none none A woman is carrying a boy. A woman is carrying her baby. 2.333 +8 main-captions MSRvid 2012test 0027 none none Three men are playing guitars. Three men are on stage playing guitars. 3.750 diff --git a/examples/tests_samples/STS-B/train.tsv b/examples/tests_samples/STS-B/train.tsv new file mode 100644 index 0000000000..a38be956d6 --- /dev/null +++ b/examples/tests_samples/STS-B/train.tsv @@ -0,0 +1,10 @@ +index genre filename year old_index source1 source2 sentence1 sentence2 score +0 main-captions MSRvid 2012test 0001 none none A plane is taking off. An air plane is taking off. 5.000 +1 main-captions MSRvid 2012test 0004 none none A man is playing a large flute. A man is playing a flute. 3.800 +2 main-captions MSRvid 2012test 0005 none none A man is spreading shreded cheese on a pizza. A man is spreading shredded cheese on an uncooked pizza. 3.800 +3 main-captions MSRvid 2012test 0006 none none Three men are playing chess. Two men are playing chess. 2.600 +4 main-captions MSRvid 2012test 0009 none none A man is playing the cello. A man seated is playing the cello. 4.250 +5 main-captions MSRvid 2012test 0011 none none Some men are fighting. Two men are fighting. 4.250 +6 main-captions MSRvid 2012test 0012 none none A man is smoking. A man is skating. 0.500 +7 main-captions MSRvid 2012test 0013 none none The man is playing the piano. The man is playing the guitar. 1.600 +8 main-captions MSRvid 2012test 0014 none none A man is playing on a guitar and singing. A woman is playing an acoustic guitar and singing. 
2.200 diff --git a/examples/transformer_base.py b/examples/transformer_base.py index 0ab355ad65..480b69f268 100644 --- a/examples/transformer_base.py +++ b/examples/transformer_base.py @@ -8,7 +8,6 @@ import pytorch_lightning as pl import torch from transformers import ( - ALL_PRETRAINED_MODEL_ARCHIVE_MAP, AdamW, AutoConfig, AutoModel, @@ -20,15 +19,11 @@ from transformers import ( AutoTokenizer, get_linear_schedule_with_warmup, ) -from transformers.modeling_auto import MODEL_MAPPING logger = logging.getLogger(__name__) -ALL_MODELS = tuple(ALL_PRETRAINED_MODEL_ARCHIVE_MAP) -MODEL_CLASSES = tuple(m.model_type for m in MODEL_MAPPING) - MODEL_MODES = { "base": AutoModel, "sequence-classification": AutoModelForSequenceClassification, @@ -51,28 +46,25 @@ class BaseTransformer(pl.LightningModule): def __init__(self, hparams: argparse.Namespace, num_labels=None, mode="base", **config_kwargs): "Initialize a model." - super(BaseTransformer, self).__init__() + super().__init__() self.hparams = hparams cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None - self.hparams.model_type = self.hparams.model_type.lower() - config = AutoConfig.from_pretrained( + self.config = AutoConfig.from_pretrained( self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path, **({"num_labels": num_labels} if num_labels is not None else {}), cache_dir=cache_dir, **config_kwargs, ) - tokenizer = AutoTokenizer.from_pretrained( + self.tokenizer = AutoTokenizer.from_pretrained( self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path, - do_lower_case=self.hparams.do_lower_case, cache_dir=cache_dir, ) - model = MODEL_MODES[mode].from_pretrained( + self.model = MODEL_MODES[mode].from_pretrained( self.hparams.model_name_or_path, from_tf=bool(".ckpt" in self.hparams.model_name_or_path), - config=config, + config=self.config, cache_dir=cache_dir, ) - self.config, self.tokenizer, self.model = config, tokenizer, model def 
is_logger(self): return self.trainer.proc_rank <= 0 @@ -148,19 +140,12 @@ class BaseTransformer(pl.LightningModule): @staticmethod def add_model_specific_args(parser, root_dir): - parser.add_argument( - "--model_type", - default=None, - type=str, - required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES), - ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + help="Path to pretrained model or model identifier from huggingface.co/models", ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" @@ -177,9 +162,6 @@ class BaseTransformer(pl.LightningModule): type=str, help="Where do you want to store the pre-trained models downloaded from s3", ) - parser.add_argument( - "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." 
- ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") @@ -252,8 +234,6 @@ def add_generic_args(parser, root_dir): help="Number of updates steps to accumulate before performing a backward/update pass.", ) - parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") - parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") @@ -261,15 +241,6 @@ def generic_train(model: BaseTransformer, args: argparse.Namespace): # init model set_seed(args) - # Setup distant debugging if needed - if args.server_ip and args.server_port: - # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script - import ptvsd - - print("Waiting for debugger attach") - ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) - ptvsd.wait_for_attach() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) diff --git a/examples/utils_multiple_choice.py b/examples/utils_multiple_choice.py index d374e3a290..8fe201e8d0 100644 --- a/examples/utils_multiple_choice.py +++ b/examples/utils_multiple_choice.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """ +""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """ import csv @@ -21,48 +21,124 @@ import glob import json import logging import os -from typing import List +from dataclasses import dataclass +from enum import Enum +from typing import List, Optional +import torch import tqdm +from torch.utils.data.dataset import Dataset -from transformers import PreTrainedTokenizer +from transformers import PreTrainedTokenizer, torch_distributed_zero_first logger = logging.getLogger(__name__) -class InputExample(object): - """A single training/test example for multiple choice""" +@dataclass(frozen=True) +class InputExample: + """ + A single training/test example for multiple choice - def __init__(self, example_id, question, contexts, endings, label=None): - """Constructs a InputExample. + Args: + example_id: Unique id for the example. + question: string. The untokenized text of the second sequence (question). + contexts: list of str. The untokenized text of the first sequence (context of corresponding question). + endings: list of str. multiple choice's options. Its length must be equal to contexts' length. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ - Args: - example_id: Unique id for the example. - contexts: list of str. The untokenized text of the first sequence (context of corresponding question). - question: string. The untokenized text of the second sequence (question). - endings: list of str. multiple choice's options. Its length must be equal to contexts' length. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. 
- """ - self.example_id = example_id - self.question = question - self.contexts = contexts - self.endings = endings - self.label = label + example_id: str + question: str + contexts: List[str] + endings: List[str] + label: Optional[str] -class InputFeatures(object): - def __init__(self, example_id, choices_features, label): - self.example_id = example_id - self.choices_features = [ - {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids} - for input_ids, input_mask, segment_ids in choices_features - ] - self.label = label +@dataclass(frozen=True) +class InputFeatures: + """ + A single set of features of data. + Property names are the same names as the corresponding inputs to a model. + """ + + example_id: str + input_ids: List[List[int]] + attention_mask: Optional[List[List[int]]] + token_type_ids: Optional[List[List[int]]] + label: Optional[int] -class DataProcessor(object): +class Split(Enum): + train = "train" + dev = "dev" + test = "test" + + +class MultipleChoiceDataset(Dataset): + """ + This will be superseded by a framework-agnostic approach + soon. + """ + + features: List[InputFeatures] + + def __init__( + self, + data_dir: str, + tokenizer: PreTrainedTokenizer, + task: str, + max_seq_length: Optional[int] = None, + overwrite_cache=False, + mode: Split = Split.train, + local_rank=-1, + ): + processor = processors[task]() + + cached_features_file = os.path.join( + data_dir, + "cached_{}_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length), task,), + ) + with torch_distributed_zero_first(local_rank): + # Make sure only the first process in distributed training processes the dataset, + # and the others will use the cache. 
+ + if os.path.exists(cached_features_file) and not overwrite_cache: + logger.info(f"Loading features from cached file {cached_features_file}") + self.features = torch.load(cached_features_file) + else: + logger.info(f"Creating features from dataset file at {data_dir}") + label_list = processor.get_labels() + if mode == Split.dev: + examples = processor.get_dev_examples(data_dir) + elif mode == Split.test: + examples = processor.get_test_examples(data_dir) + else: + examples = processor.get_train_examples(data_dir) + logger.info("Training examples: %s", len(examples)) + # TODO clean up all this to leverage built-in features of tokenizers + self.features = convert_examples_to_features( + examples, + label_list, + max_seq_length, + tokenizer, + pad_on_left=bool(tokenizer.padding_side == "left"), + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, + ) + if local_rank in [-1, 0]: + logger.info("Saving features into cached file %s", cached_features_file) + torch.save(self.features, cached_features_file) + + def __len__(self): + return len(self.features) + + def __getitem__(self, i) -> InputFeatures: + return self.features[i] + + +class DataProcessor: """Base class for data converters for multiple choice data sets.""" def get_train_examples(self, data_dir): @@ -311,7 +387,7 @@ def convert_examples_to_features( for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"): if ex_index % 10000 == 0: logger.info("Writing example %d of %d" % (ex_index, len(examples))) - choices_features = [] + choices_inputs = [] for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)): text_a = context if example.question.find("_") != -1: @@ -321,7 +397,7 @@ def convert_examples_to_features( text_b = example.question + " " + ending inputs = tokenizer.encode_plus( - text_a, text_b, add_special_tokens=True, max_length=max_length, return_token_type_ids=True + text_a, text_b, 
add_special_tokens=True, max_length=max_length, pad_to_max_length=True, ) if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0: logger.info( @@ -330,41 +406,31 @@ def convert_examples_to_features( "you need to try to use a bigger max seq length!" ) - input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. - padding_length = max_length - len(input_ids) - if pad_on_left: - input_ids = ([pad_token] * padding_length) + input_ids - attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask - token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids - else: - input_ids = input_ids + ([pad_token] * padding_length) - attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) - - assert len(input_ids) == max_length - assert len(attention_mask) == max_length - assert len(token_type_ids) == max_length - choices_features.append((input_ids, attention_mask, token_type_ids)) + choices_inputs.append(inputs) label = label_map[example.label] - if ex_index < 2: - logger.info("*** Example ***") - logger.info("race_id: {}".format(example.example_id)) - for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features): - logger.info("choice: {}".format(choice_idx)) - logger.info("input_ids: {}".format(" ".join(map(str, input_ids)))) - logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask)))) - logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids)))) - logger.info("label: {}".format(label)) + input_ids = [x["input_ids"] for x in choices_inputs] + attention_mask = ( + [x["attention_mask"] for x in choices_inputs] if 
"attention_mask" in choices_inputs[0] else None + ) + token_type_ids = ( + [x["token_type_ids"] for x in choices_inputs] if "token_type_ids" in choices_inputs[0] else None + ) - features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,)) + features.append( + InputFeatures( + example_id=example.example_id, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label, + ) + ) + + for f in features[:2]: + logger.info("*** Example ***") + logger.info("feature: %s" % f) return features diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 547e1d3afa..88c48020d4 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -31,6 +31,8 @@ from .benchmark_utils import ( start_memory_tracing, stop_memory_tracing, ) + +# Configurations from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, AutoConfig from .configuration_bart import BartConfig @@ -46,8 +48,6 @@ from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, Open from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig - -# Configurations from .configuration_utils import PretrainedConfig from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig @@ -121,6 +121,8 @@ from .pipelines import ( TranslationPipeline, pipeline, ) + +# Tokenizers from .tokenization_albert import AlbertTokenizer from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer from .tokenization_bart import BartTokenizer, MBartTokenizer @@ -136,8 +138,6 @@ from 
.tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast from .tokenization_t5 import T5Tokenizer from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer, TransfoXLTokenizerFast - -# Tokenizers from .tokenization_utils import PreTrainedTokenizer from .tokenization_xlm import XLMTokenizer from .tokenization_xlm_roberta import XLMRobertaTokenizer @@ -162,6 +162,7 @@ if is_torch_available(): AutoModelForQuestionAnswering, AutoModelWithLMHead, AutoModelForTokenClassification, + AutoModelForMultipleChoice, ALL_PRETRAINED_MODEL_ARCHIVE_MAP, MODEL_MAPPING, MODEL_FOR_PRETRAINING_MAPPING, @@ -169,6 +170,7 @@ if is_torch_available(): MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, MODEL_FOR_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_FOR_MULTIPLE_CHOICE_MAPPING, ) from .modeling_bert import ( @@ -320,6 +322,10 @@ if is_torch_available(): get_linear_schedule_with_warmup, ) + # Trainer + from .trainer import Trainer, set_seed, torch_distributed_zero_first, EvalPrediction + from .data.data_collator import DefaultDataCollator, DataCollator, DataCollatorForLanguageModeling + from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments # TensorFlow if is_tf_available(): diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 2066d83d65..026c9d3124 100644 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -87,7 +87,7 @@ class PretrainedConfig(object): self.architectures = kwargs.pop("architectures", None) self.finetuning_task = kwargs.pop("finetuning_task", None) self.num_labels = kwargs.pop("num_labels", 2) - self.id2label = kwargs.pop("id2label", {i: "LABEL_{}".format(i) for i in range(self.num_labels)}) + self.id2label = kwargs.pop("id2label", {i: f"LABEL_{i}" for i in range(self.num_labels)}) self.id2label = 
dict((int(key), value) for key, value in self.id2label.items()) self.label2id = kwargs.pop("label2id", dict(zip(self.id2label.values(), self.id2label.keys()))) self.label2id = dict((key, int(value)) for key, value in self.label2id.items()) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py new file mode 100644 index 0000000000..b8f3f571b6 --- /dev/null +++ b/src/transformers/data/data_collator.py @@ -0,0 +1,144 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Dict, List, NewType, Tuple + +import torch +from torch.nn.utils.rnn import pad_sequence + +from ..tokenization_utils import PreTrainedTokenizer + + +class DataCollator(ABC): + """ + A `DataCollator` is responsible for batching + and pre-processing samples of data as requested by the training loop. + """ + + @abstractmethod + def collate_batch(self) -> Dict[str, torch.Tensor]: + """ + Take a list of samples from a Dataset and collate them into a batch. + + Returns: + A dictionary of tensors + """ + pass + + +InputDataClass = NewType("InputDataClass", Any) + + +@dataclass +class DefaultDataCollator(DataCollator): + """ + Very simple data collator that: + - simply collates batches of dict-like objects + - Performs special handling for potential keys named: + - `label`: handles a single value (int or float) per object + - `label_ids`: handles a list of values per object + - does not do any additional preprocessing + + i.e., Property names of the input object will be used as corresponding inputs to the model. + See glue and ner for example of how it's useful. + """ + + def collate_batch(self, features: List[InputDataClass]) -> Dict[str, torch.Tensor]: + # In this method we'll make the assumption that all `features` in the batch + # have the same attributes. + # So we will look at the first element as a proxy for what attributes exist + # on the whole batch. + first = features[0] + + # Special handling for labels. 
+ # Ensure that tensor is created with the correct type + # (it should be automatically the case, but let's make sure of it.) + if hasattr(first, "label") and first.label is not None: + if type(first.label) is int: + labels = torch.tensor([f.label for f in features], dtype=torch.long) + else: + labels = torch.tensor([f.label for f in features], dtype=torch.float) + batch = {"labels": labels} + elif hasattr(first, "label_ids") and first.label_ids is not None: + if type(first.label_ids[0]) is int: + labels = torch.tensor([f.label_ids for f in features], dtype=torch.long) + else: + labels = torch.tensor([f.label_ids for f in features], dtype=torch.float) + batch = {"labels": labels} + else: + batch = {} + + # Handling of all other possible attributes. + # Again, we will use the first element to figure out which key/values are not None for this model. + for k, v in vars(first).items(): + if k not in ("label", "label_ids") and v is not None and not isinstance(v, str): + batch[k] = torch.tensor([getattr(f, k) for f in features], dtype=torch.long) + return batch + + +@dataclass +class DataCollatorForLanguageModeling(DataCollator): + """ + Data collator used for language modeling. 
+ - collates batches of tensors, honoring their tokenizer's pad_token + - preprocesses batches for masked language modeling + """ + + tokenizer: PreTrainedTokenizer + mlm: bool = True + mlm_probability: float = 0.15 + + def collate_batch(self, examples: List[torch.Tensor]) -> Dict[str, torch.Tensor]: + batch = self._tensorize_batch(examples) + if self.mlm: + inputs, labels = self.mask_tokens(batch) + return {"input_ids": inputs, "masked_lm_labels": labels} + else: + return {"input_ids": batch, "labels": batch} + + def _tensorize_batch(self, examples: List[torch.Tensor]) -> torch.Tensor: + length_of_first = examples[0].size(0) + are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) + if are_tensors_same_length: + return torch.stack(examples, dim=0) + else: + if self.tokenizer._pad_token is None: + raise ValueError( + "You are attempting to pad samples but the tokenizer you are using" + f" ({self.tokenizer.__class__.__name__}) does not have one." + ) + return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id) + + def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. + """ + + if self.tokenizer.mask_token is None: + raise ValueError( + "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." 
+ ) + +labels = inputs.clone() +# We sample a few tokens in each sequence for masked-LM training (with probability self.mlm_probability, which defaults to 0.15 as in Bert/RoBERTa) +probability_matrix = torch.full(labels.shape, self.mlm_probability) +special_tokens_mask = [ +self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() +] +probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) +if self.tokenizer._pad_token is not None: +padding_mask = labels.eq(self.tokenizer.pad_token_id) +probability_matrix.masked_fill_(padding_mask, value=0.0) +masked_indices = torch.bernoulli(probability_matrix).bool() +labels[~masked_indices] = -100 # We only compute loss on masked tokens + +# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) +indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices +inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) + +# 10% of the time, we replace masked input tokens with random word +indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced +random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) +inputs[indices_random] = random_words[indices_random] + +# The rest of the time (10% of the time) we keep the masked input tokens unchanged +return inputs, labels diff --git a/src/transformers/data/datasets/__init__.py b/src/transformers/data/datasets/__init__.py new file mode 100644 index 0000000000..74a2147bc5 --- /dev/null +++ b/src/transformers/data/datasets/__init__.py @@ -0,0 +1,6 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. 
+ +from .glue import GlueDataset, GlueDataTrainingArguments +from .language_modeling import LineByLineTextDataset, TextDataset diff --git a/src/transformers/data/datasets/glue.py b/src/transformers/data/datasets/glue.py new file mode 100644 index 0000000000..63d9b69af8 --- /dev/null +++ b/src/transformers/data/datasets/glue.py @@ -0,0 +1,124 @@ +import logging +import os +import time +from dataclasses import dataclass, field +from typing import List, Optional + +import torch +from torch.utils.data.dataset import Dataset + +from ...tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast +from ...tokenization_utils import PreTrainedTokenizer +from ...tokenization_xlm_roberta import XLMRobertaTokenizer +from ...trainer import torch_distributed_zero_first +from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors +from ..processors.utils import InputFeatures + + +logger = logging.getLogger(__name__) + + +@dataclass +class GlueDataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + + Using `HfArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())}) + data_dir: str = field( + metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."} + ) + max_seq_length: int = field( + default=128, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." 
+ }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + + def __post_init__(self): + self.task_name = self.task_name.lower() + + +class GlueDataset(Dataset): + """ + This will be superseded by a framework-agnostic approach + soon. + """ + + args: GlueDataTrainingArguments + output_mode: str + features: List[InputFeatures] + + def __init__( + self, + args: GlueDataTrainingArguments, + tokenizer: PreTrainedTokenizer, + limit_length: Optional[int] = None, + evaluate=False, + local_rank=-1, + ): + self.args = args + processor = glue_processors[args.task_name]() + self.output_mode = glue_output_modes[args.task_name] + # Load data features from cache or dataset file + cached_features_file = os.path.join( + args.data_dir, + "cached_{}_{}_{}_{}".format( + "dev" if evaluate else "train", tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name, + ), + ) + with torch_distributed_zero_first(local_rank): + # Make sure only the first process in distributed training processes the dataset, + # and the others will use the cache. 
+ + if os.path.exists(cached_features_file) and not args.overwrite_cache: + start = time.time() + self.features = torch.load(cached_features_file) + logger.info( + f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start + ) + else: + logger.info(f"Creating features from dataset file at {args.data_dir}") + label_list = processor.get_labels() + if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in ( + RobertaTokenizer, + RobertaTokenizerFast, + XLMRobertaTokenizer, + ): + # HACK(label indices are swapped in RoBERTa pretrained model) + label_list[1], label_list[2] = label_list[2], label_list[1] + examples = ( + processor.get_dev_examples(args.data_dir) + if evaluate + else processor.get_train_examples(args.data_dir) + ) + if limit_length is not None: + examples = examples[:limit_length] + self.features = glue_convert_examples_to_features( + examples, + tokenizer, + max_length=args.max_seq_length, + label_list=label_list, + output_mode=self.output_mode, + ) + if local_rank in [-1, 0]: + start = time.time() + torch.save(self.features, cached_features_file) + # ^ This seems to take a lot of time so I want to investigate why and how we can improve. 
+ logger.info( + f"Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start + ) + + def __len__(self): + return len(self.features) + + def __getitem__(self, i) -> InputFeatures: + return self.features[i] diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py new file mode 100644 index 0000000000..5695be482b --- /dev/null +++ b/src/transformers/data/datasets/language_modeling.py @@ -0,0 +1,101 @@ +import logging +import os +import pickle +import time + +import torch +from torch.utils.data.dataset import Dataset + +from ...tokenization_utils import PreTrainedTokenizer +from ...trainer import torch_distributed_zero_first + + +logger = logging.getLogger(__name__) + + +class TextDataset(Dataset): + """ + This will be superseded by a framework-agnostic approach + soon. + """ + + def __init__( + self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False, local_rank=-1, + ): + assert os.path.isfile(file_path) + + block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False) + + directory, filename = os.path.split(file_path) + cached_features_file = os.path.join( + directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,), + ) + + with torch_distributed_zero_first(local_rank): + # Make sure only the first process in distributed training processes the dataset, + # and the others will use the cache. 
+ +if os.path.exists(cached_features_file) and not overwrite_cache: +start = time.time() +with open(cached_features_file, "rb") as handle: +self.examples = pickle.load(handle) +logger.info( +f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start +) + +else: +logger.info(f"Creating features from dataset file at {directory}") + +self.examples = [] +with open(file_path, encoding="utf-8") as f: +text = f.read() + +tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) + +for i in range(0, len(tokenized_text) - block_size + 1, block_size): # Truncate in block of block_size +self.examples.append( +tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]) +) +# Note that we are losing the last truncated example here for the sake of simplicity (no padding) +# If your dataset is small, first you should look for a bigger one :-) and second you +# can change this behavior by adding (model specific) padding. + +start = time.time() +with open(cached_features_file, "wb") as handle: +pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL) +logger.info( +f"Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start +) + +def __len__(self): +return len(self.examples) + +def __getitem__(self, i) -> torch.Tensor: +return torch.tensor(self.examples[i], dtype=torch.long) + + +class LineByLineTextDataset(Dataset): +""" +This will be superseded by a framework-agnostic approach +soon. 
+ """ + + def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, local_rank=-1): + assert os.path.isfile(file_path) + # Here, we do not cache the features, operating under the assumption + # that we will soon use fast multithreaded tokenizers from the + # `tokenizers` repo everywhere =) + logger.info("Creating features from dataset file at %s", file_path) + + with open(file_path, encoding="utf-8") as f: + lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())] + + lines = lines[:50_000] + batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size) + self.examples = batch_encoding["input_ids"] + + def __len__(self): + return len(self.examples) + + def __getitem__(self, i) -> torch.Tensor: + return torch.tensor(self.examples[i], dtype=torch.long) diff --git a/src/transformers/data/processors/glue.py b/src/transformers/data/processors/glue.py index 22c18ad577..cc091e2a7c 100644 --- a/src/transformers/data/processors/glue.py +++ b/src/transformers/data/processors/glue.py @@ -17,6 +17,7 @@ import logging import os +from enum import Enum from typing import List, Optional, Union from ...file_utils import is_tf_available @@ -153,6 +154,11 @@ def _glue_convert_examples_to_features( return features +class OutputMode(Enum): + classification = "classification" + regression = "regression" + + class MrpcProcessor(DataProcessor): """Processor for the MRPC data set (GLUE version).""" diff --git a/src/transformers/data/processors/utils.py b/src/transformers/data/processors/utils.py index 73998cc1c7..eb36551884 100644 --- a/src/transformers/data/processors/utils.py +++ b/src/transformers/data/processors/utils.py @@ -14,13 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import copy import csv import dataclasses import json import logging from dataclasses import dataclass -from typing import Optional +from typing import List, Optional, Union from ...file_utils import is_tf_available, is_torch_available @@ -28,7 +27,7 @@ from ...file_utils import is_tf_available, is_torch_available logger = logging.getLogger(__name__) -@dataclass(frozen=False) +@dataclass class InputExample: """ A single training/test example for simple sequence classification. @@ -50,42 +49,37 @@ class InputExample: def to_json_string(self): """Serializes this instance to a JSON string.""" - return json.dumps(dataclasses.asdict(self), indent=2, sort_keys=True) + "\n" + return json.dumps(dataclasses.asdict(self), indent=2) + "\n" -class InputFeatures(object): +@dataclass(frozen=True) +class InputFeatures: """ A single set of features of data. + Property names are the same names as the corresponding inputs to a model. Args: input_ids: Indices of input sequence tokens in the vocabulary. attention_mask: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens. - token_type_ids: Segment token indices to indicate first and second portions of the inputs. - label: Label corresponding to the input + token_type_ids: (Optional) Segment token indices to indicate first and second + portions of the inputs. Only some models use them. + label: (Optional) Label corresponding to the input. Int for classification problems, + float for regression problems. 
""" - def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None): - self.input_ids = input_ids - self.attention_mask = attention_mask - self.token_type_ids = token_type_ids - self.label = label - - def __repr__(self): - return str(self.to_json_string()) - - def to_dict(self): - """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output + input_ids: List[int] + attention_mask: Optional[List[int]] = None + token_type_ids: Optional[List[int]] = None + label: Optional[Union[int, float]] = None def to_json_string(self): """Serializes this instance to a JSON string.""" - return json.dumps(self.to_dict(), sort_keys=True) + "\n" + return json.dumps(dataclasses.asdict(self)) + "\n" -class DataProcessor(object): +class DataProcessor: """Base class for data converters for sequence classification data sets.""" def get_example_from_tensor_dict(self, tensor_dict): diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 0967199564..cf70217a26 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -456,6 +456,11 @@ def get_from_cache( lock_path = cache_path + ".lock" with FileLock(lock_path): + # If the download just completed while the lock was activated. + if os.path.exists(cache_path) and not force_download: + # Even if returning early like here, the lock will be released. + return cache_path + if resume_download: incomplete_path = cache_path + ".incomplete" @@ -496,3 +501,50 @@ def get_from_cache( json.dump(meta, meta_file) return cache_path + + +class cached_property(property): + """ + Descriptor that mimics @property but caches output in member variable. + + From tensorflow_datasets + + Built-in in functools from Python 3.8. 
+ """ + + def __get__(self, obj, objtype=None): + # See docs.python.org/3/howto/descriptor.html#properties + if obj is None: + return self + if self.fget is None: + raise AttributeError("unreadable attribute") + attr = "__cached_" + self.fget.__name__ + cached = getattr(obj, attr, None) + if cached is None: + cached = self.fget(obj) + setattr(obj, attr, cached) + return cached + + +def torch_required(func): + # Chose a different decorator name than in tests so it's clear they are not the same. + @wraps(func) + def wrapper(*args, **kwargs): + if is_torch_available(): + return func(*args, **kwargs) + else: + raise ImportError(f"Method `{func.__name__}` requires PyTorch.") + + return wrapper + + +def tf_required(func): + # Chose a different decorator name than in tests so it's clear they are not the same. + @wraps(func) + def wrapper(*args, **kwargs): + if is_tf_available(): + return func(*args, **kwargs) + else: + raise ImportError(f"Method `{func.__name__}` requires TF.") + + return wrapper diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py index 89fbe6e59d..a007af33c2 100644 --- a/src/transformers/modeling_auto.py +++ b/src/transformers/modeling_auto.py @@ -55,6 +55,7 @@ from .modeling_bart import ( from .modeling_bert import ( BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForMaskedLM, + BertForMultipleChoice, BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, @@ -64,6 +65,7 @@ from .modeling_bert import ( from .modeling_camembert import ( CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP, CamembertForMaskedLM, + CamembertForMultipleChoice, CamembertForSequenceClassification, CamembertForTokenClassification, CamembertModel, @@ -96,6 +98,7 @@ from .modeling_openai import OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OpenAIGPTL from .modeling_roberta import ( ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, RobertaForMaskedLM, + RobertaForMultipleChoice, RobertaForQuestionAnswering, RobertaForSequenceClassification, 
RobertaForTokenClassification, @@ -114,12 +117,14 @@ from .modeling_xlm import ( from .modeling_xlm_roberta import ( XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, XLMRobertaForMaskedLM, + XLMRobertaForMultipleChoice, XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification, XLMRobertaModel, ) from .modeling_xlnet import ( XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple, XLNetForSequenceClassification, XLNetForTokenClassification, @@ -259,7 +264,18 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict( ) -class AutoModel(object): +MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( + [ + (CamembertConfig, CamembertForMultipleChoice), + (XLMRobertaConfig, XLMRobertaForMultipleChoice), + (RobertaConfig, RobertaForMultipleChoice), + (BertConfig, BertForMultipleChoice), + (XLNetConfig, XLNetForMultipleChoice), + ] +) + + +class AutoModel: r""" :class:`~transformers.AutoModel` is a generic model class that will be instantiated as one of the base model classes of the library @@ -410,7 +426,7 @@ class AutoModel(object): ) -class AutoModelForPreTraining(object): +class AutoModelForPreTraining: r""" :class:`~transformers.AutoModelForPreTraining` is a generic model class that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)` @@ -552,7 +568,7 @@ class AutoModelForPreTraining(object): ) -class AutoModelWithLMHead(object): +class AutoModelWithLMHead: r""" :class:`~transformers.AutoModelWithLMHead` is a generic model class that will be instantiated as one of the language modeling model classes of the library @@ -696,7 +712,7 @@ class AutoModelWithLMHead(object): ) -class AutoModelForSequenceClassification(object): +class AutoModelForSequenceClassification: r""" :class:`~transformers.AutoModelForSequenceClassification` is a generic model class that will be 
class AutoModelForMultipleChoice:
    r"""
    Factory for multiple choice models.

    :class:`~transformers.AutoModelForMultipleChoice` picks the concrete model class from
    ``MODEL_FOR_MULTIPLE_CHOICE_MAPPING`` according to the type of the configuration, either
    through `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)` or
    through `AutoModelForMultipleChoice.from_config(config)`.

    Calling `__init__()` directly always raises an error.
    """

    def __init__(self):
        message = (
            "AutoModelForMultipleChoice is designed to be instantiated "
            "using the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)` or "
            "`AutoModelForMultipleChoice.from_config(config)` methods."
        )
        raise EnvironmentError(message)

    @classmethod
    def from_config(cls, config):
        """Build a freshly initialised model from the first mapping entry whose config class matches ``config``."""
        architecture = next(
            (
                model_cls
                for cfg_cls, model_cls in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items()
                if isinstance(config, cfg_cls)
            ),
            None,
        )
        if architecture is not None:
            return architecture(config)

        supported = ", ".join(c.__name__ for c in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys())
        raise ValueError(
            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
            "Model type should be one of {}.".format(config.__class__, cls.__name__, supported)
        )

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """
        Load pretrained weights into the model class matching the configuration.

        A ``config`` keyword argument is used as-is when it is a ``PretrainedConfig``;
        otherwise the configuration is resolved with ``AutoConfig.from_pretrained``.
        """
        config = kwargs.pop("config", None)
        if not isinstance(config, PretrainedConfig):
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        architecture = next(
            (
                model_cls
                for cfg_cls, model_cls in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items()
                if isinstance(config, cfg_cls)
            ),
            None,
        )
        if architecture is not None:
            return architecture.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)

        supported = ", ".join(c.__name__ for c in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys())
        raise ValueError(
            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
            "Model type should be one of {}.".format(config.__class__, cls.__name__, supported)
        )
@contextmanager
def torch_distributed_zero_first(local_rank: int):
    """
    Context manager that makes all processes in distributed training wait for the first one (locally)
    to run the body before they run it themselves.

    Ranks other than -1 and 0 block on a barrier before entering the body; rank 0 joins that barrier
    once its body has finished. With ``local_rank == -1`` (no distributed training) no barrier is used.

    Args:
        local_rank: Local rank of the calling process, or -1 when not running distributed.
    """
    if local_rank not in [-1, 0]:
        torch.distributed.barrier()
    try:
        yield
    finally:
        # Reach the barrier even when the body raised on rank 0; otherwise the other
        # ranks would stay blocked in their barrier call above.
        if local_rank == 0:
            torch.distributed.barrier()
+ """ + + model: PreTrainedModel + args: TrainingArguments + data_collator: DataCollator + train_dataset: Optional[Dataset] + eval_dataset: Optional[Dataset] + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None + prediction_loss_only: bool + tb_writer: Optional["SummaryWriter"] = None + + def __init__( + self, + model: PreTrainedModel, + args: TrainingArguments, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Dataset] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + prediction_loss_only=False, + ): + """ + Trainer is a simple but feature-complete training and eval loop for PyTorch, + optimized for Transformers. + + Args: + prediction_loss_only: + (Optional) in evaluation and prediction, only return the loss + """ + self.model = model + self.args = args + if data_collator is not None: + self.data_collator = data_collator + else: + self.data_collator = DefaultDataCollator() + self.train_dataset = train_dataset + self.eval_dataset = eval_dataset + self.compute_metrics = compute_metrics + self.prediction_loss_only = prediction_loss_only + if is_tensorboard_available() and self.args.local_rank in [-1, 0]: + self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir) + if not is_tensorboard_available(): + logger.warning( + "You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it." 
+ ) + set_seed(self.args.seed) + # Create output directory if needed + if self.args.local_rank in [-1, 0]: + os.makedirs(self.args.output_dir, exist_ok=True) + + def get_train_dataloader(self) -> DataLoader: + if self.train_dataset is None: + raise ValueError("Trainer: training requires a train_dataset.") + train_sampler = ( + RandomSampler(self.train_dataset) if self.args.local_rank == -1 else DistributedSampler(self.train_dataset) + ) + return DataLoader( + self.train_dataset, + batch_size=self.args.train_batch_size, + sampler=train_sampler, + collate_fn=self.data_collator.collate_batch, + ) + + def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: + if eval_dataset is None and self.eval_dataset is None: + raise ValueError("Trainer: evaluation requires an eval_dataset.") + return DataLoader( + eval_dataset if eval_dataset is not None else self.eval_dataset, + batch_size=self.args.eval_batch_size, + shuffle=False, + collate_fn=self.data_collator.collate_batch, + ) + + def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: + # We use the same batch_size as for eval. 
+ return DataLoader( + test_dataset, + batch_size=self.args.eval_batch_size, + shuffle=False, + collate_fn=self.data_collator.collate_batch, + ) + + def get_optimizers( + self, num_training_steps: int + ) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]: + # Prepare optimizer and schedule (linear warmup and decay) + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": self.args.weight_decay, + }, + { + "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps + ) + return optimizer, scheduler + + def train(self, model_path: Optional[str] = None): + """ + Main training entry point. + + Args: + model_path: + (Optional) Local path to model if model to train has been instantiated from a local path + If present, we will try reloading the optimizer/scheduler states from there. 
+ """ + train_dataloader = self.get_train_dataloader() + + if self.args.max_steps > 0: + t_total = self.args.max_steps + num_train_epochs = ( + self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1 + ) + else: + t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs) + num_train_epochs = self.args.num_train_epochs + + optimizer, scheduler = self.get_optimizers(num_training_steps=t_total) + + # Check if saved optimizer or scheduler states exist + if ( + model_path is not None + and os.path.isfile(os.path.join(model_path, "optimizer.pt")) + and os.path.isfile(os.path.join(model_path, "scheduler.pt")) + ): + # Load in optimizer and scheduler states + optimizer.load_state_dict(torch.load(os.path.join(model_path, "optimizer.pt"))) + scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt"))) + + model = self.model + model.to(self.args.device) + if self.args.fp16: + if not is_apex_available(): + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level) + + # multi-gpu training (should be after apex fp16 initialization) + if self.args.n_gpu > 1: + model = torch.nn.DataParallel(model) + + # Distributed training (should be after apex fp16 initialization) + if self.args.local_rank != -1: + model = torch.nn.parallel.DistributedDataParallel( + model, + device_ids=[self.args.local_rank], + output_device=self.args.local_rank, + find_unused_parameters=True, + ) + + if self.tb_writer is not None: + self.tb_writer.add_text("args", self.args.to_json_string()) + + # Train! 
+ logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_dataloader.dataset)) + logger.info(" Num Epochs = %d", num_train_epochs) + logger.info(" Instantaneous batch size per GPU = %d", self.args.per_gpu_train_batch_size) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + self.args.train_batch_size + * self.args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1), + ) + logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) + logger.info(" Total optimization steps = %d", t_total) + + global_step = 0 + epochs_trained = 0 + steps_trained_in_current_epoch = 0 + # Check if continuing training from a checkpoint + if model_path is not None: + # set global_step to global_step of last saved checkpoint from model path + try: + global_step = int(model_path.split("-")[-1].split("/")[0]) + epochs_trained = global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps) + steps_trained_in_current_epoch = global_step % ( + len(train_dataloader) // self.args.gradient_accumulation_steps + ) + + logger.info(" Continuing training from checkpoint, will skip to saved global_step") + logger.info(" Continuing training from epoch %d", epochs_trained) + logger.info(" Continuing training from global step %d", global_step) + logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) + except ValueError: + global_step = 0 + logger.info(" Starting fine-tuning.") + + tr_loss = 0.0 + logging_loss = 0.0 + model.zero_grad() + train_iterator = trange( + epochs_trained, int(num_train_epochs), desc="Epoch", disable=self.args.local_rank not in [-1, 0], + ) + for epoch in train_iterator: + epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.args.local_rank not in [-1, 0]) + for step, inputs in enumerate(epoch_iterator): + + # Skip past any already trained steps if 
resuming training + if steps_trained_in_current_epoch > 0: + steps_trained_in_current_epoch -= 1 + continue + + tr_loss += self._training_step(model, inputs, optimizer) + + if (step + 1) % self.args.gradient_accumulation_steps == 0 or ( + # last step in epoch but step is always smaller than gradient_accumulation_steps + len(epoch_iterator) <= self.args.gradient_accumulation_steps + and (step + 1) == len(epoch_iterator) + ): + if self.args.fp16: + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm) + else: + torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm) + + optimizer.step() + scheduler.step() + model.zero_grad() + global_step += 1 + + if self.args.local_rank in [-1, 0]: + if (self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0) or ( + global_step == 1 and self.args.logging_first_step + ): + logs = {} + if self.args.evaluate_during_training: + results = self.evaluate() + for key, value in results.items(): + eval_key = "eval_{}".format(key) + logs[eval_key] = value + + loss_scalar = (tr_loss - logging_loss) / self.args.logging_steps + learning_rate_scalar = scheduler.get_last_lr()[0] + logs["learning_rate"] = learning_rate_scalar + logs["loss"] = loss_scalar + logging_loss = tr_loss + + if self.tb_writer: + for k, v in logs.items(): + self.tb_writer.add_scalar(k, v, global_step) + epoch_iterator.write(json.dumps({**logs, **{"step": global_step}})) + + if self.args.save_steps > 0 and global_step % self.args.save_steps == 0: + # In all cases (even distributed/parallel), self.model is always a reference + # to the model we want to save. 
+ if hasattr(model, "module"): + assert model.module is self.model + else: + assert model is self.model + # Save model checkpoint + output_dir = os.path.join(self.args.output_dir, f"checkpoint-{global_step}") + self.save_model(output_dir) + self._rotate_checkpoints() + torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) + torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) + logger.info("Saving optimizer and scheduler states to %s", output_dir) + + if self.args.max_steps > 0 and global_step > self.args.max_steps: + epoch_iterator.close() + break + if self.args.max_steps > 0 and global_step > self.args.max_steps: + train_iterator.close() + break + + if self.tb_writer: + self.tb_writer.close() + + logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") + return TrainOutput(global_step, tr_loss / global_step) + + def _training_step( + self, model: nn.Module, inputs: Dict[str, torch.Tensor], optimizer: torch.optim.Optimizer + ) -> float: + model.train() + for k, v in inputs.items(): + inputs[k] = v.to(self.args.device) + + outputs = model(**inputs) + loss = outputs[0] # model outputs are always tuple in transformers (see doc) + + if self.args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + if self.args.gradient_accumulation_steps > 1: + loss = loss / self.args.gradient_accumulation_steps + + if self.args.fp16: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + + return loss.item() + + def is_world_master(self) -> bool: + """ + This will be True only in one process, even in distributed mode, + even when training on multiple machines. 
+ """ + return self.args.local_rank == -1 or torch.distributed.get_rank() == 0 + + def save_model(self, output_dir: Optional[str] = None): + """ + Saving best-practices: if you use default names for the model, + you can reload it using from_pretrained(). + + Will only save from the master process. + """ + if self.is_world_master(): + self._save(output_dir) + + def _save(self, output_dir: Optional[str] = None): + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + logger.info("Saving model checkpoint to %s", output_dir) + # Save a trained model and configuration using `save_pretrained()`. + # They can then be reloaded using `from_pretrained()` + if not isinstance(self.model, PreTrainedModel): + raise ValueError("Trainer.model appears to not be a PreTrainedModel") + self.model.save_pretrained(output_dir) + + # Good practice: save your training arguments together with the trained model + torch.save(self.args, os.path.join(output_dir, "training_args.bin")) + + def _sorted_checkpoints(self, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False) -> List[str]: + ordering_and_checkpoint_path = [] + + glob_checkpoints = Path(self.args.output_dir).glob(f"{checkpoint_prefix}-*") + + for path in glob_checkpoints: + if use_mtime: + ordering_and_checkpoint_path.append((os.path.getmtime(path), path)) + else: + regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path) + if regex_match and regex_match.groups(): + ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path)) + + checkpoints_sorted = sorted(ordering_and_checkpoint_path) + checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted] + return checkpoints_sorted + + def _rotate_checkpoints(self, use_mtime=False) -> None: + if not self.args.save_total_limit: + return + if self.args.save_total_limit <= 0: + return + + # Check if we should delete older checkpoint(s) + checkpoints_sorted = 
self._sorted_checkpoints(use_mtime=use_mtime) + if len(checkpoints_sorted) <= self.args.save_total_limit: + return + + number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - self.args.save_total_limit) + checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete] + for checkpoint in checkpoints_to_be_deleted: + logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint)) + shutil.rmtree(checkpoint) + + def evaluate( + self, eval_dataset: Optional[Dataset] = None, prediction_loss_only: Optional[bool] = None + ) -> Dict[str, float]: + """ + Run evaluation and return metrics. + + The calling script will be responsible for providing a method to compute metrics, as they are + task-dependent. + + Args: + eval_dataset: (Optional) Pass a dataset if you wish to override + the one on the instance. + Returns: + A dict containing: + - the eval loss + - the potential metrics computed from the predictions + """ + eval_dataloader = self.get_eval_dataloader(eval_dataset) + + output = self._prediction_loop(eval_dataloader, description="Evaluation") + return output.metrics + + def predict(self, test_dataset: Dataset) -> PredictionOutput: + """ + Run prediction and return predictions and potential metrics. + + Depending on the dataset and your use case, your test dataset may contain labels. + In that case, this method will also return metrics, like in evaluate(). + """ + test_dataloader = self.get_test_dataloader(test_dataset) + return self._prediction_loop(test_dataloader, description="Prediction") + + def _prediction_loop( + self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None + ) -> PredictionOutput: + """ + Prediction/evaluation loop, shared by `evaluate()` and `predict()`. + + Works both with or without labels. 
+ """ + + prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only + + # multi-gpu eval + if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel): + model = torch.nn.DataParallel(self.model) + else: + model = self.model + model.to(self.args.device) + + logger.info("***** Running %s *****", description) + logger.info(" Num examples = %d", len(dataloader.dataset)) + logger.info(" Batch size = %d", dataloader.batch_size) + eval_losses: List[float] = [] + preds: np.ndarray = None + label_ids: np.ndarray = None + model.eval() + + for inputs in tqdm(dataloader, desc=description): + has_labels = any(inputs.get(k) is not None for k in ["labels", "masked_lm_labels"]) + + for k, v in inputs.items(): + inputs[k] = v.to(self.args.device) + + with torch.no_grad(): + outputs = model(**inputs) + if has_labels: + step_eval_loss, logits = outputs[:2] + eval_losses += [step_eval_loss.mean().item()] + else: + logits = outputs[0] + + if not prediction_loss_only: + if preds is None: + preds = logits.detach().cpu().numpy() + else: + preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) + if inputs.get("labels") is not None: + if label_ids is None: + label_ids = inputs["labels"].detach().cpu().numpy() + else: + label_ids = np.append(label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) + + if self.compute_metrics is not None and preds is not None and label_ids is not None: + metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) + else: + metrics = {} + if len(eval_losses) > 0: + metrics["loss"] = np.mean(eval_losses) + + return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index b48486dfb0..af32eac25b 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1,5 +1,17 @@ +import dataclasses +import json +import logging from 
dataclasses import dataclass, field -from typing import Optional +from typing import Optional, Tuple + +from .file_utils import cached_property, is_torch_available, torch_required + + +if is_torch_available(): + import torch + + +logger = logging.getLogger(__name__) @dataclass @@ -22,6 +34,7 @@ class TrainingArguments: do_train: bool = field(default=False, metadata={"help": "Whether to run training."}) do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."}) + do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."}) evaluate_during_training: bool = field( default=False, metadata={"help": "Run evaluation during training at each logging step."} ) @@ -44,6 +57,8 @@ class TrainingArguments: ) warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."}) + logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."}) + logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"}) logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."}) save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."}) save_total_limit: Optional[int] = field( @@ -52,12 +67,6 @@ class TrainingArguments: "help": "Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default" }, ) - eval_all_checkpoints: bool = field( - default=False, - metadata={ - "help": "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" - }, - ) no_cuda: bool = field(default=False, metadata={"help": "Avoid using CUDA even if it is available"}) seed: int = field(default=42, metadata={"help": "random seed for initialization"}) @@ -73,3 +82,47 @@ class TrainingArguments: }, ) local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"}) 
+ + @property + def train_batch_size(self) -> int: + return self.per_gpu_train_batch_size * max(1, self.n_gpu) + + @property + def eval_batch_size(self) -> int: + return self.per_gpu_eval_batch_size * max(1, self.n_gpu) + + @cached_property + @torch_required + def _setup_devices(self) -> Tuple["torch.device", int]: + logger.info("PyTorch: setting up devices") + if self.no_cuda: + device = torch.device("cpu") + n_gpu = 0 + elif self.local_rank == -1: + # if n_gpu is > 1 we'll use nn.DataParallel. + # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + n_gpu = torch.cuda.device_count() + else: + # Here, we'll use torch.distributed. + # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + torch.distributed.init_process_group(backend="nccl") + device = torch.device("cuda", self.local_rank) + n_gpu = 1 + return device, n_gpu + + @property + @torch_required + def device(self) -> "torch.device": + return self._setup_devices[0] + + @property + @torch_required + def n_gpu(self): + return self._setup_devices[1] + + def to_json_string(self): + """ + Serializes this instance to a JSON string. 
+ """ + return json.dumps(dataclasses.asdict(self), indent=2) diff --git a/tests/test_hf_argparser.py b/tests/test_hf_argparser.py index 232d2e86af..f03b3a6819 100644 --- a/tests/test_hf_argparser.py +++ b/tests/test_hf_argparser.py @@ -5,8 +5,7 @@ from dataclasses import dataclass, field from enum import Enum from typing import Optional -from transformers.hf_argparser import HfArgumentParser -from transformers.training_args import TrainingArguments +from transformers import HfArgumentParser, TrainingArguments @dataclass diff --git a/tests/test_trainer.py b/tests/test_trainer.py new file mode 100644 index 0000000000..66195c20eb --- /dev/null +++ b/tests/test_trainer.py @@ -0,0 +1,109 @@ +import unittest + +from transformers import AutoTokenizer, TrainingArguments, is_torch_available + +from .utils import require_torch + + +if is_torch_available(): + import torch + from transformers import ( + Trainer, + LineByLineTextDataset, + AutoModelForSequenceClassification, + DefaultDataCollator, + DataCollatorForLanguageModeling, + GlueDataset, + GlueDataTrainingArguments, + TextDataset, + ) + + +PATH_SAMPLE_TEXT = "./tests/fixtures/sample_text.txt" + + +@require_torch +class DataCollatorIntegrationTest(unittest.TestCase): + def test_default_classification(self): + MODEL_ID = "bert-base-cased-finetuned-mrpc" + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + data_args = GlueDataTrainingArguments( + task_name="mrpc", data_dir="./examples/tests_samples/MRPC", overwrite_cache=True + ) + dataset = GlueDataset(data_args, tokenizer=tokenizer, evaluate=True) + data_collator = DefaultDataCollator() + batch = data_collator.collate_batch(dataset.features) + self.assertEqual(batch["labels"].dtype, torch.long) + + def test_default_regression(self): + MODEL_ID = "distilroberta-base" + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + data_args = GlueDataTrainingArguments( + task_name="sts-b", data_dir="./examples/tests_samples/STS-B", overwrite_cache=True + ) + dataset = 
GlueDataset(data_args, tokenizer=tokenizer, evaluate=True) + data_collator = DefaultDataCollator() + batch = data_collator.collate_batch(dataset.features) + self.assertEqual(batch["labels"].dtype, torch.float) + + def test_lm_tokenizer_without_padding(self): + tokenizer = AutoTokenizer.from_pretrained("gpt2") + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) + # ^ causal lm + + dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512) + examples = [dataset[i] for i in range(len(dataset))] + with self.assertRaises(ValueError): + # Expect error due to padding token missing on gpt2: + data_collator.collate_batch(examples) + + dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True) + examples = [dataset[i] for i in range(len(dataset))] + batch = data_collator.collate_batch(examples) + self.assertIsInstance(batch, dict) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 512))) + + def test_lm_tokenizer_with_padding(self): + tokenizer = AutoTokenizer.from_pretrained("distilroberta-base") + data_collator = DataCollatorForLanguageModeling(tokenizer) + # ^ masked lm + + dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512) + examples = [dataset[i] for i in range(len(dataset))] + batch = data_collator.collate_batch(examples) + self.assertIsInstance(batch, dict) + self.assertEqual(batch["input_ids"].shape, torch.Size((31, 107))) + self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((31, 107))) + + dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True) + examples = [dataset[i] for i in range(len(dataset))] + batch = data_collator.collate_batch(examples) + self.assertIsInstance(batch, dict) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512))) + self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((2, 512))) 
+ + +@require_torch +class TrainerIntegrationTest(unittest.TestCase): + def test_trainer_eval_mrpc(self): + MODEL_ID = "bert-base-cased-finetuned-mrpc" + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID) + data_args = GlueDataTrainingArguments( + task_name="mrpc", data_dir="./examples/tests_samples/MRPC", overwrite_cache=True + ) + eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, evaluate=True) + + training_args = TrainingArguments(output_dir="./examples", no_cuda=True) + trainer = Trainer(model=model, args=training_args, eval_dataset=eval_dataset) + result = trainer.evaluate() + self.assertLess(result["loss"], 0.2) + + def test_trainer_eval_lm(self): + MODEL_ID = "distilroberta-base" + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + dataset = LineByLineTextDataset( + tokenizer=tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=tokenizer.max_len_single_sentence, + ) + self.assertEqual(len(dataset), 31)