From 1dc9b3c7847269961458c059ad8ad443b26bf60d Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 22 Apr 2020 01:15:10 +0000 Subject: [PATCH] Fixes #3877 --- docs/source/bertology.rst | 2 +- examples/run_bertology.py | 96 +++++++++++++++++++-------------------- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/docs/source/bertology.rst b/docs/source/bertology.rst index c3d1b2f8b8..59158ca279 100644 --- a/docs/source/bertology.rst +++ b/docs/source/bertology.rst @@ -8,7 +8,7 @@ There is a growing field of study concerned with investigating the inner working * Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 * What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341 -In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650): +In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650): * accessing all the hidden-states of BERT/GPT/GPT-2, diff --git a/examples/run_bertology.py b/examples/run_bertology.py index d18b8bc3a2..2904358f90 100644 --- a/examples/run_bertology.py +++ b/examples/run_bertology.py @@ -30,10 +30,17 @@ from torch.utils.data import DataLoader, SequentialSampler, Subset from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm -from run_glue import ALL_MODELS, MODEL_CLASSES, load_and_cache_examples, set_seed -from transformers import glue_compute_metrics as compute_metrics -from transformers import glue_output_modes as output_modes -from transformers import glue_processors as processors +from transformers import ( + AutoConfig, + AutoModelForSequenceClassification, + AutoTokenizer, + DefaultDataCollator, + GlueDataset, + glue_compute_metrics, + glue_output_modes, + glue_processors, + set_seed, +) logger = logging.getLogger(__name__) @@ -64,7 +71,7 @@ def compute_heads_importance( - head importance scores according to http://arxiv.org/abs/1905.10650 """ # Prepare our tensors - n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads + n_layers, n_heads = model.config.num_hidden_layers, model.config.num_attention_heads head_importance = torch.zeros(n_layers, n_heads).to(args.device) attn_entropy = torch.zeros(n_layers, n_heads).to(args.device) @@ -75,14 +82,12 @@ def compute_heads_importance( labels = None tot_tokens = 0.0 - for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): - batch = tuple(t.to(args.device) for t in batch) - input_ids, input_mask, segment_ids, label_ids = batch + for step, inputs in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): + for k, v in inputs.items(): + inputs[k] = v.to(args.device) # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below) - outputs = model( - input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask - ) + outputs = model(**inputs, head_mask=head_mask) loss, logits, all_attentions = ( outputs[0], outputs[1], @@ -92,7 +97,7 @@ def compute_heads_importance( if compute_entropy: for layer, attn in enumerate(all_attentions): - masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1) + masked_entropy = entropy(attn.detach()) * inputs["attention_mask"].float().unsqueeze(1) attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach() if compute_importance: @@ -101,12 +106,12 @@ def compute_heads_importance( # Also store our logits/labels if we want to compute metrics afterwards if preds is None: preds = logits.detach().cpu().numpy() - labels = label_ids.detach().cpu().numpy() + labels = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - labels = np.append(labels, label_ids.detach().cpu().numpy(), axis=0) + labels = np.append(labels, inputs["labels"].detach().cpu().numpy(), axis=0) - tot_tokens += input_mask.float().detach().sum().data + tot_tokens += inputs["attention_mask"].float().detach().sum().data # Normalize attn_entropy /= tot_tokens @@ -145,7 +150,7 @@ def mask_heads(args, model, eval_dataloader): """ _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False) preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) - original_score = compute_metrics(args.task_name, preds, labels)[args.metric_name] + original_score = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name] logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold) new_head_mask = torch.ones_like(head_importance) @@ -174,7 +179,7 @@ def mask_heads(args, model, eval_dataloader): args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask ) preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) - current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name] + current_score = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name] logger.info( "Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, @@ -200,7 +205,7 @@ def prune_heads(args, model, eval_dataloader, head_mask): args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=head_mask ) preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) - score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name] + score_masking = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name] original_time = datetime.now() - before_time original_num_params = sum(p.numel() for p in model.parameters()) @@ -214,7 +219,7 @@ def prune_heads(args, model, eval_dataloader, head_mask): args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=None ) preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) - score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name] + score_pruning = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name] new_time = datetime.now() - before_time logger.info( @@ -242,14 +247,14 @@ def main(): default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + help="Path to pretrained model or model identifier from huggingface.co/models", ) parser.add_argument( "--task_name", default=None, type=str, required=True, - help="The name of the task to train selected in the list: " + ", ".join(processors.keys()), + help="The name of the task to train selected in the list: " + ", ".join(glue_processors.keys()), ) parser.add_argument( "--output_dir", @@ -274,7 +279,7 @@ def main(): ) parser.add_argument( "--cache_dir", - default="", + default=None, type=str, help="Where do you want to store the pre-trained models downloaded from s3", ) @@ -350,48 +355,40 @@ def main(): logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1))) # Set seeds - set_seed(args) + set_seed(args.seed) # Prepare GLUE task args.task_name = args.task_name.lower() - if args.task_name not in processors: + if args.task_name not in glue_processors: raise ValueError("Task not found: %s" % (args.task_name)) - processor = processors[args.task_name]() - args.output_mode = output_modes[args.task_name] + processor = glue_processors[args.task_name]() + args.output_mode = glue_output_modes[args.task_name] label_list = processor.get_labels() num_labels = len(label_list) # Load pretrained model and tokenizer - if args.local_rank not in [-1, 0]: - torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. - args.model_type = "" - for key in MODEL_CLASSES: - if key in args.model_name_or_path.lower(): - args.model_type = key # take the first match in model types - break - config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained( + config = AutoConfig.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name, output_attentions=True, - cache_dir=args.cache_dir if args.cache_dir else None, + cache_dir=args.cache_dir, ) - tokenizer = tokenizer_class.from_pretrained( - args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - cache_dir=args.cache_dir if args.cache_dir else None, + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir, ) - model = model_class.from_pretrained( + model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, - cache_dir=args.cache_dir if args.cache_dir else None, + cache_dir=args.cache_dir, ) - if args.local_rank == 0: - torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab - # Distributed and parallel training model.to(args.device) if args.local_rank != -1: @@ -402,15 +399,18 @@ def main(): model = torch.nn.DataParallel(model) # Print/save training arguments + os.makedirs(args.output_dir, exist_ok=True) torch.save(args, os.path.join(args.output_dir, "run_args.bin")) logger.info("Training/evaluation parameters %s", args) # Prepare dataset for the GLUE task - eval_data = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True) + eval_dataset = GlueDataset(args, tokenizer=tokenizer, evaluate=True, local_rank=args.local_rank) if args.data_subset > 0: - eval_data = Subset(eval_data, list(range(min(args.data_subset, len(eval_data))))) - eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data) - eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) + eval_dataset = Subset(eval_dataset, list(range(min(args.data_subset, len(eval_dataset))))) + eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) + eval_dataloader = DataLoader( + eval_dataset, sampler=eval_sampler, batch_size=args.batch_size, collate_fn=DefaultDataCollator().collate_batch + ) # Compute head entropy and importance score compute_heads_importance(args, model, eval_dataloader)