From 2397f958f99767290e8bc54f96e1df62f63d34af Mon Sep 17 00:00:00 2001 From: thomwolf Date: Sun, 14 Jul 2019 23:20:10 +0200 Subject: [PATCH] updating examples and doc --- README.md | 21 +- docs/source/index.rst | 19 +- docs/source/model_doc/bert.rst | 4 +- docs/source/model_doc/gpt.rst | 7 - docs/source/model_doc/overview.rst | 2 +- .../lm_finetuning/finetune_on_pregenerated.py | 4 +- examples/run_bertology.py | 384 +++++---- examples/run_generation.py | 2 +- examples/run_glue.py | 9 +- examples/run_squad.py | 38 +- .../run_openai_gpt.py | 4 +- .../{ => single_model_scripts}/run_swag.py | 2 +- .../run_transfo_xl.py | 0 examples/test_examples.py | 1 - pytorch_transformers/modeling_bert.py | 764 ++++++++---------- pytorch_transformers/modeling_utils.py | 7 + 16 files changed, 601 insertions(+), 667 deletions(-) rename examples/{ => single_model_scripts}/run_openai_gpt.py (98%) rename examples/{ => single_model_scripts}/run_swag.py (99%) rename examples/{ => single_model_scripts}/run_transfo_xl.py (100%) diff --git a/README.md b/README.md index f916627b90..dba18a0d5e 100644 --- a/README.md +++ b/README.md @@ -131,11 +131,8 @@ This package comprises the following classes that can be imported in Python and - Tokenizer for **OpenAI GPT-2** (using byte-level Byte-Pair-Encoding) (in the [`tokenization_gpt2.py`](./pytorch_transformers/tokenization_gpt2.py) file): - `GPT2Tokenizer` - perform byte-level Byte-Pair-Encoding (BPE) tokenization. -- Optimizer for **BERT** (in the [`optimization.py`](./pytorch_transformers/optimization.py) file): - - `BertAdam` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate. - -- Optimizer for **OpenAI GPT** (in the [`optimization_openai.py`](./pytorch_transformers/optimization_openai.py) file): - - `OpenAIAdam` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate. 
+- Optimizer (in the [`optimization.py`](./pytorch_transformers/optimization.py) file): + - `AdamW` - Version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate. - Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_transformers/modeling.py), [`modeling_openai.py`](./pytorch_transformers/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py) files): - `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilities to read and write from JSON configuration files. @@ -1104,12 +1101,11 @@ Please refer to [`tokenization_gpt2.py`](./pytorch_transformers/tokenization_gpt ### Optimizers -#### `BertAdam` +#### `AdamW` -`BertAdam` is a `torch.optimizer` adapted to be closer to the optimizer used in the TensorFlow implementation of Bert. The differences with PyTorch Adam optimizer are the following: +`AdamW` is a `torch.optim.Optimizer` adapted to be closer to the optimizer used in the TensorFlow implementation of Bert. The differences with PyTorch Adam optimizer are the following: -- BertAdam implements weight decay fix, -- BertAdam doesn't compensate for bias as in the regular Adam optimizer. +- AdamW implements weight decay fix. The optimizer accepts the following arguments: @@ -1127,13 +1123,6 @@ The optimizer accepts the following arguments: - `weight_decay:` Weight decay. Default : `0.01` - `max_grad_norm` : Maximum norm for the gradients (`-1` means no clipping). Default : `1.0` -#### `OpenAIAdam` - -`OpenAIAdam` is similar to `BertAdam`. -The differences with `BertAdam` is that `OpenAIAdam` compensate for bias as in the regular Adam optimizer. - -`OpenAIAdam` accepts the same arguments as `BertAdam`. - #### Learning Rate Schedules The `.optimization` module also provides additional schedules in the form of schedule objects that inherit from `_LRSchedule`.
diff --git a/docs/source/index.rst b/docs/source/index.rst index ded234354d..aedb231163 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -60,10 +60,10 @@ This PyTorch implementation of Transformer-XL is an adaptation of the original ` This PyTorch implementation of OpenAI GPT-2 is an adaptation of the `OpenAI's implementation `__ and is provided with `OpenAI's pre-trained model `__ and a command-line interface that was used to convert the TensorFlow checkpoint in PyTorch. **Facebook Research's XLM** was released together with the paper `Cross-lingual Language Model Pretraining `__ by Guillaume Lample and Alexis Conneau. -This PyTorch implementation of XLM is an adaptation of the original `PyTorch implementation `__. TODO Lysandre filled +This PyTorch implementation of XLM is an adaptation of the original `PyTorch implementation `__. **Google's XLNet** was released together with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding `__ by Zhilin Yang\*, Zihang Dai\*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov and Quoc V. Le. -This PyTorch implementation of XLM is an adaptation of the `Tensorflow implementation `__. TODO Lysandre filled +This PyTorch implementation of XLNet is an adaptation of the `TensorFlow implementation `__. Content @@ -91,7 +91,7 @@ Content * - `Migration <./migration.html>`__ - Migrating from ``pytorch_pretrained_BERT`` (v0.6) to ``pytorch_transformers`` (v1.0) * - `Bertology <./bertology.html>`__ - - TODO Lysandre didn't know how to fill + - Exploring the internals of the pretrained models. * - `TorchScript <./torchscript.html>`__ - Convert a model to TorchScript for use in other programming languages @@ -115,8 +115,6 @@ Content * - `XLNet <./model_doc/xlnet.html>`__ - XLNet Models, Tokenizers and optimizers -TODO Lysandre filled: might need an introduction for both parts. Is it even necessary, since there is a summary? Up to you Thom.
- Overview -------- @@ -219,17 +217,10 @@ TODO Lysandre filled: I filled in XLM and XLNet. I didn't do the Tokenizers beca * - Optimizer for **BERT** (in the `optimization.py <./_modules/pytorch_transformers/optimization.html>`__ file): + Optimizer (in the `optimization.py <./_modules/pytorch_transformers/optimization.html>`__ file): - * ``BertAdam`` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate. - - -* - Optimizer for **OpenAI GPT** (in the `optimization_openai.py <./_modules/pytorch_transformers/optimization_openai.html>`__ file): - - - * ``OpenAIAdam`` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate. + * ``AdamW`` - Version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate. * diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst index 3a2e12a6dd..8c786aa24f 100644 --- a/docs/source/model_doc/bert.rst +++ b/docs/source/model_doc/bert.rst @@ -15,10 +15,10 @@ BERT :members: -``BertAdam`` +``AdamW`` ~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.BertAdam +.. autoclass:: pytorch_transformers.AdamW :members: ``BertModel`` diff --git a/docs/source/model_doc/gpt.rst b/docs/source/model_doc/gpt.rst index 815cbe5787..26762ae011 100644 --- a/docs/source/model_doc/gpt.rst +++ b/docs/source/model_doc/gpt.rst @@ -15,13 +15,6 @@ OpenAI GPT :members: -``OpenAIAdam`` -~~~~~~~~~~~~~~~~~~ - -.. autoclass:: pytorch_transformers.OpenAIAdam - :members: - - ``OpenAIGPTModel`` ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/overview.rst b/docs/source/model_doc/overview.rst index 00e538e68d..7e24115ae3 100644 --- a/docs/source/model_doc/overview.rst +++ b/docs/source/model_doc/overview.rst @@ -236,7 +236,7 @@ Learning Rate Schedules The ``.optimization`` module also provides additional schedules in the form of schedule objects that inherit from ``_LRSchedule``. 
All ``_LRSchedule`` subclasses accept ``warmup`` and ``t_total`` arguments at construction. -When an ``_LRSchedule`` object is passed into ``BertAdam`` or ``OpenAIAdam``\ , +When an ``_LRSchedule`` object is passed into ``AdamW``\ , the ``warmup`` and ``t_total`` arguments on the optimizer are ignored and the ones in the ``_LRSchedule`` object are used. An overview of the implemented schedules: diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 505cd466f6..fe958345d1 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -16,7 +16,7 @@ from tqdm import tqdm from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME from pytorch_transformers.modeling_bert import BertForPreTraining from pytorch_transformers.tokenization_bert import BertTokenizer -from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule +from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next") @@ -273,7 +273,7 @@ def main(): warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: - optimizer = BertAdam(optimizer_grouped_parameters, + optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) diff --git a/examples/run_bertology.py b/examples/run_bertology.py index 096b1b44fc..61c7440ecb 100644 --- a/examples/run_bertology.py +++ b/examples/run_bertology.py @@ -1,4 +1,24 @@ #!/usr/bin/env python3 +# Copyright 2018 CMU and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Bertology: this script shows how you can explore the internals of the models in the library to: + - compute the entropy of the head attentions + - compute the importance of each head + - prune (remove) the low importance head. + Some parts of this script are adapted from the code of Michel et al. (http://arxiv.org/abs/1905.10650) + which is available at https://github.com/pmichel31415/are-16-heads-really-better-than-1 +""" import os import argparse import logging @@ -12,43 +32,49 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse from torch.utils.data.distributed import DistributedSampler from torch.nn import CrossEntropyLoss, MSELoss -from pytorch_transformers import BertForSequenceClassification, BertTokenizer +from pytorch_transformers import (WEIGHTS_NAME, + BertConfig, BertForSequenceClassification, BertTokenizer, + XLMConfig, XLMForSequenceClassification, XLMTokenizer, + XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer) -from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics +from run_glue import set_seed, load_and_cache_examples, ALL_MODELS, MODEL_CLASSES +from utils_glue import (compute_metrics, convert_examples_to_features, + output_modes, processors) logger = logging.getLogger(__name__) def entropy(p): + """ Compute the entropy of a probability distribution """ plogp = p * torch.log(p) plogp[p == 0] = 0 return -plogp.sum(dim=-1) -def print_1d_tensor(tensor, prefix=""): - if tensor.dtype != torch.long: - logger.info(prefix + "\t".join(f"{x:.5f}" for x in 
tensor.cpu().data)) - else: - logger.info(prefix + "\t".join(f"{x:d}" for x in tensor.cpu().data)) - - def print_2d_tensor(tensor): + """ Print a 2D tensor """ logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor)))) for row in range(len(tensor)): - print_1d_tensor(tensor[row], prefix=f"layer {row + 1}:\t") + if tensor.dtype != torch.long: + logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:.5f}" for x in tensor[row].cpu().data)) + else: + logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:d}" for x in tensor[row].cpu().data)) def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None): - """ Example on how to use model outputs to compute: - - head attention entropy (activated by setting output_attentions=True when we created the model + """ This method shows how to compute: + - head attention entropy - head importance scores according to http://arxiv.org/abs/1905.10650 - (activated by setting keep_multihead_output=True when we created the model) """ # Prepare our tensors n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads head_importance = torch.zeros(n_layers, n_heads).to(args.device) attn_entropy = torch.zeros(n_layers, n_heads).to(args.device) + + if head_mask is None: + head_mask = torch.ones(n_layers, n_heads).to(args.device) + head_mask.requires_grad_(requires_grad=True) preds = None labels = None tot_tokens = 0.0 @@ -58,29 +84,17 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, input_ids, input_mask, segment_ids, label_ids = batch # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below) - all_attentions, logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, head_mask=head_mask) + outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask) + loss, logits, 
all_attentions = outputs[0], outputs[1], outputs[-1] # Loss and logits are the first, attention the last + loss.backward() # Backpropagate to populate the gradients in the head mask if compute_entropy: - # Update head attention entropy for layer, attn in enumerate(all_attentions): masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1) attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach() if compute_importance: - # Update head importance scores with regards to our loss - # First, backpropagate to populate the gradients - if args.output_mode == "classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, args.num_labels), label_ids.view(-1)) - elif args.output_mode == "regression": - loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), label_ids.view(-1)) - loss.backward() - # Second, compute importance scores according to http://arxiv.org/abs/1905.10650 - multihead_outputs = model.bert.get_multihead_outputs() - for layer, mh_layer_output in enumerate(multihead_outputs): - dot = torch.einsum("bhli,bhli->bhl", [mh_layer_output.grad, mh_layer_output]) - head_importance[layer] += dot.abs().sum(-1).sum(0).detach() + head_importance += head_mask.grad.abs().detach() # Also store our logits/labels if we want to compute metrics afterwards if preds is None: @@ -104,30 +118,137 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, if not args.dont_normalize_global_importance: head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min()) + # Print/save matrices + np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy()) + np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy()) + + logger.info("Attention entropies") + print_2d_tensor(attn_entropy) + logger.info("Head importance scores") + print_2d_tensor(head_importance) + logger.info("Head ranked by importance 
scores") + head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device) + head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device) + head_ranks = head_ranks.view_as(head_importance) + print_2d_tensor(head_ranks) + return attn_entropy, head_importance, preds, labels -def run_model(): +def mask_heads(args, model, eval_dataloader): + """ This method shows how to mask head (set some heads to zero), to test the effect on the network, + based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650) + """ + _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False) + preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) + original_score = compute_metrics(args.task_name, preds, labels)[args.metric_name] + logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold) + + new_head_mask = torch.ones_like(head_importance) + num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount)) + + current_score = original_score + while current_score >= original_score * args.masking_threshold: + head_mask = new_head_mask.clone() # save current head mask + # heads from least important to most - keep only not-masked heads + head_importance[head_mask == 0.0] = float('Inf') + current_heads_to_mask = head_importance.view(-1).sort()[1] + + if len(current_heads_to_mask) <= num_to_mask: + break + + # mask heads + current_heads_to_mask = current_heads_to_mask[:num_to_mask] + logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist())) + new_head_mask = new_head_mask.view(-1) + new_head_mask[current_heads_to_mask] = 0.0 + new_head_mask = new_head_mask.view_as(head_mask) + print_2d_tensor(new_head_mask) + + # Compute metric and head importance again + _, head_importance, preds, labels = 
compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask) + preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) + current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name] + logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100) + + logger.info("Final head mask") + print_2d_tensor(head_mask) + np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy()) + + return head_mask + + +def prune_heads(args, model, eval_dataloader, head_mask): + """ This method shows how to prune head (remove heads weights) based on + the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650) + """ + # Try pruning and test time speedup + # Pruning is like masking but we actually remove the masked weights + before_time = datetime.now() + _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader, + compute_entropy=False, compute_importance=False, head_mask=head_mask) + preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) + score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name] + original_time = datetime.now() - before_time + + original_num_params = sum(p.numel() for p in model.parameters()) + heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().tolist()) for layer in range(len(head_mask))) + assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item() + model.prune_heads(heads_to_prune) + pruned_num_params = sum(p.numel() for p in model.parameters()) + + before_time = datetime.now() + _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader, + compute_entropy=False, compute_importance=False, head_mask=None) + preds = np.argmax(preds, axis=1) if args.output_mode == 
"classification" else np.squeeze(preds) + score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name] + new_time = datetime.now() - before_time + + logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100) + logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning) + logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100) + + +def main(): parser = argparse.ArgumentParser() - parser.add_argument('--model_name_or_path', type=str, default='bert-base-cased-finetuned-mrpc', help='pretrained model name or path to local checkpoint') - parser.add_argument("--task_name", type=str, default='mrpc', help="The name of the task to train.") - parser.add_argument("--data_dir", type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") - parser.add_argument("--output_dir", type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") - parser.add_argument("--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances.") - parser.add_argument("--overwrite_output_dir", action='store_true', help="Whether to overwrite data in output directory") + parser.add_argument("--data_dir", default=None, type=str, required=True, + help="The input data dir. 
Should contain the .tsv files (or other data files) for the task.") + parser.add_argument("--model_name", default=None, type=str, required=True, + help="Bert/XLNet/XLM pre-trained model selected in the list: " + ", ".join(ALL_MODELS)) + parser.add_argument("--task_name", default=None, type=str, required=True, + help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) + parser.add_argument("--output_dir", default=None, type=str, required=True, + help="The output directory where the model predictions and checkpoints will be written.") - parser.add_argument("--dont_normalize_importance_by_layer", action='store_true', help="Don't normalize importance score by layers") - parser.add_argument("--dont_normalize_global_importance", action='store_true', help="Don't normalize all importance scores between 0 and 1") + ## Other parameters + parser.add_argument("--config_name", default="", type=str, + help="Pretrained config name or path if not the same as model_name") + parser.add_argument("--tokenizer_name", default="", type=str, + help="Pretrained tokenizer name or path if not the same as model_name") + parser.add_argument("--cache_dir", default="", type=str, + help="Where do you want to store the pre-trained models downloaded from s3") + parser.add_argument("--data_subset", type=int, default=-1, + help="If > 0: limit the data to a subset of data_subset instances.") + parser.add_argument("--overwrite_output_dir", action='store_true', + help="Whether to overwrite data in output directory") - parser.add_argument("--try_masking", action='store_true', help="Whether to try to mask head until a threshold of accuracy.") - parser.add_argument("--masking_threshold", default=0.9, type=float, help="masking threshold in term of metrics" - "(stop masking when metric < threshold * original metric value).") - parser.add_argument("--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step.") - 
parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.") + parser.add_argument("--dont_normalize_importance_by_layer", action='store_true', + help="Don't normalize importance score by layers") + parser.add_argument("--dont_normalize_global_importance", action='store_true', + help="Don't normalize all importance scores between 0 and 1") - parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" - "Sequences longer than this will be truncated, and sequences shorter \n" - "than this will be padded.") + parser.add_argument("--try_masking", action='store_true', + help="Whether to try to mask head until a threshold of accuracy.") + parser.add_argument("--masking_threshold", default=0.9, type=float, + help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).") + parser.add_argument("--masking_amount", default=0.1, type=float, + help="Amount to heads to masking at each masking step.") + parser.add_argument("--metric_name", default="acc", type=str, + help="Metric to use for head masking.") + + parser.add_argument("--max_seq_length", default=128, type=int, + help="The maximum total input sequence length after WordPiece tokenization. 
\n" + "Sequences longer than this will be truncated, sequences shorter padded.") parser.add_argument("--batch_size", default=1, type=int, help="Batch size.") parser.add_argument("--seed", type=int, default=42) @@ -147,164 +268,79 @@ def run_model(): # Setup devices and distributed training if args.local_rank == -1 or args.no_cuda: args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - n_gpu = torch.cuda.device_count() + args.n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) - n_gpu = 1 + args.n_gpu = 1 torch.distributed.init_process_group(backend='nccl') # Initializes the distributed backend # Setup logging logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, n_gpu, bool(args.local_rank != -1))) + logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1))) # Set seeds - np.random.seed(args.seed) - torch.random.manual_seed(args.seed) - if n_gpu > 0: - torch.cuda.manual_seed(args.seed) + set_seed(args) # Prepare GLUE task - task_name = args.task_name.lower() - processor = processors[task_name]() + args.task_name = args.task_name.lower() + if args.task_name not in processors: + raise ValueError("Task not found: %s" % (args.task_name)) + processor = processors[args.task_name]() + args.output_mode = output_modes[args.task_name] label_list = processor.get_labels() - args.output_mode = output_modes[task_name] - args.num_labels = len(label_list) + num_labels = len(label_list) - # Prepare output directory - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) - if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: - 
os.makedirs(args.output_dir) - - # Load model & tokenizer + # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: - torch.distributed.barrier() # Make sure only one distributed process download model & vocab - tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) + torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab + + args.model_type = "" + for key in MODEL_CLASSES: + if key in args.model_name.lower(): + args.model_type = key # take the first match in model types + break + config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name, + num_labels=num_labels, finetuning_task=args.task_name, + output_attentions=True) + tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name) + model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config) - # Load a model with all BERTology options on: - # output_attentions => will output attention weights - # keep_multihead_output => will store gradient of attention head outputs for head importance computation - # see: http://arxiv.org/abs/1905.10650 - model = BertForSequenceClassification.from_pretrained(args.model_name_or_path, - num_labels=args.num_labels, - output_attentions=True, - keep_multihead_output=True) if args.local_rank == 0: - torch.distributed.barrier() # Make sure only one distributed process download model & vocab + torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab + + # Distributed and parallel training model.to(args.device) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) - model.eval() + model = 
torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], + output_device=args.local_rank, + find_unused_parameters=True) + elif args.n_gpu > 1: + model = torch.nn.DataParallel(model) + + # Print/save training arguments + torch.save(args, os.path.join(args.output_dir, 'run_args.bin')) + logger.info("Training/evaluation parameters %s", args) # Prepare dataset for the GLUE task - eval_examples = processor.get_dev_examples(args.data_dir) - cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format( - list(filter(None, args.model_name_or_path.split('/'))).pop(), str(args.max_seq_length), str(task_name))) - try: - eval_features = torch.load(cached_eval_features_file) - except: - eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, args.output_mode) - if args.local_rank in [-1, 0]: - logger.info("Saving eval features to cache file %s", cached_eval_features_file) - torch.save(eval_features, cached_eval_features_file) - - all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) - all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long if args.output_mode == "classification" else torch.float) - eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - + eval_data = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True) if args.data_subset > 0: eval_data = Subset(eval_data, list(range(min(args.data_subset, len(eval_data))))) - eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) - # Print/save training arguments - print(args) - torch.save(args, 
os.path.join(args.output_dir, 'run_args.bin')) # Compute head entropy and importance score - attn_entropy, head_importance, _, _ = compute_heads_importance(args, model, eval_dataloader) + compute_heads_importance(args, model, eval_dataloader) - # Print/save matrices - np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy()) - np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy()) - logger.info("Attention entropies") - print_2d_tensor(attn_entropy) - logger.info("Head importance scores") - print_2d_tensor(head_importance) - logger.info("Head ranked by importance scores") - head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device) - head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device) - head_ranks = head_ranks.view_as(head_importance) - print_2d_tensor(head_ranks) - - # Do masking if we want to + # Try head masking (set heads to zero until the score goes under a threshole) + # and head pruning (remove masked heads and see the effect on the network) if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0: - _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False) - preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) - original_score = compute_metrics(task_name, preds, labels)[args.metric_name] - logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold) + head_mask = mask_heads(args, model, eval_dataloader) + prune_heads(args, model, eval_dataloader, head_mask) - new_head_mask = torch.ones_like(head_importance) - num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount)) - - current_score = original_score - while current_score >= original_score * args.masking_threshold: - head_mask = 
new_head_mask.clone() # save current head mask - # heads from least important to most - keep only not-masked heads - head_importance[head_mask == 0.0] = float('Inf') - current_heads_to_mask = head_importance.view(-1).sort()[1] - - if len(current_heads_to_mask) <= num_to_mask: - break - - # mask heads - current_heads_to_mask = current_heads_to_mask[:num_to_mask] - logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist())) - new_head_mask = new_head_mask.view(-1) - new_head_mask[current_heads_to_mask] = 0.0 - new_head_mask = new_head_mask.view_as(head_mask) - print_2d_tensor(new_head_mask) - - # Compute metric and head importance again - _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask) - preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) - current_score = compute_metrics(task_name, preds, labels)[args.metric_name] - logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100) - - logger.info("Final head mask") - print_2d_tensor(head_mask) - np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy()) - - # Try pruning and test time speedup - # Pruning is like masking but we actually remove the masked weights - before_time = datetime.now() - _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader, - compute_entropy=False, compute_importance=False, head_mask=head_mask) - preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) - score_masking = compute_metrics(task_name, preds, labels)[args.metric_name] - original_time = datetime.now() - before_time - - original_num_params = sum(p.numel() for p in model.parameters()) - heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().tolist()) for layer in range(len(head_mask))) - 
assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item() - model.bert.prune_heads(heads_to_prune) - pruned_num_params = sum(p.numel() for p in model.parameters()) - - before_time = datetime.now() - _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader, - compute_entropy=False, compute_importance=False, head_mask=None) - preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) - score_pruning = compute_metrics(task_name, preds, labels)[args.metric_name] - new_time = datetime.now() - before_time - - logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100) - logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning) - logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100) if __name__ == '__main__': - run_model() + main() diff --git a/examples/run_generation.py b/examples/run_generation.py index 047e24679f..4108b2894a 100644 --- a/examples/run_generation.py +++ b/examples/run_generation.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Generation with GPT/GPT-2/Transformer-XL/XLNet models +""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/Transformer-XL/XLNet) """ from __future__ import absolute_import, division, print_function, unicode_literals diff --git a/examples/run_glue.py b/examples/run_glue.py index f0633c3f12..ea5cc9f42d 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning a classification model (Bert, XLM, XLNet,...) on GLUE.""" +""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet).""" from __future__ import absolute_import, division, print_function @@ -230,6 +230,9 @@ def evaluate(args, model, tokenizer, prefix=""): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) + if args.local_rank in [-1, 0]: + tb_writer.close() + return results @@ -242,7 +245,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): list(filter(None, args.model_name.split('/'))).pop(), str(args.max_seq_length), str(task))) - if os.path.exists(cached_features_file) and not args.overwrite_cache: + if os.path.exists(cached_features_file): logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: @@ -410,7 +413,7 @@ def main(): if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab - # Distributed and parrallel training + # Distributed and parallel training model.to(args.device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], diff --git a/examples/run_squad.py b/examples/run_squad.py index af4a771f4a..24f00e0518 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning a question-answering model (Bert, XLM, XLNet,...) 
on SQuAD.""" +""" Finetuning the library models for question-answering on SQuAD (Bert, XLM, XLNet).""" from __future__ import absolute_import, division, print_function @@ -21,7 +21,7 @@ import argparse import logging import os import random -from io import open +import glob import numpy as np import torch @@ -43,6 +43,9 @@ from pytorch_transformers import AdamW, WarmupLinearSchedule from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions +# The following import is the official SQuAD evaluation script (2.0). +# You can remove it from the dependencies if you are using this script outside of the library +# We've added it here for automated tests (see examples/test_examples.py file) from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad logger = logging.getLogger(__name__) @@ -123,7 +126,7 @@ def train(args, train_dataset, model, tokenizer): loss = ouputs[0] # model outputs are always tuple in pytorch-transformers (see doc) if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training + loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -169,6 +172,9 @@ def train(args, train_dataset, model, tokenizer): train_iterator.close() break + if args.local_rank in [-1, 0]: + tb_writer.close() + return global_step, tr_loss / global_step @@ -208,16 +214,16 @@ def evaluate(args, model, tokenizer, prefix=""): start_logits=start_logits, end_logits=end_logits)) + # Compute predictions output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix)) output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix)) - all_predictions = write_predictions(examples, features, all_results, - args.n_best_size, 
args.max_answer_length, - args.do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, - args.verbose_logging, args.version_2_with_negative, - args.null_score_diff_threshold) + write_predictions(examples, features, all_results, args.n_best_size, args.max_answer_length, + args.do_lower_case, output_prediction_file, output_nbest_file, + output_null_log_odds_file, args.verbose_logging, + args.version_2_with_negative, args.null_score_diff_threshold) + # Evaluate with the official SQuAD script evaluate_options = EVAL_OPTS(data_file=args.predict_file, pred_file=output_prediction_file, na_prob_file=output_null_log_odds_file) @@ -432,7 +438,7 @@ def main(): logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() + # Save the trained model and the tokenizer if args.local_rank == -1 or torch.distributed.get_rank() == 0: # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: @@ -454,22 +460,30 @@ def main(): model.to(args.device) - # Evaluation + # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory results = {} if args.do_eval and args.local_rank in [-1, 0]: checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) - logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging + logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs + logger.info("Evaluate the following checkpoints: %s", checkpoints) + for checkpoint in checkpoints: + # Reload the model global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint) model.to(args.device) + + # Evaluate result = 
evaluate(args, model, tokenizer, prefix=global_step) + result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items()) results.update(result) + logger.info("Results: {}".format(results)) + return results diff --git a/examples/run_openai_gpt.py b/examples/single_model_scripts/run_openai_gpt.py similarity index 98% rename from examples/run_openai_gpt.py rename to examples/single_model_scripts/run_openai_gpt.py index 02b86b3a22..b2e85271cb 100644 --- a/examples/run_openai_gpt.py +++ b/examples/single_model_scripts/run_openai_gpt.py @@ -40,7 +40,7 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, - OpenAIAdam, cached_path, WEIGHTS_NAME, CONFIG_NAME) + AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME) ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz" @@ -191,7 +191,7 @@ def main(): {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs - optimizer = OpenAIAdam(optimizer_grouped_parameters, + optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, max_grad_norm=args.max_grad_norm, diff --git a/examples/run_swag.py b/examples/single_model_scripts/run_swag.py similarity index 99% rename from examples/run_swag.py rename to examples/single_model_scripts/run_swag.py index 00cd3a7840..fdda56e40b 100644 --- a/examples/run_swag.py +++ b/examples/single_model_scripts/run_swag.py @@ -34,7 +34,7 @@ from tqdm import tqdm, trange from pytorch_transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME from pytorch_transformers.modeling_bert import BertForMultipleChoice, BertConfig -from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule +from pytorch_transformers.optimization 
import AdamW, WarmupLinearSchedule from pytorch_transformers.tokenization_bert import BertTokenizer logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', diff --git a/examples/run_transfo_xl.py b/examples/single_model_scripts/run_transfo_xl.py similarity index 100% rename from examples/run_transfo_xl.py rename to examples/single_model_scripts/run_transfo_xl.py diff --git a/examples/test_examples.py b/examples/test_examples.py index 989ec367ee..a07c0ea31b 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -91,7 +91,6 @@ class ExamplesTests(unittest.TestCase): self.assertGreaterEqual(result['f1'], 30) self.assertGreaterEqual(result['exact'], 30) - def test_generation(self): stream_handler = logging.StreamHandler(sys.stdout) logger.addHandler(stream_handler) diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py index 8c75925a07..a8239038a7 100644 --- a/pytorch_transformers/modeling_bert.py +++ b/pytorch_transformers/modeling_bert.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch BERT model.""" +"""PyTorch BERT model. """ from __future__ import absolute_import, division, print_function, unicode_literals @@ -28,7 +28,8 @@ import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss -from .modeling_utils import WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel, prune_linear_layer +from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel, + prune_linear_layer, add_start_docstrings) logger = logging.getLogger(__name__) @@ -66,7 +67,7 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { def load_tf_weights_in_bert(model, config, tf_checkpoint_path): - """ Load tf checkpoints in a pytorch model + """ Load tf checkpoints in a pytorch model. 
""" try: import re @@ -583,25 +584,84 @@ class BertPreTrainedModel(PreTrainedModel): module.bias.data.zero_() +BERT_START_DOCSTRING = r""" The BERT model was proposed in + `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ + by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer + pre-trained using a combination of masked language modeling objective and next sentence prediction + on a large corpus comprising the Toronto Book Corpus and Wikipedia. + + This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`: + https://arxiv.org/abs/1810.04805 + + .. _`torch.nn.Module`: + https://pytorch.org/docs/stable/nn.html#module + + Parameters: + config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. +""" + +BERT_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows: + + (a) For sequence pairs: + + ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` + + (b) For single sequences: + + ``tokens: [CLS] the dog is hairy . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0`` + + Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`. + See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and + :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. 
+ **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Segment token indices to indicate first and second portions of the inputs. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). + **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask indices selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask indices selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) class BertModel(BertPreTrainedModel): - r"""BERT model ("Bidirectional Embedding Representations from a Transformer"). + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the last layer of the model. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. - :class:`~pytorch_transformers.BertModel` is the basic BERT Transformer model with a layer of summed token, \ - position and sequence embeddings followed by a series of identical self-attention blocks (12 for BERT-base, 24 \ - for BERT-large). The model is instantiated with the following parameters. + Examples:: - Arguments: - config: a BertConfig class instance with the configuration to build a new model - output_attentions: If True, also output attentions weights computed by the model at each layer. Default: False - output_hidden_states: If True, also output hidden states computed by the model at each layer. Default: Fals - - - Example:: - - config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = modeling.BertModel(config=config) + >>> config = BertConfig.from_pretrained('bert-base-uncased') + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> model = BertModel(config) + >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + >>> outputs = model(input_ids) + >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ def __init__(self, config): @@ -628,58 +688,6 @@ class BertModel(BertPreTrainedModel): self.encoder.layer[layer].attention.prune_heads(heads) def forward(self, input_ids, token_type_ids=None, attention_mask=None, head_mask=None): - """ - Performs a model forward pass. 
**Can be called by calling the class directly, once it has been instantiated.** - - - Arguments: - input_ids: a ``torch.LongTensor`` of shape [batch_size, sequence_length] with the word token indices in the \ - vocabulary(see the tokens pre-processing logic in the scripts `run_bert_extract_features.py`, \ - `run_bert_classifier.py` and `run_bert_squad.py`) - token_type_ids: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token \ - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to \ - a `sentence B` token (see BERT paper for more details). - attention_mask: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices \ - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max \ - input sequence length in the current batch. It's the mask that we typically use for attention when \ - a batch has varying length sentences. - output_all_encoded_layers: boolean which controls the content of the `encoded_layers` output as described \ - below. Default: `True`. - head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 \ - and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 \ - => head is not masked. - - - Returns: - A tuple composed of (encoded_layers, pooled_output). Encoded layers are controlled by the \ - ``output_all_encoded_layers`` argument. - - If ``output_all_encoded_layers`` is set to True, outputs a list of the full sequences of \ - encoded-hidden-states at the end of each attention \ - block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each encoded-hidden-state is a\ - ``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size]. 
- - If set to False, outputs only the full sequence of hidden-states corresponding \ - to the last attention block of shape [batch_size, sequence_length, hidden_size]. - - ``pooled_output`` is a ``torch.FloatTensor`` of size [batch_size, hidden_size] which is the output of a \ - classifier pretrained on top of the hidden state associated to the first character of the \ - input (`CLS`) to train on the Next-Sentence task (see BERT's paper). - - Example:: - - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - - all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) - # or - all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask) - - - """ if attention_mask is None: attention_mask = torch.ones_like(input_ids) if token_type_ids is None: @@ -726,25 +734,47 @@ class BertModel(BertPreTrainedModel): return outputs # sequence_output, pooled_output, (hidden_states), (attentions) +@add_start_docstrings("""Bert Model transformer BERT model with two heads on top as done during the pre-training: + a `masked language modeling` head and a `next sentence prediction (classification)` head. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) class BertForPreTraining(BertPreTrainedModel): - """BERT model with pre-training heads. - This module comprises the BERT model followed by the two pre-training heads: + r""" + **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the masked language modeling loss. 
+ Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) + Indices should be in ``[0, 1]``. + ``0`` indicates sequence B is a continuation of sequence A, + ``1`` indicates sequence B is a random sequence. - - the masked language modeling head, and + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when both ``masked_lm_labels`` and ``next_sentence_label`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, 2)`` + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. - - the next sentence classification head. + Examples:: - Args: - `config`: a BertConfig class instance with the configuration to build a new model - `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False - `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False + >>> config = BertConfig.from_pretrained('bert-base-uncased') + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> + >>> model = BertForPreTraining(config) + >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + >>> outputs = model(input_ids) + >>> prediction_scores, seq_relationship_scores = outputs[:2] - Example :: - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForPreTraining(config) """ def __init__(self, config): super(BertForPreTraining, self).__init__(config) @@ -764,58 +794,6 @@ class BertForPreTraining(BertPreTrainedModel): def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None, head_mask=None): - """ - Performs a model forward pass. 
**Can be called by calling the class directly, once it has been instantiated.** - - Args: - `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`) - `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `masked_lm_labels`: optional masked language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length] - with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss - is only computed for the labels set in [0, ..., vocab_size] - `next_sentence_label`: optional next sentence classification loss: ``torch.LongTensor`` of shape [batch_size] - with indices selected in [0, 1]. - 0 => next sentence is the continuation, 1 => next sentence is a random sentence. - `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. - It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. - - - Returns: - Either a ``torch.Tensor`` or ``tuple(torch.Tensor, torch.Tensor)``. 
- - if ``masked_lm_labels`` and ``next_sentence_label`` are not ``None``, outputs the total_loss which is the \ - sum of the masked language modeling loss and the next \ - sentence classification loss. - - if ``masked_lm_labels`` or ``next_sentence_label`` is ``None``, outputs a tuple made of: - - - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size] - - - the next sentence classification logits of shape [batch_size, 2]. - - Example :: - - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForPreTraining(config) - masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) - # or - masked_lm_logits_scores, seq_relationship_logits = model.forward(input_ids, token_type_ids, input_mask) - """ outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask) sequence_output, pooled_output = outputs[:2] @@ -833,21 +811,39 @@ class BertForPreTraining(BertPreTrainedModel): return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) +@add_start_docstrings("""Bert Model transformer BERT model with a `language modeling` head on top. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) class BertForMaskedLM(BertPreTrainedModel): - """BERT model with the masked language modeling head. - This module comprises the BERT model followed by the masked language modeling head. + r""" + **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the masked language modeling loss. 
+ Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` - Args: - `config`: a BertConfig class instance with the configuration to build a new model - `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False - `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Masked language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- Example:: + Examples:: - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + >>> config = BertConfig.from_pretrained('bert-base-uncased') + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> + >>> model = BertForMaskedLM(config) + >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + >>> outputs = model(input_ids, masked_lm_labels=input_ids) + >>> loss, prediction_scores = outputs[:2] - model = BertForMaskedLM(config) """ def __init__(self, config): super(BertForMaskedLM, self).__init__(config) @@ -866,45 +862,6 @@ class BertForMaskedLM(BertPreTrainedModel): self.bert.embeddings.word_embeddings) def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None): - """ - Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.** - - Args: - `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`) - `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. 
- `masked_lm_labels`: masked language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length] - with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss - is only computed for the labels set in [0, ..., vocab_size] - `head_mask`: an optional ``torch.LongTensor`` of shape [num_heads] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. - It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. - - Returns: - Masked language modeling loss if ``masked_lm_labels`` is specified, masked language modeling - logits of shape [batch_size, sequence_length, vocab_size] otherwise. - - Example:: - - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) - # or - masked_lm_logits_scores = model.forward(input_ids, token_type_ids, input_mask) - """ outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask) sequence_output = outputs[0] @@ -919,21 +876,39 @@ class BertForMaskedLM(BertPreTrainedModel): return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) +@add_start_docstrings("""Bert Model transformer BERT model with a `next sentence prediction (classification)` head on top. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) class BertForNextSentencePrediction(BertPreTrainedModel): - """BERT model with next sentence prediction head. 
- This module comprises the BERT model followed by the next sentence classification head. + r""" + **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) + Indices should be in ``[0, 1]``. + ``0`` indicates sequence B is a continuation of sequence A, + ``1`` indicates sequence B is a random sequence. - Args: - `config`: a BertConfig class instance with the configuration to build a new model - `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False - `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``next_sentence_label`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Next sequence prediction (classification) loss. + **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, 2)`` + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- Example:: + Examples:: - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + >>> config = BertConfig.from_pretrained('bert-base-uncased') + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> + >>> model = BertForNextSentencePrediction(config) + >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + >>> outputs = model(input_ids) + >>> seq_relationship_scores = outputs[0] - model = BertForNextSentencePrediction(config) """ def __init__(self, config): super(BertForNextSentencePrediction, self).__init__(config) @@ -944,44 +919,6 @@ class BertForNextSentencePrediction(BertPreTrainedModel): self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None): - """ - Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.** - - Args: - `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts - `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`) - `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. 
- `next_sentence_label`: next sentence classification loss: ``torch.LongTensor`` of shape [batch_size] - with indices selected in [0, 1]. - 0 => next sentence is the continuation, 1 => next sentence is a random sentence. - `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between - 0 and 1.It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, - 0.0 => head is not masked. - - Returns: - If ``next_sentence_label`` is specified, outputs the total_loss which is the sum of the masked language - modeling loss and the next sentence classification loss. If ``next_sentence_label`` is ``None``, outputs - the next sentence classification logits of shape [batch_size, 2]. - - - Example:: - - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - seq_relationship_logits = model(input_ids, token_type_ids, input_mask) - # or - seq_relationship_logits = model.forward(input_ids, token_type_ids, input_mask) - """ outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask) pooled_output = outputs[1] @@ -996,25 +933,41 @@ class BertForNextSentencePrediction(BertPreTrainedModel): return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) +@add_start_docstrings("""Bert Model transformer BERT model with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) class BertForSequenceClassification(BertPreTrainedModel): - """BERT model for classification. - This module is composed of the BERT model with a linear layer on top of - the pooled output. 
+ r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels]``. + If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). - Params: - `config`: a BertConfig class instance with the configuration to build a new model - `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False - `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False - `num_labels`: the number of classes for the classifier. Default = 2. + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification (or regression if config.num_labels==1) loss. + **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- Example:: + Examples:: - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + >>> config = BertConfig.from_pretrained('bert-base-uncased') + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> + >>> model = BertForSequenceClassification(config) + >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + >>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 + >>> outputs = model(input_ids, labels=labels) + >>> loss, logits = outputs[:2] - num_labels = 2 - - model = BertForSequenceClassification(config, num_labels) """ def __init__(self, config): super(BertForSequenceClassification, self).__init__(config) @@ -1027,40 +980,6 @@ class BertForSequenceClassification(BertPreTrainedModel): self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): - """ - Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.** - - Parameters: - `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] - with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts - `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`) - `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. 
It's the mask that we typically use for attention when - a batch has varying length sentences. - `labels`: labels for the classification output: ``torch.LongTensor`` of shape [batch_size] - with indices selected in [0, ..., num_labels]. - `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. - It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. - - Returns: - If ``labels`` is not ``None``, outputs the CrossEntropy classification loss of the output with the labels. - If ``labels`` is ``None``, outputs the classification logits of shape [batch_size, num_labels]. - - Example:: - - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - logits = model(input_ids, token_type_ids, input_mask) - # or - logits = model.forward(input_ids, token_type_ids, input_mask) - """ outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask) pooled_output = outputs[1] @@ -1082,26 +1001,78 @@ class BertForSequenceClassification(BertPreTrainedModel): return outputs # (loss), logits, (hidden_states), (attentions) +@add_start_docstrings("""Bert Model transformer BERT model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + BERT_START_DOCSTRING) class BertForMultipleChoice(BertPreTrainedModel): - """BERT model for multiple choice tasks. - This module is composed of the BERT model with a linear layer on top of the pooled output. + r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: + Indices of input sequence tokens in the vocabulary. 
+ The second dimension of the input (`num_choices`) indicates the number of choices to score. + To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows: - Parameters: - `config`: a BertConfig class instance with the configuration to build a new model - `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False - `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False + (a) For sequence pairs: - Example:: + ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) - input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) - token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + (b) For single sequences: + + ``tokens: [CLS] the dog is hairy . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0`` + + Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`. + See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and + :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: + Segment token indices to indicate first and second portions of the inputs. + The second dimension of the input (`num_choices`) indicates the number of choices to score. 
+ Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). + **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``: + Mask to avoid performing attention on padding token indices. + The second dimension of the input (`num_choices`) indicates the number of choices to score. + Mask indices selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask indices selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification loss. + **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above). + Classification scores (before SoftMax). + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + + Examples:: + + >>> config = BertConfig.from_pretrained('bert-base-uncased') + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> + >>> model = BertForMultipleChoice(config) + >>> choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] + >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices + >>> labels = torch.tensor(1).unsqueeze(0) # Batch size 1 + >>> outputs = model(input_ids, labels=labels) + >>> loss, classification_scores = outputs[:2] - model = BertForMultipleChoice(config) - logits = model(input_ids, token_type_ids, input_mask) """ def __init__(self, config): super(BertForMultipleChoice, self).__init__(config) @@ -1113,42 +1084,6 @@ class BertForMultipleChoice(BertPreTrainedModel): self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): - """ - Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.** - - Parameters: - `input_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`) - `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length] - with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` - and type 1 corresponds to a `sentence B` token (see BERT paper for more details). 
- `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `labels`: labels for the classification output: ``torch.LongTensor`` of shape [batch_size] - with indices selected in [0, ..., num_choices]. - `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. - It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. - - Returns: - If ``labels`` is not ``None``, outputs the CrossEntropy classification loss of the output with the labels. - If ``labels`` is ``None``, outputs the classification logits of shape [batch_size, num_labels]. - - Example:: - - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) - input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) - token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForMultipleChoice(config) - logits = model(input_ids, token_type_ids, input_mask) - """ - """ Input shapes should be [bsz, num choices, seq length] """ num_choices = input_ids.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) @@ -1171,25 +1106,39 @@ class BertForMultipleChoice(BertPreTrainedModel): return outputs # (loss), reshaped_logits, (hidden_states), (attentions) +@add_start_docstrings("""Bert Model transformer BERT model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. 
for Named-Entity-Recognition (NER) tasks. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) class BertForTokenClassification(BertPreTrainedModel): - """BERT model for token-level classification. - This module is composed of the BERT model with a linear layer on top of - the full hidden state of the last layer. + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels]``. - Parameters: - `config`: a BertConfig class instance with the configuration to build a new model - `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False - `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False - `num_labels`: the number of classes for the classifier. Default = 2. + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification loss. + **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)`` + Classification scores (before SoftMax). + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- Example:: + Examples:: - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + >>> config = BertConfig.from_pretrained('bert-base-uncased') + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> + >>> model = BertForTokenClassification(config) + >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + >>> labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 + >>> outputs = model(input_ids, labels=labels) + >>> loss, scores = outputs[:2] - num_labels = 2 - - model = BertForTokenClassification(config, num_labels) """ def __init__(self, config): super(BertForTokenClassification, self).__init__(config) @@ -1202,40 +1151,6 @@ class BertForTokenClassification(BertPreTrainedModel): self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): - """ - Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.** - - Parameters: - `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts - `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`) - `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. 
It's the mask that we typically use for attention when - a batch has varying length sentences. - `labels`: labels for the classification output: ``torch.LongTensor`` of shape [batch_size, sequence_length] - with indices selected in [0, ..., num_labels]. - `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. - It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. - - Returns: - If ``labels`` is not ``None``, outputs the CrossEntropy classification loss of the output with the labels. - If ``labels`` is ``None``, outputs the classification logits of shape [batch_size, sequence_length, num_labels]. - - Example:: - - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - logits = model(input_ids, token_type_ids, input_mask) - # or - logits = model.forward(input_ids, token_type_ids, input_mask) - """ outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask) sequence_output = outputs[0] @@ -1255,25 +1170,50 @@ class BertForTokenClassification(BertPreTrainedModel): loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs - return outputs # (loss), logits, (hidden_states), (attentions) + return outputs # (loss), scores, (hidden_states), (attentions) +@add_start_docstrings("""Bert Model transformer BERT model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) class BertForQuestionAnswering(BertPreTrainedModel): - """BERT model for Question Answering (span extraction). 
- This module is composed of the BERT model with a linear layer on top of - the sequence output that computes start_logits and end_logits + r""" + **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. - Parameters: - `config`: a BertConfig class instance with the configuration to build a new model - `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False - `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). 
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. - Example:: + Examples:: - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + >>> config = BertConfig.from_pretrained('bert-base-uncased') + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> + >>> model = BertForQuestionAnswering(config) + >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + >>> start_positions = torch.tensor([1]) + >>> end_positions = torch.tensor([3]) + >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) + >>> loss, start_scores, end_scores = outputs[:3] - model = BertForQuestionAnswering(config) """ def __init__(self, config): super(BertForQuestionAnswering, self).__init__(config) @@ -1286,44 +1226,6 @@ class BertForQuestionAnswering(BertPreTrainedModel): def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None, head_mask=None): - """ - Performs a model forward pass. 
**Can be called by calling the class directly, once it has been instantiated.** - - Parameters: - `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`) - `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `start_positions`: position of the first token for the labeled span: ``torch.LongTensor`` of shape [batch_size]. - Positions are clamped to the length of the sequence and position outside of the sequence are not taken - into account for computing the loss. - `end_positions`: position of the last token for the labeled span: ``torch.LongTensor`` of shape [batch_size]. - Positions are clamped to the length of the sequence and position outside of the sequence are not taken - into account for computing the loss. - `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. - It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. - - Returns: - If ``start_positions`` and ``end_positions`` are not ``None``, outputs the total_loss which is the sum of the - CrossEntropy loss for the start and end token positions. 
- If ``start_positions`` or ``end_positions`` is ``None``, outputs a tuple of start_logits, end_logits which are the - logits respectively for the start and end position tokens of shape [batch_size, sequence_length]. - - Example:: - - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - start_logits, end_logits = model(input_ids, token_type_ids, input_mask) - """ outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask) sequence_output = outputs[0] diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index bb2b82b41c..8971af306e 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -36,6 +36,13 @@ WEIGHTS_NAME = "pytorch_model.bin" TF_WEIGHTS_NAME = 'model.ckpt' +def add_start_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = ''.join(docstr) + fn.__doc__ + return fn + return docstring_decorator + + class PretrainedConfig(object): """ An abstract class to handle dowloading a model pretrained config. """