updating examples

This commit is contained in:
thomwolf
2019-06-19 13:23:20 +02:00
parent 68ab9599ce
commit dc8e0019b7
5 changed files with 212 additions and 42 deletions

View File

@@ -1288,6 +1288,29 @@ Training with these hyper-parameters gave us the following results:
loss = 0.07231863956341798 loss = 0.07231863956341798
``` ```
Here is an example on MNLI:
```bash
python -m torch.distributed.launch --nproc_per_node 8 run_classifier.py --bert_model bert-large-uncased-whole-word-masking --task_name mnli --do_train --do_eval --do_lower_case --data_dir /datadrive/bert_data/glue_data//MNLI/ --max_seq_length 128 --train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 3.0 --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
```
```bash
***** Eval results *****
acc = 0.8679706601466992
eval_loss = 0.4911287787382479
global_step = 18408
loss = 0.04755385363816904
***** Eval results *****
acc = 0.8747965825874695
eval_loss = 0.45516540421714036
global_step = 18408
loss = 0.04755385363816904
```
This is the example of the `bert-large-uncased-whole-word-masking-finetuned-mnli` model
#### SQuAD #### SQuAD
This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB. This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB.

View File

@@ -1,32 +1,108 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os
import argparse import argparse
import logging import logging
from tqdm import trange from tqdm import tqdm
import numpy as np
import torch import torch
import torch.nn.functional as F from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subset
import numpy as np from torch.utils.data.distributed import DistributedSampler
from torch.nn import CrossEntropyLoss, MSELoss
from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', from run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def entropy(p):
plogp = p * torch.log(p)
plogp[p == 0] = 0
return -plogp.sum(dim=-1)
def print_1d_tensor(tensor, prefix=""):
if tensor.dtype != torch.long:
logger.info(prefix + "\t".join(f"{x:.5f}" for x in tensor.cpu().data))
else:
logger.info(prefix + "\t".join(f"{x:d}" for x in tensor.cpu().data))
def print_2d_tensor(tensor):
logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor))))
for row in range(len(tensor)):
print_1d_tensor(tensor[row], prefix=f"layer {row + 1}:\t")
def compute_heads_importance(args, model, eval_dataloader):
""" Example on how to use model outputs to compute:
- head attention entropy (activated by setting output_attentions=True when we created the model
- head importance scores according to http://arxiv.org/abs/1905.10650
(activated by setting keep_multihead_output=True when we created the model)
"""
for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
batch = tuple(t.to(args.device) for t in batch)
input_ids, input_mask, segment_ids, label_ids = batch
# Do a forward pass
all_attentions, logits = model(input_ids, segment_ids, input_mask)
# Update head attention entropy
for layer, attn in enumerate(all_attentions):
masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1)
attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach()
# Update head importance scores with regards to our loss
# First backpropagate to populate the gradients
if output_mode == "classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
elif output_mode == "regression":
loss_fct = MSELoss()
loss = loss_fct(logits.view(-1), label_ids.view(-1))
loss.backward()
# Second compute importance scores according to http://arxiv.org/abs/1905.10650
multihead_outputs = model.bert.get_multihead_outputs()
for layer, mh_layer_output in enumerate(multihead_outputs):
dot = torch.einsum("bhli,bhli->bhl", [mh_layer_output.grad, mh_layer_output])
head_importance[layer] += dot.abs().sum(-1).sum(0).detach()
tot_tokens += input_mask.float().detach().sum().data
# Normalize
attn_entropy /= tot_tokens
head_importance /= tot_tokens
if args.normalize_importance:
head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())
return attn_entropy, head_importance
def run_model(): def run_model():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--model_name_or_path', type=str, default='bert-base-uncased', help='pretrained model name or path to local checkpoint') parser.add_argument('--model_name_or_path', type=str, default='bert-base-cased-finetuned-mrpc', help='pretrained model name or path to local checkpoint')
parser.add_argument("--task_name", type=str, default='mrpc', help="The name of the task to train.")
parser.add_argument("--data_dir", type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
parser.add_argument("--output_dir", type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.")
parser.add_argument("--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances.")
parser.add_argument("--overwrite_output_dir", action='store_true', help="Whether to overwrite data in output directory")
parser.add_argument("--normalize_importance", action='store_true', help="Whether to normalize importance score between 0 and 1")
parser.add_argument("--try_pruning", action='store_true', help="Whether to try to prune head until a threshold of accuracy.")
parser.add_argument("--pruning_threshold", default=0.9, type=float, help="Pruning threshold of accuracy.")
parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")
parser.add_argument("--seed", type=int, default=42) parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
args = parser.parse_args() args = parser.parse_args()
np.random.seed(args.seed) # Setup devices and distributed training
torch.random.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
if args.local_rank == -1 or args.no_cuda: if args.local_rank == -1 or args.no_cuda:
args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count() n_gpu = torch.cuda.device_count()
@@ -34,21 +110,107 @@ def run_model():
torch.cuda.set_device(args.local_rank) torch.cuda.set_device(args.local_rank)
args.device = torch.device("cuda", args.local_rank) args.device = torch.device("cuda", args.local_rank)
n_gpu = 1 n_gpu = 1
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') # Initializes the distributed backend
torch.distributed.init_process_group(backend='nccl')
# Setup logging
logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, n_gpu, bool(args.local_rank != -1)))
args.device, n_gpu, bool(args.local_rank != -1), args.fp16))
# Set seeds
np.random.seed(args.seed)
torch.random.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed(args.seed)
# Prepare GLUE task
task_name = args.task_name.lower()
processor = processors[task_name]()
output_mode = output_modes[task_name]
label_list = processor.get_labels()
num_labels = len(label_list)
# Prepare output directory
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir:
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
os.makedirs(args.output_dir)
# Load model & tokenizer
if args.local_rank not in [-1, 0]:
torch.distributed.barrier() # Make sure only one distributed process download model & vocab
tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
model = BertForSequenceClassification.from_pretrained(args.model_name_or_path)
# Load a model with all BERTology options on:
# output_attentions => will output attention weights
# keep_multihead_output => will store gradient of attention head outputs for head importance computation
# see: http://arxiv.org/abs/1905.10650
model = BertForSequenceClassification.from_pretrained(args.model_name_or_path,
num_labels=num_labels,
output_attentions=True,
keep_multihead_output=True)
if args.local_rank == 0:
torch.distributed.barrier() # Make sure only one distributed process download model & vocab
model.to(args.device) model.to(args.device)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)
model.eval() model.eval()
# Prepare dataset for the GLUE task
eval_examples = processor.get_dev_examples(args.data_dir)
cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
list(filter(None, args.model_name_or_path.split('/'))).pop(), str(args.max_seq_length), str(task_name)))
try:
eval_features = torch.load(cached_eval_features_file)
except:
eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
if args.local_rank in [-1, 0]:
logger.info("Saving eval features to cache file %s", cached_eval_features_file)
torch.save(eval_features, cached_eval_features_file)
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long if output_mode == "classification" else torch.float)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
if args.data_subset > 0:
eval_data = Subset(eval_data, list(range(args.data_subset)))
eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
# Print/save training arguments
print(args)
torch.save(args, os.path.join(args.output_dir, 'run_args.bin'))
# To showcase some BERTology methods, we will compute:
# - the average entropy of each head over the dev set
# - the importance score of each head over the dev set as explained in http://arxiv.org/abs/1905.10650
n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads
head_importance = torch.zeros(n_layers, n_heads).to(args.device)
attn_entropy = torch.zeros(n_layers, n_heads).to(args.device)
tot_tokens = 0.0
# Compute head entropy and importance score
attn_entropy, head_importance = compute_heads_importance(args, model, eval_dataloader)
# Print/save matrices
np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy)
np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance)
logger.info("Attention entropies")
print_2d_tensor(attn_entropy)
logger.info("Head importance scores")
print_2d_tensor(head_importance)
logger.info("Head ranked by importance scores")
head_ranks = torch.zeros(n_layers * n_heads, dtype=torch.long, device=args.device)
head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel())
print_2d_tensor(head_ranks.view_as(head_importance))
# Do pruning if we want to
if args.try_pruning and args.pruning_threshold > 0.0 and args.pruning_threshold < 1.0:
if __name__ == '__main__': if __name__ == '__main__':
run_model() run_model()

View File

@@ -366,7 +366,7 @@ def main():
output_args_file = os.path.join(args.output_dir, 'training_args.bin') output_args_file = os.path.join(args.output_dir, 'training_args.bin')
torch.save(args, output_args_file) torch.save(args, output_args_file)
else: else:
model = BertForSequenceClassification.from_pretrained(args.bert_model) model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
model.to(device) model.to(device)

View File

@@ -707,36 +707,15 @@ class BertPreTrainedModel(nn.Module):
archive_file, resolved_archive_file)) archive_file, resolved_archive_file))
logger.info("loading configuration file {} from cache at {}".format( logger.info("loading configuration file {} from cache at {}".format(
config_file, resolved_config_file)) config_file, resolved_config_file))
### Switching to split config/weight files configuration
# tempdir = None
# if os.path.isdir(resolved_archive_file) or from_tf:
# serialization_dir = resolved_archive_file
# else:
# # Extract archive to temp dir
# tempdir = tempfile.mkdtemp()
# logger.info("extracting archive file {} to temp dir {}".format(
# resolved_archive_file, tempdir))
# with tarfile.open(resolved_archive_file, 'r:gz') as archive:
# archive.extractall(tempdir)
# serialization_dir = tempdir
# config_file = os.path.join(serialization_dir, CONFIG_NAME)
# if not os.path.exists(config_file):
# # Backward compatibility with old naming format
# config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME)
# Load config # Load config
config = BertConfig.from_json_file(resolved_config_file) config = BertConfig.from_json_file(resolved_config_file)
logger.info("Model config {}".format(config)) logger.info("Model config {}".format(config))
# Instantiate model. # Instantiate model.
model = cls(config, *inputs, **kwargs) model = cls(config, *inputs, **kwargs)
if state_dict is None and not from_tf: if state_dict is None and not from_tf:
# weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
state_dict = torch.load(resolved_archive_file, map_location='cpu') state_dict = torch.load(resolved_archive_file, map_location='cpu')
# if tempdir:
# # Clean up temp dir
# shutil.rmtree(tempdir)
if from_tf: if from_tf:
# Directly load from a TensorFlow checkpoint # Directly load from a TensorFlow checkpoint
# weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
return load_tf_weights_in_bert(model, weights_path) return load_tf_weights_in_bert(model, weights_path)
# Load from a PyTorch state_dict # Load from a PyTorch state_dict
old_keys = [] old_keys = []

View File

@@ -37,6 +37,9 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
} }
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'bert-base-uncased': 512, 'bert-base-uncased': 512,
@@ -49,6 +52,9 @@ PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'bert-base-german-cased': 512, 'bert-base-german-cased': 512,
'bert-large-uncased-whole-word-masking': 512, 'bert-large-uncased-whole-word-masking': 512,
'bert-large-cased-whole-word-masking': 512, 'bert-large-cased-whole-word-masking': 512,
'bert-large-uncased-whole-word-masking-finetuned-squad': 512,
'bert-large-cased-whole-word-masking-finetuned-squad': 512,
'bert-base-cased-finetuned-mrpc': 512,
} }
VOCAB_NAME = 'vocab.txt' VOCAB_NAME = 'vocab.txt'