diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 1638b02a6f..cf27ef6cc6 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -315,8 +315,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, - args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py index e6c3598a9f..6cb8954465 100644 --- a/examples/lm_finetuning/pregenerate_training_data.py +++ b/examples/lm_finetuning/pregenerate_training_data.py @@ -4,11 +4,11 @@ from tqdm import tqdm, trange from tempfile import TemporaryDirectory import shelve -from random import random, randrange, randint, shuffle, choice, sample +from random import random, randrange, randint, shuffle, choice from pytorch_pretrained_bert.tokenization import BertTokenizer import numpy as np import json - +import collections class DocumentDatabase: def __init__(self, reduce_memory=False): @@ -98,42 +98,77 @@ def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens): else: trunc_tokens.pop() +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) -def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, vocab_list): +def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list): """Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but with several refactors to clean it up and remove a lot of unnecessary variables.""" cand_indices = [] for (i, token) in enumerate(tokens): if token == "[CLS]" or token == "[SEP]": continue - cand_indices.append(i) + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. + if (whole_word_mask and len(cand_indices) >= 1 and token.startswith("##")): + cand_indices[-1].append(i) + else: + cand_indices.append([i]) num_to_mask = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))) shuffle(cand_indices) - mask_indices = sorted(sample(cand_indices, num_to_mask)) - masked_token_labels = [] - for index in mask_indices: - # 80% of the time, replace with [MASK] - if random() < 0.8: - masked_token = "[MASK]" - else: - # 10% of the time, keep original - if random() < 0.5: - masked_token = tokens[index] - # 10% of the time, replace with random word + masked_lms = [] + covered_indexes = set() + for index_set in cand_indices: + if len(masked_lms) >= num_to_mask: + break + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. + if len(masked_lms) + len(index_set) > num_to_mask: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if random() < 0.8: + masked_token = "[MASK]" else: - masked_token = choice(vocab_list) - masked_token_labels.append(tokens[index]) - # Once we've saved the true label for that token, we can overwrite it with the masked version - tokens[index] = masked_token + # 10% of the time, keep original + if random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = choice(vocab_list) + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + tokens[index] = masked_token + + assert len(masked_lms) <= num_to_mask + masked_lms = sorted(masked_lms, key=lambda x: x.index) + mask_indices = [p.index for p in masked_lms] + masked_token_labels = [p.label for p in masked_lms] return tokens, mask_indices, masked_token_labels def create_instances_from_document( doc_database, doc_idx, max_seq_length, short_seq_prob, - masked_lm_prob, max_predictions_per_seq, vocab_list): + masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list): """This code is mostly a duplicate of the equivalent function from Google BERT's repo. However, we make some changes and improvements. Sampling is improved and no longer requires a loop in this function. Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence @@ -213,7 +248,7 @@ def create_instances_from_document( segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)] tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions( - tokens, masked_lm_prob, max_predictions_per_seq, vocab_list) + tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list) instance = { "tokens": tokens, @@ -237,7 +272,8 @@ def main(): choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased", "bert-base-multilingual", "bert-base-chinese"]) parser.add_argument("--do_lower_case", action="store_true") - + parser.add_argument("--do_whole_word_mask", action="store_true", + help="Whether to use whole word masking rather than per-WordPiece masking.") parser.add_argument("--reduce_memory", action="store_true", help="Reduce memory usage for large datasets by keeping data on disc rather than in memory") @@ -284,7 +320,7 @@ def main(): doc_instances = create_instances_from_document( docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob, masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq, - vocab_list=vocab_list) + whole_word_mask=args.do_whole_word_mask, vocab_list=vocab_list) doc_instances = [json.dumps(instance) for instance in doc_instances] for instance in doc_instances: epoch_file.write(instance + '\n') diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index 6511ead590..610912675f 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py +++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -534,36 +534,37 @@ def main(): model = torch.nn.DataParallel(model) # Prepare optimizer - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + if args.do_train: + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) global_step = 0 if args.do_train: @@ -603,8 +604,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, - args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() diff --git a/examples/run_classifier.py b/examples/run_classifier.py index bdcad6f0eb..1ebdf9fd51 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -271,7 +271,7 @@ class StsbProcessor(DataProcessor): class QqpProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" + """Processor for the QQP data set (GLUE version).""" def get_train_examples(self, data_dir): """See base class.""" @@ -306,7 +306,7 @@ class QqpProcessor(DataProcessor): class QnliProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" + """Processor for the QNLI data set (GLUE version).""" def get_train_examples(self, data_dir): """See base class.""" @@ -763,35 +763,36 @@ def main(): model = torch.nn.DataParallel(model) # Prepare optimizer - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + if args.do_train: + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 @@ -854,8 +855,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, - args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() @@ -939,7 +939,7 @@ def main(): elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids.numpy()) - loss = tr_loss/nb_tr_steps if args.do_train else None + loss = tr_loss/global_step if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step @@ -1007,7 +1007,7 @@ def main(): preds = preds[0] preds = np.argmax(preds, axis=1) result = compute_metrics(task_name, preds, all_label_ids.numpy()) - loss = tr_loss/nb_tr_steps if args.do_train else None + loss = tr_loss/global_step if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py index cb5aa8d9cb..f0a14f7e87 100644 --- a/examples/run_openai_gpt.py +++ b/examples/run_openai_gpt.py @@ -83,8 +83,8 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d input_ids[i, 1, :len(with_cont2)] = with_cont2 mc_token_ids[i, 0] = len(with_cont1) - 1 mc_token_ids[i, 1] = len(with_cont2) - 1 - lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:] - lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:] + lm_labels[i, 0, :len(with_cont1)] = with_cont1 + lm_labels[i, 1, :len(with_cont2)] = with_cont2 mc_labels[i] = mc_label all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels) tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs)) @@ -183,19 +183,20 @@ def main(): eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Prepare optimizer - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size - optimizer = OpenAIAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - max_grad_norm=args.max_grad_norm, - weight_decay=args.weight_decay, - t_total=num_train_optimization_steps) + if args.do_train: + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size + optimizer = OpenAIAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + max_grad_norm=args.max_grad_norm, + weight_decay=args.weight_decay, + t_total=num_train_optimization_steps) if args.do_train: nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None diff --git a/examples/run_squad.py b/examples/run_squad.py index c3fdf03774..249aff7f8a 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -922,40 +922,41 @@ def main(): model = torch.nn.DataParallel(model) # Prepare optimizer - param_optimizer = list(model.named_parameters()) + if args.do_train: + param_optimizer = list(model.named_parameters()) - # hack to remove pooler, which is not used - # thus it produce None grad that break apex - param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] + # hack to remove pooler, which is not used + # thus it produce None grad that break apex + param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) global_step = 0 if args.do_train: @@ -1015,8 +1016,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, - args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() diff --git a/examples/run_swag.py b/examples/run_swag.py index bd724c48ad..59bb9866c3 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -385,39 +385,40 @@ def main(): model = torch.nn.DataParallel(model) # Prepare optimizer - param_optimizer = list(model.named_parameters()) + if args.do_train: + param_optimizer = list(model.named_parameters()) - # hack to remove pooler, which is not used - # thus it produce None grad that break apex - param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] + # hack to remove pooler, which is not used + # thus it produce None grad that break apex + param_optimizer = [n for n in param_optimizer] - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) global_step = 0 if args.do_train: @@ -466,8 +467,7 @@ def main(): if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, - args.warmup_proportion) + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() @@ -540,7 +540,7 @@ def main(): result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, - 'loss': tr_loss/nb_tr_steps} + 'loss': tr_loss/global_step} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: diff --git a/hubconf.py b/hubconf.py index 755e181d20..ba09cbab3c 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,187 +1,19 @@ -from pytorch_pretrained_bert.tokenization import BertTokenizer -from pytorch_pretrained_bert.modeling import ( - BertModel, - BertForNextSentencePrediction, - BertForMaskedLM, - BertForMultipleChoice, - BertForPreTraining, - BertForQuestionAnswering, - BertForSequenceClassification, - BertForTokenClassification, - ) - dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex'] -# A lot of models share the same param doc. Use a decorator -# to save typing -bert_docstring = """ - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load - . `bert-base-uncased` - . `bert-large-uncased` - . `bert-base-cased` - . `bert-large-cased` - . `bert-base-multilingual-uncased` - . `bert-base-multilingual-cased` - . `bert-base-chinese` - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining - instance - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . `model.chkpt` a TensorFlow checkpoint - from_tf: should we load the weights from a locally saved TensorFlow - checkpoint - cache_dir: an optional path to a folder in which the pre-trained models - will be cached. - state_dict: an optional state dictionnary - (collections.OrderedDict object) to use instead of Google - pre-trained models - *inputs, **kwargs: additional input for the specific Bert class - (ex: num_labels for BertForSequenceClassification) -""" - - -def _append_from_pretrained_docstring(docstr): - def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + docstr - return fn - return docstring_decorator - - -def bertTokenizer(*args, **kwargs): - """ - Instantiate a BertTokenizer from a pre-trained/customized vocab file - Args: - pretrained_model_name_or_path: Path to pretrained model archive - or one of pre-trained vocab configs below. - * bert-base-uncased - * bert-large-uncased - * bert-base-cased - * bert-large-cased - * bert-base-multilingual-uncased - * bert-base-multilingual-cased - * bert-base-chinese - Keyword args: - cache_dir: an optional path to a specific directory to download and cache - the pre-trained model weights. - Default: None - do_lower_case: Whether to lower case the input. - Only has an effect when do_wordpiece_only=False - Default: True - do_basic_tokenize: Whether to do basic tokenization before wordpiece. - Default: True - max_len: An artificial maximum length to truncate tokenized sequences to; - Effective maximum length is always the minimum of this - value (if specified) and the underlying BERT model's - sequence length. - Default: None - never_split: List of tokens which will never be split during tokenization. - Only has an effect when do_wordpiece_only=False - Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"] - - Example: - >>> sentence = 'Hello, World!' - >>> tokenizer = torch.hub.load('ailzhang/pytorch-pretrained-BERT:hubconf', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) - >>> toks = tokenizer.tokenize(sentence) - ['Hello', '##,', 'World', '##!'] - >>> ids = tokenizer.convert_tokens_to_ids(toks) - [8667, 28136, 1291, 28125] - """ - tokenizer = BertTokenizer.from_pretrained(*args, **kwargs) - return tokenizer - - -@_append_from_pretrained_docstring(bert_docstring) -def bertModel(*args, **kwargs): - """ - BertModel is the basic BERT Transformer model with a layer of summed token, - position and sequence embeddings followed by a series of identical - self-attention blocks (12 for BERT-base, 24 for BERT-large). - """ - model = BertModel.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForNextSentencePrediction(*args, **kwargs): - """ - BERT model with next sentence prediction head. - This module comprises the BERT model followed by the next sentence - classification head. - """ - model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForPreTraining(*args, **kwargs): - """ - BERT model with pre-training heads. - This module comprises the BERT model followed by the two pre-training heads - - the masked language modeling head, and - - the next sentence classification head. - """ - model = BertForPreTraining.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForMaskedLM(*args, **kwargs): - """ - BertForMaskedLM includes the BertModel Transformer followed by the - (possibly) pre-trained masked language modeling head. - """ - model = BertForMaskedLM.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForSequenceClassification(*args, **kwargs): - """ - BertForSequenceClassification is a fine-tuning model that includes - BertModel and a sequence-level (sequence or pair of sequences) classifier - on top of the BertModel. - - The sequence-level classifier is a linear layer that takes as input the - last hidden state of the first character in the input sequence - (see Figures 3a and 3b in the BERT paper). - """ - model = BertForSequenceClassification.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForMultipleChoice(*args, **kwargs): - """ - BertForMultipleChoice is a fine-tuning model that includes BertModel and a - linear layer on top of the BertModel. - """ - model = BertForMultipleChoice.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForQuestionAnswering(*args, **kwargs): - """ - BertForQuestionAnswering is a fine-tuning model that includes BertModel - with a token-level classifiers on top of the full sequence of last hidden - states. - """ - model = BertForQuestionAnswering.from_pretrained(*args, **kwargs) - return model - - -@_append_from_pretrained_docstring(bert_docstring) -def bertForTokenClassification(*args, **kwargs): - """ - BertForTokenClassification is a fine-tuning model that includes BertModel - and a token-level classifier on top of the BertModel. - - The token-level classifier is a linear layer that takes as input the last - hidden state of the sequence. - """ - model = BertForTokenClassification.from_pretrained(*args, **kwargs) - return model +from hubconfs.bert_hubconf import ( + bertTokenizer, + bertModel, + bertForNextSentencePrediction, + bertForPreTraining, + bertForMaskedLM, + bertForSequenceClassification, + bertForMultipleChoice, + bertForQuestionAnswering, + bertForTokenClassification +) +from hubconfs.gpt_hubconf import ( + openAIGPTTokenizer, + openAIGPTModel, + openAIGPTLMHeadModel, + openAIGPTDoubleHeadsModel +) \ No newline at end of file diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py new file mode 100644 index 0000000000..0595bdeccb --- /dev/null +++ b/hubconfs/bert_hubconf.py @@ -0,0 +1,348 @@ +from pytorch_pretrained_bert.tokenization import BertTokenizer +from pytorch_pretrained_bert.modeling import ( + BertModel, + BertForNextSentencePrediction, + BertForMaskedLM, + BertForMultipleChoice, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + ) + +# A lot of models share the same param doc. Use a decorator +# to save typing +bert_docstring = """ + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining + instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow + checkpoint + cache_dir: an optional path to a folder in which the pre-trained models + will be cached. + state_dict: an optional state dictionnary + (collections.OrderedDict object) to use instead of Google + pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) +""" + + +def _append_from_pretrained_docstring(docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + docstr + return fn + return docstring_decorator + + +def bertTokenizer(*args, **kwargs): + """ + Instantiate a BertTokenizer from a pre-trained/customized vocab file + Args: + pretrained_model_name_or_path: Path to pretrained model archive + or one of pre-trained vocab configs below. + * bert-base-uncased + * bert-large-uncased + * bert-base-cased + * bert-large-cased + * bert-base-multilingual-uncased + * bert-base-multilingual-cased + * bert-base-chinese + Keyword args: + cache_dir: an optional path to a specific directory to download and cache + the pre-trained model weights. + Default: None + do_lower_case: Whether to lower case the input. + Only has an effect when do_wordpiece_only=False + Default: True + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + Default: True + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + Default: None + never_split: List of tokens which will never be split during tokenization. + Only has an effect when do_wordpiece_only=False + Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"] + + Example: + >>> sentence = 'Hello, World!' + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + >>> toks = tokenizer.tokenize(sentence) + ['Hello', '##,', 'World', '##!'] + >>> ids = tokenizer.convert_tokens_to_ids(toks) + [8667, 28136, 1291, 28125] + """ + tokenizer = BertTokenizer.from_pretrained(*args, **kwargs) + return tokenizer + + +@_append_from_pretrained_docstring(bert_docstring) +def bertModel(*args, **kwargs): + """ + BertModel is the basic BERT Transformer model with a layer of summed token, + position and sequence embeddings followed by a series of identical + self-attention blocks (12 for BERT-base, 24 for BERT-large). + + Example: + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased') + >>> model.eval() + # Predict hidden states features for each layer + >>> with torch.no_grad(): + encoded_layers, _ = model(tokens_tensor, segments_tensors) + """ + model = BertModel.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForNextSentencePrediction(*args, **kwargs): + """ + BERT model with next sentence prediction head. + This module comprises the BERT model followed by the next sentence + classification head. + + Example: + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForNextSentencePrediction + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForNextSentencePrediction', 'bert-base-cased') + >>> model.eval() + # Predict the next sentence classification logits + >>> with torch.no_grad(): + next_sent_classif_logits = model(tokens_tensor, segments_tensors) + """ + model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForPreTraining(*args, **kwargs): + """ + BERT model with pre-training heads. + This module comprises the BERT model followed by the two pre-training heads + - the masked language modeling head, and + - the next sentence classification head. + + Example: + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForPreTraining + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForPreTraining', 'bert-base-cased') + >>> masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors) + """ + model = BertForPreTraining.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForMaskedLM(*args, **kwargs): + """ + BertForMaskedLM includes the BertModel Transformer followed by the + (possibly) pre-trained masked language modeling head. + + Example: + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> masked_index = 8 + >>> tokenized_text[masked_index] = '[MASK]' + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForMaskedLM + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased') + >>> model.eval() + # Predict all tokens + >>> with torch.no_grad(): + predictions = model(tokens_tensor, segments_tensors) + >>> predicted_index = torch.argmax(predictions[0, masked_index]).item() + >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] + 'henson' + """ + model = BertForMaskedLM.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForSequenceClassification(*args, **kwargs): + """ + BertForSequenceClassification is a fine-tuning model that includes + BertModel and a sequence-level (sequence or pair of sequences) classifier + on top of the BertModel. Note that the classification head is only initialized + and has to be trained. + + The sequence-level classifier is a linear layer that takes as input the + last hidden state of the first character in the input sequence + (see Figures 3a and 3b in the BERT paper). + + Args: + num_labels: the number (>=2) of classes for the classifier. + + Example: + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForSequenceClassification + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2) + >>> model.eval() + # Predict the sequence classification logits + >>> with torch.no_grad(): + seq_classif_logits = model(tokens_tensor, segments_tensors) + # Or get the sequence classification loss + >>> labels = torch.tensor([1]) + >>> seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss + """ + model = BertForSequenceClassification.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForMultipleChoice(*args, **kwargs): + """ + BertForMultipleChoice is a fine-tuning model that includes BertModel and a + linear layer on top of the BertModel. Note that the multiple choice head is + only initialized and has to be trained. + + Args: + num_choices: the number (>=2) of classes for the classifier. + + Example: + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0) + >>> segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0) + # Load bertForMultipleChoice + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2) + >>> model.eval() + # Predict the multiple choice logits + >>> with torch.no_grad(): + multiple_choice_logits = model(tokens_tensor, segments_tensors) + # Or get the multiple choice loss + >>> labels = torch.tensor([1]) + >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss + """ + model = BertForMultipleChoice.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForQuestionAnswering(*args, **kwargs): + """ + BertForQuestionAnswering is a fine-tuning model that includes BertModel + with a token-level classifiers on top of the full sequence of last hidden + states. Note that the classification head is only initialized + and has to be trained. + + Example: + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForQuestionAnswering + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForQuestionAnswering', 'bert-base-cased') + >>> model.eval() + # Predict the start and end positions logits + >>> with torch.no_grad(): + start_logits, end_logits = model(tokens_tensor, segments_tensors) + # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions + >>> start_positions, end_positions = torch.tensor([12]), torch.tensor([14]) + # set model.train() before if training this loss + >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions) + """ + model = BertForQuestionAnswering.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(bert_docstring) +def bertForTokenClassification(*args, **kwargs): + """ + BertForTokenClassification is a fine-tuning model that includes BertModel + and a token-level classifier on top of the BertModel. Note that the classification + head is only initialized and has to be trained. + + The token-level classifier is a linear layer that takes as input the last + hidden state of the sequence. + + Args: + num_labels: the number (>=2) of classes for the classifier. + + Example: + # Load the tokenizer + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) + # Prepare tokenized input + >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> segments_tensors = torch.tensor([segments_ids]) + # Load bertForTokenClassification + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForTokenClassification', 'bert-base-cased', num_labels=2) + >>> model.eval() + # Predict the token classification logits + >>> with torch.no_grad(): + classif_logits = model(tokens_tensor, segments_tensors) + # Or get the token classification loss + >>> labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]]) + >>> classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss + """ + model = BertForTokenClassification.from_pretrained(*args, **kwargs) + return model diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py new file mode 100644 index 0000000000..77162dc244 --- /dev/null +++ b/hubconfs/gpt_hubconf.py @@ -0,0 +1,183 @@ +from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer +from pytorch_pretrained_bert.modeling_openai import ( + OpenAIGPTModel, + OpenAIGPTLMHeadModel, + OpenAIGPTDoubleHeadsModel +) + +# Dependecies that are not specified in global hubconf.py +specific_dependencies = ['spacy', 'ftfy'] + +# A lot of models share the same param doc. Use a decorator +# to save typing +gpt_docstring = """ + OpenAI GPT use a single embedding matrix to store the word and special embeddings. + Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]... + Special tokens need to be trained during the fine-tuning if you use them. + The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. + + The embeddings are ordered as follow in the token embeddings matrice: + [0, ---------------------- + ... -> word embeddings + config.vocab_size - 1, ______________________ + config.vocab_size, + ... -> special embeddings + config.vocab_size + config.n_special - 1] ______________________ + + where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + total_tokens_embeddings = config.vocab_size + config.n_special + You should use the associate indices to index the embeddings. + + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `openai-gpt` + - a path or url to a pretrained model archive containing: + . `openai_gpt_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance + - a path or url to a pretrained model archive containing: + . `openai-gpt-config.json` a configuration file for the model + . a series of NumPy files containing OpenAI TensorFlow trained weights + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) + to use instead of pre-trained models + *inputs, **kwargs: additional input for the specific OpenAI-GPT class +""" + + +def _append_from_pretrained_docstring(docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + docstr + return fn + return docstring_decorator + + +def openAIGPTTokenizer(*args, **kwargs): + """ + Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file. + Peculiarities: + - lower case all inputs + - uses SpaCy tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. + - argument special_tokens and function set_special_tokens: + can be used to add additional symbols (ex: "__classify__") to a vocabulary. + + Args: + pretrained_model_name_or_path: Path to pretrained model archive + or one of pre-trained vocab configs below. + * openai-gpt + Keyword args: + special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...) + Default: None + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + Default: None + + Example: + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') + + >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + [763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483] + """ + tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs) + return tokenizer + + +@_append_from_pretrained_docstring(gpt_docstring) +def openAIGPTModel(*args, **kwargs): + """ + OpenAIGPTModel is the basic OpenAI GPT Transformer model based on + identical stacked masked self-attention blocks and pre-trained + on large scale dataset using language modeling signal. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') + + # Prepare tokenized input + >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> tokens_tensor = torch.tensor([indexed_tokens]) + + # Load openAIGPTModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTModel', 'openai-gpt') + >>> model.eval() + + # Predict hidden states features for each layer + >>> with torch.no_grad(): + hidden_states = model(tokens_tensor) + """ + model = OpenAIGPTModel.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(gpt_docstring) +def openAIGPTLMHeadModel(*args, **kwargs): + """ + OpenAIGPTLMHeadModel is the OpenAI GPT Transformer model with the + tied (pre-trained) language modeling head on top. + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') + + # Prepare tokenized input + >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> tokens_tensor = torch.tensor([indexed_tokens]) + + # Load openAIGPTLMHeadModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTLMHeadModel', 'openai-gpt') + >>> model.eval() + + # Predict hidden states features for each layer + >>> with torch.no_grad(): + predictions = model(tokens_tensor) + + # Get the predicted last token + >>> predicted_index = torch.argmax(predictions[0, -1, :]).item() + >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] + '.' + """ + model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs) + return model + + +@_append_from_pretrained_docstring(gpt_docstring) +def openAIGPTDoubleHeadsModel(*args, **kwargs): + """ + OpenAIGPTDoubleHeadsModel is the OpenAI GPT Transformer model with the + tied (pre-trained) language modeling head and a multiple choice + classification head (only initialized, not pre-trained). + + Example: + # Load the tokenizer + >>> import torch + >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') + + # Prepare tokenized input + >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer" + >>> tokenized_text = tokenizer.tokenize(text) + >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + >>> tokens_tensor = torch.tensor([indexed_tokens]) + >>> mc_token_ids = torch.LongTensor([ [len(tokenized_text)] ]) + + # Load openAIGPTDoubleHeadsModel + >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTDoubleHeadsModel', 'openai-gpt') + >>> model.eval() + + # Predict hidden states features for each layer + >>> with torch.no_grad(): + lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids) + """ + model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs) + return model diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py index 17bdd258ea..605c841235 100644 --- a/pytorch_pretrained_bert/file_utils.py +++ b/pytorch_pretrained_bert/file_utils.py @@ -22,6 +22,15 @@ import requests from botocore.exceptions import ClientError from tqdm import tqdm +try: + from torch.hub import _get_torch_home + torch_cache_home = _get_torch_home() +except ImportError: + torch_cache_home = os.path.expanduser( + os.getenv('TORCH_HOME', os.path.join( + os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) +default_cache_path = os.path.join(torch_cache_home, 'pytorch_pretrained_bert') + try: from urllib.parse import urlparse except ImportError: @@ -29,11 +38,11 @@ except ImportError: try: from pathlib import Path - PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', - Path.home() / '.pytorch_pretrained_bert')) + PYTORCH_PRETRAINED_BERT_CACHE = Path( + os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path)) except (AttributeError, ImportError): PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', - os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert')) + default_cache_path) CONFIG_NAME = "config.json" WEIGHTS_NAME = "pytorch_model.bin" diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index bfcbcc9edf..bbf8f4800b 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -145,7 +145,8 @@ class BertConfig(object): attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, - initializer_range=0.02): + initializer_range=0.02, + layer_norm_eps=1e-12): """Constructs BertConfig. Args: @@ -169,6 +170,7 @@ class BertConfig(object): `BertModel`. initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. """ if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 and isinstance(vocab_size_or_config_json_file, unicode)): @@ -188,6 +190,7 @@ class BertConfig(object): self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps else: raise ValueError("First argument must be either a vocabulary size (int)" "or the path to a pretrained model config file (str)") @@ -254,7 +257,7 @@ class BertEmbeddings(nn.Module): # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids, token_type_ids=None): @@ -332,7 +335,7 @@ class BertSelfOutput(nn.Module): def __init__(self, config): super(BertSelfOutput, self).__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): @@ -378,7 +381,7 @@ class BertOutput(nn.Module): def __init__(self, config): super(BertOutput, self).__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): @@ -454,7 +457,7 @@ class BertPredictionHeadTransform(nn.Module): self.transform_act_fn = ACT2FN[config.hidden_act] else: self.transform_act_fn = config.hidden_act - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) @@ -1020,7 +1023,7 @@ class BertForSequenceClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, num_labels, output_attentions=False): + def __init__(self, config, num_labels=2, output_attentions=False): super(BertForSequenceClassification, self).__init__(config) self.output_attentions = output_attentions self.num_labels = num_labels @@ -1091,7 +1094,7 @@ class BertForMultipleChoice(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, num_choices, output_attentions=False): + def __init__(self, config, num_choices=2, output_attentions=False): super(BertForMultipleChoice, self).__init__(config) self.output_attentions = output_attentions self.num_choices = num_choices @@ -1167,7 +1170,7 @@ class BertForTokenClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config, num_labels, output_attentions=False): + def __init__(self, config, num_labels=2, output_attentions=False): super(BertForTokenClassification, self).__init__(config) self.output_attentions = output_attentions self.num_labels = num_labels diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 769a6b3288..f805f63912 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -434,9 +434,7 @@ class OpenAIGPTPreTrainedModel(nn.Module): module.bias.data.zero_() @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path, num_special_tokens=None, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs - ): + def from_pretrained(cls, pretrained_model_name_or_path, num_special_tokens=None, *inputs, **kwargs): """ Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict. Download and cache the pre-trained model file if needed. @@ -449,14 +447,20 @@ class OpenAIGPTPreTrainedModel(nn.Module): . `openai_gpt_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model + . `openai-gpt-config.json` a configuration file for the model . a series of NumPy files containing OpenAI TensorFlow trained weights from_tf: should we load the weights from a locally saved TensorFlow checkpoint cache_dir: an optional path to a folder in which the pre-trained models will be cached. state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models - *inputs, **kwargs: additional input for the specific Bert class - (ex: num_labels for BertForSequenceClassification) + *inputs, **kwargs: additional input for the specific OpenAI-GPT class """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]