diff --git a/README.md b/README.md index e7834f8605..d8309be01e 100644 --- a/README.md +++ b/README.md @@ -1394,7 +1394,7 @@ The data for SWAG can be downloaded by cloning the following [repository](https: ```shell export SWAG_DIR=/path/to/SWAG -python run_swag.py \ +python run_bert_swag.py \ --bert_model bert-base-uncased \ --do_train \ --do_lower_case \ @@ -1581,7 +1581,6 @@ python run_xlnet_classifier.py \ --task_name STS-B \ --do_train \ --do_eval \ - --do_lower_case \ --data_dir $GLUE_DIR/STS-B/ \ --max_seq_length 128 \ --train_batch_size 8 \ diff --git a/examples/run_swag.py b/examples/run_bert_swag.py similarity index 100% rename from examples/run_swag.py rename to examples/run_bert_swag.py diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py index 514776b242..0278b40cdd 100644 --- a/examples/run_xlnet_classifier.py +++ b/examples/run_xlnet_classifier.py @@ -70,6 +70,8 @@ def main(): parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") + parser.add_argument("--clip_gradients", default=1.0, type=float, + help="Clip gradient norms.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, @@ -80,6 +82,8 @@ def main(): help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") + parser.add_argument("--log_every", default=10, type=int, + help="Log metrics every X training steps.") # evaluation parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") @@ -234,12 +238,13 @@ def main(): # Prepare optimizer - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + optimizer_grouped_parameters = model.parameters() + # param_optimizer = list(model.named_parameters()) + # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + # optimizer_grouped_parameters = [ + # {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + # {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + # ] if args.fp16: try: from apex.optimizers import FP16_Optimizer @@ -297,6 +302,9 @@ def main(): else: loss.backward() + if args.clip_gradients > 0.0: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_gradients) + tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 @@ -310,7 +318,7 @@ def main(): optimizer.step() optimizer.zero_grad() global_step += 1 - if args.local_rank in [-1, 0]: + if args.local_rank in [-1, 0] and (args.log_every <= 0 or (step + 1) % args.log_every == 0): tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) tb_writer.add_scalar('loss', loss.item(), global_step) diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py index 89639820b8..069c6c52e2 100644 --- a/pytorch_pretrained_bert/__init__.py +++ b/pytorch_pretrained_bert/__init__.py @@ -20,6 +20,7 @@ from .modeling_gpt2 import (GPT2Config, GPT2Model, load_tf_weights_in_gpt2) from .modeling_xlnet import (XLNetBaseConfig, XLNetConfig, XLNetRunConfig, XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, + XLNetForSequenceClassification, XLNetForQuestionAnswering, load_tf_weights_in_xlnet) from .optimization import BertAdam diff --git a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py index d343fd2189..d46cc99e73 100755 --- a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py +++ b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py @@ -28,20 +28,31 @@ from pytorch_pretrained_bert.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME, XLNetForSequenceClassification, load_tf_weights_in_xlnet) -GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "sst-2", "sts-b", "qqp", "qnli", "rte", "wnli"] +GLUE_TASKS = { + "cola": "classification", + "mnli": "classification", + "mrpc": "classification", + "sst-2": "classification", + "sts-b": "regression", + "qqp": "classification", + "qnli": "classification", + "rte": "classification", + "wnli": "classification", +} def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None): # Initialise PyTorch model config = XLNetConfig.from_json_file(bert_config_file) - if finetuning_task is not None and finetuning_task.lower() in GLUE_TASKS: - model_class = XLNetLMHeadModel - elif finetuning_task is not None and 'squad' in finetuning_task.lower(): - model_class = XLNetForQuestionAnswering + + finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" + if finetuning_task in GLUE_TASKS: + print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) + model = XLNetForSequenceClassification(config, is_regression=bool(GLUE_TASKS[finetuning_task] == "regression")) + elif 'squad' in finetuning_task: + model = XLNetForQuestionAnswering(config) else: - model_class = XLNetLMHeadModel - print("Building PyTorch model {} from configuration: {}".format(str(model_class), str(config))) - model = model_class(config) + model = XLNetLMHeadModel(config) # Load weights from tf checkpoint load_tf_weights_in_xlnet(model, config, tf_checkpoint_path, finetuning_task) @@ -80,6 +91,8 @@ if __name__ == "__main__": type = str, help = "Name of a task on which the XLNet TensorFloaw model was fine-tuned") args = parser.parse_args() + print(args) + convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py index 495f756f7e..7ee7be9025 100644 --- a/pytorch_pretrained_bert/modeling_xlnet.py +++ b/pytorch_pretrained_bert/modeling_xlnet.py @@ -30,7 +30,7 @@ from io import open import torch from torch import nn from torch.nn import functional as F -from torch.nn import CrossEntropyLoss +from torch.nn import CrossEntropyLoss, MSELoss from .file_utils import cached_path, WEIGHTS_NAME, CONFIG_NAME @@ -58,11 +58,11 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None, finetuning_tas if hasattr(model, 'lm_loss'): # We will load also the output bias tf_to_pt_map['model/lm_loss/bias'] = model.lm_loss.bias - elif hasattr(model, 'sequence_summary') and 'model/sequnece_summary/summary/kernel' in tf_weights: + if hasattr(model, 'sequence_summary') and 'model/sequnece_summary/summary/kernel' in tf_weights: # We will load also the sequence summary tf_to_pt_map['model/sequnece_summary/summary/kernel'] = model.sequence_summary.summary.weight tf_to_pt_map['model/sequnece_summary/summary/bias'] = model.sequence_summary.summary.bias - elif hasattr(model, 'logits_proj') and finetuning_task is not None and any('model/regression' in name for name in tf_weights.keys()): + if hasattr(model, 'logits_proj') and finetuning_task is not None and 'model/regression_{}/logit/kernel'.format(finetuning_task) in tf_weights: tf_to_pt_map['model/regression_{}/logit/kernel'.format(finetuning_task)] = model.logits_proj.weight tf_to_pt_map['model/regression_{}/logit/bias'.format(finetuning_task)] = model.logits_proj.bias @@ -133,6 +133,8 @@ def load_tf_weights_in_xlnet(model, config, tf_path, finetuning_task=None): array = tf.train.load_variable(tf_path, name) tf_weights[name] = array + input("Press Enter to continue...") + # Build TF to PyTorch weights loading map tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights, finetuning_task) @@ -144,7 +146,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path, finetuning_task=None): array = tf_weights[name] # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model - if 'kernel' in name and 'ff' in name: + if 'kernel' in name and ('ff' in name or 'summary' in name or 'logit' in name): print("Transposing") array = np.transpose(array) if isinstance(pointer, list):