From 93f563b8a87d6928979206260dbc129aa10bae83 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 7 Jan 2019 12:55:36 +0100 Subject: [PATCH] adding OpenAI GPT --- .../convert_openai_checkpoint_to_pytorch.py | 174 ++++++++++ pytorch_pretrained_bert/modeling.py | 22 +- pytorch_pretrained_bert/modeling_openai.py | 302 ++++++++++++++++++ .../optimization_openai.py | 104 ++++++ .../tokenization_openai.py | 108 +++++++ 5 files changed, 699 insertions(+), 11 deletions(-) create mode 100755 pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py create mode 100644 pytorch_pretrained_bert/modeling_openai.py create mode 100644 pytorch_pretrained_bert/optimization_openai.py create mode 100644 pytorch_pretrained_bert/tokenization_openai.py diff --git a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py new file mode 100755 index 0000000000..59791450ee --- /dev/null +++ b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2018 The HugginFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def load_openai_pretrained_model(model, n_ctx=-1, n_special=-1, n_transfer=12, n_embd=768, path='./model/',
                                 path_names='./'):
    """Copy the OpenAI GPT numpy checkpoint into ``model`` in place.

    The checkpoint consists of ``params_shapes.json`` and ten flat shards
    ``params_0.npy`` ... ``params_9.npy`` under ``path``, plus
    ``parameters_names.json`` under ``path_names``. The shards are
    concatenated, split according to the shapes file and reshaped. Parameter 1
    (optionally followed by ``n_special`` randomly initialised rows) is stacked
    on top of parameter 0 and written to ``model.embed.weight``. Every other
    transferred array is assigned to the attribute reached by walking its
    name, e.g. ``model/h0/attn/c_attn/w:0`` -> ``model.h[0].attn.c_attn.w``.

    Args:
        model: object exposing ``embed.weight`` and the attribute paths listed
            in the names file.
        n_ctx: when > 0, keep only the first ``n_ctx`` rows of parameter 0
            (presumably the position table -- confirm against the checkpoint).
        n_special: when > 0, number of extra rows drawn from N(0, 0.02**2) and
            inserted between parameter 1 and parameter 0.
        n_transfer: ``-1`` copies nothing beyond the embedding; otherwise the
            ``12 * n_transfer`` parameters that follow the embedding are copied.
        n_embd: width of the randomly initialised special rows.
        path: directory holding the shapes file and the ``.npy`` shards.
        path_names: directory holding ``parameters_names.json``.

    Raises:
        AssertionError: if a parameter name does not end in ``":0"`` or a
            checkpoint array does not match the shape of its destination.
    """
    # json is not imported at the top of this module, so bring it in locally.
    import json

    def _read_json(file_path):
        # Close the handle deterministically instead of leaking it.
        with open(file_path) as handle:
            return json.load(handle)

    # Load weights from TF model
    print("Loading weights...")
    names = _read_json(os.path.join(path_names, 'parameters_names.json'))
    shapes = _read_json(os.path.join(path, 'params_shapes.json'))
    offsets = np.cumsum([np.prod(shape) for shape in shapes])
    shards = [np.load(os.path.join(path, 'params_{}.npy'.format(n))) for n in range(10)]
    # The last split point equals the total size, so the trailing chunk is empty.
    flat_params = np.split(np.concatenate(shards, 0), offsets)[:-1]
    init_params = [param.reshape(shape) for param, shape in zip(flat_params, shapes)]

    if n_ctx > 0:
        init_params[0] = init_params[0][:n_ctx]

    # Single embedding table: parameter 1, optional special rows, parameter 0.
    embedding_parts = [init_params[1]]
    if n_special > 0:
        embedding_parts.append((np.random.randn(n_special, n_embd) * 0.02).astype(np.float32))
    embedding_parts.append(init_params[0])
    init_params[0] = np.concatenate(embedding_parts, 0)
    del init_params[1]

    n_transfer = 0 if n_transfer == -1 else 1 + n_transfer * 12
    init_params = [arr.squeeze() for arr in init_params]

    # Explicit raises keep the AssertionError contract even under ``python -O``.
    if model.embed.weight.shape != init_params[0].shape:
        raise AssertionError(model.embed.weight.shape, init_params[0].shape)
    model.embed.weight.data = torch.from_numpy(init_params[0])

    for name, ip in zip(names[1:n_transfer], init_params[1:n_transfer]):
        name = name[6:]  # skip "model/"
        if name[-2:] != ":0":
            raise AssertionError("unexpected parameter name suffix: {}".format(name))
        name = name[:-2]
        pointer = model
        for m_name in name.split('/'):
            # "h3" -> attribute "h", index 3; plain names are attributes only.
            if re.fullmatch(r'[A-Za-z]+\d+', m_name):
                parts = re.split(r'(\d+)', m_name)
            else:
                parts = [m_name]
            pointer = getattr(pointer, parts[0])
            if len(parts) >= 2:
                pointer = pointer[int(parts[1])]
        if pointer.shape != ip.shape:
            raise AssertionError(pointer.shape, ip.shape)
        pointer.data = torch.from_numpy(ip)
+ array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + + # Save pytorch-model + print("Save PyTorch model to {}".format(pytorch_dump_path)) + torch.save(model.state_dict(), pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--tf_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path the TensorFlow checkpoint path.") + parser.add_argument("--bert_config_file", + default = None, + type = str, + required = True, + help = "The config json file corresponding to the pre-trained BERT model. \n" + "This specifies the model architecture.") + parser.add_argument("--pytorch_dump_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, + args.bert_config_file, + args.pytorch_dump_path) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index acdc741f6d..650918af7f 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -416,12 +416,12 @@ class BertPreTrainingHeads(nn.Module): return prediction_scores, seq_relationship_score -class PreTrainedBertModel(nn.Module): +class PreTrainedModel(nn.Module): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ def __init__(self, config, *inputs, **kwargs): - super(PreTrainedBertModel, self).__init__() + super(PreTrainedModel, self).__init__() if not isinstance(config, BertConfig): raise ValueError( "Parameter config in `{}(config)` should be an instance of class `BertConfig`. 
" @@ -447,7 +447,7 @@ class PreTrainedBertModel(nn.Module): @classmethod def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs): """ - Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict. + Instantiate a PreTrainedModel from a pre-trained model file or a pytorch state dict. Download and cache the pre-trained model file if needed. Params: @@ -551,7 +551,7 @@ class PreTrainedBertModel(nn.Module): return model -class BertModel(PreTrainedBertModel): +class BertModel(PreTrainedModel): """BERT model ("Bidirectional Embedding Representations from a Transformer"). Params: @@ -634,7 +634,7 @@ class BertModel(PreTrainedBertModel): return encoded_layers, pooled_output -class BertForPreTraining(PreTrainedBertModel): +class BertForPreTraining(PreTrainedModel): """BERT model with pre-training heads. This module comprises the BERT model followed by the two pre-training heads: - the masked language modeling head, and @@ -705,7 +705,7 @@ class BertForPreTraining(PreTrainedBertModel): return prediction_scores, seq_relationship_score -class BertForMaskedLM(PreTrainedBertModel): +class BertForMaskedLM(PreTrainedModel): """BERT model with the masked language modeling head. This module comprises the BERT model followed by the masked language modeling head. @@ -766,7 +766,7 @@ class BertForMaskedLM(PreTrainedBertModel): return prediction_scores -class BertForNextSentencePrediction(PreTrainedBertModel): +class BertForNextSentencePrediction(PreTrainedModel): """BERT model with next sentence prediction head. This module comprises the BERT model followed by the next sentence classification head. @@ -828,7 +828,7 @@ class BertForNextSentencePrediction(PreTrainedBertModel): return seq_relationship_score -class BertForSequenceClassification(PreTrainedBertModel): +class BertForSequenceClassification(PreTrainedModel): """BERT model for classification. 
This module is composed of the BERT model with a linear layer on top of the pooled output. @@ -894,7 +894,7 @@ class BertForSequenceClassification(PreTrainedBertModel): return logits -class BertForMultipleChoice(PreTrainedBertModel): +class BertForMultipleChoice(PreTrainedModel): """BERT model for multiple choice tasks. This module is composed of the BERT model with a linear layer on top of the pooled output. @@ -963,7 +963,7 @@ class BertForMultipleChoice(PreTrainedBertModel): return reshaped_logits -class BertForTokenClassification(PreTrainedBertModel): +class BertForTokenClassification(PreTrainedModel): """BERT model for token-level classification. This module is composed of the BERT model with a linear layer on top of the full hidden state of the last layer. @@ -1029,7 +1029,7 @@ class BertForTokenClassification(PreTrainedBertModel): return logits -class BertForQuestionAnswering(PreTrainedBertModel): +class BertForQuestionAnswering(PreTrainedModel): """BERT model for Question Answering (span extraction). 
def gelu(x):
    """Apply the tanh approximation of the Gaussian Error Linear Unit to tensor ``x``."""
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


def swish(x):
    """Apply the swish activation ``x * sigmoid(x)`` to tensor ``x``."""
    return x * torch.sigmoid(x)


# Registry of activation *functions*, looked up by name and called directly on
# a tensor. The 'relu' entry must therefore be a function too: the nn.ReLU
# class would be instantiated with the tensor as its ``inplace`` argument
# instead of being applied to it.
ACT_FNS = {
    'relu': torch.relu,
    'swish': swish,
    'gelu': gelu
}
class TransformerModel(nn.Module):
    """ Transformer model

    A shared embedding table followed by ``cfg.n_layer`` ``Block`` layers.

    Args:
        cfg: configuration exposing ``n_embd``, ``embd_pdrop`` and ``n_layer``
            (plus whatever ``Block`` reads).
        vocab: number of rows of the embedding table.
        n_ctx: context size forwarded to every ``Block``.
    """

    def __init__(self, cfg, vocab=40990, n_ctx=512):
        super(TransformerModel, self).__init__()
        self.vocab = vocab
        self.embed = nn.Embedding(vocab, cfg.n_embd)
        self.drop = nn.Dropout(cfg.embd_pdrop)
        block = Block(n_ctx, cfg, scale=True)
        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(cfg.n_layer)])

        nn.init.normal_(self.embed.weight, std=0.02)

    def forward(self, x):
        """Embed the index tensor ``x`` and run it through every block.

        ``x`` is flattened to three dimensions; the embeddings looked up for
        the entries of its last axis are summed (presumably token id and
        position id sharing one table -- confirm against the caller).
        """
        x = x.view(-1, x.size(-2), x.size(-1))
        e = self.embed(x)
        # Add the position information to the input embeddings
        h = e.sum(dim=2)
        # The embedding dropout configured through cfg.embd_pdrop was built in
        # __init__ but never applied; it is a no-op in eval mode.
        h = self.drop(h)
        for block in self.h:
            h = block(h)
        return h
class SimilarityHead(nn.Module):
    """ Similarity Head for the transformer

    Scores a group of sequences by summing the hidden states found at their
    classification tokens and projecting the sum to a single logit.

    Args:
        clf_token: index value that marks the classification position in
            ``x[..., 0]``.
        cfg: configuration exposing ``n_embd`` and ``clf_pdrop``.

    TODO: test this class."""
    def __init__(self, clf_token, cfg):
        super(SimilarityHead, self).__init__()
        self.n_embd = cfg.n_embd
        self.clf_token = clf_token
        self.dropout = nn.Dropout(cfg.clf_pdrop)
        self.linear = nn.Linear(cfg.n_embd, 1)

        nn.init.normal_(self.linear.weight, std=0.02)
        # normal_(bias, 0) drew the bias from N(0, 1); start from zero instead.
        nn.init.constant_(self.linear.bias, 0)

    def forward(self, h, x):
        """Return one similarity logit per group of ``x.size(1)`` sequences.

        Assumes exactly one ``clf_token`` per sequence and that ``x`` is laid
        out as (batch, n_sequences, seq_len, n_index) -- TODO confirm with the
        data pipeline.
        """
        sim_h = h.view(-1, self.n_embd)
        flat = x[..., 0].contiguous().view(-1)
        sim_h = sim_h[flat == self.clf_token, :]
        sim_h = self.dropout(sim_h)
        # Add the representations of the sequences belonging to the same
        # example element-wise. Summing over dim 1 of the flat (rows, n_embd)
        # matrix collapsed the feature axis and broke the linear layer.
        sim_h = sim_h.view(-1, x.size(1), self.n_embd).sum(dim=1)
        sim_logits = self.linear(sim_h)

        return sim_logits
def warmup_cosine(x, warmup=0.002):
    """Return the learning-rate multiplier of the warmup + cosine schedule.

    Args:
        x: training progress as a plain number (``step / t_total``).
        warmup: progress value up to which the multiplier ramps linearly
            from 0 to 1.

    Returns:
        ``x / warmup`` while ``x <= warmup``, otherwise
        ``0.5 * (1 + cos(pi * x))``.
    """
    if x <= warmup:
        return x / warmup
    # x is a Python number, so math.cos is the right call; torch.cos only
    # accepts tensors and raised TypeError here.
    return 0.5 * (1 + math.cos(math.pi * x))
class OpenAIAdam(Optimizer):
    """Implements Open AI version of Adam algorithm with weight decay fix.

    Args:
        params: iterable of parameters or parameter groups.
        lr: base learning rate (must be >= 0).
        schedule: key of ``SCHEDULES`` selecting the learning-rate schedule.
        warmup: warmup value handed to the schedule function (must be >= 0).
        t_total: total number of steps; ``step / t_total`` is the schedule input.
        b1, b2: exponential decay rates of the two moment estimates, in [0, 1).
        e: term added to the denominator (must be >= 0).
        l2: decoupled weight-decay coefficient.
        vector_l2: also decay parameters with a single dimension.
        max_grad_norm: when > 0, each parameter's gradient is clipped to this norm.
        **kwargs: accepted and ignored.

    Raises:
        ValueError: if one of the hyper-parameters is outside its valid range.
    """
    def __init__(self, params, lr, schedule, warmup, t_total,
                 b1=0.9, b2=0.999, e=1e-8, l2=0,
                 vector_l2=False, max_grad_norm=-1, **kwargs):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if schedule not in SCHEDULES:
            raise ValueError("Invalid schedule parameter: {}".format(schedule))
        if not 0 <= warmup:
            raise ValueError("Invalid warmup: {}".format(warmup))
        if not 0.0 <= b1 < 1.0:
            raise ValueError("Invalid b1 parameter: {}".format(b1))
        if not 0.0 <= b2 < 1.0:
            raise ValueError("Invalid b2 parameter: {}".format(b2))
        if not 0.0 <= e:
            raise ValueError("Invalid epsilon value: {}".format(e))
        defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
                        b1=b1, b2=b2, e=e, l2=l2, vector_l2=vector_l2,
                        max_grad_norm=max_grad_norm)
        super(OpenAIAdam, self).__init__(params, defaults)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['b1'], group['b2']

                state['step'] += 1

                # Add grad clipping (applied to this parameter's gradient only,
                # in place, so ``grad`` above observes the clipped values)
                if group['max_grad_norm'] > 0:
                    clip_grad_norm_(p, group['max_grad_norm'])

                # Decay the first and second moment running average coefficient.
                # Keyword scaling factors replace the deprecated
                # ``add_(scalar, tensor)`` style positional overloads.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['e'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                schedule_fct = SCHEDULES[group['schedule']]
                lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
                step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1

                p.data.addcdiv_(exp_avg, denom, value=-step_size)

                # Add weight decay at the end (fixed version)
                if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0:
                    p.data.add_(p.data, alpha=-lr_scheduled * group['l2'])

        return loss
class TextEncoder(object):
    """
    mostly a wrapper for a public python bpe tokenizer

    Args:
        encoder_path: JSON file mapping BPE symbols to integer ids.
        bpe_path: merges file; its first line and the empty entry left after
            the final newline are skipped, every other line holds one merge
            as whitespace-separated symbols.
    """

    def __init__(self, encoder_path, bpe_path):
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        # Context managers close both files instead of leaking the handles.
        with open(encoder_path, encoding='utf-8') as encoder_file:
            self.encoder = json.load(encoder_file)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(bpe_path, encoding='utf-8') as merges_file:
            merges = merges_file.read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        # Lower rank means the merge is applied earlier.
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

    def bpe(self, token):
        """Return the space-separated BPE symbols of ``token``.

        Results are memoised in ``self.cache``. An empty token is returned
        unchanged.
        """
        # Consult the cache before doing any work.
        if token in self.cache:
            return self.cache[token]
        if not token:
            # token[-1] below would raise IndexError on an empty string.
            return token
        # NOTE(review): the marker text was empty in the previous revision (a
        # no-op concatenation); '</w>' is the end-of-word symbol used by the
        # upstream BPE vocabulary -- confirm against the encoder/merges files.
        word = tuple(token[:-1]) + (token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token + '</w>'

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # tuple.index signals "not found" with ValueError; a bare
                    # except would also hide unrelated failures.
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        word = ' '.join(word)
        if word == '\n  </w>':
            word = '\n</w>'
        self.cache[token] = word
        return word

    def encode(self, texts, verbose=True):
        """Return one list of ids per text in ``texts``.

        Each text goes through ``ftfy.fix_text``, ``text_standardize`` and the
        spaCy pipeline; every token is lower-cased and BPE-encoded, and symbols
        missing from the vocabulary map to id 0. ``verbose`` only toggles the
        progress bar.
        """
        iterator = tqdm(texts, ncols=80, leave=False) if verbose else texts
        texts_tokens = []
        for text in iterator:
            doc = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in doc:
                text_tokens.extend(self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' '))
            texts_tokens.append(text_tokens)
        return texts_tokens