diff --git a/modeling_pytorch.py b/modeling_pytorch.py index 0ee84c8f6a..f6ca69665c 100644 --- a/modeling_pytorch.py +++ b/modeling_pytorch.py @@ -28,6 +28,11 @@ import tensorflow as tf import torch import torch.nn as nn +def gelu(x): + raise NotImplementedError + # TF BERT says: cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + class BertConfig(object): """Configuration for `BertModel`.""" @@ -106,30 +111,54 @@ class BertConfig(object): class BERTLayerNorm(nn.Module): - def __init__(self): - tf.contrib.layers.layer_norm( - inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) + def __init__(self, config, variance_epsilon=1e-12): + "Construct a layernorm module in the TF style (epsilon inside the square root)." + super(BERTLayerNorm, self).__init__() + self.gamma = nn.Parameter(torch.ones(config.hidden_size)) + self.beta = nn.Parameter(torch.zeros(config.hidden_size)) + self.variance_epsilon = variance_epsilon + + def forward(self, x): + # TODO check it's identical to TF implementation in details + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.gamma * x + self.beta + # tf.contrib.layers.layer_norm( + # inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) class BERTEmbeddings(nn.Module): def __init__(self, embedding_size, vocab_size, token_type_vocab_size, max_position_embeddings, config): + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size) + + # Position embeddings are (normally) a contiguous range so we could use a slice + # Since the position embedding table is a learned variable, we create it + # using a (long) sequence length `max_position_embeddings`. The actual + # sequence length might be shorter than this, for faster training of + # tasks that do not have long sequences. + # + # So `full_position_embeddings` is effectively an embedding table + # for position [0, 1, 2, ..., max_position_embeddings-1], and the current + # sequence has positions [0, 1, 2, ... seq_length-1], so we can just + # perform a slice. self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) + + # token_type_embeddings vocabulary is very small. TF used one-hot embeddings to speedup. self.token_type_embeddings = nn.Embedding(config.token_type_vocab_size, config.embedding_size) - self.LayerNorm = BERTLayerNorm() # Not snake-cased to fit with TF model variable name - self.dropout = nn.dropout(config.hidden_dropout_prob) - - self.initialize_weights(self, config.initializer_range) - - def initialize_weights(self, initializer_range): - torch.truncated_normal_initializer(stddev=initializer_range) + self.LayerNorm = BERTLayerNorm() # Not snake-cased to stick with TF model variable name + self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids, token_type_ids=None): batch_size = input_ids.size(0) seq_length = input_ids.size(1) + # TODO finich that position_ids = torch.range().view(batch_size, seq_length) + if token_type_ids is None: + token_type_ids = torch.zeros(batch_size, seq_length) words_embeddings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) @@ -141,28 +170,6 @@ class BERTEmbeddings(nn.Module): return embeddings -class BERTIntermediate(nn.Module): - def __init__(self, config): - super(BERTOutput, self).__init__() - self.dense = nn.Linear() - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - return hidden_states - - -class BERTOutput(nn.Module): - def __init__(self, config): - super(BERTOutput, self).__init__() - self.dense = nn.Linear() - self.LayerNorm = BERTLayerNorm(config) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - class BERTSelfAttention(nn.Module): def __init__(self, config): super(BERTSelfAttention, self).__init__() @@ -170,22 +177,84 @@ class BERTSelfAttention(nn.Module): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads)) - attention_head_size = int(config.hidden_size / config.num_attention_heads) - all_head_size = num_attention_heads * attention_head_size + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query = nn.Linear(config.hidden_size, all_head_size) - self.key = nn.Linear(config.hidden_size, all_head_size) - self.value = nn.Linear(config.hidden_size, all_head_size) + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) - def transpose_for_scores(self, x, k=False): - new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head) - x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states - if k: + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, input_tensor, num_attention_heads, is_key_tensor=False): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + if is_key_tensor: return x.permute(0, 2, 3, 1) else: return x.permute(0, 2, 1, 3) - def forward(self, hidden_states): + def forward(self, hidden_states, attention_mask): + # Scalar dimensions referenced here: + # B = batch size (number of sequences) + # F = `from_tensor` sequence length + # T = `to_tensor` sequence length + # N = `num_attention_heads` + # H = `size_per_head` + query_layer = self.query(hidden_states) + key_layer = self.key(hidden_states) + value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(query_layer) + key_layer = self.transpose_for_scores(key_layer, is_key_tensor=True) + value_layer = self.transpose_for_scores(value_layer) + + # Take the dot product between "query" and "key" to get the raw + # attention scores. + # `attention_scores` = [B, N, F, T] + attention_scores = torch.matmul(query_layer, key_layer) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # TODO clean up this (precompute) + # MY PYTORCH: w = w * self.b + -1e9 * (1 - self.b) # TF implem method: mask_attn_weights + # `attention_mask` = [B, 1, F, T] + attention_mask = tf.expand_dims(attention_mask, axis=[1]) + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + adder = (1.0 - attention_mask) * -10000.0 + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + attention_scores += adder + + # Normalize the attention scores to probabilities. + # `attention_probs` = [B, N, F, T] + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_x_shape) + + return context_layer + + +class BERTSelfOutput(nn.Module): + def __init__(self, config): + super(BERTSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = BERTLayerNorm(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(input_tensor) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states @@ -193,11 +262,37 @@ class BERTAttention(nn.Module): def __init__(self, config): super(BERTAttention, self).__init__() self.self = BERTSelfAttention(config) - self.output = BERTOutput(config) + self.output = BERTSelfOutput(config) + + def forward(self, input_tensor, attention_mask): + attention_output = self.self(input_tensor, attention_mask) + attention_output = self.output(attention_output, input_tensor) + return attention_output + + +class BERTIntermediate(nn.Module): + def __init__(self, config): + super(BERTOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = gelu def forward(self, hidden_states): - hidden_states = self.self(hidden_states) - hidden_states = self.output(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BERTOutput(nn.Module): + def __init__(self, config): + super(BERTOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = BERTLayerNorm(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(input_tensor) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states @@ -208,10 +303,10 @@ class BERTLayer(nn.Module): self.intermediate = BERTIntermediate(config) self.output = BERTOutput(config) - def forward(self, hidden_states): - hidden_states = self.attention(hidden_states) - hidden_states = self.intermediate(hidden_states) - hidden_states = self.output(hidden_states) + def forward(self, hidden_states, attention_mask): + attention_output = self.attention(hidden_states, attention_mask) + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) return hidden_states @@ -221,7 +316,7 @@ class BERTEncoder(nn.Module): layer = BERTLayer(n_ctx, cfg, scale=True) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) - def forward(self, hidden_states): + def forward(self, hidden_states, attention_mask): """ Args: hidden_states: float Tensor of shape [batch_size, seq_length, hidden_size] @@ -229,7 +324,25 @@ class BERTEncoder(nn.Module): float Tensor of shape [batch_size, seq_length, hidden_size] """ for layer_module in self.layer: - hidden_states = layer_module(hidden_states) + hidden_states = layer_module(hidden_states, attention_mask) + return hidden_states + + +class BERTPooler(nn.Module): + def __init__(self, config): + super(BERTPooler, self).__init__() + layer = BERTLayer(n_ctx, cfg, scale=True) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask): + """ + Args: + hidden_states: float Tensor of shape [batch_size, seq_length, hidden_size] + Return: + float Tensor of shape [batch_size, seq_length, hidden_size] + """ + for layer_module in self.layer: + hidden_states = layer_module(hidden_states, attention_mask) return hidden_states @@ -278,836 +391,9 @@ class BertModel(nn.Module): self.embeddings = BERTEmbeddings(config) self.encoder = BERTEncoder(config) + self.pooler = BERTPooler(config) - - def forward(self, input_ids, token_type_ids=None, input_mask=None): - if input_mask is None: - input_mask = torch.ones(batch_size, seq_length), dtype=torch.long) - - if token_type_ids is None: - token_type_ids = torch.zeros((batch_size, seq_length), dtype=torch.long) - - hidden_states = self.embeddings(input_ids, token_type_ids, input_mask) - hidden_states = self.encoder(hidden_states) - - # Perform embedding lookup on the word ids. - (self.embedding_output, self.embedding_table) = embedding_lookup( - input_ids=input_ids, - vocab_size=config.vocab_size, - embedding_size=config.hidden_size, - initializer_range=config.initializer_range, - word_embedding_name="word_embeddings", - use_one_hot_embeddings=use_one_hot_embeddings) - - # Add positional embeddings and token type embeddings, then layer - # normalize and perform dropout. - self.embedding_output = embedding_postprocessor( - input_tensor=self.embedding_output, - use_token_type=True, - token_type_ids=token_type_ids, - token_type_vocab_size=config.type_vocab_size, - token_type_embedding_name="token_type_embeddings", - use_position_embeddings=True, - position_embedding_name="position_embeddings", - initializer_range=config.initializer_range, - max_position_embeddings=config.max_position_embeddings, - dropout_prob=config.hidden_dropout_prob) - - with tf.variable_scope("encoder"): - # This converts a 2D mask of shape [batch_size, seq_length] to a 3D - # mask of shape [batch_size, seq_length, seq_length] which is used - # for the attention scores. - attention_mask = create_attention_mask_from_input_mask( - input_ids, input_mask) - - # Run the stacked transformer. - # `sequence_output` shape = [batch_size, seq_length, hidden_size]. - self.all_encoder_layers = transformer_model( - input_tensor=self.embedding_output, - attention_mask=attention_mask, - hidden_size=config.hidden_size, - num_hidden_layers=config.num_hidden_layers, - num_attention_heads=config.num_attention_heads, - intermediate_size=config.intermediate_size, - intermediate_act_fn=get_activation(config.hidden_act), - hidden_dropout_prob=config.hidden_dropout_prob, - attention_probs_dropout_prob=config.attention_probs_dropout_prob, - initializer_range=config.initializer_range, - do_return_all_layers=True) - - self.sequence_output = self.all_encoder_layers[-1] - # The "pooler" converts the encoded sequence tensor of shape - # [batch_size, seq_length, hidden_size] to a tensor of shape - # [batch_size, hidden_size]. This is necessary for segment-level - # (or segment-pair-level) classification tasks where we need a fixed - # dimensional representation of the segment. - with tf.variable_scope("pooler"): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. We assume that this has been pre-trained - first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) - self.pooled_output = tf.layers.dense( - first_token_tensor, - config.hidden_size, - activation=tf.tanh, - kernel_initializer=create_initializer(config.initializer_range)) - - def get_pooled_output(self): - return self.pooled_output - - def get_sequence_output(self): - """Gets final hidden layer of encoder. - - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size] corresponding - to the final hidden of the transformer encoder. - """ - return self.sequence_output - - def get_all_encoder_layers(self): - return self.all_encoder_layers - - def get_embedding_output(self): - """Gets output of the embedding lookup (i.e., input to the transformer). - - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size] corresponding - to the output of the embedding layer, after summing the word - embeddings with the positional embeddings and the token type embeddings, - then performing layer normalization. This is the input to the transformer. - """ - return self.embedding_output - - def get_embedding_table(self): - return self.embedding_table - - -def gelu(input_tensor): - """Gaussian Error Linear Unit. - - This is a smoother version of the RELU. - Original paper: https://arxiv.org/abs/1606.08415 - - Args: - input_tensor: float Tensor to perform activation. - - Returns: - `input_tensor` with the GELU activation applied. - """ - cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) - return input_tensor * cdf - - -def get_activation(activation_string): - """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. - - Args: - activation_string: String name of the activation function. - - Returns: - A Python function corresponding to the activation function. If - `activation_string` is None, empty, or "linear", this will return None. - If `activation_string` is not a string, it will return `activation_string`. - - Raises: - ValueError: The `activation_string` does not correspond to a known - activation. - """ - - # We assume that anything that"s not a string is already an activation - # function, so we just return it. - if not isinstance(activation_string, six.string_types): - return activation_string - - if not activation_string: - return None - - act = activation_string.lower() - if act == "linear": - return None - elif act == "relu": - return tf.nn.relu - elif act == "gelu": - return gelu - elif act == "tanh": - return tf.tanh - else: - raise ValueError("Unsupported activation: %s" % act) - - -def get_assigment_map_from_checkpoint(tvars, init_checkpoint): - """Compute the union of the current variables and checkpoint variables.""" - assignment_map = {} - initialized_variable_names = {} - - name_to_variable = collections.OrderedDict() - for var in tvars: - name = var.name - m = re.match("^(.*):\\d+$", name) - if m is not None: - name = m.group(1) - name_to_variable[name] = var - - init_vars = tf.train.list_variables(init_checkpoint) - - assignment_map = collections.OrderedDict() - for x in init_vars: - (name, var) = (x[0], x[1]) - if name not in name_to_variable: - continue - assignment_map[name] = name - initialized_variable_names[name] = 1 - initialized_variable_names[name + ":0"] = 1 - - return (assignment_map, initialized_variable_names) - - -def dropout(input_tensor, dropout_prob): - """Perform dropout. - - Args: - input_tensor: float Tensor. - dropout_prob: Python float. The probabiltiy of dropping out a value (NOT of - *keeping* a dimension as in `tf.nn.dropout`). - - Returns: - A version of `input_tensor` with dropout applied. - """ - if dropout_prob is None or dropout_prob == 0.0: - return input_tensor - - output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) - return output - - -def layer_norm(input_tensor, name=None): - """Run layer normalization on the last dimension of the tensor.""" - return tf.contrib.layers.layer_norm( - inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) - - -def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): - """Runs layer normalization followed by dropout.""" - output_tensor = layer_norm(input_tensor, name) - output_tensor = dropout(output_tensor, dropout_prob) - return output_tensor - - -def create_initializer(initializer_range=0.02): - """Creates a `truncated_normal_initializer` with the given range.""" - return tf.truncated_normal_initializer(stddev=initializer_range) - - -def embedding_lookup(input_ids, - vocab_size, - embedding_size=128, - initializer_range=0.02, - word_embedding_name="word_embeddings", - use_one_hot_embeddings=False): - """Looks up words embeddings for id tensor. - - Args: - input_ids: int32 Tensor of shape [batch_size, seq_length] containing word - ids. - vocab_size: int. Size of the embedding vocabulary. - embedding_size: int. Width of the word embeddings. - initializer_range: float. Embedding initialization range. - word_embedding_name: string. Name of the embedding table. - use_one_hot_embeddings: bool. If True, use one-hot method for word - embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better - for TPUs. - - Returns: - float Tensor of shape [batch_size, seq_length, embedding_size]. - """ - # This function assumes that the input is of shape [batch_size, seq_length, - # num_inputs]. - # - # If the input is a 2D tensor of shape [batch_size, seq_length], we - # reshape to [batch_size, seq_length, 1]. - if input_ids.shape.ndims == 2: - input_ids = tf.expand_dims(input_ids, axis=[-1]) - - embedding_table = tf.get_variable( - name=word_embedding_name, - shape=[vocab_size, embedding_size], - initializer=create_initializer(initializer_range)) - - if use_one_hot_embeddings: - flat_input_ids = tf.reshape(input_ids, [-1]) - one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) - output = tf.matmul(one_hot_input_ids, embedding_table) - else: - output = tf.nn.embedding_lookup(embedding_table, input_ids) - - input_shape = get_shape_list(input_ids) - - output = tf.reshape(output, - input_shape[0:-1] + [input_shape[-1] * embedding_size]) - return (output, embedding_table) - - -def embedding_postprocessor(input_tensor, - use_token_type=False, - token_type_ids=None, - token_type_vocab_size=16, - token_type_embedding_name="token_type_embeddings", - use_position_embeddings=True, - position_embedding_name="position_embeddings", - initializer_range=0.02, - max_position_embeddings=512, - dropout_prob=0.1): - """Performs various post-processing on a word embedding tensor. - - Args: - input_tensor: float Tensor of shape [batch_size, seq_length, - embedding_size]. - use_token_type: bool. Whether to add embeddings for `token_type_ids`. - token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. - Must be specified if `use_token_type` is True. - token_type_vocab_size: int. The vocabulary size of `token_type_ids`. - token_type_embedding_name: string. The name of the embedding table variable - for token type ids. - use_position_embeddings: bool. Whether to add position embeddings for the - position of each token in the sequence. - position_embedding_name: string. The name of the embedding table variable - for positional embeddings. - initializer_range: float. Range of the weight initialization. - max_position_embeddings: int. Maximum sequence length that might ever be - used with this model. This can be longer than the sequence length of - input_tensor, but cannot be shorter. - dropout_prob: float. Dropout probability applied to the final output tensor. - - Returns: - float tensor with same shape as `input_tensor`. - - Raises: - ValueError: One of the tensor shapes or input values is invalid. - """ - input_shape = get_shape_list(input_tensor, expected_rank=3) - batch_size = input_shape[0] - seq_length = input_shape[1] - width = input_shape[2] - - if seq_length > max_position_embeddings: - raise ValueError("The seq length (%d) cannot be greater than " - "`max_position_embeddings` (%d)" % - (seq_length, max_position_embeddings)) - - output = input_tensor - - if use_token_type: - if token_type_ids is None: - raise ValueError("`token_type_ids` must be specified if" - "`use_token_type` is True.") - token_type_table = tf.get_variable( - name=token_type_embedding_name, - shape=[token_type_vocab_size, width], - initializer=create_initializer(initializer_range)) - # This vocab will be small so we always do one-hot here, since it is always - # faster for a small vocabulary. - flat_token_type_ids = tf.reshape(token_type_ids, [-1]) - one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) - token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) - token_type_embeddings = tf.reshape(token_type_embeddings, - [batch_size, seq_length, width]) - output += token_type_embeddings - - if use_position_embeddings: - full_position_embeddings = tf.get_variable( - name=position_embedding_name, - shape=[max_position_embeddings, width], - initializer=create_initializer(initializer_range)) - # Since the position embedding table is a learned variable, we create it - # using a (long) sequence length `max_position_embeddings`. The actual - # sequence length might be shorter than this, for faster training of - # tasks that do not have long sequences. - # - # So `full_position_embeddings` is effectively an embedding table - # for position [0, 1, 2, ..., max_position_embeddings-1], and the current - # sequence has positions [0, 1, 2, ... seq_length-1], so we can just - # perform a slice. - if seq_length < max_position_embeddings: - position_embeddings = tf.slice(full_position_embeddings, [0, 0], - [seq_length, -1]) - else: - position_embeddings = full_position_embeddings - - num_dims = len(output.shape.as_list()) - - # Only the last two dimensions are relevant (`seq_length` and `width`), so - # we broadcast among the first dimensions, which is typically just - # the batch size. - position_broadcast_shape = [] - for _ in range(num_dims - 2): - position_broadcast_shape.append(1) - position_broadcast_shape.extend([seq_length, width]) - position_embeddings = tf.reshape(position_embeddings, - position_broadcast_shape) - output += position_embeddings - - output = layer_norm_and_dropout(output, dropout_prob) - return output - - -def create_attention_mask_from_input_mask(from_tensor, to_mask): - """Create 3D attention mask from a 2D tensor mask. - - Args: - from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. - to_mask: int32 Tensor of shape [batch_size, to_seq_length]. - - Returns: - float Tensor of shape [batch_size, from_seq_length, to_seq_length]. - """ - from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) - batch_size = from_shape[0] - from_seq_length = from_shape[1] - - to_shape = get_shape_list(to_mask, expected_rank=2) - to_seq_length = to_shape[1] - - to_mask = tf.cast( - tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) - - # We don't assume that `from_tensor` is a mask (although it could be). We - # don't actually care if we attend *from* padding tokens (only *to* padding) - # tokens so we create a tensor of all ones. - # - # `broadcast_ones` = [batch_size, from_seq_length, 1] - broadcast_ones = tf.ones( - shape=[batch_size, from_seq_length, 1], dtype=tf.float32) - - # Here we broadcast along two dimensions to create the mask. - mask = broadcast_ones * to_mask - - return mask - - -def attention_layer(from_tensor, - to_tensor, - attention_mask=None, - num_attention_heads=1, - size_per_head=512, - query_act=None, - key_act=None, - value_act=None, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - do_return_2d_tensor=False, - batch_size=None, - from_seq_length=None, - to_seq_length=None): - """Performs multi-headed attention from `from_tensor` to `to_tensor`. - - This is an implementation of multi-headed attention based on "Attention - is all you Need". If `from_tensor` and `to_tensor` are the same, then - this is self-attention. Each timestep in `from_tensor` attends to the - corresponding sequence in `to_tensor`, and returns a fixed-with vector. - - This function first projects `from_tensor` into a "query" tensor and - `to_tensor` into "key" and "value" tensors. These are (effectively) a list - of tensors of length `num_attention_heads`, where each tensor is of shape - [batch_size, seq_length, size_per_head]. - - Then, the query and key tensors are dot-producted and scaled. These are - softmaxed to obtain attention probabilities. The value tensors are then - interpolated by these probabilities, then concatenated back to a single - tensor and returned. - - In practice, the multi-headed attention are done with transposes and - reshapes rather than actual separate tensors. - - Args: - from_tensor: float Tensor of shape [batch_size, from_seq_length, - from_width]. - to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. - attention_mask: (optional) int32 Tensor of shape [batch_size, - from_seq_length, to_seq_length]. The values should be 1 or 0. The - attention scores will effectively be set to -infinity for any positions in - the mask that are 0, and will be unchaged for positions that are 1. - num_attention_heads: int. Number of attention heads. - size_per_head: int. Size of each attention head. - query_act: (optional) Activation function for the query transform. - key_act: (optional) Activation function for the key transform. - value_act: (optional) Activation function for the value transform. - attention_probs_dropout_prob: - initializer_range: float. Range of the weight initializer. - do_return_2d_tensor: bool. If True, the output will be of shape [batch_size - * from_seq_length, num_attention_heads * size_per_head]. If False, the - output will be of shape [batch_size, from_seq_length, num_attention_heads - * size_per_head]. - batch_size: (Optional) int. If the input is 2D, this might be the batch size - of the 3D version of the `from_tensor` and `to_tensor`. - from_seq_length: (Optional) If the input is 2D, this might be the seq length - of the 3D version of the `from_tensor`. - to_seq_length: (Optional) If the input is 2D, this might be the seq length - of the 3D version of the `to_tensor`. - - Returns: - float Tensor of shape [batch_size, from_seq_length, - num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is - true, this will be of shape [batch_size * from_seq_length, - num_attention_heads * size_per_head]). - - Raises: - ValueError: Any of the arguments or tensor shapes are invalid. - """ - - def transpose_for_scores(input_tensor, batch_size, num_attention_heads, - seq_length, width): - output_tensor = tf.reshape( - input_tensor, [batch_size, seq_length, num_attention_heads, width]) - - output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) - return output_tensor - - from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) - to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) - - if len(from_shape) != len(to_shape): - raise ValueError( - "The rank of `from_tensor` must match the rank of `to_tensor`.") - - if len(from_shape) == 3: - batch_size = from_shape[0] - from_seq_length = from_shape[1] - to_seq_length = to_shape[1] - elif len(from_shape) == 2: - if (batch_size is None or from_seq_length is None or to_seq_length is None): - raise ValueError( - "When passing in rank 2 tensors to attention_layer, the values " - "for `batch_size`, `from_seq_length`, and `to_seq_length` " - "must all be specified.") - - # Scalar dimensions referenced here: - # B = batch size (number of sequences) - # F = `from_tensor` sequence length - # T = `to_tensor` sequence length - # N = `num_attention_heads` - # H = `size_per_head` - - from_tensor_2d = reshape_to_matrix(from_tensor) - to_tensor_2d = reshape_to_matrix(to_tensor) - - # `query_layer` = [B*F, N*H] - query_layer = tf.layers.dense( - from_tensor_2d, - num_attention_heads * size_per_head, - activation=query_act, - name="query", - kernel_initializer=create_initializer(initializer_range)) - - # `key_layer` = [B*T, N*H] - key_layer = tf.layers.dense( - to_tensor_2d, - num_attention_heads * size_per_head, - activation=key_act, - name="key", - kernel_initializer=create_initializer(initializer_range)) - - # `value_layer` = [B*T, N*H] - value_layer = tf.layers.dense( - to_tensor_2d, - num_attention_heads * size_per_head, - activation=value_act, - name="value", - kernel_initializer=create_initializer(initializer_range)) - - # `query_layer` = [B, N, F, H] - query_layer = transpose_for_scores(query_layer, batch_size, - num_attention_heads, from_seq_length, - size_per_head) - - # `key_layer` = [B, N, T, H] - key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, - to_seq_length, size_per_head) - - # Take the dot product between "query" and "key" to get the raw - # attention scores. - # `attention_scores` = [B, N, F, T] - attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) - attention_scores = tf.multiply(attention_scores, - 1.0 / math.sqrt(float(size_per_head))) - - if attention_mask is not None: - # `attention_mask` = [B, 1, F, T] - attention_mask = tf.expand_dims(attention_mask, axis=[1]) - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. - adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 - - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - attention_scores += adder - - # Normalize the attention scores to probabilities. - # `attention_probs` = [B, N, F, T] - attention_probs = tf.nn.softmax(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = dropout(attention_probs, attention_probs_dropout_prob) - - # `value_layer` = [B, T, N, H] - value_layer = tf.reshape( - value_layer, - [batch_size, to_seq_length, num_attention_heads, size_per_head]) - - # `value_layer` = [B, N, T, H] - value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) - - # `context_layer` = [B, N, F, H] - context_layer = tf.matmul(attention_probs, value_layer) - - # `context_layer` = [B, F, N, H] - context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) - - if do_return_2d_tensor: - # `context_layer` = [B*F, N*V] - context_layer = tf.reshape( - context_layer, - [batch_size * from_seq_length, num_attention_heads * size_per_head]) - else: - # `context_layer` = [B, F, N*V] - context_layer = tf.reshape( - context_layer, - [batch_size, from_seq_length, num_attention_heads * size_per_head]) - - return context_layer - - -def transformer_model(input_tensor, - attention_mask=None, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - intermediate_act_fn=gelu, - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - initializer_range=0.02, - do_return_all_layers=False): - """Multi-headed, multi-layer Transformer from "Attention is All You Need". - - This is almost an exact implementation of the original Transformer encoder. - - See the original paper: - https://arxiv.org/abs/1706.03762 - - Also see: - https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py - - Args: - input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. - attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, - seq_length], with 1 for positions that can be attended to and 0 in - positions that should not be. - hidden_size: int. Hidden size of the Transformer. - num_hidden_layers: int. Number of layers (blocks) in the Transformer. - num_attention_heads: int. Number of attention heads in the Transformer. - intermediate_size: int. The size of the "intermediate" (a.k.a., feed - forward) layer. - intermediate_act_fn: function. The non-linear activation function to apply - to the output of the intermediate/feed-forward layer. - hidden_dropout_prob: float. Dropout probability for the hidden layers. - attention_probs_dropout_prob: float. Dropout probability of the attention - probabilities. - initializer_range: float. Range of the initializer (stddev of truncated - normal). - do_return_all_layers: Whether to also return all layers or just the final - layer. - - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size], the final - hidden layer of the Transformer. - - Raises: - ValueError: A Tensor shape or parameter is invalid. - """ - if hidden_size % num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (hidden_size, num_attention_heads)) - - attention_head_size = int(hidden_size / num_attention_heads) - input_shape = get_shape_list(input_tensor, expected_rank=3) - batch_size = input_shape[0] - seq_length = input_shape[1] - input_width = input_shape[2] - - # The Transformer performs sum residuals on all layers so the input needs - # to be the same as the hidden size. - if input_width != hidden_size: - raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % - (input_width, hidden_size)) - - # We keep the representation as a 2D tensor to avoid re-shaping it back and - # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on - # the GPU/CPU but may not be free on the TPU, so we want to minimize them to - # help the optimizer. - prev_output = reshape_to_matrix(input_tensor) - - all_layer_outputs = [] - for layer_idx in range(num_hidden_layers): - with tf.variable_scope("layer_%d" % layer_idx): - layer_input = prev_output - - with tf.variable_scope("attention"): - attention_heads = [] - with tf.variable_scope("self"): - attention_head = attention_layer( - from_tensor=layer_input, - to_tensor=layer_input, - attention_mask=attention_mask, - num_attention_heads=num_attention_heads, - size_per_head=attention_head_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, - initializer_range=initializer_range, - do_return_2d_tensor=True, - batch_size=batch_size, - from_seq_length=seq_length, - to_seq_length=seq_length) - attention_heads.append(attention_head) - - attention_output = None - if len(attention_heads) == 1: - attention_output = attention_heads[0] - else: - # In the case where we have other sequences, we just concatenate - # them to the self-attention head before the projection. - attention_output = tf.concat(attention_heads, axis=-1) - - # Run a linear projection of `hidden_size` then add a residual - # with `layer_input`. - with tf.variable_scope("output"): - attention_output = tf.layers.dense( - attention_output, - hidden_size, - kernel_initializer=create_initializer(initializer_range)) - attention_output = dropout(attention_output, hidden_dropout_prob) - attention_output = layer_norm(attention_output + layer_input) - - # The activation is only applied to the "intermediate" hidden layer. - with tf.variable_scope("intermediate"): - intermediate_output = tf.layers.dense( - attention_output, - intermediate_size, - activation=intermediate_act_fn, - kernel_initializer=create_initializer(initializer_range)) - - # Down-project back to `hidden_size` then add the residual. - with tf.variable_scope("output"): - layer_output = tf.layers.dense( - intermediate_output, - hidden_size, - kernel_initializer=create_initializer(initializer_range)) - layer_output = dropout(layer_output, hidden_dropout_prob) - layer_output = layer_norm(layer_output + attention_output) - prev_output = layer_output - all_layer_outputs.append(layer_output) - - if do_return_all_layers: - final_outputs = [] - for layer_output in all_layer_outputs: - final_output = reshape_from_matrix(layer_output, input_shape) - final_outputs.append(final_output) - return final_outputs - else: - final_output = reshape_from_matrix(prev_output, input_shape) - return final_output - - -def get_shape_list(tensor, expected_rank=None, name=None): - """Returns a list of the shape of tensor, preferring static dimensions. - - Args: - tensor: A tf.Tensor object to find the shape of. - expected_rank: (optional) int. The expected rank of `tensor`. If this is - specified and the `tensor` has a different rank, and exception will be - thrown. - name: Optional name of the tensor for the error message. - - Returns: - A list of dimensions of the shape of tensor. All static dimensions will - be returned as python integers, and dynamic dimensions will be returned - as tf.Tensor scalars. - """ - if name is None: - name = tensor.name - - if expected_rank is not None: - assert_rank(tensor, expected_rank, name) - - shape = tensor.shape.as_list() - - non_static_indexes = [] - for (index, dim) in enumerate(shape): - if dim is None: - non_static_indexes.append(index) - - if not non_static_indexes: - return shape - - dyn_shape = tf.shape(tensor) - for index in non_static_indexes: - shape[index] = dyn_shape[index] - return shape - - -def reshape_to_matrix(input_tensor): - """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" - ndims = input_tensor.shape.ndims - if ndims < 2: - raise ValueError("Input tensor must have at least rank 2. Shape = %s" % - (input_tensor.shape)) - if ndims == 2: - return input_tensor - - width = input_tensor.shape[-1] - output_tensor = tf.reshape(input_tensor, [-1, width]) - return output_tensor - - -def reshape_from_matrix(output_tensor, orig_shape_list): - """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" - if len(orig_shape_list) == 2: - return output_tensor - - output_shape = get_shape_list(output_tensor) - - orig_dims = orig_shape_list[0:-1] - width = output_shape[-1] - - return tf.reshape(output_tensor, orig_dims + [width]) - - -def assert_rank(tensor, expected_rank, name=None): - """Raises an exception if the tensor rank is not of the expected rank. - - Args: - tensor: A tf.Tensor to check the rank of. - expected_rank: Python integer or list of integers, expected rank. - name: Optional name of the tensor for the error message. - - Raises: - ValueError: If the expected shape doesn"t match the actual shape. - """ - if name is None: - name = tensor.name - - expected_rank_dict = {} - if isinstance(expected_rank, six.integer_types): - expected_rank_dict[expected_rank] = True - else: - for x in expected_rank: - expected_rank_dict[x] = True - - actual_rank = tensor.shape.ndims - if actual_rank not in expected_rank_dict: - scope_name = tf.get_variable_scope().name - raise ValueError( - "For the tensor `%s` in scope `%s`, the actual rank " - "`%d` (shape = %s) is not equal to the expected rank `%s`" % - (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) + def forward(self, input_ids, token_type_ids, attention_mask): + embedding_output = self.embeddings(input_ids, token_type_ids) + all_encoder_layers = self.encoder(embedding_output, attention_mask) + return all_encoder_layers