diff --git a/modeling_pytorch.py b/modeling_pytorch.py
index 0ee84c8f6a..f6ca69665c 100644
--- a/modeling_pytorch.py
+++ b/modeling_pytorch.py
@@ -28,6 +28,11 @@ import tensorflow as tf
 import torch
 import torch.nn as nn
 
+def gelu(x):
+    """Gaussian Error Linear Unit (https://arxiv.org/abs/1606.08415), exact erf form to match TF BERT."""
+    # TF BERT says: cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0)))
+    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+
 
 class BertConfig(object):
     """Configuration for `BertModel`."""
@@ -106,30 +111,54 @@ class BertConfig(object):
 
 
 class BERTLayerNorm(nn.Module):
-    def __init__(self):
-        tf.contrib.layers.layer_norm(
-      inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)
+    def __init__(self, config, variance_epsilon=1e-12):
+        """Construct a layernorm module in the TF style (epsilon inside the square root)."""
+        super(BERTLayerNorm, self).__init__()
+        self.gamma = nn.Parameter(torch.ones(config.hidden_size))  # learned scale, one value per hidden unit
+        self.beta = nn.Parameter(torch.zeros(config.hidden_size))  # learned shift, one value per hidden unit
+        self.variance_epsilon = variance_epsilon
+
+    def forward(self, x):
+        # TODO check it's identical to TF implementation in details
+        u = x.mean(-1, keepdim=True)  # mean over the last axis
+        s = (x - u).pow(2).mean(-1, keepdim=True)  # biased variance over the last axis
+        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
+        return self.gamma * x + self.beta
+    # TF call this module stands in for:
+    #   tf.contrib.layers.layer_norm(inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)
 
 class BERTEmbeddings(nn.Module):
     def __init__(self, embedding_size, vocab_size,
                  token_type_vocab_size, max_position_embeddings,
                  config):
+        """Build the word/position/token-type tables, LayerNorm and dropout; all sizes are read from `config`."""
+        super(BERTEmbeddings, self).__init__()  # nn.Module must be initialised before sub-modules are assigned
         self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size)
+
+        # Position embeddings are (normally) a contiguous range so we could use a slice.
+        # Since the position embedding table is a learned variable, we create it
+        # using a (long) sequence length `max_position_embeddings`. The actual
+        # sequence length might be shorter than this, for faster training of
+        # tasks that do not have long sequences.
+        # So `full_position_embeddings` is effectively an embedding table
+        # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
+        # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
+        # perform a slice.
         self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
+
+        # token_type_embeddings vocabulary is very small. TF used one-hot embeddings to speedup.
         self.token_type_embeddings = nn.Embedding(config.token_type_vocab_size, config.embedding_size)
 
-        self.LayerNorm = BERTLayerNorm() # Not snake-cased to fit with TF model variable name
-        self.dropout = nn.dropout(config.hidden_dropout_prob)
-
-        self.initialize_weights(self, config.initializer_range)
-    
-    def initialize_weights(self, initializer_range):
-        torch.truncated_normal_initializer(stddev=initializer_range)
+        self.LayerNorm = BERTLayerNorm(config)  # Not snake-cased to stick with TF model variable name
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
     def forward(self, input_ids, token_type_ids=None):
         batch_size = input_ids.size(0)
         seq_length = input_ids.size(1)
+        # TODO finish this (position_ids should enumerate 0..seq_length-1 for every sequence)
         position_ids = torch.range().view(batch_size, seq_length)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(batch_size, seq_length)
 
         words_embeddings = self.word_embeddings(input_ids)
         position_embeddings = self.position_embeddings(position_ids)
@@ -141,28 +170,6 @@ class BERTEmbeddings(nn.Module):
         return embeddings
 
 
-class BERTIntermediate(nn.Module):
-    def __init__(self, config):
-        super(BERTOutput, self).__init__()
-        self.dense = nn.Linear()
-
-    def forward(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        return hidden_states
-
-
-class BERTOutput(nn.Module):
-    def __init__(self, config):
-        super(BERTOutput, self).__init__()
-        self.dense = nn.Linear()
-        self.LayerNorm = BERTLayerNorm(config)
-
-    def forward(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.LayerNorm(hidden_states)
-        return hidden_states
-
-
 class BERTSelfAttention(nn.Module):
     def __init__(self, config):
         super(BERTSelfAttention, self).__init__()
@@ -170,22 +177,84 @@ class BERTSelfAttention(nn.Module):
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
                 "heads (%d)" % (config.hidden_size, config.num_attention_heads))
-        attention_head_size = int(config.hidden_size / config.num_attention_heads)
-        all_head_size = num_attention_heads * attention_head_size
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
 
-        self.query = nn.Linear(config.hidden_size, all_head_size)
-        self.key = nn.Linear(config.hidden_size, all_head_size)
-        self.value = nn.Linear(config.hidden_size, all_head_size)
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
 
-    def transpose_for_scores(self, x, k=False):
-        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
-        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
-        if k:
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, input_tensor, num_attention_heads=None, is_key_tensor=False):
+        """Split the last dim into (heads, head_size), then permute to (0, 2, 1, 3) or (0, 2, 3, 1) for keys; `num_attention_heads` is unused."""
+        x = input_tensor.view(*(input_tensor.size()[:-1] + (self.num_attention_heads, self.attention_head_size)))
+        if is_key_tensor:
             return x.permute(0, 2, 3, 1)
         else:
             return x.permute(0, 2, 1, 3)
 
-    def forward(self, hidden_states):
+    def forward(self, hidden_states, attention_mask):
+        """Multi-head self-attention; `attention_mask` holds 1.0 for attendable and 0.0 for masked positions."""
+        # Scalar dimensions referenced here:
+        #   B = batch size (number of sequences)
+        #   F = `from_tensor` sequence length
+        #   T = `to_tensor` sequence length
+        #   N = `num_attention_heads`
+        #   H = `size_per_head`
+        query_layer = self.query(hidden_states)
+        key_layer = self.key(hidden_states)
+        value_layer = self.value(hidden_states)
+
+        query_layer = self.transpose_for_scores(query_layer)
+        key_layer = self.transpose_for_scores(key_layer, is_key_tensor=True)
+        value_layer = self.transpose_for_scores(value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw
+        # attention scores.
+        # `attention_scores` = [B, N, F, T]
+        attention_scores = torch.matmul(query_layer, key_layer)
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+        # TODO clean up this (precompute)
+        # `attention_mask` = [B, 1, F, T] (assumes a [B, F, T] input) so that it broadcasts over the N heads
+        attention_mask = attention_mask.unsqueeze(1).to(attention_scores.dtype)
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        adder = (1.0 - attention_mask) * -10000.0
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        attention_scores += adder
+
+        # Normalize the attention scores to probabilities.
+        # `attention_probs` = [B, N, F, T]
+        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        return context_layer
+
+
+class BERTSelfOutput(nn.Module):
+    """Projects the attention output, applies dropout, then LayerNorm over (projection + residual input)."""
+    def __init__(self, config):
+        super(BERTSelfOutput, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = BERTLayerNorm(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        # `hidden_states` (the attention output) is what gets projected; `input_tensor` only feeds the residual.
+        hidden_states = self.LayerNorm(self.dropout(self.dense(hidden_states)) + input_tensor)
         return hidden_states
 
 
@@ -193,11 +262,37 @@ class BERTAttention(nn.Module):
     def __init__(self, config):
         super(BERTAttention, self).__init__()
         self.self = BERTSelfAttention(config)
-        self.output = BERTOutput(config)
+        self.output = BERTSelfOutput(config)  # post-attention dense + dropout + residual LayerNorm
+
+    def forward(self, input_tensor, attention_mask):
+        # Self-attention first; its result and the untouched input (residual) then go through the output block.
+        self_output = self.self(input_tensor, attention_mask)
+        return self.output(self_output, input_tensor)
+
+
+class BERTIntermediate(nn.Module):
+    """Position-wise expansion: Linear(hidden_size -> intermediate_size) followed by `gelu`."""
+    def __init__(self, config):
+        super(BERTIntermediate, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.intermediate_act_fn = gelu
 
     def forward(self, hidden_states):
-        hidden_states = self.self(hidden_states)
-        hidden_states = self.output(hidden_states)
+        # Project up to intermediate_size, then apply the activation.
+        return self.intermediate_act_fn(self.dense(hidden_states))
+
+
+class BERTOutput(nn.Module):
+    """Projects intermediate_size back to hidden_size, applies dropout, then LayerNorm over the residual sum."""
+    def __init__(self, config):
+        super(BERTOutput, self).__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = BERTLayerNorm(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        # `hidden_states` (intermediate output) is what gets projected; `input_tensor` only feeds the residual.
+        hidden_states = self.LayerNorm(self.dropout(self.dense(hidden_states)) + input_tensor)
         return hidden_states
 
 
@@ -208,10 +303,10 @@ class BERTLayer(nn.Module):
         self.intermediate = BERTIntermediate(config)
         self.output = BERTOutput(config)
 
-    def forward(self, hidden_states):
-        hidden_states = self.attention(hidden_states)
-        hidden_states = self.intermediate(hidden_states)
-        hidden_states = self.output(hidden_states)
+    def forward(self, hidden_states, attention_mask):
+        # attention -> intermediate -> output; the result is bound to `hidden_states` so it is what gets returned.
+        attention_output = self.attention(hidden_states, attention_mask)
+        hidden_states = self.output(self.intermediate(attention_output), attention_output)
         return hidden_states
 
 
@@ -221,7 +316,7 @@ class BERTEncoder(nn.Module):
         layer = BERTLayer(n_ctx, cfg, scale=True)
         self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])    
 
-    def forward(self, hidden_states):
+    def forward(self, hidden_states, attention_mask):
         """
         Args:
             hidden_states: float Tensor of shape [batch_size, seq_length, hidden_size]
@@ -229,7 +324,25 @@ class BERTEncoder(nn.Module):
             float Tensor of shape [batch_size, seq_length, hidden_size]
         """
         for layer_module in self.layer:
-            hidden_states = layer_module(hidden_states)
+            hidden_states = layer_module(hidden_states, attention_mask)
+        return hidden_states
+
+
+class BERTPooler(nn.Module):
+    """Pools a sequence into one vector: dense + tanh applied to the first token's hidden state."""
+    def __init__(self, config):
+        super(BERTPooler, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states, attention_mask=None):
+        """
+        Args:
+            hidden_states: float Tensor of shape [batch_size, seq_length, hidden_size]
+            attention_mask: unused; accepted so existing two-argument calls keep working
+        Return: float Tensor of shape [batch_size, hidden_size]
+        """
+        hidden_states = self.activation(self.dense(hidden_states[:, 0]))
         return hidden_states
 
 
@@ -278,836 +391,9 @@ class BertModel(nn.Module):
 
         self.embeddings = BERTEmbeddings(config)
         self.encoder = BERTEncoder(config)
+        self.pooler = BERTPooler(config)
 
-
-        def forward(self, input_ids, token_type_ids=None, input_mask=None):
-            if input_mask is None:
-                input_mask = torch.ones(batch_size, seq_length), dtype=torch.long)
-
-            if token_type_ids is None:
-                token_type_ids = torch.zeros((batch_size, seq_length), dtype=torch.long)
-
-            hidden_states = self.embeddings(input_ids, token_type_ids, input_mask)
-            hidden_states = self.encoder(hidden_states)
-
-            # Perform embedding lookup on the word ids.
-            (self.embedding_output, self.embedding_table) = embedding_lookup(
-                    input_ids=input_ids,
-                    vocab_size=config.vocab_size,
-                    embedding_size=config.hidden_size,
-                    initializer_range=config.initializer_range,
-                    word_embedding_name="word_embeddings",
-                    use_one_hot_embeddings=use_one_hot_embeddings)
-
-            # Add positional embeddings and token type embeddings, then layer
-            # normalize and perform dropout.
-            self.embedding_output = embedding_postprocessor(
-                    input_tensor=self.embedding_output,
-                    use_token_type=True,
-                    token_type_ids=token_type_ids,
-                    token_type_vocab_size=config.type_vocab_size,
-                    token_type_embedding_name="token_type_embeddings",
-                    use_position_embeddings=True,
-                    position_embedding_name="position_embeddings",
-                    initializer_range=config.initializer_range,
-                    max_position_embeddings=config.max_position_embeddings,
-                    dropout_prob=config.hidden_dropout_prob)
-
-        with tf.variable_scope("encoder"):
-            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
-            # mask of shape [batch_size, seq_length, seq_length] which is used
-            # for the attention scores.
-            attention_mask = create_attention_mask_from_input_mask(
-                    input_ids, input_mask)
-
-            # Run the stacked transformer.
-            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
-            self.all_encoder_layers = transformer_model(
-                    input_tensor=self.embedding_output,
-                    attention_mask=attention_mask,
-                    hidden_size=config.hidden_size,
-                    num_hidden_layers=config.num_hidden_layers,
-                    num_attention_heads=config.num_attention_heads,
-                    intermediate_size=config.intermediate_size,
-                    intermediate_act_fn=get_activation(config.hidden_act),
-                    hidden_dropout_prob=config.hidden_dropout_prob,
-                    attention_probs_dropout_prob=config.attention_probs_dropout_prob,
-                    initializer_range=config.initializer_range,
-                    do_return_all_layers=True)
-
-        self.sequence_output = self.all_encoder_layers[-1]
-        # The "pooler" converts the encoded sequence tensor of shape
-        # [batch_size, seq_length, hidden_size] to a tensor of shape
-        # [batch_size, hidden_size]. This is necessary for segment-level
-        # (or segment-pair-level) classification tasks where we need a fixed
-        # dimensional representation of the segment.
-        with tf.variable_scope("pooler"):
-            # We "pool" the model by simply taking the hidden state corresponding
-            # to the first token. We assume that this has been pre-trained
-            first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
-            self.pooled_output = tf.layers.dense(
-                    first_token_tensor,
-                    config.hidden_size,
-                    activation=tf.tanh,
-                    kernel_initializer=create_initializer(config.initializer_range))
-
-    def get_pooled_output(self):
-        return self.pooled_output
-
-    def get_sequence_output(self):
-        """Gets final hidden layer of encoder.
-
-        Returns:
-            float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
-            to the final hidden of the transformer encoder.
-        """
-        return self.sequence_output
-
-    def get_all_encoder_layers(self):
-        return self.all_encoder_layers
-
-    def get_embedding_output(self):
-        """Gets output of the embedding lookup (i.e., input to the transformer).
-
-        Returns:
-            float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
-            to the output of the embedding layer, after summing the word
-            embeddings with the positional embeddings and the token type embeddings,
-            then performing layer normalization. This is the input to the transformer.
-        """
-        return self.embedding_output
-
-    def get_embedding_table(self):
-        return self.embedding_table
-
-
-def gelu(input_tensor):
-    """Gaussian Error Linear Unit.
-
-    This is a smoother version of the RELU.
-    Original paper: https://arxiv.org/abs/1606.08415
-
-    Args:
-        input_tensor: float Tensor to perform activation.
-
-    Returns:
-        `input_tensor` with the GELU activation applied.
-    """
-    cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0)))
-    return input_tensor * cdf
-
-
-def get_activation(activation_string):
-    """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.
-
-    Args:
-        activation_string: String name of the activation function.
-
-    Returns:
-        A Python function corresponding to the activation function. If
-        `activation_string` is None, empty, or "linear", this will return None.
-        If `activation_string` is not a string, it will return `activation_string`.
-
-    Raises:
-        ValueError: The `activation_string` does not correspond to a known
-            activation.
-    """
-
-    # We assume that anything that"s not a string is already an activation
-    # function, so we just return it.
-    if not isinstance(activation_string, six.string_types):
-        return activation_string
-
-    if not activation_string:
-        return None
-
-    act = activation_string.lower()
-    if act == "linear":
-        return None
-    elif act == "relu":
-        return tf.nn.relu
-    elif act == "gelu":
-        return gelu
-    elif act == "tanh":
-        return tf.tanh
-    else:
-        raise ValueError("Unsupported activation: %s" % act)
-
-
-def get_assigment_map_from_checkpoint(tvars, init_checkpoint):
-    """Compute the union of the current variables and checkpoint variables."""
-    assignment_map = {}
-    initialized_variable_names = {}
-
-    name_to_variable = collections.OrderedDict()
-    for var in tvars:
-        name = var.name
-        m = re.match("^(.*):\\d+$", name)
-        if m is not None:
-            name = m.group(1)
-        name_to_variable[name] = var
-
-    init_vars = tf.train.list_variables(init_checkpoint)
-
-    assignment_map = collections.OrderedDict()
-    for x in init_vars:
-        (name, var) = (x[0], x[1])
-        if name not in name_to_variable:
-            continue
-        assignment_map[name] = name
-        initialized_variable_names[name] = 1
-        initialized_variable_names[name + ":0"] = 1
-
-    return (assignment_map, initialized_variable_names)
-
-
-def dropout(input_tensor, dropout_prob):
-    """Perform dropout.
-
-    Args:
-        input_tensor: float Tensor.
-        dropout_prob: Python float. The probabiltiy of dropping out a value (NOT of
-            *keeping* a dimension as in `tf.nn.dropout`).
-
-    Returns:
-        A version of `input_tensor` with dropout applied.
-    """
-    if dropout_prob is None or dropout_prob == 0.0:
-        return input_tensor
-
-    output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
-    return output
-
-
-def layer_norm(input_tensor, name=None):
-    """Run layer normalization on the last dimension of the tensor."""
-    return tf.contrib.layers.layer_norm(
-            inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)
-
-
-def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
-    """Runs layer normalization followed by dropout."""
-    output_tensor = layer_norm(input_tensor, name)
-    output_tensor = dropout(output_tensor, dropout_prob)
-    return output_tensor
-
-
-def create_initializer(initializer_range=0.02):
-    """Creates a `truncated_normal_initializer` with the given range."""
-    return tf.truncated_normal_initializer(stddev=initializer_range)
-
-
-def embedding_lookup(input_ids,
-                                         vocab_size,
-                                         embedding_size=128,
-                                         initializer_range=0.02,
-                                         word_embedding_name="word_embeddings",
-                                         use_one_hot_embeddings=False):
-    """Looks up words embeddings for id tensor.
-
-    Args:
-        input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
-            ids.
-        vocab_size: int. Size of the embedding vocabulary.
-        embedding_size: int. Width of the word embeddings.
-        initializer_range: float. Embedding initialization range.
-        word_embedding_name: string. Name of the embedding table.
-        use_one_hot_embeddings: bool. If True, use one-hot method for word
-            embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better
-            for TPUs.
-
-    Returns:
-        float Tensor of shape [batch_size, seq_length, embedding_size].
-    """
-    # This function assumes that the input is of shape [batch_size, seq_length,
-    # num_inputs].
-    #
-    # If the input is a 2D tensor of shape [batch_size, seq_length], we
-    # reshape to [batch_size, seq_length, 1].
-    if input_ids.shape.ndims == 2:
-        input_ids = tf.expand_dims(input_ids, axis=[-1])
-
-    embedding_table = tf.get_variable(
-            name=word_embedding_name,
-            shape=[vocab_size, embedding_size],
-            initializer=create_initializer(initializer_range))
-
-    if use_one_hot_embeddings:
-        flat_input_ids = tf.reshape(input_ids, [-1])
-        one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
-        output = tf.matmul(one_hot_input_ids, embedding_table)
-    else:
-        output = tf.nn.embedding_lookup(embedding_table, input_ids)
-
-    input_shape = get_shape_list(input_ids)
-
-    output = tf.reshape(output,
-                                            input_shape[0:-1] + [input_shape[-1] * embedding_size])
-    return (output, embedding_table)
-
-
-def embedding_postprocessor(input_tensor,
-                                                        use_token_type=False,
-                                                        token_type_ids=None,
-                                                        token_type_vocab_size=16,
-                                                        token_type_embedding_name="token_type_embeddings",
-                                                        use_position_embeddings=True,
-                                                        position_embedding_name="position_embeddings",
-                                                        initializer_range=0.02,
-                                                        max_position_embeddings=512,
-                                                        dropout_prob=0.1):
-    """Performs various post-processing on a word embedding tensor.
-
-    Args:
-        input_tensor: float Tensor of shape [batch_size, seq_length,
-            embedding_size].
-        use_token_type: bool. Whether to add embeddings for `token_type_ids`.
-        token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
-            Must be specified if `use_token_type` is True.
-        token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
-        token_type_embedding_name: string. The name of the embedding table variable
-            for token type ids.
-        use_position_embeddings: bool. Whether to add position embeddings for the
-            position of each token in the sequence.
-        position_embedding_name: string. The name of the embedding table variable
-            for positional embeddings.
-        initializer_range: float. Range of the weight initialization.
-        max_position_embeddings: int. Maximum sequence length that might ever be
-            used with this model. This can be longer than the sequence length of
-            input_tensor, but cannot be shorter.
-        dropout_prob: float. Dropout probability applied to the final output tensor.
-
-    Returns:
-        float tensor with same shape as `input_tensor`.
-
-    Raises:
-        ValueError: One of the tensor shapes or input values is invalid.
-    """
-    input_shape = get_shape_list(input_tensor, expected_rank=3)
-    batch_size = input_shape[0]
-    seq_length = input_shape[1]
-    width = input_shape[2]
-
-    if seq_length > max_position_embeddings:
-        raise ValueError("The seq length (%d) cannot be greater than "
-                                         "`max_position_embeddings` (%d)" %
-                                         (seq_length, max_position_embeddings))
-
-    output = input_tensor
-
-    if use_token_type:
-        if token_type_ids is None:
-            raise ValueError("`token_type_ids` must be specified if"
-                                             "`use_token_type` is True.")
-        token_type_table = tf.get_variable(
-                name=token_type_embedding_name,
-                shape=[token_type_vocab_size, width],
-                initializer=create_initializer(initializer_range))
-        # This vocab will be small so we always do one-hot here, since it is always
-        # faster for a small vocabulary.
-        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
-        one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
-        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
-        token_type_embeddings = tf.reshape(token_type_embeddings,
-                                                                             [batch_size, seq_length, width])
-        output += token_type_embeddings
-
-    if use_position_embeddings:
-        full_position_embeddings = tf.get_variable(
-                name=position_embedding_name,
-                shape=[max_position_embeddings, width],
-                initializer=create_initializer(initializer_range))
-        # Since the position embedding table is a learned variable, we create it
-        # using a (long) sequence length `max_position_embeddings`. The actual
-        # sequence length might be shorter than this, for faster training of
-        # tasks that do not have long sequences.
-        #
-        # So `full_position_embeddings` is effectively an embedding table
-        # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
-        # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
-        # perform a slice.
-        if seq_length < max_position_embeddings:
-            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
-                                                                         [seq_length, -1])
-        else:
-            position_embeddings = full_position_embeddings
-
-        num_dims = len(output.shape.as_list())
-
-        # Only the last two dimensions are relevant (`seq_length` and `width`), so
-        # we broadcast among the first dimensions, which is typically just
-        # the batch size.
-        position_broadcast_shape = []
-        for _ in range(num_dims - 2):
-            position_broadcast_shape.append(1)
-        position_broadcast_shape.extend([seq_length, width])
-        position_embeddings = tf.reshape(position_embeddings,
-                                                                         position_broadcast_shape)
-        output += position_embeddings
-
-    output = layer_norm_and_dropout(output, dropout_prob)
-    return output
-
-
-def create_attention_mask_from_input_mask(from_tensor, to_mask):
-    """Create 3D attention mask from a 2D tensor mask.
-
-    Args:
-        from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
-        to_mask: int32 Tensor of shape [batch_size, to_seq_length].
-
-    Returns:
-        float Tensor of shape [batch_size, from_seq_length, to_seq_length].
-    """
-    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
-    batch_size = from_shape[0]
-    from_seq_length = from_shape[1]
-
-    to_shape = get_shape_list(to_mask, expected_rank=2)
-    to_seq_length = to_shape[1]
-
-    to_mask = tf.cast(
-            tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)
-
-    # We don't assume that `from_tensor` is a mask (although it could be). We
-    # don't actually care if we attend *from* padding tokens (only *to* padding)
-    # tokens so we create a tensor of all ones.
-    #
-    # `broadcast_ones` = [batch_size, from_seq_length, 1]
-    broadcast_ones = tf.ones(
-            shape=[batch_size, from_seq_length, 1], dtype=tf.float32)
-
-    # Here we broadcast along two dimensions to create the mask.
-    mask = broadcast_ones * to_mask
-
-    return mask
-
-
-def attention_layer(from_tensor,
-                                        to_tensor,
-                                        attention_mask=None,
-                                        num_attention_heads=1,
-                                        size_per_head=512,
-                                        query_act=None,
-                                        key_act=None,
-                                        value_act=None,
-                                        attention_probs_dropout_prob=0.0,
-                                        initializer_range=0.02,
-                                        do_return_2d_tensor=False,
-                                        batch_size=None,
-                                        from_seq_length=None,
-                                        to_seq_length=None):
-    """Performs multi-headed attention from `from_tensor` to `to_tensor`.
-
-    This is an implementation of multi-headed attention based on "Attention
-    is all you Need". If `from_tensor` and `to_tensor` are the same, then
-    this is self-attention. Each timestep in `from_tensor` attends to the
-    corresponding sequence in `to_tensor`, and returns a fixed-with vector.
-
-    This function first projects `from_tensor` into a "query" tensor and
-    `to_tensor` into "key" and "value" tensors. These are (effectively) a list
-    of tensors of length `num_attention_heads`, where each tensor is of shape
-    [batch_size, seq_length, size_per_head].
-
-    Then, the query and key tensors are dot-producted and scaled. These are
-    softmaxed to obtain attention probabilities. The value tensors are then
-    interpolated by these probabilities, then concatenated back to a single
-    tensor and returned.
-
-    In practice, the multi-headed attention are done with transposes and
-    reshapes rather than actual separate tensors.
-
-    Args:
-        from_tensor: float Tensor of shape [batch_size, from_seq_length,
-            from_width].
-        to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
-        attention_mask: (optional) int32 Tensor of shape [batch_size,
-            from_seq_length, to_seq_length]. The values should be 1 or 0. The
-            attention scores will effectively be set to -infinity for any positions in
-            the mask that are 0, and will be unchaged for positions that are 1.
-        num_attention_heads: int. Number of attention heads.
-        size_per_head: int. Size of each attention head.
-        query_act: (optional) Activation function for the query transform.
-        key_act: (optional) Activation function for the key transform.
-        value_act: (optional) Activation function for the value transform.
-        attention_probs_dropout_prob:
-        initializer_range: float. Range of the weight initializer.
-        do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
-            * from_seq_length, num_attention_heads * size_per_head]. If False, the
-            output will be of shape [batch_size, from_seq_length, num_attention_heads
-            * size_per_head].
-        batch_size: (Optional) int. If the input is 2D, this might be the batch size
-            of the 3D version of the `from_tensor` and `to_tensor`.
-        from_seq_length: (Optional) If the input is 2D, this might be the seq length
-            of the 3D version of the `from_tensor`.
-        to_seq_length: (Optional) If the input is 2D, this might be the seq length
-            of the 3D version of the `to_tensor`.
-
-    Returns:
-        float Tensor of shape [batch_size, from_seq_length,
-            num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
-            true, this will be of shape [batch_size * from_seq_length,
-            num_attention_heads * size_per_head]).
-
-    Raises:
-        ValueError: Any of the arguments or tensor shapes are invalid.
-    """
-
-    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
-                                                     seq_length, width):
-        output_tensor = tf.reshape(
-                input_tensor, [batch_size, seq_length, num_attention_heads, width])
-
-        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
-        return output_tensor
-
-    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
-    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
-
-    if len(from_shape) != len(to_shape):
-        raise ValueError(
-                "The rank of `from_tensor` must match the rank of `to_tensor`.")
-
-    if len(from_shape) == 3:
-        batch_size = from_shape[0]
-        from_seq_length = from_shape[1]
-        to_seq_length = to_shape[1]
-    elif len(from_shape) == 2:
-        if (batch_size is None or from_seq_length is None or to_seq_length is None):
-            raise ValueError(
-                    "When passing in rank 2 tensors to attention_layer, the values "
-                    "for `batch_size`, `from_seq_length`, and `to_seq_length` "
-                    "must all be specified.")
-
-    # Scalar dimensions referenced here:
-    #   B = batch size (number of sequences)
-    #   F = `from_tensor` sequence length
-    #   T = `to_tensor` sequence length
-    #   N = `num_attention_heads`
-    #   H = `size_per_head`
-
-    from_tensor_2d = reshape_to_matrix(from_tensor)
-    to_tensor_2d = reshape_to_matrix(to_tensor)
-
-    # `query_layer` = [B*F, N*H]
-    query_layer = tf.layers.dense(
-            from_tensor_2d,
-            num_attention_heads * size_per_head,
-            activation=query_act,
-            name="query",
-            kernel_initializer=create_initializer(initializer_range))
-
-    # `key_layer` = [B*T, N*H]
-    key_layer = tf.layers.dense(
-            to_tensor_2d,
-            num_attention_heads * size_per_head,
-            activation=key_act,
-            name="key",
-            kernel_initializer=create_initializer(initializer_range))
-
-    # `value_layer` = [B*T, N*H]
-    value_layer = tf.layers.dense(
-            to_tensor_2d,
-            num_attention_heads * size_per_head,
-            activation=value_act,
-            name="value",
-            kernel_initializer=create_initializer(initializer_range))
-
-    # `query_layer` = [B, N, F, H]
-    query_layer = transpose_for_scores(query_layer, batch_size,
-                                                                         num_attention_heads, from_seq_length,
-                                                                         size_per_head)
-
-    # `key_layer` = [B, N, T, H]
-    key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
-                                                                     to_seq_length, size_per_head)
-
-    # Take the dot product between "query" and "key" to get the raw
-    # attention scores.
-    # `attention_scores` = [B, N, F, T]
-    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
-    attention_scores = tf.multiply(attention_scores,
-                                                                 1.0 / math.sqrt(float(size_per_head)))
-
-    if attention_mask is not None:
-        # `attention_mask` = [B, 1, F, T]
-        attention_mask = tf.expand_dims(attention_mask, axis=[1])
-
-        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-        # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
-        adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
-
-        # Since we are adding it to the raw scores before the softmax, this is
-        # effectively the same as removing these entirely.
-        attention_scores += adder
-
-    # Normalize the attention scores to probabilities.
-    # `attention_probs` = [B, N, F, T]
-    attention_probs = tf.nn.softmax(attention_scores)
-
-    # This is actually dropping out entire tokens to attend to, which might
-    # seem a bit unusual, but is taken from the original Transformer paper.
-    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
-
-    # `value_layer` = [B, T, N, H]
-    value_layer = tf.reshape(
-            value_layer,
-            [batch_size, to_seq_length, num_attention_heads, size_per_head])
-
-    # `value_layer` = [B, N, T, H]
-    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])
-
-    # `context_layer` = [B, N, F, H]
-    context_layer = tf.matmul(attention_probs, value_layer)
-
-    # `context_layer` = [B, F, N, H]
-    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
-
-    if do_return_2d_tensor:
-        # `context_layer` = [B*F, N*V]
-        context_layer = tf.reshape(
-                context_layer,
-                [batch_size * from_seq_length, num_attention_heads * size_per_head])
-    else:
-        # `context_layer` = [B, F, N*V]
-        context_layer = tf.reshape(
-                context_layer,
-                [batch_size, from_seq_length, num_attention_heads * size_per_head])
-
-    return context_layer
-
-
-def transformer_model(input_tensor,
-                                            attention_mask=None,
-                                            hidden_size=768,
-                                            num_hidden_layers=12,
-                                            num_attention_heads=12,
-                                            intermediate_size=3072,
-                                            intermediate_act_fn=gelu,
-                                            hidden_dropout_prob=0.1,
-                                            attention_probs_dropout_prob=0.1,
-                                            initializer_range=0.02,
-                                            do_return_all_layers=False):
-    """Multi-headed, multi-layer Transformer from "Attention is All You Need".
-
-    This is almost an exact implementation of the original Transformer encoder.
-
-    See the original paper:
-    https://arxiv.org/abs/1706.03762
-
-    Also see:
-    https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py
-
-    Args:
-        input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
-        attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
-            seq_length], with 1 for positions that can be attended to and 0 in
-            positions that should not be.
-        hidden_size: int. Hidden size of the Transformer.
-        num_hidden_layers: int. Number of layers (blocks) in the Transformer.
-        num_attention_heads: int. Number of attention heads in the Transformer.
-        intermediate_size: int. The size of the "intermediate" (a.k.a., feed
-            forward) layer.
-        intermediate_act_fn: function. The non-linear activation function to apply
-            to the output of the intermediate/feed-forward layer.
-        hidden_dropout_prob: float. Dropout probability for the hidden layers.
-        attention_probs_dropout_prob: float. Dropout probability of the attention
-            probabilities.
-        initializer_range: float. Range of the initializer (stddev of truncated
-            normal).
-        do_return_all_layers: Whether to also return all layers or just the final
-            layer.
-
-    Returns:
-        float Tensor of shape [batch_size, seq_length, hidden_size], the final
-        hidden layer of the Transformer.
-
-    Raises:
-        ValueError: A Tensor shape or parameter is invalid.
-    """
-    if hidden_size % num_attention_heads != 0:
-        raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (hidden_size, num_attention_heads))
-
-    attention_head_size = int(hidden_size / num_attention_heads)
-    input_shape = get_shape_list(input_tensor, expected_rank=3)
-    batch_size = input_shape[0]
-    seq_length = input_shape[1]
-    input_width = input_shape[2]
-
-    # The Transformer performs sum residuals on all layers so the input needs
-    # to be the same as the hidden size.
-    if input_width != hidden_size:
-        raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
-                                         (input_width, hidden_size))
-
-    # We keep the representation as a 2D tensor to avoid re-shaping it back and
-    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
-    # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
-    # help the optimizer.
-    prev_output = reshape_to_matrix(input_tensor)
-
-    all_layer_outputs = []
-    for layer_idx in range(num_hidden_layers):
-        with tf.variable_scope("layer_%d" % layer_idx):
-            layer_input = prev_output
-
-            with tf.variable_scope("attention"):
-                attention_heads = []
-                with tf.variable_scope("self"):
-                    attention_head = attention_layer(
-                            from_tensor=layer_input,
-                            to_tensor=layer_input,
-                            attention_mask=attention_mask,
-                            num_attention_heads=num_attention_heads,
-                            size_per_head=attention_head_size,
-                            attention_probs_dropout_prob=attention_probs_dropout_prob,
-                            initializer_range=initializer_range,
-                            do_return_2d_tensor=True,
-                            batch_size=batch_size,
-                            from_seq_length=seq_length,
-                            to_seq_length=seq_length)
-                    attention_heads.append(attention_head)
-
-                attention_output = None
-                if len(attention_heads) == 1:
-                    attention_output = attention_heads[0]
-                else:
-                    # In the case where we have other sequences, we just concatenate
-                    # them to the self-attention head before the projection.
-                    attention_output = tf.concat(attention_heads, axis=-1)
-
-                # Run a linear projection of `hidden_size` then add a residual
-                # with `layer_input`.
-                with tf.variable_scope("output"):
-                    attention_output = tf.layers.dense(
-                            attention_output,
-                            hidden_size,
-                            kernel_initializer=create_initializer(initializer_range))
-                    attention_output = dropout(attention_output, hidden_dropout_prob)
-                    attention_output = layer_norm(attention_output + layer_input)
-
-            # The activation is only applied to the "intermediate" hidden layer.
-            with tf.variable_scope("intermediate"):
-                intermediate_output = tf.layers.dense(
-                        attention_output,
-                        intermediate_size,
-                        activation=intermediate_act_fn,
-                        kernel_initializer=create_initializer(initializer_range))
-
-            # Down-project back to `hidden_size` then add the residual.
-            with tf.variable_scope("output"):
-                layer_output = tf.layers.dense(
-                        intermediate_output,
-                        hidden_size,
-                        kernel_initializer=create_initializer(initializer_range))
-                layer_output = dropout(layer_output, hidden_dropout_prob)
-                layer_output = layer_norm(layer_output + attention_output)
-                prev_output = layer_output
-                all_layer_outputs.append(layer_output)
-
-    if do_return_all_layers:
-        final_outputs = []
-        for layer_output in all_layer_outputs:
-            final_output = reshape_from_matrix(layer_output, input_shape)
-            final_outputs.append(final_output)
-        return final_outputs
-    else:
-        final_output = reshape_from_matrix(prev_output, input_shape)
-        return final_output
-
-
-def get_shape_list(tensor, expected_rank=None, name=None):
-    """Returns a list of the shape of tensor, preferring static dimensions.
-
-    Args:
-        tensor: A tf.Tensor object to find the shape of.
-        expected_rank: (optional) int. The expected rank of `tensor`. If this is
-            specified and the `tensor` has a different rank, and exception will be
-            thrown.
-        name: Optional name of the tensor for the error message.
-
-    Returns:
-        A list of dimensions of the shape of tensor. All static dimensions will
-        be returned as python integers, and dynamic dimensions will be returned
-        as tf.Tensor scalars.
-    """
-    if name is None:
-        name = tensor.name
-
-    if expected_rank is not None:
-        assert_rank(tensor, expected_rank, name)
-
-    shape = tensor.shape.as_list()
-
-    non_static_indexes = []
-    for (index, dim) in enumerate(shape):
-        if dim is None:
-            non_static_indexes.append(index)
-
-    if not non_static_indexes:
-        return shape
-
-    dyn_shape = tf.shape(tensor)
-    for index in non_static_indexes:
-        shape[index] = dyn_shape[index]
-    return shape
-
-
-def reshape_to_matrix(input_tensor):
-    """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
-    ndims = input_tensor.shape.ndims
-    if ndims < 2:
-        raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
-                                         (input_tensor.shape))
-    if ndims == 2:
-        return input_tensor
-
-    width = input_tensor.shape[-1]
-    output_tensor = tf.reshape(input_tensor, [-1, width])
-    return output_tensor
-
-
-def reshape_from_matrix(output_tensor, orig_shape_list):
-    """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
-    if len(orig_shape_list) == 2:
-        return output_tensor
-
-    output_shape = get_shape_list(output_tensor)
-
-    orig_dims = orig_shape_list[0:-1]
-    width = output_shape[-1]
-
-    return tf.reshape(output_tensor, orig_dims + [width])
-
-
-def assert_rank(tensor, expected_rank, name=None):
-    """Raises an exception if the tensor rank is not of the expected rank.
-
-    Args:
-        tensor: A tf.Tensor to check the rank of.
-        expected_rank: Python integer or list of integers, expected rank.
-        name: Optional name of the tensor for the error message.
-
-    Raises:
-        ValueError: If the expected shape doesn"t match the actual shape.
-    """
-    if name is None:
-        name = tensor.name
-
-    expected_rank_dict = {}
-    if isinstance(expected_rank, six.integer_types):
-        expected_rank_dict[expected_rank] = True
-    else:
-        for x in expected_rank:
-            expected_rank_dict[x] = True
-
-    actual_rank = tensor.shape.ndims
-    if actual_rank not in expected_rank_dict:
-        scope_name = tf.get_variable_scope().name
-        raise ValueError(
-                "For the tensor `%s` in scope `%s`, the actual rank "
-                "`%d` (shape = %s) is not equal to the expected rank `%s`" %
-                (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
+    def forward(self, input_ids, token_type_ids, attention_mask):
+        # Embed the ids, then pass the embeddings and the mask to the encoder.
+        embedded = self.embeddings(input_ids, token_type_ids)
+        return self.encoder(embedded, attention_mask)