consistent nn. and nn.functional: p2 templates (#12153)
This commit is contained in:
@@ -711,7 +711,7 @@ defined by the name of the class attribute you give the layer. Let's
|
|||||||
define a dummy model in PyTorch, called `SimpleModel` as follows:
|
define a dummy model in PyTorch, called `SimpleModel` as follows:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import torch.nn as nn
|
from torch import nn
|
||||||
|
|
||||||
class SimpleModel(nn.Module):
|
class SimpleModel(nn.Module):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|||||||
@@ -1542,7 +1542,6 @@ import random
|
|||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.nn import CrossEntropyLoss
|
from torch.nn import CrossEntropyLoss
|
||||||
|
|
||||||
@@ -1743,7 +1742,7 @@ class {{cookiecutter.camelcase_modelname}}Attention(nn.Module):
|
|||||||
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
|
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
|
||||||
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
||||||
|
|
||||||
attn_weights = F.softmax(attn_weights, dim=-1)
|
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
|
||||||
|
|
||||||
if layer_head_mask is not None:
|
if layer_head_mask is not None:
|
||||||
if layer_head_mask.size() != (self.num_heads,):
|
if layer_head_mask.size() != (self.num_heads,):
|
||||||
@@ -1763,7 +1762,7 @@ class {{cookiecutter.camelcase_modelname}}Attention(nn.Module):
|
|||||||
else:
|
else:
|
||||||
attn_weights_reshaped = None
|
attn_weights_reshaped = None
|
||||||
|
|
||||||
attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)
|
attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
|
||||||
|
|
||||||
attn_output = torch.bmm(attn_probs, value_states)
|
attn_output = torch.bmm(attn_probs, value_states)
|
||||||
|
|
||||||
@@ -1823,15 +1822,15 @@ class {{cookiecutter.camelcase_modelname}}EncoderLayer(nn.Module):
|
|||||||
layer_head_mask=layer_head_mask,
|
layer_head_mask=layer_head_mask,
|
||||||
output_attentions=output_attentions,
|
output_attentions=output_attentions,
|
||||||
)
|
)
|
||||||
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||||
hidden_states = residual + hidden_states
|
hidden_states = residual + hidden_states
|
||||||
hidden_states = self.self_attn_layer_norm(hidden_states)
|
hidden_states = self.self_attn_layer_norm(hidden_states)
|
||||||
|
|
||||||
residual = hidden_states
|
residual = hidden_states
|
||||||
hidden_states = self.activation_fn(self.fc1(hidden_states))
|
hidden_states = self.activation_fn(self.fc1(hidden_states))
|
||||||
hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
|
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
|
||||||
hidden_states = self.fc2(hidden_states)
|
hidden_states = self.fc2(hidden_states)
|
||||||
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||||
hidden_states = residual + hidden_states
|
hidden_states = residual + hidden_states
|
||||||
hidden_states = self.final_layer_norm(hidden_states)
|
hidden_states = self.final_layer_norm(hidden_states)
|
||||||
|
|
||||||
@@ -1916,7 +1915,7 @@ class {{cookiecutter.camelcase_modelname}}DecoderLayer(nn.Module):
|
|||||||
layer_head_mask=layer_head_mask,
|
layer_head_mask=layer_head_mask,
|
||||||
output_attentions=output_attentions,
|
output_attentions=output_attentions,
|
||||||
)
|
)
|
||||||
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||||
hidden_states = residual + hidden_states
|
hidden_states = residual + hidden_states
|
||||||
hidden_states = self.self_attn_layer_norm(hidden_states)
|
hidden_states = self.self_attn_layer_norm(hidden_states)
|
||||||
|
|
||||||
@@ -1936,7 +1935,7 @@ class {{cookiecutter.camelcase_modelname}}DecoderLayer(nn.Module):
|
|||||||
past_key_value=cross_attn_past_key_value,
|
past_key_value=cross_attn_past_key_value,
|
||||||
output_attentions=output_attentions,
|
output_attentions=output_attentions,
|
||||||
)
|
)
|
||||||
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||||
hidden_states = residual + hidden_states
|
hidden_states = residual + hidden_states
|
||||||
hidden_states = self.encoder_attn_layer_norm(hidden_states)
|
hidden_states = self.encoder_attn_layer_norm(hidden_states)
|
||||||
|
|
||||||
@@ -1946,9 +1945,9 @@ class {{cookiecutter.camelcase_modelname}}DecoderLayer(nn.Module):
|
|||||||
# Fully Connected
|
# Fully Connected
|
||||||
residual = hidden_states
|
residual = hidden_states
|
||||||
hidden_states = self.activation_fn(self.fc1(hidden_states))
|
hidden_states = self.activation_fn(self.fc1(hidden_states))
|
||||||
hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
|
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
|
||||||
hidden_states = self.fc2(hidden_states)
|
hidden_states = self.fc2(hidden_states)
|
||||||
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||||
hidden_states = residual + hidden_states
|
hidden_states = residual + hidden_states
|
||||||
hidden_states = self.final_layer_norm(hidden_states)
|
hidden_states = self.final_layer_norm(hidden_states)
|
||||||
|
|
||||||
@@ -2171,7 +2170,7 @@ class {{cookiecutter.camelcase_modelname}}Encoder({{cookiecutter.camelcase_model
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
config: {{cookiecutter.camelcase_modelname}}Config
|
config: {{cookiecutter.camelcase_modelname}}Config
|
||||||
embed_tokens (torch.nn.Embedding): output embedding
|
embed_tokens (nn.Embedding): output embedding
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[nn.Embedding] = None):
|
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[nn.Embedding] = None):
|
||||||
@@ -2270,7 +2269,7 @@ class {{cookiecutter.camelcase_modelname}}Encoder({{cookiecutter.camelcase_model
|
|||||||
|
|
||||||
hidden_states = inputs_embeds + embed_pos
|
hidden_states = inputs_embeds + embed_pos
|
||||||
hidden_states = self.layernorm_embedding(hidden_states)
|
hidden_states = self.layernorm_embedding(hidden_states)
|
||||||
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||||
|
|
||||||
# expand attention_mask
|
# expand attention_mask
|
||||||
if attention_mask is not None:
|
if attention_mask is not None:
|
||||||
@@ -2337,7 +2336,7 @@ class {{cookiecutter.camelcase_modelname}}Decoder({{cookiecutter.camelcase_model
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
config: {{cookiecutter.camelcase_modelname}}Config
|
config: {{cookiecutter.camelcase_modelname}}Config
|
||||||
embed_tokens (torch.nn.Embedding): output embedding
|
embed_tokens (nn.Embedding): output embedding
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[nn.Embedding] = None):
|
def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[nn.Embedding] = None):
|
||||||
@@ -2506,7 +2505,7 @@ class {{cookiecutter.camelcase_modelname}}Decoder({{cookiecutter.camelcase_model
|
|||||||
hidden_states = inputs_embeds + positions
|
hidden_states = inputs_embeds + positions
|
||||||
hidden_states = self.layernorm_embedding(hidden_states)
|
hidden_states = self.layernorm_embedding(hidden_states)
|
||||||
|
|
||||||
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||||
|
|
||||||
# decoder layers
|
# decoder layers
|
||||||
all_hidden_states = () if output_hidden_states else None
|
all_hidden_states = () if output_hidden_states else None
|
||||||
|
|||||||
@@ -725,7 +725,7 @@ defined by the name of the class attribute you give the layer. Let's
|
|||||||
define a dummy model in PyTorch, called `SimpleModel` as follows:
|
define a dummy model in PyTorch, called `SimpleModel` as follows:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import torch.nn as nn
|
from torch import nn
|
||||||
|
|
||||||
class SimpleModel(nn.Module):
|
class SimpleModel(nn.Module):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|||||||
Reference in New Issue
Block a user