💄 super
This commit is contained in:
@@ -31,7 +31,7 @@ POOLING_BREAKDOWN = {1: (1, 1), 2: (2, 1), 3: (3, 1), 4: (2, 2), 5: (5, 1), 6: (
|
|||||||
|
|
||||||
class ImageEncoder(nn.Module):
|
class ImageEncoder(nn.Module):
|
||||||
def __init__(self, args):
|
def __init__(self, args):
|
||||||
super(ImageEncoder, self).__init__()
|
super().__init__()
|
||||||
model = torchvision.models.resnet152(pretrained=True)
|
model = torchvision.models.resnet152(pretrained=True)
|
||||||
modules = list(model.children())[:-2]
|
modules = list(model.children())[:-2]
|
||||||
self.model = nn.Sequential(*modules)
|
self.model = nn.Sequential(*modules)
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ class ClassificationHead(torch.nn.Module):
|
|||||||
"""Classification Head for transformer encoders"""
|
"""Classification Head for transformer encoders"""
|
||||||
|
|
||||||
def __init__(self, class_size, embed_size):
|
def __init__(self, class_size, embed_size):
|
||||||
super(ClassificationHead, self).__init__()
|
super().__init__()
|
||||||
self.class_size = class_size
|
self.class_size = class_size
|
||||||
self.embed_size = embed_size
|
self.embed_size = embed_size
|
||||||
# self.mlp1 = torch.nn.Linear(embed_size, embed_size)
|
# self.mlp1 = torch.nn.Linear(embed_size, embed_size)
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ class Discriminator(torch.nn.Module):
|
|||||||
"""Transformer encoder followed by a Classification Head"""
|
"""Transformer encoder followed by a Classification Head"""
|
||||||
|
|
||||||
def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"):
|
def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"):
|
||||||
super(Discriminator, self).__init__()
|
super().__init__()
|
||||||
self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
|
self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
|
||||||
self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
|
self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
|
||||||
self.embed_size = self.encoder.transformer.config.hidden_size
|
self.embed_size = self.encoder.transformer.config.hidden_size
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ class BertAbsConfig(PretrainedConfig):
|
|||||||
dec_dropout=0.2,
|
dec_dropout=0.2,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
super(BertAbsConfig, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.max_pos = max_pos
|
self.max_pos = max_pos
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ class BertAbsPreTrainedModel(PreTrainedModel):
|
|||||||
|
|
||||||
class BertAbs(BertAbsPreTrainedModel):
|
class BertAbs(BertAbsPreTrainedModel):
|
||||||
def __init__(self, args, checkpoint=None, bert_extractive_checkpoint=None):
|
def __init__(self, args, checkpoint=None, bert_extractive_checkpoint=None):
|
||||||
super(BertAbs, self).__init__(args)
|
super().__init__(args)
|
||||||
self.args = args
|
self.args = args
|
||||||
self.bert = Bert()
|
self.bert = Bert()
|
||||||
|
|
||||||
@@ -122,7 +122,7 @@ class Bert(nn.Module):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super(Bert, self).__init__()
|
super().__init__()
|
||||||
config = BertConfig.from_pretrained("bert-base-uncased")
|
config = BertConfig.from_pretrained("bert-base-uncased")
|
||||||
self.model = BertModel(config)
|
self.model = BertModel(config)
|
||||||
|
|
||||||
@@ -151,7 +151,7 @@ class TransformerDecoder(nn.Module):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings, vocab_size):
|
def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings, vocab_size):
|
||||||
super(TransformerDecoder, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
# Basic attributes.
|
# Basic attributes.
|
||||||
self.decoder_type = "transformer"
|
self.decoder_type = "transformer"
|
||||||
@@ -261,7 +261,7 @@ class PositionalEncoding(nn.Module):
|
|||||||
pe[:, 0::2] = torch.sin(position.float() * div_term)
|
pe[:, 0::2] = torch.sin(position.float() * div_term)
|
||||||
pe[:, 1::2] = torch.cos(position.float() * div_term)
|
pe[:, 1::2] = torch.cos(position.float() * div_term)
|
||||||
pe = pe.unsqueeze(0)
|
pe = pe.unsqueeze(0)
|
||||||
super(PositionalEncoding, self).__init__()
|
super().__init__()
|
||||||
self.register_buffer("pe", pe)
|
self.register_buffer("pe", pe)
|
||||||
self.dropout = nn.Dropout(p=dropout)
|
self.dropout = nn.Dropout(p=dropout)
|
||||||
self.dim = dim
|
self.dim = dim
|
||||||
@@ -293,7 +293,7 @@ class TransformerDecoderLayer(nn.Module):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, d_model, heads, d_ff, dropout):
|
def __init__(self, d_model, heads, d_ff, dropout):
|
||||||
super(TransformerDecoderLayer, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
|
self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
|
||||||
|
|
||||||
@@ -410,7 +410,7 @@ class MultiHeadedAttention(nn.Module):
|
|||||||
self.dim_per_head = model_dim // head_count
|
self.dim_per_head = model_dim // head_count
|
||||||
self.model_dim = model_dim
|
self.model_dim = model_dim
|
||||||
|
|
||||||
super(MultiHeadedAttention, self).__init__()
|
super().__init__()
|
||||||
self.head_count = head_count
|
self.head_count = head_count
|
||||||
|
|
||||||
self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head)
|
self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head)
|
||||||
@@ -639,7 +639,7 @@ class PositionwiseFeedForward(nn.Module):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, d_model, d_ff, dropout=0.1):
|
def __init__(self, d_model, d_ff, dropout=0.1):
|
||||||
super(PositionwiseFeedForward, self).__init__()
|
super().__init__()
|
||||||
self.w_1 = nn.Linear(d_model, d_ff)
|
self.w_1 = nn.Linear(d_model, d_ff)
|
||||||
self.w_2 = nn.Linear(d_ff, d_model)
|
self.w_2 = nn.Linear(d_ff, d_model)
|
||||||
self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
|
self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
|
||||||
|
|||||||
@@ -122,7 +122,7 @@ class AlbertConfig(PretrainedConfig):
|
|||||||
layer_norm_eps=1e-12,
|
layer_norm_eps=1e-12,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(AlbertConfig, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.embedding_size = embedding_size
|
self.embedding_size = embedding_size
|
||||||
|
|||||||
@@ -125,7 +125,7 @@ class BertConfig(PretrainedConfig):
|
|||||||
layer_norm_eps=1e-12,
|
layer_norm_eps=1e-12,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(BertConfig, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.hidden_size = hidden_size
|
self.hidden_size = hidden_size
|
||||||
|
|||||||
@@ -106,7 +106,7 @@ class CTRLConfig(PretrainedConfig):
|
|||||||
summary_first_dropout=0.1,
|
summary_first_dropout=0.1,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(CTRLConfig, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.n_ctx = n_ctx
|
self.n_ctx = n_ctx
|
||||||
self.n_positions = n_positions
|
self.n_positions = n_positions
|
||||||
|
|||||||
@@ -113,7 +113,7 @@ class DistilBertConfig(PretrainedConfig):
|
|||||||
seq_classif_dropout=0.2,
|
seq_classif_dropout=0.2,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(DistilBertConfig, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.max_position_embeddings = max_position_embeddings
|
self.max_position_embeddings = max_position_embeddings
|
||||||
self.sinusoidal_pos_embds = sinusoidal_pos_embds
|
self.sinusoidal_pos_embds = sinusoidal_pos_embds
|
||||||
|
|||||||
@@ -136,7 +136,7 @@ class GPT2Config(PretrainedConfig):
|
|||||||
summary_first_dropout=0.1,
|
summary_first_dropout=0.1,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(GPT2Config, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.n_ctx = n_ctx
|
self.n_ctx = n_ctx
|
||||||
|
|||||||
@@ -138,7 +138,7 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
summary_first_dropout=0.1,
|
summary_first_dropout=0.1,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(OpenAIGPTConfig, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.n_ctx = n_ctx
|
self.n_ctx = n_ctx
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ class T5Config(PretrainedConfig):
|
|||||||
initializer_factor=1.0,
|
initializer_factor=1.0,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(T5Config, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.n_positions = n_positions
|
self.n_positions = n_positions
|
||||||
self.d_model = d_model
|
self.d_model = d_model
|
||||||
|
|||||||
@@ -151,7 +151,7 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
layer_norm_epsilon=1e-5,
|
layer_norm_epsilon=1e-5,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(TransfoXLConfig, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.cutoffs = []
|
self.cutoffs = []
|
||||||
|
|||||||
@@ -197,7 +197,7 @@ class XLMConfig(PretrainedConfig):
|
|||||||
):
|
):
|
||||||
"""Constructs XLMConfig.
|
"""Constructs XLMConfig.
|
||||||
"""
|
"""
|
||||||
super(XLMConfig, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.emb_dim = emb_dim
|
self.emb_dim = emb_dim
|
||||||
self.n_layers = n_layers
|
self.n_layers = n_layers
|
||||||
|
|||||||
@@ -159,7 +159,7 @@ class XLNetConfig(PretrainedConfig):
|
|||||||
):
|
):
|
||||||
"""Constructs XLNetConfig.
|
"""Constructs XLNetConfig.
|
||||||
"""
|
"""
|
||||||
super(XLNetConfig, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.d_model = d_model
|
self.d_model = d_model
|
||||||
self.n_layer = n_layer
|
self.n_layer = n_layer
|
||||||
|
|||||||
@@ -167,7 +167,7 @@ class AlbertEmbeddings(BertEmbeddings):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(AlbertEmbeddings, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=0)
|
self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=0)
|
||||||
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
|
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
|
||||||
@@ -177,7 +177,7 @@ class AlbertEmbeddings(BertEmbeddings):
|
|||||||
|
|
||||||
class AlbertAttention(BertSelfAttention):
|
class AlbertAttention(BertSelfAttention):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(AlbertAttention, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.num_attention_heads = config.num_attention_heads
|
self.num_attention_heads = config.num_attention_heads
|
||||||
@@ -258,7 +258,7 @@ class AlbertAttention(BertSelfAttention):
|
|||||||
|
|
||||||
class AlbertLayer(nn.Module):
|
class AlbertLayer(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(AlbertLayer, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.config = config
|
self.config = config
|
||||||
self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||||
@@ -279,7 +279,7 @@ class AlbertLayer(nn.Module):
|
|||||||
|
|
||||||
class AlbertLayerGroup(nn.Module):
|
class AlbertLayerGroup(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(AlbertLayerGroup, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
@@ -309,7 +309,7 @@ class AlbertLayerGroup(nn.Module):
|
|||||||
|
|
||||||
class AlbertTransformer(nn.Module):
|
class AlbertTransformer(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(AlbertTransformer, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.config = config
|
self.config = config
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
@@ -471,7 +471,7 @@ class AlbertModel(AlbertPreTrainedModel):
|
|||||||
base_model_prefix = "albert"
|
base_model_prefix = "albert"
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(AlbertModel, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.config = config
|
self.config = config
|
||||||
self.embeddings = AlbertEmbeddings(config)
|
self.embeddings = AlbertEmbeddings(config)
|
||||||
@@ -571,7 +571,7 @@ class AlbertModel(AlbertPreTrainedModel):
|
|||||||
|
|
||||||
class AlbertMLMHead(nn.Module):
|
class AlbertMLMHead(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(AlbertMLMHead, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.LayerNorm = nn.LayerNorm(config.embedding_size)
|
self.LayerNorm = nn.LayerNorm(config.embedding_size)
|
||||||
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
||||||
@@ -619,7 +619,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(AlbertForMaskedLM, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.albert = AlbertModel(config)
|
self.albert = AlbertModel(config)
|
||||||
self.predictions = AlbertMLMHead(config)
|
self.predictions = AlbertMLMHead(config)
|
||||||
@@ -706,7 +706,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(AlbertForSequenceClassification, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.albert = AlbertModel(config)
|
self.albert = AlbertModel(config)
|
||||||
@@ -804,7 +804,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(AlbertForQuestionAnswering, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.albert = AlbertModel(config)
|
self.albert = AlbertModel(config)
|
||||||
|
|||||||
@@ -160,7 +160,7 @@ class BertEmbeddings(nn.Module):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertEmbeddings, self).__init__()
|
super().__init__()
|
||||||
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
|
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
|
||||||
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
|
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
|
||||||
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
|
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
|
||||||
@@ -197,7 +197,7 @@ class BertEmbeddings(nn.Module):
|
|||||||
|
|
||||||
class BertSelfAttention(nn.Module):
|
class BertSelfAttention(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertSelfAttention, self).__init__()
|
super().__init__()
|
||||||
if config.hidden_size % config.num_attention_heads != 0:
|
if config.hidden_size % config.num_attention_heads != 0:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"The hidden size (%d) is not a multiple of the number of attention "
|
"The hidden size (%d) is not a multiple of the number of attention "
|
||||||
@@ -275,7 +275,7 @@ class BertSelfAttention(nn.Module):
|
|||||||
|
|
||||||
class BertSelfOutput(nn.Module):
|
class BertSelfOutput(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertSelfOutput, self).__init__()
|
super().__init__()
|
||||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||||
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||||
@@ -289,7 +289,7 @@ class BertSelfOutput(nn.Module):
|
|||||||
|
|
||||||
class BertAttention(nn.Module):
|
class BertAttention(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertAttention, self).__init__()
|
super().__init__()
|
||||||
self.self = BertSelfAttention(config)
|
self.self = BertSelfAttention(config)
|
||||||
self.output = BertSelfOutput(config)
|
self.output = BertSelfOutput(config)
|
||||||
self.pruned_heads = set()
|
self.pruned_heads = set()
|
||||||
@@ -335,7 +335,7 @@ class BertAttention(nn.Module):
|
|||||||
|
|
||||||
class BertIntermediate(nn.Module):
|
class BertIntermediate(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertIntermediate, self).__init__()
|
super().__init__()
|
||||||
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
|
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
|
||||||
if isinstance(config.hidden_act, str):
|
if isinstance(config.hidden_act, str):
|
||||||
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
||||||
@@ -350,7 +350,7 @@ class BertIntermediate(nn.Module):
|
|||||||
|
|
||||||
class BertOutput(nn.Module):
|
class BertOutput(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertOutput, self).__init__()
|
super().__init__()
|
||||||
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
|
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
|
||||||
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||||
@@ -364,7 +364,7 @@ class BertOutput(nn.Module):
|
|||||||
|
|
||||||
class BertLayer(nn.Module):
|
class BertLayer(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertLayer, self).__init__()
|
super().__init__()
|
||||||
self.attention = BertAttention(config)
|
self.attention = BertAttention(config)
|
||||||
self.is_decoder = config.is_decoder
|
self.is_decoder = config.is_decoder
|
||||||
if self.is_decoder:
|
if self.is_decoder:
|
||||||
@@ -399,7 +399,7 @@ class BertLayer(nn.Module):
|
|||||||
|
|
||||||
class BertEncoder(nn.Module):
|
class BertEncoder(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertEncoder, self).__init__()
|
super().__init__()
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
|
self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
|
||||||
@@ -440,7 +440,7 @@ class BertEncoder(nn.Module):
|
|||||||
|
|
||||||
class BertPooler(nn.Module):
|
class BertPooler(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertPooler, self).__init__()
|
super().__init__()
|
||||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||||
self.activation = nn.Tanh()
|
self.activation = nn.Tanh()
|
||||||
|
|
||||||
@@ -455,7 +455,7 @@ class BertPooler(nn.Module):
|
|||||||
|
|
||||||
class BertPredictionHeadTransform(nn.Module):
|
class BertPredictionHeadTransform(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertPredictionHeadTransform, self).__init__()
|
super().__init__()
|
||||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||||
if isinstance(config.hidden_act, str):
|
if isinstance(config.hidden_act, str):
|
||||||
self.transform_act_fn = ACT2FN[config.hidden_act]
|
self.transform_act_fn = ACT2FN[config.hidden_act]
|
||||||
@@ -472,7 +472,7 @@ class BertPredictionHeadTransform(nn.Module):
|
|||||||
|
|
||||||
class BertLMPredictionHead(nn.Module):
|
class BertLMPredictionHead(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertLMPredictionHead, self).__init__()
|
super().__init__()
|
||||||
self.transform = BertPredictionHeadTransform(config)
|
self.transform = BertPredictionHeadTransform(config)
|
||||||
|
|
||||||
# The output weights are the same as the input embeddings, but there is
|
# The output weights are the same as the input embeddings, but there is
|
||||||
@@ -492,7 +492,7 @@ class BertLMPredictionHead(nn.Module):
|
|||||||
|
|
||||||
class BertOnlyMLMHead(nn.Module):
|
class BertOnlyMLMHead(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertOnlyMLMHead, self).__init__()
|
super().__init__()
|
||||||
self.predictions = BertLMPredictionHead(config)
|
self.predictions = BertLMPredictionHead(config)
|
||||||
|
|
||||||
def forward(self, sequence_output):
|
def forward(self, sequence_output):
|
||||||
@@ -502,7 +502,7 @@ class BertOnlyMLMHead(nn.Module):
|
|||||||
|
|
||||||
class BertOnlyNSPHead(nn.Module):
|
class BertOnlyNSPHead(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertOnlyNSPHead, self).__init__()
|
super().__init__()
|
||||||
self.seq_relationship = nn.Linear(config.hidden_size, 2)
|
self.seq_relationship = nn.Linear(config.hidden_size, 2)
|
||||||
|
|
||||||
def forward(self, pooled_output):
|
def forward(self, pooled_output):
|
||||||
@@ -512,7 +512,7 @@ class BertOnlyNSPHead(nn.Module):
|
|||||||
|
|
||||||
class BertPreTrainingHeads(nn.Module):
|
class BertPreTrainingHeads(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertPreTrainingHeads, self).__init__()
|
super().__init__()
|
||||||
self.predictions = BertLMPredictionHead(config)
|
self.predictions = BertLMPredictionHead(config)
|
||||||
self.seq_relationship = nn.Linear(config.hidden_size, 2)
|
self.seq_relationship = nn.Linear(config.hidden_size, 2)
|
||||||
|
|
||||||
@@ -657,7 +657,7 @@ class BertModel(BertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertModel, self).__init__(config)
|
super().__init__(config)
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
self.embeddings = BertEmbeddings(config)
|
self.embeddings = BertEmbeddings(config)
|
||||||
@@ -864,7 +864,7 @@ class BertForPreTraining(BertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertForPreTraining, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.bert = BertModel(config)
|
self.bert = BertModel(config)
|
||||||
self.cls = BertPreTrainingHeads(config)
|
self.cls = BertPreTrainingHeads(config)
|
||||||
@@ -954,7 +954,7 @@ class BertForMaskedLM(BertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertForMaskedLM, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.bert = BertModel(config)
|
self.bert = BertModel(config)
|
||||||
self.cls = BertOnlyMLMHead(config)
|
self.cls = BertOnlyMLMHead(config)
|
||||||
@@ -1053,7 +1053,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertForNextSentencePrediction, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.bert = BertModel(config)
|
self.bert = BertModel(config)
|
||||||
self.cls = BertOnlyNSPHead(config)
|
self.cls = BertOnlyNSPHead(config)
|
||||||
@@ -1132,7 +1132,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertForSequenceClassification, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.bert = BertModel(config)
|
self.bert = BertModel(config)
|
||||||
@@ -1221,7 +1221,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertForMultipleChoice, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.bert = BertModel(config)
|
self.bert = BertModel(config)
|
||||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||||
@@ -1308,7 +1308,7 @@ class BertForTokenClassification(BertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertForTokenClassification, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.bert = BertModel(config)
|
self.bert = BertModel(config)
|
||||||
@@ -1406,7 +1406,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(BertForQuestionAnswering, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.bert = BertModel(config)
|
self.bert = BertModel(config)
|
||||||
|
|||||||
@@ -81,7 +81,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
|
|||||||
|
|
||||||
class MultiHeadAttention(torch.nn.Module):
|
class MultiHeadAttention(torch.nn.Module):
|
||||||
def __init__(self, d_model_size, num_heads, output_attentions=False):
|
def __init__(self, d_model_size, num_heads, output_attentions=False):
|
||||||
super(MultiHeadAttention, self).__init__()
|
super().__init__()
|
||||||
self.output_attentions = output_attentions
|
self.output_attentions = output_attentions
|
||||||
self.num_heads = num_heads
|
self.num_heads = num_heads
|
||||||
self.d_model_size = d_model_size
|
self.d_model_size = d_model_size
|
||||||
@@ -132,7 +132,7 @@ def point_wise_feed_forward_network(d_model_size, dff):
|
|||||||
|
|
||||||
class EncoderLayer(torch.nn.Module):
|
class EncoderLayer(torch.nn.Module):
|
||||||
def __init__(self, d_model_size, num_heads, dff, rate=0.1, output_attentions=False):
|
def __init__(self, d_model_size, num_heads, dff, rate=0.1, output_attentions=False):
|
||||||
super(EncoderLayer, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads, output_attentions)
|
self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads, output_attentions)
|
||||||
self.ffn = point_wise_feed_forward_network(d_model_size, dff)
|
self.ffn = point_wise_feed_forward_network(d_model_size, dff)
|
||||||
@@ -274,7 +274,7 @@ class CTRLModel(CTRLPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(CTRLModel, self).__init__(config)
|
super().__init__(config)
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_past = config.output_past
|
self.output_past = config.output_past
|
||||||
@@ -481,7 +481,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(CTRLLMHeadModel, self).__init__(config)
|
super().__init__(config)
|
||||||
self.transformer = CTRLModel(config)
|
self.transformer = CTRLModel(config)
|
||||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)
|
||||||
|
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ def create_sinusoidal_embeddings(n_pos, dim, out):
|
|||||||
|
|
||||||
class Embeddings(nn.Module):
|
class Embeddings(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(Embeddings, self).__init__()
|
super().__init__()
|
||||||
self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=0)
|
self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=0)
|
||||||
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim)
|
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim)
|
||||||
if config.sinusoidal_pos_embds:
|
if config.sinusoidal_pos_embds:
|
||||||
@@ -97,7 +97,7 @@ class Embeddings(nn.Module):
|
|||||||
|
|
||||||
class MultiHeadSelfAttention(nn.Module):
|
class MultiHeadSelfAttention(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(MultiHeadSelfAttention, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.n_heads = config.n_heads
|
self.n_heads = config.n_heads
|
||||||
self.dim = config.dim
|
self.dim = config.dim
|
||||||
@@ -195,7 +195,7 @@ class MultiHeadSelfAttention(nn.Module):
|
|||||||
|
|
||||||
class FFN(nn.Module):
|
class FFN(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(FFN, self).__init__()
|
super().__init__()
|
||||||
self.dropout = nn.Dropout(p=config.dropout)
|
self.dropout = nn.Dropout(p=config.dropout)
|
||||||
self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim)
|
self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim)
|
||||||
self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim)
|
self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim)
|
||||||
@@ -214,7 +214,7 @@ class FFN(nn.Module):
|
|||||||
|
|
||||||
class TransformerBlock(nn.Module):
|
class TransformerBlock(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(TransformerBlock, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.n_heads = config.n_heads
|
self.n_heads = config.n_heads
|
||||||
self.dim = config.dim
|
self.dim = config.dim
|
||||||
@@ -266,7 +266,7 @@ class TransformerBlock(nn.Module):
|
|||||||
|
|
||||||
class Transformer(nn.Module):
|
class Transformer(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(Transformer, self).__init__()
|
super().__init__()
|
||||||
self.n_layers = config.n_layers
|
self.n_layers = config.n_layers
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
@@ -424,7 +424,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(DistilBertModel, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.embeddings = Embeddings(config) # Embeddings
|
self.embeddings = Embeddings(config) # Embeddings
|
||||||
self.transformer = Transformer(config) # Encoder
|
self.transformer = Transformer(config) # Encoder
|
||||||
@@ -525,7 +525,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(DistilBertForMaskedLM, self).__init__(config)
|
super().__init__(config)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
|
|
||||||
@@ -600,7 +600,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(DistilBertForSequenceClassification, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.distilbert = DistilBertModel(config)
|
self.distilbert = DistilBertModel(config)
|
||||||
@@ -679,7 +679,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(DistilBertForQuestionAnswering, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.distilbert = DistilBertModel(config)
|
self.distilbert = DistilBertModel(config)
|
||||||
self.qa_outputs = nn.Linear(config.dim, config.num_labels)
|
self.qa_outputs = nn.Linear(config.dim, config.num_labels)
|
||||||
@@ -766,7 +766,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(DistilBertForTokenClassification, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.distilbert = DistilBertModel(config)
|
self.distilbert = DistilBertModel(config)
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ class PreTrainedEncoderDecoder(nn.Module):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, encoder, decoder):
|
def __init__(self, encoder, decoder):
|
||||||
super(PreTrainedEncoderDecoder, self).__init__()
|
super().__init__()
|
||||||
self.encoder = encoder
|
self.encoder = encoder
|
||||||
self.decoder = decoder
|
self.decoder = decoder
|
||||||
|
|
||||||
@@ -290,7 +290,7 @@ class Model2Model(PreTrainedEncoderDecoder):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super(Model2Model, self).__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
self.tie_weights()
|
self.tie_weights()
|
||||||
|
|
||||||
def tie_weights(self):
|
def tie_weights(self):
|
||||||
@@ -321,7 +321,7 @@ class Model2Model(PreTrainedEncoderDecoder):
|
|||||||
):
|
):
|
||||||
raise ValueError("Only the Bert model is currently supported.")
|
raise ValueError("Only the Bert model is currently supported.")
|
||||||
|
|
||||||
model = super(Model2Model, cls).from_pretrained(
|
model = super().from_pretrained(
|
||||||
encoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
|
encoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
|
||||||
decoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
|
decoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
|
||||||
*args,
|
*args,
|
||||||
@@ -345,5 +345,5 @@ class Model2LSTM(PreTrainedEncoderDecoder):
|
|||||||
" E.g. `decoder_config={'input_size': 768, 'hidden_size': 768, 'num_layers': 2}`"
|
" E.g. `decoder_config={'input_size': 768, 'hidden_size': 768, 'num_layers': 2}`"
|
||||||
)
|
)
|
||||||
kwargs["decoder_model"] = torch.nn.LSTM(kwargs.pop("decoder_config"))
|
kwargs["decoder_model"] = torch.nn.LSTM(kwargs.pop("decoder_config"))
|
||||||
model = super(Model2LSTM, cls).from_pretrained(*args, **kwargs)
|
model = super().from_pretrained(*args, **kwargs)
|
||||||
return model
|
return model
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ def gelu(x):
|
|||||||
|
|
||||||
class Attention(nn.Module):
|
class Attention(nn.Module):
|
||||||
def __init__(self, nx, n_ctx, config, scale=False):
|
def __init__(self, nx, n_ctx, config, scale=False):
|
||||||
super(Attention, self).__init__()
|
super().__init__()
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
|
|
||||||
n_state = nx # in Attention: n_state=768 (nx=n_embd)
|
n_state = nx # in Attention: n_state=768 (nx=n_embd)
|
||||||
@@ -202,7 +202,7 @@ class Attention(nn.Module):
|
|||||||
|
|
||||||
class MLP(nn.Module):
|
class MLP(nn.Module):
|
||||||
def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd)
|
def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd)
|
||||||
super(MLP, self).__init__()
|
super().__init__()
|
||||||
nx = config.n_embd
|
nx = config.n_embd
|
||||||
self.c_fc = Conv1D(n_state, nx)
|
self.c_fc = Conv1D(n_state, nx)
|
||||||
self.c_proj = Conv1D(nx, n_state)
|
self.c_proj = Conv1D(nx, n_state)
|
||||||
@@ -217,7 +217,7 @@ class MLP(nn.Module):
|
|||||||
|
|
||||||
class Block(nn.Module):
|
class Block(nn.Module):
|
||||||
def __init__(self, n_ctx, config, scale=False):
|
def __init__(self, n_ctx, config, scale=False):
|
||||||
super(Block, self).__init__()
|
super().__init__()
|
||||||
nx = config.n_embd
|
nx = config.n_embd
|
||||||
self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
|
self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
|
||||||
self.attn = Attention(nx, n_ctx, config, scale)
|
self.attn = Attention(nx, n_ctx, config, scale)
|
||||||
@@ -249,7 +249,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
|
|||||||
base_model_prefix = "transformer"
|
base_model_prefix = "transformer"
|
||||||
|
|
||||||
def __init__(self, *inputs, **kwargs):
|
def __init__(self, *inputs, **kwargs):
|
||||||
super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs)
|
super().__init__(*inputs, **kwargs)
|
||||||
|
|
||||||
def _init_weights(self, module):
|
def _init_weights(self, module):
|
||||||
""" Initialize the weights.
|
""" Initialize the weights.
|
||||||
@@ -355,7 +355,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(GPT2Model, self).__init__(config)
|
super().__init__(config)
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_past = config.output_past
|
self.output_past = config.output_past
|
||||||
@@ -550,7 +550,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(GPT2LMHeadModel, self).__init__(config)
|
super().__init__(config)
|
||||||
self.transformer = GPT2Model(config)
|
self.transformer = GPT2Model(config)
|
||||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||||
|
|
||||||
@@ -678,7 +678,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(GPT2DoubleHeadsModel, self).__init__(config)
|
super().__init__(config)
|
||||||
config.num_labels = 1
|
config.num_labels = 1
|
||||||
self.transformer = GPT2Model(config)
|
self.transformer = GPT2Model(config)
|
||||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ class ModalEmbeddings(nn.Module):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, encoder, embeddings):
|
def __init__(self, config, encoder, embeddings):
|
||||||
super(ModalEmbeddings, self).__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
self.encoder = encoder
|
self.encoder = encoder
|
||||||
self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size)
|
self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size)
|
||||||
@@ -175,7 +175,7 @@ class MMBTModel(nn.Module):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, transformer, encoder):
|
def __init__(self, config, transformer, encoder):
|
||||||
super(MMBTModel, self).__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
self.transformer = transformer
|
self.transformer = transformer
|
||||||
self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings)
|
self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings)
|
||||||
@@ -359,7 +359,7 @@ class MMBTForClassification(nn.Module):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, transformer, encoder):
|
def __init__(self, config, transformer, encoder):
|
||||||
super(MMBTForClassification, self).__init__()
|
super().__init__()
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.mmbt = MMBTModel(config, transformer, encoder)
|
self.mmbt = MMBTModel(config, transformer, encoder)
|
||||||
|
|||||||
@@ -127,7 +127,7 @@ ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
|
|||||||
|
|
||||||
class Attention(nn.Module):
|
class Attention(nn.Module):
|
||||||
def __init__(self, nx, n_ctx, config, scale=False):
|
def __init__(self, nx, n_ctx, config, scale=False):
|
||||||
super(Attention, self).__init__()
|
super().__init__()
|
||||||
n_state = nx # in Attention: n_state=768 (nx=n_embd)
|
n_state = nx # in Attention: n_state=768 (nx=n_embd)
|
||||||
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
|
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
|
||||||
assert n_state % config.n_head == 0
|
assert n_state % config.n_head == 0
|
||||||
@@ -221,7 +221,7 @@ class Attention(nn.Module):
|
|||||||
|
|
||||||
class MLP(nn.Module):
|
class MLP(nn.Module):
|
||||||
def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd)
|
def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd)
|
||||||
super(MLP, self).__init__()
|
super().__init__()
|
||||||
nx = config.n_embd
|
nx = config.n_embd
|
||||||
self.c_fc = Conv1D(n_state, nx)
|
self.c_fc = Conv1D(n_state, nx)
|
||||||
self.c_proj = Conv1D(nx, n_state)
|
self.c_proj = Conv1D(nx, n_state)
|
||||||
@@ -236,7 +236,7 @@ class MLP(nn.Module):
|
|||||||
|
|
||||||
class Block(nn.Module):
|
class Block(nn.Module):
|
||||||
def __init__(self, n_ctx, config, scale=False):
|
def __init__(self, n_ctx, config, scale=False):
|
||||||
super(Block, self).__init__()
|
super().__init__()
|
||||||
nx = config.n_embd
|
nx = config.n_embd
|
||||||
self.attn = Attention(nx, n_ctx, config, scale)
|
self.attn = Attention(nx, n_ctx, config, scale)
|
||||||
self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
|
self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
|
||||||
@@ -359,7 +359,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(OpenAIGPTModel, self).__init__(config)
|
super().__init__(config)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
|
|
||||||
@@ -518,7 +518,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(OpenAIGPTLMHeadModel, self).__init__(config)
|
super().__init__(config)
|
||||||
self.transformer = OpenAIGPTModel(config)
|
self.transformer = OpenAIGPTModel(config)
|
||||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||||
|
|
||||||
@@ -623,7 +623,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
config.num_labels = 1
|
config.num_labels = 1
|
||||||
self.transformer = OpenAIGPTModel(config)
|
self.transformer = OpenAIGPTModel(config)
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ class RobertaEmbeddings(BertEmbeddings):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(RobertaEmbeddings, self).__init__(config)
|
super().__init__(config)
|
||||||
self.padding_idx = 1
|
self.padding_idx = 1
|
||||||
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
|
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
|
||||||
self.position_embeddings = nn.Embedding(
|
self.position_embeddings = nn.Embedding(
|
||||||
@@ -60,7 +60,7 @@ class RobertaEmbeddings(BertEmbeddings):
|
|||||||
else:
|
else:
|
||||||
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
|
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
|
||||||
|
|
||||||
return super(RobertaEmbeddings, self).forward(
|
return super().forward(
|
||||||
input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
|
input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -204,7 +204,7 @@ class RobertaModel(BertModel):
|
|||||||
base_model_prefix = "roberta"
|
base_model_prefix = "roberta"
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(RobertaModel, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.embeddings = RobertaEmbeddings(config)
|
self.embeddings = RobertaEmbeddings(config)
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
@@ -254,7 +254,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
|||||||
base_model_prefix = "roberta"
|
base_model_prefix = "roberta"
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(RobertaForMaskedLM, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.roberta = RobertaModel(config)
|
self.roberta = RobertaModel(config)
|
||||||
self.lm_head = RobertaLMHead(config)
|
self.lm_head = RobertaLMHead(config)
|
||||||
@@ -299,7 +299,7 @@ class RobertaLMHead(nn.Module):
|
|||||||
"""Roberta Head for masked language modeling."""
|
"""Roberta Head for masked language modeling."""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(RobertaLMHead, self).__init__()
|
super().__init__()
|
||||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||||
self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||||
|
|
||||||
@@ -362,7 +362,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
|
|||||||
base_model_prefix = "roberta"
|
base_model_prefix = "roberta"
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(RobertaForSequenceClassification, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.roberta = RobertaModel(config)
|
self.roberta = RobertaModel(config)
|
||||||
@@ -484,7 +484,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
|
|||||||
base_model_prefix = "roberta"
|
base_model_prefix = "roberta"
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(RobertaForMultipleChoice, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.roberta = RobertaModel(config)
|
self.roberta = RobertaModel(config)
|
||||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||||
@@ -571,7 +571,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
|
|||||||
base_model_prefix = "roberta"
|
base_model_prefix = "roberta"
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(RobertaForTokenClassification, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.roberta = RobertaModel(config)
|
self.roberta = RobertaModel(config)
|
||||||
@@ -625,7 +625,7 @@ class RobertaClassificationHead(nn.Module):
|
|||||||
"""Head for sentence-level classification tasks."""
|
"""Head for sentence-level classification tasks."""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(RobertaClassificationHead, self).__init__()
|
super().__init__()
|
||||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||||
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
|
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
|
||||||
@@ -684,7 +684,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
|
|||||||
base_model_prefix = "roberta"
|
base_model_prefix = "roberta"
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(RobertaForQuestionAnswering, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.roberta = RobertaModel(config)
|
self.roberta = RobertaModel(config)
|
||||||
|
|||||||
@@ -142,7 +142,7 @@ class T5LayerNorm(nn.Module):
|
|||||||
""" Construct a layernorm module in the T5 style
|
""" Construct a layernorm module in the T5 style
|
||||||
No bias and no substraction of mean.
|
No bias and no substraction of mean.
|
||||||
"""
|
"""
|
||||||
super(T5LayerNorm, self).__init__()
|
super().__init__()
|
||||||
self.weight = nn.Parameter(torch.ones(hidden_size))
|
self.weight = nn.Parameter(torch.ones(hidden_size))
|
||||||
self.variance_epsilon = eps
|
self.variance_epsilon = eps
|
||||||
|
|
||||||
@@ -154,7 +154,7 @@ class T5LayerNorm(nn.Module):
|
|||||||
|
|
||||||
class T5DenseReluDense(nn.Module):
|
class T5DenseReluDense(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(T5DenseReluDense, self).__init__()
|
super().__init__()
|
||||||
self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
|
self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
|
||||||
self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
|
self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
|
||||||
self.dropout = nn.Dropout(config.dropout_rate)
|
self.dropout = nn.Dropout(config.dropout_rate)
|
||||||
@@ -169,7 +169,7 @@ class T5DenseReluDense(nn.Module):
|
|||||||
|
|
||||||
class T5LayerFF(nn.Module):
|
class T5LayerFF(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(T5LayerFF, self).__init__()
|
super().__init__()
|
||||||
self.DenseReluDense = T5DenseReluDense(config)
|
self.DenseReluDense = T5DenseReluDense(config)
|
||||||
self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
|
self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
|
||||||
self.dropout = nn.Dropout(config.dropout_rate)
|
self.dropout = nn.Dropout(config.dropout_rate)
|
||||||
@@ -185,7 +185,7 @@ class T5Attention(nn.Module):
|
|||||||
NEW_ID = itertools.count()
|
NEW_ID = itertools.count()
|
||||||
|
|
||||||
def __init__(self, config, has_relative_attention_bias=False):
|
def __init__(self, config, has_relative_attention_bias=False):
|
||||||
super(T5Attention, self).__init__()
|
super().__init__()
|
||||||
self.layer_id = next(T5Attention.NEW_ID)
|
self.layer_id = next(T5Attention.NEW_ID)
|
||||||
self.is_decoder = config.is_decoder
|
self.is_decoder = config.is_decoder
|
||||||
self.has_relative_attention_bias = has_relative_attention_bias
|
self.has_relative_attention_bias = has_relative_attention_bias
|
||||||
@@ -363,7 +363,7 @@ class T5Attention(nn.Module):
|
|||||||
|
|
||||||
class T5LayerSelfAttention(nn.Module):
|
class T5LayerSelfAttention(nn.Module):
|
||||||
def __init__(self, config, has_relative_attention_bias=False):
|
def __init__(self, config, has_relative_attention_bias=False):
|
||||||
super(T5LayerSelfAttention, self).__init__()
|
super().__init__()
|
||||||
self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
|
self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
|
||||||
self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
|
self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
|
||||||
self.dropout = nn.Dropout(config.dropout_rate)
|
self.dropout = nn.Dropout(config.dropout_rate)
|
||||||
@@ -381,7 +381,7 @@ class T5LayerSelfAttention(nn.Module):
|
|||||||
|
|
||||||
class T5LayerCrossAttention(nn.Module):
|
class T5LayerCrossAttention(nn.Module):
|
||||||
def __init__(self, config, has_relative_attention_bias=False):
|
def __init__(self, config, has_relative_attention_bias=False):
|
||||||
super(T5LayerCrossAttention, self).__init__()
|
super().__init__()
|
||||||
self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
|
self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
|
||||||
self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
|
self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
|
||||||
self.dropout = nn.Dropout(config.dropout_rate)
|
self.dropout = nn.Dropout(config.dropout_rate)
|
||||||
@@ -399,7 +399,7 @@ class T5LayerCrossAttention(nn.Module):
|
|||||||
|
|
||||||
class T5Block(nn.Module):
|
class T5Block(nn.Module):
|
||||||
def __init__(self, config, has_relative_attention_bias=False):
|
def __init__(self, config, has_relative_attention_bias=False):
|
||||||
super(T5Block, self).__init__()
|
super().__init__()
|
||||||
self.is_decoder = config.is_decoder
|
self.is_decoder = config.is_decoder
|
||||||
self.layer = nn.ModuleList()
|
self.layer = nn.ModuleList()
|
||||||
self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
|
self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
|
||||||
@@ -501,7 +501,7 @@ class T5PreTrainedModel(PreTrainedModel):
|
|||||||
|
|
||||||
class T5Stack(T5PreTrainedModel):
|
class T5Stack(T5PreTrainedModel):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(T5Stack, self).__init__(config)
|
super().__init__(config)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
self.is_decoder = config.is_decoder
|
self.is_decoder = config.is_decoder
|
||||||
@@ -724,7 +724,7 @@ class T5Model(T5PreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(T5Model, self).__init__(config)
|
super().__init__(config)
|
||||||
self.shared = nn.Embedding(config.vocab_size, config.d_model)
|
self.shared = nn.Embedding(config.vocab_size, config.d_model)
|
||||||
|
|
||||||
encoder_config = copy.deepcopy(config)
|
encoder_config = copy.deepcopy(config)
|
||||||
@@ -830,7 +830,7 @@ class T5WithLMHeadModel(T5PreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(T5WithLMHeadModel, self).__init__(config)
|
super().__init__(config)
|
||||||
self.model_dim = config.d_model
|
self.model_dim = config.d_model
|
||||||
|
|
||||||
self.shared = nn.Embedding(config.vocab_size, config.d_model)
|
self.shared = nn.Embedding(config.vocab_size, config.d_model)
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFAlbertEmbeddings, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.config = config
|
self.config = config
|
||||||
self.position_embeddings = tf.keras.layers.Embedding(
|
self.position_embeddings = tf.keras.layers.Embedding(
|
||||||
@@ -76,7 +76,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
|
|||||||
shape=[self.config.vocab_size, self.config.embedding_size],
|
shape=[self.config.vocab_size, self.config.embedding_size],
|
||||||
initializer=get_initializer(self.config.initializer_range),
|
initializer=get_initializer(self.config.initializer_range),
|
||||||
)
|
)
|
||||||
super(TFAlbertEmbeddings, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def call(self, inputs, mode="embedding", training=False):
|
def call(self, inputs, mode="embedding", training=False):
|
||||||
"""Get token embeddings of inputs.
|
"""Get token embeddings of inputs.
|
||||||
@@ -141,7 +141,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFAlbertSelfAttention(tf.keras.layers.Layer):
|
class TFAlbertSelfAttention(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFAlbertSelfAttention, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
if config.hidden_size % config.num_attention_heads != 0:
|
if config.hidden_size % config.num_attention_heads != 0:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"The hidden size (%d) is not a multiple of the number of attention "
|
"The hidden size (%d) is not a multiple of the number of attention "
|
||||||
@@ -217,7 +217,7 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFAlbertSelfOutput(tf.keras.layers.Layer):
|
class TFAlbertSelfOutput(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFAlbertSelfOutput, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.dense = tf.keras.layers.Dense(
|
self.dense = tf.keras.layers.Dense(
|
||||||
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
|
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
|
||||||
)
|
)
|
||||||
@@ -235,7 +235,7 @@ class TFAlbertSelfOutput(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFAlbertAttention(TFBertSelfAttention):
|
class TFAlbertAttention(TFBertSelfAttention):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFAlbertAttention, self).__init__(config, **kwargs)
|
super().__init__(config, **kwargs)
|
||||||
|
|
||||||
self.hidden_size = config.hidden_size
|
self.hidden_size = config.hidden_size
|
||||||
self.dense = tf.keras.layers.Dense(
|
self.dense = tf.keras.layers.Dense(
|
||||||
@@ -303,7 +303,7 @@ class TFAlbertAttention(TFBertSelfAttention):
|
|||||||
|
|
||||||
class TFAlbertLayer(tf.keras.layers.Layer):
|
class TFAlbertLayer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFAlbertLayer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.attention = TFAlbertAttention(config, name="attention")
|
self.attention = TFAlbertAttention(config, name="attention")
|
||||||
|
|
||||||
self.ffn = tf.keras.layers.Dense(
|
self.ffn = tf.keras.layers.Dense(
|
||||||
@@ -341,7 +341,7 @@ class TFAlbertLayer(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFAlbertLayerGroup(tf.keras.layers.Layer):
|
class TFAlbertLayerGroup(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFAlbertLayerGroup, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
@@ -376,7 +376,7 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFAlbertTransformer(tf.keras.layers.Layer):
|
class TFAlbertTransformer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFAlbertTransformer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.config = config
|
self.config = config
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
@@ -445,7 +445,7 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel):
|
|||||||
|
|
||||||
class TFAlbertMLMHead(tf.keras.layers.Layer):
|
class TFAlbertMLMHead(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, input_embeddings, **kwargs):
|
def __init__(self, config, input_embeddings, **kwargs):
|
||||||
super(TFAlbertMLMHead, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = config.vocab_size
|
self.vocab_size = config.vocab_size
|
||||||
|
|
||||||
self.dense = tf.keras.layers.Dense(
|
self.dense = tf.keras.layers.Dense(
|
||||||
@@ -467,7 +467,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
|
|||||||
self.decoder_bias = self.add_weight(
|
self.decoder_bias = self.add_weight(
|
||||||
shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
|
shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
|
||||||
)
|
)
|
||||||
super(TFAlbertMLMHead, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def call(self, hidden_states):
|
def call(self, hidden_states):
|
||||||
hidden_states = self.dense(hidden_states)
|
hidden_states = self.dense(hidden_states)
|
||||||
@@ -596,7 +596,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFAlbertModel, self).__init__(config, **kwargs)
|
super().__init__(config, **kwargs)
|
||||||
self.num_hidden_layers = config.num_hidden_layers
|
self.num_hidden_layers = config.num_hidden_layers
|
||||||
|
|
||||||
self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
|
self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
|
||||||
@@ -733,7 +733,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
|
|
||||||
self.albert = TFAlbertModel(config, name="albert")
|
self.albert = TFAlbertModel(config, name="albert")
|
||||||
self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
|
self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
|
||||||
@@ -786,7 +786,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.albert = TFAlbertModel(config, name="albert")
|
self.albert = TFAlbertModel(config, name="albert")
|
||||||
|
|||||||
@@ -93,7 +93,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFBertEmbeddings, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = config.vocab_size
|
self.vocab_size = config.vocab_size
|
||||||
self.hidden_size = config.hidden_size
|
self.hidden_size = config.hidden_size
|
||||||
self.initializer_range = config.initializer_range
|
self.initializer_range = config.initializer_range
|
||||||
@@ -126,7 +126,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
|
|||||||
shape=[self.vocab_size, self.hidden_size],
|
shape=[self.vocab_size, self.hidden_size],
|
||||||
initializer=get_initializer(self.initializer_range),
|
initializer=get_initializer(self.initializer_range),
|
||||||
)
|
)
|
||||||
super(TFBertEmbeddings, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def call(self, inputs, mode="embedding", training=False):
|
def call(self, inputs, mode="embedding", training=False):
|
||||||
"""Get token embeddings of inputs.
|
"""Get token embeddings of inputs.
|
||||||
@@ -193,7 +193,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFBertSelfAttention(tf.keras.layers.Layer):
|
class TFBertSelfAttention(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFBertSelfAttention, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
if config.hidden_size % config.num_attention_heads != 0:
|
if config.hidden_size % config.num_attention_heads != 0:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"The hidden size (%d) is not a multiple of the number of attention "
|
"The hidden size (%d) is not a multiple of the number of attention "
|
||||||
@@ -269,7 +269,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFBertSelfOutput(tf.keras.layers.Layer):
|
class TFBertSelfOutput(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFBertSelfOutput, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.dense = tf.keras.layers.Dense(
|
self.dense = tf.keras.layers.Dense(
|
||||||
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
|
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
|
||||||
)
|
)
|
||||||
@@ -287,7 +287,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFBertAttention(tf.keras.layers.Layer):
|
class TFBertAttention(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFBertAttention, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.self_attention = TFBertSelfAttention(config, name="self")
|
self.self_attention = TFBertSelfAttention(config, name="self")
|
||||||
self.dense_output = TFBertSelfOutput(config, name="output")
|
self.dense_output = TFBertSelfOutput(config, name="output")
|
||||||
|
|
||||||
@@ -305,7 +305,7 @@ class TFBertAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFBertIntermediate(tf.keras.layers.Layer):
|
class TFBertIntermediate(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFBertIntermediate, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.dense = tf.keras.layers.Dense(
|
self.dense = tf.keras.layers.Dense(
|
||||||
config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
|
config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
|
||||||
)
|
)
|
||||||
@@ -322,7 +322,7 @@ class TFBertIntermediate(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFBertOutput(tf.keras.layers.Layer):
|
class TFBertOutput(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFBertOutput, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.dense = tf.keras.layers.Dense(
|
self.dense = tf.keras.layers.Dense(
|
||||||
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
|
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
|
||||||
)
|
)
|
||||||
@@ -340,7 +340,7 @@ class TFBertOutput(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFBertLayer(tf.keras.layers.Layer):
|
class TFBertLayer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFBertLayer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.attention = TFBertAttention(config, name="attention")
|
self.attention = TFBertAttention(config, name="attention")
|
||||||
self.intermediate = TFBertIntermediate(config, name="intermediate")
|
self.intermediate = TFBertIntermediate(config, name="intermediate")
|
||||||
self.bert_output = TFBertOutput(config, name="output")
|
self.bert_output = TFBertOutput(config, name="output")
|
||||||
@@ -358,7 +358,7 @@ class TFBertLayer(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFBertEncoder(tf.keras.layers.Layer):
|
class TFBertEncoder(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFBertEncoder, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
|
self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
|
||||||
@@ -392,7 +392,7 @@ class TFBertEncoder(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFBertPooler(tf.keras.layers.Layer):
|
class TFBertPooler(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFBertPooler, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.dense = tf.keras.layers.Dense(
|
self.dense = tf.keras.layers.Dense(
|
||||||
config.hidden_size,
|
config.hidden_size,
|
||||||
kernel_initializer=get_initializer(config.initializer_range),
|
kernel_initializer=get_initializer(config.initializer_range),
|
||||||
@@ -410,7 +410,7 @@ class TFBertPooler(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
|
class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFBertPredictionHeadTransform, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.dense = tf.keras.layers.Dense(
|
self.dense = tf.keras.layers.Dense(
|
||||||
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
|
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
|
||||||
)
|
)
|
||||||
@@ -429,7 +429,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFBertLMPredictionHead(tf.keras.layers.Layer):
|
class TFBertLMPredictionHead(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, input_embeddings, **kwargs):
|
def __init__(self, config, input_embeddings, **kwargs):
|
||||||
super(TFBertLMPredictionHead, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = config.vocab_size
|
self.vocab_size = config.vocab_size
|
||||||
self.transform = TFBertPredictionHeadTransform(config, name="transform")
|
self.transform = TFBertPredictionHeadTransform(config, name="transform")
|
||||||
|
|
||||||
@@ -439,7 +439,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
def build(self, input_shape):
|
def build(self, input_shape):
|
||||||
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
||||||
super(TFBertLMPredictionHead, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def call(self, hidden_states):
|
def call(self, hidden_states):
|
||||||
hidden_states = self.transform(hidden_states)
|
hidden_states = self.transform(hidden_states)
|
||||||
@@ -450,7 +450,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFBertMLMHead(tf.keras.layers.Layer):
|
class TFBertMLMHead(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, input_embeddings, **kwargs):
|
def __init__(self, config, input_embeddings, **kwargs):
|
||||||
super(TFBertMLMHead, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions")
|
self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions")
|
||||||
|
|
||||||
def call(self, sequence_output):
|
def call(self, sequence_output):
|
||||||
@@ -460,7 +460,7 @@ class TFBertMLMHead(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFBertNSPHead(tf.keras.layers.Layer):
|
class TFBertNSPHead(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFBertNSPHead, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.seq_relationship = tf.keras.layers.Dense(
|
self.seq_relationship = tf.keras.layers.Dense(
|
||||||
2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship"
|
2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship"
|
||||||
)
|
)
|
||||||
@@ -472,7 +472,7 @@ class TFBertNSPHead(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFBertMainLayer(tf.keras.layers.Layer):
|
class TFBertMainLayer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFBertMainLayer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.num_hidden_layers = config.num_hidden_layers
|
self.num_hidden_layers = config.num_hidden_layers
|
||||||
|
|
||||||
self.embeddings = TFBertEmbeddings(config, name="embeddings")
|
self.embeddings = TFBertEmbeddings(config, name="embeddings")
|
||||||
@@ -707,7 +707,7 @@ class TFBertModel(TFBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFBertModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.bert = TFBertMainLayer(config, name="bert")
|
self.bert = TFBertMainLayer(config, name="bert")
|
||||||
|
|
||||||
def call(self, inputs, **kwargs):
|
def call(self, inputs, **kwargs):
|
||||||
@@ -750,7 +750,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFBertForPreTraining, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
|
|
||||||
self.bert = TFBertMainLayer(config, name="bert")
|
self.bert = TFBertMainLayer(config, name="bert")
|
||||||
self.nsp = TFBertNSPHead(config, name="nsp___cls")
|
self.nsp = TFBertNSPHead(config, name="nsp___cls")
|
||||||
@@ -803,7 +803,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
|
|
||||||
self.bert = TFBertMainLayer(config, name="bert")
|
self.bert = TFBertMainLayer(config, name="bert")
|
||||||
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
|
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
|
||||||
@@ -854,7 +854,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFBertForNextSentencePrediction, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
|
|
||||||
self.bert = TFBertMainLayer(config, name="bert")
|
self.bert = TFBertMainLayer(config, name="bert")
|
||||||
self.nsp = TFBertNSPHead(config, name="nsp___cls")
|
self.nsp = TFBertNSPHead(config, name="nsp___cls")
|
||||||
@@ -903,7 +903,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.bert = TFBertMainLayer(config, name="bert")
|
self.bert = TFBertMainLayer(config, name="bert")
|
||||||
@@ -960,7 +960,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFBertForMultipleChoice, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
|
|
||||||
self.bert = TFBertMainLayer(config, name="bert")
|
self.bert = TFBertMainLayer(config, name="bert")
|
||||||
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
|
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
|
||||||
@@ -1064,7 +1064,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.bert = TFBertMainLayer(config, name="bert")
|
self.bert = TFBertMainLayer(config, name="bert")
|
||||||
@@ -1121,7 +1121,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.bert = TFBertMainLayer(config, name="bert")
|
self.bert = TFBertMainLayer(config, name="bert")
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
|
|||||||
|
|
||||||
class TFMultiHeadAttention(tf.keras.layers.Layer):
|
class TFMultiHeadAttention(tf.keras.layers.Layer):
|
||||||
def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs):
|
def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs):
|
||||||
super(TFMultiHeadAttention, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.output_attentions = output_attentions
|
self.output_attentions = output_attentions
|
||||||
self.num_heads = num_heads
|
self.num_heads = num_heads
|
||||||
self.d_model_size = d_model_size
|
self.d_model_size = d_model_size
|
||||||
@@ -132,7 +132,7 @@ class TFEncoderLayer(tf.keras.layers.Layer):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
|
self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
|
||||||
):
|
):
|
||||||
super(TFEncoderLayer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.multi_head_attention = TFMultiHeadAttention(
|
self.multi_head_attention = TFMultiHeadAttention(
|
||||||
d_model_size, num_heads, output_attentions, name="multi_head_attention"
|
d_model_size, num_heads, output_attentions, name="multi_head_attention"
|
||||||
@@ -166,7 +166,7 @@ class TFEncoderLayer(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFCTRLMainLayer(tf.keras.layers.Layer):
|
class TFCTRLMainLayer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFCTRLMainLayer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_past = config.output_past
|
self.output_past = config.output_past
|
||||||
@@ -443,7 +443,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFCTRLModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.transformer = TFCTRLMainLayer(config, name="transformer")
|
self.transformer = TFCTRLMainLayer(config, name="transformer")
|
||||||
|
|
||||||
def call(self, inputs, **kwargs):
|
def call(self, inputs, **kwargs):
|
||||||
@@ -453,7 +453,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
|
|||||||
|
|
||||||
class TFCTRLLMHead(tf.keras.layers.Layer):
|
class TFCTRLLMHead(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, input_embeddings, **kwargs):
|
def __init__(self, config, input_embeddings, **kwargs):
|
||||||
super(TFCTRLLMHead, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = config.vocab_size
|
self.vocab_size = config.vocab_size
|
||||||
|
|
||||||
# The output weights are the same as the input embeddings, but there is
|
# The output weights are the same as the input embeddings, but there is
|
||||||
@@ -462,7 +462,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
def build(self, input_shape):
|
def build(self, input_shape):
|
||||||
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
||||||
super(TFCTRLLMHead, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def call(self, hidden_states):
|
def call(self, hidden_states):
|
||||||
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
||||||
@@ -508,7 +508,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFCTRLLMHeadModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.transformer = TFCTRLMainLayer(config, name="transformer")
|
self.transformer = TFCTRLMainLayer(config, name="transformer")
|
||||||
|
|
||||||
self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")
|
self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ def gelu_new(x):
|
|||||||
|
|
||||||
class TFEmbeddings(tf.keras.layers.Layer):
|
class TFEmbeddings(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFEmbeddings, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = config.vocab_size
|
self.vocab_size = config.vocab_size
|
||||||
self.dim = config.dim
|
self.dim = config.dim
|
||||||
self.initializer_range = config.initializer_range
|
self.initializer_range = config.initializer_range
|
||||||
@@ -92,7 +92,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
|
|||||||
self.word_embeddings = self.add_weight(
|
self.word_embeddings = self.add_weight(
|
||||||
"weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range)
|
"weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range)
|
||||||
)
|
)
|
||||||
super(TFEmbeddings, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
|
def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
|
||||||
"""Get token embeddings of inputs.
|
"""Get token embeddings of inputs.
|
||||||
@@ -169,7 +169,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
|
class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFMultiHeadSelfAttention, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.n_heads = config.n_heads
|
self.n_heads = config.n_heads
|
||||||
self.dim = config.dim
|
self.dim = config.dim
|
||||||
@@ -259,7 +259,7 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFFFN(tf.keras.layers.Layer):
|
class TFFFN(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFFFN, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
||||||
self.lin1 = tf.keras.layers.Dense(
|
self.lin1 = tf.keras.layers.Dense(
|
||||||
config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1"
|
config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1"
|
||||||
@@ -284,7 +284,7 @@ class TFFFN(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFTransformerBlock(tf.keras.layers.Layer):
|
class TFTransformerBlock(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFTransformerBlock, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.n_heads = config.n_heads
|
self.n_heads = config.n_heads
|
||||||
self.dim = config.dim
|
self.dim = config.dim
|
||||||
@@ -338,7 +338,7 @@ class TFTransformerBlock(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFTransformer(tf.keras.layers.Layer):
|
class TFTransformer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFTransformer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.n_layers = config.n_layers
|
self.n_layers = config.n_layers
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
@@ -399,7 +399,7 @@ class TFTransformer(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFDistilBertMainLayer(tf.keras.layers.Layer):
|
class TFDistilBertMainLayer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFDistilBertMainLayer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.num_hidden_layers = config.num_hidden_layers
|
self.num_hidden_layers = config.num_hidden_layers
|
||||||
|
|
||||||
self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings
|
self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings
|
||||||
@@ -569,7 +569,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings
|
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings
|
||||||
|
|
||||||
def call(self, inputs, **kwargs):
|
def call(self, inputs, **kwargs):
|
||||||
@@ -579,7 +579,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
|
|||||||
|
|
||||||
class TFDistilBertLMHead(tf.keras.layers.Layer):
|
class TFDistilBertLMHead(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, input_embeddings, **kwargs):
|
def __init__(self, config, input_embeddings, **kwargs):
|
||||||
super(TFDistilBertLMHead, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = config.vocab_size
|
self.vocab_size = config.vocab_size
|
||||||
|
|
||||||
# The output weights are the same as the input embeddings, but there is
|
# The output weights are the same as the input embeddings, but there is
|
||||||
@@ -588,7 +588,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
def build(self, input_shape):
|
def build(self, input_shape):
|
||||||
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
||||||
super(TFDistilBertLMHead, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def call(self, hidden_states):
|
def call(self, hidden_states):
|
||||||
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
||||||
@@ -628,7 +628,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
self.vocab_size = config.vocab_size
|
self.vocab_size = config.vocab_size
|
||||||
@@ -690,7 +690,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
|
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
|
||||||
@@ -747,7 +747,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
|
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
|
||||||
@@ -804,7 +804,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
|
|
||||||
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
|
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
|
||||||
self.qa_outputs = tf.keras.layers.Dense(
|
self.qa_outputs = tf.keras.layers.Dense(
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ def gelu(x):
|
|||||||
|
|
||||||
class TFAttention(tf.keras.layers.Layer):
|
class TFAttention(tf.keras.layers.Layer):
|
||||||
def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
|
def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
|
||||||
super(TFAttention, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
|
|
||||||
n_state = nx # in Attention: n_state=768 (nx=n_embd)
|
n_state = nx # in Attention: n_state=768 (nx=n_embd)
|
||||||
@@ -157,7 +157,7 @@ class TFAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFMLP(tf.keras.layers.Layer):
|
class TFMLP(tf.keras.layers.Layer):
|
||||||
def __init__(self, n_state, config, **kwargs):
|
def __init__(self, n_state, config, **kwargs):
|
||||||
super(TFMLP, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
nx = config.n_embd
|
nx = config.n_embd
|
||||||
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
|
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
|
||||||
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
|
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
|
||||||
@@ -173,7 +173,7 @@ class TFMLP(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFBlock(tf.keras.layers.Layer):
|
class TFBlock(tf.keras.layers.Layer):
|
||||||
def __init__(self, n_ctx, config, scale=False, **kwargs):
|
def __init__(self, n_ctx, config, scale=False, **kwargs):
|
||||||
super(TFBlock, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
nx = config.n_embd
|
nx = config.n_embd
|
||||||
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
|
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
|
||||||
self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
|
self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
|
||||||
@@ -198,7 +198,7 @@ class TFBlock(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFGPT2MainLayer(tf.keras.layers.Layer):
|
class TFGPT2MainLayer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFGPT2MainLayer, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.num_hidden_layers = config.n_layer
|
self.num_hidden_layers = config.n_layer
|
||||||
@@ -475,7 +475,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFGPT2Model, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.transformer = TFGPT2MainLayer(config, name="transformer")
|
self.transformer = TFGPT2MainLayer(config, name="transformer")
|
||||||
|
|
||||||
def call(self, inputs, **kwargs):
|
def call(self, inputs, **kwargs):
|
||||||
@@ -521,7 +521,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.transformer = TFGPT2MainLayer(config, name="transformer")
|
self.transformer = TFGPT2MainLayer(config, name="transformer")
|
||||||
|
|
||||||
def get_output_embeddings(self):
|
def get_output_embeddings(self):
|
||||||
@@ -598,7 +598,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
config.num_labels = 1
|
config.num_labels = 1
|
||||||
self.transformer = TFGPT2MainLayer(config, name="transformer")
|
self.transformer = TFGPT2MainLayer(config, name="transformer")
|
||||||
self.multiple_choice_head = TFSequenceSummary(
|
self.multiple_choice_head = TFSequenceSummary(
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ ACT_FNS = {
|
|||||||
|
|
||||||
class TFAttention(tf.keras.layers.Layer):
|
class TFAttention(tf.keras.layers.Layer):
|
||||||
def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
|
def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
|
||||||
super(TFAttention, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
|
|
||||||
n_state = nx # in Attention: n_state=768 (nx=n_embd)
|
n_state = nx # in Attention: n_state=768 (nx=n_embd)
|
||||||
@@ -160,7 +160,7 @@ class TFAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFMLP(tf.keras.layers.Layer):
|
class TFMLP(tf.keras.layers.Layer):
|
||||||
def __init__(self, n_state, config, **kwargs):
|
def __init__(self, n_state, config, **kwargs):
|
||||||
super(TFMLP, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
nx = config.n_embd
|
nx = config.n_embd
|
||||||
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
|
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
|
||||||
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
|
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
|
||||||
@@ -176,7 +176,7 @@ class TFMLP(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFBlock(tf.keras.layers.Layer):
|
class TFBlock(tf.keras.layers.Layer):
|
||||||
def __init__(self, n_ctx, config, scale=False, **kwargs):
|
def __init__(self, n_ctx, config, scale=False, **kwargs):
|
||||||
super(TFBlock, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
nx = config.n_embd
|
nx = config.n_embd
|
||||||
self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
|
self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
|
||||||
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
|
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
|
||||||
@@ -199,7 +199,7 @@ class TFBlock(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
|
class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFOpenAIGPTMainLayer, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.num_hidden_layers = config.n_layer
|
self.num_hidden_layers = config.n_layer
|
||||||
@@ -453,7 +453,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
|
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
|
||||||
|
|
||||||
def call(self, inputs, **kwargs):
|
def call(self, inputs, **kwargs):
|
||||||
@@ -494,7 +494,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
|
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
|
||||||
|
|
||||||
def get_output_embeddings(self):
|
def get_output_embeddings(self):
|
||||||
@@ -563,7 +563,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
config.num_labels = 1
|
config.num_labels = 1
|
||||||
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
|
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
|
||||||
self.multiple_choice_head = TFSequenceSummary(
|
self.multiple_choice_head = TFSequenceSummary(
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFRobertaEmbeddings, self).__init__(config, **kwargs)
|
super().__init__(config, **kwargs)
|
||||||
self.padding_idx = 1
|
self.padding_idx = 1
|
||||||
|
|
||||||
def create_position_ids_from_input_ids(self, x):
|
def create_position_ids_from_input_ids(self, x):
|
||||||
@@ -78,7 +78,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
|
|||||||
else:
|
else:
|
||||||
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
|
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
|
||||||
|
|
||||||
return super(TFRobertaEmbeddings, self)._embedding(
|
return super()._embedding(
|
||||||
[input_ids, position_ids, token_type_ids, inputs_embeds], training=training
|
[input_ids, position_ids, token_type_ids, inputs_embeds], training=training
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -89,7 +89,7 @@ class TFRobertaMainLayer(TFBertMainLayer):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFRobertaMainLayer, self).__init__(config, **kwargs)
|
super().__init__(config, **kwargs)
|
||||||
self.embeddings = TFRobertaEmbeddings(config, name="embeddings")
|
self.embeddings = TFRobertaEmbeddings(config, name="embeddings")
|
||||||
|
|
||||||
def get_input_embeddings(self):
|
def get_input_embeddings(self):
|
||||||
@@ -234,7 +234,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFRobertaModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.roberta = TFRobertaMainLayer(config, name="roberta")
|
self.roberta = TFRobertaMainLayer(config, name="roberta")
|
||||||
|
|
||||||
def call(self, inputs, **kwargs):
|
def call(self, inputs, **kwargs):
|
||||||
@@ -246,7 +246,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
|
|||||||
"""Roberta Head for masked language modeling."""
|
"""Roberta Head for masked language modeling."""
|
||||||
|
|
||||||
def __init__(self, config, input_embeddings, **kwargs):
|
def __init__(self, config, input_embeddings, **kwargs):
|
||||||
super(TFRobertaLMHead, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = config.vocab_size
|
self.vocab_size = config.vocab_size
|
||||||
self.dense = tf.keras.layers.Dense(
|
self.dense = tf.keras.layers.Dense(
|
||||||
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
|
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
|
||||||
@@ -260,7 +260,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
def build(self, input_shape):
|
def build(self, input_shape):
|
||||||
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
||||||
super(TFRobertaLMHead, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def call(self, features):
|
def call(self, features):
|
||||||
x = self.dense(features)
|
x = self.dense(features)
|
||||||
@@ -305,7 +305,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFRobertaForMaskedLM, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
|
|
||||||
self.roberta = TFRobertaMainLayer(config, name="roberta")
|
self.roberta = TFRobertaMainLayer(config, name="roberta")
|
||||||
self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")
|
self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")
|
||||||
@@ -328,7 +328,7 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
|
|||||||
"""Head for sentence-level classification tasks."""
|
"""Head for sentence-level classification tasks."""
|
||||||
|
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFRobertaClassificationHead, self).__init__(config, **kwargs)
|
super().__init__(config, **kwargs)
|
||||||
self.dense = tf.keras.layers.Dense(
|
self.dense = tf.keras.layers.Dense(
|
||||||
config.hidden_size,
|
config.hidden_size,
|
||||||
kernel_initializer=get_initializer(config.initializer_range),
|
kernel_initializer=get_initializer(config.initializer_range),
|
||||||
@@ -383,7 +383,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFRobertaForSequenceClassification, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.roberta = TFRobertaMainLayer(config, name="roberta")
|
self.roberta = TFRobertaMainLayer(config, name="roberta")
|
||||||
@@ -433,7 +433,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFRobertaForTokenClassification, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.roberta = TFRobertaMainLayer(config, name="roberta")
|
self.roberta = TFRobertaMainLayer(config, name="roberta")
|
||||||
|
|||||||
@@ -50,13 +50,13 @@ class TFT5LayerNorm(tf.keras.layers.Layer):
|
|||||||
""" Construct a layernorm module in the T5 style
|
""" Construct a layernorm module in the T5 style
|
||||||
No bias and no substraction of mean.
|
No bias and no substraction of mean.
|
||||||
"""
|
"""
|
||||||
super(TFT5LayerNorm, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.variance_epsilon = epsilon
|
self.variance_epsilon = epsilon
|
||||||
|
|
||||||
def build(self, input_shape):
|
def build(self, input_shape):
|
||||||
"""Build shared word embedding layer """
|
"""Build shared word embedding layer """
|
||||||
self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones")
|
self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones")
|
||||||
super(TFT5LayerNorm, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def call(self, x):
|
def call(self, x):
|
||||||
variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
|
variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
|
||||||
@@ -66,7 +66,7 @@ class TFT5LayerNorm(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFT5DenseReluDense(tf.keras.layers.Layer):
|
class TFT5DenseReluDense(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFT5DenseReluDense, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name="wi")
|
self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name="wi")
|
||||||
self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name="wo")
|
self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name="wo")
|
||||||
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
|
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
|
||||||
@@ -82,7 +82,7 @@ class TFT5DenseReluDense(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFT5LayerFF(tf.keras.layers.Layer):
|
class TFT5LayerFF(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFT5LayerFF, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.DenseReluDense = TFT5DenseReluDense(config, name="DenseReluDense")
|
self.DenseReluDense = TFT5DenseReluDense(config, name="DenseReluDense")
|
||||||
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
|
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
|
||||||
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
|
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
|
||||||
@@ -98,7 +98,7 @@ class TFT5Attention(tf.keras.layers.Layer):
|
|||||||
NEW_ID = itertools.count()
|
NEW_ID = itertools.count()
|
||||||
|
|
||||||
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
|
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
|
||||||
super(TFT5Attention, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.layer_id = next(TFT5Attention.NEW_ID)
|
self.layer_id = next(TFT5Attention.NEW_ID)
|
||||||
self.is_decoder = config.is_decoder
|
self.is_decoder = config.is_decoder
|
||||||
self.has_relative_attention_bias = has_relative_attention_bias
|
self.has_relative_attention_bias = has_relative_attention_bias
|
||||||
@@ -259,7 +259,7 @@ class TFT5Attention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFT5LayerSelfAttention(tf.keras.layers.Layer):
|
class TFT5LayerSelfAttention(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
|
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
|
||||||
super(TFT5LayerSelfAttention, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.SelfAttention = TFT5Attention(
|
self.SelfAttention = TFT5Attention(
|
||||||
config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention"
|
config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention"
|
||||||
)
|
)
|
||||||
@@ -279,7 +279,7 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFT5LayerCrossAttention(tf.keras.layers.Layer):
|
class TFT5LayerCrossAttention(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
|
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
|
||||||
super(TFT5LayerCrossAttention, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.EncDecAttention = TFT5Attention(
|
self.EncDecAttention = TFT5Attention(
|
||||||
config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention"
|
config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention"
|
||||||
)
|
)
|
||||||
@@ -299,7 +299,7 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFT5Block(tf.keras.layers.Layer):
|
class TFT5Block(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
|
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
|
||||||
super(TFT5Block, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.is_decoder = config.is_decoder
|
self.is_decoder = config.is_decoder
|
||||||
self.layer = []
|
self.layer = []
|
||||||
self.layer.append(
|
self.layer.append(
|
||||||
@@ -361,7 +361,7 @@ class TFT5Block(tf.keras.layers.Layer):
|
|||||||
####################################################
|
####################################################
|
||||||
class TFT5MainLayer(tf.keras.layers.Layer):
|
class TFT5MainLayer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFT5MainLayer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
self.is_decoder = config.is_decoder
|
self.is_decoder = config.is_decoder
|
||||||
@@ -633,7 +633,7 @@ class TFT5Model(TFT5PreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFT5Model, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
|
self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
|
||||||
|
|
||||||
encoder_config = copy.deepcopy(config)
|
encoder_config = copy.deepcopy(config)
|
||||||
@@ -724,7 +724,7 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFT5WithLMHeadModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.model_dim = config.d_model
|
self.model_dim = config.d_model
|
||||||
|
|
||||||
self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
|
self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
|||||||
|
|
||||||
class TFPositionalEmbedding(tf.keras.layers.Layer):
|
class TFPositionalEmbedding(tf.keras.layers.Layer):
|
||||||
def __init__(self, demb, **kwargs):
|
def __init__(self, demb, **kwargs):
|
||||||
super(TFPositionalEmbedding, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))
|
self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))
|
||||||
|
|
||||||
@@ -52,7 +52,7 @@ class TFPositionalEmbedding(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFPositionwiseFF(tf.keras.layers.Layer):
|
class TFPositionwiseFF(tf.keras.layers.Layer):
|
||||||
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
|
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
|
||||||
super(TFPositionwiseFF, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.d_model = d_model
|
self.d_model = d_model
|
||||||
self.d_inner = d_inner
|
self.d_inner = d_inner
|
||||||
@@ -112,7 +112,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
|
|||||||
init_std=0.02,
|
init_std=0.02,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(TFRelPartialLearnableMultiHeadAttn, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.output_attentions = output_attentions
|
self.output_attentions = output_attentions
|
||||||
self.n_head = n_head
|
self.n_head = n_head
|
||||||
@@ -155,7 +155,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
|
|||||||
self.r_w_bias = self.add_weight(
|
self.r_w_bias = self.add_weight(
|
||||||
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
|
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
|
||||||
)
|
)
|
||||||
super(TFRelPartialLearnableMultiHeadAttn, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def _rel_shift(self, x):
|
def _rel_shift(self, x):
|
||||||
x_size = shape_list(x)
|
x_size = shape_list(x)
|
||||||
@@ -267,7 +267,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
|
|||||||
init_std=0.02,
|
init_std=0.02,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(TFRelPartialLearnableDecoderLayer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.dec_attn = TFRelPartialLearnableMultiHeadAttn(
|
self.dec_attn = TFRelPartialLearnableMultiHeadAttn(
|
||||||
n_head,
|
n_head,
|
||||||
@@ -308,7 +308,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFAdaptiveEmbedding(tf.keras.layers.Layer):
|
class TFAdaptiveEmbedding(tf.keras.layers.Layer):
|
||||||
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs):
|
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs):
|
||||||
super(TFAdaptiveEmbedding, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.n_token = n_token
|
self.n_token = n_token
|
||||||
self.d_embed = d_embed
|
self.d_embed = d_embed
|
||||||
@@ -350,7 +350,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
|
|||||||
name="emb_projs_._{}".format(i),
|
name="emb_projs_._{}".format(i),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
super(TFAdaptiveEmbedding, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def call(self, inp):
|
def call(self, inp):
|
||||||
if self.div_val == 1:
|
if self.div_val == 1:
|
||||||
@@ -380,7 +380,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFTransfoXLMainLayer(tf.keras.layers.Layer):
|
class TFTransfoXLMainLayer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFTransfoXLMainLayer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
|
|
||||||
@@ -455,7 +455,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
|
|||||||
self.r_r_bias = self.add_weight(
|
self.r_r_bias = self.add_weight(
|
||||||
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
|
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
|
||||||
)
|
)
|
||||||
super(TFTransfoXLMainLayer, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def get_input_embeddings(self):
|
def get_input_embeddings(self):
|
||||||
return self.word_emb
|
return self.word_emb
|
||||||
@@ -728,7 +728,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFTransfoXLModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
|
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
|
||||||
|
|
||||||
def call(self, inputs, **kwargs):
|
def call(self, inputs, **kwargs):
|
||||||
@@ -774,7 +774,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(TFTransfoXLLMHeadModel, self).__init__(config)
|
super().__init__(config)
|
||||||
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
|
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
|
||||||
self.sample_softmax = config.sample_softmax
|
self.sample_softmax = config.sample_softmax
|
||||||
# use sampled softmax
|
# use sampled softmax
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ from .modeling_tf_utils import shape_list
|
|||||||
|
|
||||||
class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
||||||
def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs):
|
def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs):
|
||||||
super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.d_embed = d_embed
|
self.d_embed = d_embed
|
||||||
@@ -98,7 +98,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
|||||||
name="out_layers_._{}_._bias".format(i),
|
name="out_layers_._{}_._bias".format(i),
|
||||||
)
|
)
|
||||||
self.out_layers.append((weight, bias))
|
self.out_layers.append((weight, bias))
|
||||||
super(TFAdaptiveSoftmaxMask, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _logit(x, W, b, proj=None):
|
def _logit(x, W, b, proj=None):
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
|
|||||||
return {"input_ids": tf.constant(DUMMY_INPUTS)}
|
return {"input_ids": tf.constant(DUMMY_INPUTS)}
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
|
super().__init__(*inputs, **kwargs)
|
||||||
if not isinstance(config, PretrainedConfig):
|
if not isinstance(config, PretrainedConfig):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
|
"Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
|
||||||
@@ -385,7 +385,7 @@ class TFConv1D(tf.keras.layers.Layer):
|
|||||||
""" TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
|
""" TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
|
||||||
Basically works like a Linear layer but the weights are transposed
|
Basically works like a Linear layer but the weights are transposed
|
||||||
"""
|
"""
|
||||||
super(TFConv1D, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.nf = nf
|
self.nf = nf
|
||||||
self.nx = nx
|
self.nx = nx
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
@@ -412,7 +412,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs):
|
def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs):
|
||||||
super(TFSharedEmbeddings, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.hidden_size = hidden_size
|
self.hidden_size = hidden_size
|
||||||
self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
|
self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
|
||||||
@@ -425,7 +425,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
|
|||||||
self.weight = self.add_weight(
|
self.weight = self.add_weight(
|
||||||
"weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range)
|
"weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range)
|
||||||
)
|
)
|
||||||
super(TFSharedEmbeddings, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def call(self, inputs, mode="embedding"):
|
def call(self, inputs, mode="embedding"):
|
||||||
"""Get token embeddings of inputs.
|
"""Get token embeddings of inputs.
|
||||||
@@ -485,7 +485,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, initializer_range=0.02, **kwargs):
|
def __init__(self, config, initializer_range=0.02, **kwargs):
|
||||||
super(TFSequenceSummary, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last"
|
self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last"
|
||||||
if self.summary_type == "attn":
|
if self.summary_type == "attn":
|
||||||
|
|||||||
@@ -97,7 +97,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
|
|||||||
NEW_ID = itertools.count()
|
NEW_ID = itertools.count()
|
||||||
|
|
||||||
def __init__(self, n_heads, dim, config, **kwargs):
|
def __init__(self, n_heads, dim, config, **kwargs):
|
||||||
super(TFMultiHeadAttention, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.layer_id = next(TFMultiHeadAttention.NEW_ID)
|
self.layer_id = next(TFMultiHeadAttention.NEW_ID)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.dim = dim
|
self.dim = dim
|
||||||
@@ -182,7 +182,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFTransformerFFN(tf.keras.layers.Layer):
|
class TFTransformerFFN(tf.keras.layers.Layer):
|
||||||
def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
|
def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
|
||||||
super(TFTransformerFFN, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1")
|
self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1")
|
||||||
self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
|
self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
|
||||||
self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu
|
self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu
|
||||||
@@ -198,7 +198,7 @@ class TFTransformerFFN(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFXLMMainLayer(tf.keras.layers.Layer):
|
class TFXLMMainLayer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFXLMMainLayer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
|
|
||||||
@@ -608,7 +608,7 @@ class TFXLMModel(TFXLMPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFXLMModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.transformer = TFXLMMainLayer(config, name="transformer")
|
self.transformer = TFXLMMainLayer(config, name="transformer")
|
||||||
|
|
||||||
def call(self, inputs, **kwargs):
|
def call(self, inputs, **kwargs):
|
||||||
@@ -622,7 +622,7 @@ class TFXLMPredLayer(tf.keras.layers.Layer):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, input_embeddings, **kwargs):
|
def __init__(self, config, input_embeddings, **kwargs):
|
||||||
super(TFXLMPredLayer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.asm = config.asm
|
self.asm = config.asm
|
||||||
self.n_words = config.n_words
|
self.n_words = config.n_words
|
||||||
self.pad_index = config.pad_index
|
self.pad_index = config.pad_index
|
||||||
@@ -641,7 +641,7 @@ class TFXLMPredLayer(tf.keras.layers.Layer):
|
|||||||
def build(self, input_shape):
|
def build(self, input_shape):
|
||||||
# The output weights are the same as the input embeddings, but there is an output-only bias for each token.
|
# The output weights are the same as the input embeddings, but there is an output-only bias for each token.
|
||||||
self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias")
|
self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias")
|
||||||
super(TFXLMPredLayer, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def call(self, hidden_states):
|
def call(self, hidden_states):
|
||||||
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
||||||
@@ -682,7 +682,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFXLMWithLMHeadModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.transformer = TFXLMMainLayer(config, name="transformer")
|
self.transformer = TFXLMMainLayer(config, name="transformer")
|
||||||
self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
|
self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
|
||||||
|
|
||||||
@@ -733,7 +733,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFXLMForSequenceClassification, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.transformer = TFXLMMainLayer(config, name="transformer")
|
self.transformer = TFXLMMainLayer(config, name="transformer")
|
||||||
@@ -784,7 +784,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.transformer = TFXLMMainLayer(config, name="transformer")
|
self.transformer = TFXLMMainLayer(config, name="transformer")
|
||||||
self.qa_outputs = tf.keras.layers.Dense(
|
self.qa_outputs = tf.keras.layers.Dense(
|
||||||
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
|
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ ACT2FN = {
|
|||||||
|
|
||||||
class TFXLNetRelativeAttention(tf.keras.layers.Layer):
|
class TFXLNetRelativeAttention(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFXLNetRelativeAttention, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
|
|
||||||
if config.d_model % config.n_head != 0:
|
if config.d_model % config.n_head != 0:
|
||||||
@@ -104,7 +104,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
|
|||||||
self.seg_embed = self.add_weight(
|
self.seg_embed = self.add_weight(
|
||||||
shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed"
|
shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed"
|
||||||
)
|
)
|
||||||
super(TFXLNetRelativeAttention, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def prune_heads(self, heads):
|
def prune_heads(self, heads):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
@@ -280,7 +280,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFXLNetFeedForward(tf.keras.layers.Layer):
|
class TFXLNetFeedForward(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFXLNetFeedForward, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
|
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
|
||||||
self.layer_1 = tf.keras.layers.Dense(
|
self.layer_1 = tf.keras.layers.Dense(
|
||||||
config.d_inner, kernel_initializer=get_initializer(config.initializer_range), name="layer_1"
|
config.d_inner, kernel_initializer=get_initializer(config.initializer_range), name="layer_1"
|
||||||
@@ -307,7 +307,7 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFXLNetLayer(tf.keras.layers.Layer):
|
class TFXLNetLayer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFXLNetLayer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.rel_attn = TFXLNetRelativeAttention(config, name="rel_attn")
|
self.rel_attn = TFXLNetRelativeAttention(config, name="rel_attn")
|
||||||
self.ff = TFXLNetFeedForward(config, name="ff")
|
self.ff = TFXLNetFeedForward(config, name="ff")
|
||||||
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
||||||
@@ -326,7 +326,7 @@ class TFXLNetLayer(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFXLNetLMHead(tf.keras.layers.Layer):
|
class TFXLNetLMHead(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, input_embeddings, **kwargs):
|
def __init__(self, config, input_embeddings, **kwargs):
|
||||||
super(TFXLNetLMHead, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = config.vocab_size
|
self.vocab_size = config.vocab_size
|
||||||
# The output weights are the same as the input embeddings, but there is
|
# The output weights are the same as the input embeddings, but there is
|
||||||
# an output-only bias for each token.
|
# an output-only bias for each token.
|
||||||
@@ -334,7 +334,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
def build(self, input_shape):
|
def build(self, input_shape):
|
||||||
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
||||||
super(TFXLNetLMHead, self).build(input_shape)
|
super().build(input_shape)
|
||||||
|
|
||||||
def call(self, hidden_states):
|
def call(self, hidden_states):
|
||||||
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
||||||
@@ -344,7 +344,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
class TFXLNetMainLayer(tf.keras.layers.Layer):
|
class TFXLNetMainLayer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFXLNetMainLayer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
self.output_past = config.output_past
|
self.output_past = config.output_past
|
||||||
@@ -832,7 +832,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFXLNetModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.transformer = TFXLNetMainLayer(config, name="transformer")
|
self.transformer = TFXLNetMainLayer(config, name="transformer")
|
||||||
|
|
||||||
def call(self, inputs, **kwargs):
|
def call(self, inputs, **kwargs):
|
||||||
@@ -885,7 +885,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFXLNetLMHeadModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.transformer = TFXLNetMainLayer(config, name="transformer")
|
self.transformer = TFXLNetMainLayer(config, name="transformer")
|
||||||
self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss")
|
self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss")
|
||||||
|
|
||||||
@@ -940,7 +940,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFXLNetForSequenceClassification, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.transformer = TFXLNetMainLayer(config, name="transformer")
|
self.transformer = TFXLNetMainLayer(config, name="transformer")
|
||||||
@@ -1001,7 +1001,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFXLNetForTokenClassification, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.transformer = TFXLNetMainLayer(config, name="transformer")
|
self.transformer = TFXLNetMainLayer(config, name="transformer")
|
||||||
@@ -1058,7 +1058,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.transformer = TFXLNetMainLayer(config, name="transformer")
|
self.transformer = TFXLNetMainLayer(config, name="transformer")
|
||||||
self.qa_outputs = tf.keras.layers.Dense(
|
self.qa_outputs = tf.keras.layers.Dense(
|
||||||
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
|
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
|
||||||
@@ -1127,7 +1127,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
|
|||||||
|
|
||||||
# """
|
# """
|
||||||
# def __init__(self, config, *inputs, **kwargs):
|
# def __init__(self, config, *inputs, **kwargs):
|
||||||
# super(TFXLNetForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
|
# super().__init__(config, *inputs, **kwargs)
|
||||||
# self.start_n_top = config.start_n_top
|
# self.start_n_top = config.start_n_top
|
||||||
# self.end_n_top = config.end_n_top
|
# self.end_n_top = config.end_n_top
|
||||||
|
|
||||||
|
|||||||
@@ -165,7 +165,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
|
|||||||
|
|
||||||
class PositionalEmbedding(nn.Module):
|
class PositionalEmbedding(nn.Module):
|
||||||
def __init__(self, demb):
|
def __init__(self, demb):
|
||||||
super(PositionalEmbedding, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.demb = demb
|
self.demb = demb
|
||||||
|
|
||||||
@@ -184,7 +184,7 @@ class PositionalEmbedding(nn.Module):
|
|||||||
|
|
||||||
class PositionwiseFF(nn.Module):
|
class PositionwiseFF(nn.Module):
|
||||||
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5):
|
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5):
|
||||||
super(PositionwiseFF, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.d_model = d_model
|
self.d_model = d_model
|
||||||
self.d_inner = d_inner
|
self.d_inner = d_inner
|
||||||
@@ -236,7 +236,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
|
|||||||
output_attentions=False,
|
output_attentions=False,
|
||||||
layer_norm_epsilon=1e-5,
|
layer_norm_epsilon=1e-5,
|
||||||
):
|
):
|
||||||
super(RelPartialLearnableMultiHeadAttn, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.output_attentions = output_attentions
|
self.output_attentions = output_attentions
|
||||||
self.n_head = n_head
|
self.n_head = n_head
|
||||||
@@ -368,7 +368,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
|
|||||||
|
|
||||||
class RelPartialLearnableDecoderLayer(nn.Module):
|
class RelPartialLearnableDecoderLayer(nn.Module):
|
||||||
def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs):
|
def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs):
|
||||||
super(RelPartialLearnableDecoderLayer, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.dec_attn = RelPartialLearnableMultiHeadAttn(
|
self.dec_attn = RelPartialLearnableMultiHeadAttn(
|
||||||
n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs
|
n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs
|
||||||
@@ -389,7 +389,7 @@ class RelPartialLearnableDecoderLayer(nn.Module):
|
|||||||
|
|
||||||
class AdaptiveEmbedding(nn.Module):
|
class AdaptiveEmbedding(nn.Module):
|
||||||
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False):
|
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False):
|
||||||
super(AdaptiveEmbedding, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.n_token = n_token
|
self.n_token = n_token
|
||||||
self.d_embed = d_embed
|
self.d_embed = d_embed
|
||||||
@@ -587,7 +587,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(TransfoXLModel, self).__init__(config)
|
super().__init__(config)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
|
|
||||||
@@ -845,7 +845,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(TransfoXLLMHeadModel, self).__init__(config)
|
super().__init__(config)
|
||||||
self.transformer = TransfoXLModel(config)
|
self.transformer = TransfoXLModel(config)
|
||||||
self.sample_softmax = config.sample_softmax
|
self.sample_softmax = config.sample_softmax
|
||||||
# use sampled softmax
|
# use sampled softmax
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ import torch.nn.functional as F
|
|||||||
|
|
||||||
class ProjectedAdaptiveLogSoftmax(nn.Module):
|
class ProjectedAdaptiveLogSoftmax(nn.Module):
|
||||||
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False):
|
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False):
|
||||||
super(ProjectedAdaptiveLogSoftmax, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.n_token = n_token
|
self.n_token = n_token
|
||||||
self.d_embed = d_embed
|
self.d_embed = d_embed
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ except ImportError:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super(Identity, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
def forward(self, input):
|
def forward(self, input):
|
||||||
return input
|
return input
|
||||||
@@ -97,7 +97,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
|
|||||||
return {"input_ids": torch.tensor(DUMMY_INPUTS)}
|
return {"input_ids": torch.tensor(DUMMY_INPUTS)}
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(PreTrainedModel, self).__init__()
|
super().__init__()
|
||||||
if not isinstance(config, PretrainedConfig):
|
if not isinstance(config, PretrainedConfig):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
|
"Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
|
||||||
@@ -1102,7 +1102,7 @@ class Conv1D(nn.Module):
|
|||||||
""" Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
|
""" Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
|
||||||
Basically works like a Linear layer but the weights are transposed
|
Basically works like a Linear layer but the weights are transposed
|
||||||
"""
|
"""
|
||||||
super(Conv1D, self).__init__()
|
super().__init__()
|
||||||
self.nf = nf
|
self.nf = nf
|
||||||
w = torch.empty(nx, nf)
|
w = torch.empty(nx, nf)
|
||||||
nn.init.normal_(w, std=0.02)
|
nn.init.normal_(w, std=0.02)
|
||||||
@@ -1120,7 +1120,7 @@ class PoolerStartLogits(nn.Module):
|
|||||||
""" Compute SQuAD start_logits from sequence hidden states. """
|
""" Compute SQuAD start_logits from sequence hidden states. """
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(PoolerStartLogits, self).__init__()
|
super().__init__()
|
||||||
self.dense = nn.Linear(config.hidden_size, 1)
|
self.dense = nn.Linear(config.hidden_size, 1)
|
||||||
|
|
||||||
def forward(self, hidden_states, p_mask=None):
|
def forward(self, hidden_states, p_mask=None):
|
||||||
@@ -1145,7 +1145,7 @@ class PoolerEndLogits(nn.Module):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(PoolerEndLogits, self).__init__()
|
super().__init__()
|
||||||
self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
|
self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
|
||||||
self.activation = nn.Tanh()
|
self.activation = nn.Tanh()
|
||||||
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||||
@@ -1191,7 +1191,7 @@ class PoolerAnswerClass(nn.Module):
|
|||||||
""" Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """
|
""" Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(PoolerAnswerClass, self).__init__()
|
super().__init__()
|
||||||
self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
|
self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
|
||||||
self.activation = nn.Tanh()
|
self.activation = nn.Tanh()
|
||||||
self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
|
self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
|
||||||
@@ -1276,7 +1276,7 @@ class SQuADHead(nn.Module):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(SQuADHead, self).__init__()
|
super().__init__()
|
||||||
self.start_n_top = config.start_n_top
|
self.start_n_top = config.start_n_top
|
||||||
self.end_n_top = config.end_n_top
|
self.end_n_top = config.end_n_top
|
||||||
|
|
||||||
@@ -1368,7 +1368,7 @@ class SequenceSummary(nn.Module):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(SequenceSummary, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.summary_type = config.summary_type if hasattr(config, "summary_type") else "last"
|
self.summary_type = config.summary_type if hasattr(config, "summary_type") else "last"
|
||||||
if self.summary_type == "attn":
|
if self.summary_type == "attn":
|
||||||
|
|||||||
@@ -96,7 +96,7 @@ class MultiHeadAttention(nn.Module):
|
|||||||
NEW_ID = itertools.count()
|
NEW_ID = itertools.count()
|
||||||
|
|
||||||
def __init__(self, n_heads, dim, config):
|
def __init__(self, n_heads, dim, config):
|
||||||
super(MultiHeadAttention, self).__init__()
|
super().__init__()
|
||||||
self.layer_id = next(MultiHeadAttention.NEW_ID)
|
self.layer_id = next(MultiHeadAttention.NEW_ID)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.dim = dim
|
self.dim = dim
|
||||||
@@ -197,7 +197,7 @@ class MultiHeadAttention(nn.Module):
|
|||||||
|
|
||||||
class TransformerFFN(nn.Module):
|
class TransformerFFN(nn.Module):
|
||||||
def __init__(self, in_dim, dim_hidden, out_dim, config):
|
def __init__(self, in_dim, dim_hidden, out_dim, config):
|
||||||
super(TransformerFFN, self).__init__()
|
super().__init__()
|
||||||
self.dropout = config.dropout
|
self.dropout = config.dropout
|
||||||
self.lin1 = nn.Linear(in_dim, dim_hidden)
|
self.lin1 = nn.Linear(in_dim, dim_hidden)
|
||||||
self.lin2 = nn.Linear(dim_hidden, out_dim)
|
self.lin2 = nn.Linear(dim_hidden, out_dim)
|
||||||
@@ -222,7 +222,7 @@ class XLMPreTrainedModel(PreTrainedModel):
|
|||||||
base_model_prefix = "transformer"
|
base_model_prefix = "transformer"
|
||||||
|
|
||||||
def __init__(self, *inputs, **kwargs):
|
def __init__(self, *inputs, **kwargs):
|
||||||
super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)
|
super().__init__(*inputs, **kwargs)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def dummy_inputs(self):
|
def dummy_inputs(self):
|
||||||
@@ -354,7 +354,7 @@ class XLMModel(XLMPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config): # , dico, is_encoder, with_output):
|
def __init__(self, config): # , dico, is_encoder, with_output):
|
||||||
super(XLMModel, self).__init__(config)
|
super().__init__(config)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
|
|
||||||
@@ -585,7 +585,7 @@ class XLMPredLayer(nn.Module):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLMPredLayer, self).__init__()
|
super().__init__()
|
||||||
self.asm = config.asm
|
self.asm = config.asm
|
||||||
self.n_words = config.n_words
|
self.n_words = config.n_words
|
||||||
self.pad_index = config.pad_index
|
self.pad_index = config.pad_index
|
||||||
@@ -661,7 +661,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLMWithLMHeadModel, self).__init__(config)
|
super().__init__(config)
|
||||||
self.transformer = XLMModel(config)
|
self.transformer = XLMModel(config)
|
||||||
self.pred_layer = XLMPredLayer(config)
|
self.pred_layer = XLMPredLayer(config)
|
||||||
|
|
||||||
@@ -754,7 +754,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLMForSequenceClassification, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.transformer = XLMModel(config)
|
self.transformer = XLMModel(config)
|
||||||
@@ -856,7 +856,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLMForQuestionAnsweringSimple, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.transformer = XLMModel(config)
|
self.transformer = XLMModel(config)
|
||||||
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
|
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
|
||||||
@@ -973,7 +973,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLMForQuestionAnswering, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.transformer = XLMModel(config)
|
self.transformer = XLMModel(config)
|
||||||
self.qa_outputs = SQuADHead(config)
|
self.qa_outputs = SQuADHead(config)
|
||||||
|
|||||||
@@ -204,7 +204,7 @@ XLNetLayerNorm = nn.LayerNorm
|
|||||||
|
|
||||||
class XLNetRelativeAttention(nn.Module):
|
class XLNetRelativeAttention(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLNetRelativeAttention, self).__init__()
|
super().__init__()
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
|
|
||||||
if config.d_model % config.n_head != 0:
|
if config.d_model % config.n_head != 0:
|
||||||
@@ -414,7 +414,7 @@ class XLNetRelativeAttention(nn.Module):
|
|||||||
|
|
||||||
class XLNetFeedForward(nn.Module):
|
class XLNetFeedForward(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLNetFeedForward, self).__init__()
|
super().__init__()
|
||||||
self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps)
|
self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps)
|
||||||
self.layer_1 = nn.Linear(config.d_model, config.d_inner)
|
self.layer_1 = nn.Linear(config.d_model, config.d_inner)
|
||||||
self.layer_2 = nn.Linear(config.d_inner, config.d_model)
|
self.layer_2 = nn.Linear(config.d_inner, config.d_model)
|
||||||
@@ -437,7 +437,7 @@ class XLNetFeedForward(nn.Module):
|
|||||||
|
|
||||||
class XLNetLayer(nn.Module):
|
class XLNetLayer(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLNetLayer, self).__init__()
|
super().__init__()
|
||||||
self.rel_attn = XLNetRelativeAttention(config)
|
self.rel_attn = XLNetRelativeAttention(config)
|
||||||
self.ff = XLNetFeedForward(config)
|
self.ff = XLNetFeedForward(config)
|
||||||
self.dropout = nn.Dropout(config.dropout)
|
self.dropout = nn.Dropout(config.dropout)
|
||||||
@@ -631,7 +631,7 @@ class XLNetModel(XLNetPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLNetModel, self).__init__(config)
|
super().__init__(config)
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
self.output_past = config.output_past
|
self.output_past = config.output_past
|
||||||
@@ -996,7 +996,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLNetLMHeadModel, self).__init__(config)
|
super().__init__(config)
|
||||||
self.attn_type = config.attn_type
|
self.attn_type = config.attn_type
|
||||||
self.same_length = config.same_length
|
self.same_length = config.same_length
|
||||||
|
|
||||||
@@ -1119,7 +1119,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLNetForSequenceClassification, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.transformer = XLNetModel(config)
|
self.transformer = XLNetModel(config)
|
||||||
@@ -1234,7 +1234,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLNetForTokenClassification, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.transformer = XLNetModel(config)
|
self.transformer = XLNetModel(config)
|
||||||
@@ -1355,7 +1355,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLNetForMultipleChoice, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.transformer = XLNetModel(config)
|
self.transformer = XLNetModel(config)
|
||||||
self.sequence_summary = SequenceSummary(config)
|
self.sequence_summary = SequenceSummary(config)
|
||||||
@@ -1463,7 +1463,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLNetForQuestionAnsweringSimple, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.transformer = XLNetModel(config)
|
self.transformer = XLNetModel(config)
|
||||||
@@ -1595,7 +1595,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLNetForQuestionAnswering, self).__init__(config)
|
super().__init__(config)
|
||||||
self.start_n_top = config.start_n_top
|
self.start_n_top = config.start_n_top
|
||||||
self.end_n_top = config.end_n_top
|
self.end_n_top = config.end_n_top
|
||||||
|
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ class AdamW(Optimizer):
|
|||||||
if not 0.0 <= eps:
|
if not 0.0 <= eps:
|
||||||
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
|
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
|
||||||
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
|
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
|
||||||
super(AdamW, self).__init__(params, defaults)
|
super().__init__(params, defaults)
|
||||||
|
|
||||||
def step(self, closure=None):
|
def step(self, closure=None):
|
||||||
"""Performs a single optimization step.
|
"""Performs a single optimization step.
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
|
|||||||
"""Applys a warmup schedule on a given learning rate decay schedule."""
|
"""Applys a warmup schedule on a given learning rate decay schedule."""
|
||||||
|
|
||||||
def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None):
|
def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None):
|
||||||
super(WarmUp, self).__init__()
|
super().__init__()
|
||||||
self.initial_learning_rate = initial_learning_rate
|
self.initial_learning_rate = initial_learning_rate
|
||||||
self.warmup_steps = warmup_steps
|
self.warmup_steps = warmup_steps
|
||||||
self.power = power
|
self.power = power
|
||||||
@@ -102,7 +102,7 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
|
|||||||
name="AdamWeightDecay",
|
name="AdamWeightDecay",
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(AdamWeightDecay, self).__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
|
super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
|
||||||
self.weight_decay_rate = weight_decay_rate
|
self.weight_decay_rate = weight_decay_rate
|
||||||
self._include_in_weight_decay = include_in_weight_decay
|
self._include_in_weight_decay = include_in_weight_decay
|
||||||
self._exclude_from_weight_decay = exclude_from_weight_decay
|
self._exclude_from_weight_decay = exclude_from_weight_decay
|
||||||
@@ -111,10 +111,10 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
|
|||||||
def from_config(cls, config):
|
def from_config(cls, config):
|
||||||
"""Creates an optimizer from its config with WarmUp custom object."""
|
"""Creates an optimizer from its config with WarmUp custom object."""
|
||||||
custom_objects = {"WarmUp": WarmUp}
|
custom_objects = {"WarmUp": WarmUp}
|
||||||
return super(AdamWeightDecay, cls).from_config(config, custom_objects=custom_objects)
|
return super().from_config(config, custom_objects=custom_objects)
|
||||||
|
|
||||||
def _prepare_local(self, var_device, var_dtype, apply_state):
|
def _prepare_local(self, var_device, var_dtype, apply_state):
|
||||||
super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, apply_state)
|
super()._prepare_local(var_device, var_dtype, apply_state)
|
||||||
apply_state["weight_decay_rate"] = tf.constant(self.weight_decay_rate, name="adam_weight_decay_rate")
|
apply_state["weight_decay_rate"] = tf.constant(self.weight_decay_rate, name="adam_weight_decay_rate")
|
||||||
|
|
||||||
def _decay_weights_op(self, var, learning_rate, apply_state):
|
def _decay_weights_op(self, var, learning_rate, apply_state):
|
||||||
@@ -128,7 +128,7 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
|
|||||||
def apply_gradients(self, grads_and_vars, clip_norm, name=None):
|
def apply_gradients(self, grads_and_vars, clip_norm, name=None):
|
||||||
grads, tvars = list(zip(*grads_and_vars))
|
grads, tvars = list(zip(*grads_and_vars))
|
||||||
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
|
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
|
||||||
return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars))
|
return super().apply_gradients(zip(grads, tvars))
|
||||||
|
|
||||||
def _get_lr(self, var_device, var_dtype, apply_state):
|
def _get_lr(self, var_device, var_dtype, apply_state):
|
||||||
"""Retrieves the learning rate with the given state."""
|
"""Retrieves the learning rate with the given state."""
|
||||||
@@ -147,16 +147,16 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
|
|||||||
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
|
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
|
||||||
decay = self._decay_weights_op(var, lr_t, apply_state)
|
decay = self._decay_weights_op(var, lr_t, apply_state)
|
||||||
with tf.control_dependencies([decay]):
|
with tf.control_dependencies([decay]):
|
||||||
return super(AdamWeightDecay, self)._resource_apply_dense(grad, var, **kwargs)
|
return super()._resource_apply_dense(grad, var, **kwargs)
|
||||||
|
|
||||||
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
|
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
|
||||||
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
|
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
|
||||||
decay = self._decay_weights_op(var, lr_t, apply_state)
|
decay = self._decay_weights_op(var, lr_t, apply_state)
|
||||||
with tf.control_dependencies([decay]):
|
with tf.control_dependencies([decay]):
|
||||||
return super(AdamWeightDecay, self)._resource_apply_sparse(grad, var, indices, **kwargs)
|
return super()._resource_apply_sparse(grad, var, indices, **kwargs)
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
config = super(AdamWeightDecay, self).get_config()
|
config = super().get_config()
|
||||||
config.update({"weight_decay_rate": self.weight_decay_rate})
|
config.update({"weight_decay_rate": self.weight_decay_rate})
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token="[MASK]",
|
mask_token="[MASK]",
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(AlbertTokenizer, self).__init__(
|
super().__init__(
|
||||||
bos_token=bos_token,
|
bos_token=bos_token,
|
||||||
eos_token=eos_token,
|
eos_token=eos_token,
|
||||||
unk_token=unk_token,
|
unk_token=unk_token,
|
||||||
|
|||||||
@@ -163,7 +163,7 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
This should likely be deactivated for Japanese:
|
This should likely be deactivated for Japanese:
|
||||||
see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
|
see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
|
||||||
"""
|
"""
|
||||||
super(BertTokenizer, self).__init__(
|
super().__init__(
|
||||||
unk_token=unk_token,
|
unk_token=unk_token,
|
||||||
sep_token=sep_token,
|
sep_token=sep_token,
|
||||||
pad_token=pad_token,
|
pad_token=pad_token,
|
||||||
@@ -554,7 +554,7 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
add_special_tokens=True,
|
add_special_tokens=True,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(BertTokenizerFast, self).__init__(
|
super().__init__(
|
||||||
unk_token=unk_token,
|
unk_token=unk_token,
|
||||||
sep_token=sep_token,
|
sep_token=sep_token,
|
||||||
pad_token=pad_token,
|
pad_token=pad_token,
|
||||||
|
|||||||
@@ -107,7 +107,7 @@ class BertJapaneseTokenizer(BertTokenizer):
|
|||||||
**subword_tokenizer_type**: (`optional`) string (default "wordpiece")
|
**subword_tokenizer_type**: (`optional`) string (default "wordpiece")
|
||||||
Type of subword tokenizer.
|
Type of subword tokenizer.
|
||||||
"""
|
"""
|
||||||
super(BertTokenizer, self).__init__(
|
super().__init__(
|
||||||
unk_token=unk_token,
|
unk_token=unk_token,
|
||||||
sep_token=sep_token,
|
sep_token=sep_token,
|
||||||
pad_token=pad_token,
|
pad_token=pad_token,
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
|||||||
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
|
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(CamembertTokenizer, self).__init__(
|
super().__init__(
|
||||||
max_len=512,
|
max_len=512,
|
||||||
bos_token=bos_token,
|
bos_token=bos_token,
|
||||||
eos_token=eos_token,
|
eos_token=eos_token,
|
||||||
|
|||||||
@@ -126,7 +126,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
|
|||||||
control_codes = CONTROL_CODES
|
control_codes = CONTROL_CODES
|
||||||
|
|
||||||
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
|
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
|
||||||
super(CTRLTokenizer, self).__init__(unk_token=unk_token, **kwargs)
|
super().__init__(unk_token=unk_token, **kwargs)
|
||||||
self.max_len_single_sentence = (
|
self.max_len_single_sentence = (
|
||||||
self.max_len
|
self.max_len
|
||||||
) # no default special tokens - you can update this value if you add special tokens
|
) # no default special tokens - you can update this value if you add special tokens
|
||||||
|
|||||||
@@ -122,7 +122,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
|||||||
eos_token="<|endoftext|>",
|
eos_token="<|endoftext|>",
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
|
super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
|
||||||
self.max_len_single_sentence = (
|
self.max_len_single_sentence = (
|
||||||
self.max_len
|
self.max_len
|
||||||
) # no default special tokens - you can update this value if you add special tokens
|
) # no default special tokens - you can update this value if you add special tokens
|
||||||
@@ -268,7 +268,7 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
|
|||||||
truncation_strategy="longest_first",
|
truncation_strategy="longest_first",
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(GPT2TokenizerFast, self).__init__(
|
super().__init__(
|
||||||
bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
|
bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -82,7 +82,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
|
|||||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||||
|
|
||||||
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
|
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
|
||||||
super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)
|
super().__init__(unk_token=unk_token, **kwargs)
|
||||||
|
|
||||||
self.max_len_single_sentence = (
|
self.max_len_single_sentence = (
|
||||||
self.max_len
|
self.max_len
|
||||||
|
|||||||
@@ -84,7 +84,7 @@ class RobertaTokenizer(GPT2Tokenizer):
|
|||||||
mask_token="<mask>",
|
mask_token="<mask>",
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(RobertaTokenizer, self).__init__(
|
super().__init__(
|
||||||
vocab_file=vocab_file,
|
vocab_file=vocab_file,
|
||||||
merges_file=merges_file,
|
merges_file=merges_file,
|
||||||
errors=errors,
|
errors=errors,
|
||||||
|
|||||||
@@ -91,7 +91,7 @@ class T5Tokenizer(PreTrainedTokenizer):
|
|||||||
additional_special_tokens = []
|
additional_special_tokens = []
|
||||||
additional_special_tokens.extend(["<extra_id_{}>".format(i) for i in range(extra_ids)])
|
additional_special_tokens.extend(["<extra_id_{}>".format(i) for i in range(extra_ids)])
|
||||||
|
|
||||||
super(T5Tokenizer, self).__init__(
|
super().__init__(
|
||||||
eos_token=eos_token,
|
eos_token=eos_token,
|
||||||
unk_token=unk_token,
|
unk_token=unk_token,
|
||||||
pad_token=pad_token,
|
pad_token=pad_token,
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
|||||||
additional_special_tokens=["<formula>"],
|
additional_special_tokens=["<formula>"],
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(TransfoXLTokenizer, self).__init__(
|
super().__init__(
|
||||||
unk_token=unk_token, eos_token=eos_token, additional_special_tokens=additional_special_tokens, **kwargs
|
unk_token=unk_token, eos_token=eos_token, additional_special_tokens=additional_special_tokens, **kwargs
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -1425,7 +1425,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
|
|||||||
_decoder = None
|
_decoder = None
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
super(PreTrainedTokenizerFast, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tokenizer(self):
|
def tokenizer(self):
|
||||||
|
|||||||
@@ -578,7 +578,7 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
do_lowercase_and_remove_accent=True,
|
do_lowercase_and_remove_accent=True,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(XLMTokenizer, self).__init__(
|
super().__init__(
|
||||||
unk_token=unk_token,
|
unk_token=unk_token,
|
||||||
bos_token=bos_token,
|
bos_token=bos_token,
|
||||||
sep_token=sep_token,
|
sep_token=sep_token,
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token="<mask>",
|
mask_token="<mask>",
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(XLMRobertaTokenizer, self).__init__(
|
super().__init__(
|
||||||
bos_token=bos_token,
|
bos_token=bos_token,
|
||||||
eos_token=eos_token,
|
eos_token=eos_token,
|
||||||
unk_token=unk_token,
|
unk_token=unk_token,
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
additional_special_tokens=["<eop>", "<eod>"],
|
additional_special_tokens=["<eop>", "<eod>"],
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(XLNetTokenizer, self).__init__(
|
super().__init__(
|
||||||
bos_token=bos_token,
|
bos_token=bos_token,
|
||||||
eos_token=eos_token,
|
eos_token=eos_token,
|
||||||
unk_token=unk_token,
|
unk_token=unk_token,
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ class XxxConfig(PretrainedConfig):
|
|||||||
summary_first_dropout=0.1,
|
summary_first_dropout=0.1,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(XxxConfig, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.n_ctx = n_ctx
|
self.n_ctx = n_ctx
|
||||||
self.n_positions = n_positions
|
self.n_positions = n_positions
|
||||||
|
|||||||
@@ -69,7 +69,7 @@ TFXxxOutput = tf.keras.layers.Layer
|
|||||||
|
|
||||||
class TFXxxLayer(tf.keras.layers.Layer):
|
class TFXxxLayer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFXxxLayer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.attention = TFXxxAttention(config, name="attention")
|
self.attention = TFXxxAttention(config, name="attention")
|
||||||
self.intermediate = TFXxxIntermediate(config, name="intermediate")
|
self.intermediate = TFXxxIntermediate(config, name="intermediate")
|
||||||
self.transformer_output = TFXxxOutput(config, name="output")
|
self.transformer_output = TFXxxOutput(config, name="output")
|
||||||
@@ -91,7 +91,7 @@ class TFXxxLayer(tf.keras.layers.Layer):
|
|||||||
####################################################
|
####################################################
|
||||||
class TFXxxMainLayer(tf.keras.layers.Layer):
|
class TFXxxMainLayer(tf.keras.layers.Layer):
|
||||||
def __init__(self, config, **kwargs):
|
def __init__(self, config, **kwargs):
|
||||||
super(TFXxxMainLayer, self).__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
def _resize_token_embeddings(self, new_num_tokens):
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models
|
raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models
|
||||||
@@ -307,7 +307,7 @@ class TFXxxModel(TFXxxPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFXxxModel, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.transformer = TFXxxMainLayer(config, name="transformer")
|
self.transformer = TFXxxMainLayer(config, name="transformer")
|
||||||
|
|
||||||
def call(self, inputs, **kwargs):
|
def call(self, inputs, **kwargs):
|
||||||
@@ -348,7 +348,7 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFXxxForMaskedLM, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
|
|
||||||
self.transformer = TFXxxMainLayer(config, name="transformer")
|
self.transformer = TFXxxMainLayer(config, name="transformer")
|
||||||
self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name="mlm")
|
self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name="mlm")
|
||||||
@@ -397,7 +397,7 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFXxxForSequenceClassification, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.transformer = TFXxxMainLayer(config, name="transformer")
|
self.transformer = TFXxxMainLayer(config, name="transformer")
|
||||||
@@ -452,7 +452,7 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFXxxForTokenClassification, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.transformer = TFXxxMainLayer(config, name="transformer")
|
self.transformer = TFXxxMainLayer(config, name="transformer")
|
||||||
@@ -509,7 +509,7 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFXxxForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
|
super().__init__(config, *inputs, **kwargs)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.transformer = TFXxxMainLayer(config, name="transformer")
|
self.transformer = TFXxxMainLayer(config, name="transformer")
|
||||||
|
|||||||
@@ -138,7 +138,7 @@ XxxOutput = nn.Module
|
|||||||
|
|
||||||
class XxxLayer(nn.Module):
|
class XxxLayer(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XxxLayer, self).__init__()
|
super().__init__()
|
||||||
self.attention = XxxAttention(config)
|
self.attention = XxxAttention(config)
|
||||||
self.intermediate = XxxIntermediate(config)
|
self.intermediate = XxxIntermediate(config)
|
||||||
self.output = XxxOutput(config)
|
self.output = XxxOutput(config)
|
||||||
@@ -298,7 +298,7 @@ class XxxModel(XxxPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XxxModel, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.embeddings = XxxEmbeddings(config)
|
self.embeddings = XxxEmbeddings(config)
|
||||||
self.encoder = XxxEncoder(config)
|
self.encoder = XxxEncoder(config)
|
||||||
@@ -426,7 +426,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XxxForMaskedLM, self).__init__(config)
|
super().__init__(config)
|
||||||
|
|
||||||
self.transformer = XxxModel(config)
|
self.transformer = XxxModel(config)
|
||||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
|
||||||
@@ -507,7 +507,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XxxForSequenceClassification, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.transformer = XxxModel(config)
|
self.transformer = XxxModel(config)
|
||||||
@@ -593,7 +593,7 @@ class XxxForTokenClassification(XxxPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XxxForTokenClassification, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.transformer = XxxModel(config)
|
self.transformer = XxxModel(config)
|
||||||
@@ -692,7 +692,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XxxForQuestionAnswering, self).__init__(config)
|
super().__init__(config)
|
||||||
self.num_labels = config.num_labels
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
self.transformer = XxxModel(config)
|
self.transformer = XxxModel(config)
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ class XxxTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer_class = XxxTokenizer
|
tokenizer_class = XxxTokenizer
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(XxxTokenizationTest, self).setUp()
|
super().setUp()
|
||||||
|
|
||||||
vocab_tokens = [
|
vocab_tokens = [
|
||||||
"[UNK]",
|
"[UNK]",
|
||||||
|
|||||||
@@ -109,7 +109,7 @@ class XxxTokenizer(PreTrainedTokenizer):
|
|||||||
Whether to lower case the input
|
Whether to lower case the input
|
||||||
Only has an effect when do_basic_tokenize=True
|
Only has an effect when do_basic_tokenize=True
|
||||||
"""
|
"""
|
||||||
super(XxxTokenizer, self).__init__(
|
super().__init__(
|
||||||
unk_token=unk_token,
|
unk_token=unk_token,
|
||||||
sep_token=sep_token,
|
sep_token=sep_token,
|
||||||
pad_token=pad_token,
|
pad_token=pad_token,
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer_class = AlbertTokenizer
|
tokenizer_class = AlbertTokenizer
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(AlbertTokenizationTest, self).setUp()
|
super().setUp()
|
||||||
|
|
||||||
# We have a SentencePiece fixture for testing
|
# We have a SentencePiece fixture for testing
|
||||||
tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
|
tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
test_rust_tokenizer = True
|
test_rust_tokenizer = True
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(BertTokenizationTest, self).setUp()
|
super().setUp()
|
||||||
|
|
||||||
vocab_tokens = [
|
vocab_tokens = [
|
||||||
"[UNK]",
|
"[UNK]",
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer_class = BertJapaneseTokenizer
|
tokenizer_class = BertJapaneseTokenizer
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(BertJapaneseTokenizationTest, self).setUp()
|
super().setUp()
|
||||||
|
|
||||||
vocab_tokens = [
|
vocab_tokens = [
|
||||||
"[UNK]",
|
"[UNK]",
|
||||||
@@ -135,7 +135,7 @@ class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestC
|
|||||||
tokenizer_class = BertJapaneseTokenizer
|
tokenizer_class = BertJapaneseTokenizer
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(BertJapaneseCharacterTokenizationTest, self).setUp()
|
super().setUp()
|
||||||
|
|
||||||
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"]
|
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"]
|
||||||
|
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer_class = CTRLTokenizer
|
tokenizer_class = CTRLTokenizer
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(CTRLTokenizationTest, self).setUp()
|
super().setUp()
|
||||||
|
|
||||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||||
vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", "<unk>"]
|
vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", "<unk>"]
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
test_rust_tokenizer = True
|
test_rust_tokenizer = True
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(GPT2TokenizationTest, self).setUp()
|
super().setUp()
|
||||||
|
|
||||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||||
vocab = [
|
vocab = [
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer_class = OpenAIGPTTokenizer
|
tokenizer_class = OpenAIGPTTokenizer
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(OpenAIGPTTokenizationTest, self).setUp()
|
super().setUp()
|
||||||
|
|
||||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||||
vocab = [
|
vocab = [
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer_class = RobertaTokenizer
|
tokenizer_class = RobertaTokenizer
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(RobertaTokenizationTest, self).setUp()
|
super().setUp()
|
||||||
|
|
||||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||||
vocab = [
|
vocab = [
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer_class = T5Tokenizer
|
tokenizer_class = T5Tokenizer
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(T5TokenizationTest, self).setUp()
|
super().setUp()
|
||||||
|
|
||||||
# We have a SentencePiece fixture for testing
|
# We have a SentencePiece fixture for testing
|
||||||
tokenizer = T5Tokenizer(SAMPLE_VOCAB)
|
tokenizer = T5Tokenizer(SAMPLE_VOCAB)
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer_class = TransfoXLTokenizer if is_torch_available() else None
|
tokenizer_class = TransfoXLTokenizer if is_torch_available() else None
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(TransfoXLTokenizationTest, self).setUp()
|
super().setUp()
|
||||||
|
|
||||||
vocab_tokens = [
|
vocab_tokens = [
|
||||||
"<unk>",
|
"<unk>",
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer_class = XLMTokenizer
|
tokenizer_class = XLMTokenizer
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(XLMTokenizationTest, self).setUp()
|
super().setUp()
|
||||||
|
|
||||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||||
vocab = [
|
vocab = [
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer_class = XLNetTokenizer
|
tokenizer_class = XLNetTokenizer
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(XLNetTokenizationTest, self).setUp()
|
super().setUp()
|
||||||
|
|
||||||
# We have a SentencePiece fixture for testing
|
# We have a SentencePiece fixture for testing
|
||||||
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||||
|
|||||||
Reference in New Issue
Block a user