cleaning up configuration classes
This commit is contained in:
@@ -65,7 +65,7 @@ class BertAbsConfig(PretrainedConfig):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size_or_config_json_file=30522,
|
||||
vocab_size=30522,
|
||||
max_pos=512,
|
||||
enc_layers=6,
|
||||
enc_hidden_size=512,
|
||||
@@ -81,14 +81,14 @@ class BertAbsConfig(PretrainedConfig):
|
||||
):
|
||||
super(BertAbsConfig, self).__init__(**kwargs)
|
||||
|
||||
if self._input_is_path_to_json(vocab_size_or_config_json_file):
|
||||
path_to_json = vocab_size_or_config_json_file
|
||||
if self._input_is_path_to_json(vocab_size):
|
||||
path_to_json = vocab_size
|
||||
with open(path_to_json, "r", encoding="utf-8") as reader:
|
||||
json_config = json.loads(reader.read())
|
||||
for key, value in json_config.items():
|
||||
self.__dict__[key] = value
|
||||
elif isinstance(vocab_size_or_config_json_file, int):
|
||||
self.vocab_size = vocab_size_or_config_json_file
|
||||
elif isinstance(vocab_size, int):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_pos = max_pos
|
||||
|
||||
self.enc_layers = enc_layers
|
||||
|
||||
@@ -39,7 +39,7 @@ class XxxConfig(PretrainedConfig):
|
||||
|
||||
|
||||
Arguments:
|
||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XxxModel`.
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `XxxModel`.
|
||||
hidden_size: Size of the encoder layers and the pooler layer.
|
||||
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads: Number of attention heads for each attention layer in
|
||||
@@ -64,7 +64,7 @@ class XxxConfig(PretrainedConfig):
|
||||
pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
||||
def __init__(self,
|
||||
vocab_size_or_config_json_file=50257,
|
||||
vocab_size=50257,
|
||||
n_positions=1024,
|
||||
n_ctx=1024,
|
||||
n_embd=768,
|
||||
@@ -84,7 +84,7 @@ class XxxConfig(PretrainedConfig):
|
||||
summary_first_dropout=0.1,
|
||||
**kwargs):
|
||||
super(XxxConfig, self).__init__(**kwargs)
|
||||
self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, six.string_types) else -1
|
||||
self.vocab_size = vocab_size if isinstance(vocab_size, six.string_types) else -1
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
@@ -102,12 +102,12 @@ class XxxConfig(PretrainedConfig):
|
||||
self.summary_activation = summary_activation
|
||||
self.summary_first_dropout = summary_first_dropout
|
||||
self.summary_proj_to_labels = summary_proj_to_labels
|
||||
if isinstance(vocab_size_or_config_json_file, six.string_types):
|
||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
||||
if isinstance(vocab_size, six.string_types):
|
||||
with open(vocab_size, "r", encoding="utf-8") as reader:
|
||||
json_config = json.loads(reader.read())
|
||||
for key, value in json_config.items():
|
||||
self.__dict__[key] = value
|
||||
elif not isinstance(vocab_size_or_config_json_file, int):
|
||||
elif not isinstance(vocab_size, int):
|
||||
raise ValueError(
|
||||
"First argument must be either a vocabulary size (int)"
|
||||
"or the path to a pretrained model config file (str)"
|
||||
|
||||
@@ -111,7 +111,7 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = XxxConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
hidden_size=self.hidden_size,
|
||||
num_hidden_layers=self.num_hidden_layers,
|
||||
num_attention_heads=self.num_attention_heads,
|
||||
|
||||
@@ -109,7 +109,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = XxxConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
hidden_size=self.hidden_size,
|
||||
num_hidden_layers=self.num_hidden_layers,
|
||||
num_attention_heads=self.num_attention_heads,
|
||||
|
||||
@@ -37,7 +37,7 @@ class AlbertConfig(PretrainedConfig):
|
||||
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
||||
def __init__(self,
|
||||
vocab_size_or_config_json_file=30000,
|
||||
vocab_size=30000,
|
||||
embedding_size=128,
|
||||
hidden_size=4096,
|
||||
num_hidden_layers=12,
|
||||
@@ -83,7 +83,7 @@ class AlbertConfig(PretrainedConfig):
|
||||
"""
|
||||
super(AlbertConfig, self).__init__(**kwargs)
|
||||
|
||||
self.vocab_size = vocab_size_or_config_json_file
|
||||
self.vocab_size = vocab_size
|
||||
self.embedding_size = embedding_size
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
|
||||
@@ -56,7 +56,7 @@ class BertConfig(PretrainedConfig):
|
||||
|
||||
|
||||
Arguments:
|
||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
|
||||
hidden_size: Size of the encoder layers and the pooler layer.
|
||||
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads: Number of attention heads for each attention layer in
|
||||
@@ -81,7 +81,7 @@ class BertConfig(PretrainedConfig):
|
||||
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
||||
def __init__(self,
|
||||
vocab_size_or_config_json_file=30522,
|
||||
vocab_size=30522,
|
||||
hidden_size=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
@@ -95,25 +95,15 @@ class BertConfig(PretrainedConfig):
|
||||
layer_norm_eps=1e-12,
|
||||
**kwargs):
|
||||
super(BertConfig, self).__init__(**kwargs)
|
||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
||||
json_config = json.loads(reader.read())
|
||||
for key, value in json_config.items():
|
||||
self.__dict__[key] = value
|
||||
elif isinstance(vocab_size_or_config_json_file, int):
|
||||
self.vocab_size = vocab_size_or_config_json_file
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.intermediate_size = intermediate_size
|
||||
self.hidden_dropout_prob = hidden_dropout_prob
|
||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.type_vocab_size = type_vocab_size
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
else:
|
||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||
" or the path to a pretrained model config file (str)")
|
||||
self.vocab_size = vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.intermediate_size = intermediate_size
|
||||
self.hidden_dropout_prob = hidden_dropout_prob
|
||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.type_vocab_size = type_vocab_size
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
|
||||
@@ -31,7 +31,7 @@ class CTRLConfig(PretrainedConfig):
|
||||
"""Configuration class to store the configuration of a `CTRLModel`.
|
||||
|
||||
Args:
|
||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
||||
n_positions: Number of positional embeddings.
|
||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||
dff: Size of the inner dimension of the FFN.
|
||||
@@ -52,7 +52,7 @@ class CTRLConfig(PretrainedConfig):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size_or_config_json_file=246534,
|
||||
vocab_size=246534,
|
||||
n_positions=256,
|
||||
n_ctx=256,
|
||||
n_embd=1280,
|
||||
@@ -64,8 +64,6 @@ class CTRLConfig(PretrainedConfig):
|
||||
attn_pdrop=0.1,
|
||||
layer_norm_epsilon=1e-6,
|
||||
initializer_range=0.02,
|
||||
|
||||
num_labels=1,
|
||||
summary_type='cls_index',
|
||||
summary_use_proj=True,
|
||||
summary_activation=None,
|
||||
@@ -76,7 +74,7 @@ class CTRLConfig(PretrainedConfig):
|
||||
"""Constructs CTRLConfig.
|
||||
|
||||
Args:
|
||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
||||
n_positions: Number of positional embeddings.
|
||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||
dff: Size of the inner dimension of the FFN.
|
||||
@@ -94,8 +92,7 @@ class CTRLConfig(PretrainedConfig):
|
||||
initializing all weight matrices.
|
||||
"""
|
||||
super(CTRLConfig, self).__init__(**kwargs)
|
||||
|
||||
self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
|
||||
self.vocab_size = vocab_size
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
@@ -108,23 +105,11 @@ class CTRLConfig(PretrainedConfig):
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
|
||||
self.num_labels = num_labels
|
||||
self.summary_type = summary_type
|
||||
self.summary_use_proj = summary_use_proj
|
||||
self.summary_activation = summary_activation
|
||||
self.summary_first_dropout = summary_first_dropout
|
||||
self.summary_proj_to_labels = summary_proj_to_labels
|
||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
||||
json_config = json.loads(reader.read())
|
||||
for key, value in json_config.items():
|
||||
self.__dict__[key] = value
|
||||
elif not isinstance(vocab_size_or_config_json_file, int):
|
||||
raise ValueError(
|
||||
"First argument must be either a vocabulary size (int)"
|
||||
"or the path to a pretrained model config file (str)"
|
||||
)
|
||||
|
||||
@property
|
||||
def max_position_embeddings(self):
|
||||
|
||||
@@ -37,7 +37,7 @@ class DistilBertConfig(PretrainedConfig):
|
||||
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
||||
def __init__(self,
|
||||
vocab_size_or_config_json_file=30522,
|
||||
vocab_size=30522,
|
||||
max_position_embeddings=512,
|
||||
sinusoidal_pos_embds=False,
|
||||
n_layers=6,
|
||||
@@ -53,31 +53,21 @@ class DistilBertConfig(PretrainedConfig):
|
||||
seq_classif_dropout=0.2,
|
||||
**kwargs):
|
||||
super(DistilBertConfig, self).__init__(**kwargs)
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.sinusoidal_pos_embds = sinusoidal_pos_embds
|
||||
self.n_layers = n_layers
|
||||
self.n_heads = n_heads
|
||||
self.dim = dim
|
||||
self.hidden_dim = hidden_dim
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.activation = activation
|
||||
self.initializer_range = initializer_range
|
||||
self.tie_weights_ = tie_weights_
|
||||
self.qa_dropout = qa_dropout
|
||||
self.seq_classif_dropout = seq_classif_dropout
|
||||
|
||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
||||
json_config = json.loads(reader.read())
|
||||
for key, value in json_config.items():
|
||||
self.__dict__[key] = value
|
||||
elif isinstance(vocab_size_or_config_json_file, int):
|
||||
self.vocab_size = vocab_size_or_config_json_file
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.sinusoidal_pos_embds = sinusoidal_pos_embds
|
||||
self.n_layers = n_layers
|
||||
self.n_heads = n_heads
|
||||
self.dim = dim
|
||||
self.hidden_dim = hidden_dim
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.activation = activation
|
||||
self.initializer_range = initializer_range
|
||||
self.tie_weights_ = tie_weights_
|
||||
self.qa_dropout = qa_dropout
|
||||
self.seq_classif_dropout = seq_classif_dropout
|
||||
else:
|
||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||
" or the path to a pretrained model config file (str)")
|
||||
@property
|
||||
def hidden_size(self):
|
||||
return self.dim
|
||||
|
||||
@@ -36,7 +36,7 @@ class GPT2Config(PretrainedConfig):
|
||||
"""Configuration class to store the configuration of a `GPT2Model`.
|
||||
|
||||
Args:
|
||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
||||
n_positions: Number of positional embeddings.
|
||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||
n_embd: Dimensionality of the embeddings and hidden states.
|
||||
@@ -56,7 +56,7 @@ class GPT2Config(PretrainedConfig):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size_or_config_json_file=50257,
|
||||
vocab_size=50257,
|
||||
n_positions=1024,
|
||||
n_ctx=1024,
|
||||
n_embd=768,
|
||||
@@ -67,8 +67,6 @@ class GPT2Config(PretrainedConfig):
|
||||
attn_pdrop=0.1,
|
||||
layer_norm_epsilon=1e-5,
|
||||
initializer_range=0.02,
|
||||
|
||||
num_labels=1,
|
||||
summary_type='cls_index',
|
||||
summary_use_proj=True,
|
||||
summary_activation=None,
|
||||
@@ -79,7 +77,7 @@ class GPT2Config(PretrainedConfig):
|
||||
"""Constructs GPT2Config.
|
||||
|
||||
Args:
|
||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
||||
n_positions: Number of positional embeddings.
|
||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||
n_embd: Dimensionality of the embeddings and hidden states.
|
||||
@@ -96,37 +94,22 @@ class GPT2Config(PretrainedConfig):
|
||||
initializing all weight matrices.
|
||||
"""
|
||||
super(GPT2Config, self).__init__(**kwargs)
|
||||
|
||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
||||
json_config = json.loads(reader.read())
|
||||
for key, value in json_config.items():
|
||||
self.__dict__[key] = value
|
||||
elif isinstance(vocab_size_or_config_json_file, int):
|
||||
self.vocab_size = vocab_size_or_config_json_file
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.embd_pdrop = embd_pdrop
|
||||
self.attn_pdrop = attn_pdrop
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
|
||||
self.num_labels = num_labels
|
||||
self.summary_type = summary_type
|
||||
self.summary_use_proj = summary_use_proj
|
||||
self.summary_activation = summary_activation
|
||||
self.summary_first_dropout = summary_first_dropout
|
||||
self.summary_proj_to_labels = summary_proj_to_labels
|
||||
else:
|
||||
raise ValueError(
|
||||
"First argument must be either a vocabulary size (int)"
|
||||
"or the path to a pretrained model config file (str)"
|
||||
)
|
||||
self.vocab_size = vocab_size
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.embd_pdrop = embd_pdrop
|
||||
self.attn_pdrop = attn_pdrop
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
self.summary_type = summary_type
|
||||
self.summary_use_proj = summary_use_proj
|
||||
self.summary_activation = summary_activation
|
||||
self.summary_first_dropout = summary_first_dropout
|
||||
self.summary_proj_to_labels = summary_proj_to_labels
|
||||
|
||||
@property
|
||||
def max_position_embeddings(self):
|
||||
|
||||
@@ -35,7 +35,7 @@ class OpenAIGPTConfig(PretrainedConfig):
|
||||
Configuration class to store the configuration of a `OpenAIGPTModel`.
|
||||
|
||||
Args:
|
||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
|
||||
n_positions: Number of positional embeddings.
|
||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||
n_embd: Dimensionality of the embeddings and hidden states.
|
||||
@@ -58,7 +58,7 @@ class OpenAIGPTConfig(PretrainedConfig):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size_or_config_json_file=40478,
|
||||
vocab_size=40478,
|
||||
n_positions=512,
|
||||
n_ctx=512,
|
||||
n_embd=768,
|
||||
@@ -71,8 +71,6 @@ class OpenAIGPTConfig(PretrainedConfig):
|
||||
layer_norm_epsilon=1e-5,
|
||||
initializer_range=0.02,
|
||||
predict_special_tokens=True,
|
||||
|
||||
num_labels=1,
|
||||
summary_type='cls_index',
|
||||
summary_use_proj=True,
|
||||
summary_activation=None,
|
||||
@@ -83,39 +81,24 @@ class OpenAIGPTConfig(PretrainedConfig):
|
||||
"""Constructs OpenAIGPTConfig.
|
||||
"""
|
||||
super(OpenAIGPTConfig, self).__init__(**kwargs)
|
||||
|
||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
||||
json_config = json.loads(reader.read())
|
||||
for key, value in json_config.items():
|
||||
self.__dict__[key] = value
|
||||
elif isinstance(vocab_size_or_config_json_file, int):
|
||||
self.vocab_size = vocab_size_or_config_json_file
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
self.afn = afn
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.embd_pdrop = embd_pdrop
|
||||
self.attn_pdrop = attn_pdrop
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
self.predict_special_tokens = predict_special_tokens
|
||||
|
||||
self.num_labels = num_labels
|
||||
self.summary_type = summary_type
|
||||
self.summary_use_proj = summary_use_proj
|
||||
self.summary_activation = summary_activation
|
||||
self.summary_first_dropout = summary_first_dropout
|
||||
self.summary_proj_to_labels = summary_proj_to_labels
|
||||
else:
|
||||
raise ValueError(
|
||||
"First argument must be either a vocabulary size (int)"
|
||||
"or the path to a pretrained model config file (str)"
|
||||
)
|
||||
self.vocab_size = vocab_size
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
self.afn = afn
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.embd_pdrop = embd_pdrop
|
||||
self.attn_pdrop = attn_pdrop
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
self.predict_special_tokens = predict_special_tokens
|
||||
self.summary_type = summary_type
|
||||
self.summary_use_proj = summary_use_proj
|
||||
self.summary_activation = summary_activation
|
||||
self.summary_first_dropout = summary_first_dropout
|
||||
self.summary_proj_to_labels = summary_proj_to_labels
|
||||
|
||||
@property
|
||||
def max_position_embeddings(self):
|
||||
|
||||
@@ -34,7 +34,7 @@ class TransfoXLConfig(PretrainedConfig):
|
||||
"""Configuration class to store the configuration of a `TransfoXLModel`.
|
||||
|
||||
Args:
|
||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
|
||||
cutoffs: cutoffs for the adaptive softmax
|
||||
d_model: Dimensionality of the model's hidden states.
|
||||
d_embed: Dimensionality of the embeddings
|
||||
@@ -68,7 +68,7 @@ class TransfoXLConfig(PretrainedConfig):
|
||||
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
||||
def __init__(self,
|
||||
vocab_size_or_config_json_file=267735,
|
||||
vocab_size=267735,
|
||||
cutoffs=[20000, 40000, 200000],
|
||||
d_model=1024,
|
||||
d_embed=1024,
|
||||
@@ -100,7 +100,7 @@ class TransfoXLConfig(PretrainedConfig):
|
||||
"""Constructs TransfoXLConfig.
|
||||
"""
|
||||
super(TransfoXLConfig, self).__init__(**kwargs)
|
||||
self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
|
||||
self.vocab_size = vocab_size
|
||||
self.cutoffs = []
|
||||
self.cutoffs.extend(cutoffs)
|
||||
self.tie_weight = tie_weight
|
||||
@@ -133,27 +133,17 @@ class TransfoXLConfig(PretrainedConfig):
|
||||
self.init_std = init_std
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
|
||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
||||
json_config = json.loads(reader.read())
|
||||
for key, value in json_config.items():
|
||||
self.__dict__[key] = value
|
||||
elif not isinstance(vocab_size_or_config_json_file, int):
|
||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||
" or the path to a pretrained model config file (str)")
|
||||
|
||||
@property
|
||||
def max_position_embeddings(self):
|
||||
return self.tgt_len + self.ext_len + self.mem_len
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return self.n_token
|
||||
def n_token(self): # Backward compatibility
|
||||
return self.vocab_size
|
||||
|
||||
@vocab_size.setter
|
||||
def vocab_size(self, value):
|
||||
self.n_token = value
|
||||
@n_token.setter
|
||||
def n_token(self, value): # Backward compatibility
|
||||
self.vocab_size = value
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
|
||||
@@ -49,8 +49,7 @@ class PretrainedConfig(object):
|
||||
pretrained_config_archive_map = {}
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.finetuning_task = kwargs.pop('finetuning_task', None)
|
||||
self.num_labels = kwargs.pop('num_labels', 2)
|
||||
# Attributes with defaults
|
||||
self.output_attentions = kwargs.pop('output_attentions', False)
|
||||
self.output_hidden_states = kwargs.pop('output_hidden_states', False)
|
||||
self.output_past = kwargs.pop('output_past', True) # Not used by all models
|
||||
@@ -59,6 +58,22 @@ class PretrainedConfig(object):
|
||||
self.pruned_heads = kwargs.pop('pruned_heads', {})
|
||||
self.is_decoder = kwargs.pop('is_decoder', False)
|
||||
|
||||
# Fine-tuning task arguments
|
||||
self.finetuning_task = kwargs.pop('finetuning_task', None)
|
||||
self.num_labels = kwargs.pop('num_labels', 2)
|
||||
self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)})
|
||||
self.id2label = dict((int(key), value) for key, value in self.id2label.items())
|
||||
self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys())))
|
||||
self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
|
||||
|
||||
# Additional attributes without default values
|
||||
for key, value in kwargs.items():
|
||||
try:
|
||||
setattr(self, key, value)
|
||||
except AttributeError as err:
|
||||
logger.error("Can't set {} with value {} for {}".format(key, value, self))
|
||||
raise err
|
||||
|
||||
def save_pretrained(self, save_directory):
|
||||
""" Save a configuration object to the directory `save_directory`, so that it
|
||||
can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
|
||||
@@ -183,17 +198,15 @@ class PretrainedConfig(object):
|
||||
@classmethod
|
||||
def from_dict(cls, json_object):
|
||||
"""Constructs a `Config` from a Python dictionary of parameters."""
|
||||
config = cls(vocab_size_or_config_json_file=-1)
|
||||
for key, value in json_object.items():
|
||||
setattr(config, key, value)
|
||||
return config
|
||||
return cls(**json_object)
|
||||
|
||||
@classmethod
|
||||
def from_json_file(cls, json_file):
|
||||
"""Constructs a `Config` from a json file of parameters."""
|
||||
with open(json_file, "r", encoding='utf-8') as reader:
|
||||
text = reader.read()
|
||||
return cls.from_dict(json.loads(text))
|
||||
dict_obj = json.loads(text)
|
||||
return cls(**dict_obj)
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.__dict__ == other.__dict__
|
||||
|
||||
@@ -42,7 +42,7 @@ class XLMConfig(PretrainedConfig):
|
||||
"""Configuration class to store the configuration of a `XLMModel`.
|
||||
|
||||
Args:
|
||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`.
|
||||
d_model: Size of the encoder layers and the pooler layer.
|
||||
n_layer: Number of hidden layers in the Transformer encoder.
|
||||
n_head: Number of attention heads for each attention layer in
|
||||
@@ -81,7 +81,7 @@ class XLMConfig(PretrainedConfig):
|
||||
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
||||
def __init__(self,
|
||||
vocab_size_or_config_json_file=30145,
|
||||
vocab_size=30145,
|
||||
emb_dim=2048,
|
||||
n_layers=12,
|
||||
n_heads=16,
|
||||
@@ -103,9 +103,6 @@ class XLMConfig(PretrainedConfig):
|
||||
unk_index=3,
|
||||
mask_index=5,
|
||||
is_encoder=True,
|
||||
|
||||
finetuning_task=None,
|
||||
num_labels=2,
|
||||
summary_type='first',
|
||||
summary_use_proj=True,
|
||||
summary_activation=None,
|
||||
@@ -117,56 +114,43 @@ class XLMConfig(PretrainedConfig):
|
||||
"""Constructs XLMConfig.
|
||||
"""
|
||||
super(XLMConfig, self).__init__(**kwargs)
|
||||
|
||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
||||
json_config = json.loads(reader.read())
|
||||
for key, value in json_config.items():
|
||||
self.__dict__[key] = value
|
||||
elif isinstance(vocab_size_or_config_json_file, int):
|
||||
self.n_words = vocab_size_or_config_json_file
|
||||
self.emb_dim = emb_dim
|
||||
self.n_layers = n_layers
|
||||
self.n_heads = n_heads
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.gelu_activation = gelu_activation
|
||||
self.sinusoidal_embeddings = sinusoidal_embeddings
|
||||
self.causal = causal
|
||||
self.asm = asm
|
||||
self.n_langs = n_langs
|
||||
self.use_lang_emb = use_lang_emb
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.bos_index = bos_index
|
||||
self.eos_index = eos_index
|
||||
self.pad_index = pad_index
|
||||
self.unk_index = unk_index
|
||||
self.mask_index = mask_index
|
||||
self.is_encoder = is_encoder
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.embed_init_std = embed_init_std
|
||||
self.init_std = init_std
|
||||
self.finetuning_task = finetuning_task
|
||||
self.num_labels = num_labels
|
||||
self.summary_type = summary_type
|
||||
self.summary_use_proj = summary_use_proj
|
||||
self.summary_activation = summary_activation
|
||||
self.summary_proj_to_labels = summary_proj_to_labels
|
||||
self.summary_first_dropout = summary_first_dropout
|
||||
self.start_n_top = start_n_top
|
||||
self.end_n_top = end_n_top
|
||||
else:
|
||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||
" or the path to a pretrained model config file (str)")
|
||||
self.vocab_size = vocab_size
|
||||
self.emb_dim = emb_dim
|
||||
self.n_layers = n_layers
|
||||
self.n_heads = n_heads
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.gelu_activation = gelu_activation
|
||||
self.sinusoidal_embeddings = sinusoidal_embeddings
|
||||
self.causal = causal
|
||||
self.asm = asm
|
||||
self.n_langs = n_langs
|
||||
self.use_lang_emb = use_lang_emb
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.bos_index = bos_index
|
||||
self.eos_index = eos_index
|
||||
self.pad_index = pad_index
|
||||
self.unk_index = unk_index
|
||||
self.mask_index = mask_index
|
||||
self.is_encoder = is_encoder
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.embed_init_std = embed_init_std
|
||||
self.init_std = init_std
|
||||
self.summary_type = summary_type
|
||||
self.summary_use_proj = summary_use_proj
|
||||
self.summary_activation = summary_activation
|
||||
self.summary_proj_to_labels = summary_proj_to_labels
|
||||
self.summary_first_dropout = summary_first_dropout
|
||||
self.start_n_top = start_n_top
|
||||
self.end_n_top = end_n_top
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return self.n_words
|
||||
def n_words(self): # For backward compatibility
|
||||
return self.vocab_size
|
||||
|
||||
@vocab_size.setter
|
||||
def vocab_size(self, value):
|
||||
self.n_words = value
|
||||
@n_words.setter
|
||||
def n_words(self, value): # For backward compatibility
|
||||
self.vocab_size = value
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
|
||||
@@ -35,7 +35,7 @@ class XLNetConfig(PretrainedConfig):
|
||||
"""Configuration class to store the configuration of a ``XLNetModel``.
|
||||
|
||||
Args:
|
||||
vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
|
||||
vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
|
||||
d_model: Size of the encoder layers and the pooler layer.
|
||||
n_layer: Number of hidden layers in the Transformer encoder.
|
||||
n_head: Number of attention heads for each attention layer in
|
||||
@@ -72,28 +72,22 @@ class XLNetConfig(PretrainedConfig):
|
||||
pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
||||
def __init__(self,
|
||||
vocab_size_or_config_json_file=32000,
|
||||
vocab_size=32000,
|
||||
d_model=1024,
|
||||
n_layer=24,
|
||||
n_head=16,
|
||||
d_inner=4096,
|
||||
max_position_embeddings=512,
|
||||
ff_activation="gelu",
|
||||
untie_r=True,
|
||||
attn_type="bi",
|
||||
|
||||
initializer_range=0.02,
|
||||
layer_norm_eps=1e-12,
|
||||
|
||||
dropout=0.1,
|
||||
mem_len=None,
|
||||
reuse_len=None,
|
||||
bi_data=False,
|
||||
clamp_len=-1,
|
||||
same_length=False,
|
||||
|
||||
finetuning_task=None,
|
||||
num_labels=2,
|
||||
summary_type='last',
|
||||
summary_use_proj=True,
|
||||
summary_activation='tanh',
|
||||
@@ -104,58 +98,45 @@ class XLNetConfig(PretrainedConfig):
|
||||
"""Constructs XLNetConfig.
|
||||
"""
|
||||
super(XLNetConfig, self).__init__(**kwargs)
|
||||
self.vocab_size = vocab_size
|
||||
self.d_model = d_model
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
assert d_model % n_head == 0
|
||||
self.d_head = d_model // n_head
|
||||
self.ff_activation = ff_activation
|
||||
self.d_inner = d_inner
|
||||
self.untie_r = untie_r
|
||||
self.attn_type = attn_type
|
||||
|
||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
||||
json_config = json.loads(reader.read())
|
||||
for key, value in json_config.items():
|
||||
setattr(config, key, value)
|
||||
elif isinstance(vocab_size_or_config_json_file, int):
|
||||
self.n_token = vocab_size_or_config_json_file
|
||||
self.d_model = d_model
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
assert d_model % n_head == 0
|
||||
self.d_head = d_model // n_head
|
||||
self.ff_activation = ff_activation
|
||||
self.d_inner = d_inner
|
||||
self.untie_r = untie_r
|
||||
self.attn_type = attn_type
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.dropout = dropout
|
||||
self.mem_len = mem_len
|
||||
self.reuse_len = reuse_len
|
||||
self.bi_data = bi_data
|
||||
self.clamp_len = clamp_len
|
||||
self.same_length = same_length
|
||||
|
||||
self.dropout = dropout
|
||||
self.mem_len = mem_len
|
||||
self.reuse_len = reuse_len
|
||||
self.bi_data = bi_data
|
||||
self.clamp_len = clamp_len
|
||||
self.same_length = same_length
|
||||
|
||||
self.finetuning_task = finetuning_task
|
||||
self.num_labels = num_labels
|
||||
self.summary_type = summary_type
|
||||
self.summary_use_proj = summary_use_proj
|
||||
self.summary_activation = summary_activation
|
||||
self.summary_last_dropout = summary_last_dropout
|
||||
self.start_n_top = start_n_top
|
||||
self.end_n_top = end_n_top
|
||||
else:
|
||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||
" or the path to a pretrained model config file (str)")
|
||||
self.summary_type = summary_type
|
||||
self.summary_use_proj = summary_use_proj
|
||||
self.summary_activation = summary_activation
|
||||
self.summary_last_dropout = summary_last_dropout
|
||||
self.start_n_top = start_n_top
|
||||
self.end_n_top = end_n_top
|
||||
|
||||
@property
|
||||
def max_position_embeddings(self):
|
||||
return -1
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return self.n_token
|
||||
def n_token(self): # Backward compatibility
|
||||
return self.vocab_size
|
||||
|
||||
@vocab_size.setter
|
||||
def vocab_size(self, value):
|
||||
self.n_token = value
|
||||
@n_token.setter
|
||||
def n_token(self, value): # Backward compatibility
|
||||
self.vocab_size = value
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
|
||||
@@ -46,7 +46,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
||||
roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
|
||||
roberta.eval() # disable dropout
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=50265,
|
||||
vocab_size=50265,
|
||||
hidden_size=roberta.args.encoder_embed_dim,
|
||||
num_hidden_layers=roberta.args.encoder_layers,
|
||||
num_attention_heads=roberta.args.encoder_attention_heads,
|
||||
|
||||
@@ -634,6 +634,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
"""
|
||||
def __init__(self, config):
|
||||
super(GPT2DoubleHeadsModel, self).__init__(config)
|
||||
config.num_labels = 1
|
||||
self.transformer = GPT2Model(config)
|
||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||
self.multiple_choice_head = SequenceSummary(config)
|
||||
|
||||
@@ -574,6 +574,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
|
||||
"""
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
|
||||
config.num_labels = 1
|
||||
self.transformer = TFGPT2MainLayer(config, name='transformer')
|
||||
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
|
||||
|
||||
|
||||
@@ -353,7 +353,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
|
||||
self.output_attentions = config.output_attentions
|
||||
self.output_hidden_states = config.output_hidden_states
|
||||
|
||||
self.n_token = config.n_token
|
||||
self.n_token = config.vocab_size
|
||||
|
||||
self.d_embed = config.d_embed
|
||||
self.d_model = config.d_model
|
||||
@@ -361,7 +361,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
|
||||
self.d_head = config.d_head
|
||||
self.untie_r = config.untie_r
|
||||
|
||||
self.word_emb = TFAdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
|
||||
self.word_emb = TFAdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs,
|
||||
div_val=config.div_val, init_std=config.init_std, name='word_emb')
|
||||
|
||||
self.drop = tf.keras.layers.Dropout(config.dropout)
|
||||
@@ -729,7 +729,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
|
||||
raise NotImplementedError
|
||||
# use adaptive softmax (including standard softmax)
|
||||
else:
|
||||
self.crit = TFAdaptiveSoftmaxMask(config.n_token, config.d_embed, config.d_model,
|
||||
self.crit = TFAdaptiveSoftmaxMask(config.vocab_size, config.d_embed, config.d_model,
|
||||
config.cutoffs, div_val=config.div_val, name='crit')
|
||||
|
||||
def reset_length(self, tgt_len, ext_len, mem_len):
|
||||
|
||||
@@ -25,15 +25,15 @@ import tensorflow as tf
|
||||
from .modeling_tf_utils import shape_list
|
||||
|
||||
class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
||||
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
|
||||
def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1,
|
||||
keep_order=False, **kwargs):
|
||||
super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
|
||||
|
||||
self.n_token = n_token
|
||||
self.vocab_size = vocab_size
|
||||
self.d_embed = d_embed
|
||||
self.d_proj = d_proj
|
||||
|
||||
self.cutoffs = cutoffs + [n_token]
|
||||
self.cutoffs = cutoffs + [vocab_size]
|
||||
self.cutoff_ends = [0] + self.cutoffs
|
||||
self.div_val = div_val
|
||||
|
||||
@@ -66,11 +66,11 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
||||
self.out_projs.append(weight)
|
||||
else:
|
||||
self.out_projs.append(None)
|
||||
weight = self.add_weight(shape=(self.n_token, self.d_embed,),
|
||||
weight = self.add_weight(shape=(self.vocab_size, self.d_embed,),
|
||||
initializer='zeros',
|
||||
trainable=True,
|
||||
name='out_layers_._{}_._weight'.format(i))
|
||||
bias = self.add_weight(shape=(self.n_token,),
|
||||
bias = self.add_weight(shape=(self.vocab_size,),
|
||||
initializer='zeros',
|
||||
trainable=True,
|
||||
name='out_layers_._{}_._bias'.format(i))
|
||||
@@ -114,7 +114,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
||||
hidden, target = inputs
|
||||
head_logprob = 0
|
||||
if self.n_clusters == 0:
|
||||
softmax_b = tf.get_variable('bias', [n_token], initializer=tf.zeros_initializer())
|
||||
softmax_b = tf.get_variable('bias', [self.config.vocab_size], initializer=tf.zeros_initializer())
|
||||
output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
|
||||
if target is not None:
|
||||
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
|
||||
|
||||
@@ -366,7 +366,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
|
||||
self.use_bfloat16 = config.use_bfloat16
|
||||
self.initializer_range = config.initializer_range
|
||||
|
||||
self.word_embedding = TFSharedEmbeddings(config.n_token, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
|
||||
self.word_embedding = TFSharedEmbeddings(config.vocab_size, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
|
||||
self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
|
||||
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
||||
|
||||
|
||||
@@ -592,14 +592,14 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
||||
self.output_attentions = config.output_attentions
|
||||
self.output_hidden_states = config.output_hidden_states
|
||||
|
||||
self.n_token = config.n_token
|
||||
self.n_token = config.vocab_size
|
||||
|
||||
self.d_embed = config.d_embed
|
||||
self.d_model = config.d_model
|
||||
self.n_head = config.n_head
|
||||
self.d_head = config.d_head
|
||||
|
||||
self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
|
||||
self.word_emb = AdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs,
|
||||
div_val=config.div_val)
|
||||
|
||||
self.drop = nn.Dropout(config.dropout)
|
||||
@@ -836,11 +836,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
|
||||
self.sample_softmax = config.sample_softmax
|
||||
# use sampled softmax
|
||||
if config.sample_softmax > 0:
|
||||
self.out_layer = nn.Linear(config.d_model, config.n_token)
|
||||
self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)
|
||||
self.out_layer = nn.Linear(config.d_model, config.vocab_size)
|
||||
self.sampler = LogUniformSampler(config.vocab_size, config.sample_softmax)
|
||||
# use adaptive softmax (including standard softmax)
|
||||
else:
|
||||
self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model,
|
||||
self.crit = ProjectedAdaptiveLogSoftmax(config.vocab_size, config.d_embed, config.d_model,
|
||||
config.cutoffs, div_val=config.div_val)
|
||||
self.init_weights()
|
||||
|
||||
|
||||
@@ -609,7 +609,7 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
self.clamp_len = config.clamp_len
|
||||
self.n_layer = config.n_layer
|
||||
|
||||
self.word_embedding = nn.Embedding(config.n_token, config.d_model)
|
||||
self.word_embedding = nn.Embedding(config.vocab_size, config.d_model)
|
||||
self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model))
|
||||
self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
|
||||
self.dropout = nn.Dropout(config.dropout)
|
||||
@@ -940,7 +940,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
||||
self.same_length = config.same_length
|
||||
|
||||
self.transformer = XLNetModel(config)
|
||||
self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
|
||||
self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True)
|
||||
|
||||
self.init_weights()
|
||||
|
||||
|
||||
@@ -110,7 +110,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = AlbertConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
hidden_size=self.hidden_size,
|
||||
num_hidden_layers=self.num_hidden_layers,
|
||||
num_attention_heads=self.num_attention_heads,
|
||||
|
||||
@@ -109,7 +109,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
hidden_size=self.hidden_size,
|
||||
num_hidden_layers=self.num_hidden_layers,
|
||||
num_attention_heads=self.num_attention_heads,
|
||||
|
||||
@@ -633,7 +633,7 @@ class CommonTestCases:
|
||||
mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
|
||||
|
||||
config = self.config_class(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
n_positions=self.n_positions,
|
||||
n_embd=self.hidden_size,
|
||||
n_layer=self.num_hidden_layers,
|
||||
|
||||
@@ -114,7 +114,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = CTRLConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
n_embd=self.hidden_size,
|
||||
n_layer=self.num_hidden_layers,
|
||||
n_head=self.num_attention_heads,
|
||||
|
||||
@@ -105,7 +105,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = DistilBertConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
dim=self.hidden_size,
|
||||
n_layers=self.num_hidden_layers,
|
||||
n_heads=self.num_attention_heads,
|
||||
|
||||
@@ -110,7 +110,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = GPT2Config(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
n_embd=self.hidden_size,
|
||||
n_layer=self.num_hidden_layers,
|
||||
n_head=self.num_attention_heads,
|
||||
|
||||
@@ -98,7 +98,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = OpenAIGPTConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
n_embd=self.hidden_size,
|
||||
n_layer=self.num_hidden_layers,
|
||||
n_head=self.num_attention_heads,
|
||||
|
||||
@@ -106,7 +106,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = RobertaConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
hidden_size=self.hidden_size,
|
||||
num_hidden_layers=self.num_hidden_layers,
|
||||
num_attention_heads=self.num_attention_heads,
|
||||
|
||||
@@ -118,7 +118,7 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = AlbertConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
hidden_size=self.hidden_size,
|
||||
num_hidden_layers=self.num_hidden_layers,
|
||||
num_attention_heads=self.num_attention_heads,
|
||||
|
||||
@@ -114,7 +114,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
hidden_size=self.hidden_size,
|
||||
num_hidden_layers=self.num_hidden_layers,
|
||||
num_attention_heads=self.num_attention_heads,
|
||||
|
||||
@@ -112,7 +112,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = CTRLConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
n_embd=self.hidden_size,
|
||||
n_layer=self.num_hidden_layers,
|
||||
n_head=self.num_attention_heads,
|
||||
|
||||
@@ -107,7 +107,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = DistilBertConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
dim=self.hidden_size,
|
||||
n_layers=self.num_hidden_layers,
|
||||
n_heads=self.num_attention_heads,
|
||||
|
||||
@@ -115,7 +115,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = GPT2Config(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
n_embd=self.hidden_size,
|
||||
n_layer=self.num_hidden_layers,
|
||||
n_head=self.num_attention_heads,
|
||||
|
||||
@@ -114,7 +114,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = OpenAIGPTConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
n_embd=self.hidden_size,
|
||||
n_layer=self.num_hidden_layers,
|
||||
n_head=self.num_attention_heads,
|
||||
|
||||
@@ -109,7 +109,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = RobertaConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
hidden_size=self.hidden_size,
|
||||
num_hidden_layers=self.num_hidden_layers,
|
||||
num_attention_heads=self.num_attention_heads,
|
||||
|
||||
@@ -92,7 +92,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
|
||||
config = TransfoXLConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
mem_len=self.mem_len,
|
||||
clamp_len=self.clamp_len,
|
||||
cutoffs=self.cutoffs,
|
||||
|
||||
@@ -125,7 +125,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
|
||||
|
||||
config = XLMConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
n_special=self.n_special,
|
||||
emb_dim=self.hidden_size,
|
||||
n_layers=self.num_hidden_layers,
|
||||
|
||||
@@ -64,7 +64,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
num_attention_heads=4,
|
||||
d_inner=128,
|
||||
num_hidden_layers=5,
|
||||
max_position_embeddings=10,
|
||||
type_sequence_label_size=2,
|
||||
untie_r=True,
|
||||
bi_data=False,
|
||||
@@ -88,7 +87,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.d_inner = d_inner
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.bi_data = bi_data
|
||||
self.untie_r = untie_r
|
||||
self.same_length = same_length
|
||||
@@ -122,13 +120,12 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
|
||||
|
||||
config = XLNetConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
d_model=self.hidden_size,
|
||||
n_head=self.num_attention_heads,
|
||||
d_inner=self.d_inner,
|
||||
n_layer=self.num_hidden_layers,
|
||||
untie_r=self.untie_r,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
mem_len=self.mem_len,
|
||||
clamp_len=self.clamp_len,
|
||||
same_length=self.same_length,
|
||||
|
||||
@@ -91,7 +91,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
||||
lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
|
||||
config = TransfoXLConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
mem_len=self.mem_len,
|
||||
clamp_len=self.clamp_len,
|
||||
cutoffs=self.cutoffs,
|
||||
|
||||
@@ -121,7 +121,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||
is_impossible_labels = ids_tensor([self.batch_size], 2).float()
|
||||
|
||||
config = XLMConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
n_special=self.n_special,
|
||||
emb_dim=self.hidden_size,
|
||||
n_layers=self.num_hidden_layers,
|
||||
|
||||
@@ -60,7 +60,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
num_attention_heads=4,
|
||||
d_inner=128,
|
||||
num_hidden_layers=5,
|
||||
max_position_embeddings=10,
|
||||
type_sequence_label_size=2,
|
||||
untie_r=True,
|
||||
bi_data=False,
|
||||
@@ -84,7 +83,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.d_inner = d_inner
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.bi_data = bi_data
|
||||
self.untie_r = untie_r
|
||||
self.same_length = same_length
|
||||
@@ -116,13 +114,12 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||
|
||||
config = XLNetConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
vocab_size=self.vocab_size,
|
||||
d_model=self.hidden_size,
|
||||
n_head=self.num_attention_heads,
|
||||
d_inner=self.d_inner,
|
||||
n_layer=self.num_hidden_layers,
|
||||
untie_r=self.untie_r,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
mem_len=self.mem_len,
|
||||
clamp_len=self.clamp_len,
|
||||
same_length=self.same_length,
|
||||
|
||||
Reference in New Issue
Block a user