cleaning up configuration classes
This commit is contained in:
@@ -65,7 +65,7 @@ class BertAbsConfig(PretrainedConfig):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vocab_size_or_config_json_file=30522,
|
vocab_size=30522,
|
||||||
max_pos=512,
|
max_pos=512,
|
||||||
enc_layers=6,
|
enc_layers=6,
|
||||||
enc_hidden_size=512,
|
enc_hidden_size=512,
|
||||||
@@ -81,14 +81,14 @@ class BertAbsConfig(PretrainedConfig):
|
|||||||
):
|
):
|
||||||
super(BertAbsConfig, self).__init__(**kwargs)
|
super(BertAbsConfig, self).__init__(**kwargs)
|
||||||
|
|
||||||
if self._input_is_path_to_json(vocab_size_or_config_json_file):
|
if self._input_is_path_to_json(vocab_size):
|
||||||
path_to_json = vocab_size_or_config_json_file
|
path_to_json = vocab_size
|
||||||
with open(path_to_json, "r", encoding="utf-8") as reader:
|
with open(path_to_json, "r", encoding="utf-8") as reader:
|
||||||
json_config = json.loads(reader.read())
|
json_config = json.loads(reader.read())
|
||||||
for key, value in json_config.items():
|
for key, value in json_config.items():
|
||||||
self.__dict__[key] = value
|
self.__dict__[key] = value
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
elif isinstance(vocab_size, int):
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
self.vocab_size = vocab_size
|
||||||
self.max_pos = max_pos
|
self.max_pos = max_pos
|
||||||
|
|
||||||
self.enc_layers = enc_layers
|
self.enc_layers = enc_layers
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ class XxxConfig(PretrainedConfig):
|
|||||||
|
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XxxModel`.
|
vocab_size: Vocabulary size of `inputs_ids` in `XxxModel`.
|
||||||
hidden_size: Size of the encoder layers and the pooler layer.
|
hidden_size: Size of the encoder layers and the pooler layer.
|
||||||
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
||||||
num_attention_heads: Number of attention heads for each attention layer in
|
num_attention_heads: Number of attention heads for each attention layer in
|
||||||
@@ -64,7 +64,7 @@ class XxxConfig(PretrainedConfig):
|
|||||||
pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=50257,
|
vocab_size=50257,
|
||||||
n_positions=1024,
|
n_positions=1024,
|
||||||
n_ctx=1024,
|
n_ctx=1024,
|
||||||
n_embd=768,
|
n_embd=768,
|
||||||
@@ -84,7 +84,7 @@ class XxxConfig(PretrainedConfig):
|
|||||||
summary_first_dropout=0.1,
|
summary_first_dropout=0.1,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
super(XxxConfig, self).__init__(**kwargs)
|
super(XxxConfig, self).__init__(**kwargs)
|
||||||
self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, six.string_types) else -1
|
self.vocab_size = vocab_size if isinstance(vocab_size, six.string_types) else -1
|
||||||
self.n_ctx = n_ctx
|
self.n_ctx = n_ctx
|
||||||
self.n_positions = n_positions
|
self.n_positions = n_positions
|
||||||
self.n_embd = n_embd
|
self.n_embd = n_embd
|
||||||
@@ -102,12 +102,12 @@ class XxxConfig(PretrainedConfig):
|
|||||||
self.summary_activation = summary_activation
|
self.summary_activation = summary_activation
|
||||||
self.summary_first_dropout = summary_first_dropout
|
self.summary_first_dropout = summary_first_dropout
|
||||||
self.summary_proj_to_labels = summary_proj_to_labels
|
self.summary_proj_to_labels = summary_proj_to_labels
|
||||||
if isinstance(vocab_size_or_config_json_file, six.string_types):
|
if isinstance(vocab_size, six.string_types):
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
with open(vocab_size, "r", encoding="utf-8") as reader:
|
||||||
json_config = json.loads(reader.read())
|
json_config = json.loads(reader.read())
|
||||||
for key, value in json_config.items():
|
for key, value in json_config.items():
|
||||||
self.__dict__[key] = value
|
self.__dict__[key] = value
|
||||||
elif not isinstance(vocab_size_or_config_json_file, int):
|
elif not isinstance(vocab_size, int):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"First argument must be either a vocabulary size (int)"
|
"First argument must be either a vocabulary size (int)"
|
||||||
"or the path to a pretrained model config file (str)"
|
"or the path to a pretrained model config file (str)"
|
||||||
|
|||||||
@@ -111,7 +111,7 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = XxxConfig(
|
config = XxxConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -109,7 +109,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = XxxConfig(
|
config = XxxConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ class AlbertConfig(PretrainedConfig):
|
|||||||
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=30000,
|
vocab_size=30000,
|
||||||
embedding_size=128,
|
embedding_size=128,
|
||||||
hidden_size=4096,
|
hidden_size=4096,
|
||||||
num_hidden_layers=12,
|
num_hidden_layers=12,
|
||||||
@@ -83,7 +83,7 @@ class AlbertConfig(PretrainedConfig):
|
|||||||
"""
|
"""
|
||||||
super(AlbertConfig, self).__init__(**kwargs)
|
super(AlbertConfig, self).__init__(**kwargs)
|
||||||
|
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
self.vocab_size = vocab_size
|
||||||
self.embedding_size = embedding_size
|
self.embedding_size = embedding_size
|
||||||
self.hidden_size = hidden_size
|
self.hidden_size = hidden_size
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.num_hidden_layers = num_hidden_layers
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ class BertConfig(PretrainedConfig):
|
|||||||
|
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
|
vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
|
||||||
hidden_size: Size of the encoder layers and the pooler layer.
|
hidden_size: Size of the encoder layers and the pooler layer.
|
||||||
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
||||||
num_attention_heads: Number of attention heads for each attention layer in
|
num_attention_heads: Number of attention heads for each attention layer in
|
||||||
@@ -81,7 +81,7 @@ class BertConfig(PretrainedConfig):
|
|||||||
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=30522,
|
vocab_size=30522,
|
||||||
hidden_size=768,
|
hidden_size=768,
|
||||||
num_hidden_layers=12,
|
num_hidden_layers=12,
|
||||||
num_attention_heads=12,
|
num_attention_heads=12,
|
||||||
@@ -95,25 +95,15 @@ class BertConfig(PretrainedConfig):
|
|||||||
layer_norm_eps=1e-12,
|
layer_norm_eps=1e-12,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
super(BertConfig, self).__init__(**kwargs)
|
super(BertConfig, self).__init__(**kwargs)
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
self.vocab_size = vocab_size
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
self.hidden_size = hidden_size
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
self.num_hidden_layers = num_hidden_layers
|
||||||
json_config = json.loads(reader.read())
|
self.num_attention_heads = num_attention_heads
|
||||||
for key, value in json_config.items():
|
self.hidden_act = hidden_act
|
||||||
self.__dict__[key] = value
|
self.intermediate_size = intermediate_size
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
self.hidden_dropout_prob = hidden_dropout_prob
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||||
self.hidden_size = hidden_size
|
self.max_position_embeddings = max_position_embeddings
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.type_vocab_size = type_vocab_size
|
||||||
self.num_attention_heads = num_attention_heads
|
self.initializer_range = initializer_range
|
||||||
self.hidden_act = hidden_act
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.intermediate_size = intermediate_size
|
|
||||||
self.hidden_dropout_prob = hidden_dropout_prob
|
|
||||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
|
||||||
self.max_position_embeddings = max_position_embeddings
|
|
||||||
self.type_vocab_size = type_vocab_size
|
|
||||||
self.initializer_range = initializer_range
|
|
||||||
self.layer_norm_eps = layer_norm_eps
|
|
||||||
else:
|
|
||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
|
||||||
" or the path to a pretrained model config file (str)")
|
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ class CTRLConfig(PretrainedConfig):
|
|||||||
"""Configuration class to store the configuration of a `CTRLModel`.
|
"""Configuration class to store the configuration of a `CTRLModel`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
||||||
n_positions: Number of positional embeddings.
|
n_positions: Number of positional embeddings.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||||
dff: Size of the inner dimension of the FFN.
|
dff: Size of the inner dimension of the FFN.
|
||||||
@@ -52,7 +52,7 @@ class CTRLConfig(PretrainedConfig):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vocab_size_or_config_json_file=246534,
|
vocab_size=246534,
|
||||||
n_positions=256,
|
n_positions=256,
|
||||||
n_ctx=256,
|
n_ctx=256,
|
||||||
n_embd=1280,
|
n_embd=1280,
|
||||||
@@ -64,8 +64,6 @@ class CTRLConfig(PretrainedConfig):
|
|||||||
attn_pdrop=0.1,
|
attn_pdrop=0.1,
|
||||||
layer_norm_epsilon=1e-6,
|
layer_norm_epsilon=1e-6,
|
||||||
initializer_range=0.02,
|
initializer_range=0.02,
|
||||||
|
|
||||||
num_labels=1,
|
|
||||||
summary_type='cls_index',
|
summary_type='cls_index',
|
||||||
summary_use_proj=True,
|
summary_use_proj=True,
|
||||||
summary_activation=None,
|
summary_activation=None,
|
||||||
@@ -76,7 +74,7 @@ class CTRLConfig(PretrainedConfig):
|
|||||||
"""Constructs CTRLConfig.
|
"""Constructs CTRLConfig.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
||||||
n_positions: Number of positional embeddings.
|
n_positions: Number of positional embeddings.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||||
dff: Size of the inner dimension of the FFN.
|
dff: Size of the inner dimension of the FFN.
|
||||||
@@ -94,8 +92,7 @@ class CTRLConfig(PretrainedConfig):
|
|||||||
initializing all weight matrices.
|
initializing all weight matrices.
|
||||||
"""
|
"""
|
||||||
super(CTRLConfig, self).__init__(**kwargs)
|
super(CTRLConfig, self).__init__(**kwargs)
|
||||||
|
self.vocab_size = vocab_size
|
||||||
self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
|
|
||||||
self.n_ctx = n_ctx
|
self.n_ctx = n_ctx
|
||||||
self.n_positions = n_positions
|
self.n_positions = n_positions
|
||||||
self.n_embd = n_embd
|
self.n_embd = n_embd
|
||||||
@@ -108,23 +105,11 @@ class CTRLConfig(PretrainedConfig):
|
|||||||
self.layer_norm_epsilon = layer_norm_epsilon
|
self.layer_norm_epsilon = layer_norm_epsilon
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.summary_type = summary_type
|
self.summary_type = summary_type
|
||||||
self.summary_use_proj = summary_use_proj
|
self.summary_use_proj = summary_use_proj
|
||||||
self.summary_activation = summary_activation
|
self.summary_activation = summary_activation
|
||||||
self.summary_first_dropout = summary_first_dropout
|
self.summary_first_dropout = summary_first_dropout
|
||||||
self.summary_proj_to_labels = summary_proj_to_labels
|
self.summary_proj_to_labels = summary_proj_to_labels
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
self.__dict__[key] = value
|
|
||||||
elif not isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
raise ValueError(
|
|
||||||
"First argument must be either a vocabulary size (int)"
|
|
||||||
"or the path to a pretrained model config file (str)"
|
|
||||||
)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def max_position_embeddings(self):
|
def max_position_embeddings(self):
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ class DistilBertConfig(PretrainedConfig):
|
|||||||
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=30522,
|
vocab_size=30522,
|
||||||
max_position_embeddings=512,
|
max_position_embeddings=512,
|
||||||
sinusoidal_pos_embds=False,
|
sinusoidal_pos_embds=False,
|
||||||
n_layers=6,
|
n_layers=6,
|
||||||
@@ -53,31 +53,21 @@ class DistilBertConfig(PretrainedConfig):
|
|||||||
seq_classif_dropout=0.2,
|
seq_classif_dropout=0.2,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
super(DistilBertConfig, self).__init__(**kwargs)
|
super(DistilBertConfig, self).__init__(**kwargs)
|
||||||
|
self.vocab_size = vocab_size
|
||||||
|
self.max_position_embeddings = max_position_embeddings
|
||||||
|
self.sinusoidal_pos_embds = sinusoidal_pos_embds
|
||||||
|
self.n_layers = n_layers
|
||||||
|
self.n_heads = n_heads
|
||||||
|
self.dim = dim
|
||||||
|
self.hidden_dim = hidden_dim
|
||||||
|
self.dropout = dropout
|
||||||
|
self.attention_dropout = attention_dropout
|
||||||
|
self.activation = activation
|
||||||
|
self.initializer_range = initializer_range
|
||||||
|
self.tie_weights_ = tie_weights_
|
||||||
|
self.qa_dropout = qa_dropout
|
||||||
|
self.seq_classif_dropout = seq_classif_dropout
|
||||||
|
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
self.__dict__[key] = value
|
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
|
||||||
self.max_position_embeddings = max_position_embeddings
|
|
||||||
self.sinusoidal_pos_embds = sinusoidal_pos_embds
|
|
||||||
self.n_layers = n_layers
|
|
||||||
self.n_heads = n_heads
|
|
||||||
self.dim = dim
|
|
||||||
self.hidden_dim = hidden_dim
|
|
||||||
self.dropout = dropout
|
|
||||||
self.attention_dropout = attention_dropout
|
|
||||||
self.activation = activation
|
|
||||||
self.initializer_range = initializer_range
|
|
||||||
self.tie_weights_ = tie_weights_
|
|
||||||
self.qa_dropout = qa_dropout
|
|
||||||
self.seq_classif_dropout = seq_classif_dropout
|
|
||||||
else:
|
|
||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
|
||||||
" or the path to a pretrained model config file (str)")
|
|
||||||
@property
|
@property
|
||||||
def hidden_size(self):
|
def hidden_size(self):
|
||||||
return self.dim
|
return self.dim
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ class GPT2Config(PretrainedConfig):
|
|||||||
"""Configuration class to store the configuration of a `GPT2Model`.
|
"""Configuration class to store the configuration of a `GPT2Model`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
||||||
n_positions: Number of positional embeddings.
|
n_positions: Number of positional embeddings.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
n_embd: Dimensionality of the embeddings and hidden states.
|
||||||
@@ -56,7 +56,7 @@ class GPT2Config(PretrainedConfig):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vocab_size_or_config_json_file=50257,
|
vocab_size=50257,
|
||||||
n_positions=1024,
|
n_positions=1024,
|
||||||
n_ctx=1024,
|
n_ctx=1024,
|
||||||
n_embd=768,
|
n_embd=768,
|
||||||
@@ -67,8 +67,6 @@ class GPT2Config(PretrainedConfig):
|
|||||||
attn_pdrop=0.1,
|
attn_pdrop=0.1,
|
||||||
layer_norm_epsilon=1e-5,
|
layer_norm_epsilon=1e-5,
|
||||||
initializer_range=0.02,
|
initializer_range=0.02,
|
||||||
|
|
||||||
num_labels=1,
|
|
||||||
summary_type='cls_index',
|
summary_type='cls_index',
|
||||||
summary_use_proj=True,
|
summary_use_proj=True,
|
||||||
summary_activation=None,
|
summary_activation=None,
|
||||||
@@ -79,7 +77,7 @@ class GPT2Config(PretrainedConfig):
|
|||||||
"""Constructs GPT2Config.
|
"""Constructs GPT2Config.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
||||||
n_positions: Number of positional embeddings.
|
n_positions: Number of positional embeddings.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
n_embd: Dimensionality of the embeddings and hidden states.
|
||||||
@@ -96,37 +94,22 @@ class GPT2Config(PretrainedConfig):
|
|||||||
initializing all weight matrices.
|
initializing all weight matrices.
|
||||||
"""
|
"""
|
||||||
super(GPT2Config, self).__init__(**kwargs)
|
super(GPT2Config, self).__init__(**kwargs)
|
||||||
|
self.vocab_size = vocab_size
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
self.n_ctx = n_ctx
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
self.n_positions = n_positions
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
self.n_embd = n_embd
|
||||||
json_config = json.loads(reader.read())
|
self.n_layer = n_layer
|
||||||
for key, value in json_config.items():
|
self.n_head = n_head
|
||||||
self.__dict__[key] = value
|
self.resid_pdrop = resid_pdrop
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
self.embd_pdrop = embd_pdrop
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
self.attn_pdrop = attn_pdrop
|
||||||
self.n_ctx = n_ctx
|
self.layer_norm_epsilon = layer_norm_epsilon
|
||||||
self.n_positions = n_positions
|
self.initializer_range = initializer_range
|
||||||
self.n_embd = n_embd
|
self.summary_type = summary_type
|
||||||
self.n_layer = n_layer
|
self.summary_use_proj = summary_use_proj
|
||||||
self.n_head = n_head
|
self.summary_activation = summary_activation
|
||||||
self.resid_pdrop = resid_pdrop
|
self.summary_first_dropout = summary_first_dropout
|
||||||
self.embd_pdrop = embd_pdrop
|
self.summary_proj_to_labels = summary_proj_to_labels
|
||||||
self.attn_pdrop = attn_pdrop
|
|
||||||
self.layer_norm_epsilon = layer_norm_epsilon
|
|
||||||
self.initializer_range = initializer_range
|
|
||||||
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.summary_type = summary_type
|
|
||||||
self.summary_use_proj = summary_use_proj
|
|
||||||
self.summary_activation = summary_activation
|
|
||||||
self.summary_first_dropout = summary_first_dropout
|
|
||||||
self.summary_proj_to_labels = summary_proj_to_labels
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
"First argument must be either a vocabulary size (int)"
|
|
||||||
"or the path to a pretrained model config file (str)"
|
|
||||||
)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def max_position_embeddings(self):
|
def max_position_embeddings(self):
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
Configuration class to store the configuration of a `OpenAIGPTModel`.
|
Configuration class to store the configuration of a `OpenAIGPTModel`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
|
vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
|
||||||
n_positions: Number of positional embeddings.
|
n_positions: Number of positional embeddings.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
n_embd: Dimensionality of the embeddings and hidden states.
|
||||||
@@ -58,7 +58,7 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vocab_size_or_config_json_file=40478,
|
vocab_size=40478,
|
||||||
n_positions=512,
|
n_positions=512,
|
||||||
n_ctx=512,
|
n_ctx=512,
|
||||||
n_embd=768,
|
n_embd=768,
|
||||||
@@ -71,8 +71,6 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
layer_norm_epsilon=1e-5,
|
layer_norm_epsilon=1e-5,
|
||||||
initializer_range=0.02,
|
initializer_range=0.02,
|
||||||
predict_special_tokens=True,
|
predict_special_tokens=True,
|
||||||
|
|
||||||
num_labels=1,
|
|
||||||
summary_type='cls_index',
|
summary_type='cls_index',
|
||||||
summary_use_proj=True,
|
summary_use_proj=True,
|
||||||
summary_activation=None,
|
summary_activation=None,
|
||||||
@@ -83,39 +81,24 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
"""Constructs OpenAIGPTConfig.
|
"""Constructs OpenAIGPTConfig.
|
||||||
"""
|
"""
|
||||||
super(OpenAIGPTConfig, self).__init__(**kwargs)
|
super(OpenAIGPTConfig, self).__init__(**kwargs)
|
||||||
|
self.vocab_size = vocab_size
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
self.n_ctx = n_ctx
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
self.n_positions = n_positions
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
self.n_embd = n_embd
|
||||||
json_config = json.loads(reader.read())
|
self.n_layer = n_layer
|
||||||
for key, value in json_config.items():
|
self.n_head = n_head
|
||||||
self.__dict__[key] = value
|
self.afn = afn
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
self.resid_pdrop = resid_pdrop
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
self.embd_pdrop = embd_pdrop
|
||||||
self.n_ctx = n_ctx
|
self.attn_pdrop = attn_pdrop
|
||||||
self.n_positions = n_positions
|
self.layer_norm_epsilon = layer_norm_epsilon
|
||||||
self.n_embd = n_embd
|
self.initializer_range = initializer_range
|
||||||
self.n_layer = n_layer
|
self.predict_special_tokens = predict_special_tokens
|
||||||
self.n_head = n_head
|
self.summary_type = summary_type
|
||||||
self.afn = afn
|
self.summary_use_proj = summary_use_proj
|
||||||
self.resid_pdrop = resid_pdrop
|
self.summary_activation = summary_activation
|
||||||
self.embd_pdrop = embd_pdrop
|
self.summary_first_dropout = summary_first_dropout
|
||||||
self.attn_pdrop = attn_pdrop
|
self.summary_proj_to_labels = summary_proj_to_labels
|
||||||
self.layer_norm_epsilon = layer_norm_epsilon
|
|
||||||
self.initializer_range = initializer_range
|
|
||||||
self.predict_special_tokens = predict_special_tokens
|
|
||||||
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.summary_type = summary_type
|
|
||||||
self.summary_use_proj = summary_use_proj
|
|
||||||
self.summary_activation = summary_activation
|
|
||||||
self.summary_first_dropout = summary_first_dropout
|
|
||||||
self.summary_proj_to_labels = summary_proj_to_labels
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
"First argument must be either a vocabulary size (int)"
|
|
||||||
"or the path to a pretrained model config file (str)"
|
|
||||||
)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def max_position_embeddings(self):
|
def max_position_embeddings(self):
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
"""Configuration class to store the configuration of a `TransfoXLModel`.
|
"""Configuration class to store the configuration of a `TransfoXLModel`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
|
vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
|
||||||
cutoffs: cutoffs for the adaptive softmax
|
cutoffs: cutoffs for the adaptive softmax
|
||||||
d_model: Dimensionality of the model's hidden states.
|
d_model: Dimensionality of the model's hidden states.
|
||||||
d_embed: Dimensionality of the embeddings
|
d_embed: Dimensionality of the embeddings
|
||||||
@@ -68,7 +68,7 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=267735,
|
vocab_size=267735,
|
||||||
cutoffs=[20000, 40000, 200000],
|
cutoffs=[20000, 40000, 200000],
|
||||||
d_model=1024,
|
d_model=1024,
|
||||||
d_embed=1024,
|
d_embed=1024,
|
||||||
@@ -100,7 +100,7 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
"""Constructs TransfoXLConfig.
|
"""Constructs TransfoXLConfig.
|
||||||
"""
|
"""
|
||||||
super(TransfoXLConfig, self).__init__(**kwargs)
|
super(TransfoXLConfig, self).__init__(**kwargs)
|
||||||
self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
|
self.vocab_size = vocab_size
|
||||||
self.cutoffs = []
|
self.cutoffs = []
|
||||||
self.cutoffs.extend(cutoffs)
|
self.cutoffs.extend(cutoffs)
|
||||||
self.tie_weight = tie_weight
|
self.tie_weight = tie_weight
|
||||||
@@ -133,27 +133,17 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
self.init_std = init_std
|
self.init_std = init_std
|
||||||
self.layer_norm_epsilon = layer_norm_epsilon
|
self.layer_norm_epsilon = layer_norm_epsilon
|
||||||
|
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
self.__dict__[key] = value
|
|
||||||
elif not isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
|
||||||
" or the path to a pretrained model config file (str)")
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def max_position_embeddings(self):
|
def max_position_embeddings(self):
|
||||||
return self.tgt_len + self.ext_len + self.mem_len
|
return self.tgt_len + self.ext_len + self.mem_len
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vocab_size(self):
|
def n_token(self): # Backward compatibility
|
||||||
return self.n_token
|
return self.vocab_size
|
||||||
|
|
||||||
@vocab_size.setter
|
@n_token.setter
|
||||||
def vocab_size(self, value):
|
def n_token(self, value): # Backward compatibility
|
||||||
self.n_token = value
|
self.vocab_size = value
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def hidden_size(self):
|
def hidden_size(self):
|
||||||
|
|||||||
@@ -49,8 +49,7 @@ class PretrainedConfig(object):
|
|||||||
pretrained_config_archive_map = {}
|
pretrained_config_archive_map = {}
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
self.finetuning_task = kwargs.pop('finetuning_task', None)
|
# Attributes with defaults
|
||||||
self.num_labels = kwargs.pop('num_labels', 2)
|
|
||||||
self.output_attentions = kwargs.pop('output_attentions', False)
|
self.output_attentions = kwargs.pop('output_attentions', False)
|
||||||
self.output_hidden_states = kwargs.pop('output_hidden_states', False)
|
self.output_hidden_states = kwargs.pop('output_hidden_states', False)
|
||||||
self.output_past = kwargs.pop('output_past', True) # Not used by all models
|
self.output_past = kwargs.pop('output_past', True) # Not used by all models
|
||||||
@@ -59,6 +58,22 @@ class PretrainedConfig(object):
|
|||||||
self.pruned_heads = kwargs.pop('pruned_heads', {})
|
self.pruned_heads = kwargs.pop('pruned_heads', {})
|
||||||
self.is_decoder = kwargs.pop('is_decoder', False)
|
self.is_decoder = kwargs.pop('is_decoder', False)
|
||||||
|
|
||||||
|
# Fine-tuning task arguments
|
||||||
|
self.finetuning_task = kwargs.pop('finetuning_task', None)
|
||||||
|
self.num_labels = kwargs.pop('num_labels', 2)
|
||||||
|
self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)})
|
||||||
|
self.id2label = dict((int(key), value) for key, value in self.id2label.items())
|
||||||
|
self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys())))
|
||||||
|
self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
|
||||||
|
|
||||||
|
# Additional attributes without default values
|
||||||
|
for key, value in kwargs.items():
|
||||||
|
try:
|
||||||
|
setattr(self, key, value)
|
||||||
|
except AttributeError as err:
|
||||||
|
logger.error("Can't set {} with value {} for {}".format(key, value, self))
|
||||||
|
raise err
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
def save_pretrained(self, save_directory):
|
||||||
""" Save a configuration object to the directory `save_directory`, so that it
|
""" Save a configuration object to the directory `save_directory`, so that it
|
||||||
can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
|
can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
|
||||||
@@ -183,17 +198,15 @@ class PretrainedConfig(object):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def from_dict(cls, json_object):
|
def from_dict(cls, json_object):
|
||||||
"""Constructs a `Config` from a Python dictionary of parameters."""
|
"""Constructs a `Config` from a Python dictionary of parameters."""
|
||||||
config = cls(vocab_size_or_config_json_file=-1)
|
return cls(**json_object)
|
||||||
for key, value in json_object.items():
|
|
||||||
setattr(config, key, value)
|
|
||||||
return config
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_json_file(cls, json_file):
|
def from_json_file(cls, json_file):
|
||||||
"""Constructs a `Config` from a json file of parameters."""
|
"""Constructs a `Config` from a json file of parameters."""
|
||||||
with open(json_file, "r", encoding='utf-8') as reader:
|
with open(json_file, "r", encoding='utf-8') as reader:
|
||||||
text = reader.read()
|
text = reader.read()
|
||||||
return cls.from_dict(json.loads(text))
|
dict_obj = json.loads(text)
|
||||||
|
return cls(**dict_obj)
|
||||||
|
|
||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
return self.__dict__ == other.__dict__
|
return self.__dict__ == other.__dict__
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ class XLMConfig(PretrainedConfig):
|
|||||||
"""Configuration class to store the configuration of a `XLMModel`.
|
"""Configuration class to store the configuration of a `XLMModel`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
|
vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`.
|
||||||
d_model: Size of the encoder layers and the pooler layer.
|
d_model: Size of the encoder layers and the pooler layer.
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
n_layer: Number of hidden layers in the Transformer encoder.
|
||||||
n_head: Number of attention heads for each attention layer in
|
n_head: Number of attention heads for each attention layer in
|
||||||
@@ -81,7 +81,7 @@ class XLMConfig(PretrainedConfig):
|
|||||||
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=30145,
|
vocab_size=30145,
|
||||||
emb_dim=2048,
|
emb_dim=2048,
|
||||||
n_layers=12,
|
n_layers=12,
|
||||||
n_heads=16,
|
n_heads=16,
|
||||||
@@ -103,9 +103,6 @@ class XLMConfig(PretrainedConfig):
|
|||||||
unk_index=3,
|
unk_index=3,
|
||||||
mask_index=5,
|
mask_index=5,
|
||||||
is_encoder=True,
|
is_encoder=True,
|
||||||
|
|
||||||
finetuning_task=None,
|
|
||||||
num_labels=2,
|
|
||||||
summary_type='first',
|
summary_type='first',
|
||||||
summary_use_proj=True,
|
summary_use_proj=True,
|
||||||
summary_activation=None,
|
summary_activation=None,
|
||||||
@@ -117,56 +114,43 @@ class XLMConfig(PretrainedConfig):
|
|||||||
"""Constructs XLMConfig.
|
"""Constructs XLMConfig.
|
||||||
"""
|
"""
|
||||||
super(XLMConfig, self).__init__(**kwargs)
|
super(XLMConfig, self).__init__(**kwargs)
|
||||||
|
self.vocab_size = vocab_size
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
self.emb_dim = emb_dim
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
self.n_layers = n_layers
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
self.n_heads = n_heads
|
||||||
json_config = json.loads(reader.read())
|
self.dropout = dropout
|
||||||
for key, value in json_config.items():
|
self.attention_dropout = attention_dropout
|
||||||
self.__dict__[key] = value
|
self.gelu_activation = gelu_activation
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
self.sinusoidal_embeddings = sinusoidal_embeddings
|
||||||
self.n_words = vocab_size_or_config_json_file
|
self.causal = causal
|
||||||
self.emb_dim = emb_dim
|
self.asm = asm
|
||||||
self.n_layers = n_layers
|
self.n_langs = n_langs
|
||||||
self.n_heads = n_heads
|
self.use_lang_emb = use_lang_emb
|
||||||
self.dropout = dropout
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.attention_dropout = attention_dropout
|
self.bos_index = bos_index
|
||||||
self.gelu_activation = gelu_activation
|
self.eos_index = eos_index
|
||||||
self.sinusoidal_embeddings = sinusoidal_embeddings
|
self.pad_index = pad_index
|
||||||
self.causal = causal
|
self.unk_index = unk_index
|
||||||
self.asm = asm
|
self.mask_index = mask_index
|
||||||
self.n_langs = n_langs
|
self.is_encoder = is_encoder
|
||||||
self.use_lang_emb = use_lang_emb
|
self.max_position_embeddings = max_position_embeddings
|
||||||
self.layer_norm_eps = layer_norm_eps
|
self.embed_init_std = embed_init_std
|
||||||
self.bos_index = bos_index
|
self.init_std = init_std
|
||||||
self.eos_index = eos_index
|
self.summary_type = summary_type
|
||||||
self.pad_index = pad_index
|
self.summary_use_proj = summary_use_proj
|
||||||
self.unk_index = unk_index
|
self.summary_activation = summary_activation
|
||||||
self.mask_index = mask_index
|
self.summary_proj_to_labels = summary_proj_to_labels
|
||||||
self.is_encoder = is_encoder
|
self.summary_first_dropout = summary_first_dropout
|
||||||
self.max_position_embeddings = max_position_embeddings
|
self.start_n_top = start_n_top
|
||||||
self.embed_init_std = embed_init_std
|
self.end_n_top = end_n_top
|
||||||
self.init_std = init_std
|
|
||||||
self.finetuning_task = finetuning_task
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.summary_type = summary_type
|
|
||||||
self.summary_use_proj = summary_use_proj
|
|
||||||
self.summary_activation = summary_activation
|
|
||||||
self.summary_proj_to_labels = summary_proj_to_labels
|
|
||||||
self.summary_first_dropout = summary_first_dropout
|
|
||||||
self.start_n_top = start_n_top
|
|
||||||
self.end_n_top = end_n_top
|
|
||||||
else:
|
|
||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
|
||||||
" or the path to a pretrained model config file (str)")
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vocab_size(self):
|
def n_words(self): # For backward compatibility
|
||||||
return self.n_words
|
return self.vocab_size
|
||||||
|
|
||||||
@vocab_size.setter
|
@n_words.setter
|
||||||
def vocab_size(self, value):
|
def n_words(self, value): # For backward compatibility
|
||||||
self.n_words = value
|
self.vocab_size = value
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def hidden_size(self):
|
def hidden_size(self):
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ class XLNetConfig(PretrainedConfig):
|
|||||||
"""Configuration class to store the configuration of a ``XLNetModel``.
|
"""Configuration class to store the configuration of a ``XLNetModel``.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
|
vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
|
||||||
d_model: Size of the encoder layers and the pooler layer.
|
d_model: Size of the encoder layers and the pooler layer.
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
n_layer: Number of hidden layers in the Transformer encoder.
|
||||||
n_head: Number of attention heads for each attention layer in
|
n_head: Number of attention heads for each attention layer in
|
||||||
@@ -72,28 +72,22 @@ class XLNetConfig(PretrainedConfig):
|
|||||||
pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=32000,
|
vocab_size=32000,
|
||||||
d_model=1024,
|
d_model=1024,
|
||||||
n_layer=24,
|
n_layer=24,
|
||||||
n_head=16,
|
n_head=16,
|
||||||
d_inner=4096,
|
d_inner=4096,
|
||||||
max_position_embeddings=512,
|
|
||||||
ff_activation="gelu",
|
ff_activation="gelu",
|
||||||
untie_r=True,
|
untie_r=True,
|
||||||
attn_type="bi",
|
attn_type="bi",
|
||||||
|
|
||||||
initializer_range=0.02,
|
initializer_range=0.02,
|
||||||
layer_norm_eps=1e-12,
|
layer_norm_eps=1e-12,
|
||||||
|
|
||||||
dropout=0.1,
|
dropout=0.1,
|
||||||
mem_len=None,
|
mem_len=None,
|
||||||
reuse_len=None,
|
reuse_len=None,
|
||||||
bi_data=False,
|
bi_data=False,
|
||||||
clamp_len=-1,
|
clamp_len=-1,
|
||||||
same_length=False,
|
same_length=False,
|
||||||
|
|
||||||
finetuning_task=None,
|
|
||||||
num_labels=2,
|
|
||||||
summary_type='last',
|
summary_type='last',
|
||||||
summary_use_proj=True,
|
summary_use_proj=True,
|
||||||
summary_activation='tanh',
|
summary_activation='tanh',
|
||||||
@@ -104,58 +98,45 @@ class XLNetConfig(PretrainedConfig):
|
|||||||
"""Constructs XLNetConfig.
|
"""Constructs XLNetConfig.
|
||||||
"""
|
"""
|
||||||
super(XLNetConfig, self).__init__(**kwargs)
|
super(XLNetConfig, self).__init__(**kwargs)
|
||||||
|
self.vocab_size = vocab_size
|
||||||
|
self.d_model = d_model
|
||||||
|
self.n_layer = n_layer
|
||||||
|
self.n_head = n_head
|
||||||
|
assert d_model % n_head == 0
|
||||||
|
self.d_head = d_model // n_head
|
||||||
|
self.ff_activation = ff_activation
|
||||||
|
self.d_inner = d_inner
|
||||||
|
self.untie_r = untie_r
|
||||||
|
self.attn_type = attn_type
|
||||||
|
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
self.initializer_range = initializer_range
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
self.layer_norm_eps = layer_norm_eps
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
setattr(config, key, value)
|
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
self.n_token = vocab_size_or_config_json_file
|
|
||||||
self.d_model = d_model
|
|
||||||
self.n_layer = n_layer
|
|
||||||
self.n_head = n_head
|
|
||||||
assert d_model % n_head == 0
|
|
||||||
self.d_head = d_model // n_head
|
|
||||||
self.ff_activation = ff_activation
|
|
||||||
self.d_inner = d_inner
|
|
||||||
self.untie_r = untie_r
|
|
||||||
self.attn_type = attn_type
|
|
||||||
|
|
||||||
self.initializer_range = initializer_range
|
self.dropout = dropout
|
||||||
self.layer_norm_eps = layer_norm_eps
|
self.mem_len = mem_len
|
||||||
|
self.reuse_len = reuse_len
|
||||||
|
self.bi_data = bi_data
|
||||||
|
self.clamp_len = clamp_len
|
||||||
|
self.same_length = same_length
|
||||||
|
|
||||||
self.dropout = dropout
|
self.summary_type = summary_type
|
||||||
self.mem_len = mem_len
|
self.summary_use_proj = summary_use_proj
|
||||||
self.reuse_len = reuse_len
|
self.summary_activation = summary_activation
|
||||||
self.bi_data = bi_data
|
self.summary_last_dropout = summary_last_dropout
|
||||||
self.clamp_len = clamp_len
|
self.start_n_top = start_n_top
|
||||||
self.same_length = same_length
|
self.end_n_top = end_n_top
|
||||||
|
|
||||||
self.finetuning_task = finetuning_task
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.summary_type = summary_type
|
|
||||||
self.summary_use_proj = summary_use_proj
|
|
||||||
self.summary_activation = summary_activation
|
|
||||||
self.summary_last_dropout = summary_last_dropout
|
|
||||||
self.start_n_top = start_n_top
|
|
||||||
self.end_n_top = end_n_top
|
|
||||||
else:
|
|
||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
|
||||||
" or the path to a pretrained model config file (str)")
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def max_position_embeddings(self):
|
def max_position_embeddings(self):
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vocab_size(self):
|
def n_token(self): # Backward compatibility
|
||||||
return self.n_token
|
return self.vocab_size
|
||||||
|
|
||||||
@vocab_size.setter
|
@n_token.setter
|
||||||
def vocab_size(self, value):
|
def n_token(self, value): # Backward compatibility
|
||||||
self.n_token = value
|
self.vocab_size = value
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def hidden_size(self):
|
def hidden_size(self):
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
|||||||
roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
|
roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
|
||||||
roberta.eval() # disable dropout
|
roberta.eval() # disable dropout
|
||||||
config = BertConfig(
|
config = BertConfig(
|
||||||
vocab_size_or_config_json_file=50265,
|
vocab_size=50265,
|
||||||
hidden_size=roberta.args.encoder_embed_dim,
|
hidden_size=roberta.args.encoder_embed_dim,
|
||||||
num_hidden_layers=roberta.args.encoder_layers,
|
num_hidden_layers=roberta.args.encoder_layers,
|
||||||
num_attention_heads=roberta.args.encoder_attention_heads,
|
num_attention_heads=roberta.args.encoder_attention_heads,
|
||||||
|
|||||||
@@ -634,6 +634,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(GPT2DoubleHeadsModel, self).__init__(config)
|
super(GPT2DoubleHeadsModel, self).__init__(config)
|
||||||
|
config.num_labels = 1
|
||||||
self.transformer = GPT2Model(config)
|
self.transformer = GPT2Model(config)
|
||||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||||
self.multiple_choice_head = SequenceSummary(config)
|
self.multiple_choice_head = SequenceSummary(config)
|
||||||
|
|||||||
@@ -574,6 +574,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
|
super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
|
||||||
|
config.num_labels = 1
|
||||||
self.transformer = TFGPT2MainLayer(config, name='transformer')
|
self.transformer = TFGPT2MainLayer(config, name='transformer')
|
||||||
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
|
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
|
||||||
|
|
||||||
|
|||||||
@@ -353,7 +353,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
|
|||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
|
|
||||||
self.n_token = config.n_token
|
self.n_token = config.vocab_size
|
||||||
|
|
||||||
self.d_embed = config.d_embed
|
self.d_embed = config.d_embed
|
||||||
self.d_model = config.d_model
|
self.d_model = config.d_model
|
||||||
@@ -361,7 +361,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
|
|||||||
self.d_head = config.d_head
|
self.d_head = config.d_head
|
||||||
self.untie_r = config.untie_r
|
self.untie_r = config.untie_r
|
||||||
|
|
||||||
self.word_emb = TFAdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
|
self.word_emb = TFAdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs,
|
||||||
div_val=config.div_val, init_std=config.init_std, name='word_emb')
|
div_val=config.div_val, init_std=config.init_std, name='word_emb')
|
||||||
|
|
||||||
self.drop = tf.keras.layers.Dropout(config.dropout)
|
self.drop = tf.keras.layers.Dropout(config.dropout)
|
||||||
@@ -729,7 +729,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
|
|||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
# use adaptive softmax (including standard softmax)
|
# use adaptive softmax (including standard softmax)
|
||||||
else:
|
else:
|
||||||
self.crit = TFAdaptiveSoftmaxMask(config.n_token, config.d_embed, config.d_model,
|
self.crit = TFAdaptiveSoftmaxMask(config.vocab_size, config.d_embed, config.d_model,
|
||||||
config.cutoffs, div_val=config.div_val, name='crit')
|
config.cutoffs, div_val=config.div_val, name='crit')
|
||||||
|
|
||||||
def reset_length(self, tgt_len, ext_len, mem_len):
|
def reset_length(self, tgt_len, ext_len, mem_len):
|
||||||
|
|||||||
@@ -25,15 +25,15 @@ import tensorflow as tf
|
|||||||
from .modeling_tf_utils import shape_list
|
from .modeling_tf_utils import shape_list
|
||||||
|
|
||||||
class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
||||||
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
|
def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1,
|
||||||
keep_order=False, **kwargs):
|
keep_order=False, **kwargs):
|
||||||
super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
|
super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
|
||||||
|
|
||||||
self.n_token = n_token
|
self.vocab_size = vocab_size
|
||||||
self.d_embed = d_embed
|
self.d_embed = d_embed
|
||||||
self.d_proj = d_proj
|
self.d_proj = d_proj
|
||||||
|
|
||||||
self.cutoffs = cutoffs + [n_token]
|
self.cutoffs = cutoffs + [vocab_size]
|
||||||
self.cutoff_ends = [0] + self.cutoffs
|
self.cutoff_ends = [0] + self.cutoffs
|
||||||
self.div_val = div_val
|
self.div_val = div_val
|
||||||
|
|
||||||
@@ -66,11 +66,11 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
|||||||
self.out_projs.append(weight)
|
self.out_projs.append(weight)
|
||||||
else:
|
else:
|
||||||
self.out_projs.append(None)
|
self.out_projs.append(None)
|
||||||
weight = self.add_weight(shape=(self.n_token, self.d_embed,),
|
weight = self.add_weight(shape=(self.vocab_size, self.d_embed,),
|
||||||
initializer='zeros',
|
initializer='zeros',
|
||||||
trainable=True,
|
trainable=True,
|
||||||
name='out_layers_._{}_._weight'.format(i))
|
name='out_layers_._{}_._weight'.format(i))
|
||||||
bias = self.add_weight(shape=(self.n_token,),
|
bias = self.add_weight(shape=(self.vocab_size,),
|
||||||
initializer='zeros',
|
initializer='zeros',
|
||||||
trainable=True,
|
trainable=True,
|
||||||
name='out_layers_._{}_._bias'.format(i))
|
name='out_layers_._{}_._bias'.format(i))
|
||||||
@@ -114,7 +114,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
|||||||
hidden, target = inputs
|
hidden, target = inputs
|
||||||
head_logprob = 0
|
head_logprob = 0
|
||||||
if self.n_clusters == 0:
|
if self.n_clusters == 0:
|
||||||
softmax_b = tf.get_variable('bias', [n_token], initializer=tf.zeros_initializer())
|
softmax_b = tf.get_variable('bias', [self.config.vocab_size], initializer=tf.zeros_initializer())
|
||||||
output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
|
output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
|
||||||
if target is not None:
|
if target is not None:
|
||||||
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
|
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
|
||||||
|
|||||||
@@ -366,7 +366,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
|
|||||||
self.use_bfloat16 = config.use_bfloat16
|
self.use_bfloat16 = config.use_bfloat16
|
||||||
self.initializer_range = config.initializer_range
|
self.initializer_range = config.initializer_range
|
||||||
|
|
||||||
self.word_embedding = TFSharedEmbeddings(config.n_token, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
|
self.word_embedding = TFSharedEmbeddings(config.vocab_size, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
|
||||||
self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
|
self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
|
||||||
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
||||||
|
|
||||||
|
|||||||
@@ -592,14 +592,14 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
|||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
|
|
||||||
self.n_token = config.n_token
|
self.n_token = config.vocab_size
|
||||||
|
|
||||||
self.d_embed = config.d_embed
|
self.d_embed = config.d_embed
|
||||||
self.d_model = config.d_model
|
self.d_model = config.d_model
|
||||||
self.n_head = config.n_head
|
self.n_head = config.n_head
|
||||||
self.d_head = config.d_head
|
self.d_head = config.d_head
|
||||||
|
|
||||||
self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
|
self.word_emb = AdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs,
|
||||||
div_val=config.div_val)
|
div_val=config.div_val)
|
||||||
|
|
||||||
self.drop = nn.Dropout(config.dropout)
|
self.drop = nn.Dropout(config.dropout)
|
||||||
@@ -836,11 +836,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
|
|||||||
self.sample_softmax = config.sample_softmax
|
self.sample_softmax = config.sample_softmax
|
||||||
# use sampled softmax
|
# use sampled softmax
|
||||||
if config.sample_softmax > 0:
|
if config.sample_softmax > 0:
|
||||||
self.out_layer = nn.Linear(config.d_model, config.n_token)
|
self.out_layer = nn.Linear(config.d_model, config.vocab_size)
|
||||||
self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)
|
self.sampler = LogUniformSampler(config.vocab_size, config.sample_softmax)
|
||||||
# use adaptive softmax (including standard softmax)
|
# use adaptive softmax (including standard softmax)
|
||||||
else:
|
else:
|
||||||
self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model,
|
self.crit = ProjectedAdaptiveLogSoftmax(config.vocab_size, config.d_embed, config.d_model,
|
||||||
config.cutoffs, div_val=config.div_val)
|
config.cutoffs, div_val=config.div_val)
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
|
|||||||
@@ -609,7 +609,7 @@ class XLNetModel(XLNetPreTrainedModel):
|
|||||||
self.clamp_len = config.clamp_len
|
self.clamp_len = config.clamp_len
|
||||||
self.n_layer = config.n_layer
|
self.n_layer = config.n_layer
|
||||||
|
|
||||||
self.word_embedding = nn.Embedding(config.n_token, config.d_model)
|
self.word_embedding = nn.Embedding(config.vocab_size, config.d_model)
|
||||||
self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model))
|
self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model))
|
||||||
self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
|
self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
|
||||||
self.dropout = nn.Dropout(config.dropout)
|
self.dropout = nn.Dropout(config.dropout)
|
||||||
@@ -940,7 +940,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
|||||||
self.same_length = config.same_length
|
self.same_length = config.same_length
|
||||||
|
|
||||||
self.transformer = XLNetModel(config)
|
self.transformer = XLNetModel(config)
|
||||||
self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
|
self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True)
|
||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
|
|||||||
@@ -110,7 +110,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = AlbertConfig(
|
config = AlbertConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -109,7 +109,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = BertConfig(
|
config = BertConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -633,7 +633,7 @@ class CommonTestCases:
|
|||||||
mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
|
mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
|
||||||
|
|
||||||
config = self.config_class(
|
config = self.config_class(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_positions=self.n_positions,
|
n_positions=self.n_positions,
|
||||||
n_embd=self.hidden_size,
|
n_embd=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = CTRLConfig(
|
config = CTRLConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_embd=self.hidden_size,
|
n_embd=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
|
|||||||
@@ -105,7 +105,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = DistilBertConfig(
|
config = DistilBertConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
dim=self.hidden_size,
|
dim=self.hidden_size,
|
||||||
n_layers=self.num_hidden_layers,
|
n_layers=self.num_hidden_layers,
|
||||||
n_heads=self.num_attention_heads,
|
n_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -110,7 +110,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = GPT2Config(
|
config = GPT2Config(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_embd=self.hidden_size,
|
n_embd=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = OpenAIGPTConfig(
|
config = OpenAIGPTConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_embd=self.hidden_size,
|
n_embd=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
|
|||||||
@@ -106,7 +106,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = RobertaConfig(
|
config = RobertaConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -118,7 +118,7 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = AlbertConfig(
|
config = AlbertConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = BertConfig(
|
config = BertConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -112,7 +112,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = CTRLConfig(
|
config = CTRLConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_embd=self.hidden_size,
|
n_embd=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
|
|||||||
@@ -107,7 +107,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = DistilBertConfig(
|
config = DistilBertConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
dim=self.hidden_size,
|
dim=self.hidden_size,
|
||||||
n_layers=self.num_hidden_layers,
|
n_layers=self.num_hidden_layers,
|
||||||
n_heads=self.num_attention_heads,
|
n_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -115,7 +115,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = GPT2Config(
|
config = GPT2Config(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_embd=self.hidden_size,
|
n_embd=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = OpenAIGPTConfig(
|
config = OpenAIGPTConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_embd=self.hidden_size,
|
n_embd=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
|
|||||||
@@ -109,7 +109,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = RobertaConfig(
|
config = RobertaConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -92,7 +92,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||||
|
|
||||||
config = TransfoXLConfig(
|
config = TransfoXLConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
mem_len=self.mem_len,
|
mem_len=self.mem_len,
|
||||||
clamp_len=self.clamp_len,
|
clamp_len=self.clamp_len,
|
||||||
cutoffs=self.cutoffs,
|
cutoffs=self.cutoffs,
|
||||||
|
|||||||
@@ -125,7 +125,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
|
is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
|
||||||
|
|
||||||
config = XLMConfig(
|
config = XLMConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_special=self.n_special,
|
n_special=self.n_special,
|
||||||
emb_dim=self.hidden_size,
|
emb_dim=self.hidden_size,
|
||||||
n_layers=self.num_hidden_layers,
|
n_layers=self.num_hidden_layers,
|
||||||
|
|||||||
@@ -64,7 +64,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
num_attention_heads=4,
|
num_attention_heads=4,
|
||||||
d_inner=128,
|
d_inner=128,
|
||||||
num_hidden_layers=5,
|
num_hidden_layers=5,
|
||||||
max_position_embeddings=10,
|
|
||||||
type_sequence_label_size=2,
|
type_sequence_label_size=2,
|
||||||
untie_r=True,
|
untie_r=True,
|
||||||
bi_data=False,
|
bi_data=False,
|
||||||
@@ -88,7 +87,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
self.num_attention_heads = num_attention_heads
|
self.num_attention_heads = num_attention_heads
|
||||||
self.d_inner = d_inner
|
self.d_inner = d_inner
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.num_hidden_layers = num_hidden_layers
|
||||||
self.max_position_embeddings = max_position_embeddings
|
|
||||||
self.bi_data = bi_data
|
self.bi_data = bi_data
|
||||||
self.untie_r = untie_r
|
self.untie_r = untie_r
|
||||||
self.same_length = same_length
|
self.same_length = same_length
|
||||||
@@ -122,13 +120,12 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
|
is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
|
||||||
|
|
||||||
config = XLNetConfig(
|
config = XLNetConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
d_model=self.hidden_size,
|
d_model=self.hidden_size,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
d_inner=self.d_inner,
|
d_inner=self.d_inner,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
untie_r=self.untie_r,
|
untie_r=self.untie_r,
|
||||||
max_position_embeddings=self.max_position_embeddings,
|
|
||||||
mem_len=self.mem_len,
|
mem_len=self.mem_len,
|
||||||
clamp_len=self.clamp_len,
|
clamp_len=self.clamp_len,
|
||||||
same_length=self.same_length,
|
same_length=self.same_length,
|
||||||
|
|||||||
@@ -91,7 +91,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
|||||||
lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||||
|
|
||||||
config = TransfoXLConfig(
|
config = TransfoXLConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
mem_len=self.mem_len,
|
mem_len=self.mem_len,
|
||||||
clamp_len=self.clamp_len,
|
clamp_len=self.clamp_len,
|
||||||
cutoffs=self.cutoffs,
|
cutoffs=self.cutoffs,
|
||||||
|
|||||||
@@ -121,7 +121,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
|
|||||||
is_impossible_labels = ids_tensor([self.batch_size], 2).float()
|
is_impossible_labels = ids_tensor([self.batch_size], 2).float()
|
||||||
|
|
||||||
config = XLMConfig(
|
config = XLMConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_special=self.n_special,
|
n_special=self.n_special,
|
||||||
emb_dim=self.hidden_size,
|
emb_dim=self.hidden_size,
|
||||||
n_layers=self.num_hidden_layers,
|
n_layers=self.num_hidden_layers,
|
||||||
|
|||||||
@@ -60,7 +60,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
|||||||
num_attention_heads=4,
|
num_attention_heads=4,
|
||||||
d_inner=128,
|
d_inner=128,
|
||||||
num_hidden_layers=5,
|
num_hidden_layers=5,
|
||||||
max_position_embeddings=10,
|
|
||||||
type_sequence_label_size=2,
|
type_sequence_label_size=2,
|
||||||
untie_r=True,
|
untie_r=True,
|
||||||
bi_data=False,
|
bi_data=False,
|
||||||
@@ -84,7 +83,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
|||||||
self.num_attention_heads = num_attention_heads
|
self.num_attention_heads = num_attention_heads
|
||||||
self.d_inner = d_inner
|
self.d_inner = d_inner
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.num_hidden_layers = num_hidden_layers
|
||||||
self.max_position_embeddings = max_position_embeddings
|
|
||||||
self.bi_data = bi_data
|
self.bi_data = bi_data
|
||||||
self.untie_r = untie_r
|
self.untie_r = untie_r
|
||||||
self.same_length = same_length
|
self.same_length = same_length
|
||||||
@@ -116,13 +114,12 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
|||||||
token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||||
|
|
||||||
config = XLNetConfig(
|
config = XLNetConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
d_model=self.hidden_size,
|
d_model=self.hidden_size,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
d_inner=self.d_inner,
|
d_inner=self.d_inner,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
untie_r=self.untie_r,
|
untie_r=self.untie_r,
|
||||||
max_position_embeddings=self.max_position_embeddings,
|
|
||||||
mem_len=self.mem_len,
|
mem_len=self.mem_len,
|
||||||
clamp_len=self.clamp_len,
|
clamp_len=self.clamp_len,
|
||||||
same_length=self.same_length,
|
same_length=self.same_length,
|
||||||
|
|||||||
Reference in New Issue
Block a user