Merge branch 'master' into from_scratch_training
This commit is contained in:
@@ -1,7 +1,7 @@
|
|||||||
ALBERT
|
ALBERT
|
||||||
----------------------------------------------------
|
----------------------------------------------------
|
||||||
|
|
||||||
``AlbrtConfig``
|
``AlbertConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: transformers.AlbertConfig
|
.. autoclass:: transformers.AlbertConfig
|
||||||
|
|||||||
@@ -34,6 +34,13 @@ XLM
|
|||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``XLMForQuestionAnsweringSimple``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.XLMForQuestionAnsweringSimple
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLMForQuestionAnswering``
|
``XLMForQuestionAnswering``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
|||||||
@@ -36,6 +36,27 @@ XLNet
|
|||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``XLNetForTokenClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.XLNetForTokenClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``XLNetForMultipleChoice``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.XLNetForMultipleChoice
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``XLNetForQuestionAnsweringSimple``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.XLNetForQuestionAnsweringSimple
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLNetForQuestionAnswering``
|
``XLNetForQuestionAnswering``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
|||||||
@@ -42,6 +42,7 @@ class LmSeqsDataset(Dataset):
|
|||||||
self.check()
|
self.check()
|
||||||
self.remove_long_sequences()
|
self.remove_long_sequences()
|
||||||
self.remove_empty_sequences()
|
self.remove_empty_sequences()
|
||||||
|
self.remove_unknown_sequences()
|
||||||
self.check()
|
self.check()
|
||||||
self.print_statistics()
|
self.print_statistics()
|
||||||
|
|
||||||
@@ -109,6 +110,22 @@ class LmSeqsDataset(Dataset):
|
|||||||
new_size = len(self)
|
new_size = len(self)
|
||||||
logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")
|
logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")
|
||||||
|
|
||||||
|
def remove_unknown_sequences(self):
|
||||||
|
"""
|
||||||
|
Remove sequences with a (too) high level of unknown tokens.
|
||||||
|
"""
|
||||||
|
if "unk_token" not in self.params.special_tok_ids:
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
unk_token_id = self.params.special_tok_ids["unk_token"]
|
||||||
|
init_size = len(self)
|
||||||
|
unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in self.token_ids])
|
||||||
|
indices = (unk_occs / self.lengths) < 0.5
|
||||||
|
self.token_ids = self.token_ids[indices]
|
||||||
|
self.lengths = self.lengths[indices]
|
||||||
|
new_size = len(self)
|
||||||
|
logger.info(f"Remove {init_size - new_size} sequences with a high level of unknown tokens (50%).")
|
||||||
|
|
||||||
def print_statistics(self):
|
def print_statistics(self):
|
||||||
"""
|
"""
|
||||||
Print some statistics on the corpus. Only the master process.
|
Print some statistics on the corpus. Only the master process.
|
||||||
|
|||||||
@@ -0,0 +1,15 @@
|
|||||||
|
{
|
||||||
|
"activation": "gelu",
|
||||||
|
"attention_dropout": 0.1,
|
||||||
|
"dim": 768,
|
||||||
|
"dropout": 0.1,
|
||||||
|
"hidden_dim": 3072,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"max_position_embeddings": 512,
|
||||||
|
"n_heads": 12,
|
||||||
|
"n_layers": 6,
|
||||||
|
"sinusoidal_pos_embds": true,
|
||||||
|
"tie_weights_": true,
|
||||||
|
"vocab_size": 119547
|
||||||
|
}
|
||||||
|
|
||||||
@@ -0,0 +1,14 @@
|
|||||||
|
{
|
||||||
|
"vocab_size": 50265,
|
||||||
|
"hidden_size": 768,
|
||||||
|
"num_hidden_layers": 6,
|
||||||
|
"num_attention_heads": 12,
|
||||||
|
"intermediate_size": 3072,
|
||||||
|
"hidden_act": "gelu",
|
||||||
|
"hidden_dropout_prob": 0.1,
|
||||||
|
"attention_probs_dropout_prob": 0.1,
|
||||||
|
"max_position_embeddings": 514,
|
||||||
|
"type_vocab_size": 1,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"layer_norm_eps": 0.00001
|
||||||
|
}
|
||||||
@@ -344,6 +344,7 @@ def full_text_generation(
|
|||||||
gamma=1.5,
|
gamma=1.5,
|
||||||
gm_scale=0.9,
|
gm_scale=0.9,
|
||||||
kl_scale=0.01,
|
kl_scale=0.01,
|
||||||
|
repetition_penalty=1.0,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
classifier, class_id = get_classifier(discrim, class_label, device)
|
classifier, class_id = get_classifier(discrim, class_label, device)
|
||||||
@@ -368,7 +369,14 @@ def full_text_generation(
|
|||||||
raise Exception("Specify either a bag of words or a discriminator")
|
raise Exception("Specify either a bag of words or a discriminator")
|
||||||
|
|
||||||
unpert_gen_tok_text, _, _ = generate_text_pplm(
|
unpert_gen_tok_text, _, _ = generate_text_pplm(
|
||||||
model=model, tokenizer=tokenizer, context=context, device=device, length=length, sample=sample, perturb=False
|
model=model,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
context=context,
|
||||||
|
device=device,
|
||||||
|
length=length,
|
||||||
|
sample=sample,
|
||||||
|
perturb=False,
|
||||||
|
repetition_penalty=repetition_penalty,
|
||||||
)
|
)
|
||||||
if device == "cuda":
|
if device == "cuda":
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
@@ -401,6 +409,7 @@ def full_text_generation(
|
|||||||
gamma=gamma,
|
gamma=gamma,
|
||||||
gm_scale=gm_scale,
|
gm_scale=gm_scale,
|
||||||
kl_scale=kl_scale,
|
kl_scale=kl_scale,
|
||||||
|
repetition_penalty=repetition_penalty,
|
||||||
)
|
)
|
||||||
pert_gen_tok_texts.append(pert_gen_tok_text)
|
pert_gen_tok_texts.append(pert_gen_tok_text)
|
||||||
if classifier is not None:
|
if classifier is not None:
|
||||||
@@ -437,6 +446,7 @@ def generate_text_pplm(
|
|||||||
gamma=1.5,
|
gamma=1.5,
|
||||||
gm_scale=0.9,
|
gm_scale=0.9,
|
||||||
kl_scale=0.01,
|
kl_scale=0.01,
|
||||||
|
repetition_penalty=1.0,
|
||||||
):
|
):
|
||||||
output_so_far = None
|
output_so_far = None
|
||||||
if context:
|
if context:
|
||||||
@@ -508,6 +518,13 @@ def generate_text_pplm(
|
|||||||
|
|
||||||
pert_logits, past, pert_all_hidden = model(last, past=pert_past)
|
pert_logits, past, pert_all_hidden = model(last, past=pert_past)
|
||||||
pert_logits = pert_logits[:, -1, :] / temperature # + SMALL_CONST
|
pert_logits = pert_logits[:, -1, :] / temperature # + SMALL_CONST
|
||||||
|
|
||||||
|
for token_idx in set(output_so_far[0].tolist()):
|
||||||
|
if pert_logits[0, token_idx] < 0:
|
||||||
|
pert_logits[0, token_idx] *= repetition_penalty
|
||||||
|
else:
|
||||||
|
pert_logits[0, token_idx] /= repetition_penalty
|
||||||
|
|
||||||
pert_probs = F.softmax(pert_logits, dim=-1)
|
pert_probs = F.softmax(pert_logits, dim=-1)
|
||||||
|
|
||||||
if classifier is not None:
|
if classifier is not None:
|
||||||
@@ -588,6 +605,7 @@ def run_pplm_example(
|
|||||||
seed=0,
|
seed=0,
|
||||||
no_cuda=False,
|
no_cuda=False,
|
||||||
colorama=False,
|
colorama=False,
|
||||||
|
repetition_penalty=1.0,
|
||||||
):
|
):
|
||||||
# set Random seed
|
# set Random seed
|
||||||
torch.manual_seed(seed)
|
torch.manual_seed(seed)
|
||||||
@@ -655,6 +673,7 @@ def run_pplm_example(
|
|||||||
gamma=gamma,
|
gamma=gamma,
|
||||||
gm_scale=gm_scale,
|
gm_scale=gm_scale,
|
||||||
kl_scale=kl_scale,
|
kl_scale=kl_scale,
|
||||||
|
repetition_penalty=repetition_penalty,
|
||||||
)
|
)
|
||||||
|
|
||||||
# untokenize unperturbed text
|
# untokenize unperturbed text
|
||||||
@@ -767,6 +786,9 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument("--seed", type=int, default=0)
|
parser.add_argument("--seed", type=int, default=0)
|
||||||
parser.add_argument("--no_cuda", action="store_true", help="no cuda")
|
parser.add_argument("--no_cuda", action="store_true", help="no cuda")
|
||||||
parser.add_argument("--colorama", action="store_true", help="colors keywords")
|
parser.add_argument("--colorama", action="store_true", help="colors keywords")
|
||||||
|
parser.add_argument(
|
||||||
|
"--repetition_penalty", type=float, default=1.0, help="Penalize repetition. More than 1.0 -> less repetition",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
run_pplm_example(**vars(args))
|
run_pplm_example(**vars(args))
|
||||||
|
|||||||
@@ -31,9 +31,73 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||||||
|
|
||||||
|
|
||||||
class AlbertConfig(PretrainedConfig):
|
class AlbertConfig(PretrainedConfig):
|
||||||
"""Configuration for `AlbertModel`.
|
r"""
|
||||||
|
This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`.
|
||||||
|
It is used to instantiate an ALBERT model according to the specified arguments, defining the model
|
||||||
|
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||||
|
the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
|
||||||
|
|
||||||
The default settings match the configuration of model `albert_xxlarge`.
|
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||||
|
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||||
|
for more information.
|
||||||
|
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_size (:obj:`int`, optional, defaults to 30000):
|
||||||
|
Vocabulary size of the ALBERT model. Defines the different tokens that
|
||||||
|
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
|
||||||
|
embedding_size (:obj:`int`, optional, defaults to 128):
|
||||||
|
Dimensionality of vocabulary embeddings.
|
||||||
|
hidden_size (:obj:`int`, optional, defaults to 4096):
|
||||||
|
Dimensionality of the encoder layers and the pooler layer.
|
||||||
|
num_hidden_layers (:obj:`int`, optional, defaults to 12):
|
||||||
|
Number of hidden layers in the Transformer encoder.
|
||||||
|
num_hidden_groups (:obj:`int`, optional, defaults to 1):
|
||||||
|
Number of groups for the hidden layers, parameters in the same group are shared.
|
||||||
|
num_attention_heads (:obj:`int`, optional, defaults to 64):
|
||||||
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
|
intermediate_size (:obj:`int`, optional, defaults to 16384):
|
||||||
|
The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||||
|
inner_group_num (:obj:`int`, optional, defaults to 1):
|
||||||
|
The number of inner repetition of attention and ffn.
|
||||||
|
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
|
||||||
|
The non-linear activation function (function or string) in the encoder and pooler.
|
||||||
|
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||||
|
hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
|
||||||
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
|
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
|
||||||
|
The dropout ratio for the attention probabilities.
|
||||||
|
max_position_embeddings (:obj:`int`, optional, defaults to 512):
|
||||||
|
The maximum sequence length that this model might ever be used with. Typically set this to something
|
||||||
|
large (e.g., 512 or 1024 or 2048).
|
||||||
|
type_vocab_size (:obj:`int`, optional, defaults to 2):
|
||||||
|
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
|
||||||
|
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||||
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
|
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
|
||||||
|
The epsilon used by the layer normalization layers.
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
# Initializing an ALBERT-xxlarge style configuration
|
||||||
|
albert_xxlarge_configuration = AlbertConfig()
|
||||||
|
|
||||||
|
# Initializing an ALBERT-base style configuration
|
||||||
|
albert_base_configuration = AlbertConfig(
|
||||||
|
hidden_size=768,
|
||||||
|
num_attention_heads=12,
|
||||||
|
intermediate_size=3072,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Initializing a model from the ALBERT-base style configuration
|
||||||
|
model = AlbertModel(albert_base_configuration)
|
||||||
|
|
||||||
|
# Accessing the model configuration
|
||||||
|
configuration = model.config
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
pretrained_config_archive_map (Dict[str, str]):
|
||||||
|
A dictionary containing all the available pre-trained checkpoints.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
@@ -58,35 +122,6 @@ class AlbertConfig(PretrainedConfig):
|
|||||||
layer_norm_eps=1e-12,
|
layer_norm_eps=1e-12,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
"""Constructs AlbertConfig.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`.
|
|
||||||
embedding_size: size of voc embeddings.
|
|
||||||
hidden_size: Size of the encoder layers and the pooler layer.
|
|
||||||
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
|
||||||
num_hidden_groups: Number of group for the hidden layers, parameters in
|
|
||||||
the same group are shared.
|
|
||||||
num_attention_heads: Number of attention heads for each attention layer in
|
|
||||||
the Transformer encoder.
|
|
||||||
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
|
|
||||||
layer in the Transformer encoder.
|
|
||||||
inner_group_num: int, number of inner repetition of attention and ffn.
|
|
||||||
down_scale_factor: float, the scale to apply
|
|
||||||
hidden_act: The non-linear activation function (function or string) in the
|
|
||||||
encoder and pooler.
|
|
||||||
hidden_dropout_prob: The dropout probability for all fully connected
|
|
||||||
layers in the embeddings, encoder, and pooler.
|
|
||||||
attention_probs_dropout_prob: The dropout ratio for the attention
|
|
||||||
probabilities.
|
|
||||||
max_position_embeddings: The maximum sequence length that this model might
|
|
||||||
ever be used with. Typically set this to something large just in case
|
|
||||||
(e.g., 512 or 1024 or 2048).
|
|
||||||
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
|
|
||||||
`AlbertModel`.
|
|
||||||
initializer_range: The stdev of the truncated_normal_initializer for
|
|
||||||
initializing all weight matrices.
|
|
||||||
"""
|
|
||||||
super(AlbertConfig, self).__init__(**kwargs)
|
super(AlbertConfig, self).__init__(**kwargs)
|
||||||
|
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
|
|||||||
@@ -77,32 +77,15 @@ CONFIG_MAPPING = OrderedDict(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class AutoConfig:
|
class AutoConfig(object):
|
||||||
r""":class:`~transformers.AutoConfig` is a generic configuration class
|
r"""
|
||||||
|
:class:`~transformers.AutoConfig` is a generic configuration class
|
||||||
that will be instantiated as one of the configuration classes of the library
|
that will be instantiated as one of the configuration classes of the library
|
||||||
when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
|
when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
|
||||||
class method.
|
|
||||||
|
|
||||||
The `from_pretrained()` method take care of returning the correct model class instance
|
The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
|
||||||
based on the `model_type` property of the config object, or when it's missing,
|
based on the `model_type` property of the config object, or when it's missing,
|
||||||
falling back to using pattern matching on the `pretrained_model_name_or_path` string.
|
falling back to using pattern matching on the `pretrained_model_name_or_path` string.
|
||||||
|
|
||||||
When using string matching, the configuration class is matched on
|
|
||||||
the `pretrained_model_name_or_path` string in the following order:
|
|
||||||
- contains `t5`: T5Config (T5 model)
|
|
||||||
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
|
||||||
- contains `albert`: AlbertConfig (ALBERT model)
|
|
||||||
- contains `camembert`: CamembertConfig (CamemBERT model)
|
|
||||||
- contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
|
|
||||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
|
||||||
- contains `bert`: BertConfig (Bert model)
|
|
||||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
|
||||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
|
||||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
|
||||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
|
||||||
- contains `xlm`: XLMConfig (XLM model)
|
|
||||||
- contains `ctrl` : CTRLConfig (CTRL model)
|
|
||||||
This class cannot be instantiated using `__init__()` (throw an error).
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@@ -124,60 +107,61 @@ class AutoConfig:
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
||||||
r""" Instantiate one of the configuration classes of the library
|
r""" Instantiates one of the configuration classes of the library
|
||||||
from a pre-trained model configuration.
|
from a pre-trained model configuration.
|
||||||
|
|
||||||
The configuration class to instantiate is selected
|
The configuration class to instantiate is selected
|
||||||
based on the `model_type` property of the config object, or when it's missing,
|
based on the `model_type` property of the config object, or when it's missing,
|
||||||
falling back to using pattern matching on the `pretrained_model_name_or_path` string.
|
falling back to using pattern matching on the `pretrained_model_name_or_path` string.
|
||||||
- contains `t5`: T5Config (T5 model)
|
- contains `t5`: :class:`~transformers.T5Config` (T5 model)
|
||||||
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
- contains `distilbert`: :class:`~transformers.DistilBertConfig` (DistilBERT model)
|
||||||
- contains `albert`: AlbertConfig (ALBERT model)
|
- contains `albert`: :class:`~transformers.AlbertConfig` (ALBERT model)
|
||||||
- contains `camembert`: CamembertConfig (CamemBERT model)
|
- contains `camembert`: :class:`~transformers.CamembertConfig` (CamemBERT model)
|
||||||
- contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
|
- contains `xlm-roberta`: :class:`~transformers.XLMRobertaConfig` (XLM-RoBERTa model)
|
||||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
- contains `roberta`: :class:`~transformers.RobertaConfig` (RoBERTa model)
|
||||||
- contains `bert`: BertConfig (Bert model)
|
- contains `bert`: :class:`~transformers.BertConfig` (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
- contains `openai-gpt`: :class:`~transformers.OpenAIGPTConfig` (OpenAI GPT model)
|
||||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
- contains `gpt2`: :class:`~transformers.GPT2Config` (OpenAI GPT-2 model)
|
||||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
- contains `transfo-xl`: :class:`~transformers.TransfoXLConfig` (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
- contains `xlnet`: :class:`~transformers.XLNetConfig` (XLNet model)
|
||||||
- contains `xlm`: XLMConfig (XLM model)
|
- contains `xlm`: :class:`~transformers.XLMConfig` (XLM model)
|
||||||
- contains `ctrl` : CTRLConfig (CTRL model)
|
- contains `ctrl` : :class:`~transformers.CTRLConfig` (CTRL model)
|
||||||
Params:
|
|
||||||
pretrained_model_name_or_path: either:
|
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
|
||||||
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
|
||||||
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
|
||||||
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
|
||||||
|
|
||||||
cache_dir: (`optional`) string:
|
Args:
|
||||||
|
pretrained_model_name_or_path (:obj:`string`):
|
||||||
|
Is either: \
|
||||||
|
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||||
|
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||||
|
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
||||||
|
|
||||||
|
cache_dir (:obj:`string`, optional, defaults to `None`):
|
||||||
Path to a directory in which a downloaded pre-trained model
|
Path to a directory in which a downloaded pre-trained model
|
||||||
configuration should be cached if the standard cache should not be used.
|
configuration should be cached if the standard cache should not be used.
|
||||||
|
|
||||||
kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
|
force_download (:obj:`boolean`, optional, defaults to `False`):
|
||||||
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
|
||||||
|
|
||||||
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
|
resume_download (:obj:`boolean`, optional, defaults to `False`):
|
||||||
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
|
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
force_download: (`optional`) boolean, default False:
|
proxies (:obj:`Dict[str, str]`, optional, defaults to `None`):
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`.
|
||||||
|
The proxies are used on each request. See `the requests documentation <https://requests.readthedocs.io/en/master/user/advanced/#proxies>`__ for usage.
|
||||||
resume_download: (`optional`) boolean, default False:
|
|
||||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
|
||||||
The proxies are used on each request.
|
|
||||||
|
|
||||||
return_unused_kwargs: (`optional`) bool:
|
|
||||||
|
|
||||||
|
return_unused_kwargs (:obj:`boolean`, optional, defaults to `False`):
|
||||||
- If False, then this function returns just the final configuration object.
|
- If False, then this function returns just the final configuration object.
|
||||||
- If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
|
- If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
|
||||||
|
|
||||||
|
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): key/value pairs with which to update the configuration object after loading.
|
||||||
|
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
|
||||||
|
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
|
||||||
|
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
|
config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
|
||||||
config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
|
config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
|
||||||
config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
|
config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
|
||||||
config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
|
config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
|
||||||
|
|||||||
@@ -50,32 +50,61 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||||||
|
|
||||||
class BertConfig(PretrainedConfig):
|
class BertConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
:class:`~transformers.BertConfig` is the configuration class to store the configuration of a
|
This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
|
||||||
`BertModel`.
|
It is used to instantiate a BERT model according to the specified arguments, defining the model
|
||||||
|
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||||
|
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
|
||||||
|
|
||||||
|
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||||
|
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||||
|
for more information.
|
||||||
|
|
||||||
|
|
||||||
Arguments:
|
Args:
|
||||||
vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
|
vocab_size (:obj:`int`, optional, defaults to 30522):
|
||||||
hidden_size: Size of the encoder layers and the pooler layer.
|
Vocabulary size of the BERT model. Defines the different tokens that
|
||||||
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
|
||||||
num_attention_heads: Number of attention heads for each attention layer in
|
hidden_size (:obj:`int`, optional, defaults to 768):
|
||||||
the Transformer encoder.
|
Dimensionality of the encoder layers and the pooler layer.
|
||||||
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
|
num_hidden_layers (:obj:`int`, optional, defaults to 12):
|
||||||
layer in the Transformer encoder.
|
Number of hidden layers in the Transformer encoder.
|
||||||
hidden_act: The non-linear activation function (function or string) in the
|
num_attention_heads (:obj:`int`, optional, defaults to 12):
|
||||||
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
hidden_dropout_prob: The dropout probabilitiy for all fully connected
|
intermediate_size (:obj:`int`, optional, defaults to 3072):
|
||||||
layers in the embeddings, encoder, and pooler.
|
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||||
attention_probs_dropout_prob: The dropout ratio for the attention
|
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
|
||||||
probabilities.
|
The non-linear activation function (function or string) in the encoder and pooler.
|
||||||
max_position_embeddings: The maximum sequence length that this model might
|
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||||
ever be used with. Typically set this to something large just in case
|
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
|
||||||
(e.g., 512 or 1024 or 2048).
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
|
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
|
||||||
`BertModel`.
|
The dropout ratio for the attention probabilities.
|
||||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
max_position_embeddings (:obj:`int`, optional, defaults to 512):
|
||||||
initializing all weight matrices.
|
The maximum sequence length that this model might ever be used with.
|
||||||
layer_norm_eps: The epsilon used by LayerNorm.
|
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
|
||||||
|
type_vocab_size (:obj:`int`, optional, defaults to 2):
|
||||||
|
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
|
||||||
|
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||||
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
|
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
|
||||||
|
The epsilon used by the layer normalization layers.
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
from transformers import BertModel, BertConfig
|
||||||
|
|
||||||
|
# Initializing a BERT bert-base-uncased style configuration
|
||||||
|
configuration = BertConfig()
|
||||||
|
|
||||||
|
# Initializing a model from the bert-base-uncased style configuration
|
||||||
|
model = BertModel(configuration)
|
||||||
|
|
||||||
|
# Accessing the model configuration
|
||||||
|
configuration = model.config
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
pretrained_config_archive_map (Dict[str, str]):
|
||||||
|
A dictionary containing all the available pre-trained checkpoints.
|
||||||
"""
|
"""
|
||||||
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
model_type = "bert"
|
model_type = "bert"
|
||||||
@@ -97,6 +126,7 @@ class BertConfig(PretrainedConfig):
|
|||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super(BertConfig, self).__init__(**kwargs)
|
super(BertConfig, self).__init__(**kwargs)
|
||||||
|
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.hidden_size = hidden_size
|
self.hidden_size = hidden_size
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.num_hidden_layers = num_hidden_layers
|
||||||
|
|||||||
@@ -29,5 +29,35 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||||||
|
|
||||||
|
|
||||||
class CamembertConfig(RobertaConfig):
|
class CamembertConfig(RobertaConfig):
|
||||||
|
r"""
|
||||||
|
This is the configuration class to store the configuration of an :class:`~transformers.CamembertModel`.
|
||||||
|
It is used to instantiate a Camembert model according to the specified arguments, defining the model
|
||||||
|
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||||
|
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
|
||||||
|
|
||||||
|
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||||
|
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||||
|
for more information.
|
||||||
|
|
||||||
|
The :class:`~transformers.CamembertConfig` class directly inherits :class:`~transformers.RobertaConfig`.
|
||||||
|
It reuses the same defaults. Please check the parent class for more information.
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
from transformers import CamembertModel, CamembertConfig
|
||||||
|
|
||||||
|
# Initializing a CamemBERT configuration
|
||||||
|
configuration = CamembertConfig()
|
||||||
|
|
||||||
|
# Initializing a model from the configuration
|
||||||
|
model = CamembertModel(configuration)
|
||||||
|
|
||||||
|
# Accessing the model configuration
|
||||||
|
configuration = model.config
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
pretrained_config_archive_map (Dict[str, str]):
|
||||||
|
A dictionary containing all the available pre-trained checkpoints.
|
||||||
|
"""
|
||||||
pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
model_type = "camembert"
|
model_type = "camembert"
|
||||||
|
|||||||
@@ -26,25 +26,60 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf
|
|||||||
|
|
||||||
|
|
||||||
class CTRLConfig(PretrainedConfig):
|
class CTRLConfig(PretrainedConfig):
|
||||||
"""Configuration class to store the configuration of a `CTRLModel`.
|
"""
|
||||||
|
This is the configuration class to store the configuration of an :class:`~transformers.CTRLModel`.
|
||||||
|
It is used to instantiate a CTRL model according to the specified arguments, defining the model
|
||||||
|
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||||
|
the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
|
||||||
|
|
||||||
Args:
|
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||||
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||||
n_positions: Number of positional embeddings.
|
for more information.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
|
||||||
dff: Size of the inner dimension of the FFN.
|
Args:
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
vocab_size (:obj:`int`, optional, defaults to 246534):
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
Vocabulary size of the CTRL model. Defines the different tokens that
|
||||||
n_head: Number of attention heads for each attention layer in
|
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
|
||||||
the Transformer encoder.
|
n_positions (:obj:`int`, optional, defaults to 256):
|
||||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
The maximum sequence length that this model might ever be used with.
|
||||||
resid_pdrop: The dropout probabilitiy for all fully connected
|
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
|
||||||
layers in the embeddings, encoder, and pooler.
|
n_ctx (:obj:`int`, optional, defaults to 256):
|
||||||
attn_pdrop: The dropout ratio for the attention
|
Dimensionality of the causal mask (usually same as n_positions).
|
||||||
probabilities.
|
n_embd (:obj:`int`, optional, defaults to 1280):
|
||||||
embd_pdrop: The dropout ratio for the embeddings.
|
Dimensionality of the embeddings and hidden states.
|
||||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
dff (:obj:`int`, optional, defaults to 8192):
|
||||||
initializing all weight matrices.
|
Dimensionality of the inner dimension of the FFN.
|
||||||
|
n_layer (:obj:`int`, optional, defaults to 48):
|
||||||
|
Number of hidden layers in the Transformer encoder.
|
||||||
|
n_head (:obj:`int`, optional, defaults to 16):
|
||||||
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
|
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
|
embd_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout ratio for the embeddings.
|
||||||
|
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout ratio for the attention.
|
||||||
|
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
|
||||||
|
The epsilon to use in the layer normalization layers
|
||||||
|
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||||
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
from transformers import CTRLModel, CTRLConfig
|
||||||
|
|
||||||
|
# Initializing a CTRL configuration
|
||||||
|
configuration = CTRLConfig()
|
||||||
|
|
||||||
|
# Initializing a model from the configuration
|
||||||
|
model = CTRLModel(configuration)
|
||||||
|
|
||||||
|
# Accessing the model configuration
|
||||||
|
configuration = model.config
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
pretrained_config_archive_map (Dict[str, str]):
|
||||||
|
A dictionary containing all the available pre-trained checkpoints.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
@@ -71,26 +106,6 @@ class CTRLConfig(PretrainedConfig):
|
|||||||
summary_first_dropout=0.1,
|
summary_first_dropout=0.1,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
"""Constructs CTRLConfig.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
|
||||||
n_positions: Number of positional embeddings.
|
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
|
||||||
dff: Size of the inner dimension of the FFN.
|
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
|
||||||
n_head: Number of attention heads for each attention layer in
|
|
||||||
the Transformer encoder.
|
|
||||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
|
||||||
resid_pdrop: The dropout probabilitiy for all fully connected
|
|
||||||
layers in the embeddings, encoder, and pooler.
|
|
||||||
attn_pdrop: The dropout ratio for the attention
|
|
||||||
probabilities.
|
|
||||||
embd_pdrop: The dropout ratio for the embeddings.
|
|
||||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
|
||||||
initializing all weight matrices.
|
|
||||||
"""
|
|
||||||
super(CTRLConfig, self).__init__(**kwargs)
|
super(CTRLConfig, self).__init__(**kwargs)
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.n_ctx = n_ctx
|
self.n_ctx = n_ctx
|
||||||
|
|||||||
@@ -31,6 +31,67 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||||||
|
|
||||||
|
|
||||||
class DistilBertConfig(PretrainedConfig):
|
class DistilBertConfig(PretrainedConfig):
|
||||||
|
r"""
|
||||||
|
This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
|
||||||
|
It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
|
||||||
|
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||||
|
the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
|
||||||
|
|
||||||
|
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||||
|
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||||
|
for more information.
|
||||||
|
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_size (:obj:`int`, optional, defaults to 30522):
|
||||||
|
Vocabulary size of the DistilBERT model. Defines the different tokens that
|
||||||
|
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.DistilBertModel`.
|
||||||
|
max_position_embeddings (:obj:`int`, optional, defaults to 512):
|
||||||
|
The maximum sequence length that this model might ever be used with.
|
||||||
|
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
|
||||||
|
sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
|
||||||
|
Whether to use sinusoidal positional embeddings.
|
||||||
|
n_layers (:obj:`int`, optional, defaults to 6):
|
||||||
|
Number of hidden layers in the Transformer encoder.
|
||||||
|
n_heads (:obj:`int`, optional, defaults to 12):
|
||||||
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
|
dim (:obj:`int`, optional, defaults to 768):
|
||||||
|
Dimensionality of the encoder layers and the pooler layer.
|
||||||
|
intermediate_size (:obj:`int`, optional, defaults to 3072):
|
||||||
|
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||||
|
dropout (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
|
attention_dropout (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout ratio for the attention probabilities.
|
||||||
|
activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
|
||||||
|
The non-linear activation function (function or string) in the encoder and pooler.
|
||||||
|
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||||
|
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||||
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
|
qa_dropout (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout probabilities used in the question answering model
|
||||||
|
:class:`~transformers.DistilBertForQuestionAnswering`.
|
||||||
|
seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
|
||||||
|
The dropout probabilities used in the sequence classification model
|
||||||
|
:class:`~transformers.DistilBertForSequenceClassification`.
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
from transformers import DistilBertModel, DistilBertConfig
|
||||||
|
|
||||||
|
# Initializing a DistilBERT configuration
|
||||||
|
configuration = DistilBertConfig()
|
||||||
|
|
||||||
|
# Initializing a model from the configuration
|
||||||
|
model = DistilBertModel(configuration)
|
||||||
|
|
||||||
|
# Accessing the model configuration
|
||||||
|
configuration = model.config
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
pretrained_config_archive_map (Dict[str, str]):
|
||||||
|
A dictionary containing all the available pre-trained checkpoints.
|
||||||
|
"""
|
||||||
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
model_type = "distilbert"
|
model_type = "distilbert"
|
||||||
|
|
||||||
@@ -47,7 +108,6 @@ class DistilBertConfig(PretrainedConfig):
|
|||||||
attention_dropout=0.1,
|
attention_dropout=0.1,
|
||||||
activation="gelu",
|
activation="gelu",
|
||||||
initializer_range=0.02,
|
initializer_range=0.02,
|
||||||
tie_weights_=True,
|
|
||||||
qa_dropout=0.1,
|
qa_dropout=0.1,
|
||||||
seq_classif_dropout=0.2,
|
seq_classif_dropout=0.2,
|
||||||
**kwargs
|
**kwargs
|
||||||
@@ -64,7 +124,6 @@ class DistilBertConfig(PretrainedConfig):
|
|||||||
self.attention_dropout = attention_dropout
|
self.attention_dropout = attention_dropout
|
||||||
self.activation = activation
|
self.activation = activation
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
self.tie_weights_ = tie_weights_
|
|
||||||
self.qa_dropout = qa_dropout
|
self.qa_dropout = qa_dropout
|
||||||
self.seq_classif_dropout = seq_classif_dropout
|
self.seq_classif_dropout = seq_classif_dropout
|
||||||
|
|
||||||
|
|||||||
@@ -33,24 +33,84 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||||||
|
|
||||||
|
|
||||||
class GPT2Config(PretrainedConfig):
|
class GPT2Config(PretrainedConfig):
|
||||||
"""Configuration class to store the configuration of a `GPT2Model`.
|
"""
|
||||||
|
This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
|
||||||
|
It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
|
||||||
|
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||||
|
the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
|
||||||
|
|
||||||
Args:
|
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||||
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||||
n_positions: Number of positional embeddings.
|
for more information.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
Args:
|
||||||
n_head: Number of attention heads for each attention layer in
|
vocab_size (:obj:`int`, optional, defaults to 50257):
|
||||||
the Transformer encoder.
|
Vocabulary size of the GPT-2 model. Defines the different tokens that
|
||||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
|
||||||
resid_pdrop: The dropout probabilitiy for all fully connected
|
n_positions (:obj:`int`, optional, defaults to 1024):
|
||||||
layers in the embeddings, encoder, and pooler.
|
The maximum sequence length that this model might ever be used with.
|
||||||
attn_pdrop: The dropout ratio for the attention
|
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
|
||||||
probabilities.
|
n_ctx (:obj:`int`, optional, defaults to 1024):
|
||||||
embd_pdrop: The dropout ratio for the embeddings.
|
Dimensionality of the causal mask (usually same as n_positions).
|
||||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
n_embd (:obj:`int`, optional, defaults to 768):
|
||||||
initializing all weight matrices.
|
Dimensionality of the embeddings and hidden states.
|
||||||
|
n_layer (:obj:`int`, optional, defaults to 12):
|
||||||
|
Number of hidden layers in the Transformer encoder.
|
||||||
|
n_head (:obj:`int`, optional, defaults to 12):
|
||||||
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
|
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
|
embd_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout ratio for the embeddings.
|
||||||
|
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout ratio for the attention.
|
||||||
|
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
|
||||||
|
The epsilon to use in the layer normalization layers
|
||||||
|
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||||
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
|
summary_type (:obj:`string`, optional, defaults to "cls_index"):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.GPT2DoubleHeadsModel`.
|
||||||
|
Is one of the following options:
|
||||||
|
- 'last' => take the last token hidden state (like XLNet)
|
||||||
|
- 'first' => take the first token hidden state (like Bert)
|
||||||
|
- 'mean' => take the mean of all tokens hidden states
|
||||||
|
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
|
||||||
|
- 'attn' => Not implemented now, use multi-head attention
|
||||||
|
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.GPT2DoubleHeadsModel`.
|
||||||
|
Add a projection after the vector extraction
|
||||||
|
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.GPT2DoubleHeadsModel`.
|
||||||
|
'tanh' => add a tanh activation to the output, Other => no activation.
|
||||||
|
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.GPT2DoubleHeadsModel`.
|
||||||
|
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
|
||||||
|
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.GPT2DoubleHeadsModel`.
|
||||||
|
Add a dropout before the projection and activation
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
from transformers import GPT2Model, GPT2Config
|
||||||
|
|
||||||
|
# Initializing a GPT2 configuration
|
||||||
|
configuration = GPT2Config()
|
||||||
|
|
||||||
|
# Initializing a model from the configuration
|
||||||
|
model = GPT2Model(configuration)
|
||||||
|
|
||||||
|
# Accessing the model configuration
|
||||||
|
configuration = model.config
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
pretrained_config_archive_map (Dict[str, str]):
|
||||||
|
A dictionary containing all the available pre-trained checkpoints.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
@@ -76,26 +136,8 @@ class GPT2Config(PretrainedConfig):
|
|||||||
summary_first_dropout=0.1,
|
summary_first_dropout=0.1,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
"""Constructs GPT2Config.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
|
||||||
n_positions: Number of positional embeddings.
|
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
|
||||||
n_head: Number of attention heads for each attention layer in
|
|
||||||
the Transformer encoder.
|
|
||||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
|
||||||
resid_pdrop: The dropout probabilitiy for all fully connected
|
|
||||||
layers in the embeddings, encoder, and pooler.
|
|
||||||
attn_pdrop: The dropout ratio for the attention
|
|
||||||
probabilities.
|
|
||||||
embd_pdrop: The dropout ratio for the embeddings.
|
|
||||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
|
||||||
initializing all weight matrices.
|
|
||||||
"""
|
|
||||||
super(GPT2Config, self).__init__(**kwargs)
|
super(GPT2Config, self).__init__(**kwargs)
|
||||||
|
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.n_ctx = n_ctx
|
self.n_ctx = n_ctx
|
||||||
self.n_positions = n_positions
|
self.n_positions = n_positions
|
||||||
|
|||||||
@@ -26,9 +26,13 @@ class MMBTConfig(object):
|
|||||||
"""Configuration class to store the configuration of a `MMBT Model`.
|
"""Configuration class to store the configuration of a `MMBT Model`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
config: config of the underlying Transformer models. It's values are copied over to use a single config.
|
config (:obj:`~transformers.PretrainedConfig`):
|
||||||
num_labels: Size of final Linear layer for classification.
|
Config of the underlying Transformer models. Its values are
|
||||||
modal_hidden_size: Embedding dimension of the non-text modality encoder.
|
copied over to use a single config.
|
||||||
|
num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`):
|
||||||
|
Size of final Linear layer for classification.
|
||||||
|
modal_hidden_size (:obj:`int`, optional, defaults to 2048):
|
||||||
|
Embedding dimension of the non-text modality encoder.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config, num_labels=None, modal_hidden_size=2048):
|
def __init__(self, config, num_labels=None, modal_hidden_size=2048):
|
||||||
|
|||||||
@@ -30,27 +30,87 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||||||
|
|
||||||
class OpenAIGPTConfig(PretrainedConfig):
|
class OpenAIGPTConfig(PretrainedConfig):
|
||||||
"""
|
"""
|
||||||
Configuration class to store the configuration of a `OpenAIGPTModel`.
|
This is the configuration class to store the configuration of an :class:`~transformers.OpenAIGPTModel`.
|
||||||
|
It is used to instantiate a GPT model according to the specified arguments, defining the model
|
||||||
|
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||||
|
the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
|
||||||
|
|
||||||
Args:
|
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||||
vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
|
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||||
n_positions: Number of positional embeddings.
|
for more information.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
Args:
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
vocab_size (:obj:`int`, optional, defaults to 40478):
|
||||||
n_head: Number of attention heads for each attention layer in
|
Vocabulary size of the GPT model. Defines the different tokens that
|
||||||
the Transformer encoder.
|
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
|
||||||
afn: The non-linear activation function (function or string) in the
|
n_positions (:obj:`int`, optional, defaults to 512):
|
||||||
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
|
The maximum sequence length that this model might ever be used with.
|
||||||
resid_pdrop: The dropout probabilitiy for all fully connected
|
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
|
||||||
layers in the embeddings, encoder, and pooler.
|
n_ctx (:obj:`int`, optional, defaults to 512):
|
||||||
attn_pdrop: The dropout ratio for the attention
|
Dimensionality of the causal mask (usually same as n_positions).
|
||||||
probabilities.
|
n_embd (:obj:`int`, optional, defaults to 768):
|
||||||
embd_pdrop: The dropout ratio for the embeddings.
|
Dimensionality of the embeddings and hidden states.
|
||||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
n_layer (:obj:`int`, optional, defaults to 12):
|
||||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
Number of hidden layers in the Transformer encoder.
|
||||||
initializing all weight matrices.
|
n_head (:obj:`int`, optional, defaults to 12):
|
||||||
predict_special_tokens: should we predict special tokens (when the model has a LM head)
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
|
afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
|
||||||
|
The non-linear activation function (function or string) in the encoder and pooler.
|
||||||
|
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||||
|
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
|
embd_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout ratio for the embeddings.
|
||||||
|
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout ratio for the attention.
|
||||||
|
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
|
||||||
|
The epsilon to use in the layer normalization layers
|
||||||
|
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||||
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
|
predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
Whether special tokens should be predicted when the model has a language modeling head.
|
||||||
|
summary_type (:obj:`string`, optional, defaults to "cls_index"):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
|
||||||
|
Is one of the following options:
|
||||||
|
- 'last' => take the last token hidden state (like XLNet)
|
||||||
|
- 'first' => take the first token hidden state (like Bert)
|
||||||
|
- 'mean' => take the mean of all tokens hidden states
|
||||||
|
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
|
||||||
|
- 'attn' => Not implemented now, use multi-head attention
|
||||||
|
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
|
||||||
|
Add a projection after the vector extraction
|
||||||
|
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
|
||||||
|
'tanh' => add a tanh activation to the output, Other => no activation.
|
||||||
|
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
|
||||||
|
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
|
||||||
|
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
|
||||||
|
Add a dropout before the projection and activation
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
from transformers import OpenAIGPTConfig, OpenAIGPTModel
|
||||||
|
|
||||||
|
# Initializing a GPT configuration
|
||||||
|
configuration = OpenAIGPTConfig()
|
||||||
|
|
||||||
|
# Initializing a model from the configuration
|
||||||
|
model = OpenAIGPTModel(configuration)
|
||||||
|
|
||||||
|
# Accessing the model configuration
|
||||||
|
configuration = model.config
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
pretrained_config_archive_map (Dict[str, str]):
|
||||||
|
A dictionary containing all the available pre-trained checkpoints.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
@@ -78,9 +138,8 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
summary_first_dropout=0.1,
|
summary_first_dropout=0.1,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
"""Constructs OpenAIGPTConfig.
|
|
||||||
"""
|
|
||||||
super(OpenAIGPTConfig, self).__init__(**kwargs)
|
super(OpenAIGPTConfig, self).__init__(**kwargs)
|
||||||
|
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.n_ctx = n_ctx
|
self.n_ctx = n_ctx
|
||||||
self.n_positions = n_positions
|
self.n_positions = n_positions
|
||||||
|
|||||||
@@ -34,5 +34,35 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||||||
|
|
||||||
|
|
||||||
class RobertaConfig(BertConfig):
|
class RobertaConfig(BertConfig):
|
||||||
|
r"""
|
||||||
|
This is the configuration class to store the configuration of an :class:`~transformers.RobertaModel`.
|
||||||
|
It is used to instantiate a RoBERTa model according to the specified arguments, defining the model
|
||||||
|
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||||
|
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
|
||||||
|
|
||||||
|
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||||
|
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||||
|
for more information.
|
||||||
|
|
||||||
|
The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
|
||||||
|
It reuses the same defaults. Please check the parent class for more information.
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
from transformers import RobertaConfig, RobertaModel
|
||||||
|
|
||||||
|
# Initializing a RoBERTa configuration
|
||||||
|
configuration = RobertaConfig()
|
||||||
|
|
||||||
|
# Initializing a model from the configuration
|
||||||
|
model = RobertaModel(configuration)
|
||||||
|
|
||||||
|
# Accessing the model configuration
|
||||||
|
configuration = model.config
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
pretrained_config_archive_map (Dict[str, str]):
|
||||||
|
A dictionary containing all the available pre-trained checkpoints.
|
||||||
|
"""
|
||||||
pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
model_type = "roberta"
|
model_type = "roberta"
|
||||||
|
|||||||
@@ -29,39 +29,91 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||||||
|
|
||||||
|
|
||||||
class TransfoXLConfig(PretrainedConfig):
|
class TransfoXLConfig(PretrainedConfig):
|
||||||
"""Configuration class to store the configuration of a `TransfoXLModel`.
|
"""
|
||||||
|
This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel`.
|
||||||
|
It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
|
||||||
|
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||||
|
the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
|
||||||
|
|
||||||
|
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||||
|
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||||
|
for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
|
vocab_size (:obj:`int`, optional, defaults to 267735):
|
||||||
cutoffs: cutoffs for the adaptive softmax
|
Vocabulary size of the Transformer XL model. Defines the different tokens that
|
||||||
d_model: Dimensionality of the model's hidden states.
|
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
|
||||||
d_embed: Dimensionality of the embeddings
|
cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
|
||||||
d_head: Dimensionality of the model's heads.
|
Cutoffs for the adaptive softmax
|
||||||
div_val: divident value for adapative input and softmax
|
d_model (:obj:`int`, optional, defaults to 1024):
|
||||||
pre_lnorm: apply LayerNorm to the input instead of the output
|
Dimensionality of the model's hidden states.
|
||||||
d_inner: Inner dimension in FF
|
d_embed (:obj:`int`, optional, defaults to 1024):
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
Dimensionality of the embeddings
|
||||||
n_head: Number of attention heads for each attention layer in
|
n_head (:obj:`int`, optional, defaults to 16):
|
||||||
the Transformer encoder.
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
tgt_len: number of tokens to predict
|
d_head (:obj:`int`, optional, defaults to 64):
|
||||||
ext_len: length of the extended context
|
Dimensionality of the model's heads.
|
||||||
mem_len: length of the retained previous heads
|
d_inner (:obj:`int`, optional, defaults to 4096):
|
||||||
same_length: use the same attn length for all tokens
|
Inner dimension in FF
|
||||||
proj_share_all_but_first: True to share all but first projs, False not to share.
|
div_val (:obj:`int`, optional, defaults to 4):
|
||||||
attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
|
Divisor value for adaptive input and softmax
|
||||||
clamp_len: use the same pos embeddings after clamp_len
|
pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
|
||||||
sample_softmax: number of samples in sampled softmax
|
Apply LayerNorm to the input instead of the output
|
||||||
adaptive: use adaptive softmax
|
n_layer (:obj:`int`, optional, defaults to 18):
|
||||||
tie_weight: tie the word embedding and softmax weights
|
Number of hidden layers in the Transformer encoder.
|
||||||
dropout: The dropout probabilitiy for all fully connected
|
tgt_len (:obj:`int`, optional, defaults to 128):
|
||||||
layers in the embeddings, encoder, and pooler.
|
Number of tokens to predict
|
||||||
dropatt: The dropout ratio for the attention probabilities.
|
ext_len (:obj:`int`, optional, defaults to 0):
|
||||||
untie_r: untie relative position biases
|
Length of the extended context
|
||||||
embd_pdrop: The dropout ratio for the embeddings.
|
mem_len (:obj:`int`, optional, defaults to 1600):
|
||||||
init: parameter initializer to use
|
Length of the retained previous heads
|
||||||
init_range: parameters initialized by U(-init_range, init_range).
|
clamp_len (:obj:`int`, optional, defaults to 1000):
|
||||||
proj_init_std: parameters initialized by N(0, init_std)
|
use the same pos embeddings after clamp_len
|
||||||
init_std: parameters initialized by N(0, init_std)
|
same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
Use the same attn length for all tokens
|
||||||
|
proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
True to share all but first projs, False not to share.
|
||||||
|
attn_type (:obj:`int`, optional, defaults to 0):
|
||||||
|
Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
|
||||||
|
sample_softmax (:obj:`int`, optional, defaults to -1):
|
||||||
|
number of samples in sampled softmax
|
||||||
|
adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
use adaptive softmax
|
||||||
|
tie_weight (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
tie the word embedding and softmax weights
|
||||||
|
dropout (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
|
dropatt (:obj:`float`, optional, defaults to 0):
|
||||||
|
The dropout ratio for the attention probabilities.
|
||||||
|
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
Untie relative position biases
|
||||||
|
init (:obj:`string`, optional, defaults to `normal`):
|
||||||
|
Parameter initializer to use
|
||||||
|
init_range (:obj:`float`, optional, defaults to 0.01):
|
||||||
|
Parameters initialized by U(-init_range, init_range).
|
||||||
|
proj_init_std (:obj:`float`, optional, defaults to 0.01):
|
||||||
|
Parameters initialized by N(0, init_std)
|
||||||
|
init_std (:obj:`float`, optional, defaults to 0.02):
|
||||||
|
Parameters initialized by N(0, init_std)
|
||||||
|
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
|
||||||
|
The epsilon to use in the layer normalization layers
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
from transformers import TransfoXLConfig, TransfoXLModel
|
||||||
|
|
||||||
|
# Initializing a Transformer XL configuration
|
||||||
|
configuration = TransfoXLConfig()
|
||||||
|
|
||||||
|
# Initializing a model from the configuration
|
||||||
|
model = TransfoXLModel(configuration)
|
||||||
|
|
||||||
|
# Accessing the model configuration
|
||||||
|
configuration = model.config
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
pretrained_config_archive_map (Dict[str, str]):
|
||||||
|
A dictionary containing all the available pre-trained checkpoints.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
@@ -99,9 +151,8 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
layer_norm_epsilon=1e-5,
|
layer_norm_epsilon=1e-5,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
"""Constructs TransfoXLConfig.
|
|
||||||
"""
|
|
||||||
super(TransfoXLConfig, self).__init__(**kwargs)
|
super(TransfoXLConfig, self).__init__(**kwargs)
|
||||||
|
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.cutoffs = []
|
self.cutoffs = []
|
||||||
self.cutoffs.extend(cutoffs)
|
self.cutoffs.extend(cutoffs)
|
||||||
|
|||||||
@@ -37,44 +37,124 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||||||
|
|
||||||
|
|
||||||
class XLMConfig(PretrainedConfig):
|
class XLMConfig(PretrainedConfig):
|
||||||
"""Configuration class to store the configuration of a `XLMModel`.
|
"""
|
||||||
|
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
|
||||||
|
It is used to instantiate an XLM model according to the specified arguments, defining the model
|
||||||
|
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||||
|
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
|
||||||
|
|
||||||
Args:
|
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||||
vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`.
|
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||||
d_model: Size of the encoder layers and the pooler layer.
|
for more information.
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
|
||||||
n_head: Number of attention heads for each attention layer in
|
|
||||||
the Transformer encoder.
|
|
||||||
d_inner: The size of the "intermediate" (i.e., feed-forward)
|
|
||||||
layer in the Transformer encoder.
|
|
||||||
ff_activation: The non-linear activation function (function or string) in the
|
|
||||||
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
|
|
||||||
untie_r: untie relative position biases
|
|
||||||
attn_type: 'bi' for XLM, 'uni' for Transformer-XL
|
|
||||||
|
|
||||||
dropout: The dropout probabilitiy for all fully connected
|
Args:
|
||||||
layers in the embeddings, encoder, and pooler.
|
vocab_size (:obj:`int`, optional, defaults to 30145):
|
||||||
max_position_embeddings: The maximum sequence length that this model might
|
Vocabulary size of the XLM model. Defines the different tokens that
|
||||||
ever be used with. Typically set this to something large just in case
|
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
|
||||||
(e.g., 512 or 1024 or 2048).
|
emb_dim (:obj:`int`, optional, defaults to 2048):
|
||||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
Dimensionality of the encoder layers and the pooler layer.
|
||||||
initializing all weight matrices.
|
n_layer (:obj:`int`, optional, defaults to 12):
|
||||||
layer_norm_eps: The epsilon used by LayerNorm.
|
Number of hidden layers in the Transformer encoder.
|
||||||
|
n_head (:obj:`int`, optional, defaults to 16):
|
||||||
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
|
dropout (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout probability for all fully connected
|
||||||
|
layers in the embeddings, encoder, and pooler.
|
||||||
|
attention_dropout (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout probability for the attention mechanism
|
||||||
|
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
The non-linear activation function (function or string) in the
|
||||||
|
encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
|
||||||
|
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
|
||||||
|
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
|
||||||
|
causal (:obj:`boolean`, optional, defaults to :obj:`False`):
|
||||||
|
Set this to `True` for the model to behave in a causal manner.
|
||||||
|
Causal models use a triangular attention mask in order to only attend to the left-side context instead
|
||||||
|
of a bidirectional context.
|
||||||
|
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
|
||||||
|
Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
|
||||||
|
layer.
|
||||||
|
n_langs (:obj:`int`, optional, defaults to 1):
|
||||||
|
The number of languages the model handles. Set to 1 for monolingual models.
|
||||||
|
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
Whether to use language embeddings. Some models use additional language embeddings, see
|
||||||
|
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
|
||||||
|
for information on how to use them.
|
||||||
|
max_position_embeddings (:obj:`int`, optional, defaults to 512):
|
||||||
|
The maximum sequence length that this model might
|
||||||
|
ever be used with. Typically set this to something large just in case
|
||||||
|
(e.g., 512 or 1024 or 2048).
|
||||||
|
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
|
||||||
|
The standard deviation of the truncated_normal_initializer for
|
||||||
|
initializing the embedding matrices.
|
||||||
|
init_std (:obj:`float`, optional, defaults to 0.02):
|
||||||
|
The standard deviation of the truncated_normal_initializer for
|
||||||
|
initializing all weight matrices except the embedding matrices.
|
||||||
|
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
|
||||||
|
The epsilon used by the layer normalization layers.
|
||||||
|
bos_index (:obj:`int`, optional, defaults to 0):
|
||||||
|
The index of the beginning of sentence token in the vocabulary.
|
||||||
|
eos_index (:obj:`int`, optional, defaults to 1):
|
||||||
|
The index of the end of sentence token in the vocabulary.
|
||||||
|
pad_index (:obj:`int`, optional, defaults to 2):
|
||||||
|
The index of the padding token in the vocabulary.
|
||||||
|
unk_index (:obj:`int`, optional, defaults to 3):
|
||||||
|
The index of the unknown token in the vocabulary.
|
||||||
|
mask_index (:obj:`int`, optional, defaults to 5):
|
||||||
|
The index of the masking token in the vocabulary.
|
||||||
|
is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
|
||||||
|
summary_type (:obj:`string`, optional, defaults to "first"):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.XLMForSequenceClassification`.
|
||||||
|
Is one of the following options:
|
||||||
|
- 'last' => take the last token hidden state (like XLNet)
|
||||||
|
- 'first' => take the first token hidden state (like Bert)
|
||||||
|
- 'mean' => take the mean of all tokens hidden states
|
||||||
|
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
|
||||||
|
- 'attn' => Not implemented now, use multi-head attention
|
||||||
|
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.XLMForSequenceClassification`.
|
||||||
|
Add a projection after the vector extraction
|
||||||
|
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.XLMForSequenceClassification`.
|
||||||
|
'tanh' => add a tanh activation to the output, Other => no activation.
|
||||||
|
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.XLMForSequenceClassification`.
|
||||||
|
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
|
||||||
|
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.XLMForSequenceClassification`.
|
||||||
|
Add a dropout before the projection and activation
|
||||||
|
start_n_top (:obj:`int`, optional, defaults to 5):
|
||||||
|
Used in the SQuAD evaluation script for XLM and XLNet.
|
||||||
|
end_n_top (:obj:`int`, optional, defaults to 5):
|
||||||
|
Used in the SQuAD evaluation script for XLM and XLNet.
|
||||||
|
mask_token_id (:obj:`int`, optional, defaults to 0):
|
||||||
|
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
|
||||||
|
lang_id (:obj:`int`, optional, defaults to 1):
|
||||||
|
The ID of the language used by the model. This parameter is used when generating
|
||||||
|
text in a given language.
|
||||||
|
|
||||||
dropout: float, dropout rate.
|
Example::
|
||||||
init: str, the initialization scheme, either "normal" or "uniform".
|
|
||||||
init_range: float, initialize the parameters with a uniform distribution
|
from transformers import XLMConfig, XLMModel
|
||||||
in [-init_range, init_range]. Only effective when init="uniform".
|
|
||||||
init_std: float, initialize the parameters with a normal distribution
|
# Initializing an XLM configuration
|
||||||
with mean 0 and stddev init_std. Only effective when init="normal".
|
configuration = XLMConfig()
|
||||||
mem_len: int, the number of tokens to cache.
|
|
||||||
reuse_len: int, the number of tokens in the currect batch to be cached
|
# Initializing a model from the configuration
|
||||||
and reused in the future.
|
model = XLMModel(configuration)
|
||||||
bi_data: bool, whether to use bidirectional input pipeline.
|
|
||||||
Usually set to True during pretraining and False during finetuning.
|
# Accessing the model configuration
|
||||||
clamp_len: int, clamp all relative distances larger than clamp_len.
|
configuration = model.config
|
||||||
-1 means no clamping.
|
|
||||||
same_length: bool, whether to use the same attention length for each token.
|
Attributes:
|
||||||
|
pretrained_config_archive_map (Dict[str, str]):
|
||||||
|
A dictionary containing all the available pre-trained checkpoints.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|||||||
@@ -30,42 +30,102 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||||||
|
|
||||||
|
|
||||||
class XLNetConfig(PretrainedConfig):
|
class XLNetConfig(PretrainedConfig):
|
||||||
"""Configuration class to store the configuration of a ``XLNetModel``.
|
"""
|
||||||
|
This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
|
||||||
|
It is used to instantiate an XLNet model according to the specified arguments, defining the model
|
||||||
|
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||||
|
the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
|
||||||
|
|
||||||
Args:
|
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||||
vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
|
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||||
d_model: Size of the encoder layers and the pooler layer.
|
for more information.
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
|
||||||
n_head: Number of attention heads for each attention layer in
|
|
||||||
the Transformer encoder.
|
|
||||||
d_inner: The size of the "intermediate" (i.e., feed-forward)
|
|
||||||
layer in the Transformer encoder.
|
|
||||||
ff_activation: The non-linear activation function (function or string) in the
|
|
||||||
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
|
|
||||||
untie_r: untie relative position biases
|
|
||||||
attn_type: 'bi' for XLNet, 'uni' for Transformer-XL
|
|
||||||
|
|
||||||
dropout: The dropout probabilitiy for all fully connected
|
Args:
|
||||||
layers in the embeddings, encoder, and pooler.
|
vocab_size (:obj:`int`, optional, defaults to 32000):
|
||||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
Vocabulary size of the XLNet model. Defines the different tokens that
|
||||||
initializing all weight matrices.
|
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
|
||||||
layer_norm_eps: The epsilon used by LayerNorm.
|
d_model (:obj:`int`, optional, defaults to 1024):
|
||||||
|
Dimensionality of the encoder layers and the pooler layer.
|
||||||
|
n_layer (:obj:`int`, optional, defaults to 24):
|
||||||
|
Number of hidden layers in the Transformer encoder.
|
||||||
|
n_head (:obj:`int`, optional, defaults to 16):
|
||||||
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
|
d_inner (:obj:`int`, optional, defaults to 4096):
|
||||||
|
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||||
|
ff_activation (:obj:`string`, optional, defaults to "gelu"):
|
||||||
|
The non-linear activation function (function or string) in the
|
||||||
|
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
|
||||||
|
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
Untie relative position biases
|
||||||
|
attn_type (:obj:`string`, optional, defaults to "bi"):
|
||||||
|
The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
|
||||||
|
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||||
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
|
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
|
||||||
|
The epsilon used by the layer normalization layers.
|
||||||
|
dropout (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
|
mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
|
||||||
|
The number of tokens to cache. The key/value pairs that have already been pre-computed
|
||||||
|
in a previous forward pass won't be re-computed. See the
|
||||||
|
`quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
|
||||||
|
for more information.
|
||||||
|
reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
|
||||||
|
The number of tokens in the current batch to be cached and reused in the future.
|
||||||
|
bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
|
||||||
|
Whether to use bidirectional input pipeline. Usually set to `True` during
|
||||||
|
pretraining and `False` during finetuning.
|
||||||
|
clamp_len (:obj:`int`, optional, defaults to -1):
|
||||||
|
Clamp all relative distances larger than clamp_len.
|
||||||
|
Setting this attribute to -1 means no clamping.
|
||||||
|
same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
|
||||||
|
Whether to use the same attention length for each token.
|
||||||
|
summary_type (:obj:`string`, optional, defaults to "last"):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
|
||||||
|
Is one of the following options:
|
||||||
|
- 'last' => take the last token hidden state (like XLNet)
|
||||||
|
- 'first' => take the first token hidden state (like Bert)
|
||||||
|
- 'mean' => take the mean of all tokens hidden states
|
||||||
|
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
|
||||||
|
- 'attn' => Not implemented now, use multi-head attention
|
||||||
|
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
|
||||||
|
Add a projection after the vector extraction
|
||||||
|
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
|
||||||
|
'tanh' => add a tanh activation to the output, Other => no activation.
|
||||||
|
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
|
||||||
|
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
|
||||||
|
summary_last_dropout (:obj:`float`, optional, defaults to 0.1):
|
||||||
|
Argument used when doing sequence summary. Used for the multiple choice head in
|
||||||
|
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
|
||||||
|
Add a dropout after the projection and activation
|
||||||
|
start_n_top (:obj:`int`, optional, defaults to 5):
|
||||||
|
Used in the SQuAD evaluation script for XLM and XLNet.
|
||||||
|
end_n_top (:obj:`int`, optional, defaults to 5):
|
||||||
|
Used in the SQuAD evaluation script for XLM and XLNet.
|
||||||
|
|
||||||
dropout: float, dropout rate.
|
Example::
|
||||||
init: str, the initialization scheme, either "normal" or "uniform".
|
|
||||||
init_range: float, initialize the parameters with a uniform distribution
|
from transformers import XLNetConfig, XLNetModel
|
||||||
in [-init_range, init_range]. Only effective when init="uniform".
|
|
||||||
init_std: float, initialize the parameters with a normal distribution
|
# Initializing an XLNet configuration
|
||||||
with mean 0 and stddev init_std. Only effective when init="normal".
|
configuration = XLNetConfig()
|
||||||
mem_len: int, the number of tokens to cache.
|
|
||||||
reuse_len: int, the number of tokens in the currect batch to be cached
|
# Initializing a model from the configuration
|
||||||
and reused in the future.
|
model = XLNetModel(configuration)
|
||||||
bi_data: bool, whether to use bidirectional input pipeline.
|
|
||||||
Usually set to True during pretraining and False during finetuning.
|
# Accessing the model configuration
|
||||||
clamp_len: int, clamp all relative distances larger than clamp_len.
|
configuration = model.config
|
||||||
-1 means no clamping.
|
|
||||||
same_length: bool, whether to use the same attention length for each token.
|
Attributes:
|
||||||
finetuning_task: name of the glue task on which the model was fine-tuned if any
|
pretrained_config_archive_map (Dict[str, str]):
|
||||||
|
A dictionary containing all the available pre-trained checkpoints.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|||||||
@@ -384,16 +384,13 @@ def get_from_cache(
|
|||||||
else:
|
else:
|
||||||
http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)
|
http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)
|
||||||
|
|
||||||
# we are copying the file before closing it, so flush to avoid truncation
|
logger.info("storing %s in cache at %s", url, cache_path)
|
||||||
temp_file.flush()
|
os.rename(temp_file.name, cache_path)
|
||||||
|
|
||||||
logger.info("storing %s in cache at %s", url, cache_path)
|
logger.info("creating metadata file for %s", cache_path)
|
||||||
os.rename(temp_file.name, cache_path)
|
meta = {"url": url, "etag": etag}
|
||||||
|
meta_path = cache_path + ".json"
|
||||||
logger.info("creating metadata file for %s", cache_path)
|
with open(meta_path, "w") as meta_file:
|
||||||
meta = {"url": url, "etag": etag}
|
json.dump(meta, meta_file)
|
||||||
meta_path = cache_path + ".json"
|
|
||||||
with open(meta_path, "w") as meta_file:
|
|
||||||
json.dump(meta, meta_file)
|
|
||||||
|
|
||||||
return cache_path
|
return cache_path
|
||||||
|
|||||||
@@ -579,6 +579,9 @@ class AlbertMLMHead(nn.Module):
|
|||||||
self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
|
self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
|
||||||
self.activation = ACT2FN[config.hidden_act]
|
self.activation = ACT2FN[config.hidden_act]
|
||||||
|
|
||||||
|
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||||
|
self.decoder.bias = self.bias
|
||||||
|
|
||||||
def forward(self, hidden_states):
|
def forward(self, hidden_states):
|
||||||
hidden_states = self.dense(hidden_states)
|
hidden_states = self.dense(hidden_states)
|
||||||
hidden_states = self.activation(hidden_states)
|
hidden_states = self.activation(hidden_states)
|
||||||
|
|||||||
@@ -481,6 +481,9 @@ class BertLMPredictionHead(nn.Module):
|
|||||||
|
|
||||||
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
||||||
|
|
||||||
|
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||||
|
self.decoder.bias = self.bias
|
||||||
|
|
||||||
def forward(self, hidden_states):
|
def forward(self, hidden_states):
|
||||||
hidden_states = self.transform(hidden_states)
|
hidden_states = self.transform(hidden_states)
|
||||||
hidden_states = self.decoder(hidden_states) + self.bias
|
hidden_states = self.decoder(hidden_states) + self.bias
|
||||||
|
|||||||
@@ -306,6 +306,9 @@ class RobertaLMHead(nn.Module):
|
|||||||
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
||||||
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
||||||
|
|
||||||
|
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||||
|
self.decoder.bias = self.bias
|
||||||
|
|
||||||
def forward(self, features, **kwargs):
|
def forward(self, features, **kwargs):
|
||||||
x = self.dense(features)
|
x = self.dense(features)
|
||||||
x = gelu(x)
|
x = gelu(x)
|
||||||
|
|||||||
@@ -286,6 +286,7 @@ class T5Attention(nn.Module):
|
|||||||
bidirectional=not self.is_decoder,
|
bidirectional=not self.is_decoder,
|
||||||
num_buckets=self.relative_attention_num_buckets,
|
num_buckets=self.relative_attention_num_buckets,
|
||||||
)
|
)
|
||||||
|
rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device)
|
||||||
values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads)
|
values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads)
|
||||||
values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen)
|
values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen)
|
||||||
return values
|
return values
|
||||||
|
|||||||
@@ -705,55 +705,71 @@ class QuestionAnsweringPipeline(Pipeline):
|
|||||||
|
|
||||||
# Convert inputs to features
|
# Convert inputs to features
|
||||||
examples = self._args_parser(*texts, **kwargs)
|
examples = self._args_parser(*texts, **kwargs)
|
||||||
features = squad_convert_examples_to_features(
|
features_list = [
|
||||||
examples, self.tokenizer, kwargs["max_seq_len"], kwargs["doc_stride"], kwargs["max_question_len"], False
|
squad_convert_examples_to_features(
|
||||||
)
|
[example],
|
||||||
fw_args = self.inputs_for_model([f.__dict__ for f in features])
|
self.tokenizer,
|
||||||
|
kwargs["max_seq_len"],
|
||||||
|
kwargs["doc_stride"],
|
||||||
|
kwargs["max_question_len"],
|
||||||
|
False,
|
||||||
|
)
|
||||||
|
for example in examples
|
||||||
|
]
|
||||||
|
all_answers = []
|
||||||
|
for features, example in zip(features_list, examples):
|
||||||
|
fw_args = self.inputs_for_model([f.__dict__ for f in features])
|
||||||
|
|
||||||
# Manage tensor allocation on correct device
|
# Manage tensor allocation on correct device
|
||||||
with self.device_placement():
|
with self.device_placement():
|
||||||
if self.framework == "tf":
|
if self.framework == "tf":
|
||||||
fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
|
fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
|
||||||
start, end = self.model(fw_args)
|
start, end = self.model(fw_args)
|
||||||
start, end = start.numpy(), end.numpy()
|
start, end = start.numpy(), end.numpy()
|
||||||
else:
|
else:
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
# Retrieve the score for the context tokens only (removing question tokens)
|
# Retrieve the score for the context tokens only (removing question tokens)
|
||||||
fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
|
fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
|
||||||
start, end = self.model(**fw_args)
|
start, end = self.model(**fw_args)
|
||||||
start, end = start.cpu().numpy(), end.cpu().numpy()
|
start, end = start.cpu().numpy(), end.cpu().numpy()
|
||||||
|
|
||||||
answers = []
|
answers = []
|
||||||
for (example, feature, start_, end_) in zip(examples, features, start, end):
|
for (feature, start_, end_) in zip(features, start, end):
|
||||||
# Normalize logits and spans to retrieve the answer
|
# Normalize logits and spans to retrieve the answer
|
||||||
start_ = np.exp(start_) / np.sum(np.exp(start_))
|
start_ = np.exp(start_) / np.sum(np.exp(start_))
|
||||||
end_ = np.exp(end_) / np.sum(np.exp(end_))
|
end_ = np.exp(end_) / np.sum(np.exp(end_))
|
||||||
|
|
||||||
# Mask padding and question
|
# Mask padding and question
|
||||||
start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1)
|
start_, end_ = (
|
||||||
|
start_ * np.abs(np.array(feature.p_mask) - 1),
|
||||||
|
end_ * np.abs(np.array(feature.p_mask) - 1),
|
||||||
|
)
|
||||||
|
|
||||||
# TODO : What happens if not possible
|
# TODO : What happens if not possible
|
||||||
# Mask CLS
|
# Mask CLS
|
||||||
start_[0] = end_[0] = 0
|
start_[0] = end_[0] = 0
|
||||||
|
|
||||||
starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
|
starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
|
||||||
char_to_word = np.array(example.char_to_word_offset)
|
char_to_word = np.array(example.char_to_word_offset)
|
||||||
|
|
||||||
# Convert the answer (tokens) back to the original text
|
# Convert the answer (tokens) back to the original text
|
||||||
answers += [
|
answers += [
|
||||||
{
|
{
|
||||||
"score": score.item(),
|
"score": score.item(),
|
||||||
"start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
|
"start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
|
||||||
"end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
|
"end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
|
||||||
"answer": " ".join(
|
"answer": " ".join(
|
||||||
example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
|
example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
for s, e, score in zip(starts, ends, scores)
|
for s, e, score in zip(starts, ends, scores)
|
||||||
]
|
]
|
||||||
if len(answers) == 1:
|
answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]]
|
||||||
return answers[0]
|
all_answers += answers
|
||||||
return answers
|
|
||||||
|
if len(all_answers) == 1:
|
||||||
|
return all_answers[0]
|
||||||
|
return all_answers
|
||||||
|
|
||||||
def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
|
def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -485,6 +485,8 @@ class ModelTesterMixin:
|
|||||||
self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
|
self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
|
||||||
# Check that it actually resizes the embeddings matrix
|
# Check that it actually resizes the embeddings matrix
|
||||||
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
|
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
|
||||||
|
# Check that the model can still do a forward pass successfully (every parameter should be resized)
|
||||||
|
model(**inputs_dict)
|
||||||
|
|
||||||
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
|
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
|
||||||
model_embed = model.resize_token_embeddings(model_vocab_size - 15)
|
model_embed = model.resize_token_embeddings(model_vocab_size - 15)
|
||||||
@@ -492,6 +494,11 @@ class ModelTesterMixin:
|
|||||||
# Check that it actually resizes the embeddings matrix
|
# Check that it actually resizes the embeddings matrix
|
||||||
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
|
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
|
||||||
|
|
||||||
|
# Check that the model can still do a forward pass successfully (every parameter should be resized)
|
||||||
|
# Input ids should be clamped to the maximum size of the vocabulary
|
||||||
|
inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
|
||||||
|
model(**inputs_dict)
|
||||||
|
|
||||||
# Check that adding and removing tokens has not modified the first part of the embedding matrix.
|
# Check that adding and removing tokens has not modified the first part of the embedding matrix.
|
||||||
models_equal = True
|
models_equal = True
|
||||||
for p1, p2 in zip(cloned_embeddings, model_embed.weight):
|
for p1, p2 in zip(cloned_embeddings, model_embed.weight):
|
||||||
|
|||||||
Reference in New Issue
Block a user