Optional layers (#8961)
* Apply on BERT and ALBERT
* Update TF Bart
* Add input processing to TF BART
* Add input processing for TF CTRL
* Add input processing to TF Distilbert
* Add input processing to TF DPR
* Add input processing to TF Electra
* Add deprecated arguments
* Add input processing to TF XLM
* Remove unused imports
* Add input processing to TF Funnel
* Add input processing to TF GPT2
* Add input processing to TF Longformer
* Add input processing to TF Lxmert
* Apply style
* Add input processing to TF Mobilebert
* Add input processing to TF GPT
* Add input processing to TF Roberta
* Add input processing to TF T5
* Add input processing to TF TransfoXL
* Apply style
* Rebase on master
* Fix wrong model name
* Fix BART
* Apply style
* Put the deprecated warnings in the input processing function
* Remove the unused imports
* Raise an error when len(kwargs)>0
* Test ModelOutput instead of TFBaseModelOutput
* Address Patrick's comments
* Address Patrick's comments
* Add boolean processing for the inputs
* Take into account the optional layers
* Add missing/unexpected weights in the other models
* Apply style
* Rename parameters
* Apply style
* Remove useless
* Remove useless
* Remove useless
* Update num parameters
* Fix tests
* Address Patrick's comment
* Remove useless attribute
This commit is contained in:
@@ -547,7 +547,7 @@ class TFBertNSPHead(tf.keras.layers.Layer):
|
||||
class TFBertMainLayer(tf.keras.layers.Layer):
|
||||
config_class = BertConfig
|
||||
|
||||
def __init__(self, config, **kwargs):
|
||||
def __init__(self, config, add_pooling_layer=True, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.config = config
|
||||
@@ -558,7 +558,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
|
||||
self.return_dict = config.use_return_dict
|
||||
self.embeddings = TFBertEmbeddings(config, name="embeddings")
|
||||
self.encoder = TFBertEncoder(config, name="encoder")
|
||||
self.pooler = TFBertPooler(config, name="pooler")
|
||||
self.pooler = TFBertPooler(config, name="pooler") if add_pooling_layer else None
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.embeddings
|
||||
@@ -663,7 +663,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
|
||||
)
|
||||
|
||||
sequence_output = encoder_outputs[0]
|
||||
pooled_output = self.pooler(sequence_output)
|
||||
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
|
||||
|
||||
if not inputs["return_dict"]:
|
||||
return (
|
||||
@@ -880,6 +880,9 @@ Bert Model with two heads on top as done during the pretraining:
|
||||
BERT_START_DOCSTRING,
|
||||
)
|
||||
class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
|
||||
# names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
|
||||
_keys_to_ignore_on_load_unexpected = [r"cls.predictions.decoder.weight"]
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super().__init__(config, *inputs, **kwargs)
|
||||
|
||||
@@ -976,9 +979,13 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
|
||||
|
||||
@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
|
||||
class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
|
||||
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"pooler"]
|
||||
# names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
|
||||
_keys_to_ignore_on_load_unexpected = [
|
||||
r"pooler",
|
||||
r"cls.seq_relationship",
|
||||
r"cls.predictions.decoder.weight",
|
||||
r"nsp___cls",
|
||||
]
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super().__init__(config, *inputs, **kwargs)
|
||||
@@ -989,7 +996,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
|
||||
"bi-directional self-attention."
|
||||
)
|
||||
|
||||
self.bert = TFBertMainLayer(config, name="bert")
|
||||
self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
|
||||
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
@@ -1068,9 +1075,13 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
|
||||
|
||||
|
||||
class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
|
||||
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"pooler"]
|
||||
# names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
|
||||
_keys_to_ignore_on_load_unexpected = [
|
||||
r"pooler",
|
||||
r"cls.seq_relationship",
|
||||
r"cls.predictions.decoder.weight",
|
||||
r"nsp___cls",
|
||||
]
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super().__init__(config, *inputs, **kwargs)
|
||||
@@ -1078,7 +1089,7 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
|
||||
if not config.is_decoder:
|
||||
logger.warning("If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`")
|
||||
|
||||
self.bert = TFBertMainLayer(config, name="bert")
|
||||
self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
|
||||
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
@@ -1165,6 +1176,9 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
|
||||
BERT_START_DOCSTRING,
|
||||
)
|
||||
class TFBertForNextSentencePrediction(TFBertPreTrainedModel, TFNextSentencePredictionLoss):
|
||||
# names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
|
||||
_keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"cls.predictions"]
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super().__init__(config, *inputs, **kwargs)
|
||||
|
||||
@@ -1262,6 +1276,10 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel, TFNextSentencePredi
|
||||
BERT_START_DOCSTRING,
|
||||
)
|
||||
class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassificationLoss):
|
||||
# names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
|
||||
_keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"]
|
||||
_keys_to_ignore_on_load_missing = [r"dropout"]
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super().__init__(config, *inputs, **kwargs)
|
||||
|
||||
@@ -1353,6 +1371,10 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
|
||||
BERT_START_DOCSTRING,
|
||||
)
|
||||
class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
|
||||
# names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
|
||||
_keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"]
|
||||
_keys_to_ignore_on_load_missing = [r"dropout"]
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super().__init__(config, *inputs, **kwargs)
|
||||
|
||||
@@ -1477,15 +1499,21 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
|
||||
BERT_START_DOCSTRING,
|
||||
)
|
||||
class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationLoss):
|
||||
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"pooler"]
|
||||
# names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
|
||||
_keys_to_ignore_on_load_unexpected = [
|
||||
r"pooler",
|
||||
r"mlm___cls",
|
||||
r"nsp___cls",
|
||||
r"cls.predictions",
|
||||
r"cls.seq_relationship",
|
||||
]
|
||||
_keys_to_ignore_on_load_missing = [r"dropout"]
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super().__init__(config, *inputs, **kwargs)
|
||||
|
||||
self.num_labels = config.num_labels
|
||||
self.bert = TFBertMainLayer(config, name="bert")
|
||||
self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
|
||||
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
|
||||
self.classifier = tf.keras.layers.Dense(
|
||||
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
|
||||
@@ -1571,15 +1599,20 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
|
||||
BERT_START_DOCSTRING,
|
||||
)
|
||||
class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss):
|
||||
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"pooler"]
|
||||
# names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
|
||||
_keys_to_ignore_on_load_unexpected = [
|
||||
r"pooler",
|
||||
r"mlm___cls",
|
||||
r"nsp___cls",
|
||||
r"cls.predictions",
|
||||
r"cls.seq_relationship",
|
||||
]
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super().__init__(config, *inputs, **kwargs)
|
||||
|
||||
self.num_labels = config.num_labels
|
||||
self.bert = TFBertMainLayer(config, name="bert")
|
||||
self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
|
||||
self.qa_outputs = tf.keras.layers.Dense(
|
||||
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user