Full rework of the TF input/output embeddings and bias resizing (#9193)
* Start rework resizing * Rework bias/decoder resizing * Full resizing rework * Full resizing rework * Start to update the models with the new approach * Finish to update the models * Update all the tests * Update the template * Fix tests * Fix tests * Test a new approach * Refactoring * Refactoring * Refactoring * New rework * Rework BART * Rework bert+blenderbot * Rework CTRL * Rework Distilbert * Rework DPR * Rework Electra * Rework Flaubert * Rework Funnel * Rework GPT2 * Rework Longformer * Rework Lxmert * Rework marian+mbart * Rework mobilebert * Rework mpnet * Rework openai * Rework pegasus * Rework Roberta * Rework T5 * Rework xlm+xlnet * Rework template * Fix TFT5EncoderOnly + DPRs * Restore previous methods * Fix Funnel * Fix CTRL and TransforXL * Apply style * Apply Sylvain's comments * Restore a test in DPR * Address the comments * Fix bug * Apply style * remove unused import * Fix test * Forgot a method * missing test * Trigger CI * naming update * Rebase * Trigger CI
This commit is contained in:
@@ -530,6 +530,46 @@ def load_tf_weights(model, resolved_archive_file):
|
||||
return missing_layers, unexpected_layers
|
||||
|
||||
|
||||
def init_copy_embeddings(old_embeddings, new_num_tokens):
    r"""
    Prepare the part of a resized embedding matrix that is carried over from ``old_embeddings``, together with a
    boolean mask telling which rows were actually taken from the old weights.

    The old weights are truncated when ``new_num_tokens`` is not greater than the old number of tokens, and extended
    with rows filled with ``-1`` otherwise. Example:

        - if new_num_tokens=5 and old_num_tokens=4 and old_embeddings=[w1,w2,w3,w4]

            - mask=[True,True,True,True,False] and current_weights=[w1,w2,w3,w4,-1]
        - if new_num_tokens=4 and old_num_tokens=5 and old_embeddings=[w1,w2,w3,w4,w5]

            - mask=[True,True,True,True] and current_weights=[w1,w2,w3,w4]

    Args:
        old_embeddings:
            The weights to copy from. ``shape_list`` must yield exactly two dimensions for it and it must expose a
            ``value()`` method.
        new_num_tokens:
            The number of rows wanted in the result.

    Return:
        A ``(mask, current_weights)`` tuple. ``mask`` has shape ``[new_num_tokens, 1]`` and ``current_weights`` has
        ``new_num_tokens`` rows.
    """
    old_num_tokens, old_embedding_dim = shape_list(old_embeddings)
    size_diff = new_num_tokens - old_num_tokens

    if not tf.math.greater(size_diff, 0):
        # Same size or shrinking: keep the leading `new_num_tokens` rows; every kept row comes from the old weights.
        kept_rows = tf.slice(
            old_embeddings.value(),
            tf.convert_to_tensor([0, 0]),
            tf.convert_to_tensor([new_num_tokens, old_embedding_dim]),
        )
        all_kept = tf.fill(tf.convert_to_tensor([new_num_tokens, 1]), True)

        return all_kept, kept_rows

    # Growing: append `size_diff` placeholder rows (-1) after the old ones and flag them as False in the mask so the
    # caller can substitute other values for them.
    row_padding = tf.convert_to_tensor([[0, size_diff], [0, 0]])
    padded_rows = tf.pad(old_embeddings.value(), row_padding, constant_values=-1)
    copied_rows = tf.fill(tf.convert_to_tensor([min(old_num_tokens, new_num_tokens), 1]), True)
    row_mask = tf.pad(copied_rows, row_padding, constant_values=False)

    return row_mask, padded_rows
|
||||
|
||||
|
||||
class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
|
||||
r"""
|
||||
Base class for all TF models.
|
||||
@@ -615,58 +655,132 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
|
||||
|
||||
def get_input_embeddings(self) -> tf.keras.layers.Layer:
|
||||
"""
|
||||
Returns the model's input embeddings.
|
||||
Returns the model's input embeddings layer.
|
||||
|
||||
Returns:
|
||||
:obj:`tf.keras.layers.Layer`: A torch module mapping vocabulary to hidden states.
|
||||
:obj:`tf.Variable`: The embeddings layer mapping vocabulary to hidden states.
|
||||
"""
|
||||
base_model = getattr(self, self.base_model_prefix, self)
|
||||
main_layer = getattr(self, self.base_model_prefix, self)
|
||||
|
||||
if base_model is not self:
|
||||
return base_model.get_input_embeddings()
|
||||
if main_layer is not self:
|
||||
return main_layer.get_input_embeddings()
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
"""
|
||||
Set model's input embeddings.
|
||||
Set model's input embeddings
|
||||
|
||||
Args:
|
||||
value (:obj:`tf.keras.layers.Layer`):
|
||||
A module mapping vocabulary to hidden states.
|
||||
value (:obj:`tf.Variable`):
|
||||
The new weights mapping hidden states to vocabulary.
|
||||
"""
|
||||
base_model = getattr(self, self.base_model_prefix, self)
|
||||
if base_model is not self:
|
||||
base_model.set_input_embeddings(value)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
main_layer = getattr(self, self.base_model_prefix)
|
||||
|
||||
def get_output_embeddings(self) -> tf.keras.layers.Layer:
|
||||
if main_layer is None:
|
||||
raise NotImplementedError("The model does not implements the base_model_prefix attribute.")
|
||||
|
||||
try:
|
||||
main_layer.set_input_embeddings(value)
|
||||
except AttributeError:
|
||||
logger.info("Building the model")
|
||||
self(self.dummy_inputs)
|
||||
main_layer.set_input_embeddings(value)
|
||||
|
||||
def get_output_embeddings(self) -> Union[None, tf.keras.layers.Layer]:
    """
    Returns the model's output embeddings

    Returns:
        Whatever the LM head's :obj:`get_output_embeddings()` returns, or :obj:`None` when :obj:`get_lm_head()`
        returns :obj:`None`.
    """
    # Query the head once instead of once for the check and once for the use.
    lm_head = self.get_lm_head()

    if lm_head is not None:
        return lm_head.get_output_embeddings()

    return None  # Overwrite for models with output embeddings
|
||||
|
||||
def set_output_embeddings(self, value):
    """
    Set model's output embeddings

    Args:
        value:
            The new output embeddings, forwarded unchanged to the LM head's :obj:`set_output_embeddings`.

    Does nothing when :obj:`get_lm_head()` returns :obj:`None`.
    """
    head = self.get_lm_head()

    if head is None:
        return

    try:
        head.set_output_embeddings(value)
    except AttributeError:
        # The head raised AttributeError: run the model once on its dummy inputs, then retry.
        logger.info("Building the model")
        self(self.dummy_inputs)
        head.set_output_embeddings(value)
|
||||
|
||||
def get_output_layer_with_bias(self) -> Union[None, tf.keras.layers.Layer]:
|
||||
"""
|
||||
Get the layer that handles a bias attribute in case the model has an LM head with weights tied to the
|
||||
embeddings.
|
||||
embeddings
|
||||
|
||||
Return:
|
||||
:obj:`tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
|
||||
"""
|
||||
return None
|
||||
warnings.warn(
|
||||
"The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning
|
||||
)
|
||||
return self.get_lm_head()
|
||||
|
||||
def get_prefix_bias_name(self) -> Union[None, str]:
    """
    Get the concatenated prefix name of the bias from the model name to the parent layer

    Deprecated: emits a :obj:`FutureWarning` pointing to :obj:`get_bias`.

    Return:
        :obj:`None` in this base implementation.
    """
    deprecation_message = "The method get_prefix_bias_name is deprecated. Please use `get_bias` instead."
    warnings.warn(deprecation_message, FutureWarning)

    return None
|
||||
|
||||
def get_bias(self) -> Union[None, Dict[str, tf.Variable]]:
    """
    Dict of bias attached to an LM head. The key represents the name of the bias attribute.

    Return:
        The value returned by the LM head's :obj:`get_bias()`, or :obj:`None` when :obj:`get_lm_head()` returns
        :obj:`None`.
    """
    head = self.get_lm_head()

    if head is None:
        return None

    try:
        return head.get_bias()
    except AttributeError:
        # The head raised AttributeError: run the model once on its dummy inputs, then ask again.
        self(self.dummy_inputs)

        return head.get_bias()
|
||||
|
||||
def set_bias(self, value):
    """
    Set all the bias in the LM head.

    Args:
        value:
            All the new bias attached to an LM head, forwarded unchanged to the head's :obj:`set_bias`.

    Does nothing when :obj:`get_lm_head()` returns :obj:`None`.
    """
    head = self.get_lm_head()

    if head is None:
        return

    try:
        head.set_bias(value)
    except AttributeError:
        # The head raised AttributeError: run the model once on its dummy inputs, then retry.
        self(self.dummy_inputs)
        head.set_bias(value)
|
||||
|
||||
def get_lm_head(self) -> Union[None, tf.keras.layers.Layer]:
    """
    The LM Head layer. This method must be overwritten by all the models that have a lm head.

    Return:
        :obj:`tf.keras.layers.Layer`: The LM head layer if the model has one, None if not. This base implementation
        always returns :obj:`None`.
    """
    # The annotation includes None because this default returns None.
    return None
|
||||
|
||||
def resize_token_embeddings(self, new_num_tokens=None) -> tf.Variable:
|
||||
@@ -685,46 +799,151 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
|
||||
Return:
|
||||
:obj:`tf.Variable`: Pointer to the input tokens Embeddings Module of the model.
|
||||
"""
|
||||
if new_num_tokens is None or new_num_tokens == self.config.vocab_size:
|
||||
return self._get_word_embedding_weight(self.get_input_embeddings())
|
||||
|
||||
model_embeds = self._resize_token_embeddings(new_num_tokens)
|
||||
if new_num_tokens is None:
|
||||
return model_embeds
|
||||
|
||||
return model_embeds
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
# get_input_embeddings and set_input_embeddings need to be implemented in base layer.
|
||||
base_model = getattr(self, self.base_model_prefix, self)
|
||||
old_embeddings = base_model.get_input_embeddings()
|
||||
new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
|
||||
base_model.set_input_embeddings(new_embeddings)
|
||||
# Update base model and current model config
|
||||
self.config.vocab_size = new_num_tokens
|
||||
base_model.vocab_size = new_num_tokens
|
||||
return base_model.get_input_embeddings()
|
||||
|
||||
def _get_word_embeddings(self, embeddings):
|
||||
if hasattr(embeddings, "word_embeddings"):
|
||||
# TFBertEmbeddings, TFAlbertEmbeddings, TFElectraEmbeddings
|
||||
return embeddings.word_embeddings
|
||||
elif hasattr(embeddings, "weight"):
|
||||
# TFSharedEmbeddings
|
||||
return embeddings.weight
|
||||
return model_embeds
|
||||
|
||||
def _get_word_embedding_weight(self, embedding_layer):
|
||||
if hasattr(embedding_layer, "word_embeddings"):
|
||||
return embedding_layer.word_embeddings
|
||||
elif hasattr(embedding_layer, "weight"):
|
||||
return embedding_layer.weight
|
||||
elif hasattr(embedding_layer, "decoder"):
|
||||
return embedding_layer.decoder
|
||||
else:
|
||||
# Here we build the word embeddings weights if not exists.
|
||||
# And then we retry to get the attribute once built.
|
||||
embeddings.build([])
|
||||
if hasattr(embeddings, "word_embeddings"):
|
||||
# TFBertEmbeddings, TFAlbertEmbeddings, TFElectraEmbeddings
|
||||
return embeddings.word_embeddings
|
||||
elif hasattr(embeddings, "weight"):
|
||||
# TFSharedEmbeddings
|
||||
return embeddings.weight
|
||||
self(self.dummy_inputs)
|
||||
if hasattr(embedding_layer, "word_embeddings"):
|
||||
return embedding_layer.word_embeddings
|
||||
elif hasattr(embedding_layer, "weight"):
|
||||
return embedding_layer.weight
|
||||
elif hasattr(embedding_layer, "decoder"):
|
||||
return embedding_layer.decoder
|
||||
else:
|
||||
raise ValueError("word embedding is not defined.")
|
||||
return None
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
    """
    Resize the input embeddings and, when the model exposes them, the LM head bias and output embeddings.

    Args:
        new_num_tokens:
            The number of tokens forwarded to each of the resizing helpers.

    Return:
        The value of :obj:`get_input_embeddings()` after the new weights have been set.
    """
    input_weights = self._get_word_embedding_weight(self.get_input_embeddings())
    resized_input_weights = self._get_resized_embeddings(input_weights, new_num_tokens)

    # Resize the LM head bias when the model reports one.
    current_bias = self.get_bias()
    if current_bias is not None:
        self.set_bias(self._get_resized_lm_head_bias(current_bias, new_num_tokens))

    # Resize the LM head decoder when the model reports output embeddings.
    output_layer = self.get_output_embeddings()
    if output_layer is not None:
        decoder_weights = self._get_word_embedding_weight(output_layer)
        self.set_output_embeddings(self._get_resized_lm_head_decoder(decoder_weights, new_num_tokens))

    # The input embeddings are swapped last: the steps above still read the current ones.
    self.set_input_embeddings(resized_input_weights)

    return self.get_input_embeddings()
|
||||
|
||||
def _get_resized_lm_head_bias(self, old_lm_head_bias, new_num_tokens):
    """
    Build resized biases from the old ones. Increasing the size adds zero-initialized entries at the end. Reducing
    the size removes entries from the end.

    Args:
        old_lm_head_bias:
            Mapping from bias attribute name to the old bias weight. Each weight is either rank 1 or has a shape
            that :obj:`shape_list` unpacks into two dimensions, the last one being the number of tokens.
        new_num_tokens (:obj:`int`):
            New number of tokens along the token dimension of every bias.

    Return:
        A dict with the same keys as :obj:`old_lm_head_bias`, holding the newly created weights.
    """
    resized = {}

    for attr, weight in old_lm_head_bias.items():
        if tf.rank(weight) == 1:
            first_dim, old_num_tokens = None, shape_list(weight)[0]
        else:
            first_dim, old_num_tokens = shape_list(weight)

        is_vector = first_dim is None
        size_diff = new_num_tokens - old_num_tokens
        final_shape = [new_num_tokens] if is_vector else [first_dim, new_num_tokens]

        if tf.math.greater(size_diff, 0):
            # Growing: pad the token dimension with -1 placeholders and mark them False in the mask.
            padding_shape = [[0, size_diff]] if is_vector else [[0, 0], [0, size_diff]]
            num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
            mask_shape = [num_tokens_to_copy] if is_vector else [1, num_tokens_to_copy]
            current_bias = tf.pad(weight.value(), tf.convert_to_tensor(padding_shape), constant_values=-1)
            bias_mask = tf.pad(
                tf.fill(tf.convert_to_tensor(mask_shape), True),
                tf.convert_to_tensor(padding_shape),
                constant_values=False,
            )
        else:
            # Same size or shrinking: keep the leading entries, all of them coming from the old bias.
            slice_from = [0] if is_vector else [0, 0]
            current_bias = tf.slice(
                weight.value(), tf.convert_to_tensor(slice_from), tf.convert_to_tensor(final_shape)
            )
            bias_mask = tf.fill(tf.convert_to_tensor(final_shape), True)

        # The new weight reuses the old weight's name without its ":<n>" suffix.
        new_bias = self.add_weight(
            shape=final_shape,
            initializer="zeros",
            trainable=True,
            name=weight.name.split(":")[0],
        )
        # Old values where the mask is True, the freshly initialized values elsewhere.
        new_bias.assign(tf.where(bias_mask, current_bias, new_bias.value()))
        resized[attr] = new_bias

    return resized
|
||||
|
||||
def _get_resized_lm_head_decoder(self, old_lm_head_decoder, new_num_tokens):
    """
    Build a resized decoder from the old one. Increasing the size adds zero-initialized rows at the end. Reducing
    the size removes rows from the end.

    Args:
        old_lm_head_decoder (:obj:`tf.Variable`):
            Old lm head decoder to be resized.
        new_num_tokens (:obj:`int`):
            New number of tokens (rows) in the decoder matrix.

    Return:
        :obj:`None` if :obj:`old_lm_head_decoder` is :obj:`None`; :obj:`old_lm_head_decoder` itself when it is
        considered equal to the input embeddings; otherwise the newly created, resized weight.
    """
    # Guard first so a missing decoder is never compared against the input embeddings.
    if old_lm_head_decoder is None:
        return None

    input_weights = self._get_word_embedding_weight(self.get_input_embeddings())
    # NOTE(review): `reduce_any` treats the two weights as tied as soon as a single element matches;
    # confirm whether an identity or `reduce_all` check was intended.
    is_input_output_equals = tf.reduce_any(input_weights == old_lm_head_decoder)

    if is_input_output_equals:
        # Considered tied to the input embeddings: nothing to resize here.
        return old_lm_head_decoder

    old_embedding_dim = shape_list(old_lm_head_decoder)[1]
    decoder_mask, current_decoder = init_copy_embeddings(old_lm_head_decoder, new_num_tokens)
    # The new weight reuses the old weight's name without its ":<n>" suffix.
    new_lm_head_decoder = self.add_weight(
        shape=(new_num_tokens, old_embedding_dim),
        initializer="zeros",
        trainable=True,
        name=old_lm_head_decoder.name.split(":")[0],
    )
    # Old rows where the mask is True, the freshly initialized rows elsewhere.
    init_decoder = tf.where(decoder_mask, current_decoder, new_lm_head_decoder.value())
    new_lm_head_decoder.assign(init_decoder)

    return new_lm_head_decoder
|
||||
|
||||
def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None) -> tf.Variable:
|
||||
"""
|
||||
Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly
|
||||
Build a resized Embedding weights from a provided token Embedding weights. Increasing the size will add newly
|
||||
initialized vectors at the end. Reducing the size will remove vectors from the end
|
||||
|
||||
Args:
|
||||
@@ -741,87 +960,18 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
|
||||
:obj:`tf.Variable`: Pointer to the resized Embedding Module or the old Embedding Module if
|
||||
:obj:`new_num_tokens` is :obj:`None`
|
||||
"""
|
||||
word_embeddings = self._get_word_embeddings(old_embeddings)
|
||||
bias_layer = self.get_output_layer_with_bias()
|
||||
|
||||
if new_num_tokens is None:
|
||||
return word_embeddings
|
||||
|
||||
old_num_tokens, old_embedding_dim = word_embeddings.shape
|
||||
|
||||
if old_num_tokens == new_num_tokens:
|
||||
return word_embeddings
|
||||
|
||||
# initialize new embeddings
|
||||
# todo: initializer range is not always passed in config.
|
||||
old_embedding_dim = shape_list(old_embeddings)[1]
|
||||
init_range = getattr(self.config, "initializer_range", 0.02)
|
||||
name = (
|
||||
self.name
|
||||
+ "/"
|
||||
+ self.base_model_prefix
|
||||
+ "/"
|
||||
+ old_embeddings.name
|
||||
+ "/"
|
||||
+ word_embeddings.name.split(":")[0]
|
||||
)
|
||||
embeddings_mask, current_embeddings = init_copy_embeddings(old_embeddings, new_num_tokens)
|
||||
new_embeddings = self.add_weight(
|
||||
name=name,
|
||||
name=old_embeddings.name.split(":")[0],
|
||||
shape=[new_num_tokens, old_embedding_dim],
|
||||
initializer=get_initializer(init_range),
|
||||
dtype=tf.float32,
|
||||
)
|
||||
init_weights = tf.make_ndarray(tf.make_tensor_proto(new_embeddings.value()))
|
||||
init_embeddings = tf.where(embeddings_mask, current_embeddings, new_embeddings.value())
|
||||
|
||||
# Copy token embeddings from the previous weights
|
||||
num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
|
||||
init_weights[:num_tokens_to_copy] = word_embeddings.value()[:num_tokens_to_copy, :]
|
||||
new_embeddings.assign(init_weights)
|
||||
|
||||
if bias_layer is not None:
|
||||
if not hasattr(bias_layer, "bias"):
|
||||
bias_layer.build([])
|
||||
|
||||
# Second check in order to be sure the attribute has been properly created
|
||||
if not hasattr(bias_layer, "bias"):
|
||||
raise ValueError("bias is not defined.")
|
||||
|
||||
# initialize bias
|
||||
init_bias = np.zeros((new_num_tokens,))
|
||||
init_bias[:num_tokens_to_copy] = bias_layer.bias.value()[
|
||||
:num_tokens_to_copy
|
||||
] # tf.make_ndarray(tf.make_tensor_proto(bias_layer.bias.value()))[:num_tokens_to_copy]
|
||||
|
||||
bias_layer.bias = self.add_weight(
|
||||
shape=(new_num_tokens,),
|
||||
initializer="zeros",
|
||||
trainable=True,
|
||||
name=self.get_prefix_bias_name() + "/bias",
|
||||
)
|
||||
|
||||
bias_layer.bias.assign(init_bias)
|
||||
|
||||
output_embeddings = self.get_output_embeddings()
|
||||
|
||||
if output_embeddings is not None:
|
||||
if self.get_input_embeddings() != output_embeddings:
|
||||
if not hasattr(output_embeddings, "decoder"):
|
||||
output_embeddings.build([])
|
||||
|
||||
# Second check in order to be sure the attribute has been properly created
|
||||
if not hasattr(output_embeddings, "decoder"):
|
||||
raise ValueError("decoder is not defined.")
|
||||
|
||||
# initialize decoder
|
||||
init_weights = np.zeros((new_num_tokens, old_embedding_dim))
|
||||
init_weights[:num_tokens_to_copy] = output_embeddings.decoder.value()[:num_tokens_to_copy, :]
|
||||
|
||||
output_embeddings.decoder = self.add_weight(
|
||||
shape=(new_num_tokens, old_embedding_dim),
|
||||
initializer="zeros",
|
||||
trainable=True,
|
||||
name=self.get_prefix_bias_name() + "/decoder/weight",
|
||||
)
|
||||
output_embeddings.decoder.assign(init_weights)
|
||||
new_embeddings.assign(init_embeddings)
|
||||
|
||||
return new_embeddings
|
||||
|
||||
|
||||
@@ -470,6 +470,21 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
|
||||
|
||||
super().build(input_shape)
|
||||
|
||||
def get_output_embeddings(self):
    """Return the object stored in ``self.decoder``."""
    output_embeddings = self.decoder

    return output_embeddings
|
||||
|
||||
def set_output_embeddings(self, value):
    """
    Store ``value`` as the decoder's ``word_embeddings`` and set the decoder's ``vocab_size`` to the first entry
    of ``shape_list(value)``.
    """
    decoder = self.decoder
    decoder.word_embeddings = value
    decoder.vocab_size = shape_list(value)[0]
|
||||
|
||||
def get_bias(self):
    """Return both biases of this head in a dict keyed by attribute name."""
    biases = {"bias": self.bias, "decoder_bias": self.decoder_bias}

    return biases
|
||||
|
||||
def set_bias(self, value):
    """
    Replace both biases of this head from ``value`` (keys ``"bias"`` and ``"decoder_bias"``) and set
    ``vocab_size`` to the first entry of ``shape_list`` of the new ``"bias"``.
    """
    new_bias = value["bias"]
    self.bias = new_bias
    self.decoder_bias = value["decoder_bias"]
    self.vocab_size = shape_list(new_bias)[0]
|
||||
|
||||
def call(self, hidden_states):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.activation(hidden_states)
|
||||
@@ -505,10 +520,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.embeddings.word_embeddings = value
|
||||
self.embeddings.vocab_size = value.shape[0]
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
raise NotImplementedError
|
||||
self.embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
"""
|
||||
@@ -835,34 +847,8 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel):
|
||||
self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
|
||||
self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.albert.embeddings
|
||||
|
||||
def resize_token_embeddings(self, new_num_tokens):
|
||||
super().resize_token_embeddings(new_num_tokens=new_num_tokens)
|
||||
|
||||
# ALBERT is a special case where there are two bias to update
|
||||
# even though self.bias is not used anywhere and is here
|
||||
# just to make the loading weights from a PT model happy
|
||||
if new_num_tokens is not None:
|
||||
num_tokens_to_copy = min(self.predictions.bias.shape[0], new_num_tokens)
|
||||
self.predictions.vocab_size = num_tokens_to_copy
|
||||
init_bias = tf.zeros((new_num_tokens,))
|
||||
init_bias[:num_tokens_to_copy] = self.predictions.bias.value()[:num_tokens_to_copy]
|
||||
name = self.name + "/" + self.predictions.name + "/bias"
|
||||
self.predictions.bias = self.add_weight(
|
||||
shape=(new_num_tokens,), initializer="zeros", trainable=True, name=name
|
||||
)
|
||||
self.predictions.bias.assign(init_bias)
|
||||
|
||||
init_decoder_bias = tf.zeros((new_num_tokens,))
|
||||
init_decoder_bias[:num_tokens_to_copy] = self.predictions.decoder_bias.value()[:num_tokens_to_copy]
|
||||
name = self.name + "/" + self.predictions.name + "/decoder_bias"
|
||||
self.predictions.decoder_bias = self.add_weight(
|
||||
shape=(new_num_tokens,), initializer="zeros", trainable=True, name=name
|
||||
)
|
||||
|
||||
self.predictions.decoder_bias.assign(init_decoder_bias)
|
||||
def get_lm_head(self):
|
||||
return self.predictions
|
||||
|
||||
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@replace_return_docstrings(output_type=TFAlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
|
||||
@@ -980,34 +966,8 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss)
|
||||
self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
|
||||
self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.albert.embeddings
|
||||
|
||||
def resize_token_embeddings(self, new_num_tokens):
|
||||
super().resize_token_embeddings(new_num_tokens=new_num_tokens)
|
||||
|
||||
# ALBERT is a special case where there are two bias to update
|
||||
# even though self.bias is not used anywhere and is here
|
||||
# just to make the loading weights from a PT model happy
|
||||
if new_num_tokens is not None:
|
||||
num_tokens_to_copy = min(self.predictions.bias.shape[0], new_num_tokens)
|
||||
self.predictions.vocab_size = num_tokens_to_copy
|
||||
init_bias = tf.zeros((new_num_tokens,))
|
||||
init_bias[:num_tokens_to_copy] = self.predictions.bias.value()[:num_tokens_to_copy]
|
||||
name = self.name + "/" + self.predictions.name + "/bias"
|
||||
self.predictions.bias = self.add_weight(
|
||||
shape=(new_num_tokens,), initializer="zeros", trainable=True, name=name
|
||||
)
|
||||
self.predictions.bias.assign(init_bias)
|
||||
|
||||
init_decoder_bias = tf.zeros((new_num_tokens,))
|
||||
init_decoder_bias[:num_tokens_to_copy] = self.predictions.decoder_bias.value()[:num_tokens_to_copy]
|
||||
name = self.name + "/" + self.predictions.name + "/decoder_bias"
|
||||
self.predictions.decoder_bias = self.add_weight(
|
||||
shape=(new_num_tokens,), initializer="zeros", trainable=True, name=name
|
||||
)
|
||||
|
||||
self.predictions.decoder_bias.assign(init_decoder_bias)
|
||||
def get_lm_head(self):
|
||||
return self.predictions
|
||||
|
||||
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
|
||||
@@ -481,6 +481,29 @@ class TFBartPretrainedModel(TFPreTrainedModel):
|
||||
}
|
||||
return dummy_inputs
|
||||
|
||||
def get_input_embeddings(self):
    """
    Return the ``shared`` attribute of the base model, i.e. of the attribute named by ``base_model_prefix`` when
    it exists on this object, and of this object itself otherwise.
    """
    return getattr(self, self.base_model_prefix, self).shared
|
||||
|
||||
def set_input_embeddings(self, value):
    """
    Replace the weight of the base model's ``shared`` layer with ``value`` and hand a freshly wrapped version of
    that layer to both the encoder and the decoder.

    Args:
        value:
            The new weight assigned to ``shared.weight``.
    """
    main_model = getattr(self, self.base_model_prefix, self)

    try:
        main_model.shared.weight = value
    except AttributeError:
        # Assignment raised AttributeError: run the model once on its dummy inputs, then retry.
        self(self.dummy_inputs)
        main_model.shared.weight = value

    shared_layer = main_model.shared
    shared_layer.vocab_size = shape_list(shared_layer.weight)[0]

    # Enter the scope only to capture the scope object that is passed on as `abs_scope_name`.
    with tf.compat.v1.variable_scope("model.shared") as shared_abs_scope_name:
        pass

    wrapped_embeddings = TFWrappedEmbeddings(shared_layer, abs_scope_name=shared_abs_scope_name)
    main_model.encoder.set_embed_tokens(wrapped_embeddings)
    main_model.decoder.set_embed_tokens(wrapped_embeddings)
|
||||
|
||||
@tf.function(
|
||||
input_signature=[
|
||||
{
|
||||
@@ -634,6 +657,9 @@ class TFBartEncoder(tf.keras.layers.Layer):
|
||||
else None
|
||||
)
|
||||
|
||||
def set_embed_tokens(self, embed_tokens):
|
||||
self.embed_tokens = embed_tokens
|
||||
|
||||
def call(
|
||||
self,
|
||||
input_ids=None,
|
||||
@@ -791,6 +817,9 @@ class TFBartDecoder(tf.keras.layers.Layer):
|
||||
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
||||
self.do_blenderbot_90_layernorm = config.do_blenderbot_90_layernorm
|
||||
|
||||
def set_embed_tokens(self, embed_tokens):
|
||||
self.embed_tokens = embed_tokens
|
||||
|
||||
def call(
|
||||
self,
|
||||
input_ids=None,
|
||||
@@ -1009,6 +1038,9 @@ class TFBartModel(TFBartPretrainedModel):
|
||||
self.encoder = TFBartEncoder(config, embed_tokens, name="encoder")
|
||||
self.decoder = TFBartDecoder(config, embed_tokens, name="decoder")
|
||||
|
||||
def get_encoder(self):
|
||||
return self.encoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.decoder
|
||||
|
||||
@@ -1134,15 +1166,6 @@ class TFBartModel(TFBartPretrainedModel):
|
||||
encoder_attentions=enc_attns,
|
||||
)
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.shared = value
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The BART Model with a language modeling head. Can be used for summarization.",
|
||||
@@ -1166,22 +1189,20 @@ class TFBartForConditionalGeneration(TFBartPretrainedModel):
|
||||
def get_decoder(self):
|
||||
return self.model.decoder
|
||||
|
||||
def resize_token_embeddings(self, new_num_tokens):
|
||||
super().resize_token_embeddings(new_num_tokens=new_num_tokens)
|
||||
def get_encoder(self):
|
||||
return self.model.encoder
|
||||
|
||||
# BART is a special case where the bias has two dimensions
|
||||
# and not named just `bias`
|
||||
if new_num_tokens is not None:
|
||||
num_tokens_to_copy = min(self.final_logits_bias.shape[0], new_num_tokens)
|
||||
init_bias = tf.zeros((new_num_tokens,))
|
||||
init_bias[:num_tokens_to_copy] = self.final_logits_bias.value()[:num_tokens_to_copy]
|
||||
self.final_logits_bias = self.add_weight(
|
||||
shape=(1, new_num_tokens),
|
||||
initializer="zeros",
|
||||
trainable=False,
|
||||
name="final_logits_bias",
|
||||
)
|
||||
self.final_logits_bias.assign(init_bias)
|
||||
def get_output_embeddings(self):
    """Return the same object as :obj:`get_input_embeddings()`."""
    output_embeddings = self.get_input_embeddings()

    return output_embeddings
|
||||
|
||||
def set_output_embeddings(self, value):
    """Forward ``value`` to :obj:`set_input_embeddings`."""
    # Output and input embeddings go through the same setter.
    self.set_input_embeddings(value)
|
||||
|
||||
def get_bias(self):
    """Return ``self.final_logits_bias`` in a dict under the key ``"final_logits_bias"``."""
    biases = {"final_logits_bias": self.final_logits_bias}

    return biases
|
||||
|
||||
def set_bias(self, value):
    """Replace ``self.final_logits_bias`` with the ``"final_logits_bias"`` entry of ``value``."""
    new_bias = value["final_logits_bias"]
    self.final_logits_bias = new_bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
|
||||
@replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
|
||||
@@ -1356,12 +1377,6 @@ class TFBartForConditionalGeneration(TFBartPretrainedModel):
|
||||
else:
|
||||
return logits
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.model.shared
|
||||
|
||||
def get_encoder(self):
|
||||
return self.model.encoder
|
||||
|
||||
def compute_loss(self, labels, logits):
|
||||
"""CrossEntropyLoss that ignores pad tokens"""
|
||||
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
# limitations under the License.
|
||||
""" TF 2.0 BERT model. """
|
||||
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple
|
||||
|
||||
@@ -526,6 +527,20 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
|
||||
|
||||
super().build(input_shape)
|
||||
|
||||
def get_output_embeddings(self):
    """Return the object stored in ``self.input_embeddings``."""
    output_embeddings = self.input_embeddings

    return output_embeddings
|
||||
|
||||
def set_output_embeddings(self, value):
    """
    Store ``value`` as the ``word_embeddings`` of ``self.input_embeddings`` and set that object's ``vocab_size``
    to the first entry of ``shape_list(value)``.
    """
    embeddings = self.input_embeddings
    embeddings.word_embeddings = value
    embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def get_bias(self):
    """Return ``self.bias`` in a dict under the key ``"bias"``."""
    biases = {"bias": self.bias}

    return biases
|
||||
|
||||
def set_bias(self, value):
    """
    Replace ``self.bias`` with the ``"bias"`` entry of ``value`` and set ``vocab_size`` to the first entry of
    ``shape_list`` of that new bias.
    """
    new_bias = value["bias"]
    self.bias = new_bias
    self.vocab_size = shape_list(new_bias)[0]
|
||||
|
||||
def call(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
||||
@@ -582,7 +597,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.embeddings.word_embeddings = value
|
||||
self.embeddings.vocab_size = value.shape[0]
|
||||
self.embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
"""
|
||||
@@ -918,13 +933,11 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
|
||||
self.nsp = TFBertNSPHead(config, name="nsp___cls")
|
||||
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.bert.embeddings
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.mlm.predictions
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name
|
||||
|
||||
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@@ -1044,13 +1057,11 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
|
||||
self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
|
||||
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.bert.embeddings
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.mlm.predictions
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name
|
||||
|
||||
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@@ -1149,13 +1160,11 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
|
||||
self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
|
||||
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.bert.embeddings
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.mlm.predictions
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name
|
||||
|
||||
@add_code_sample_docstrings(
|
||||
|
||||
@@ -15,6 +15,8 @@
|
||||
# limitations under the License.
|
||||
""" TF 2.0 CTRL model."""
|
||||
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
@@ -242,10 +244,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.w.weight = value
|
||||
self.w.vocab_size = value.shape[0]
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
raise NotImplementedError
|
||||
self.w.vocab_size = shape_list(value)[0]
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
"""
|
||||
@@ -618,6 +617,20 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
|
||||
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
||||
super().build(input_shape)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.input_embeddings
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.input_embeddings.weight = value
|
||||
self.input_embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def get_bias(self):
|
||||
return {"bias": self.bias}
|
||||
|
||||
def set_bias(self, value):
|
||||
self.bias = value["bias"]
|
||||
self.vocab_size = shape_list(value["bias"])[0]
|
||||
|
||||
def call(self, hidden_states):
|
||||
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
||||
hidden_states = hidden_states + self.bias
|
||||
@@ -638,13 +651,11 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
|
||||
|
||||
self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.lm_head.input_embeddings
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.lm_head
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.lm_head.name
|
||||
|
||||
def prepare_inputs_for_generation(self, inputs, past, **kwargs):
|
||||
|
||||
@@ -16,6 +16,8 @@
|
||||
TF 2.0 DistilBERT model
|
||||
"""
|
||||
|
||||
import warnings
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from ...activations_tf import get_tf_activation
|
||||
@@ -39,7 +41,6 @@ from ...modeling_tf_utils import (
|
||||
TFPreTrainedModel,
|
||||
TFQuestionAnsweringLoss,
|
||||
TFSequenceClassificationLoss,
|
||||
TFSharedEmbeddings,
|
||||
TFTokenClassificationLoss,
|
||||
get_initializer,
|
||||
input_processing,
|
||||
@@ -72,9 +73,6 @@ class TFEmbeddings(tf.keras.layers.Layer):
|
||||
self.vocab_size = config.vocab_size
|
||||
self.dim = config.dim
|
||||
self.initializer_range = config.initializer_range
|
||||
self.word_embeddings = TFSharedEmbeddings(
|
||||
config.vocab_size, config.dim, initializer_range=config.initializer_range, name="word_embeddings"
|
||||
) # padding_idx=0)
|
||||
self.position_embeddings = tf.keras.layers.Embedding(
|
||||
config.max_position_embeddings,
|
||||
config.dim,
|
||||
@@ -648,6 +646,20 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
|
||||
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
||||
super().build(input_shape)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.input_embeddings
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.input_embeddings.word_embeddings = value
|
||||
self.input_embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def get_bias(self):
|
||||
return {"bias": self.bias}
|
||||
|
||||
def set_bias(self, value):
|
||||
self.bias = value["bias"]
|
||||
self.vocab_size = shape_list(value["bias"])[0]
|
||||
|
||||
def call(self, hidden_states):
|
||||
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
||||
hidden_states = hidden_states + self.bias
|
||||
@@ -671,13 +683,11 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModel
|
||||
self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
|
||||
self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.vocab_projector.input_embeddings
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.vocab_projector
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.vocab_projector.name
|
||||
|
||||
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
|
||||
@@ -577,6 +577,10 @@ class TFDPRContextEncoder(TFDPRPretrainedContextEncoder):
|
||||
self.ctx_encoder = TFDPREncoderLayer(config, name="ctx_encoder")
|
||||
|
||||
def get_input_embeddings(self):
|
||||
try:
|
||||
return self.ctx_encoder.bert_model.get_input_embeddings()
|
||||
except AttributeError:
|
||||
self(self.dummy_inputs)
|
||||
return self.ctx_encoder.bert_model.get_input_embeddings()
|
||||
|
||||
@add_start_docstrings_to_model_forward(TF_DPR_ENCODERS_INPUTS_DOCSTRING)
|
||||
@@ -671,6 +675,10 @@ class TFDPRQuestionEncoder(TFDPRPretrainedQuestionEncoder):
|
||||
self.question_encoder = TFDPREncoderLayer(config, name="question_encoder")
|
||||
|
||||
def get_input_embeddings(self):
|
||||
try:
|
||||
return self.question_encoder.bert_model.get_input_embeddings()
|
||||
except AttributeError:
|
||||
self(self.dummy_inputs)
|
||||
return self.question_encoder.bert_model.get_input_embeddings()
|
||||
|
||||
@add_start_docstrings_to_model_forward(TF_DPR_ENCODERS_INPUTS_DOCSTRING)
|
||||
@@ -764,6 +772,10 @@ class TFDPRReader(TFDPRPretrainedReader):
|
||||
self.span_predictor = TFDPRSpanPredictorLayer(config, name="span_predictor")
|
||||
|
||||
def get_input_embeddings(self):
|
||||
try:
|
||||
return self.span_predictor.encoder.bert_model.get_input_embeddings()
|
||||
except AttributeError:
|
||||
self(self.dummy_inputs)
|
||||
return self.span_predictor.encoder.bert_model.get_input_embeddings()
|
||||
|
||||
@add_start_docstrings_to_model_forward(TF_DPR_READER_INPUTS_DOCSTRING)
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
# limitations under the License.
|
||||
""" TF Electra model. """
|
||||
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple
|
||||
|
||||
@@ -511,10 +512,7 @@ class TFElectraMainLayer(tf.keras.layers.Layer):
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.embeddings.word_embeddings = value
|
||||
self.embeddings.vocab_size = value.shape[0]
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
raise NotImplementedError
|
||||
self.embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
"""
|
||||
@@ -912,6 +910,20 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer):
|
||||
|
||||
super().build(input_shape)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.input_embeddings
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.input_embeddings.word_embeddings = value
|
||||
self.input_embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def get_bias(self):
|
||||
return {"bias": self.bias}
|
||||
|
||||
def set_bias(self, value):
|
||||
self.bias = value["bias"]
|
||||
self.vocab_size = shape_list(value["bias"])[0]
|
||||
|
||||
def call(self, hidden_states, training=False):
|
||||
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
||||
hidden_states = hidden_states + self.bias
|
||||
@@ -943,13 +955,11 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLos
|
||||
|
||||
self.generator_lm_head = TFElectraMaskedLMHead(config, self.electra.embeddings, name="generator_lm_head")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.electra.embeddings
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.generator_lm_head
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.generator_lm_head.name
|
||||
|
||||
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
|
||||
import itertools
|
||||
import random
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple
|
||||
|
||||
@@ -478,6 +479,10 @@ class TFFlaubertMainLayer(tf.keras.layers.Layer):
|
||||
def get_input_embeddings(self):
|
||||
return self.embeddings
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.embeddings.weight = value
|
||||
self.embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def call(
|
||||
self,
|
||||
input_ids=None,
|
||||
@@ -721,6 +726,20 @@ class TFFlaubertPredLayer(tf.keras.layers.Layer):
|
||||
|
||||
super().build(input_shape)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.input_embeddings
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.input_embeddings.weight = value
|
||||
self.input_embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def get_bias(self):
|
||||
return {"bias": self.bias}
|
||||
|
||||
def set_bias(self, value):
|
||||
self.bias = value["bias"]
|
||||
self.vocab_size = shape_list(value["bias"])[0]
|
||||
|
||||
def call(self, hidden_states):
|
||||
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
||||
hidden_states = hidden_states + self.bias
|
||||
@@ -767,13 +786,11 @@ class TFFlaubertWithLMHeadModel(TFFlaubertPreTrainedModel):
|
||||
self.transformer = TFFlaubertMainLayer(config, name="transformer")
|
||||
self.pred_layer = TFFlaubertPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.pred_layer.input_embeddings
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.pred_layer
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.pred_layer.name
|
||||
|
||||
def prepare_inputs_for_generation(self, inputs, **kwargs):
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
# limitations under the License.
|
||||
""" TF 2.0 Funnel model. """
|
||||
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple
|
||||
|
||||
@@ -787,7 +788,7 @@ class TFFunnelBaseLayer(tf.keras.layers.Layer):
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.embeddings.word_embeddings = value
|
||||
self.embeddings.vocab_size = value.shape[0]
|
||||
self.embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
raise NotImplementedError # Not implemented yet in the library for TF 2.0 models
|
||||
@@ -873,7 +874,7 @@ class TFFunnelMainLayer(tf.keras.layers.Layer):
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.embeddings.word_embeddings = value
|
||||
self.embeddings.vocab_size = value.shape[0]
|
||||
self.embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
raise NotImplementedError # Not implemented yet in the library for TF 2.0 models
|
||||
@@ -992,6 +993,20 @@ class TFFunnelMaskedLMHead(tf.keras.layers.Layer):
|
||||
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
||||
super().build(input_shape)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.input_embeddings
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.input_embeddings.word_embeddings = value
|
||||
self.input_embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def get_bias(self):
|
||||
return {"bias": self.bias}
|
||||
|
||||
def set_bias(self, value):
|
||||
self.bias = value["bias"]
|
||||
self.vocab_size = shape_list(value["bias"])[0]
|
||||
|
||||
def call(self, hidden_states, training=False):
|
||||
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
||||
hidden_states = hidden_states + self.bias
|
||||
@@ -1349,13 +1364,11 @@ class TFFunnelForMaskedLM(TFFunnelPreTrainedModel, TFMaskedLanguageModelingLoss)
|
||||
self.funnel = TFFunnelMainLayer(config, name="funnel")
|
||||
self.lm_head = TFFunnelMaskedLMHead(config, self.funnel.embeddings, name="lm_head")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.funnel.embeddings
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.lm_head
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.lm_head.name
|
||||
|
||||
@add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
|
||||
@@ -243,7 +243,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.wte.weight = value
|
||||
self.wte.vocab_size = self.wte.weight.shape[0]
|
||||
self.wte.vocab_size = shape_list(value)[0]
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
"""
|
||||
@@ -653,7 +653,10 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
|
||||
self.transformer = TFGPT2MainLayer(config, name="transformer")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.transformer.wte
|
||||
return self.get_input_embeddings()
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.set_input_embeddings(value)
|
||||
|
||||
def prepare_inputs_for_generation(self, inputs, past, **kwargs):
|
||||
# only last token for inputs_ids if past is defined in kwargs
|
||||
@@ -771,9 +774,6 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
|
||||
config, initializer_range=config.initializer_range, name="multiple_choice_head"
|
||||
)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.transformer.wte
|
||||
|
||||
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
|
||||
@replace_return_docstrings(output_type=TFGPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
|
||||
def call(
|
||||
@@ -945,9 +945,6 @@ class TFGPT2ForSequenceClassification(TFGPT2PreTrainedModel, TFSequenceClassific
|
||||
)
|
||||
self.transformer = TFGPT2MainLayer(config, name="transformer")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.transformer.wte
|
||||
|
||||
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
|
||||
@add_code_sample_docstrings(
|
||||
tokenizer_class=_TOKENIZER_FOR_DOC,
|
||||
|
||||
@@ -1182,6 +1182,44 @@ class TFLEDPreTrainedModel(TFPreTrainedModel):
|
||||
}
|
||||
return dummy_inputs
|
||||
|
||||
def get_input_embeddings(self):
|
||||
base_model = getattr(self, self.base_model_prefix, self)
|
||||
|
||||
return base_model.shared
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
base_model = getattr(self, self.base_model_prefix, self)
|
||||
|
||||
try:
|
||||
base_model.shared.weight = value
|
||||
except AttributeError:
|
||||
self(self.dummy_inputs)
|
||||
base_model.shared.weight = value
|
||||
|
||||
base_model.shared.vocab_size = shape_list(base_model.shared.weight)[0]
|
||||
|
||||
with tf.compat.v1.variable_scope("model.shared") as shared_abs_scope_name:
|
||||
pass
|
||||
|
||||
embed_tokens = TFWrappedEmbeddings(base_model.shared, abs_scope_name=shared_abs_scope_name)
|
||||
base_model.encoder.set_embed_tokens(embed_tokens)
|
||||
base_model.decoder.set_embed_tokens(embed_tokens)
|
||||
|
||||
@tf.function(
|
||||
input_signature=[
|
||||
{
|
||||
"input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"),
|
||||
"attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
|
||||
"decoder_input_ids": tf.TensorSpec((None, None), tf.int32, name="decoder_input_ids"),
|
||||
"decoder_attention_mask": tf.TensorSpec((None, None), tf.int32, name="decoder_attention_mask"),
|
||||
}
|
||||
]
|
||||
)
|
||||
def serving(self, inputs):
|
||||
output = self.call(inputs)
|
||||
|
||||
return self.serving_output(output)
|
||||
|
||||
|
||||
@dataclass
|
||||
# Copied from transformers.models.longformer.modeling_tf_longformer.TFLongformerBaseModelOutput with TFLongformer->TFLEDEncoder
|
||||
@@ -1483,6 +1521,9 @@ class TFLEDEncoder(tf.keras.layers.Layer):
|
||||
self.layers = [TFLEDEncoderLayer(config, i, name=f"layers.{i}") for i in range(config.encoder_layers)]
|
||||
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
|
||||
|
||||
def set_embed_tokens(self, embed_tokens):
|
||||
self.embed_tokens = embed_tokens
|
||||
|
||||
def call(
|
||||
self,
|
||||
input_ids=None,
|
||||
@@ -1714,6 +1755,9 @@ class TFLEDDecoder(tf.keras.layers.Layer):
|
||||
|
||||
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
||||
|
||||
def set_embed_tokens(self, embed_tokens):
|
||||
self.embed_tokens = embed_tokens
|
||||
|
||||
def call(
|
||||
self,
|
||||
input_ids=None,
|
||||
@@ -1921,6 +1965,9 @@ class TFLEDModel(TFLEDPreTrainedModel):
|
||||
self.encoder = TFLEDEncoder(config, embed_tokens, name="encoder")
|
||||
self.decoder = TFLEDDecoder(config, embed_tokens, name="decoder")
|
||||
|
||||
def get_encoder(self):
|
||||
return self.encoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.decoder
|
||||
|
||||
@@ -2047,15 +2094,6 @@ class TFLEDModel(TFLEDPreTrainedModel):
|
||||
encoder_global_attentions=enc_g_attns,
|
||||
)
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.shared = value
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The LED Model with a language modeling head. Can be used for summarization.",
|
||||
@@ -2079,22 +2117,20 @@ class TFLEDForConditionalGeneration(TFLEDPreTrainedModel):
|
||||
def get_decoder(self):
|
||||
return self.led.decoder
|
||||
|
||||
def resize_token_embeddings(self, new_num_tokens):
|
||||
super().resize_token_embeddings(new_num_tokens=new_num_tokens)
|
||||
def get_encoder(self):
|
||||
return self.led.encoder
|
||||
|
||||
# LED is a special case where the bias has two dimensions
|
||||
# and not named just `bias`
|
||||
if new_num_tokens is not None:
|
||||
num_tokens_to_copy = min(shape_list(self.final_logits_bias), new_num_tokens)
|
||||
init_bias = tf.zeros((new_num_tokens,))
|
||||
init_bias[:num_tokens_to_copy] = self.final_logits_bias.value()[:num_tokens_to_copy]
|
||||
self.final_logits_bias = self.add_weight(
|
||||
shape=(1, new_num_tokens),
|
||||
initializer="zeros",
|
||||
trainable=False,
|
||||
name="final_logits_bias",
|
||||
)
|
||||
self.final_logits_bias.assign(init_bias)
|
||||
def get_bias(self):
|
||||
return {"final_logits_bias": self.final_logits_bias}
|
||||
|
||||
def set_bias(self, value):
|
||||
self.final_logits_bias = value["final_logits_bias"]
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.get_input_embeddings()
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.set_input_embeddings(value)
|
||||
|
||||
@add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING)
|
||||
@replace_return_docstrings(output_type=TFLEDSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
|
||||
@@ -2266,12 +2302,6 @@ class TFLEDForConditionalGeneration(TFLEDPreTrainedModel):
|
||||
)
|
||||
return (past[0], reordered_past)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.led.shared
|
||||
|
||||
def get_encoder(self):
|
||||
return self.led.encoder
|
||||
|
||||
def compute_loss(self, labels, logits):
|
||||
"""CrossEntropyLoss that ignores pad tokens"""
|
||||
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
# limitations under the License.
|
||||
"""Tensorflow Longformer model. """
|
||||
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple
|
||||
|
||||
@@ -437,6 +438,20 @@ class TFLongformerLMHead(tf.keras.layers.Layer):
|
||||
|
||||
super().build(input_shape)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.decoder
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.decoder.word_embeddings = value
|
||||
self.decoder.vocab_size = shape_list(value)[0]
|
||||
|
||||
def get_bias(self):
|
||||
return {"bias": self.bias}
|
||||
|
||||
def set_bias(self, value):
|
||||
self.bias = value["bias"]
|
||||
self.vocab_size = shape_list(value["bias"])[0]
|
||||
|
||||
def call(self, hidden_states):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.act(hidden_states)
|
||||
@@ -1602,7 +1617,7 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.embeddings.word_embeddings = value
|
||||
self.embeddings.vocab_size = value.shape[0]
|
||||
self.embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
"""
|
||||
@@ -2040,13 +2055,11 @@ class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModel
|
||||
self.longformer = TFLongformerMainLayer(config, add_pooling_layer=False, name="longformer")
|
||||
self.lm_head = TFLongformerLMHead(config, self.longformer.embeddings, name="lm_head")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.lm_head.decoder
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.lm_head
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.lm_head.name
|
||||
|
||||
@add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
# limitations under the License.
|
||||
""" TF 2.0 LXMERT model. """
|
||||
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Optional, Tuple
|
||||
|
||||
@@ -706,10 +707,7 @@ class TFLxmertMainLayer(tf.keras.layers.Layer):
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.embeddings.word_embeddings = value
|
||||
self.embeddings.vocab_size = value.shape[0]
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
raise NotImplementedError
|
||||
self.embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
raise NotImplementedError
|
||||
@@ -1103,6 +1101,20 @@ class TFLxmertLMPredictionHead(tf.keras.layers.Layer):
|
||||
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
||||
super().build(input_shape)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.input_embeddings
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.input_embeddings.word_embeddings = value
|
||||
self.input_embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def get_bias(self):
|
||||
return {"bias": self.bias}
|
||||
|
||||
def set_bias(self, value):
|
||||
self.bias = value["bias"]
|
||||
self.vocab_size = shape_list(value["bias"])[0]
|
||||
|
||||
def call(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
||||
@@ -1292,13 +1304,11 @@ class TFLxmertForPreTraining(TFLxmertPreTrainedModel):
|
||||
**({"obj_labels": obj_labels} if self.config.task_obj_predict else {}),
|
||||
}
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.lxmert.embeddings
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.cls.predictions
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.cls.name + "/" + self.cls.predictions.name
|
||||
|
||||
@add_start_docstrings_to_model_forward(LXMERT_INPUTS_DOCSTRING)
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
# limitations under the License.
|
||||
""" TF 2.0 MobileBERT model. """
|
||||
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple
|
||||
|
||||
@@ -665,6 +666,20 @@ class TFMobileBertLMPredictionHead(tf.keras.layers.Layer):
|
||||
)
|
||||
super().build(input_shape)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.decoder = value
|
||||
self.vocab_size = shape_list(value)[0]
|
||||
|
||||
def get_bias(self):
|
||||
return {"bias": self.bias}
|
||||
|
||||
def set_bias(self, value):
|
||||
self.bias = value["bias"]
|
||||
self.vocab_size = shape_list(value["bias"])[0]
|
||||
|
||||
def call(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = tf.matmul(hidden_states, tf.concat([tf.transpose(self.decoder), self.dense], axis=0))
|
||||
@@ -704,10 +719,7 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.embeddings.word_embeddings = value
|
||||
self.embeddings.vocab_size = value.shape[0]
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
raise NotImplementedError
|
||||
self.embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
"""
|
||||
@@ -1039,13 +1051,11 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
|
||||
self.predictions = TFMobileBertMLMHead(config, name="predictions___cls")
|
||||
self.seq_relationship = TFMobileBertOnlyNSPHead(2, name="seq_relationship___cls")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.predictions.predictions
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.predictions.predictions
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.predictions.name + "/" + self.predictions.predictions.name
|
||||
|
||||
@add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@@ -1149,13 +1159,11 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel
|
||||
self.mobilebert = TFMobileBertMainLayer(config, add_pooling_layer=False, name="mobilebert")
|
||||
self.mlm = TFMobileBertMLMHead(config, name="mlm___cls")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.mlm.predictions
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.mlm.predictions
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name
|
||||
|
||||
@add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
|
||||
|
||||
import math
|
||||
import warnings
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
@@ -541,7 +542,7 @@ class TFMPNetMainLayer(tf.keras.layers.Layer):
|
||||
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
|
||||
def set_input_embeddings(self, value):
|
||||
self.embeddings.word_embeddings = value
|
||||
self.embeddings.vocab_size = value.shape[0]
|
||||
self.embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
@@ -840,6 +841,20 @@ class TFMPNetLMHead(tf.keras.layers.Layer):
|
||||
|
||||
super().build(input_shape)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.decoder
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.decoder.word_embeddings = value
|
||||
self.decoder.vocab_size = shape_list(value)[0]
|
||||
|
||||
def get_bias(self):
|
||||
return {"bias": self.bias}
|
||||
|
||||
def set_bias(self, value):
|
||||
self.bias = value["bias"]
|
||||
self.vocab_size = shape_list(value["bias"])[0]
|
||||
|
||||
def call(self, features):
|
||||
x = self.dense(features)
|
||||
x = self.act(x)
|
||||
@@ -862,13 +877,11 @@ class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss):
|
||||
self.mpnet = TFMPNetMainLayer(config, name="mpnet")
|
||||
self.lm_head = TFMPNetLMHead(config, self.mpnet.embeddings, name="lm_head")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.mpnet.embeddings
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.lm_head
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.lm_head.name
|
||||
|
||||
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
|
||||
@@ -219,7 +219,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.tokens_embed.weight = value
|
||||
self.tokens_embed.vocab_size = value.shape[0]
|
||||
self.tokens_embed.vocab_size = shape_list(value)[0]
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
"""
|
||||
@@ -577,7 +577,10 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
|
||||
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.transformer.tokens_embed
|
||||
return self.get_input_embeddings()
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.set_input_embeddings(value)
|
||||
|
||||
@add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING)
|
||||
@add_code_sample_docstrings(
|
||||
@@ -682,9 +685,6 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
|
||||
config, initializer_range=config.initializer_range, name="multiple_choice_head"
|
||||
)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.transformer.tokens_embed
|
||||
|
||||
@add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING)
|
||||
@replace_return_docstrings(output_type=TFOpenAIGPTDoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
|
||||
def call(
|
||||
@@ -841,9 +841,6 @@ class TFOpenAIGPTForSequenceClassification(TFOpenAIGPTPreTrainedModel, TFSequenc
|
||||
)
|
||||
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.transformer.tokens_embed
|
||||
|
||||
@add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING)
|
||||
@add_code_sample_docstrings(
|
||||
tokenizer_class=_TOKENIZER_FOR_DOC,
|
||||
|
||||
@@ -15,6 +15,8 @@
|
||||
# limitations under the License.
|
||||
""" TF 2.0 RoBERTa model. """
|
||||
|
||||
import warnings
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from ...activations_tf import get_tf_activation
|
||||
@@ -502,7 +504,7 @@ class TFRobertaMainLayer(tf.keras.layers.Layer):
|
||||
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
|
||||
def set_input_embeddings(self, value):
|
||||
self.embeddings.word_embeddings = value
|
||||
self.embeddings.vocab_size = value.shape[0]
|
||||
self.embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
@@ -827,6 +829,20 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
|
||||
|
||||
super().build(input_shape)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.decoder
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.decoder.word_embeddings = value
|
||||
self.decoder.vocab_size = shape_list(value)[0]
|
||||
|
||||
def get_bias(self):
|
||||
return {"bias": self.bias}
|
||||
|
||||
def set_bias(self, value):
|
||||
self.bias = value["bias"]
|
||||
self.vocab_size = shape_list(value["bias"])[0]
|
||||
|
||||
def call(self, hidden_states):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.act(hidden_states)
|
||||
@@ -849,13 +865,11 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
|
||||
self.roberta = TFRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
|
||||
self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.lm_head.decoder
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.lm_head
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.lm_head.name
|
||||
|
||||
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
|
||||
@@ -573,15 +573,6 @@ class TFT5MainLayer(tf.keras.layers.Layer):
|
||||
self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm")
|
||||
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.embed_tokens
|
||||
|
||||
def set_embed_tokens(self, embed_tokens):
|
||||
self.embed_tokens = embed_tokens
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models
|
||||
|
||||
@@ -839,6 +830,26 @@ class TFT5PreTrainedModel(TFPreTrainedModel):
|
||||
|
||||
return self.serving_output(output)
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
try:
|
||||
self.shared.weight = value
|
||||
except AttributeError:
|
||||
self(self.dummy_inputs)
|
||||
self.shared.weight = value
|
||||
|
||||
self.shared.vocab_size = shape_list(value)[0]
|
||||
# retrieve correct absolute scope for embed token wrapper
|
||||
with tf.compat.v1.variable_scope("shared") as shared_abs_scope_name:
|
||||
pass
|
||||
# Wraps layer to avoid problems with weight restoring and ensuring we're in the correct TF scope.
|
||||
embed_tokens = TFWrappedEmbeddings(self.shared, abs_scope_name=shared_abs_scope_name)
|
||||
self.encoder.embed_tokens = embed_tokens
|
||||
if hasattr(self, "decoder"):
|
||||
self.decoder.embed_tokens = embed_tokens
|
||||
|
||||
def _shift_right(self, input_ids):
|
||||
decoder_start_token_id = self.config.decoder_start_token_id
|
||||
pad_token_id = self.config.pad_token_id
|
||||
@@ -1050,20 +1061,6 @@ class TFT5Model(TFT5PreTrainedModel):
|
||||
decoder_config.is_decoder = True
|
||||
self.decoder = TFT5MainLayer(decoder_config, embed_tokens, name="decoder")
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
def set_input_embeddings(self, new_embeddings):
|
||||
self.shared.weight = new_embeddings
|
||||
self.shared.vocab_size = self.shared.weight.shape[0]
|
||||
# retrieve correct absolute scope for embed token wrapper
|
||||
with tf.compat.v1.variable_scope("shared") as shared_abs_scope_name:
|
||||
pass
|
||||
# Wraps layer to avoid problems with weight restoring and ensuring we're in the correct TF scope.
|
||||
embed_tokens = TFWrappedEmbeddings(self.shared, abs_scope_name=shared_abs_scope_name)
|
||||
self.encoder.set_embed_tokens(embed_tokens)
|
||||
self.decoder.set_embed_tokens(embed_tokens)
|
||||
|
||||
def get_encoder(self):
|
||||
return self.encoder
|
||||
|
||||
@@ -1222,24 +1219,23 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
|
||||
if not config.tie_word_embeddings:
|
||||
self.lm_head = tf.keras.layers.Dense(config.vocab_size, use_bias=False, name="lm_head")
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
def get_output_embeddings(self):
|
||||
if self.config.tie_word_embeddings:
|
||||
return self.shared
|
||||
return self.get_input_embeddings()
|
||||
else:
|
||||
return self.lm_head
|
||||
# in a dense layer the kernel has a shape (last_dim, units), for us (dim, num_tokens)
|
||||
# value has a shape (num_tokens, dim) then needs to be transposed
|
||||
return tf.transpose(self.lm_head.kernel)
|
||||
|
||||
def set_input_embeddings(self, new_embeddings):
|
||||
self.shared.weight = new_embeddings
|
||||
# retrieve correct absolute scope for embed token wrapper
|
||||
with tf.compat.v1.variable_scope("shared") as shared_abs_scope_name:
|
||||
pass
|
||||
# Wraps layer to avoid problems with weight restoring and ensuring we're in the correct TF scope.
|
||||
embed_tokens = TFWrappedEmbeddings(self.shared, abs_scope_name=shared_abs_scope_name)
|
||||
self.encoder.set_embed_tokens(embed_tokens)
|
||||
self.decoder.set_embed_tokens(embed_tokens)
|
||||
def set_output_embeddings(self, value):
|
||||
if self.config.tie_word_embeddings:
|
||||
self.set_input_embeddings(value)
|
||||
else:
|
||||
self.lm_head = tf.keras.layers.Dense(shape_list(value)[0], use_bias=False, name="lm_head")
|
||||
# in a dense layer the kernel has a shape (last_dim, units), for us (dim, num_tokens)
|
||||
# value has a shape (num_tokens, dim) then needs to be transposed
|
||||
transposed_value = tf.transpose(value)
|
||||
self.lm_head.kernel = transposed_value
|
||||
|
||||
def get_encoder(self):
|
||||
return self.encoder
|
||||
@@ -1358,9 +1354,9 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
|
||||
# T5v1.1 does not tie output word embeddings and thus does not require downscaling
|
||||
if self.config.tie_word_embeddings:
|
||||
sequence_output = sequence_output * (self.model_dim ** -0.5)
|
||||
logits = self.get_output_embeddings()(sequence_output, mode="linear")
|
||||
logits = self.shared(sequence_output, mode="linear")
|
||||
else:
|
||||
logits = self.get_output_embeddings()(sequence_output)
|
||||
logits = self.lm_head(sequence_output)
|
||||
|
||||
loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], logits)
|
||||
|
||||
@@ -1488,19 +1484,6 @@ class TFT5EncoderModel(TFT5PreTrainedModel):
|
||||
encoder_config.use_cache = False
|
||||
self.encoder = TFT5MainLayer(encoder_config, embed_tokens, name="encoder")
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
def set_input_embeddings(self, new_embeddings):
|
||||
self.shared.weight = new_embeddings
|
||||
self.shared.vocab_size = self.shared.weight.shape[0]
|
||||
# retrieve correct absolute scope for embed token wrapper
|
||||
with tf.compat.v1.variable_scope("shared") as shared_abs_scope_name:
|
||||
pass
|
||||
# Wraps layer to avoid problems with weight restoring and ensuring we're in the correct TF scope.
|
||||
embed_tokens = TFWrappedEmbeddings(self.shared, abs_scope_name=shared_abs_scope_name)
|
||||
self.encoder.set_embed_tokens(embed_tokens)
|
||||
|
||||
def get_encoder(self):
|
||||
return self.encoder
|
||||
|
||||
|
||||
@@ -468,9 +468,6 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
|
||||
def set_input_embeddings(self, value):
|
||||
raise NotImplementedError
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
return self.word_emb
|
||||
|
||||
def backward_compatible(self):
|
||||
self.sample_softmax = -1
|
||||
|
||||
@@ -909,25 +906,6 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
|
||||
)
|
||||
|
||||
|
||||
class TFTransfoXLMHead(tf.keras.layers.Layer):
|
||||
def __init__(self, config, input_embeddings, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.vocab_size = config.vocab_size
|
||||
|
||||
# The output weights are the same as the input embeddings, but there is
|
||||
# an output-only bias for each token.
|
||||
self.input_embeddings = input_embeddings
|
||||
|
||||
def build(self, input_shape):
|
||||
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
||||
super().build(input_shape)
|
||||
|
||||
def call(self, hidden_states):
|
||||
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
||||
hidden_states = hidden_states + self.bias
|
||||
return hidden_states
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive
|
||||
@@ -948,6 +926,9 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
|
||||
config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, name="crit"
|
||||
)
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_output_embeddings(self):
|
||||
"""Double-check if you are using adaptive softmax."""
|
||||
if len(self.crit.out_layers) > 0:
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
"""
|
||||
|
||||
import itertools
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple
|
||||
|
||||
@@ -330,10 +331,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.embeddings.weight = value
|
||||
self.embeddings.vocab_size = value.shape[0]
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
raise NotImplementedError
|
||||
self.embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
"""
|
||||
@@ -787,6 +785,20 @@ class TFXLMPredLayer(tf.keras.layers.Layer):
|
||||
|
||||
super().build(input_shape)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.input_embeddings
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.input_embeddings.weight = value
|
||||
self.input_embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def get_bias(self):
|
||||
return {"bias": self.bias}
|
||||
|
||||
def set_bias(self, value):
|
||||
self.bias = value["bias"]
|
||||
self.vocab_size = shape_list(value["bias"])[0]
|
||||
|
||||
def call(self, hidden_states):
|
||||
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
||||
hidden_states = hidden_states + self.bias
|
||||
@@ -807,13 +819,11 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
|
||||
self.transformer = TFXLMMainLayer(config, name="transformer")
|
||||
self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.pred_layer.input_embeddings
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.pred_layer
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.pred_layer.name
|
||||
|
||||
def prepare_inputs_for_generation(self, inputs, **kwargs):
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
TF 2.0 XLNet model.
|
||||
"""
|
||||
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
@@ -407,6 +408,20 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
|
||||
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
|
||||
super().build(input_shape)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.input_embeddings
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.input_embeddings.weight = value
|
||||
self.input_embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def get_bias(self):
|
||||
return {"bias": self.bias}
|
||||
|
||||
def set_bias(self, value):
|
||||
self.bias = value["bias"]
|
||||
self.vocab_size = shape_list(value["bias"])[0]
|
||||
|
||||
def call(self, hidden_states):
|
||||
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
||||
hidden_states = hidden_states + self.bias
|
||||
@@ -450,7 +465,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.word_embedding.weight = value
|
||||
self.word_embedding.vocab_size = value.shape[0]
|
||||
self.word_embedding.vocab_size = shape_list(value)[0]
|
||||
|
||||
def build(self, input_shape):
|
||||
initializer = get_initializer(self.initializer_range)
|
||||
@@ -458,9 +473,6 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
|
||||
shape=(1, 1, self.d_model), initializer=initializer, trainable=True, name="mask_emb"
|
||||
)
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
raise NotImplementedError
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -1227,13 +1239,11 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
|
||||
self.transformer = TFXLNetMainLayer(config, name="transformer")
|
||||
self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.lm_loss.input_embeddings
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.lm_loss
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
|
||||
return self.name + "/" + self.lm_loss.name
|
||||
|
||||
def prepare_inputs_for_generation(self, inputs, past, use_mems=None, **kwargs):
|
||||
|
||||
@@ -461,6 +461,20 @@ class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Lay
|
||||
|
||||
super().build(input_shape)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.input_embeddings.word_embeddings
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.input_embeddings.word_embeddings = value
|
||||
self.input_embeddings.vocab_size = shape_list(value)[0]
|
||||
|
||||
def get_bias(self):
|
||||
return {"bias": self.bias}
|
||||
|
||||
def set_bias(self, value):
|
||||
self.bias = value["bias"]
|
||||
self.vocab_size = shape_list(value["bias"])[0]
|
||||
|
||||
def call(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.input_embeddings(hidden_states, mode="linear")
|
||||
@@ -800,15 +814,9 @@ class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelca
|
||||
self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
|
||||
self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.{{cookiecutter.lowercase_modelname}}.embeddings
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.mlm.predictions
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name
|
||||
|
||||
@add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
tokenizer_class=_TOKENIZER_FOR_DOC,
|
||||
@@ -903,15 +911,9 @@ class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelca
|
||||
self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
|
||||
self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls")
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.{{cookiecutter.lowercase_modelname}}.embeddings
|
||||
|
||||
def get_output_layer_with_bias(self):
|
||||
def get_lm_head(self):
|
||||
return self.mlm.predictions
|
||||
|
||||
def get_prefix_bias_name(self):
|
||||
return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name
|
||||
|
||||
@add_code_sample_docstrings(
|
||||
tokenizer_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint="{{cookiecutter.checkpoint_identifier}}",
|
||||
@@ -1855,6 +1857,29 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel):
|
||||
}
|
||||
return dummy_inputs
|
||||
|
||||
def get_input_embeddings(self):
|
||||
base_model = getattr(self, self.base_model_prefix, self)
|
||||
|
||||
return base_model.shared
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
base_model = getattr(self, self.base_model_prefix, self)
|
||||
|
||||
try:
|
||||
base_model.shared.weight = value
|
||||
except AttributeError:
|
||||
self(self.dummy_inputs)
|
||||
base_model.shared.weight = value
|
||||
|
||||
base_model.shared.vocab_size = shape_list(base_model.shared.weight)[0]
|
||||
|
||||
with tf.compat.v1.variable_scope("model.shared") as shared_abs_scope_name:
|
||||
pass
|
||||
|
||||
embed_tokens = TFWrappedEmbeddings(base_model.shared, abs_scope_name=shared_abs_scope_name)
|
||||
base_model.encoder.set_embed_tokens(embed_tokens)
|
||||
base_model.decoder.set_embed_tokens(embed_tokens)
|
||||
|
||||
@tf.function(
|
||||
input_signature=[
|
||||
{
|
||||
@@ -1984,6 +2009,9 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer):
|
||||
self.layers = [TF{{cookiecutter.camelcase_modelname}}EncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
|
||||
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
|
||||
|
||||
def set_embed_tokens(self, embed_tokens):
|
||||
self.embed_tokens = embed_tokens
|
||||
|
||||
def call(
|
||||
self,
|
||||
input_ids=None,
|
||||
@@ -2124,6 +2152,9 @@ class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer):
|
||||
|
||||
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
||||
|
||||
def set_embed_tokens(self, embed_tokens):
|
||||
self.embed_tokens = embed_tokens
|
||||
|
||||
def call(
|
||||
self,
|
||||
input_ids=None,
|
||||
@@ -2331,6 +2362,9 @@ class TF{{cookiecutter.camelcase_modelname}}Model(TF{{cookiecutter.camelcase_mod
|
||||
self.encoder = TF{{cookiecutter.camelcase_modelname}}Encoder(config, embed_tokens, name="encoder")
|
||||
self.decoder = TF{{cookiecutter.camelcase_modelname}}Decoder(config, embed_tokens, name="decoder")
|
||||
|
||||
def get_encoder(self):
|
||||
return self.encoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.decoder
|
||||
|
||||
@@ -2452,15 +2486,6 @@ class TF{{cookiecutter.camelcase_modelname}}Model(TF{{cookiecutter.camelcase_mod
|
||||
encoder_attentions=enc_attns,
|
||||
)
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.shared = value
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The {{cookiecutter.uppercase_modelname}} Model with a language modeling head. Can be used for summarization.",
|
||||
@@ -2484,22 +2509,20 @@ class TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration(TF{{cookiec
|
||||
def get_decoder(self):
|
||||
return self.model.decoder
|
||||
|
||||
def resize_token_embeddings(self, new_num_tokens):
|
||||
super().resize_token_embeddings(new_num_tokens=new_num_tokens)
|
||||
def get_encoder(self):
|
||||
return self.model.encoder
|
||||
|
||||
# {{cookiecutter.uppercase_modelname}} is a special case where the bias has two dimensions
|
||||
# and not named just `bias`
|
||||
if new_num_tokens is not None:
|
||||
num_tokens_to_copy = min(shape_list(self.final_logits_bias)[0], new_num_tokens)
|
||||
init_bias = tf.zeros((new_num_tokens,))
|
||||
init_bias[:num_tokens_to_copy] = self.final_logits_bias.value()[:num_tokens_to_copy]
|
||||
self.final_logits_bias = self.add_weight(
|
||||
shape=(1, new_num_tokens),
|
||||
initializer="zeros",
|
||||
trainable=False,
|
||||
name="final_logits_bias",
|
||||
)
|
||||
self.final_logits_bias.assign(init_bias)
|
||||
def get_bias(self):
|
||||
return {"final_logits_bias": self.final_logits_bias}
|
||||
|
||||
def set_bias(self, value):
|
||||
self.final_logits_bias = value["final_logits_bias"]
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.get_input_embeddings()
|
||||
|
||||
def set_output_embeddings(self, value):
|
||||
self.set_input_embeddings(value)
|
||||
|
||||
@add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING)
|
||||
@replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
|
||||
@@ -2664,12 +2687,6 @@ class TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration(TF{{cookiec
|
||||
)
|
||||
return (past[0], reordered_past)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.model.shared
|
||||
|
||||
def get_encoder(self):
|
||||
return self.model.encoder
|
||||
|
||||
def compute_loss(self, labels, logits):
|
||||
"""CrossEntropyLoss that ignores pad tokens"""
|
||||
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
|
||||
|
||||
@@ -486,11 +486,83 @@ class TF{{cookiecutter.camelcase_modelname}}ModelTest(TFModelTesterMixin, unitte
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
|
||||
x = model.get_output_layer_with_bias()
|
||||
|
||||
if model_class in self.all_generative_model_classes:
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_bias()
|
||||
assert isinstance(name, dict)
|
||||
for k, v in name.items():
|
||||
assert isinstance(v, tf.Variable)
|
||||
else:
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None
|
||||
name = model.get_prefix_bias_name()
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
|
||||
def test_resize_token_embeddings(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
def _get_word_embedding_weight(model, embedding_layer):
|
||||
if hasattr(embedding_layer, "weight"):
|
||||
return embedding_layer.weight
|
||||
else:
|
||||
# Here we build the word embeddings weights if not exists.
|
||||
# And then we retry to get the attribute once built.
|
||||
model(model.dummy_inputs)
|
||||
if hasattr(embedding_layer, "weight"):
|
||||
return embedding_layer.weight
|
||||
else:
|
||||
return None
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
for size in [config.vocab_size - 10, config.vocab_size + 10, None]:
|
||||
# build the embeddings
|
||||
model = model_class(config=config)
|
||||
old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
|
||||
old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
|
||||
old_final_logits_bias = model.get_bias()
|
||||
|
||||
# reshape the embeddings
|
||||
model.resize_token_embeddings(size)
|
||||
new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
|
||||
new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
|
||||
new_final_logits_bias = model.get_bias()
|
||||
|
||||
# check that the resized embeddings size matches the desired size.
|
||||
assert_size = size if size is not None else config.vocab_size
|
||||
|
||||
self.assertEqual(new_input_embeddings.shape[0], assert_size)
|
||||
|
||||
# check that weights remain the same after resizing
|
||||
models_equal = True
|
||||
for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()):
|
||||
if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
|
||||
models_equal = False
|
||||
self.assertTrue(models_equal)
|
||||
|
||||
if old_output_embeddings is not None and new_output_embeddings is not None:
|
||||
self.assertEqual(new_output_embeddings.shape[0], assert_size)
|
||||
|
||||
models_equal = True
|
||||
for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()):
|
||||
if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
|
||||
models_equal = False
|
||||
self.assertTrue(models_equal)
|
||||
|
||||
if old_final_logits_bias is not None and new_final_logits_bias is not None:
|
||||
old_final_logits_bias = old_final_logits_bias["final_logits_bias"]
|
||||
new_final_logits_bias = new_final_logits_bias["final_logits_bias"]
|
||||
self.assertEqual(new_final_logits_bias.shape[0], 1)
|
||||
self.assertEqual(new_final_logits_bias.shape[1], assert_size)
|
||||
|
||||
models_equal = True
|
||||
for old, new in zip(old_final_logits_bias.value(), new_final_logits_bias.value()):
|
||||
for p1, p2 in zip(old, new):
|
||||
if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
|
||||
models_equal = False
|
||||
self.assertTrue(models_equal)
|
||||
|
||||
|
||||
def _assert_tensors_equal(a, b, atol=1e-12, prefix=""):
|
||||
"""If tensors not close, or a and b arent both tensors, raise a nice Assertion error."""
|
||||
|
||||
@@ -274,13 +274,23 @@ class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
|
||||
def test_model_common_attributes(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
list_lm_models = [TFAlbertForPreTraining, TFAlbertForMaskedLM]
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
|
||||
x = model.get_output_layer_with_bias()
|
||||
|
||||
if model_class in list_lm_models:
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_bias()
|
||||
assert isinstance(name, dict)
|
||||
for k, v in name.items():
|
||||
assert isinstance(v, tf.Variable)
|
||||
else:
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None
|
||||
name = model.get_prefix_bias_name()
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
|
||||
@slow
|
||||
|
||||
@@ -159,11 +159,83 @@ class TFBartModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
|
||||
x = model.get_output_layer_with_bias()
|
||||
|
||||
if model_class in self.all_generative_model_classes:
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_bias()
|
||||
assert isinstance(name, dict)
|
||||
for k, v in name.items():
|
||||
assert isinstance(v, tf.Variable)
|
||||
else:
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None
|
||||
name = model.get_prefix_bias_name()
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
|
||||
def test_resize_token_embeddings(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
def _get_word_embedding_weight(model, embedding_layer):
|
||||
if hasattr(embedding_layer, "weight"):
|
||||
return embedding_layer.weight
|
||||
else:
|
||||
# Here we build the word embeddings weights if not exists.
|
||||
# And then we retry to get the attribute once built.
|
||||
model(model.dummy_inputs)
|
||||
if hasattr(embedding_layer, "weight"):
|
||||
return embedding_layer.weight
|
||||
else:
|
||||
return None
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
for size in [config.vocab_size - 10, config.vocab_size + 10, None]:
|
||||
# build the embeddings
|
||||
model = model_class(config=config)
|
||||
old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
|
||||
old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
|
||||
old_final_logits_bias = model.get_bias()
|
||||
|
||||
# reshape the embeddings
|
||||
model.resize_token_embeddings(size)
|
||||
new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
|
||||
new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
|
||||
new_final_logits_bias = model.get_bias()
|
||||
|
||||
# check that the resized embeddings size matches the desired size.
|
||||
assert_size = size if size is not None else config.vocab_size
|
||||
|
||||
self.assertEqual(new_input_embeddings.shape[0], assert_size)
|
||||
|
||||
# check that weights remain the same after resizing
|
||||
models_equal = True
|
||||
for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()):
|
||||
if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
|
||||
models_equal = False
|
||||
self.assertTrue(models_equal)
|
||||
|
||||
if old_output_embeddings is not None and new_output_embeddings is not None:
|
||||
self.assertEqual(new_output_embeddings.shape[0], assert_size)
|
||||
|
||||
models_equal = True
|
||||
for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()):
|
||||
if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
|
||||
models_equal = False
|
||||
self.assertTrue(models_equal)
|
||||
|
||||
if old_final_logits_bias is not None and new_final_logits_bias is not None:
|
||||
old_final_logits_bias = old_final_logits_bias["final_logits_bias"]
|
||||
new_final_logits_bias = new_final_logits_bias["final_logits_bias"]
|
||||
self.assertEqual(new_final_logits_bias.shape[0], 1)
|
||||
self.assertEqual(new_final_logits_bias.shape[1], assert_size)
|
||||
|
||||
models_equal = True
|
||||
for old, new in zip(old_final_logits_bias.value(), new_final_logits_bias.value()):
|
||||
for p1, p2 in zip(old, new):
|
||||
if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
|
||||
models_equal = False
|
||||
self.assertTrue(models_equal)
|
||||
|
||||
def test_saved_model_creation(self):
|
||||
# This test is too long (>30sec) and makes fail the CI
|
||||
pass
|
||||
|
||||
@@ -340,15 +340,17 @@ class TFBertModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
|
||||
|
||||
if model_class in list_lm_models:
|
||||
x = model.get_output_layer_with_bias()
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_prefix_bias_name()
|
||||
assert isinstance(name, str)
|
||||
name = model.get_bias()
|
||||
assert isinstance(name, dict)
|
||||
for k, v in name.items():
|
||||
assert isinstance(v, tf.Variable)
|
||||
else:
|
||||
x = model.get_output_layer_with_bias()
|
||||
assert x is None
|
||||
name = model.get_prefix_bias_name()
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
|
||||
def test_custom_load_tf_weights(self):
|
||||
model, output_loading_info = TFBertForTokenClassification.from_pretrained(
|
||||
|
||||
@@ -57,29 +57,93 @@ class TFBlenderbotModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
# inputs_embeds not supported
|
||||
pass
|
||||
|
||||
def test_saved_model_with_hidden_states_output(self):
|
||||
# Should be uncommented during patrick TF refactor
|
||||
pass
|
||||
|
||||
def test_saved_model_with_attentions_output(self):
|
||||
# Should be uncommented during patrick TF refactor
|
||||
pass
|
||||
|
||||
def test_model_common_attributes(self):
    # The input embeddings must be a Keras layer for every model class.
    # Generative classes must additionally expose an output-embedding layer and a
    # dict of ``tf.Variable`` biases; every other class must return ``None`` for both.
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_model_classes:
        model = model_class(config)
        assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)

        # The former ``get_output_layer_with_bias()`` / ``get_prefix_bias_name()``
        # calls were dropped: their results were overwritten before being read.
        if model_class in self.all_generative_model_classes:
            output_embeddings = model.get_output_embeddings()
            assert isinstance(output_embeddings, tf.keras.layers.Layer)
            bias = model.get_bias()
            assert isinstance(bias, dict)
            for variable in bias.values():
                assert isinstance(variable, tf.Variable)
        else:
            output_embeddings = model.get_output_embeddings()
            assert output_embeddings is None
            bias = model.get_bias()
            assert bias is None
|
||||
|
||||
def test_saved_model_creation(self):
|
||||
# This test is too long (>30sec) and makes fail the CI
|
||||
pass
|
||||
|
||||
def test_resize_token_embeddings(self):
    # Resize the vocabulary (10 fewer tokens, 10 more tokens, and ``None``) on a fresh
    # model each time, then verify that the input embeddings, the output embeddings and
    # the ``final_logits_bias`` entry of ``get_bias()`` have the expected size and still
    # hold the old values at every position shared by the old and new tensors.
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    def _get_word_embedding_weight(model, embedding_layer):
        # If the layer has no ``weight`` yet, call the model once on its dummy inputs
        # and look again; fall back to ``None`` when the attribute is still missing.
        if not hasattr(embedding_layer, "weight"):
            model(model.dummy_inputs)
        return getattr(embedding_layer, "weight", None)

    def _all_pairs_equal(old_items, new_items):
        # ``zip`` stops at the shorter operand, so only positions present on both
        # sides are compared; a pair differs when its summed absolute difference is > 0.
        return not any(
            tf.math.reduce_sum(tf.math.abs(old_item - new_item)) > 0
            for old_item, new_item in zip(old_items, new_items)
        )

    for model_class in self.all_model_classes:
        # The size list is rebuilt from ``config.vocab_size`` for every model class.
        for size in [config.vocab_size - 10, config.vocab_size + 10, None]:
            # Snapshot the weights of a freshly built model.
            model = model_class(config=config)
            input_before = _get_word_embedding_weight(model, model.get_input_embeddings())
            output_before = _get_word_embedding_weight(model, model.get_output_embeddings())
            bias_before = model.get_bias()

            # Resize, then fetch the same weights again.
            model.resize_token_embeddings(size)
            input_after = _get_word_embedding_weight(model, model.get_input_embeddings())
            output_after = _get_word_embedding_weight(model, model.get_output_embeddings())
            bias_after = model.get_bias()

            # With ``size=None`` the expected size is ``config.vocab_size``.
            expected_size = config.vocab_size if size is None else size

            self.assertEqual(input_after.shape[0], expected_size)
            self.assertTrue(_all_pairs_equal(input_before.value(), input_after.value()))

            if output_before is not None and output_after is not None:
                self.assertEqual(output_after.shape[0], expected_size)
                self.assertTrue(_all_pairs_equal(output_before.value(), output_after.value()))

            if bias_before is not None and bias_after is not None:
                bias_before = bias_before["final_logits_bias"]
                bias_after = bias_after["final_logits_bias"]
                self.assertEqual(bias_after.shape[0], 1)
                self.assertEqual(bias_after.shape[1], expected_size)
                self.assertTrue(
                    all(
                        _all_pairs_equal(old_row, new_row)
                        for old_row, new_row in zip(bias_before.value(), bias_after.value())
                    )
                )
|
||||
|
||||
|
||||
@is_pt_tf_cross_test
|
||||
@require_tokenizers
|
||||
|
||||
@@ -41,7 +41,6 @@ if is_tf_available():
|
||||
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
|
||||
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
|
||||
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
|
||||
TFAdaptiveEmbedding,
|
||||
TFSharedEmbeddings,
|
||||
tf_top_k_top_p_filtering,
|
||||
)
|
||||
@@ -671,18 +670,20 @@ class TFModelTesterMixin:
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
assert isinstance(model.get_input_embeddings(), (tf.keras.layers.Layer, TFAdaptiveEmbedding))
|
||||
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
|
||||
|
||||
if model_class in list_lm_models:
|
||||
x = model.get_output_layer_with_bias()
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_prefix_bias_name()
|
||||
assert isinstance(name, str)
|
||||
name = model.get_bias()
|
||||
assert isinstance(name, dict)
|
||||
for k, v in name.items():
|
||||
assert isinstance(v, tf.Variable)
|
||||
else:
|
||||
x = model.get_output_layer_with_bias()
|
||||
assert x is None
|
||||
name = model.get_prefix_bias_name()
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
|
||||
def test_determinism(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
@@ -830,23 +831,68 @@ class TFModelTesterMixin:
|
||||
if not self.test_resize_embeddings:
|
||||
return
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
INPUT_SHAPE = [1, 10, config.hidden_size]
|
||||
|
||||
def _get_word_embedding_weight(model, embedding_layer):
|
||||
if hasattr(embedding_layer, "word_embeddings"):
|
||||
return embedding_layer.word_embeddings
|
||||
elif hasattr(embedding_layer, "weight"):
|
||||
return embedding_layer.weight
|
||||
elif hasattr(embedding_layer, "decoder"):
|
||||
return embedding_layer.decoder
|
||||
else:
|
||||
# Here we build the word embeddings weights if not exists.
|
||||
# And then we retry to get the attribute once built.
|
||||
model(model.dummy_inputs)
|
||||
if hasattr(embedding_layer, "word_embeddings"):
|
||||
return embedding_layer.word_embeddings
|
||||
elif hasattr(embedding_layer, "weight"):
|
||||
return embedding_layer.weight
|
||||
elif hasattr(embedding_layer, "decoder"):
|
||||
return embedding_layer.decoder
|
||||
else:
|
||||
return None
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
for size in [config.vocab_size - 10, config.vocab_size + 10, None]:
|
||||
# build the embeddings
|
||||
model = model_class(config=config)
|
||||
emb_old = model.get_input_embeddings()
|
||||
emb_old.build(INPUT_SHAPE)
|
||||
old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
|
||||
old_bias = model.get_bias()
|
||||
old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
|
||||
# reshape the embeddings
|
||||
new_embeddings = model._get_resized_embeddings(emb_old, size)
|
||||
# # check that the resized embeddings size matches the desired size.
|
||||
model.resize_token_embeddings(size)
|
||||
new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
|
||||
new_bias = model.get_bias()
|
||||
new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
|
||||
|
||||
# check that the resized embeddings size matches the desired size.
|
||||
assert_size = size if size is not None else config.vocab_size
|
||||
self.assertEqual(new_embeddings.shape[0], assert_size)
|
||||
self.assertEqual(new_input_embeddings.shape[0], assert_size)
|
||||
|
||||
# check that weights remain the same after resizing
|
||||
emd_old_weights = model._get_word_embeddings(emb_old)
|
||||
models_equal = True
|
||||
for p1, p2 in zip(emd_old_weights.numpy(), new_embeddings.numpy()):
|
||||
if np.sum(abs(p1 - p2)) > 0:
|
||||
for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()):
|
||||
if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
|
||||
models_equal = False
|
||||
self.assertTrue(models_equal)
|
||||
|
||||
if old_bias is not None and new_bias is not None:
|
||||
for old_weight, new_weight in zip(old_bias.values(), new_bias.values()):
|
||||
self.assertEqual(new_weight.shape[0], assert_size)
|
||||
|
||||
models_equal = True
|
||||
for p1, p2 in zip(old_weight.value(), new_weight.value()):
|
||||
if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
|
||||
models_equal = False
|
||||
self.assertTrue(models_equal)
|
||||
|
||||
if old_output_embeddings is not None and new_output_embeddings is not None:
|
||||
self.assertEqual(new_output_embeddings.shape[0], assert_size)
|
||||
self.assertEqual(new_output_embeddings.shape[1], old_output_embeddings.shape[1])
|
||||
|
||||
models_equal = True
|
||||
for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()):
|
||||
if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
|
||||
models_equal = False
|
||||
self.assertTrue(models_equal)
|
||||
|
||||
|
||||
@@ -193,6 +193,33 @@ class TFCTRLModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_ctrl_for_sequence_classification(*config_and_inputs)
|
||||
|
||||
def test_model_common_attributes(self):
    # Every model class must expose a Keras layer as input embeddings.
    # - LM-head class: output embeddings are a Keras layer, bias is a dict of tf.Variable.
    # - Sequence-classification class: output embeddings are a Keras layer, bias is None.
    # - Any other class: both output embeddings and bias are None.
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    list_lm_models = [TFCTRLLMHeadModel]
    list_other_models_with_output_ebd = [TFCTRLForSequenceClassification]

    for model_class in self.all_model_classes:
        model = model_class(config)
        assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)

        expects_bias = model_class in list_lm_models
        expects_output_layer = expects_bias or model_class in list_other_models_with_output_ebd

        output_layer = model.get_output_embeddings()
        if expects_output_layer:
            assert isinstance(output_layer, tf.keras.layers.Layer)
        else:
            assert output_layer is None

        bias = model.get_bias()
        if expects_bias:
            assert isinstance(bias, dict)
            assert all(isinstance(variable, tf.Variable) for variable in bias.values())
        else:
            assert bias is None
|
||||
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
for model_name in TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
|
||||
|
||||
@@ -370,9 +370,16 @@ class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
|
||||
x = model.get_output_layer_with_bias()
|
||||
|
||||
if model_class in self.all_generative_model_classes:
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
else:
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None
|
||||
name = model.get_prefix_bias_name()
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
|
||||
def test_gpt2_sequence_classification_model(self):
|
||||
|
||||
@@ -199,11 +199,83 @@ class TFLEDModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
|
||||
x = model.get_output_layer_with_bias()
|
||||
|
||||
if model_class in self.all_generative_model_classes:
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_bias()
|
||||
assert isinstance(name, dict)
|
||||
for k, v in name.items():
|
||||
assert isinstance(v, tf.Variable)
|
||||
else:
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None
|
||||
name = model.get_prefix_bias_name()
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
|
||||
def test_resize_token_embeddings(self):
    # Resize the vocabulary (10 fewer tokens, 10 more tokens, and ``None``) on a fresh
    # model each time, then verify that the input embeddings, the output embeddings and
    # the ``final_logits_bias`` entry of ``get_bias()`` have the expected size and still
    # hold the old values at every position shared by the old and new tensors.
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    def _get_word_embedding_weight(model, embedding_layer):
        # If the layer has no ``weight`` yet, call the model once on its dummy inputs
        # and look again; fall back to ``None`` when the attribute is still missing.
        if not hasattr(embedding_layer, "weight"):
            model(model.dummy_inputs)
        return getattr(embedding_layer, "weight", None)

    def _all_pairs_equal(old_items, new_items):
        # ``zip`` stops at the shorter operand, so only positions present on both
        # sides are compared; a pair differs when its summed absolute difference is > 0.
        return not any(
            tf.math.reduce_sum(tf.math.abs(old_item - new_item)) > 0
            for old_item, new_item in zip(old_items, new_items)
        )

    for model_class in self.all_model_classes:
        # The size list is rebuilt from ``config.vocab_size`` for every model class.
        for size in [config.vocab_size - 10, config.vocab_size + 10, None]:
            # Snapshot the weights of a freshly built model.
            model = model_class(config=config)
            input_before = _get_word_embedding_weight(model, model.get_input_embeddings())
            output_before = _get_word_embedding_weight(model, model.get_output_embeddings())
            bias_before = model.get_bias()

            # Resize, then fetch the same weights again.
            model.resize_token_embeddings(size)
            input_after = _get_word_embedding_weight(model, model.get_input_embeddings())
            output_after = _get_word_embedding_weight(model, model.get_output_embeddings())
            bias_after = model.get_bias()

            # With ``size=None`` the expected size is ``config.vocab_size``.
            expected_size = config.vocab_size if size is None else size

            self.assertEqual(input_after.shape[0], expected_size)
            self.assertTrue(_all_pairs_equal(input_before.value(), input_after.value()))

            if output_before is not None and output_after is not None:
                self.assertEqual(output_after.shape[0], expected_size)
                self.assertTrue(_all_pairs_equal(output_before.value(), output_after.value()))

            if bias_before is not None and bias_after is not None:
                bias_before = bias_before["final_logits_bias"]
                bias_after = bias_after["final_logits_bias"]
                self.assertEqual(bias_after.shape[0], 1)
                self.assertEqual(bias_after.shape[1], expected_size)
                self.assertTrue(
                    all(
                        _all_pairs_equal(old_row, new_row)
                        for old_row, new_row in zip(bias_before.value(), bias_after.value())
                    )
                )
|
||||
|
||||
def test_attention_outputs(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
inputs_dict["global_attention_mask"] = tf.zeros_like(inputs_dict["attention_mask"])
|
||||
|
||||
@@ -687,15 +687,17 @@ class TFLxmertModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
|
||||
|
||||
if model_class in list_lm_models:
|
||||
x = model.get_output_layer_with_bias()
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_prefix_bias_name()
|
||||
assert isinstance(name, str)
|
||||
name = model.get_bias()
|
||||
assert isinstance(name, dict)
|
||||
for k, v in name.items():
|
||||
assert isinstance(v, tf.Variable)
|
||||
else:
|
||||
x = model.get_output_layer_with_bias()
|
||||
assert x is None
|
||||
name = model.get_prefix_bias_name()
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
|
||||
def test_saved_model_creation(self):
|
||||
# This test is too long (>30sec) and makes fail the CI
|
||||
|
||||
@@ -38,7 +38,7 @@ class ModelTester(TFBartModelTester):
|
||||
|
||||
|
||||
@require_tf
|
||||
class TestTFMarianCommon(TFModelTesterMixin, unittest.TestCase):
|
||||
class TFMarianMTModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (TFMarianMTModel,) if is_tf_available() else ()
|
||||
all_generative_model_classes = (TFMarianMTModel,) if is_tf_available() else ()
|
||||
model_tester_cls = ModelTester
|
||||
@@ -56,13 +56,6 @@ class TestTFMarianCommon(TFModelTesterMixin, unittest.TestCase):
|
||||
# inputs_embeds not supported
|
||||
pass
|
||||
|
||||
def test_saved_model_with_hidden_states_output(self):
|
||||
# Should be uncommented during patrick TF refactor
|
||||
pass
|
||||
|
||||
def test_saved_model_with_attentions_output(self):
|
||||
pass
|
||||
|
||||
def test_compile_tf_model(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -100,15 +93,87 @@ class TestTFMarianCommon(TFModelTesterMixin, unittest.TestCase):
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
|
||||
x = model.get_output_layer_with_bias()
|
||||
|
||||
if model_class in self.all_generative_model_classes:
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_bias()
|
||||
assert isinstance(name, dict)
|
||||
for k, v in name.items():
|
||||
assert isinstance(v, tf.Variable)
|
||||
else:
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None
|
||||
name = model.get_prefix_bias_name()
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
|
||||
def test_saved_model_creation(self):
|
||||
# This test is too long (>30sec) and makes fail the CI
|
||||
pass
|
||||
|
||||
def test_resize_token_embeddings(self):
    # Resize the vocabulary (10 fewer tokens, 10 more tokens, and ``None``) on a fresh
    # model each time, then verify that the input embeddings, the output embeddings and
    # the ``final_logits_bias`` entry of ``get_bias()`` have the expected size and still
    # hold the old values at every position shared by the old and new tensors.
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    def _get_word_embedding_weight(model, embedding_layer):
        # If the layer has no ``weight`` yet, call the model once on its dummy inputs
        # and look again; fall back to ``None`` when the attribute is still missing.
        if not hasattr(embedding_layer, "weight"):
            model(model.dummy_inputs)
        return getattr(embedding_layer, "weight", None)

    def _all_pairs_equal(old_items, new_items):
        # ``zip`` stops at the shorter operand, so only positions present on both
        # sides are compared; a pair differs when its summed absolute difference is > 0.
        return not any(
            tf.math.reduce_sum(tf.math.abs(old_item - new_item)) > 0
            for old_item, new_item in zip(old_items, new_items)
        )

    for model_class in self.all_model_classes:
        # The size list is rebuilt from ``config.vocab_size`` for every model class.
        for size in [config.vocab_size - 10, config.vocab_size + 10, None]:
            # Snapshot the weights of a freshly built model.
            model = model_class(config=config)
            input_before = _get_word_embedding_weight(model, model.get_input_embeddings())
            output_before = _get_word_embedding_weight(model, model.get_output_embeddings())
            bias_before = model.get_bias()

            # Resize, then fetch the same weights again.
            model.resize_token_embeddings(size)
            input_after = _get_word_embedding_weight(model, model.get_input_embeddings())
            output_after = _get_word_embedding_weight(model, model.get_output_embeddings())
            bias_after = model.get_bias()

            # With ``size=None`` the expected size is ``config.vocab_size``.
            expected_size = config.vocab_size if size is None else size

            self.assertEqual(input_after.shape[0], expected_size)
            self.assertTrue(_all_pairs_equal(input_before.value(), input_after.value()))

            if output_before is not None and output_after is not None:
                self.assertEqual(output_after.shape[0], expected_size)
                self.assertTrue(_all_pairs_equal(output_before.value(), output_after.value()))

            if bias_before is not None and bias_after is not None:
                bias_before = bias_before["final_logits_bias"]
                bias_after = bias_after["final_logits_bias"]
                self.assertEqual(bias_after.shape[0], 1)
                self.assertEqual(bias_after.shape[1], expected_size)
                self.assertTrue(
                    all(
                        _all_pairs_equal(old_row, new_row)
                        for old_row, new_row in zip(bias_before.value(), bias_after.value())
                    )
                )
|
||||
|
||||
|
||||
class AbstractMarianIntegrationTest(unittest.TestCase):
|
||||
maxDiff = 1000 # show more chars for failing integration tests
|
||||
|
||||
@@ -36,7 +36,7 @@ class ModelTester(TFBartModelTester):
|
||||
|
||||
|
||||
@require_tf
|
||||
class TestTFMBartCommon(TFModelTesterMixin, unittest.TestCase):
|
||||
class TFMBartModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (TFMBartForConditionalGeneration,) if is_tf_available() else ()
|
||||
all_generative_model_classes = (TFMBartForConditionalGeneration,) if is_tf_available() else ()
|
||||
model_tester_cls = ModelTester
|
||||
@@ -54,14 +54,6 @@ class TestTFMBartCommon(TFModelTesterMixin, unittest.TestCase):
|
||||
# inputs_embeds not supported
|
||||
pass
|
||||
|
||||
def test_saved_model_with_hidden_states_output(self):
|
||||
# Should be uncommented during patrick TF refactor
|
||||
pass
|
||||
|
||||
def test_saved_model_with_attentions_output(self):
|
||||
# Should be uncommented during patrick TF refactor
|
||||
pass
|
||||
|
||||
def test_compile_tf_model(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -99,15 +91,87 @@ class TestTFMBartCommon(TFModelTesterMixin, unittest.TestCase):
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
|
||||
x = model.get_output_layer_with_bias()
|
||||
|
||||
if model_class in self.all_generative_model_classes:
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_bias()
|
||||
assert isinstance(name, dict)
|
||||
for k, v in name.items():
|
||||
assert isinstance(v, tf.Variable)
|
||||
else:
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None
|
||||
name = model.get_prefix_bias_name()
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
|
||||
def test_saved_model_creation(self):
|
||||
# This test is too long (>30sec) and makes fail the CI
|
||||
pass
|
||||
|
||||
def test_resize_token_embeddings(self):
    # Resize the vocabulary (10 fewer tokens, 10 more tokens, and ``None``) on a fresh
    # model each time, then verify that the input embeddings, the output embeddings and
    # the ``final_logits_bias`` entry of ``get_bias()`` have the expected size and still
    # hold the old values at every position shared by the old and new tensors.
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    def _get_word_embedding_weight(model, embedding_layer):
        # If the layer has no ``weight`` yet, call the model once on its dummy inputs
        # and look again; fall back to ``None`` when the attribute is still missing.
        if not hasattr(embedding_layer, "weight"):
            model(model.dummy_inputs)
        return getattr(embedding_layer, "weight", None)

    def _all_pairs_equal(old_items, new_items):
        # ``zip`` stops at the shorter operand, so only positions present on both
        # sides are compared; a pair differs when its summed absolute difference is > 0.
        return not any(
            tf.math.reduce_sum(tf.math.abs(old_item - new_item)) > 0
            for old_item, new_item in zip(old_items, new_items)
        )

    for model_class in self.all_model_classes:
        # The size list is rebuilt from ``config.vocab_size`` for every model class.
        for size in [config.vocab_size - 10, config.vocab_size + 10, None]:
            # Snapshot the weights of a freshly built model.
            model = model_class(config=config)
            input_before = _get_word_embedding_weight(model, model.get_input_embeddings())
            output_before = _get_word_embedding_weight(model, model.get_output_embeddings())
            bias_before = model.get_bias()

            # Resize, then fetch the same weights again.
            model.resize_token_embeddings(size)
            input_after = _get_word_embedding_weight(model, model.get_input_embeddings())
            output_after = _get_word_embedding_weight(model, model.get_output_embeddings())
            bias_after = model.get_bias()

            # With ``size=None`` the expected size is ``config.vocab_size``.
            expected_size = config.vocab_size if size is None else size

            self.assertEqual(input_after.shape[0], expected_size)
            self.assertTrue(_all_pairs_equal(input_before.value(), input_after.value()))

            if output_before is not None and output_after is not None:
                self.assertEqual(output_after.shape[0], expected_size)
                self.assertTrue(_all_pairs_equal(output_before.value(), output_after.value()))

            if bias_before is not None and bias_after is not None:
                bias_before = bias_before["final_logits_bias"]
                bias_after = bias_after["final_logits_bias"]
                self.assertEqual(bias_after.shape[0], 1)
                self.assertEqual(bias_after.shape[1], expected_size)
                self.assertTrue(
                    all(
                        _all_pairs_equal(old_row, new_row)
                        for old_row, new_row in zip(bias_before.value(), bias_after.value())
                    )
                )
|
||||
|
||||
|
||||
@is_pt_tf_cross_test
|
||||
@require_sentencepiece
|
||||
|
||||
@@ -292,15 +292,17 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
|
||||
|
||||
if model_class in list_lm_models:
|
||||
x = model.get_output_layer_with_bias()
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_prefix_bias_name()
|
||||
assert isinstance(name, str)
|
||||
name = model.get_bias()
|
||||
assert isinstance(name, dict)
|
||||
for k, v in name.items():
|
||||
assert isinstance(v, tf.Variable)
|
||||
else:
|
||||
x = model.get_output_layer_with_bias()
|
||||
assert x is None
|
||||
name = model.get_prefix_bias_name()
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
|
||||
def test_saved_model_creation(self):
|
||||
# This test is too long (>30sec) and makes fail the CI
|
||||
|
||||
@@ -228,9 +228,16 @@ class TFOpenAIGPTModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
|
||||
x = model.get_output_layer_with_bias()
|
||||
|
||||
if model_class in self.all_generative_model_classes:
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
else:
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None
|
||||
name = model.get_prefix_bias_name()
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
|
||||
def test_openai_gpt_sequence_classification_model(self):
|
||||
|
||||
@@ -41,7 +41,7 @@ class ModelTester(TFBartModelTester):
|
||||
|
||||
|
||||
@require_tf
|
||||
class TestTFPegasusCommon(TFModelTesterMixin, unittest.TestCase):
|
||||
class TFPegasusModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (TFPegasusForConditionalGeneration,) if is_tf_available() else ()
|
||||
all_generative_model_classes = (TFPegasusForConditionalGeneration,) if is_tf_available() else ()
|
||||
model_tester_cls = ModelTester
|
||||
@@ -59,14 +59,6 @@ class TestTFPegasusCommon(TFModelTesterMixin, unittest.TestCase):
|
||||
# inputs_embeds not supported
|
||||
pass
|
||||
|
||||
def test_saved_model_with_hidden_states_output(self):
|
||||
# Should be uncommented during patrick TF refactor
|
||||
pass
|
||||
|
||||
def test_saved_model_with_attentions_output(self):
|
||||
# Should be uncommented during patrick TF refactor
|
||||
pass
|
||||
|
||||
def test_compile_tf_model(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -104,15 +96,87 @@ class TestTFPegasusCommon(TFModelTesterMixin, unittest.TestCase):
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
|
||||
x = model.get_output_layer_with_bias()
|
||||
|
||||
if model_class in self.all_generative_model_classes:
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_bias()
|
||||
assert isinstance(name, dict)
|
||||
for k, v in name.items():
|
||||
assert isinstance(v, tf.Variable)
|
||||
else:
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None
|
||||
name = model.get_prefix_bias_name()
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
|
||||
def test_saved_model_creation(self):
|
||||
# This test is too long (>30sec) and makes fail the CI
|
||||
pass
|
||||
|
||||
def test_resize_token_embeddings(self):
    """Check that resizing the token embeddings yields the requested size and keeps the shared weights.

    For every class in ``self.all_model_classes`` a fresh model is built and resized three times over:
    to ``config.vocab_size - 10``, to ``config.vocab_size + 10`` and with ``None``. After each resize the
    test asserts that

    - the first dimension of the input embedding weight equals the requested size
      (``config.vocab_size`` when the requested size is ``None``);
    - the same holds for the output embedding weight, when one is available before and after;
    - the ``"final_logits_bias"`` entry of ``model.get_bias()``, when a bias is available before and
      after, has shape ``(1, size)``;
    - the slices present both before and after the resize are element-wise identical.
    """
    config, _ = self.model_tester.prepare_config_and_inputs_for_common()

    def _get_word_embedding_weight(model, embedding_layer):
        # If the layer exposes no ``weight`` attribute yet, run the model once on its
        # dummy inputs so the weights get built, then look the attribute up again.
        if not hasattr(embedding_layer, "weight"):
            model(model.dummy_inputs)
        return getattr(embedding_layer, "weight", None)

    def _slices_match(old_slices, new_slices):
        # zip() stops at the shorter operand, so only the slices that exist on both
        # sides of the resize are compared.
        return not any(
            tf.math.reduce_sum(tf.math.abs(before - after)) > 0 for before, after in zip(old_slices, new_slices)
        )

    for model_class in self.all_model_classes:
        for size in [config.vocab_size - 10, config.vocab_size + 10, None]:
            # Build the model and capture its state before resizing.
            model = model_class(config=config)
            input_before = _get_word_embedding_weight(model, model.get_input_embeddings())
            output_before = _get_word_embedding_weight(model, model.get_output_embeddings())
            bias_before = model.get_bias()

            # Resize, then capture the same state again.
            model.resize_token_embeddings(size)
            input_after = _get_word_embedding_weight(model, model.get_input_embeddings())
            output_after = _get_word_embedding_weight(model, model.get_output_embeddings())
            bias_after = model.get_bias()

            # ``None`` means "no explicit size": the config's vocabulary size is expected.
            expected_size = config.vocab_size if size is None else size

            self.assertEqual(input_after.shape[0], expected_size)
            self.assertTrue(_slices_match(input_before.value(), input_after.value()))

            if not (output_before is None or output_after is None):
                self.assertEqual(output_after.shape[0], expected_size)
                self.assertTrue(_slices_match(output_before.value(), output_after.value()))

            if not (bias_before is None or bias_after is None):
                bias_before = bias_before["final_logits_bias"]
                bias_after = bias_after["final_logits_bias"]
                self.assertEqual(bias_after.shape[0], 1)
                self.assertEqual(bias_after.shape[1], expected_size)

                # The bias is two-dimensional: walk the first axis, then compare entry by entry.
                self.assertTrue(
                    all(
                        _slices_match(old_row, new_row)
                        for old_row, new_row in zip(bias_before.value(), bias_after.value())
                    )
                )
|
||||
|
||||
|
||||
@is_pt_tf_cross_test
|
||||
@require_sentencepiece
|
||||
|
||||
@@ -289,9 +289,16 @@ class TFT5ModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
|
||||
x = model.get_output_layer_with_bias()
|
||||
|
||||
if model_class in self.all_generative_model_classes:
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
else:
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None
|
||||
name = model.get_prefix_bias_name()
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
|
||||
def test_saved_model_creation(self):
|
||||
|
||||
@@ -187,13 +187,20 @@ class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
|
||||
def test_model_common_attributes(self):
    """Check the embedding and bias accessors of every class in ``self.all_model_classes``.

    Each model must expose its input embeddings as a ``tf.keras.layers.Layer``. Classes listed in
    ``list_other_models_with_output_ebd`` must also return a Keras layer from
    ``get_output_embeddings()``; every other class must return ``None``. In both cases
    ``get_bias()`` must return ``None``.
    """
    config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    list_other_models_with_output_ebd = [TFTransfoXLForSequenceClassification]

    for model_class in self.all_model_classes:
        model = model_class(config)
        assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)

        # The former `get_output_layer_with_bias()` / `get_prefix_bias_name()` calls were
        # dropped: their results were overwritten before ever being read.
        output_embeddings = model.get_output_embeddings()
        if model_class in list_other_models_with_output_ebd:
            assert isinstance(output_embeddings, tf.keras.layers.Layer)
        else:
            assert output_embeddings is None

        # Same expectation for both kinds of model class.
        assert model.get_bias() is None
|
||||
|
||||
@slow
|
||||
|
||||
Reference in New Issue
Block a user