skip finetuned checkpoints
This commit is contained in:
@@ -113,6 +113,11 @@ if _tf_available:
|
|||||||
load_gpt2_pt_weights_in_tf2,
|
load_gpt2_pt_weights_in_tf2,
|
||||||
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_transfo_xl import (TFTransfoXLPreTrainedModel, TFTransfoXLMainLayer,
|
||||||
|
TFTransfoXLModel, TFTransfoXLLMHeadModel,
|
||||||
|
load_transfo_xl_pt_weights_in_tf2,
|
||||||
|
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
|
from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
|
||||||
TFXLNetModel, TFXLNetLMHeadModel,
|
TFXLNetModel, TFXLNetLMHeadModel,
|
||||||
TFXLNetForSequenceClassification,
|
TFXLNetForSequenceClassification,
|
||||||
|
|||||||
@@ -27,7 +27,8 @@ from pytorch_transformers import is_torch_available, cached_path
|
|||||||
from pytorch_transformers import (BertConfig, TFBertForPreTraining, load_bert_pt_weights_in_tf2,
|
from pytorch_transformers import (BertConfig, TFBertForPreTraining, load_bert_pt_weights_in_tf2,
|
||||||
GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2,
|
GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2,
|
||||||
XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2,
|
XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2,
|
||||||
XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2,)
|
XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2,
|
||||||
|
TransfoXLConfig, TFTransfoXLLMHeadModel, load_transfo_xl_pt_weights_in_tf2,)
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
@@ -35,12 +36,15 @@ if is_torch_available():
|
|||||||
from pytorch_transformers import (BertForPreTraining, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
from pytorch_transformers import (BertForPreTraining, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,)
|
XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,)
|
||||||
else:
|
else:
|
||||||
(BertForPreTraining, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
(BertForPreTraining, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,) = (
|
XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,) = (
|
||||||
|
None, None, None,
|
||||||
None, None, None,
|
None, None, None,
|
||||||
None, None, None,
|
None, None, None,
|
||||||
None, None, None,
|
None, None, None,
|
||||||
@@ -55,6 +59,7 @@ MODEL_CLASSES = {
|
|||||||
'gpt2': (GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
'gpt2': (GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
'xlnet': (XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
'xlnet': (XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
'xlm': (XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
'xlm': (XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
'transfo-xl': (TransfoXLConfig, TFTransfoXLLMHeadModel, load_transfo_xl_pt_weights_in_tf2, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
}
|
}
|
||||||
|
|
||||||
def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False):
|
def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False):
|
||||||
@@ -118,6 +123,9 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, compare_with
|
|||||||
print("-" * 100)
|
print("-" * 100)
|
||||||
print(" Converting checkpoint {}/{}: {}".format(i, len(aws_config_map), shortcut_name))
|
print(" Converting checkpoint {}/{}: {}".format(i, len(aws_config_map), shortcut_name))
|
||||||
print("-" * 100)
|
print("-" * 100)
|
||||||
|
if 'finetuned' in shortcut_name:
|
||||||
|
print(" Skipping fintenued checkpoint ")
|
||||||
|
continue
|
||||||
config_file = cached_path(aws_config_map[shortcut_name], force_download=True)
|
config_file = cached_path(aws_config_map[shortcut_name], force_download=True)
|
||||||
model_file = cached_path(aws_model_maps[shortcut_name], force_download=True)
|
model_file = cached_path(aws_model_maps[shortcut_name], force_download=True)
|
||||||
|
|
||||||
|
|||||||
@@ -13,8 +13,7 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
""" Utilities for PyTorch Transformer XL model.
|
""" A TF 2.0 Adaptive Softmax for Transformer XL model.
|
||||||
Directly adapted from https://github.com/kimiyoung/transformer-xl.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
@@ -174,106 +173,3 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
|||||||
self.add_metric(loss, name=self.name, aggregation='mean' if return_mean else '')
|
self.add_metric(loss, name=self.name, aggregation='mean' if return_mean else '')
|
||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
def mul_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs,
|
|
||||||
params, tie_projs,
|
|
||||||
initializer=None, proj_initializer=None,
|
|
||||||
div_val=1, perms=None, proj_same_dim=True,
|
|
||||||
scope='adaptive_softmax',
|
|
||||||
**kwargs):
|
|
||||||
def _logit(x, W, b, proj):
|
|
||||||
y = x
|
|
||||||
if x.shape.ndims == 3:
|
|
||||||
if proj is not None:
|
|
||||||
y = tf.einsum('ibd,ed->ibe', y, proj)
|
|
||||||
return tf.einsum('ibd,nd->ibn', y, W) + b
|
|
||||||
else:
|
|
||||||
if proj is not None:
|
|
||||||
y = tf.einsum('id,ed->ie', y, proj)
|
|
||||||
return tf.einsum('id,nd->in', y, W) + b
|
|
||||||
|
|
||||||
params_W, params_projs = params[0], params[1]
|
|
||||||
|
|
||||||
with tf.variable_scope(scope):
|
|
||||||
if len(cutoffs) == 0:
|
|
||||||
softmax_b = tf.get_variable('bias', [n_token],
|
|
||||||
initializer=tf.zeros_initializer())
|
|
||||||
output = _logit(hidden, params_W, softmax_b, params_projs)
|
|
||||||
nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target,
|
|
||||||
logits=output)
|
|
||||||
nll = tf.reduce_mean(nll)
|
|
||||||
else:
|
|
||||||
total_loss, total_cnt = 0, 0
|
|
||||||
cutoff_ends = [0] + cutoffs + [n_token]
|
|
||||||
for i in range(len(cutoff_ends) - 1):
|
|
||||||
with tf.variable_scope('cutoff_{}'.format(i)):
|
|
||||||
l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
|
|
||||||
|
|
||||||
cur_d_embed = d_embed // (div_val ** i)
|
|
||||||
|
|
||||||
if div_val == 1:
|
|
||||||
cur_W = params_W[l_idx: r_idx]
|
|
||||||
else:
|
|
||||||
cur_W = params_W[i]
|
|
||||||
cur_b = tf.get_variable('b', [r_idx - l_idx],
|
|
||||||
initializer=tf.zeros_initializer())
|
|
||||||
if tie_projs[i]:
|
|
||||||
if div_val == 1:
|
|
||||||
cur_proj = params_projs
|
|
||||||
else:
|
|
||||||
cur_proj = params_projs[i]
|
|
||||||
else:
|
|
||||||
if (div_val == 1 or not proj_same_dim) and d_proj == cur_d_embed:
|
|
||||||
cur_proj = None
|
|
||||||
else:
|
|
||||||
cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj],
|
|
||||||
initializer=proj_initializer)
|
|
||||||
|
|
||||||
if i == 0:
|
|
||||||
cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed],
|
|
||||||
initializer=tf.zeros_initializer())
|
|
||||||
cluster_b = tf.get_variable('cluster_b', [len(cutoffs)],
|
|
||||||
initializer=tf.zeros_initializer())
|
|
||||||
cur_W = tf.concat([cur_W, cluster_W], 0)
|
|
||||||
cur_b = tf.concat([cur_b, cluster_b], 0)
|
|
||||||
|
|
||||||
head_logit = _logit(hidden, cur_W, cur_b, cur_proj)
|
|
||||||
|
|
||||||
head_target = kwargs.get("head_target")
|
|
||||||
head_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
|
|
||||||
labels=head_target,
|
|
||||||
logits=head_logit)
|
|
||||||
|
|
||||||
masked_loss = head_nll * perms[i]
|
|
||||||
total_loss += tf.reduce_sum(masked_loss)
|
|
||||||
total_cnt += tf.reduce_sum(perms[i])
|
|
||||||
|
|
||||||
# head_logprob = tf.nn.log_softmax(head_logit)
|
|
||||||
|
|
||||||
# final_logprob = head_logprob * perms[i][:, :, None]
|
|
||||||
# final_target = tf.one_hot(target, tf.shape(head_logprob)[2])
|
|
||||||
# total_loss -= tf.einsum('ibn,ibn->', final_logprob, final_target)
|
|
||||||
# total_cnt += tf.reduce_sum(perms[i])
|
|
||||||
else:
|
|
||||||
cur_head_nll = tf.einsum('ib,ibk->k', head_nll, perms[i])
|
|
||||||
|
|
||||||
cur_hidden = tf.einsum('ibd,ibk->kd', hidden, perms[i])
|
|
||||||
tail_logit = _logit(cur_hidden, cur_W, cur_b, cur_proj)
|
|
||||||
|
|
||||||
tail_target = tf.einsum('ib,ibk->k', tf.to_float(target - l_idx),
|
|
||||||
perms[i])
|
|
||||||
tail_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
|
|
||||||
labels=tf.to_int32(tail_target),
|
|
||||||
logits=tail_logit)
|
|
||||||
|
|
||||||
sum_nll = cur_head_nll + tail_nll
|
|
||||||
mask = tf.reduce_sum(perms[i], [0, 1])
|
|
||||||
|
|
||||||
masked_loss = sum_nll * mask
|
|
||||||
total_loss += tf.reduce_sum(masked_loss)
|
|
||||||
total_cnt += tf.reduce_sum(mask)
|
|
||||||
|
|
||||||
nll = total_loss / total_cnt
|
|
||||||
|
|
||||||
return nll
|
|
||||||
Reference in New Issue
Block a user