Merge branch 'xlnet' into doc-sphinx
This commit is contained in:
@@ -36,7 +36,7 @@ from .modeling_xlm import (XLMConfig, XLMModel,
|
||||
from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
|
||||
PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
|
||||
|
||||
from .optimization import BertAdam
|
||||
from .optimization_openai import OpenAIAdam
|
||||
from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
|
||||
WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
|
||||
|
||||
from .file_utils import (PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
|
||||
|
||||
@@ -73,17 +73,17 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
except ImportError:
|
||||
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
|
||||
logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
|
||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||
raise
|
||||
tf_path = os.path.abspath(tf_checkpoint_path)
|
||||
print("Converting TensorFlow checkpoint from {}".format(tf_path))
|
||||
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
|
||||
# Load weights from TF model
|
||||
init_vars = tf.train.list_variables(tf_path)
|
||||
names = []
|
||||
arrays = []
|
||||
for name, shape in init_vars:
|
||||
print("Loading TF weight {} with shape {}".format(name, shape))
|
||||
logger.info("Loading TF weight {} with shape {}".format(name, shape))
|
||||
array = tf.train.load_variable(tf_path, name)
|
||||
names.append(name)
|
||||
arrays.append(array)
|
||||
@@ -93,7 +93,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
|
||||
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
|
||||
# which are not required for using pretrained model
|
||||
if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
|
||||
print("Skipping {}".format("/".join(name)))
|
||||
logger.info("Skipping {}".format("/".join(name)))
|
||||
continue
|
||||
pointer = model
|
||||
for m_name in name:
|
||||
@@ -113,7 +113,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
|
||||
try:
|
||||
pointer = getattr(pointer, l[0])
|
||||
except AttributeError:
|
||||
print("Skipping {}".format("/".join(name)))
|
||||
logger.info("Skipping {}".format("/".join(name)))
|
||||
continue
|
||||
if len(l) >= 2:
|
||||
num = int(l[1])
|
||||
@@ -127,7 +127,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
|
||||
except AssertionError as e:
|
||||
e.args += (pointer.shape, array.shape)
|
||||
raise
|
||||
print("Initialize PyTorch weight {}".format(name))
|
||||
logger.info("Initialize PyTorch weight {}".format(name))
|
||||
pointer.data = torch.from_numpy(array)
|
||||
return model
|
||||
|
||||
|
||||
@@ -49,17 +49,17 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
except ImportError:
|
||||
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
|
||||
logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
|
||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||
raise
|
||||
tf_path = os.path.abspath(gpt2_checkpoint_path)
|
||||
print("Converting TensorFlow checkpoint from {}".format(tf_path))
|
||||
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
|
||||
# Load weights from TF model
|
||||
init_vars = tf.train.list_variables(tf_path)
|
||||
names = []
|
||||
arrays = []
|
||||
for name, shape in init_vars:
|
||||
print("Loading TF weight {} with shape {}".format(name, shape))
|
||||
logger.info("Loading TF weight {} with shape {}".format(name, shape))
|
||||
array = tf.train.load_variable(tf_path, name)
|
||||
names.append(name)
|
||||
arrays.append(array.squeeze())
|
||||
@@ -90,7 +90,7 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
|
||||
except AssertionError as e:
|
||||
e.args += (pointer.shape, array.shape)
|
||||
raise
|
||||
print("Initialize PyTorch weight {}".format(name))
|
||||
logger.info("Initialize PyTorch weight {}".format(name))
|
||||
pointer.data = torch.from_numpy(array)
|
||||
return model
|
||||
|
||||
|
||||
@@ -110,7 +110,7 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
|
||||
except AssertionError as e:
|
||||
e.args += (pointer.shape, array.shape)
|
||||
raise
|
||||
print("Initialize PyTorch weight {}".format(name))
|
||||
logger.info("Initialize PyTorch weight {}".format(name))
|
||||
pointer.data = torch.from_numpy(array)
|
||||
return model
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
except ImportError:
|
||||
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
|
||||
logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
|
||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||
raise
|
||||
# Build TF to PyTorch weights loading map
|
||||
@@ -136,7 +136,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
|
||||
init_vars = tf.train.list_variables(tf_path)
|
||||
tf_weights = {}
|
||||
for name, shape in init_vars:
|
||||
print("Loading TF weight {} with shape {}".format(name, shape))
|
||||
logger.info("Loading TF weight {} with shape {}".format(name, shape))
|
||||
array = tf.train.load_variable(tf_path, name)
|
||||
tf_weights[name] = array
|
||||
|
||||
@@ -157,7 +157,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
|
||||
except AssertionError as e:
|
||||
e.args += (p_i.shape, arr_i.shape)
|
||||
raise
|
||||
print("Initialize PyTorch weight {} for layer {}".format(name, i))
|
||||
logger.info("Initialize PyTorch weight {} for layer {}".format(name, i))
|
||||
p_i.data = torch.from_numpy(arr_i)
|
||||
else:
|
||||
try:
|
||||
@@ -165,13 +165,13 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
|
||||
except AssertionError as e:
|
||||
e.args += (pointer.shape, array.shape)
|
||||
raise
|
||||
print("Initialize PyTorch weight {}".format(name))
|
||||
logger.info("Initialize PyTorch weight {}".format(name))
|
||||
pointer.data = torch.from_numpy(array)
|
||||
tf_weights.pop(name, None)
|
||||
tf_weights.pop(name + '/Adam', None)
|
||||
tf_weights.pop(name + '/Adam_1', None)
|
||||
|
||||
print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
|
||||
logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
|
||||
return model
|
||||
|
||||
|
||||
|
||||
@@ -272,7 +272,6 @@ class LogUniformSampler(object):
|
||||
self.range_max = range_max
|
||||
log_indices = torch.arange(1., range_max+2., 1.).log_()
|
||||
self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
|
||||
# print('P', self.dist.numpy().tolist()[-30:])
|
||||
|
||||
self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()
|
||||
|
||||
@@ -331,72 +330,3 @@ def sample_logits(embedding, bias, labels, inputs, sampler):
|
||||
logits = torch.cat([true_logits[:, :, None], sample_logits], -1)
|
||||
|
||||
return logits
|
||||
|
||||
|
||||
# class LogUniformSampler(object):
|
||||
# def __init__(self, range_max, unique=False):
|
||||
# """
|
||||
# Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
|
||||
# `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
|
||||
# """
|
||||
# self.range_max = range_max
|
||||
# log_indices = torch.arange(1., range_max+2., 1.).log_()
|
||||
# self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
|
||||
|
||||
# self.unique = unique
|
||||
|
||||
# if self.unique:
|
||||
# self.exclude_mask = torch.ByteTensor(range_max).fill_(0)
|
||||
|
||||
# def sample(self, n_sample, labels):
|
||||
# pos_sample, new_labels = labels.unique(return_inverse=True)
|
||||
# n_pos_sample = pos_sample.size(0)
|
||||
# n_neg_sample = n_sample - n_pos_sample
|
||||
|
||||
# if self.unique:
|
||||
# self.exclude_mask.index_fill_(0, pos_sample, 1)
|
||||
# sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0)
|
||||
# self.exclude_mask.index_fill_(0, pos_sample, 0)
|
||||
# else:
|
||||
# sample_dist = self.dist
|
||||
|
||||
# neg_sample = torch.multinomial(sample_dist, n_neg_sample)
|
||||
|
||||
# sample = torch.cat([pos_sample, neg_sample])
|
||||
# sample_prob = self.dist[sample]
|
||||
|
||||
# return new_labels, sample, sample_prob
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
S, B = 3, 4
|
||||
n_vocab = 10000
|
||||
n_sample = 5
|
||||
H = 32
|
||||
|
||||
labels = torch.LongTensor(S, B).random_(0, n_vocab)
|
||||
|
||||
# sampler = LogUniformSampler(n_vocab, unique=False)
|
||||
# new_labels, sample, sample_prob = sampler.sample(n_sample, labels)
|
||||
|
||||
sampler = LogUniformSampler(n_vocab, n_sample)#, unique=True)
|
||||
# true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels)
|
||||
|
||||
# print('true_probs', true_probs.numpy().tolist())
|
||||
# print('samp_probs', samp_probs.numpy().tolist())
|
||||
# print('neg_samples', neg_samples.numpy().tolist())
|
||||
|
||||
# print('sum', torch.sum(sampler.dist).item())
|
||||
|
||||
# assert torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item()
|
||||
|
||||
embedding = nn.Embedding(n_vocab, H)
|
||||
bias = torch.zeros(n_vocab)
|
||||
inputs = torch.Tensor(S, B, H).normal_()
|
||||
|
||||
logits, out_labels = sample_logits(embedding, bias, labels, inputs, sampler, n_sample)
|
||||
print('logits', logits.detach().numpy().tolist())
|
||||
print('logits shape', logits.size())
|
||||
print('out_labels', out_labels.detach().numpy().tolist())
|
||||
print('out_labels shape', out_labels.size())
|
||||
|
||||
|
||||
@@ -57,16 +57,18 @@ class PretrainedConfig(object):
|
||||
pretrained_model_name_or_path: either:
|
||||
- a str with the name of a pre-trained model to load selected in the list of:
|
||||
. `xlnet-large-cased`
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `config.json` a configuration file for the model
|
||||
- a path or url to a directory containing a configuration file `config.json` for the model,
|
||||
- a path or url to a configuration file for the model.
|
||||
cache_dir: an optional path to a folder in which the pre-trained model configuration will be cached.
|
||||
"""
|
||||
cache_dir = kwargs.pop('cache_dir', None)
|
||||
|
||||
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
|
||||
config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
|
||||
else:
|
||||
elif os.path.isdir(pretrained_model_name_or_path):
|
||||
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
|
||||
else:
|
||||
config_file = pretrained_model_name_or_path
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
|
||||
@@ -102,7 +104,7 @@ class PretrainedConfig(object):
|
||||
for key in to_remove:
|
||||
kwargs.pop(key, None)
|
||||
|
||||
logger.info("Model config {}".format(config))
|
||||
logger.info("Model config %s", config)
|
||||
return config
|
||||
|
||||
@classmethod
|
||||
@@ -200,6 +202,7 @@ class PreTrainedModel(nn.Module):
|
||||
- a path or url to a tensorflow pretrained model checkpoint containing:
|
||||
. `config.json` a configuration file for the model
|
||||
. `model.chkpt` a TensorFlow checkpoint
|
||||
config: an optional configuration for the model
|
||||
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
|
||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
||||
state_dict: an optional state dictionnary (collections.OrderedDict object) to use
|
||||
@@ -207,23 +210,31 @@ class PreTrainedModel(nn.Module):
|
||||
*inputs, **kwargs: additional input for the specific XLNet class
|
||||
(ex: num_labels for XLNetForSequenceClassification)
|
||||
"""
|
||||
config = kwargs.pop('config', None)
|
||||
state_dict = kwargs.pop('state_dict', None)
|
||||
cache_dir = kwargs.pop('cache_dir', None)
|
||||
from_tf = kwargs.pop('from_tf', False)
|
||||
output_loading_info = kwargs.pop('output_loading_info', False)
|
||||
|
||||
# Load config
|
||||
config = cls.config_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
if config is None:
|
||||
config = cls.config_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
|
||||
# Load model
|
||||
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
|
||||
archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
|
||||
else:
|
||||
elif os.path.isdir(pretrained_model_name_or_path):
|
||||
if from_tf:
|
||||
# Directly load from a TensorFlow checkpoint
|
||||
archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
|
||||
else:
|
||||
archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
|
||||
else:
|
||||
if from_tf:
|
||||
# Directly load from a TensorFlow checkpoint
|
||||
archive_file = pretrained_model_name_or_path + ".index"
|
||||
else:
|
||||
archive_file = pretrained_model_name_or_path
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
|
||||
|
||||
@@ -122,14 +122,14 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
except ImportError:
|
||||
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
|
||||
logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
|
||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||
raise
|
||||
# Load weights from TF model
|
||||
init_vars = tf.train.list_variables(tf_path)
|
||||
tf_weights = {}
|
||||
for name, shape in init_vars:
|
||||
print("Loading TF weight {} with shape {}".format(name, shape))
|
||||
logger.info("Loading TF weight {} with shape {}".format(name, shape))
|
||||
array = tf.train.load_variable(tf_path, name)
|
||||
tf_weights[name] = array
|
||||
|
||||
@@ -137,15 +137,15 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
|
||||
tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights)
|
||||
|
||||
for name, pointer in tf_to_pt_map.items():
|
||||
print("Importing {}".format(name))
|
||||
logger.info("Importing {}".format(name))
|
||||
if name not in tf_weights:
|
||||
print("{} not in tf pre-trained weights, skipping".format(name))
|
||||
logger.info("{} not in tf pre-trained weights, skipping".format(name))
|
||||
continue
|
||||
array = tf_weights[name]
|
||||
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
|
||||
# which are not required for using pretrained model
|
||||
if 'kernel' in name and ('ff' in name or 'summary' in name or 'logit' in name):
|
||||
print("Transposing")
|
||||
logger.info("Transposing")
|
||||
array = np.transpose(array)
|
||||
if isinstance(pointer, list):
|
||||
# Here we will split the TF weigths
|
||||
@@ -157,7 +157,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
|
||||
except AssertionError as e:
|
||||
e.args += (p_i.shape, arr_i.shape)
|
||||
raise
|
||||
print("Initialize PyTorch weight {} for layer {}".format(name, i))
|
||||
logger.info("Initialize PyTorch weight {} for layer {}".format(name, i))
|
||||
p_i.data = torch.from_numpy(arr_i)
|
||||
else:
|
||||
try:
|
||||
@@ -165,13 +165,13 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
|
||||
except AssertionError as e:
|
||||
e.args += (pointer.shape, array.shape)
|
||||
raise
|
||||
print("Initialize PyTorch weight {}".format(name))
|
||||
logger.info("Initialize PyTorch weight {}".format(name))
|
||||
pointer.data = torch.from_numpy(array)
|
||||
tf_weights.pop(name, None)
|
||||
tf_weights.pop(name + '/Adam', None)
|
||||
tf_weights.pop(name + '/Adam_1', None)
|
||||
|
||||
print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
|
||||
logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
|
||||
return model
|
||||
|
||||
|
||||
@@ -252,10 +252,6 @@ class XLNetConfig(PretrainedConfig):
|
||||
layer_norm_eps=1e-12,
|
||||
|
||||
dropout=0.1,
|
||||
dropatt=0.1,
|
||||
init="normal",
|
||||
init_range=0.1,
|
||||
init_std=0.02,
|
||||
mem_len=None,
|
||||
reuse_len=None,
|
||||
bi_data=False,
|
||||
@@ -297,11 +293,7 @@ class XLNetConfig(PretrainedConfig):
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
|
||||
self.init = init
|
||||
self.init_range = init_range
|
||||
self.init_std = init_std
|
||||
self.dropout = dropout
|
||||
self.dropatt = dropatt
|
||||
self.mem_len = mem_len
|
||||
self.reuse_len = reuse_len
|
||||
self.bi_data = bi_data
|
||||
@@ -393,7 +385,7 @@ class XLNetRelativeAttention(nn.Module):
|
||||
x = x[1:, ...]
|
||||
x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3])
|
||||
# x = x[:, 0:klen, :, :]
|
||||
x = torch.index_select(x, 1, torch.arange(klen))
|
||||
x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long))
|
||||
|
||||
return x
|
||||
|
||||
|
||||
@@ -14,174 +14,92 @@
|
||||
# limitations under the License.
|
||||
"""PyTorch optimization for BERT model."""
|
||||
|
||||
import logging
|
||||
import math
|
||||
|
||||
import torch
|
||||
from torch.optim import Optimizer
|
||||
from torch.optim.optimizer import required
|
||||
from torch.nn.utils import clip_grad_norm_
|
||||
import logging
|
||||
import abc
|
||||
import sys
|
||||
from torch.optim.lr_scheduler import LambdaLR
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class ConstantLRSchedule(LambdaLR):
|
||||
def __init__(self, optimizer, last_epoch=-1):
|
||||
super(ConstantLRSchedule, self).__init__(optimizer, lambda x: x, last_epoch=last_epoch)
|
||||
|
||||
if sys.version_info >= (3, 4):
|
||||
ABC = abc.ABC
|
||||
else:
|
||||
ABC = abc.ABCMeta('ABC', (), {})
|
||||
|
||||
|
||||
class _LRSchedule(ABC):
|
||||
""" Parent of all LRSchedules here. """
|
||||
warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense
|
||||
def __init__(self, warmup=0.002, t_total=-1, **kw):
|
||||
"""
|
||||
:param warmup: what fraction of t_total steps will be used for linear warmup
|
||||
:param t_total: how many training steps (updates) are planned
|
||||
:param kw:
|
||||
"""
|
||||
super(_LRSchedule, self).__init__(**kw)
|
||||
if t_total < 0:
|
||||
logger.warning("t_total value of {} results in schedule not being applied".format(t_total))
|
||||
if not 0.0 <= warmup < 1.0 and not warmup == -1:
|
||||
raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
|
||||
warmup = max(warmup, 0.)
|
||||
self.warmup, self.t_total = float(warmup), float(t_total)
|
||||
self.warned_for_t_total_at_progress = -1
|
||||
|
||||
def get_lr(self, step, nowarn=False):
|
||||
"""
|
||||
:param step: which of t_total steps we're on
|
||||
:param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps
|
||||
:return: learning rate multiplier for current update
|
||||
"""
|
||||
if self.t_total < 0:
|
||||
return 1.
|
||||
progress = float(step) / self.t_total
|
||||
ret = self.get_lr_(progress)
|
||||
# warning for exceeding t_total (only active with warmup_linear
|
||||
if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:
|
||||
logger.warning(
|
||||
"Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly."
|
||||
.format(ret, self.__class__.__name__))
|
||||
self.warned_for_t_total_at_progress = progress
|
||||
# end warning
|
||||
return ret
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_lr_(self, progress):
|
||||
"""
|
||||
:param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress
|
||||
:return: learning rate multiplier for current update
|
||||
"""
|
||||
return 1.
|
||||
|
||||
|
||||
class ConstantLR(_LRSchedule):
|
||||
def get_lr_(self, progress):
|
||||
return 1.
|
||||
|
||||
|
||||
class WarmupCosineSchedule(_LRSchedule):
|
||||
class WarmupCosineSchedule(LambdaLR):
|
||||
"""
|
||||
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
|
||||
Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
|
||||
Linearly increases learning rate from 0 to 1 over `warmup` training steps.
|
||||
Decreases learning rate from 1. to 0. over remaining `t_total - warmup` steps following a cosine curve.
|
||||
If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
|
||||
:param warmup: see LRSchedule
|
||||
:param t_total: see LRSchedule
|
||||
:param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
|
||||
:param kw:
|
||||
"""
|
||||
warn_t_total = True
|
||||
def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
|
||||
"""
|
||||
:param warmup: see LRSchedule
|
||||
:param t_total: see LRSchedule
|
||||
:param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
|
||||
:param kw:
|
||||
"""
|
||||
super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw)
|
||||
self.cycles = cycles
|
||||
def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
|
||||
|
||||
def get_lr_(self, progress):
|
||||
if progress < self.warmup:
|
||||
return progress / self.warmup
|
||||
else:
|
||||
progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
|
||||
return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
|
||||
def lr_lambda(step):
|
||||
if step < warmup_steps:
|
||||
return step / max(1, warmup_steps)
|
||||
else:
|
||||
progress = (step - warmup_steps) / max(1, t_total - warmup_steps) # progress after warmup
|
||||
return 0.5 * (1. + math.cos(math.pi * cycles * 2 * progress))
|
||||
|
||||
super(WarmupCosineSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
|
||||
|
||||
class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
|
||||
class WarmupCosineWithHardRestartsSchedule(LambdaLR):
|
||||
"""
|
||||
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
|
||||
If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
|
||||
learning rate (with hard restarts).
|
||||
"""
|
||||
def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
|
||||
super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
|
||||
assert(cycles >= 1.)
|
||||
def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
|
||||
|
||||
def get_lr_(self, progress):
|
||||
if progress < self.warmup:
|
||||
return progress / self.warmup
|
||||
else:
|
||||
progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
|
||||
ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1)))
|
||||
return ret
|
||||
def lr_lambda(step):
|
||||
if step < warmup_steps:
|
||||
return step / max(1, warmup_steps)
|
||||
else:
|
||||
progress = (step - warmup_steps) / max(1, t_total - warmup_steps) # progress after warmup
|
||||
ret = 0.5 * (1. + math.cos(math.pi * ((cycles * progress) % 1)))
|
||||
return ret
|
||||
|
||||
super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
|
||||
|
||||
|
||||
class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):
|
||||
"""
|
||||
All training progress is divided in `cycles` (default=1.) parts of equal length.
|
||||
Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
|
||||
followed by a learning rate decreasing from 1. to 0. following a cosine curve.
|
||||
"""
|
||||
def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
|
||||
assert(warmup * cycles < 1.)
|
||||
warmup = warmup * cycles if warmup >= 0 else warmup
|
||||
super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
|
||||
|
||||
def get_lr_(self, progress):
|
||||
progress = progress * self.cycles % 1.
|
||||
if progress < self.warmup:
|
||||
return progress / self.warmup
|
||||
else:
|
||||
progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
|
||||
ret = 0.5 * (1. + math.cos(math.pi * progress))
|
||||
return ret
|
||||
|
||||
|
||||
class WarmupConstantSchedule(_LRSchedule):
|
||||
class WarmupConstantSchedule(LambdaLR):
|
||||
"""
|
||||
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
|
||||
Keeps learning rate equal to 1. after warmup.
|
||||
"""
|
||||
def get_lr_(self, progress):
|
||||
if progress < self.warmup:
|
||||
return progress / self.warmup
|
||||
return 1.
|
||||
def __init__(self, optimizer, warmup_steps, last_epoch=-1):
|
||||
|
||||
def lr_lambda(step):
|
||||
if step < warmup_steps:
|
||||
return step / warmup_steps
|
||||
return 1.
|
||||
|
||||
super(WarmupConstantSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
|
||||
|
||||
|
||||
class WarmupLinearSchedule(_LRSchedule):
|
||||
class WarmupLinearSchedule(LambdaLR):
|
||||
"""
|
||||
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
|
||||
Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
|
||||
"""
|
||||
warn_t_total = True
|
||||
def get_lr_(self, progress):
|
||||
if progress < self.warmup:
|
||||
return progress / self.warmup
|
||||
return max((progress - 1.) / (self.warmup - 1.), 0.)
|
||||
def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
|
||||
|
||||
def lr_lambda(step):
|
||||
if step < warmup_steps:
|
||||
return step / max(1, warmup_steps)
|
||||
return (t_total - step) / max(1, t_total - warmup_steps)
|
||||
|
||||
super(WarmupLinearSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
|
||||
|
||||
|
||||
SCHEDULES = {
|
||||
None: ConstantLR,
|
||||
"none": ConstantLR,
|
||||
"warmup_cosine": WarmupCosineSchedule,
|
||||
"warmup_constant": WarmupConstantSchedule,
|
||||
"warmup_linear": WarmupLinearSchedule
|
||||
}
|
||||
|
||||
|
||||
class BertAdam(Optimizer):
|
||||
"""Implements BERT version of Adam algorithm with weight decay fix.
|
||||
class AdamW(Optimizer):
|
||||
""" Implements Adam algorithm with weight decay fix.
|
||||
|
||||
Parameters:
|
||||
lr: learning rate
|
||||
@@ -197,43 +115,20 @@ class BertAdam(Optimizer):
|
||||
e: Adams epsilon. Default: 1e-6
|
||||
weight_decay: Weight decay. Default: 0.01
|
||||
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
|
||||
correct_bias: can be set to False to avoid correcting bias in Adam (e.g. like in Bert repository)
|
||||
"""
|
||||
def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
|
||||
b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs):
|
||||
if lr is not required and lr < 0.0:
|
||||
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.01, correct_bias=True):
|
||||
if lr < 0.0:
|
||||
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
|
||||
if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
|
||||
raise ValueError("Invalid schedule parameter: {}".format(schedule))
|
||||
if not 0.0 <= b1 < 1.0:
|
||||
raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
|
||||
if not 0.0 <= b2 < 1.0:
|
||||
raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
|
||||
if not e >= 0.0:
|
||||
if not 0.0 <= betas[0] < 1.0:
|
||||
raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
|
||||
if not 0.0 <= betas[1] < 1.0:
|
||||
raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1] ))
|
||||
if not 0.0 <= eps:
|
||||
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
|
||||
# initialize schedule object
|
||||
if not isinstance(schedule, _LRSchedule):
|
||||
schedule_type = SCHEDULES[schedule]
|
||||
schedule = schedule_type(warmup=warmup, t_total=t_total)
|
||||
else:
|
||||
if warmup != -1 or t_total != -1:
|
||||
logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
|
||||
"Please specify custom warmup and t_total in _LRSchedule object.")
|
||||
defaults = dict(lr=lr, schedule=schedule,
|
||||
b1=b1, b2=b2, e=e, weight_decay=weight_decay,
|
||||
max_grad_norm=max_grad_norm)
|
||||
super(BertAdam, self).__init__(params, defaults)
|
||||
|
||||
def get_lr(self):
|
||||
lr = []
|
||||
for group in self.param_groups:
|
||||
for p in group['params']:
|
||||
state = self.state[p]
|
||||
if len(state) == 0:
|
||||
return [0]
|
||||
lr_scheduled = group['lr']
|
||||
lr_scheduled *= group['schedule'].get_lr(state['step'])
|
||||
lr.append(lr_scheduled)
|
||||
return lr
|
||||
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
|
||||
correct_bias=correct_bias)
|
||||
super(AdamW, self).__init__(params, defaults)
|
||||
|
||||
def step(self, closure=None):
|
||||
"""Performs a single optimization step.
|
||||
@@ -260,22 +155,28 @@ class BertAdam(Optimizer):
|
||||
if len(state) == 0:
|
||||
state['step'] = 0
|
||||
# Exponential moving average of gradient values
|
||||
state['next_m'] = torch.zeros_like(p.data)
|
||||
state['exp_avg'] = torch.zeros_like(p.data)
|
||||
# Exponential moving average of squared gradient values
|
||||
state['next_v'] = torch.zeros_like(p.data)
|
||||
state['exp_avg_sq'] = torch.zeros_like(p.data)
|
||||
|
||||
next_m, next_v = state['next_m'], state['next_v']
|
||||
beta1, beta2 = group['b1'], group['b2']
|
||||
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
|
||||
beta1, beta2 = group['betas']
|
||||
|
||||
# Add grad clipping
|
||||
if group['max_grad_norm'] > 0:
|
||||
clip_grad_norm_(p, group['max_grad_norm'])
|
||||
state['step'] += 1
|
||||
|
||||
# Decay the first and second moment running average coefficient
|
||||
# In-place operations to update the averages at the same time
|
||||
next_m.mul_(beta1).add_(1 - beta1, grad)
|
||||
next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
|
||||
update = next_m / (next_v.sqrt() + group['e'])
|
||||
exp_avg.mul_(beta1).add_(1 - beta1, grad)
|
||||
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
|
||||
denom = exp_avg_sq.sqrt().add_(group['eps'])
|
||||
|
||||
step_size = group['lr']
|
||||
if group['correct_bias']: # No bias correction for Bert
|
||||
bias_correction1 = 1 - beta1 ** state['step']
|
||||
bias_correction2 = 1 - beta2 ** state['step']
|
||||
step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
|
||||
|
||||
p.data.addcdiv_(-step_size, exp_avg, denom)
|
||||
|
||||
# Just adding the square of the weights to the loss function is *not*
|
||||
# the correct way of using L2 regularization/weight decay with Adam,
|
||||
@@ -284,20 +185,8 @@ class BertAdam(Optimizer):
|
||||
# Instead we want to decay the weights in a manner that doesn't interact
|
||||
# with the m/v parameters. This is equivalent to adding the square
|
||||
# of the weights to the loss with plain (non-momentum) SGD.
|
||||
if group['weight_decay'] > 0.0:
|
||||
update += group['weight_decay'] * p.data
|
||||
|
||||
lr_scheduled = group['lr']
|
||||
lr_scheduled *= group['schedule'].get_lr(state['step'])
|
||||
|
||||
update_with_lr = lr_scheduled * update
|
||||
p.data.add_(-update_with_lr)
|
||||
|
||||
state['step'] += 1
|
||||
|
||||
# step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
|
||||
# No bias correction
|
||||
# bias_correction1 = 1 - beta1 ** state['step']
|
||||
# bias_correction2 = 1 - beta2 ** state['step']
|
||||
# Add weight decay at the end (fixed version)
|
||||
if group['weight_decay'] > 0:
|
||||
p.data.add_(-group['lr'] * group['weight_decay'], p.data)
|
||||
|
||||
return loss
|
||||
|
||||
@@ -1,127 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""PyTorch optimization for OpenAI GPT model."""
|
||||
|
||||
import math
|
||||
import torch
|
||||
from torch.optim import Optimizer
|
||||
from torch.optim.optimizer import required
|
||||
from torch.nn.utils import clip_grad_norm_
|
||||
import logging
|
||||
from .optimization import SCHEDULES, _LRSchedule, WarmupCosineWithWarmupRestartsSchedule, \
|
||||
WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule, WarmupLinearSchedule, WarmupConstantSchedule
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OpenAIAdam(Optimizer):
|
||||
"""Implements Open AI version of Adam algorithm with weight decay fix.
|
||||
"""
|
||||
def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1,
|
||||
b1=0.9, b2=0.999, e=1e-8, weight_decay=0,
|
||||
vector_l2=False, max_grad_norm=-1, **kwargs):
|
||||
if lr is not required and lr < 0.0:
|
||||
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
|
||||
if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
|
||||
raise ValueError("Invalid schedule parameter: {}".format(schedule))
|
||||
if not 0.0 <= b1 < 1.0:
|
||||
raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
|
||||
if not 0.0 <= b2 < 1.0:
|
||||
raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
|
||||
if not e >= 0.0:
|
||||
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
|
||||
# initialize schedule object
|
||||
if not isinstance(schedule, _LRSchedule):
|
||||
schedule_type = SCHEDULES[schedule]
|
||||
schedule = schedule_type(warmup=warmup, t_total=t_total)
|
||||
else:
|
||||
if warmup != -1 or t_total != -1:
|
||||
logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
|
||||
"Please specify custom warmup and t_total in _LRSchedule object.")
|
||||
defaults = dict(lr=lr, schedule=schedule,
|
||||
b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2,
|
||||
max_grad_norm=max_grad_norm)
|
||||
super(OpenAIAdam, self).__init__(params, defaults)
|
||||
|
||||
def get_lr(self):
|
||||
lr = []
|
||||
for group in self.param_groups:
|
||||
for p in group['params']:
|
||||
state = self.state[p]
|
||||
if len(state) == 0:
|
||||
return [0]
|
||||
lr_scheduled = group['lr']
|
||||
lr_scheduled *= group['schedule'].get_lr(state['step'])
|
||||
lr.append(lr_scheduled)
|
||||
return lr
|
||||
|
||||
def step(self, closure=None):
|
||||
"""Performs a single optimization step.
|
||||
|
||||
Arguments:
|
||||
closure (callable, optional): A closure that reevaluates the model
|
||||
and returns the loss.
|
||||
"""
|
||||
loss = None
|
||||
if closure is not None:
|
||||
loss = closure()
|
||||
|
||||
for group in self.param_groups:
|
||||
for p in group['params']:
|
||||
if p.grad is None:
|
||||
continue
|
||||
grad = p.grad.data
|
||||
if grad.is_sparse:
|
||||
raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
|
||||
|
||||
state = self.state[p]
|
||||
|
||||
# State initialization
|
||||
if len(state) == 0:
|
||||
state['step'] = 0
|
||||
# Exponential moving average of gradient values
|
||||
state['exp_avg'] = torch.zeros_like(p.data)
|
||||
# Exponential moving average of squared gradient values
|
||||
state['exp_avg_sq'] = torch.zeros_like(p.data)
|
||||
|
||||
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
|
||||
beta1, beta2 = group['b1'], group['b2']
|
||||
|
||||
state['step'] += 1
|
||||
|
||||
# Add grad clipping
|
||||
if group['max_grad_norm'] > 0:
|
||||
clip_grad_norm_(p, group['max_grad_norm'])
|
||||
|
||||
# Decay the first and second moment running average coefficient
|
||||
exp_avg.mul_(beta1).add_(1 - beta1, grad)
|
||||
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
|
||||
denom = exp_avg_sq.sqrt().add_(group['e'])
|
||||
|
||||
bias_correction1 = 1 - beta1 ** state['step']
|
||||
bias_correction2 = 1 - beta2 ** state['step']
|
||||
|
||||
lr_scheduled = group['lr']
|
||||
lr_scheduled *= group['schedule'].get_lr(state['step'])
|
||||
|
||||
step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
|
||||
|
||||
p.data.addcdiv_(-step_size, exp_avg, denom)
|
||||
|
||||
# Add weight decay at the end (fixed version)
|
||||
if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0:
|
||||
p.data.add_(-lr_scheduled * group['weight_decay'], p.data)
|
||||
|
||||
return loss
|
||||
@@ -20,10 +20,9 @@ import unittest
|
||||
|
||||
import torch
|
||||
|
||||
from pytorch_transformers import BertAdam
|
||||
from pytorch_transformers import OpenAIAdam
|
||||
from pytorch_transformers.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \
|
||||
WarmupCosineWithWarmupRestartsSchedule, WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule
|
||||
from pytorch_transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule,
|
||||
WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
@@ -34,12 +33,12 @@ class OptimizationTest(unittest.TestCase):
|
||||
for a, b in zip(list1, list2):
|
||||
self.assertAlmostEqual(a, b, delta=tol)
|
||||
|
||||
def test_adam(self):
|
||||
def test_adam_w(self):
|
||||
w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
|
||||
target = torch.tensor([0.4, 0.2, -0.5])
|
||||
criterion = torch.nn.MSELoss()
|
||||
# No warmup, constant schedule, no gradient clipping
|
||||
optimizer = BertAdam(params=[w], lr=2e-1,
|
||||
optimizer = AdamW(params=[w], lr=2e-1,
|
||||
weight_decay=0.0,
|
||||
max_grad_norm=-1)
|
||||
for _ in range(100):
|
||||
@@ -52,23 +51,13 @@ class OptimizationTest(unittest.TestCase):
|
||||
|
||||
|
||||
class ScheduleInitTest(unittest.TestCase):
|
||||
def test_bert_sched_init(self):
|
||||
def test_sched_init(self):
|
||||
m = torch.nn.Linear(50, 50)
|
||||
optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)
|
||||
optim = AdamW(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)
|
||||
self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
|
||||
optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none")
|
||||
optim = AdamW(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none")
|
||||
self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
|
||||
optim = BertAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
|
||||
self.assertTrue(isinstance(optim.param_groups[0]["schedule"], WarmupLinearSchedule))
|
||||
# shouldn't fail
|
||||
|
||||
def test_openai_sched_init(self):
|
||||
m = torch.nn.Linear(50, 50)
|
||||
optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)
|
||||
self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
|
||||
optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none")
|
||||
self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
|
||||
optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
|
||||
optim = AdamW(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
|
||||
self.assertTrue(isinstance(optim.param_groups[0]["schedule"], WarmupLinearSchedule))
|
||||
# shouldn't fail
|
||||
|
||||
|
||||
@@ -98,14 +98,14 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
||||
self.build_vocab()
|
||||
|
||||
def count_file(self, path, verbose=False, add_eos=False):
|
||||
if verbose: print('counting file {} ...'.format(path))
|
||||
if verbose: logger.info('counting file {} ...'.format(path))
|
||||
assert os.path.exists(path)
|
||||
|
||||
sents = []
|
||||
with open(path, 'r', encoding='utf-8') as f:
|
||||
for idx, line in enumerate(f):
|
||||
if verbose and idx > 0 and idx % 500000 == 0:
|
||||
print(' line {}'.format(idx))
|
||||
logger.info(' line {}'.format(idx))
|
||||
symbols = self.tokenize(line, add_eos=add_eos)
|
||||
self.counter.update(symbols)
|
||||
sents.append(symbols)
|
||||
@@ -116,10 +116,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
||||
"""
|
||||
sents : a list of sentences, each a list of tokenized symbols
|
||||
"""
|
||||
if verbose: print('counting {} sents ...'.format(len(sents)))
|
||||
if verbose: logger.info('counting {} sents ...'.format(len(sents)))
|
||||
for idx, symbols in enumerate(sents):
|
||||
if verbose and idx > 0 and idx % 500000 == 0:
|
||||
print(' line {}'.format(idx))
|
||||
logger.info(' line {}'.format(idx))
|
||||
self.counter.update(symbols)
|
||||
|
||||
def _build_from_file(self, vocab_file):
|
||||
@@ -147,11 +147,11 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
||||
|
||||
def build_vocab(self):
|
||||
if self.vocab_file:
|
||||
print('building vocab from {}'.format(self.vocab_file))
|
||||
logger.info('building vocab from {}'.format(self.vocab_file))
|
||||
self._build_from_file(self.vocab_file)
|
||||
print('final vocab size {}'.format(len(self)))
|
||||
logger.info('final vocab size {}'.format(len(self)))
|
||||
else:
|
||||
print('building vocab with min_freq={}, max_size={}'.format(
|
||||
logger.info('building vocab with min_freq={}, max_size={}'.format(
|
||||
self.min_freq, self.max_size))
|
||||
self.idx2sym = []
|
||||
self.sym2idx = OrderedDict()
|
||||
@@ -163,18 +163,18 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
||||
if cnt < self.min_freq: break
|
||||
self.add_symbol(sym)
|
||||
|
||||
print('final vocab size {} from {} unique tokens'.format(
|
||||
logger.info('final vocab size {} from {} unique tokens'.format(
|
||||
len(self), len(self.counter)))
|
||||
|
||||
def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
|
||||
add_double_eos=False):
|
||||
if verbose: print('encoding file {} ...'.format(path))
|
||||
if verbose: logger.info('encoding file {} ...'.format(path))
|
||||
assert os.path.exists(path)
|
||||
encoded = []
|
||||
with open(path, 'r', encoding='utf-8') as f:
|
||||
for idx, line in enumerate(f):
|
||||
if verbose and idx > 0 and idx % 500000 == 0:
|
||||
print(' line {}'.format(idx))
|
||||
logger.info(' line {}'.format(idx))
|
||||
symbols = self.tokenize(line, add_eos=add_eos,
|
||||
add_double_eos=add_double_eos)
|
||||
encoded.append(self.convert_to_tensor(symbols))
|
||||
@@ -185,11 +185,11 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
||||
return encoded
|
||||
|
||||
def encode_sents(self, sents, ordered=False, verbose=False):
|
||||
if verbose: print('encoding {} sents ...'.format(len(sents)))
|
||||
if verbose: logger.info('encoding {} sents ...'.format(len(sents)))
|
||||
encoded = []
|
||||
for idx, symbols in enumerate(sents):
|
||||
if verbose and idx > 0 and idx % 500000 == 0:
|
||||
print(' line {}'.format(idx))
|
||||
logger.info(' line {}'.format(idx))
|
||||
encoded.append(self.convert_to_tensor(symbols))
|
||||
|
||||
if ordered:
|
||||
@@ -218,7 +218,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
||||
if sym in self.sym2idx:
|
||||
return self.sym2idx[sym]
|
||||
else:
|
||||
# print('encounter unk {}'.format(sym))
|
||||
# logger.info('encounter unk {}'.format(sym))
|
||||
# assert '<eos>' not in sym
|
||||
if hasattr(self, 'unk_idx'):
|
||||
return self.sym2idx.get(sym, self.unk_idx)
|
||||
@@ -544,14 +544,14 @@ def get_lm_corpus(datadir, dataset):
|
||||
fn = os.path.join(datadir, 'cache.pt')
|
||||
fn_pickle = os.path.join(datadir, 'cache.pkl')
|
||||
if os.path.exists(fn):
|
||||
print('Loading cached dataset...')
|
||||
logger.info('Loading cached dataset...')
|
||||
corpus = torch.load(fn_pickle)
|
||||
elif os.path.exists(fn):
|
||||
print('Loading cached dataset from pickle...')
|
||||
logger.info('Loading cached dataset from pickle...')
|
||||
with open(fn, "rb") as fp:
|
||||
corpus = pickle.load(fp)
|
||||
else:
|
||||
print('Producing dataset {}...'.format(dataset))
|
||||
logger.info('Producing dataset {}...'.format(dataset))
|
||||
kwargs = {}
|
||||
if dataset in ['wt103', 'wt2']:
|
||||
kwargs['special'] = ['<eos>']
|
||||
|
||||
Reference in New Issue
Block a user