From ba10065c4b44d733d135ad6dc1b8a77f88c6dbb9 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 7 Nov 2019 15:55:36 +0100 Subject: [PATCH] update model, conversion script, tests and template --- ...t_xxx_original_tf_checkpoint_to_pytorch.py | 10 +- transformers/__init__.py | 1 + transformers/configuration_t5.py | 13 +- ...rt_t5_original_tf_checkpoint_to_pytorch.py | 12 +- transformers/modeling_t5.py | 129 ++++++++++++------ transformers/tests/modeling_common_test.py | 41 +++--- transformers/tests/modeling_t5_test.py | 12 +- transformers/tokenization_t5.py | 1 + 8 files changed, 135 insertions(+), 84 deletions(-) diff --git a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py index d50d129cba..9d389deaad 100755 --- a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py +++ b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py @@ -26,9 +26,9 @@ from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx import logging logging.basicConfig(level=logging.INFO) -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, xxx_config_file, pytorch_dump_path): +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): # Initialise PyTorch model - config = XxxConfig.from_json_file(xxx_config_file) + config = XxxConfig.from_json_file(config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = XxxForPreTraining(config) @@ -48,11 +48,11 @@ if __name__ == "__main__": type = str, required = True, help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--xxx_config_file", + parser.add_argument("--config_file", default = None, type = str, required = True, - help = "The config json file corresponding to the pre-trained XXX model. \n" + help = "The config json file corresponding to the pre-trained model. \n" "This specifies the model architecture.") parser.add_argument("--pytorch_dump_path", default = None, @@ -61,5 +61,5 @@ if __name__ == "__main__": help = "Path to the output PyTorch model.") args = parser.parse_args() convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.xxx_config_file, + args.config_file, args.pytorch_dump_path) diff --git a/transformers/__init__.py b/transformers/__init__.py index bf896276d6..601a068592 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -97,6 +97,7 @@ if is_torch_available(): DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel, + load_tf_weights_in_t5, T5_PRETRAINED_MODEL_ARCHIVE_MAP) # Optimization diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py index 9db918e59f..96e67758ac 100644 --- a/transformers/configuration_t5.py +++ b/transformers/configuration_t5.py @@ -57,8 +57,7 @@ class T5Config(PretrainedConfig): (e.g., 512 or 1024 or 2048). type_vocab_size: The vocabulary size of the `token_type_ids` passed into `T5Model`. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. + initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing). layer_norm_eps: The epsilon used by LayerNorm. """ pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP @@ -67,25 +66,27 @@ class T5Config(PretrainedConfig): vocab_size_or_config_json_file=32128, n_positions=512, d_model=512, + d_kv=64, d_ff=2048, - num_layers=12, - num_heads=12, + num_layers=6, + num_heads=8, relative_attention_num_buckets=32, dropout_rate=0.1, layer_norm_epsilon=1e-6, - initializer_range=0.02, + initializer_factor=1.0, **kwargs): super(T5Config, self).__init__(**kwargs) self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 self.n_positions = n_positions self.d_model = d_model + self.d_kv = d_kv self.d_ff = d_ff self.num_layers = num_layers self.num_heads = num_heads self.relative_attention_num_buckets = relative_attention_num_buckets self.dropout_rate = dropout_rate self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range + self.initializer_factor = initializer_factor if isinstance(vocab_size_or_config_json_file, six.string_types): with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: diff --git a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py index 608027ebac..2b74d2dd93 100755 --- a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py @@ -21,16 +21,16 @@ from __future__ import print_function import argparse import torch -from transformers import T5Config, T5ForPreTraining, load_tf_weights_in_t5 +from transformers import T5Config, T5Model, load_tf_weights_in_t5 import logging logging.basicConfig(level=logging.INFO) -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, t5_config_file, pytorch_dump_path): +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): # Initialise PyTorch model - config = T5Config.from_json_file(t5_config_file) + config = T5Config.from_json_file(config_file) print("Building PyTorch model from configuration: {}".format(str(config))) - model = T5ForPreTraining(config) + model = T5Model(config) # Load weights from tf checkpoint load_tf_weights_in_t5(model, config, tf_checkpoint_path) @@ -48,7 +48,7 @@ if __name__ == "__main__": type = str, required = True, help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--t5_config_file", + parser.add_argument("--config_file", default = None, type = str, required = True, @@ -61,5 +61,5 @@ if __name__ == "__main__": help = "Path to the output PyTorch model.") args = parser.parse_args() convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.t5_config_file, + args.config_file, args.pytorch_dump_path) diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index ce443cf882..6ed241761a 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -65,34 +65,40 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] - arrays = [] + tf_weights = {} for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) - arrays.append(array) + tf_weights[name] = array - for name, array in zip(names, arrays): - name = name.split('/') + for txt_name in names: + name = txt_name.split('/') # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any(n in ["adam_v", "adam_m", "global_step"] for n in name): logger.info("Skipping {}".format("/".join(name))) + tf_weights.pop(txt_name, None) + continue + if '_slot_' in name[-1]: + logger.info("Skipping {}".format("/".join(name))) + tf_weights.pop(txt_name, None) continue pointer = model + array = tf_weights[txt_name] for m_name in name: if re.fullmatch(r'[A-Za-z]+_\d+', m_name): l = re.split(r'_(\d+)', m_name) else: l = [m_name] - if l[0] == 'kernel' or l[0] == 'gamma': + if l[0] in ['kernel', 'scale', 'embedding']: pointer = getattr(pointer, 'weight') - elif l[0] == 'output_bias' or l[0] == 'beta': - pointer = getattr(pointer, 'bias') - elif l[0] == 'output_weights': - pointer = getattr(pointer, 'weight') - elif l[0] == 'squad': - pointer = getattr(pointer, 'classifier') + # elif l[0] == 'scale': + # pointer = getattr(pointer, 'weight') + # elif l[0] == 'output_bias' or l[0] == 'beta': + # pointer = getattr(pointer, 'bias') + # elif l[0] == 'squad': + # pointer = getattr(pointer, 'classifier') else: try: pointer = getattr(pointer, l[0]) @@ -102,9 +108,10 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): if len(l) >= 2: num = int(l[1]) pointer = pointer[num] - if m_name[-11:] == '_embeddings': + if l[0] not in ['kernel', 'scale', 'embedding']: pointer = getattr(pointer, 'weight') - elif m_name == 'kernel': + if l[0] != 'embedding': + logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name)) array = np.transpose(array) try: assert pointer.shape == array.shape @@ -112,7 +119,11 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) - pointer.data = torch.from_numpy(array) + pointer.data = torch.from_numpy(array.astype(np.float32)) + tf_weights.pop(txt_name, None) + + logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) + # logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) return model @@ -163,10 +174,13 @@ class T5Attention(nn.Module): self.output_attentions = config.output_attentions self.relative_attention_num_buckets = config.relative_attention_num_buckets self.dim = config.d_model + self.d_kv = config.d_kv self.n_heads = config.num_heads self.dropout = config.dropout_rate assert self.dim % self.n_heads == 0 + assert self.dim // self.n_heads == self.d_kv + # Mesh TensorFlow initialization to avoid scaling before softmax self.q = nn.Linear(self.dim, self.dim, bias=False) self.k = nn.Linear(self.dim, self.dim, bias=False) self.v = nn.Linear(self.dim, self.dim, bias=False) @@ -312,8 +326,9 @@ class T5Attention(nn.Module): scores += position_bias if mask is not None: - mask = (mask == 0).expand_as(scores) # (bs, n_heads, qlen, klen) - scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) + scores += mask + # mask = (mask == 0).expand_as(scores) # (bs, n_heads, qlen, klen) + # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) @@ -378,34 +393,35 @@ class T5Block(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super(T5Block, self).__init__() self.is_decoder = config.is_decoder - self.layer_000 = T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer = nn.ModuleList() + self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) if self.is_decoder: - self.layer_001 = T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_002 = T5LayerFF(config) + self.layer.append(T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias)) + self.layer.append(T5LayerFF(config)) else: - self.layer_001 = T5LayerFF(config) + self.layer.append(T5LayerFF(config)) def forward(self, hidden_states, attention_mask=None, position_bias=None, encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None, head_mask=None): - self_attention_outputs = self.layer_000(hidden_states, + self_attention_outputs = self.layer[0](hidden_states, attention_mask=attention_mask, position_bias=position_bias, head_mask=head_mask) hidden_states = self_attention_outputs[0] outputs = self_attention_outputs[1:] - if self.is_decoder: - cross_attention_outputs = self.layer_001(hidden_states, - kv=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - head_mask=head_mask) + if not self.is_decoder: + hidden_states = self.layer[1](hidden_states) + else: + cross_attention_outputs = self.layer[1](hidden_states, + kv=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + head_mask=head_mask) hidden_states = cross_attention_outputs[0] outputs = cross_attention_outputs[1:] + outputs - hidden_states = self.layer_002(hidden_states) - else: - hidden_states = self.layer_001(hidden_states) + hidden_states = self.layer[2](hidden_states) outputs = (hidden_states,) + outputs # add attentions if we output them return outputs @@ -422,15 +438,36 @@ class T5PreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: + factor = self.config.initializer_factor # Used for testing weights initialization + if isinstance(module, nn.LayerNorm): module.bias.data.zero_() + module.weight.data.fill_(factor*1.0) + elif isinstance(module, T5Model): + # Mesh TensorFlow embeddings initialization + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + module.shared.weight.data.normal_(mean=0.0, std=factor*1.0) + elif isinstance(module, T5DenseReluDense): + # Mesh TensorFlow FF initialization + # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 + # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 + module.wi.weight.data.normal_(mean=0.0, std=factor*((self.config.d_model) ** -0.5)) + if hasattr(module.wi, 'bias') and module.wi.bias is not None: + module.wi.bias.data.zero_() + module.wo.weight.data.normal_(mean=0.0, std=factor*((self.config.d_ff) ** -0.5)) + if hasattr(module.wo, 'bias') and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, T5Attention): + # Mesh TensorFlow attention initialization to avoid scaling before softmax + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 + d_model = self.config.d_model + d_kv = self.config.d_kv + n_heads = self.config.num_heads + module.q.weight.data.normal_(mean=0.0, std=factor*((d_model * d_kv) ** -0.5)) + module.k.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5)) + module.v.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5)) + module.o.weight.data.normal_(mean=0.0, std=factor*((n_heads * d_kv) ** -0.5)) + if module.has_relative_attention_bias: + module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor*((d_model) ** -0.5)) class T5Stack(T5PreTrainedModel): @@ -440,8 +477,8 @@ class T5Stack(T5PreTrainedModel): self.output_hidden_states = config.output_hidden_states self.is_decoder = config.is_decoder - self.blocks = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0)) - for i in range(config.num_layers)]) + self.block = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0)) + for i in range(config.num_layers)]) self.final_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -518,7 +555,7 @@ class T5Stack(T5PreTrainedModel): all_attentions = () position_bias = None encoder_decoder_position_bias = None - for i, layer_module in enumerate(self.blocks): + for i, layer_module in enumerate(self.block): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -724,9 +761,10 @@ class T5WithLMHeadModel(T5PreTrainedModel): """ def __init__(self, config): super(T5WithLMHeadModel, self).__init__(config) + self.model_dim = config.d_model self.transformer = T5Model(config) - self.lm_head = nn.Linear(config.d_model, config.vocab_size) + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) self.init_weights() @@ -738,15 +776,18 @@ class T5WithLMHeadModel(T5PreTrainedModel): outputs = self.transformer(**kwargs) sequence_output = outputs[0] + # Rescale output before projecting on vocab + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.model_dim ** -0.5) lm_logits = self.lm_head(sequence_output) - outputs = (lm_logits,) + outputs[2:] # Add hidden states and attention if they are here + outputs = (lm_logits,) + outputs[1:] # Add hidden states and attention if they are here if lm_labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss(ignore_index=-1) loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - outputs = (loss,) + outputs + outputs = (loss,) + outputs # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 return outputs # (lm_loss), lm_logits, (hidden_states), (attentions) diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index 42bf9ac3f5..ee75da605c 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -59,7 +59,7 @@ else: def _config_zero_init(config): configs_no_init = copy.deepcopy(config) for key in configs_no_init.__dict__.keys(): - if '_range' in key or '_std' in key: + if '_range' in key or '_std' in key or 'initializer_factor' in key: setattr(configs_no_init, key, 0.0) return configs_no_init @@ -83,20 +83,24 @@ class CommonTestCases: model.eval() with torch.no_grad(): outputs = model(**inputs_dict) + out_2 = outputs[0].numpy() + out_2[np.isnan(out_2)] = 0 with TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model = model_class.from_pretrained(tmpdirname) - with torch.no_grad(): - after_outputs = model(**inputs_dict) - # Make sure we don't have nans - out_1 = after_outputs[0].numpy() - out_2 = outputs[0].numpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) + with torch.no_grad(): + after_outputs = model(**inputs_dict) + + # # Make sure we don't have nans + out_1 = after_outputs[0].numpy() + out_1[np.isnan(out_1)] = 0 + + out_1 = out_1 - out_2 + amax = np.amax(out_1) + amin = np.amin(out_1) + self.assertLessEqual(max(amax, -amin), 1e-5) def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -127,27 +131,28 @@ class CommonTestCases: model = model_class(config) model.eval() outputs = model(**inputs_dict) - self_attentions = outputs[-1] + attentions = outputs[-1] self.assertEqual(model.config.output_attentions, True) self.assertEqual(model.config.output_hidden_states, False) - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( - list(self_attentions[0].shape[-3:]), + list(attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, self.model_tester.seq_length, self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) out_len = len(outputs) if self.is_encoder_decoder: - cross_attentions = outputs[-2] + self.assertEqual(out_len % 2, 0) + decoder_attentions = outputs[(out_len // 2)-1] self.assertEqual(model.config.output_attentions, True) self.assertEqual(model.config.output_hidden_states, False) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( - list(cross_attentions[0].shape[-3:]), + list(decoder_attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, - self.model_tester.seq_length, - self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) + self.model_tester.seq_length, + self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) # Check attention is always last and order is fine config.output_attentions = True diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py index b8bb828ebd..2c67b83c25 100644 --- a/transformers/tests/modeling_t5_test.py +++ b/transformers/tests/modeling_t5_test.py @@ -57,7 +57,7 @@ class T5ModelTest(CommonTestCases.CommonModelTester): d_ff=37, relative_attention_num_buckets=8, dropout_rate=0.1, - initializer_range=0.02, + initializer_factor=0.002, scope=None, ): self.parent = parent @@ -74,7 +74,7 @@ class T5ModelTest(CommonTestCases.CommonModelTester): self.d_ff = d_ff self.relative_attention_num_buckets = relative_attention_num_buckets self.dropout_rate = dropout_rate - self.initializer_range = initializer_range + self.initializer_factor = initializer_factor self.scope = scope def prepare_config_and_inputs(self): @@ -93,11 +93,12 @@ class T5ModelTest(CommonTestCases.CommonModelTester): n_positions=self.n_positions, d_model=self.hidden_size, d_ff=self.d_ff, + d_kv=self.hidden_size // self.num_attention_heads, num_layers=self.num_hidden_layers, num_heads=self.num_attention_heads, relative_attention_num_buckets=self.relative_attention_num_buckets, dropout_rate=self.dropout_rate, - initializer_range=self.initializer_range) + initializer_factor=self.initializer_factor) return (config, input_ids, input_mask, token_labels) @@ -130,8 +131,9 @@ class T5ModelTest(CommonTestCases.CommonModelTester): def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels): model = T5WithLMHeadModel(config=config) model.eval() - loss, prediction_scores = model(encoder_input_ids=input_ids, decoder_input_ids=input_ids, - decoder_attention_mask=input_mask, decoder_lm_labels=token_labels) + outputs = model(encoder_input_ids=input_ids, decoder_input_ids=input_ids, + decoder_attention_mask=input_mask, decoder_lm_labels=token_labels) + loss, prediction_scores = outputs[0], outputs[1] result = { "loss": loss, "prediction_scores": prediction_scores, diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py index cff6a41baf..ae898ba0d3 100644 --- a/transformers/tokenization_t5.py +++ b/transformers/tokenization_t5.py @@ -18,6 +18,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging import os +from shutil import copyfile from .tokenization_utils import PreTrainedTokenizer