Merge remote-tracking branch 'upstream/master'
This commit is contained in:
@@ -1234,9 +1234,9 @@ A command-line interface is provided to convert a TensorFlow checkpoint in a PyT
|
|||||||
|
|
||||||
### BERT
|
### BERT
|
||||||
|
|
||||||
You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`./pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py`](convert_tf_checkpoint_to_pytorch.py) script.
|
You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`convert_tf_checkpoint_to_pytorch.py`](./pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py ) script.
|
||||||
|
|
||||||
This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using `torch.load()` (see examples in [`extract_features.py`](./examples/extract_features.py), [`run_classifier.py`](./examples/run_classifier.py) and [`run_squad.py`]((./examples/run_squad.py))).
|
This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using `torch.load()` (see examples in [`extract_features.py`](./examples/extract_features.py), [`run_classifier.py`](./examples/run_classifier.py) and [`run_squad.py`](./examples/run_squad.py)).
|
||||||
|
|
||||||
You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with `bert_model.ckpt`) but be sure to keep the configuration file (`bert_config.json`) and the vocabulary file (`vocab.txt`) as these are needed for the PyTorch model too.
|
You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with `bert_model.ckpt`) but be sure to keep the configuration file (`bert_config.json`) and the vocabulary file (`vocab.txt`) as these are needed for the PyTorch model too.
|
||||||
|
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ class InputFeatures(object):
|
|||||||
|
|
||||||
|
|
||||||
def convert_examples_to_features(examples, seq_length, tokenizer):
|
def convert_examples_to_features(examples, seq_length, tokenizer):
|
||||||
"""Loads a data file into a list of `InputBatch`s."""
|
"""Loads a data file into a list of `InputFeature`s."""
|
||||||
|
|
||||||
features = []
|
features = []
|
||||||
for (ex_index, example) in enumerate(examples):
|
for (ex_index, example) in enumerate(examples):
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ class DocumentDatabase:
|
|||||||
self._precalculate_doc_weights()
|
self._precalculate_doc_weights()
|
||||||
rand_start = self.doc_cumsum[current_idx]
|
rand_start = self.doc_cumsum[current_idx]
|
||||||
rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
|
rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
|
||||||
sentence_index = randint(rand_start, rand_end) % self.cumsum_max
|
sentence_index = randint(rand_start, rand_end-1) % self.cumsum_max
|
||||||
sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
|
sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
|
||||||
else:
|
else:
|
||||||
# If we don't use sentence weighting, then every doc has an equal chance to be chosen
|
# If we don't use sentence weighting, then every doc has an equal chance to be chosen
|
||||||
|
|||||||
@@ -442,7 +442,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
|
|||||||
# sequence or the second sequence. The embedding vectors for `type=0` and
|
# sequence or the second sequence. The embedding vectors for `type=0` and
|
||||||
# `type=1` were learned during pre-training and are added to the wordpiece
|
# `type=1` were learned during pre-training and are added to the wordpiece
|
||||||
# embedding vector (and position vector). This is not *strictly* necessary
|
# embedding vector (and position vector). This is not *strictly* necessary
|
||||||
# since the [SEP] token unambigiously separates the sequences, but it makes
|
# since the [SEP] token unambiguously separates the sequences, but it makes
|
||||||
# it easier for the model to learn the concept of sequences.
|
# it easier for the model to learn the concept of sequences.
|
||||||
#
|
#
|
||||||
# For classification tasks, the first vector (corresponding to [CLS]) is
|
# For classification tasks, the first vector (corresponding to [CLS]) is
|
||||||
|
|||||||
@@ -85,9 +85,9 @@ class SquadExample(object):
|
|||||||
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
|
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
|
||||||
if self.start_position:
|
if self.start_position:
|
||||||
s += ", start_position: %d" % (self.start_position)
|
s += ", start_position: %d" % (self.start_position)
|
||||||
if self.start_position:
|
if self.end_position:
|
||||||
s += ", end_position: %d" % (self.end_position)
|
s += ", end_position: %d" % (self.end_position)
|
||||||
if self.start_position:
|
if self.is_impossible:
|
||||||
s += ", is_impossible: %r" % (self.is_impossible)
|
s += ", is_impossible: %r" % (self.is_impossible)
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
|
|||||||
name = name.split('/')
|
name = name.split('/')
|
||||||
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
|
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
|
||||||
# which are not required for using pretrained model
|
# which are not required for using pretrained model
|
||||||
if any(n in ["adam_v", "adam_m"] for n in name):
|
if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
|
||||||
print("Skipping {}".format("/".join(name)))
|
print("Skipping {}".format("/".join(name)))
|
||||||
continue
|
continue
|
||||||
pointer = model
|
pointer = model
|
||||||
@@ -91,8 +91,14 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
|
|||||||
pointer = getattr(pointer, 'bias')
|
pointer = getattr(pointer, 'bias')
|
||||||
elif l[0] == 'output_weights':
|
elif l[0] == 'output_weights':
|
||||||
pointer = getattr(pointer, 'weight')
|
pointer = getattr(pointer, 'weight')
|
||||||
|
elif l[0] == 'squad':
|
||||||
|
pointer = getattr(pointer, 'classifier')
|
||||||
else:
|
else:
|
||||||
|
try:
|
||||||
pointer = getattr(pointer, l[0])
|
pointer = getattr(pointer, l[0])
|
||||||
|
except AttributeError:
|
||||||
|
print("Skipping {}".format("/".join(name)))
|
||||||
|
continue
|
||||||
if len(l) >= 2:
|
if len(l) >= 2:
|
||||||
num = int(l[1])
|
num = int(l[1])
|
||||||
pointer = pointer[num]
|
pointer = pointer[num]
|
||||||
|
|||||||
@@ -617,8 +617,14 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
|
hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
|
||||||
lm_logits = self.lm_head(hidden_states)
|
lm_logits = self.lm_head(hidden_states)
|
||||||
if lm_labels is not None:
|
if lm_labels is not None:
|
||||||
|
# Shift so that tokens < n predict n
|
||||||
|
shift_logits = lm_logits[:, :-1].contiguous()
|
||||||
|
shift_labels = lm_labels[:, 1:].contiguous()
|
||||||
|
|
||||||
|
# Flatten the tokens
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||||
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
|
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
||||||
|
shift_labels.view(-1))
|
||||||
return loss
|
return loss
|
||||||
return lm_logits, presents
|
return lm_logits, presents
|
||||||
|
|
||||||
@@ -690,8 +696,11 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
|
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
|
||||||
losses = []
|
losses = []
|
||||||
if lm_labels is not None:
|
if lm_labels is not None:
|
||||||
|
shift_logits = lm_logits[:, :-1].contiguous()
|
||||||
|
shift_labels = lm_labels[:, 1:].contiguous()
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||||
losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
|
losses.append(loss_fct(shift_logits.view(-1,
|
||||||
|
shift_logits.size(-1)), shift_labels.view(-1)))
|
||||||
if mc_labels is not None:
|
if mc_labels is not None:
|
||||||
loss_fct = CrossEntropyLoss()
|
loss_fct = CrossEntropyLoss()
|
||||||
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
|
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
|
||||||
|
|||||||
@@ -716,8 +716,14 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
|||||||
hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
|
hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
|
||||||
lm_logits = self.lm_head(hidden_states)
|
lm_logits = self.lm_head(hidden_states)
|
||||||
if lm_labels is not None:
|
if lm_labels is not None:
|
||||||
|
# Shift so that tokens < n predict n
|
||||||
|
shift_logits = lm_logits[:, :-1].contiguous()
|
||||||
|
shift_labels = lm_labels[:, 1:].contiguous()
|
||||||
|
|
||||||
|
# Flatten the tokens
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||||
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
|
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
||||||
|
shift_labels.view(-1))
|
||||||
return loss
|
return loss
|
||||||
return lm_logits
|
return lm_logits
|
||||||
|
|
||||||
@@ -803,8 +809,11 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|||||||
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
|
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
|
||||||
losses = []
|
losses = []
|
||||||
if lm_labels is not None:
|
if lm_labels is not None:
|
||||||
|
shift_logits = lm_logits[:, :-1].contiguous()
|
||||||
|
shift_labels = lm_labels[:, 1:].contiguous()
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||||
losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
|
losses.append(loss_fct(shift_logits.view(-1,
|
||||||
|
shift_logits.size(-1)), shift_labels.view(-1)))
|
||||||
if mc_labels is not None:
|
if mc_labels is not None:
|
||||||
loss_fct = CrossEntropyLoss()
|
loss_fct = CrossEntropyLoss()
|
||||||
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
|
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
|
||||||
|
|||||||
@@ -26,7 +26,8 @@ logger = logging.getLogger(__name__)
|
|||||||
def warmup_cosine(x, warmup=0.002):
|
def warmup_cosine(x, warmup=0.002):
|
||||||
if x < warmup:
|
if x < warmup:
|
||||||
return x/warmup
|
return x/warmup
|
||||||
return 0.5 * (1.0 + torch.cos(math.pi * x))
|
x_ = (x - warmup) / (1 - warmup) # progress after warmup -
|
||||||
|
return 0.5 * (1. + math.cos(math.pi * x_))
|
||||||
|
|
||||||
def warmup_constant(x, warmup=0.002):
|
def warmup_constant(x, warmup=0.002):
|
||||||
""" Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.
|
""" Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.
|
||||||
|
|||||||
@@ -26,7 +26,8 @@ logger = logging.getLogger(__name__)
|
|||||||
def warmup_cosine(x, warmup=0.002):
|
def warmup_cosine(x, warmup=0.002):
|
||||||
if x < warmup:
|
if x < warmup:
|
||||||
return x/warmup
|
return x/warmup
|
||||||
return 0.5 * (1.0 + torch.cos(math.pi * x))
|
x_ = (x - warmup) / (1 - warmup) # progress after warmup
|
||||||
|
return 0.5 * (1. + math.cos(math.pi * x_))
|
||||||
|
|
||||||
def warmup_constant(x, warmup=0.002):
|
def warmup_constant(x, warmup=0.002):
|
||||||
""" Linearly increases learning rate over `warmup`*`t_total` (as provided to OpenAIAdam) training steps.
|
""" Linearly increases learning rate over `warmup`*`t_total` (as provided to OpenAIAdam) training steps.
|
||||||
|
|||||||
@@ -105,8 +105,8 @@ class BertTokenizer(object):
|
|||||||
self.max_len = max_len if max_len is not None else int(1e12)
|
self.max_len = max_len if max_len is not None else int(1e12)
|
||||||
|
|
||||||
def tokenize(self, text):
|
def tokenize(self, text):
|
||||||
if self.do_basic_tokenize:
|
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
|
if self.do_basic_tokenize:
|
||||||
for token in self.basic_tokenizer.tokenize(text):
|
for token in self.basic_tokenizer.tokenize(text):
|
||||||
for sub_token in self.wordpiece_tokenizer.tokenize(token):
|
for sub_token in self.wordpiece_tokenizer.tokenize(token):
|
||||||
split_tokens.append(sub_token)
|
split_tokens.append(sub_token)
|
||||||
@@ -142,6 +142,16 @@ class BertTokenizer(object):
|
|||||||
"""
|
"""
|
||||||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
|
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
|
||||||
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
|
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||||
|
if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
|
||||||
|
logger.warning("The pre-trained model you are loading is a cased model but you have not set "
|
||||||
|
"`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
|
||||||
|
"you may want to check this behavior.")
|
||||||
|
kwargs['do_lower_case'] = False
|
||||||
|
elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):
|
||||||
|
logger.warning("The pre-trained model you are loading is an uncased model but you have set "
|
||||||
|
"`do_lower_case` to False. We are setting `do_lower_case=True` for you "
|
||||||
|
"but you may want to check this behavior.")
|
||||||
|
kwargs['do_lower_case'] = True
|
||||||
else:
|
else:
|
||||||
vocab_file = pretrained_model_name_or_path
|
vocab_file = pretrained_model_name_or_path
|
||||||
if os.path.isdir(vocab_file):
|
if os.path.isdir(vocab_file):
|
||||||
|
|||||||
Reference in New Issue
Block a user