Merge branch 'master' into pr/1383

This commit is contained in:
thomwolf
2019-10-09 17:25:08 +02:00
45 changed files with 1707 additions and 448 deletions

View File

@@ -28,7 +28,8 @@ logger = logging.getLogger(__name__)
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"}
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",}
class GPT2Config(PretrainedConfig):
"""Configuration class to store the configuration of a `GPT2Model`.

View File

@@ -178,10 +178,12 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
else:
model_file = cached_path(model_shortcut_name, force_download=not use_cached_models)
convert_pt_checkpoint_to_tf(model_type,
model_file,
config_file,
os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
if os.path.isfile(model_shortcut_name):
model_shortcut_name = 'converted_model'
convert_pt_checkpoint_to_tf(model_type=model_type,
pytorch_checkpoint_path=model_file,
config_file=config_file,
tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
compare_with_pt_model=compare_with_pt_model)
os.remove(config_file)
os.remove(model_file)

View File

@@ -118,7 +118,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
def gelu(x):
""" Original Implementation of the gelu activation function in Google Bert repo when initialy created.
""" Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
Also see https://arxiv.org/abs/1606.08415

View File

@@ -159,8 +159,6 @@ class MultiHeadSelfAttention(nn.Module):
dim_per_head = self.dim // self.n_heads
assert 2 <= mask.dim() <= 3
causal = (mask.dim() == 3)
mask_reshp = (bs, 1, 1, k_length)
def shape(x):
@@ -649,7 +647,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss, start_scores, end_scores = outputs[:2]
loss, start_scores, end_scores = outputs[:3]
"""
def __init__(self, config):

View File

@@ -38,7 +38,8 @@ logger = logging.getLogger(__name__)
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"}
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",}
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
""" Load tf checkpoints in a pytorch model

View File

@@ -62,7 +62,7 @@ def load_bert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
def gelu(x):
""" Gaussian Error Linear Unit.
Original Implementation of the gelu activation function in Google Bert repo when initialy created.
Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
Also see https://arxiv.org/abs/1606.08415

View File

@@ -45,7 +45,7 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
def gelu(x):
""" Gaussian Error Linear Unit.
Original Implementation of the gelu activation function in Google Bert repo when initialy created.
Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
Also see https://arxiv.org/abs/1606.08415
@@ -226,8 +226,6 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
dim_per_head = self.dim // self.n_heads
assert 2 <= len(tf.shape(mask)) <= 3
causal = (len(tf.shape(mask)) == 3)
mask_reshape = [bs, 1, 1, k_length]
def shape(x):
@@ -603,7 +601,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids, masked_lm_labels=input_ids)
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
@@ -715,9 +713,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
start_positions = tf.constant([1])
end_positions = tf.constant([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
outputs = model(input_ids)
start_scores, end_scores = outputs[:2]
"""

View File

@@ -38,7 +38,8 @@ logger = logging.getLogger(__name__)
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5"}
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",}
def load_gpt2_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):

View File

@@ -69,7 +69,7 @@ def create_sinusoidal_embeddings(n_pos, dim, out):
def gelu(x):
""" Gaussian Error Linear Unit.
Original Implementation of the gelu activation function in Google Bert repo when initialy created.
Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
Also see https://arxiv.org/abs/1606.08415

View File

@@ -501,7 +501,10 @@ class PoolerEndLogits(nn.Module):
x = self.dense_1(x).squeeze(-1)
if p_mask is not None:
x = x * (1 - p_mask) - 1e30 * p_mask
if next(self.parameters()).dtype == torch.float16:
x = x * (1 - p_mask) - 65500 * p_mask
else:
x = x * (1 - p_mask) - 1e30 * p_mask
return x

View File

@@ -46,12 +46,14 @@ PRETRAINED_VOCAB_FILES_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
},
'merges_file':
{
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
},
}
@@ -59,6 +61,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'gpt2': 1024,
'gpt2-medium': 1024,
'gpt2-large': 1024,
'distilgpt2': 1024,
}
@lru_cache()

View File

@@ -512,7 +512,8 @@ class PreTrainedTokenizer(object):
for token in new_tokens:
assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
if token != self.unk_token and \
self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
token not in to_add_tokens:
to_add_tokens.append(token)
logger.info("Adding %s to the vocabulary", token)
@@ -911,6 +912,11 @@ class PreTrainedTokenizer(object):
Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
with options to remove special tokens and clean up tokenization spaces.
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
Args:
token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods.
skip_special_tokens: if set to True, will replace special tokens.
clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
"""
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
@@ -933,20 +939,11 @@ class PreTrainedTokenizer(object):
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
text = ''.join(sub_texts)
if self._sep_token is not None and self._sep_token in text:
text = text.replace(self._cls_token, self._sep_token)
split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self._sep_token)))
if clean_up_tokenization_spaces:
clean_text = [self.clean_up_tokenization(text) for text in split_text]
return clean_text
else:
return split_text
if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text)
return clean_text
else:
if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text)
return clean_text
else:
return text
return text
@property
def special_tokens_map(self):