Merge branch 'master' into pr/1383
This commit is contained in:
@@ -28,7 +28,8 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
|
||||
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
|
||||
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"}
|
||||
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
|
||||
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",}
|
||||
|
||||
class GPT2Config(PretrainedConfig):
|
||||
"""Configuration class to store the configuration of a `GPT2Model`.
|
||||
|
||||
@@ -178,10 +178,12 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
|
||||
else:
|
||||
model_file = cached_path(model_shortcut_name, force_download=not use_cached_models)
|
||||
|
||||
convert_pt_checkpoint_to_tf(model_type,
|
||||
model_file,
|
||||
config_file,
|
||||
os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
|
||||
if os.path.isfile(model_shortcut_name):
|
||||
model_shortcut_name = 'converted_model'
|
||||
convert_pt_checkpoint_to_tf(model_type=model_type,
|
||||
pytorch_checkpoint_path=model_file,
|
||||
config_file=config_file,
|
||||
tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
|
||||
compare_with_pt_model=compare_with_pt_model)
|
||||
os.remove(config_file)
|
||||
os.remove(model_file)
|
||||
|
||||
@@ -118,7 +118,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
|
||||
|
||||
|
||||
def gelu(x):
|
||||
""" Original Implementation of the gelu activation function in Google Bert repo when initialy created.
|
||||
""" Original Implementation of the gelu activation function in Google Bert repo when initially created.
|
||||
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
|
||||
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
Also see https://arxiv.org/abs/1606.08415
|
||||
|
||||
@@ -159,8 +159,6 @@ class MultiHeadSelfAttention(nn.Module):
|
||||
|
||||
dim_per_head = self.dim // self.n_heads
|
||||
|
||||
assert 2 <= mask.dim() <= 3
|
||||
causal = (mask.dim() == 3)
|
||||
mask_reshp = (bs, 1, 1, k_length)
|
||||
|
||||
def shape(x):
|
||||
@@ -649,7 +647,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
|
||||
start_positions = torch.tensor([1])
|
||||
end_positions = torch.tensor([3])
|
||||
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
|
||||
loss, start_scores, end_scores = outputs[:2]
|
||||
loss, start_scores, end_scores = outputs[:3]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
|
||||
@@ -38,7 +38,8 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
|
||||
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
|
||||
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"}
|
||||
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin",
|
||||
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",}
|
||||
|
||||
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
|
||||
""" Load tf checkpoints in a pytorch model
|
||||
|
||||
@@ -62,7 +62,7 @@ def load_bert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
|
||||
|
||||
def gelu(x):
|
||||
""" Gaussian Error Linear Unit.
|
||||
Original Implementation of the gelu activation function in Google Bert repo when initialy created.
|
||||
Original Implementation of the gelu activation function in Google Bert repo when initially created.
|
||||
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
|
||||
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
Also see https://arxiv.org/abs/1606.08415
|
||||
|
||||
@@ -45,7 +45,7 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||
### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
|
||||
def gelu(x):
|
||||
""" Gaussian Error Linear Unit.
|
||||
Original Implementation of the gelu activation function in Google Bert repo when initialy created.
|
||||
Original Implementation of the gelu activation function in Google Bert repo when initially created.
|
||||
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
|
||||
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
Also see https://arxiv.org/abs/1606.08415
|
||||
@@ -226,8 +226,6 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
|
||||
|
||||
dim_per_head = self.dim // self.n_heads
|
||||
|
||||
assert 2 <= len(tf.shape(mask)) <= 3
|
||||
causal = (len(tf.shape(mask)) == 3)
|
||||
mask_reshape = [bs, 1, 1, k_length]
|
||||
|
||||
def shape(x):
|
||||
@@ -603,7 +601,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
|
||||
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
|
||||
model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
|
||||
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
|
||||
outputs = model(input_ids, masked_lm_labels=input_ids)
|
||||
outputs = model(input_ids)
|
||||
prediction_scores = outputs[0]
|
||||
|
||||
"""
|
||||
@@ -715,9 +713,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
|
||||
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
|
||||
model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
|
||||
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
|
||||
start_positions = tf.constant([1])
|
||||
end_positions = tf.constant([3])
|
||||
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
|
||||
outputs = model(input_ids)
|
||||
start_scores, end_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
|
||||
@@ -38,7 +38,8 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
|
||||
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5",
|
||||
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5"}
|
||||
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5",
|
||||
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",}
|
||||
|
||||
|
||||
def load_gpt2_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
|
||||
|
||||
@@ -69,7 +69,7 @@ def create_sinusoidal_embeddings(n_pos, dim, out):
|
||||
|
||||
def gelu(x):
|
||||
""" Gaussian Error Linear Unit.
|
||||
Original Implementation of the gelu activation function in Google Bert repo when initialy created.
|
||||
Original Implementation of the gelu activation function in Google Bert repo when initially created.
|
||||
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
|
||||
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
Also see https://arxiv.org/abs/1606.08415
|
||||
|
||||
@@ -501,7 +501,10 @@ class PoolerEndLogits(nn.Module):
|
||||
x = self.dense_1(x).squeeze(-1)
|
||||
|
||||
if p_mask is not None:
|
||||
x = x * (1 - p_mask) - 1e30 * p_mask
|
||||
if next(self.parameters()).dtype == torch.float16:
|
||||
x = x * (1 - p_mask) - 65500 * p_mask
|
||||
else:
|
||||
x = x * (1 - p_mask) - 1e30 * p_mask
|
||||
|
||||
return x
|
||||
|
||||
|
||||
@@ -46,12 +46,14 @@ PRETRAINED_VOCAB_FILES_MAP = {
|
||||
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
|
||||
'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
|
||||
'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
|
||||
'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
|
||||
},
|
||||
'merges_file':
|
||||
{
|
||||
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
|
||||
'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
|
||||
'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
|
||||
'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -59,6 +61,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
'gpt2': 1024,
|
||||
'gpt2-medium': 1024,
|
||||
'gpt2-large': 1024,
|
||||
'distilgpt2': 1024,
|
||||
}
|
||||
|
||||
@lru_cache()
|
||||
|
||||
@@ -512,7 +512,8 @@ class PreTrainedTokenizer(object):
|
||||
for token in new_tokens:
|
||||
assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
|
||||
if token != self.unk_token and \
|
||||
self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
|
||||
self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
|
||||
token not in to_add_tokens:
|
||||
to_add_tokens.append(token)
|
||||
logger.info("Adding %s to the vocabulary", token)
|
||||
|
||||
@@ -911,6 +912,11 @@ class PreTrainedTokenizer(object):
|
||||
Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
|
||||
with options to remove special tokens and clean up tokenization spaces.
|
||||
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
|
||||
|
||||
Args:
|
||||
token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods.
|
||||
skip_special_tokens: if set to True, will replace special tokens.
|
||||
clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
|
||||
"""
|
||||
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
|
||||
|
||||
@@ -933,20 +939,11 @@ class PreTrainedTokenizer(object):
|
||||
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
|
||||
text = ''.join(sub_texts)
|
||||
|
||||
if self._sep_token is not None and self._sep_token in text:
|
||||
text = text.replace(self._cls_token, self._sep_token)
|
||||
split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self._sep_token)))
|
||||
if clean_up_tokenization_spaces:
|
||||
clean_text = [self.clean_up_tokenization(text) for text in split_text]
|
||||
return clean_text
|
||||
else:
|
||||
return split_text
|
||||
if clean_up_tokenization_spaces:
|
||||
clean_text = self.clean_up_tokenization(text)
|
||||
return clean_text
|
||||
else:
|
||||
if clean_up_tokenization_spaces:
|
||||
clean_text = self.clean_up_tokenization(text)
|
||||
return clean_text
|
||||
else:
|
||||
return text
|
||||
return text
|
||||
|
||||
@property
|
||||
def special_tokens_map(self):
|
||||
|
||||
Reference in New Issue
Block a user