Merge branch 'master' into pr/1383

2019-10-09 17:25:08 +02:00
parent 07d055f849 1c5079952f
commit d9e60f4f0d
45 changed files with 1707 additions and 448 deletions
--- a/transformers/configuration_gpt2.py
+++ b/transformers/configuration_gpt2.py
@@ -28,7 +28,8 @@ logger = logging.getLogger(__name__)

 GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
                                      "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
-                                      "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"}
+                                      "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
+                                      "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",}

 class GPT2Config(PretrainedConfig):
    """Configuration class to store the configuration of a `GPT2Model`.
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -178,10 +178,12 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
            else:
                model_file = cached_path(model_shortcut_name, force_download=not use_cached_models)

-            convert_pt_checkpoint_to_tf(model_type,
-                                        model_file,
-                                        config_file,
-                                        os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
+            if os.path.isfile(model_shortcut_name):
+                model_shortcut_name = 'converted_model'
+            convert_pt_checkpoint_to_tf(model_type=model_type,
+                                        pytorch_checkpoint_path=model_file,
+                                        config_file=config_file,
+                                        tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
                                        compare_with_pt_model=compare_with_pt_model)
            os.remove(config_file)
            os.remove(model_file)
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -118,7 +118,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):


 def gelu(x):
-    """ Original Implementation of the gelu activation function in Google Bert repo when initialy created.
+    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
--- a/transformers/modeling_distilbert.py
+++ b/transformers/modeling_distilbert.py
@@ -159,8 +159,6 @@ class MultiHeadSelfAttention(nn.Module):

        dim_per_head = self.dim // self.n_heads

-        assert 2 <= mask.dim() <= 3
-        causal = (mask.dim() == 3)
        mask_reshp = (bs, 1, 1, k_length)

        def shape(x):
@@ -649,7 +647,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss, start_scores, end_scores = outputs[:2]
+        loss, start_scores, end_scores = outputs[:3]

    """
    def __init__(self, config):
--- a/transformers/modeling_gpt2.py
+++ b/transformers/modeling_gpt2.py
@@ -38,7 +38,8 @@ logger = logging.getLogger(__name__)

 GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
                                     "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
-                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"}
+                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin",
+                                     "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",}

 def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
    """ Load tf checkpoints in a pytorch model
--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
@@ -62,7 +62,7 @@ def load_bert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):

 def gelu(x):
    """ Gaussian Error Linear Unit.
-    Original Implementation of the gelu activation function in Google Bert repo when initialy created.
+    Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
--- a/transformers/modeling_tf_distilbert.py
+++ b/transformers/modeling_tf_distilbert.py
@@ -45,7 +45,7 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
 ### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
 def gelu(x):
    """ Gaussian Error Linear Unit.
-    Original Implementation of the gelu activation function in Google Bert repo when initialy created.
+    Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
@@ -226,8 +226,6 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):

        dim_per_head = self.dim // self.n_heads

-        assert 2 <= len(tf.shape(mask)) <= 3
-        causal = (len(tf.shape(mask)) == 3)
        mask_reshape = [bs, 1, 1, k_length]

        def shape(x):
@@ -603,7 +601,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids, masked_lm_labels=input_ids)
+        outputs = model(input_ids)
        prediction_scores = outputs[0]

    """
@@ -715,9 +713,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        start_positions = tf.constant([1])
-        end_positions = tf.constant([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        outputs = model(input_ids)
        start_scores, end_scores = outputs[:2]

    """
--- a/transformers/modeling_tf_gpt2.py
+++ b/transformers/modeling_tf_gpt2.py
@@ -38,7 +38,8 @@ logger = logging.getLogger(__name__)

 TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
                                     "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5",
-                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5"}
+                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5",
+                                     "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",}


 def load_gpt2_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
--- a/transformers/modeling_tf_xlm.py
+++ b/transformers/modeling_tf_xlm.py
@@ -69,7 +69,7 @@ def create_sinusoidal_embeddings(n_pos, dim, out):

 def gelu(x):
    """ Gaussian Error Linear Unit.
-    Original Implementation of the gelu activation function in Google Bert repo when initialy created.
+    Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -501,7 +501,10 @@ class PoolerEndLogits(nn.Module):
        x = self.dense_1(x).squeeze(-1)

        if p_mask is not None:
-            x = x * (1 - p_mask) - 1e30 * p_mask
+            if next(self.parameters()).dtype == torch.float16:
+                x = x * (1 - p_mask) - 65500 * p_mask
+            else:
+                x = x * (1 - p_mask) - 1e30 * p_mask

        return x

--- a/transformers/tokenization_gpt2.py
+++ b/transformers/tokenization_gpt2.py
@@ -46,12 +46,14 @@ PRETRAINED_VOCAB_FILES_MAP = {
        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
+        'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
    },
    'merges_file':
    {
        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
+        'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
    },
 }

@@ -59,6 +61,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'gpt2': 1024,
    'gpt2-medium': 1024,
    'gpt2-large': 1024,
+    'distilgpt2': 1024,
 }

@lru_cache()
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -512,7 +512,8 @@ class PreTrainedTokenizer(object):
        for token in new_tokens:
            assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
            if token != self.unk_token and \
-                    self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
+                    self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
+                    token not in to_add_tokens:
                to_add_tokens.append(token)
                logger.info("Adding %s to the vocabulary", token)

@@ -911,6 +912,11 @@ class PreTrainedTokenizer(object):
        Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
        with options to remove special tokens and clean up tokenization spaces.
        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
+
+        Args:
+            token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods.
+            skip_special_tokens: if set to True, will remove special tokens.
+            clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
        """
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

@@ -933,20 +939,11 @@ class PreTrainedTokenizer(object):
            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
        text = ''.join(sub_texts)

-        if self._sep_token is not None and self._sep_token in text:
-            text = text.replace(self._cls_token, self._sep_token)
-            split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self._sep_token)))
-            if clean_up_tokenization_spaces:
-                clean_text = [self.clean_up_tokenization(text) for text in split_text]
-                return clean_text
-            else:
-                return split_text
+        if clean_up_tokenization_spaces:
+            clean_text = self.clean_up_tokenization(text)
+            return clean_text
        else:
-            if clean_up_tokenization_spaces:
-                clean_text = self.clean_up_tokenization(text)
-                return clean_text
-            else:
-                return text
+            return text

    @property
    def special_tokens_map(self):