Merge branch 'master' into t5

2019-12-10 12:58:48 +01:00
parent 608a8f5b56 e57d00ee10
commit 0558c9cb9b
169 changed files with 13409 additions and 3288 deletions
--- a/templates/adding_a_new_example_script/run_xxx.py
+++ b/templates/adding_a_new_example_script/run_xxx.py
@@ -43,7 +43,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
                                  XLNetTokenizer,
                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)

-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup

 from utils_squad import (read_squad_examples, convert_examples_to_features,
                         RawResult, write_predictions,
@@ -98,7 +98,7 @@ def train(args, train_dataset, model, tokenizer):
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
--- a/templates/adding_a_new_model/README.md
+++ b/templates/adding_a_new_model/README.md
@@ -7,7 +7,7 @@ The library is designed to incorporate a variety of models and code bases. As su
 One important point though is that the library has the following goals impacting the way models are incorporated:

 - one specific feature of the API is the capability to run the model and tokenizer inline. The tokenization code thus often have to be slightly adapted to allow for running in the python interpreter.
- the package is also designed to be as self-consistent and with a small and reliable set of packages dependencies. In consequence, additional dependencies are usually not allowed when adding a model but can be allowed for the inclusion of a new tokenizer (recent examples of dependencies added for tokenizer specificites includes `sentencepiece` and `sacremoses`). Please make sure to check the existing dependencies when possible before adding a new one.
+- the package is also designed to be as self-consistent and with a small and reliable set of packages dependencies. In consequence, additional dependencies are usually not allowed when adding a model but can be allowed for the inclusion of a new tokenizer (recent examples of dependencies added for tokenizer specificities include `sentencepiece` and `sacremoses`). Please make sure to check the existing dependencies when possible before adding a new one.

 For a quick overview of the library organization, please check the [QuickStart section of the documentation](https://huggingface.co/transformers/quickstart.html).

@@ -20,7 +20,7 @@ Here an overview of the general workflow:
 - [ ] add tests
 - [ ] finalize

-Let's details what should be done at each step
+Let's detail what should be done at each step

 ## Adding model/configuration/tokenization classes

@@ -28,16 +28,16 @@ Here is the workflow for adding model/configuration/tokenization classes:

 - [ ] copy the python files from the present folder to the main folder and rename them, replacing `xxx` with your model name,
 - [ ] edit the files to replace `XXX` (with various casing) with your model name
- [ ] copy-past or create a simple configuration class for your model in the `configuration_...` file
- [ ] copy-past or create the code for your model in the `modeling_...` files (PyTorch and TF 2.0)
- [ ] copy-past or create a tokenizer class for your model in the `tokenization_...` file
+- [ ] copy-paste or create a simple configuration class for your model in the `configuration_...` file
+- [ ] copy-paste or create the code for your model in the `modeling_...` files (PyTorch and TF 2.0)
+- [ ] copy-paste or create a tokenizer class for your model in the `tokenization_...` file

 # Adding conversion scripts

 Here is the workflow for the conversion scripts:

 - [ ] copy the conversion script (`convert_...`) from the present folder to the main folder.
- [ ] edit this scipt to convert your original checkpoint weights to the current pytorch ones.
+- [ ] edit this script to convert your original checkpoint weights to the current pytorch ones.

 # Adding tests:

@@ -58,5 +58,5 @@ You can then finish the addition step by adding imports for your classes in the
 - [ ] add your models and tokenizer to `pipeline.py`
 - [ ] add a link to your conversion script in the main conversion utility (currently in `__main__` but will be moved to the `commands` subfolder in the near future)
 - [ ] edit the PyTorch to TF 2.0 conversion script to add your model in the `convert_pytorch_checkpoint_to_tf2.py` file
- [ ] add a mention of your model in the doc: `README.md` and the documentation it-self at `docs/source/pretrained_models.rst`.
+- [ ] add a mention of your model in the doc: `README.md` and the documentation itself at `docs/source/pretrained_models.rst`.
 - [ ] upload the pretrained weigths, configurations and vocabulary files.
--- a/templates/adding_a_new_model/modeling_tf_xxx.py
+++ b/templates/adding_a_new_model/modeling_tf_xxx.py
@@ -34,7 +34,7 @@ import numpy as np
 import tensorflow as tf

 from .configuration_xxx import XxxConfig
-from .modeling_tf_utils import TFPreTrainedModel, get_initializer
+from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
 from .file_utils import add_start_docstrings

 logger = logging.getLogger(__name__)
@@ -51,7 +51,7 @@ TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP = {
 ####################################################
 # TF 2.0 Models are constructed using Keras imperative API by sub-classing
 # - tf.keras.layers.Layer for the layers and
-# - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model)
+# - TFPreTrainedModel for the models (itself a sub-class of tf.keras.Model)
 ####################################################

 ####################################################
@@ -123,9 +123,9 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
            input_ids = inputs

        if attention_mask is None:
-            attention_mask = tf.fill(tf.shape(input_ids), 1)
+            attention_mask = tf.fill(shape_list(input_ids), 1)
        if token_type_ids is None:
-            token_type_ids = tf.fill(tf.shape(input_ids), 0)
+            token_type_ids = tf.fill(shape_list(input_ids), 0)

        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
@@ -257,6 +257,10 @@ XXX_INPUTS_DOCSTRING = r"""
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
 """

@add_start_docstrings("The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.",
--- a/templates/adding_a_new_model/modeling_xxx.py
+++ b/templates/adding_a_new_model/modeling_xxx.py
@@ -122,7 +122,7 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
 ####################################################
 # PyTorch Models are constructed by sub-classing
 # - torch.nn.Module for the layers and
-# - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module)
+# - PreTrainedModel for the models (itself a sub-class of torch.nn.Module)
 ####################################################

 ####################################################
@@ -240,6 +240,10 @@ XXX_INPUTS_DOCSTRING = r"""
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
 """

@add_start_docstrings("The bare Xxx Model transformer outputting raw hidden-states without any specific head on top.",
@@ -296,11 +300,22 @@ class XxxModel(XxxPreTrainedModel):
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

-    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
        if attention_mask is None:
-            attention_mask = torch.ones_like(input_ids)
+            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
-            token_type_ids = torch.zeros_like(input_ids)
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
@@ -334,7 +349,7 @@ class XxxModel(XxxPreTrainedModel):

        ##################################
        # Replace this with your model code
-        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
+        embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds)
        encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
        sequence_output = encoder_outputs[0]
        outputs = (sequence_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
@@ -385,14 +400,15 @@ class XxxForMaskedLM(XxxPreTrainedModel):
    def get_output_embeddings(self):
        return self.lm_head

-    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
                masked_lm_labels=None):

        outputs = self.transformer(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids, 
-                            head_mask=head_mask)
+                            head_mask=head_mask,
+                            inputs_embeds=inputs_embeds)

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)
@@ -450,14 +466,15 @@ class XxxForSequenceClassification(XxxPreTrainedModel):

        self.init_weights()

-    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
-                position_ids=None, head_mask=None, labels=None):
+    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):

        outputs = self.transformer(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids, 
-                            head_mask=head_mask)
+                            head_mask=head_mask,
+                            inputs_embeds=inputs_embeds)

        pooled_output = outputs[1]

@@ -521,14 +538,15 @@ class XxxForTokenClassification(XxxPreTrainedModel):

        self.init_weights()

-    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
-                position_ids=None, head_mask=None, labels=None):
+    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):

        outputs = self.transformer(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids, 
-                            head_mask=head_mask)
+                            head_mask=head_mask,
+                            inputs_embeds=inputs_embeds)

        sequence_output = outputs[0]

@@ -604,14 +622,15 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):

        self.init_weights()

-    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
                start_positions=None, end_positions=None):

        outputs = self.transformer(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids, 
-                            head_mask=head_mask)
+                            head_mask=head_mask,
+                            inputs_embeds=inputs_embeds)

        sequence_output = outputs[0]

--- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
@@ -18,11 +18,11 @@ from __future__ import print_function

 import unittest
 import shutil
-import pytest
 import sys

 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow

 from transformers import XxxConfig, is_tf_available

@@ -33,10 +33,9 @@ if is_tf_available():
                                               TFXxxForTokenClassification,
                                               TFXxxForQuestionAnswering,
                                               TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")


+@require_tf
 class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (TFXxxModel, TFXxxForMaskedLM, TFXxxForQuestionAnswering,
@@ -244,7 +243,7 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)

-    @pytest.mark.slow
+    @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in ['xxx-base-uncased']:
--- a/templates/adding_a_new_model/tests/modeling_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py
@@ -18,12 +18,12 @@ from __future__ import print_function

 import unittest
 import shutil
-import pytest

 from transformers import is_torch_available

 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device

 if is_torch_available():
    from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
@@ -31,10 +31,9 @@ if is_torch_available():
                                        XxxForQuestionAnswering, XxxForSequenceClassification,
                                        XxxForTokenClassification, XxxForMultipleChoice)
    from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")


+@require_torch
 class XxxModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering,
@@ -131,6 +130,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):

        def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = XxxModel(config=config)
+            model.to(torch_device)
            model.eval()
            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
@@ -148,6 +148,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):

        def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = XxxForMaskedLM(config=config)
+            model.to(torch_device)
            model.eval()
            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
            result = {
@@ -162,6 +163,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):

        def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = XxxForQuestionAnswering(config=config)
+            model.to(torch_device)
            model.eval()
            loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                   start_positions=sequence_labels, end_positions=sequence_labels)
@@ -182,6 +184,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
        def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            config.num_labels = self.num_labels
            model = XxxForSequenceClassification(config)
+            model.to(torch_device)
            model.eval()
            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
            result = {
@@ -197,6 +200,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
        def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            config.num_labels = self.num_labels
            model = XxxForTokenClassification(config=config)
+            model.to(torch_device)
            model.eval()
            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
            result = {
@@ -243,7 +247,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)

-    @pytest.mark.slow
+    @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
--- a/templates/adding_a_new_model/tokenization_xxx.py
+++ b/templates/adding_a_new_model/tokenization_xxx.py
@@ -172,7 +172,7 @@ class XxxTokenizer(PreTrainedTokenizer):
                special tokens for the model

        Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens: