Add magic method to our TF models to convert datasets with column inference (#17160)

* Add method to call to_tf_dataset() with column inference * Add test for dataset creation * Add a default arg for data collator * Fix test * Fix call with non-dev version of datasets * Test correct column removal too * make fixup * More tests to make sure we remove unwanted columns * Fix test to avoid predicting on unbuilt models * Fix test to avoid predicting on unbuilt models * Fix test to remove unwanted head mask columns from inputs * Stop pushing your debug breakpoints to the main repo of the $2bn company you work for * Skip the test in convnext because no grouped conv support * Drop bools from the dataset dict * Make style * Skip the training test for models whose input dicts don't give us labels * Skip transformerXL in the test because it doesn't return a simple loss * Skip TFTapas because of some odd NaN losses * make style * make fixup * Add docstring * fixup * Update src/transformers/modeling_tf_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/modeling_tf_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/modeling_tf_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/modeling_tf_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/modeling_tf_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Remove breakpoint from tests * Fix assert, add requires_backends * Protect tokenizer import with if TYPE_CHECKING * make fixup * Add noqa, more fixup * More rearranging for ~* aesthetics *~ * Adding defaults for shuffle and batch_size to match to_tf_dataset() * Update src/transformers/modeling_tf_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-06-06 15:53:49 +01:00
parent d28b7aa8cb
commit 19a8a3036d
5 changed files with 162 additions and 1 deletions
--- a/tests/models/convnext/test_modeling_tf_convnext.py
+++ b/tests/models/convnext/test_modeling_tf_convnext.py
@@ -174,6 +174,13 @@ class TFConvNextModelTest(TFModelTesterMixin, unittest.TestCase):
    def test_attention_outputs(self):
        pass

+    @unittest.skipIf(
+        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0,
+        reason="TF (<=2.8) does not support backprop for grouped convolutions on CPU.",
+    )
+    def test_dataset_conversion(self):
+        super().test_dataset_conversion()
+
    def test_hidden_states_output(self):
        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)
--- a/tests/models/tapas/test_modeling_tf_tapas.py
+++ b/tests/models/tapas/test_modeling_tf_tapas.py
@@ -498,6 +498,10 @@ class TFTapasModelTest(TFModelTesterMixin, unittest.TestCase):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)

+    @unittest.skip(reason="The default test gets NaN losses with the test-generated inputs")
+    def test_dataset_conversion(self):
+        pass
+

 def prepare_tapas_single_inputs_for_inference():
    # Here we prepare a single table-question pair to test TAPAS inference on:
--- a/tests/models/transfo_xl/test_modeling_tf_transfo_xl.py
+++ b/tests/models/transfo_xl/test_modeling_tf_transfo_xl.py
@@ -216,6 +216,10 @@ class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):
            model = TFTransfoXLModel.from_pretrained(model_name)
            self.assertIsNotNone(model)

+    @unittest.skip(reason="This model doesn't play well with fit() due to not returning a single loss.")
+    def test_dataset_conversion(self):
+        pass
+

@require_tf
 class TFTransfoXLModelLanguageGenerationTest(unittest.TestCase):
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -25,6 +25,8 @@ import unittest.mock as mock
 from importlib import import_module
 from typing import List, Tuple

+from datasets import Dataset
+
 from huggingface_hub import delete_repo, login
 from requests.exceptions import HTTPError
 from transformers import is_tf_available, is_torch_available
@@ -1509,6 +1511,56 @@ class TFModelTesterMixin:
            observed_main_input_name = list(model_signature.parameters.keys())[1]
            self.assertEqual(model_class.main_input_name, observed_main_input_name)

+    def test_dataset_conversion(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            tf_inputs_dict = self._prepare_for_class(inputs_dict, model_class, return_labels=False)
+            tf_inputs_dict = {
+                key: val
+                for key, val in tf_inputs_dict.items()
+                if "head_mask" not in key and isinstance(val, tf.Tensor)
+            }
+            tf_inputs_dict["extra_unwanted_column"] = list(tf_inputs_dict.values())[0]  # Use a random other tensor
+            input_dataset = Dataset.from_dict(tf_inputs_dict)
+            tf_dataset = model.prepare_tf_dataset(
+                input_dataset, batch_size=len(input_dataset), drop_remainder=False, shuffle=False
+            )
+            test_batch = next(iter(tf_dataset))
+            if isinstance(test_batch, tf.Tensor):
+                self.assertEqual(len(test_batch), len(input_dataset))  # Assert we didn't lose any data
+            else:
+                # Assert we discarded the unwanted extra column but kept everything else
+                self.assertEqual(len(test_batch), len(input_dataset.features) - 1)
+                self.assertNotIn("extra_unwanted_column", test_batch)
+                for tensor in test_batch.values():
+                    self.assertTrue(isinstance(tensor, tf.Tensor))
+                    self.assertEqual(len(tensor), len(input_dataset))  # Assert we didn't lose any data
+                    model(test_batch, training=False)
+
+            if "labels" in inspect.signature(model_class.call).parameters.keys():
+                tf_inputs_dict = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+                if "labels" not in tf_inputs_dict:
+                    return  # This model isn't giving us labels after all, don't try training with it
+                tf_inputs_dict = {key: val for key, val in tf_inputs_dict.items() if "head_mask" not in key}
+                tf_inputs_dict["extra_unwanted_column"] = list(tf_inputs_dict.values())[0]  # Use a random other tensor
+                input_dataset = Dataset.from_dict(tf_inputs_dict)
+                tf_dataset = model.prepare_tf_dataset(
+                    input_dataset, batch_size=len(input_dataset), drop_remainder=False, shuffle=False
+                )
+                test_batch, test_batch_labels = next(iter(tf_dataset))
+                self.assertGreater(len(test_batch_labels), 0)  # Assert the labels are present
+                feature_columns = 1 if isinstance(test_batch, tf.Tensor) else len(test_batch)
+                label_columns = 1 if isinstance(test_batch_labels, tf.Tensor) else len(test_batch_labels)
+                # Assert we discarded the unwanted extra column but kept everything else
+                self.assertEqual(feature_columns + label_columns, len(input_dataset.features) - 1)
+                if isinstance(test_batch, dict):
+                    self.assertNotIn("extra_unwanted_column", test_batch)
+                if isinstance(test_batch_labels, dict):
+                    self.assertNotIn("extra_unwanted_column", test_batch_labels)
+                model.compile(optimizer="sgd", run_eagerly=True)
+                model.train_on_batch(test_batch, test_batch_labels)
+
    def _generate_random_bad_tokens(self, num_bad_tokens, model):
        # special tokens cannot be bad tokens
        special_tokens = []