Add magic method to our TF models to convert datasets with column inference (#17160)
* Add method to call to_tf_dataset() with column inference * Add test for dataset creation * Add a default arg for data collator * Fix test * Fix call with non-dev version of datasets * Test correct column removal too * make fixup * More tests to make sure we remove unwanted columns * Fix test to avoid predicting on unbuilt models * Fix test to avoid predicting on unbuilt models * Fix test to remove unwanted head mask columns from inputs * Stop pushing your debug breakpoints to the main repo of the $2bn company you work for * Skip the test in convnext because no grouped conv support * Drop bools from the dataset dict * Make style * Skip the training test for models whose input dicts don't give us labels * Skip transformerXL in the test because it doesn't return a simple loss * Skip TFTapas because of some odd NaN losses * make style * make fixup * Add docstring * fixup * Update src/transformers/modeling_tf_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/modeling_tf_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/modeling_tf_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/modeling_tf_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/modeling_tf_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Remove breakpoint from tests * Fix assert, add requires_backends * Protect tokenizer import with if TYPE_CHECKING * make fixup * Add noqa, more fixup * More rearranging for ~* aesthetics *~ * Adding defaults for shuffle and batch_size to match to_tf_dataset() * Update src/transformers/modeling_tf_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -25,6 +25,8 @@ import unittest.mock as mock
|
||||
from importlib import import_module
|
||||
from typing import List, Tuple
|
||||
|
||||
from datasets import Dataset
|
||||
|
||||
from huggingface_hub import delete_repo, login
|
||||
from requests.exceptions import HTTPError
|
||||
from transformers import is_tf_available, is_torch_available
|
||||
@@ -1509,6 +1511,56 @@ class TFModelTesterMixin:
|
||||
observed_main_input_name = list(model_signature.parameters.keys())[1]
|
||||
self.assertEqual(model_class.main_input_name, observed_main_input_name)
|
||||
|
||||
def test_dataset_conversion(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
tf_inputs_dict = self._prepare_for_class(inputs_dict, model_class, return_labels=False)
|
||||
tf_inputs_dict = {
|
||||
key: val
|
||||
for key, val in tf_inputs_dict.items()
|
||||
if "head_mask" not in key and isinstance(val, tf.Tensor)
|
||||
}
|
||||
tf_inputs_dict["extra_unwanted_column"] = list(tf_inputs_dict.values())[0] # Use a random other tensor
|
||||
input_dataset = Dataset.from_dict(tf_inputs_dict)
|
||||
tf_dataset = model.prepare_tf_dataset(
|
||||
input_dataset, batch_size=len(input_dataset), drop_remainder=False, shuffle=False
|
||||
)
|
||||
test_batch = next(iter(tf_dataset))
|
||||
if isinstance(test_batch, tf.Tensor):
|
||||
self.assertEqual(len(test_batch), len(input_dataset)) # Assert we didn't lose any data
|
||||
else:
|
||||
# Assert we discarded the unwanted extra column but kept everything else
|
||||
self.assertEqual(len(test_batch), len(input_dataset.features) - 1)
|
||||
self.assertNotIn("extra_unwanted_column", test_batch)
|
||||
for tensor in test_batch.values():
|
||||
self.assertTrue(isinstance(tensor, tf.Tensor))
|
||||
self.assertEqual(len(tensor), len(input_dataset)) # Assert we didn't lose any data
|
||||
model(test_batch, training=False)
|
||||
|
||||
if "labels" in inspect.signature(model_class.call).parameters.keys():
|
||||
tf_inputs_dict = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
|
||||
if "labels" not in tf_inputs_dict:
|
||||
return # This model isn't giving us labels after all, don't try training with it
|
||||
tf_inputs_dict = {key: val for key, val in tf_inputs_dict.items() if "head_mask" not in key}
|
||||
tf_inputs_dict["extra_unwanted_column"] = list(tf_inputs_dict.values())[0] # Use a random other tensor
|
||||
input_dataset = Dataset.from_dict(tf_inputs_dict)
|
||||
tf_dataset = model.prepare_tf_dataset(
|
||||
input_dataset, batch_size=len(input_dataset), drop_remainder=False, shuffle=False
|
||||
)
|
||||
test_batch, test_batch_labels = next(iter(tf_dataset))
|
||||
self.assertGreater(len(test_batch_labels), 0) # Assert the labels are present
|
||||
feature_columns = 1 if isinstance(test_batch, tf.Tensor) else len(test_batch)
|
||||
label_columns = 1 if isinstance(test_batch_labels, tf.Tensor) else len(test_batch_labels)
|
||||
# Assert we discarded the unwanted extra column but kept everything else
|
||||
self.assertEqual(feature_columns + label_columns, len(input_dataset.features) - 1)
|
||||
if isinstance(test_batch, dict):
|
||||
self.assertNotIn("extra_unwanted_column", test_batch)
|
||||
if isinstance(test_batch_labels, dict):
|
||||
self.assertNotIn("extra_unwanted_column", test_batch_labels)
|
||||
model.compile(optimizer="sgd", run_eagerly=True)
|
||||
model.train_on_batch(test_batch, test_batch_labels)
|
||||
|
||||
def _generate_random_bad_tokens(self, num_bad_tokens, model):
|
||||
# special tokens cannot be bad tokens
|
||||
special_tokens = []
|
||||
|
||||
Reference in New Issue
Block a user