From 8f20e61c85c11794ede9f2c45aabf16771b7353b Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 24 Apr 2023 17:34:30 +0100 Subject: [PATCH] Update feature selection in to_tf_dataset (#21935) * Update feature selection * Check compatibility with datasets version * Checkout from datasets main --- docs/source/en/tasks/image_classification.mdx | 4 ++-- docs/source/es/training.mdx | 4 ++-- docs/source/pt/training.mdx | 4 ++-- src/transformers/modeling_tf_utils.py | 6 ++++++ 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/docs/source/en/tasks/image_classification.mdx b/docs/source/en/tasks/image_classification.mdx index 47c6394708..4e9f436a3c 100644 --- a/docs/source/en/tasks/image_classification.mdx +++ b/docs/source/en/tasks/image_classification.mdx @@ -385,12 +385,12 @@ Convert your datasets to the `tf.data.Dataset` format using the [`~datasets.Data ```py >>> # converting our train dataset to tf.data.Dataset >>> tf_train_dataset = food["train"].to_tf_dataset( -... columns=["pixel_values"], label_cols=["label"], shuffle=True, batch_size=batch_size, collate_fn=data_collator +... columns="pixel_values", label_cols="label", shuffle=True, batch_size=batch_size, collate_fn=data_collator ... ) >>> # converting our test dataset to tf.data.Dataset >>> tf_eval_dataset = food["test"].to_tf_dataset( -... columns=["pixel_values"], label_cols=["label"], shuffle=True, batch_size=batch_size, collate_fn=data_collator +... columns="pixel_values", label_cols="label", shuffle=True, batch_size=batch_size, collate_fn=data_collator ... ) ``` diff --git a/docs/source/es/training.mdx b/docs/source/es/training.mdx index 467df17d13..6a7f408920 100644 --- a/docs/source/es/training.mdx +++ b/docs/source/es/training.mdx @@ -173,7 +173,7 @@ A continuación, convierte los datasets tokenizados en datasets de TensorFlow co ```py >>> tf_train_dataset = small_train_dataset.to_tf_dataset( ... columns=["attention_mask", "input_ids", "token_type_ids"], -... label_cols=["labels"], +... label_cols="labels", ... shuffle=True, ... collate_fn=data_collator, ... batch_size=8, @@ -181,7 +181,7 @@ A continuación, convierte los datasets tokenizados en datasets de TensorFlow co >>> tf_validation_dataset = small_eval_dataset.to_tf_dataset( ... columns=["attention_mask", "input_ids", "token_type_ids"], -... label_cols=["labels"], +... label_cols="labels", ... shuffle=False, ... collate_fn=data_collator, ... batch_size=8, diff --git a/docs/source/pt/training.mdx b/docs/source/pt/training.mdx index bf59c14528..d84f227aec 100644 --- a/docs/source/pt/training.mdx +++ b/docs/source/pt/training.mdx @@ -205,7 +205,7 @@ Especifique suas entradas em `columns` e seu rótulo em `label_cols`: ```py >>> tf_train_dataset = small_train_dataset.to_tf_dataset( ... columns=["attention_mask", "input_ids", "token_type_ids"], -... label_cols=["labels"], +... label_cols="labels", ... shuffle=True, ... collate_fn=data_collator, ... batch_size=8, @@ -213,7 +213,7 @@ Especifique suas entradas em `columns` e seu rótulo em `label_cols`: >>> tf_validation_dataset = small_eval_dataset.to_tf_dataset( ... columns=["attention_mask", "input_ids", "token_type_ids"], -... label_cols=["labels"], +... label_cols="labels", ... shuffle=False, ... collate_fn=data_collator, ... batch_size=8, diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index a6d566dc55..f48651a6e9 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -1413,6 +1413,12 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu feature_cols = [col for col in output_columns if col in model_inputs and col not in model_labels] label_cols = [col for col in output_columns if col in model_labels] + # Backwards compatibility for older versions of datasets. Previously, if `columns` or `label_cols` + # were a single element list, the returned element spec would be a single element. Now, passing [feature] + # will return a dict structure {"feature": feature}, and passing a single string will return a single element. + feature_cols = feature_cols[0] if len(feature_cols) == 1 else feature_cols + label_cols = label_cols[0] if len(label_cols) == 1 else label_cols + if drop_remainder is None: drop_remainder = shuffle tf_dataset = dataset.to_tf_dataset(