From fcf0652460753f8a81f7576e8abdaa6b3742f00e Mon Sep 17 00:00:00 2001 From: Julien Plu Date: Wed, 1 Jul 2020 01:49:11 +0200 Subject: [PATCH] Fix TensorFlow dataset generator (#4881) * fix TensorFlow generator * Better features handling * Apply style * Apply style * Fix squad as well * Apply style * Better factorization of TF Tensors creation --- src/transformers/data/processors/glue.py | 25 ++-- src/transformers/data/processors/squad.py | 141 ++++++++++++++-------- 2 files changed, 101 insertions(+), 65 deletions(-) diff --git a/src/transformers/data/processors/glue.py b/src/transformers/data/processors/glue.py index 8a96240486..bc28cdc3df 100644 --- a/src/transformers/data/processors/glue.py +++ b/src/transformers/data/processors/glue.py @@ -17,6 +17,7 @@ import logging import os +from dataclasses import asdict from enum import Enum from typing import List, Optional, Union @@ -81,26 +82,16 @@ if is_tf_available(): def gen(): for ex in features: - yield ( - { - "input_ids": ex.input_ids, - "attention_mask": ex.attention_mask, - "token_type_ids": ex.token_type_ids, - }, - ex.label, - ) + d = {k: v for k, v in asdict(ex).items() if v is not None} + label = d.pop("label") + yield (d, label) + + input_names = ["input_ids"] + tokenizer.model_input_names return tf.data.Dataset.from_generator( gen, - ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64), - ( - { - "input_ids": tf.TensorShape([None]), - "attention_mask": tf.TensorShape([None]), - "token_type_ids": tf.TensorShape([None]), - }, - tf.TensorShape([]), - ), + ({k: tf.int32 for k in input_names}, tf.int64), + ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])), ) diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py index 0c68df6820..1c840639ca 100644 --- a/src/transformers/data/processors/squad.py +++ b/src/transformers/data/processors/squad.py @@ -389,57 +389,102 @@ def squad_convert_examples_to_features( def gen(): for i, ex in enumerate(features): - yield ( - { - "input_ids": ex.input_ids, - "attention_mask": ex.attention_mask, - "token_type_ids": ex.token_type_ids, - "feature_index": i, - "qas_id": ex.qas_id, - }, - { - "start_positions": ex.start_position, - "end_positions": ex.end_position, - "cls_index": ex.cls_index, - "p_mask": ex.p_mask, - "is_impossible": ex.is_impossible, - }, - ) + if ex.token_type_ids is None: + yield ( + { + "input_ids": ex.input_ids, + "attention_mask": ex.attention_mask, + "feature_index": i, + "qas_id": ex.qas_id, + }, + { + "start_positions": ex.start_position, + "end_positions": ex.end_position, + "cls_index": ex.cls_index, + "p_mask": ex.p_mask, + "is_impossible": ex.is_impossible, + }, + ) + else: + yield ( + { + "input_ids": ex.input_ids, + "attention_mask": ex.attention_mask, + "token_type_ids": ex.token_type_ids, + "feature_index": i, + "qas_id": ex.qas_id, + }, + { + "start_positions": ex.start_position, + "end_positions": ex.end_position, + "cls_index": ex.cls_index, + "p_mask": ex.p_mask, + "is_impossible": ex.is_impossible, + }, + ) # Why have we split the batch into a tuple? PyTorch just has a list of tensors. - train_types = ( - { - "input_ids": tf.int32, - "attention_mask": tf.int32, - "token_type_ids": tf.int32, - "feature_index": tf.int64, - "qas_id": tf.string, - }, - { - "start_positions": tf.int64, - "end_positions": tf.int64, - "cls_index": tf.int64, - "p_mask": tf.int32, - "is_impossible": tf.int32, - }, - ) + if "token_type_ids" in tokenizer.model_input_names: + train_types = ( + { + "input_ids": tf.int32, + "attention_mask": tf.int32, + "token_type_ids": tf.int32, + "feature_index": tf.int64, + "qas_id": tf.string, + }, + { + "start_positions": tf.int64, + "end_positions": tf.int64, + "cls_index": tf.int64, + "p_mask": tf.int32, + "is_impossible": tf.int32, + }, + ) - train_shapes = ( - { - "input_ids": tf.TensorShape([None]), - "attention_mask": tf.TensorShape([None]), - "token_type_ids": tf.TensorShape([None]), - "feature_index": tf.TensorShape([]), - "qas_id": tf.TensorShape([]), - }, - { - "start_positions": tf.TensorShape([]), - "end_positions": tf.TensorShape([]), - "cls_index": tf.TensorShape([]), - "p_mask": tf.TensorShape([None]), - "is_impossible": tf.TensorShape([]), - }, - ) + train_shapes = ( + { + "input_ids": tf.TensorShape([None]), + "attention_mask": tf.TensorShape([None]), + "token_type_ids": tf.TensorShape([None]), + "feature_index": tf.TensorShape([]), + "qas_id": tf.TensorShape([]), + }, + { + "start_positions": tf.TensorShape([]), + "end_positions": tf.TensorShape([]), + "cls_index": tf.TensorShape([]), + "p_mask": tf.TensorShape([None]), + "is_impossible": tf.TensorShape([]), + }, + ) + else: + train_types = ( + {"input_ids": tf.int32, "attention_mask": tf.int32, "feature_index": tf.int64, "qas_id": tf.string}, + { + "start_positions": tf.int64, + "end_positions": tf.int64, + "cls_index": tf.int64, + "p_mask": tf.int32, + "is_impossible": tf.int32, + }, + ) + + train_shapes = ( + { + "input_ids": tf.TensorShape([None]), + "attention_mask": tf.TensorShape([None]), + "feature_index": tf.TensorShape([]), + "qas_id": tf.TensorShape([]), + }, + { + "start_positions": tf.TensorShape([]), + "end_positions": tf.TensorShape([]), + "cls_index": tf.TensorShape([]), + "p_mask": tf.TensorShape([None]), + "is_impossible": tf.TensorShape([]), + }, + ) return tf.data.Dataset.from_generator(gen, train_types, train_shapes) else: