From fcf0652460753f8a81f7576e8abdaa6b3742f00e Mon Sep 17 00:00:00 2001
From: Julien Plu <plu.julien@gmail.com>
Date: Wed, 1 Jul 2020 01:49:11 +0200
Subject: [PATCH] Fix TensorFlow dataset generator (#4881)

* Fix TensorFlow generator

* Better features handling

* Apply style

* Apply style

* Fix squad as well

* Apply style

* Better factorization of TF tensor creation
---
 src/transformers/data/processors/glue.py  |  25 ++--
 src/transformers/data/processors/squad.py | 141 ++++++++++++++--------
 2 files changed, 101 insertions(+), 65 deletions(-)

diff --git a/src/transformers/data/processors/glue.py b/src/transformers/data/processors/glue.py
index 8a96240486..bc28cdc3df 100644
--- a/src/transformers/data/processors/glue.py
+++ b/src/transformers/data/processors/glue.py
@@ -17,6 +17,7 @@
 
 import logging
 import os
+from dataclasses import asdict
 from enum import Enum
 from typing import List, Optional, Union
 
@@ -81,26 +82,16 @@ if is_tf_available():
 
         def gen():
             for ex in features:
-                yield (
-                    {
-                        "input_ids": ex.input_ids,
-                        "attention_mask": ex.attention_mask,
-                        "token_type_ids": ex.token_type_ids,
-                    },
-                    ex.label,
-                )
+                d = {k: v for k, v in asdict(ex).items() if v is not None}
+                label = d.pop("label")
+                yield (d, label)
+
+        input_names = ["input_ids"] + tokenizer.model_input_names
 
         return tf.data.Dataset.from_generator(
             gen,
-            ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
-            (
-                {
-                    "input_ids": tf.TensorShape([None]),
-                    "attention_mask": tf.TensorShape([None]),
-                    "token_type_ids": tf.TensorShape([None]),
-                },
-                tf.TensorShape([]),
-            ),
+            ({k: tf.int32 for k in input_names}, tf.int64),
+            ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])),
         )
 
 
diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py
index 0c68df6820..1c840639ca 100644
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -389,57 +389,102 @@ def squad_convert_examples_to_features(
 
         def gen():
             for i, ex in enumerate(features):
-                yield (
-                    {
-                        "input_ids": ex.input_ids,
-                        "attention_mask": ex.attention_mask,
-                        "token_type_ids": ex.token_type_ids,
-                        "feature_index": i,
-                        "qas_id": ex.qas_id,
-                    },
-                    {
-                        "start_positions": ex.start_position,
-                        "end_positions": ex.end_position,
-                        "cls_index": ex.cls_index,
-                        "p_mask": ex.p_mask,
-                        "is_impossible": ex.is_impossible,
-                    },
-                )
+                if ex.token_type_ids is None:
+                    yield (
+                        {
+                            "input_ids": ex.input_ids,
+                            "attention_mask": ex.attention_mask,
+                            "feature_index": i,
+                            "qas_id": ex.qas_id,
+                        },
+                        {
+                            "start_positions": ex.start_position,
+                            "end_positions": ex.end_position,
+                            "cls_index": ex.cls_index,
+                            "p_mask": ex.p_mask,
+                            "is_impossible": ex.is_impossible,
+                        },
+                    )
+                else:
+                    yield (
+                        {
+                            "input_ids": ex.input_ids,
+                            "attention_mask": ex.attention_mask,
+                            "token_type_ids": ex.token_type_ids,
+                            "feature_index": i,
+                            "qas_id": ex.qas_id,
+                        },
+                        {
+                            "start_positions": ex.start_position,
+                            "end_positions": ex.end_position,
+                            "cls_index": ex.cls_index,
+                            "p_mask": ex.p_mask,
+                            "is_impossible": ex.is_impossible,
+                        },
+                    )
 
         # Why have we split the batch into a tuple? PyTorch just has a list of tensors.
-        train_types = (
-            {
-                "input_ids": tf.int32,
-                "attention_mask": tf.int32,
-                "token_type_ids": tf.int32,
-                "feature_index": tf.int64,
-                "qas_id": tf.string,
-            },
-            {
-                "start_positions": tf.int64,
-                "end_positions": tf.int64,
-                "cls_index": tf.int64,
-                "p_mask": tf.int32,
-                "is_impossible": tf.int32,
-            },
-        )
+        if "token_type_ids" in tokenizer.model_input_names:
+            train_types = (
+                {
+                    "input_ids": tf.int32,
+                    "attention_mask": tf.int32,
+                    "token_type_ids": tf.int32,
+                    "feature_index": tf.int64,
+                    "qas_id": tf.string,
+                },
+                {
+                    "start_positions": tf.int64,
+                    "end_positions": tf.int64,
+                    "cls_index": tf.int64,
+                    "p_mask": tf.int32,
+                    "is_impossible": tf.int32,
+                },
+            )
 
-        train_shapes = (
-            {
-                "input_ids": tf.TensorShape([None]),
-                "attention_mask": tf.TensorShape([None]),
-                "token_type_ids": tf.TensorShape([None]),
-                "feature_index": tf.TensorShape([]),
-                "qas_id": tf.TensorShape([]),
-            },
-            {
-                "start_positions": tf.TensorShape([]),
-                "end_positions": tf.TensorShape([]),
-                "cls_index": tf.TensorShape([]),
-                "p_mask": tf.TensorShape([None]),
-                "is_impossible": tf.TensorShape([]),
-            },
-        )
+            train_shapes = (
+                {
+                    "input_ids": tf.TensorShape([None]),
+                    "attention_mask": tf.TensorShape([None]),
+                    "token_type_ids": tf.TensorShape([None]),
+                    "feature_index": tf.TensorShape([]),
+                    "qas_id": tf.TensorShape([]),
+                },
+                {
+                    "start_positions": tf.TensorShape([]),
+                    "end_positions": tf.TensorShape([]),
+                    "cls_index": tf.TensorShape([]),
+                    "p_mask": tf.TensorShape([None]),
+                    "is_impossible": tf.TensorShape([]),
+                },
+            )
+        else:
+            train_types = (
+                {"input_ids": tf.int32, "attention_mask": tf.int32, "feature_index": tf.int64, "qas_id": tf.string},
+                {
+                    "start_positions": tf.int64,
+                    "end_positions": tf.int64,
+                    "cls_index": tf.int64,
+                    "p_mask": tf.int32,
+                    "is_impossible": tf.int32,
+                },
+            )
+
+            train_shapes = (
+                {
+                    "input_ids": tf.TensorShape([None]),
+                    "attention_mask": tf.TensorShape([None]),
+                    "feature_index": tf.TensorShape([]),
+                    "qas_id": tf.TensorShape([]),
+                },
+                {
+                    "start_positions": tf.TensorShape([]),
+                    "end_positions": tf.TensorShape([]),
+                    "cls_index": tf.TensorShape([]),
+                    "p_mask": tf.TensorShape([None]),
+                    "is_impossible": tf.TensorShape([]),
+                },
+            )
 
         return tf.data.Dataset.from_generator(gen, train_types, train_shapes)
     else: