From 5e2f2d7dd2b72a35fe9e2fe5b55e13674e9a74a2 Mon Sep 17 00:00:00 2001
From: Sebastian Sosa <37946988+CakeCrusher@users.noreply.github.com>
Date: Thu, 21 Jul 2022 02:35:41 -0600
Subject: [PATCH] Better messaging and fix for incorrect shape when collating
 data. (#18119)

* More informative error message

* raise dynamic error

* remove_excess_nesting application

* incorrect shape assertion for collator & function to remove excess nesting from DatasetDict

* formatting

* eliminating datasets import

* removed and relocated remove_excess_nesting to the datasets library and updated docs accordingly

* independent assert instructions

* inform user of excess nesting
---
 src/transformers/tokenization_utils_base.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 96ee9c6eee..776c9a69db 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -733,8 +733,10 @@ class BatchEncoding(UserDict):
                         "Please see if a fast version of this tokenizer is available to have this feature available."
                     )
                 raise ValueError(
-                    "Unable to create tensor, you should probably activate truncation and/or padding "
-                    "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
+                    "Unable to create tensor, you should probably activate truncation and/or padding with"
+                    " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"
+                    f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is"
+                    " expected)."
                 )
 
         return self