From 5e2f2d7dd2b72a35fe9e2fe5b55e13674e9a74a2 Mon Sep 17 00:00:00 2001 From: Sebastian Sosa <37946988+CakeCrusher@users.noreply.github.com> Date: Thu, 21 Jul 2022 02:35:41 -0600 Subject: [PATCH] Better messaging and fix for incorrect shape when collating data. (#18119) * More informative error message * raise dynamic error * remove_excess_nesting application * incorrect shape assertion for collator & function to remove excess nesting from DatasetDict * formatting * eliminating datasets import * removed and relocated remove_excess_nesting to the datasets library and updated docs accordingly * independent assert instructions * inform user of excess nesting --- src/transformers/tokenization_utils_base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 96ee9c6eee..776c9a69db 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -733,8 +733,10 @@ class BatchEncoding(UserDict): "Please see if a fast version of this tokenizer is available to have this feature available." ) raise ValueError( - "Unable to create tensor, you should probably activate truncation and/or padding " - "with 'padding=True' 'truncation=True' to have batched tensors with the same length." + "Unable to create tensor, you should probably activate truncation and/or padding with" + " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your" + f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is" + " expected)." ) return self