From 1e4a7737ed1fdb361a0473403201374382b447e3 Mon Sep 17 00:00:00 2001 From: YangLiu Date: Wed, 7 Jun 2023 19:29:19 +0800 Subject: [PATCH] Add support for non-rust implemented tokenization for `__getitem__` method. (#24039) * Add support for non-rust implemented tokenization for `__getitem__` method. * Update the error message to cover the new slice branch of the `__getitem__` method. --------- Co-authored-by: liuyang17 --- src/transformers/tokenization_utils_base.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 4b65b56f7a..ba2f8ef3f1 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -233,15 +233,20 @@ class BatchEncoding(UserDict): etc.). If the key is an integer, get the `tokenizers.Encoding` for batch item with index `key`. + + If the key is a slice, returns the value of the dict associated to `key` ('input_ids', 'attention_mask', etc.) + with the constraint of slice. """ if isinstance(item, str): return self.data[item] elif self._encodings is not None: return self._encodings[item] + elif isinstance(item, slice): + return {key: self.data[key][slice] for key in self.data.keys()} else: raise KeyError( - "Indexing with integers (to access backend Encoding for a given batch index) " - "is not available when using Python based tokenizers" + "Invalid key. Only three types of key are available: " + "(1) string, (2) integers for backend Encoding, and (3) slices for data subsetting." ) def __getattr__(self, item: str):