Add support for non-Rust-implemented tokenization in the `__getitem__` method. (#24039)
* Add support for non-Rust-implemented tokenization in the `__getitem__` method.
* Update the error message after adding the new sub-branch to the `__getitem__` method.
---------
Co-authored-by: liuyang17 <liuyang17@zhihu.com>
This commit is contained in:
@@ -233,15 +233,20 @@ class BatchEncoding(UserDict):
|
|||||||
etc.).
|
etc.).
|
||||||
|
|
||||||
If the key is an integer, get the `tokenizers.Encoding` for batch item with index `key`.
|
If the key is an integer, get the `tokenizers.Encoding` for batch item with index `key`.
|
||||||
|
|
||||||
|
If the key is a slice, returns the value of the dict associated to `key` ('input_ids', 'attention_mask', etc.)
|
||||||
|
with the constraint of slice.
|
||||||
"""
|
"""
|
||||||
if isinstance(item, str):
|
if isinstance(item, str):
|
||||||
return self.data[item]
|
return self.data[item]
|
||||||
elif self._encodings is not None:
|
elif self._encodings is not None:
|
||||||
return self._encodings[item]
|
return self._encodings[item]
|
||||||
|
elif isinstance(item, slice):
|
||||||
|
return {key: self.data[key][slice] for key in self.data.keys()}
|
||||||
else:
|
else:
|
||||||
raise KeyError(
|
raise KeyError(
|
||||||
"Indexing with integers (to access backend Encoding for a given batch index) "
|
"Invalid key. Only three types of key are available: "
|
||||||
"is not available when using Python based tokenizers"
|
"(1) string, (2) integers for backend Encoding, and (3) slices for data subsetting."
|
||||||
)
|
)
|
||||||
|
|
||||||
def __getattr__(self, item: str):
|
def __getattr__(self, item: str):
|
||||||
|
|||||||
Reference in New Issue
Block a user