Add support for non-Rust-implemented (Python-based) tokenizers in the __getitem__ method. (#24039)
* Add support for non-Rust-implemented (Python-based) tokenizers in the `__getitem__` method. * Update the error message to account for the new branch added to the `__getitem__` method. --------- Co-authored-by: liuyang17 <liuyang17@zhihu.com>
This commit is contained in:
@@ -233,15 +233,20 @@ class BatchEncoding(UserDict):
|
||||
etc.).
|
||||
|
||||
If the key is an integer, get the `tokenizers.Encoding` for batch item with index `key`.
|
||||
|
||||
If the key is a slice, returns a dict mapping every key ('input_ids', 'attention_mask', etc.) to its value
|
||||
with the constraint of slice.
|
||||
"""
|
||||
if isinstance(item, str):
|
||||
return self.data[item]
|
||||
elif self._encodings is not None:
|
||||
return self._encodings[item]
|
||||
elif isinstance(item, slice):
|
||||
return {key: self.data[key][slice] for key in self.data.keys()}
|
||||
else:
|
||||
raise KeyError(
|
||||
"Indexing with integers (to access backend Encoding for a given batch index) "
|
||||
"is not available when using Python based tokenizers"
|
||||
"Invalid key. Only three types of key are available: "
|
||||
"(1) string, (2) integers for backend Encoding, and (3) slices for data subsetting."
|
||||
)
|
||||
|
||||
def __getattr__(self, item: str):
|
||||
|
||||
Reference in New Issue
Block a user