is_pretokenized -> is_split_into_words (#7236)
* is_pretokenized -> is_split_into_words * Fix tests
This commit is contained in:
@@ -1088,7 +1088,7 @@ ENCODE_KWARGS_DOCSTRING = r"""
|
||||
:obj:`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
|
||||
returned to provide some overlap between truncated and overflowing sequences. The value of this
|
||||
argument defines the number of overlapping tokens.
|
||||
is_pretokenized (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not the input is already pre-tokenized (e.g., split into words), in which case the tokenizer
|
||||
will skip the pre-tokenization step. This is useful for NER or token classification.
|
||||
pad_to_multiple_of (:obj:`int`, `optional`):
|
||||
@@ -1863,7 +1863,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
truncation: Union[bool, str, TruncationStrategy] = False,
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
is_pretokenized: bool = False,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
@@ -1884,12 +1884,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
The sequence or batch of sequences to be encoded.
|
||||
Each sequence can be a string or a list of strings (pretokenized string).
|
||||
If the sequences are provided as a list of strings (pretokenized), you must set
|
||||
:obj:`is_pretokenized=True` (to lift the ambiguity with a batch of sequences).
|
||||
:obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
||||
text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
|
||||
The sequence or batch of sequences to be encoded.
|
||||
Each sequence can be a string or a list of strings (pretokenized string).
|
||||
If the sequences are provided as a list of strings (pretokenized), you must set
|
||||
:obj:`is_pretokenized=True` (to lift the ambiguity with a batch of sequences).
|
||||
:obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
||||
"""
|
||||
# Input type checking for clearer error
|
||||
assert isinstance(text, str) or (
|
||||
@@ -1928,8 +1928,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
)
|
||||
|
||||
is_batched = bool(
|
||||
(not is_pretokenized and isinstance(text, (list, tuple)))
|
||||
or (is_pretokenized and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)))
|
||||
(not is_split_into_words and isinstance(text, (list, tuple)))
|
||||
or (
|
||||
is_split_into_words and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
|
||||
)
|
||||
)
|
||||
|
||||
if is_batched:
|
||||
@@ -1941,7 +1943,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
truncation=truncation,
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
is_pretokenized=is_pretokenized,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
@@ -1962,7 +1964,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
truncation=truncation,
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
is_pretokenized=is_pretokenized,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
@@ -1985,7 +1987,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
truncation: Union[bool, str, TruncationStrategy] = False,
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
is_pretokenized: bool = False,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
@@ -2032,7 +2034,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
truncation_strategy=truncation_strategy,
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
is_pretokenized=is_pretokenized,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
@@ -2054,7 +2056,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
is_pretokenized: bool = False,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
@@ -2084,7 +2086,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
truncation: Union[bool, str, TruncationStrategy] = False,
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
is_pretokenized: bool = False,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
@@ -2126,7 +2128,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
truncation_strategy=truncation_strategy,
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
is_pretokenized=is_pretokenized,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
@@ -2154,7 +2156,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
is_pretokenized: bool = False,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
|
||||
Reference in New Issue
Block a user