is_pretokenized -> is_split_into_words (#7236)

* is_pretokenized -> is_split_into_words

* Fix tests
This commit is contained in:
Sylvain Gugger
2020-09-22 09:34:35 -04:00
committed by GitHub
parent 324f361e91
commit 21ca148090
9 changed files with 142 additions and 72 deletions

View File

@@ -1088,7 +1088,7 @@ ENCODE_KWARGS_DOCSTRING = r"""
:obj:`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
returned to provide some overlap between truncated and overflowing sequences. The value of this
argument defines the number of overlapping tokens.
is_pretokenized (:obj:`bool`, `optional`, defaults to :obj:`False`):
is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the input is already pre-tokenized (e.g., split into words), in which case the tokenizer
will skip the pre-tokenization step. This is useful for NER or token classification.
pad_to_multiple_of (:obj:`int`, `optional`):
@@ -1863,7 +1863,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
is_pretokenized: bool = False,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
@@ -1884,12 +1884,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
The sequence or batch of sequences to be encoded.
Each sequence can be a string or a list of strings (pretokenized string).
If the sequences are provided as a list of strings (pretokenized), you must set
:obj:`is_pretokenized=True` (to lift the ambiguity with a batch of sequences).
:obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
The sequence or batch of sequences to be encoded.
Each sequence can be a string or a list of strings (pretokenized string).
If the sequences are provided as a list of strings (pretokenized), you must set
:obj:`is_pretokenized=True` (to lift the ambiguity with a batch of sequences).
:obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
"""
# Input type checking for clearer error
assert isinstance(text, str) or (
@@ -1928,8 +1928,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
)
is_batched = bool(
(not is_pretokenized and isinstance(text, (list, tuple)))
or (is_pretokenized and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)))
(not is_split_into_words and isinstance(text, (list, tuple)))
or (
is_split_into_words and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
)
)
if is_batched:
@@ -1941,7 +1943,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
truncation=truncation,
max_length=max_length,
stride=stride,
is_pretokenized=is_pretokenized,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
@@ -1962,7 +1964,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
truncation=truncation,
max_length=max_length,
stride=stride,
is_pretokenized=is_pretokenized,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
@@ -1985,7 +1987,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
is_pretokenized: bool = False,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
@@ -2032,7 +2034,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
is_pretokenized=is_pretokenized,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
@@ -2054,7 +2056,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_pretokenized: bool = False,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
@@ -2084,7 +2086,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
is_pretokenized: bool = False,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
@@ -2126,7 +2128,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
is_pretokenized=is_pretokenized,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
@@ -2154,7 +2156,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_pretokenized: bool = False,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,