Replace as_target context managers by direct calls (#18325)
* Preliminary work on tokenizers * Quality + fix tests * Treat processors * Fix pad * Remove all uses of in tests, docs and examples * Replace all as_target_tokenizer * Fix tests * Fix quality * Update examples/flax/image-captioning/run_image_captioning_flax.py Co-authored-by: amyeroberts <amy@huggingface.co> * Style Co-authored-by: amyeroberts <amy@huggingface.co>
This commit is contained in:
@@ -1501,7 +1501,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
self.deprecation_warnings = (
|
||||
{}
|
||||
) # Use to store when we have already noticed a deprecation warning (avoid overlogging).
|
||||
|
||||
self._in_target_context_manager = False
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@property
|
||||
@@ -2431,8 +2431,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
|
||||
def __call__(
|
||||
self,
|
||||
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
|
||||
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
|
||||
text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
|
||||
text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
|
||||
text_pair_target: Optional[
|
||||
Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
|
||||
] = None,
|
||||
add_special_tokens: bool = True,
|
||||
padding: Union[bool, str, PaddingStrategy] = False,
|
||||
truncation: Union[bool, str, TruncationStrategy] = False,
|
||||
@@ -2455,15 +2459,85 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
sequences.
|
||||
|
||||
Args:
|
||||
text (`str`, `List[str]`, `List[List[str]]`):
|
||||
text (`str`, `List[str]`, `List[List[str]]`, *optional*):
|
||||
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
||||
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
||||
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
||||
text_pair (`str`, `List[str]`, `List[List[str]]`):
|
||||
text_pair (`str`, `List[str]`, `List[List[str]]`, *optional*):
|
||||
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
||||
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
||||
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
||||
text_target (`str`, `List[str]`, `List[List[str]]`, *optional*):
|
||||
The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
|
||||
list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
|
||||
you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
||||
text_pair_target (`str`, `List[str]`, `List[List[str]]`, *optional*):
|
||||
The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
|
||||
list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
|
||||
you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
||||
"""
|
||||
# To avoid duplicating
|
||||
all_kwargs = dict(
|
||||
add_special_tokens=add_special_tokens,
|
||||
padding=padding,
|
||||
truncation=truncation,
|
||||
max_length=max_length,
|
||||
stride=stride,
|
||||
is_split_into_words=is_split_into_words,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
return_tensors=return_tensors,
|
||||
return_token_type_ids=return_token_type_ids,
|
||||
return_attention_mask=return_attention_mask,
|
||||
return_overflowing_tokens=return_overflowing_tokens,
|
||||
return_special_tokens_mask=return_special_tokens_mask,
|
||||
return_offsets_mapping=return_offsets_mapping,
|
||||
return_length=return_length,
|
||||
verbose=verbose,
|
||||
)
|
||||
all_kwargs.update(kwargs)
|
||||
if text is None and text_target is None:
|
||||
raise ValueError("You need to specify either `text` or `text_target`.")
|
||||
if text is not None:
|
||||
# The context manager will send the inputs as normal texts and not text_target, but we shouldn't change the
|
||||
# input mode in this case.
|
||||
if not self._in_target_context_manager:
|
||||
self._switch_to_input_mode()
|
||||
encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
|
||||
if text_target is not None:
|
||||
self._switch_to_target_mode()
|
||||
target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **all_kwargs)
|
||||
# Leave back tokenizer in input mode
|
||||
self._switch_to_input_mode()
|
||||
|
||||
if text_target is None:
|
||||
return encodings
|
||||
elif text is None:
|
||||
return target_encodings
|
||||
else:
|
||||
encodings["labels"] = target_encodings["input_ids"]
|
||||
return encodings
|
||||
|
||||
def _call_one(
|
||||
self,
|
||||
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
|
||||
text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
|
||||
add_special_tokens: bool = True,
|
||||
padding: Union[bool, str, PaddingStrategy] = False,
|
||||
truncation: Union[bool, str, TruncationStrategy] = False,
|
||||
max_length: Optional[int] = None,
|
||||
stride: int = 0,
|
||||
is_split_into_words: bool = False,
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
return_token_type_ids: Optional[bool] = None,
|
||||
return_attention_mask: Optional[bool] = None,
|
||||
return_overflowing_tokens: bool = False,
|
||||
return_special_tokens_mask: bool = False,
|
||||
return_offsets_mapping: bool = False,
|
||||
return_length: bool = False,
|
||||
verbose: bool = True,
|
||||
**kwargs
|
||||
) -> BatchEncoding:
|
||||
# Input type checking for clearer error
|
||||
def _is_valid_text_input(t):
|
||||
if isinstance(t, str):
|
||||
@@ -3456,13 +3530,34 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
)
|
||||
self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True
|
||||
|
||||
def _switch_to_input_mode(self):
|
||||
"""
|
||||
Private method to put the tokenizer in input mode (when it has different modes for input/outputs)
|
||||
"""
|
||||
pass
|
||||
|
||||
def _switch_to_target_mode(self):
|
||||
"""
|
||||
Private method to put the tokenizer in target mode (when it has different modes for input/outputs)
|
||||
"""
|
||||
pass
|
||||
|
||||
@contextmanager
|
||||
def as_target_tokenizer(self):
|
||||
"""
|
||||
Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
|
||||
sequence-to-sequence models that need a slightly different processing for the labels.
|
||||
"""
|
||||
warnings.warn(
|
||||
"`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
|
||||
"labels by using the argument `text_target` of the regular `__call__` method (either in the same call as "
|
||||
"your input texts if you use the same keyword arguments, or in a separate call."
|
||||
)
|
||||
self._switch_to_target_mode()
|
||||
self._in_target_context_manager = True
|
||||
yield
|
||||
self._in_target_context_manager = False
|
||||
self._switch_to_input_mode()
|
||||
|
||||
@classmethod
|
||||
def register_for_auto_class(cls, auto_class="AutoTokenizer"):
|
||||
@@ -3563,14 +3658,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
# docstyle-ignore
|
||||
formatted_warning = """
|
||||
`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
|
||||
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
|
||||
your targets.
|
||||
`__call__` method to prepare your inputs and targets.
|
||||
|
||||
Here is a short example:
|
||||
|
||||
model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)
|
||||
|
||||
If you either need to use different keyword arguments for the source and target texts, you should do two calls like
|
||||
this:
|
||||
|
||||
model_inputs = tokenizer(src_texts, ...)
|
||||
with tokenizer.as_target_tokenizer():
|
||||
labels = tokenizer(tgt_texts, ...)
|
||||
labels = tokenizer(text_target=tgt_texts, ...)
|
||||
model_inputs["labels"] = labels["input_ids"]
|
||||
|
||||
See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
|
||||
|
||||
Reference in New Issue
Block a user