Uniformize OwlViT and Owlv2 processors (#35700)

* uniformize owlvit processor

* uniformize owlv2

* nit

* add positional arg test owlvit

* run-slow: owlvit, owlv2

* run-slow: owlvit, owlv2

* remove one letter variable
This commit is contained in:
Yoni Gozlan
2025-02-13 17:30:26 -05:00
committed by GitHub
parent e6a7981711
commit 336dc69d63
4 changed files with 203 additions and 66 deletions

View File

@@ -21,8 +21,16 @@ from typing import TYPE_CHECKING, List, Optional, Tuple, Union
import numpy as np import numpy as np
from ...processing_utils import ProcessorMixin from ...image_processing_utils import BatchFeature
from ...tokenization_utils_base import BatchEncoding from ...image_utils import ImageInput
from ...processing_utils import (
ImagesKwargs,
ProcessingKwargs,
ProcessorMixin,
Unpack,
_validate_images_text_input_order,
)
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available
@@ -30,6 +38,23 @@ if TYPE_CHECKING:
from .modeling_owlv2 import Owlv2ImageGuidedObjectDetectionOutput, Owlv2ObjectDetectionOutput from .modeling_owlv2 import Owlv2ImageGuidedObjectDetectionOutput, Owlv2ObjectDetectionOutput
class Owlv2ImagesKwargs(ImagesKwargs, total=False):
    """Image keyword arguments of the Owlv2 processor: `ImagesKwargs` plus `query_images`."""

    # Optional query image(s); declared here so `query_images` is a recognized image kwarg.
    query_images: Optional[ImageInput]
class Owlv2ProcessorKwargs(ProcessingKwargs, total=False):
    """Typed keyword arguments for the Owlv2 processor call, with its per-group defaults."""

    images_kwargs: Owlv2ImagesKwargs

    # `padding="max_length"` and `return_tensors="np"` are the same values the
    # previous explicit `__call__` signature used as parameter defaults.
    _defaults = {
        "text_kwargs": {"padding": "max_length"},
        "images_kwargs": {},
        "common_kwargs": {"return_tensors": "np"},
    }
class Owlv2Processor(ProcessorMixin): class Owlv2Processor(ProcessorMixin):
r""" r"""
Constructs an Owlv2 processor which wraps [`Owlv2ImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`] into Constructs an Owlv2 processor which wraps [`Owlv2ImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`] into
@@ -46,12 +71,27 @@ class Owlv2Processor(ProcessorMixin):
attributes = ["image_processor", "tokenizer"] attributes = ["image_processor", "tokenizer"]
image_processor_class = "Owlv2ImageProcessor" image_processor_class = "Owlv2ImageProcessor"
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
# For backward compatibility. See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details.
optional_call_args = ["query_images"]
def __init__(self, image_processor, tokenizer, **kwargs): def __init__(self, image_processor, tokenizer, **kwargs):
super().__init__(image_processor, tokenizer) super().__init__(image_processor, tokenizer)
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OwlViT->Owlv2 # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OwlViT->Owlv2
def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs): def __call__(
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
# The following is to capture `query_images` argument that may be passed as a positional argument.
# See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details,
# or this conversation for more context: https://github.com/huggingface/transformers/pull/32544#discussion_r1720208116
# This behavior is only needed for backward compatibility and will be removed in future versions.
#
*args,
audio=None,
videos=None,
**kwargs: Unpack[Owlv2ProcessorKwargs],
) -> BatchFeature:
""" """
Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
`kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode: `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
@@ -60,14 +100,14 @@ class Owlv2Processor(ProcessorMixin):
of the above two methods for more information. of the above two methods for more information.
Args: Args:
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`,
`List[torch.Tensor]`): `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported. tensor. Both channels-first and channels-last formats are supported.
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The query image to be prepared, one query image is expected per target image to be queried. Each image The query image to be prepared, one query image is expected per target image to be queried. Each image
can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image
@@ -78,36 +118,49 @@ class Owlv2Processor(ProcessorMixin):
- `'pt'`: Return PyTorch `torch.Tensor` objects. - `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects. - `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects. - `'jax'`: Return JAX `jnp.ndarray` objects.
Returns: Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields: [`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`). `None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
- **query_pixel_values** -- Pixel values of the query images to be fed to a model. Returned when `query_images` is not `None`.
""" """
output_kwargs = self._merge_kwargs(
Owlv2ProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
**self.prepare_and_validate_optional_call_args(*args),
)
query_images = output_kwargs["images_kwargs"].pop("query_images", None)
return_tensors = output_kwargs["common_kwargs"]["return_tensors"]
if text is None and query_images is None and images is None: if text is None and query_images is None and images is None:
raise ValueError( raise ValueError(
"You have to specify at least one text or query image or image. All three cannot be none." "You have to specify at least one text or query image or image. All three cannot be none."
) )
# check if images and text inputs are reversed for BC
images, text = _validate_images_text_input_order(images, text)
data = {}
if text is not None: if text is not None:
if isinstance(text, str) or (isinstance(text, List) and not isinstance(text[0], List)): if isinstance(text, str) or (isinstance(text, List) and not isinstance(text[0], List)):
encodings = [self.tokenizer(text, padding=padding, return_tensors=return_tensors, **kwargs)] encodings = [self.tokenizer(text, **output_kwargs["text_kwargs"])]
elif isinstance(text, List) and isinstance(text[0], List): elif isinstance(text, List) and isinstance(text[0], List):
encodings = [] encodings = []
# Maximum number of queries across batch # Maximum number of queries across batch
max_num_queries = max([len(t) for t in text]) max_num_queries = max([len(text_single) for text_single in text])
# Pad all batch samples to max number of text queries # Pad all batch samples to max number of text queries
for t in text: for text_single in text:
if len(t) != max_num_queries: if len(text_single) != max_num_queries:
t = t + [" "] * (max_num_queries - len(t)) text_single = text_single + [" "] * (max_num_queries - len(text_single))
encoding = self.tokenizer(t, padding=padding, return_tensors=return_tensors, **kwargs) encoding = self.tokenizer(text_single, **output_kwargs["text_kwargs"])
encodings.append(encoding) encodings.append(encoding)
else: else:
raise TypeError("Input text should be a string, a list of strings or a nested list of strings") raise TypeError("Input text should be a string, a list of strings or a nested list of strings")
@@ -137,30 +190,19 @@ class Owlv2Processor(ProcessorMixin):
else: else:
raise ValueError("Target return tensor type could not be returned") raise ValueError("Target return tensor type could not be returned")
encoding = BatchEncoding() data["input_ids"] = input_ids
encoding["input_ids"] = input_ids data["attention_mask"] = attention_mask
encoding["attention_mask"] = attention_mask
if query_images is not None: if query_images is not None:
encoding = BatchEncoding() query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values
query_pixel_values = self.image_processor( # Query images always override the text prompt
query_images, return_tensors=return_tensors, **kwargs data = {"query_pixel_values": query_pixel_values}
).pixel_values
encoding["query_pixel_values"] = query_pixel_values
if images is not None: if images is not None:
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
data["pixel_values"] = image_features.pixel_values
if text is not None and images is not None: return BatchFeature(data=data, tensor_type=return_tensors)
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif query_images is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif text is not None or query_images is not None:
return encoding
else:
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OwlViT->Owlv2 # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OwlViT->Owlv2
def post_process_object_detection(self, *args, **kwargs): def post_process_object_detection(self, *args, **kwargs):

View File

@@ -21,8 +21,16 @@ from typing import TYPE_CHECKING, List, Optional, Tuple, Union
import numpy as np import numpy as np
from ...processing_utils import ProcessorMixin from ...image_processing_utils import BatchFeature
from ...tokenization_utils_base import BatchEncoding from ...image_utils import ImageInput
from ...processing_utils import (
ImagesKwargs,
ProcessingKwargs,
ProcessorMixin,
Unpack,
_validate_images_text_input_order,
)
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available
@@ -30,6 +38,23 @@ if TYPE_CHECKING:
from .modeling_owlvit import OwlViTImageGuidedObjectDetectionOutput, OwlViTObjectDetectionOutput from .modeling_owlvit import OwlViTImageGuidedObjectDetectionOutput, OwlViTObjectDetectionOutput
class OwlViTImagesKwargs(ImagesKwargs, total=False):
    """Image keyword arguments of the OWL-ViT processor: `ImagesKwargs` plus `query_images`."""

    # Optional query image(s); declared here so `query_images` is a recognized image kwarg.
    query_images: Optional[ImageInput]
class OwlViTProcessorKwargs(ProcessingKwargs, total=False):
    """Typed keyword arguments for the OWL-ViT processor call, with its per-group defaults."""

    images_kwargs: OwlViTImagesKwargs

    # `padding="max_length"` and `return_tensors="np"` are the same values the
    # previous explicit `__call__` signature used as parameter defaults.
    _defaults = {
        "text_kwargs": {"padding": "max_length"},
        "images_kwargs": {},
        "common_kwargs": {"return_tensors": "np"},
    }
class OwlViTProcessor(ProcessorMixin): class OwlViTProcessor(ProcessorMixin):
r""" r"""
Constructs an OWL-ViT processor which wraps [`OwlViTImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`] Constructs an OWL-ViT processor which wraps [`OwlViTImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`]
@@ -46,6 +71,8 @@ class OwlViTProcessor(ProcessorMixin):
attributes = ["image_processor", "tokenizer"] attributes = ["image_processor", "tokenizer"]
image_processor_class = "OwlViTImageProcessor" image_processor_class = "OwlViTImageProcessor"
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
# For backward compatibility. See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details.
optional_call_args = ["query_images"]
def __init__(self, image_processor=None, tokenizer=None, **kwargs): def __init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = None feature_extractor = None
@@ -65,7 +92,20 @@ class OwlViTProcessor(ProcessorMixin):
super().__init__(image_processor, tokenizer) super().__init__(image_processor, tokenizer)
def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs): def __call__(
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
# The following is to capture `query_images` argument that may be passed as a positional argument.
# See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details,
# or this conversation for more context: https://github.com/huggingface/transformers/pull/32544#discussion_r1720208116
# This behavior is only needed for backward compatibility and will be removed in future versions.
#
*args,
audio=None,
videos=None,
**kwargs: Unpack[OwlViTProcessorKwargs],
) -> BatchFeature:
""" """
Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
`kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode: `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
@@ -74,14 +114,14 @@ class OwlViTProcessor(ProcessorMixin):
of the above two methods for more information. of the above two methods for more information.
Args: Args:
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`,
`List[torch.Tensor]`): `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported. tensor. Both channels-first and channels-last formats are supported.
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The query image to be prepared, one query image is expected per target image to be queried. Each image The query image to be prepared, one query image is expected per target image to be queried. Each image
can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image
@@ -92,36 +132,49 @@ class OwlViTProcessor(ProcessorMixin):
- `'pt'`: Return PyTorch `torch.Tensor` objects. - `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects. - `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects. - `'jax'`: Return JAX `jnp.ndarray` objects.
Returns: Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields: [`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`). `None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
- **query_pixel_values** -- Pixel values of the query images to be fed to a model. Returned when `query_images` is not `None`.
""" """
output_kwargs = self._merge_kwargs(
OwlViTProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
**self.prepare_and_validate_optional_call_args(*args),
)
query_images = output_kwargs["images_kwargs"].pop("query_images", None)
return_tensors = output_kwargs["common_kwargs"]["return_tensors"]
if text is None and query_images is None and images is None: if text is None and query_images is None and images is None:
raise ValueError( raise ValueError(
"You have to specify at least one text or query image or image. All three cannot be none." "You have to specify at least one text or query image or image. All three cannot be none."
) )
# check if images and text inputs are reversed for BC
images, text = _validate_images_text_input_order(images, text)
data = {}
if text is not None: if text is not None:
if isinstance(text, str) or (isinstance(text, List) and not isinstance(text[0], List)): if isinstance(text, str) or (isinstance(text, List) and not isinstance(text[0], List)):
encodings = [self.tokenizer(text, padding=padding, return_tensors=return_tensors, **kwargs)] encodings = [self.tokenizer(text, **output_kwargs["text_kwargs"])]
elif isinstance(text, List) and isinstance(text[0], List): elif isinstance(text, List) and isinstance(text[0], List):
encodings = [] encodings = []
# Maximum number of queries across batch # Maximum number of queries across batch
max_num_queries = max([len(t) for t in text]) max_num_queries = max([len(text_single) for text_single in text])
# Pad all batch samples to max number of text queries # Pad all batch samples to max number of text queries
for t in text: for text_single in text:
if len(t) != max_num_queries: if len(text_single) != max_num_queries:
t = t + [" "] * (max_num_queries - len(t)) text_single = text_single + [" "] * (max_num_queries - len(text_single))
encoding = self.tokenizer(t, padding=padding, return_tensors=return_tensors, **kwargs) encoding = self.tokenizer(text_single, **output_kwargs["text_kwargs"])
encodings.append(encoding) encodings.append(encoding)
else: else:
raise TypeError("Input text should be a string, a list of strings or a nested list of strings") raise TypeError("Input text should be a string, a list of strings or a nested list of strings")
@@ -151,30 +204,19 @@ class OwlViTProcessor(ProcessorMixin):
else: else:
raise ValueError("Target return tensor type could not be returned") raise ValueError("Target return tensor type could not be returned")
encoding = BatchEncoding() data["input_ids"] = input_ids
encoding["input_ids"] = input_ids data["attention_mask"] = attention_mask
encoding["attention_mask"] = attention_mask
if query_images is not None: if query_images is not None:
encoding = BatchEncoding() query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values
query_pixel_values = self.image_processor( # Query images always override the text prompt
query_images, return_tensors=return_tensors, **kwargs data = {"query_pixel_values": query_pixel_values}
).pixel_values
encoding["query_pixel_values"] = query_pixel_values
if images is not None: if images is not None:
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
data["pixel_values"] = image_features.pixel_values
if text is not None and images is not None: return BatchFeature(data=data, tensor_type=return_tensors)
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif query_images is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif text is not None or query_images is not None:
return encoding
else:
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
def post_process(self, *args, **kwargs): def post_process(self, *args, **kwargs):
""" """

View File

@@ -0,0 +1,38 @@
import shutil
import tempfile
import unittest
import pytest
from transformers import Owlv2Processor
from transformers.testing_utils import require_scipy
from ...test_processing_common import ProcessorTesterMixin
@require_scipy
class Owlv2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Processor tests for `Owlv2Processor`, using a checkpoint saved to a scratch directory."""

    processor_class = Owlv2Processor

    def setUp(self):
        # Persist the pretrained processor into a temp directory; tearDown removes it.
        self.tmpdirname = tempfile.mkdtemp()
        pretrained = self.processor_class.from_pretrained("google/owlv2-base-patch16-ensemble")
        pretrained.save_pretrained(self.tmpdirname)

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    def test_processor_query_images_positional(self):
        """`query_images` given as a third positional argument yields query and target pixel values."""
        components = self.prepare_components()
        processor = Owlv2Processor(**components)

        target_images = self.prepare_image_inputs()
        query_images = self.prepare_image_inputs()

        # No text prompt: first positional is None, followed by target images and query images.
        outputs = processor(None, target_images, query_images)
        self.assertListEqual(list(outputs.keys()), ["query_pixel_values", "pixel_values"])

        # test if it raises when no input is passed
        with pytest.raises(ValueError):
            processor()

View File

@@ -232,6 +232,21 @@ class OwlViTProcessorTest(ProcessorTesterMixin, unittest.TestCase):
with pytest.raises(ValueError): with pytest.raises(ValueError):
processor() processor()
def test_processor_query_images_positional(self):
    """`query_images` given as a third positional argument yields query and target pixel values."""
    components = self.prepare_components()
    processor = OwlViTProcessor(**components)

    target_images = self.prepare_image_inputs()
    query_images = self.prepare_image_inputs()

    # No text prompt: first positional is None, followed by target images and query images.
    outputs = processor(None, target_images, query_images)
    self.assertListEqual(list(outputs.keys()), ["query_pixel_values", "pixel_values"])

    # test if it raises when no input is passed
    with pytest.raises(ValueError):
        processor()
def test_tokenizer_decode(self): def test_tokenizer_decode(self):
image_processor = self.get_image_processor() image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()