From 6b104c5bb00b7199cb3c8d819af5528f9b20a523 Mon Sep 17 00:00:00 2001 From: ParkSangJun <30718444+cosmoquester@users.noreply.github.com> Date: Fri, 4 Mar 2022 19:57:09 +0900 Subject: [PATCH] Support CLIPTokenizerFast for CLIPProcessor (#15913) * Fix to support fast tokenizer with `CLIPProcessor` * Update CLIPProcessor test for fast tokenizer * Fix Docstring Style * Rename into meaningful Variable name in test code --- .../models/clip/processing_clip.py | 18 +++++----- tests/clip/test_processor_clip.py | 34 +++++++++++++------ 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py index 2323dbc7e8..d750d4f2d2 100644 --- a/src/transformers/models/clip/processing_clip.py +++ b/src/transformers/models/clip/processing_clip.py @@ -23,17 +23,17 @@ class CLIPProcessor(ProcessorMixin): r""" Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor. - [`CLIPProcessor`] offers all the functionalities of [`CLIPFeatureExtractor`] and [`CLIPTokenizer`]. See the + [`CLIPProcessor`] offers all the functionalities of [`CLIPFeatureExtractor`] and [`CLIPTokenizerFast`]. See the [`~CLIPProcessor.__call__`] and [`~CLIPProcessor.decode`] for more information. Args: feature_extractor ([`CLIPFeatureExtractor`]): The feature extractor is a required input. - tokenizer ([`CLIPTokenizer`]): + tokenizer ([`CLIPTokenizerFast`]): The tokenizer is a required input. """ feature_extractor_class = "CLIPFeatureExtractor" - tokenizer_class = "CLIPTokenizer" + tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) @@ -42,8 +42,8 @@ class CLIPProcessor(ProcessorMixin): def __call__(self, text=None, images=None, return_tensors=None, **kwargs): """ Main method to prepare for the model one or several sequences(s) and image(s). 
This method forwards the `text` - and `kwargs` arguments to CLIPTokenizer's [`~CLIPTokenizer.__call__`] if `text` is not `None` to encode the - text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to + and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to CLIPFeatureExtractor's [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the doctsring of the above two methods for more information. @@ -94,14 +94,14 @@ class CLIPProcessor(ProcessorMixin): def batch_decode(self, *args, **kwargs): """ - This method forwards all its arguments to CLIPTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer - to the docstring of this method for more information. + This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. """ return self.tokenizer.batch_decode(*args, **kwargs) def decode(self, *args, **kwargs): """ - This method forwards all its arguments to CLIPTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the - docstring of this method for more information. + This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. 
""" return self.tokenizer.decode(*args, **kwargs) diff --git a/tests/clip/test_processor_clip.py b/tests/clip/test_processor_clip.py index e8d7a73e53..17520a4ad9 100644 --- a/tests/clip/test_processor_clip.py +++ b/tests/clip/test_processor_clip.py @@ -21,7 +21,7 @@ import unittest import numpy as np import pytest -from transformers import CLIPTokenizer +from transformers import CLIPTokenizer, CLIPTokenizerFast from transformers.file_utils import FEATURE_EXTRACTOR_NAME, is_vision_available from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision @@ -39,7 +39,7 @@ class CLIPProcessorTest(unittest.TestCase): self.tmpdirname = tempfile.mkdtemp() # fmt: off - vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "low", "er", "lowest", "newer", "wider", "", "<|endoftext|>"] + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l", "w", "r", "t", "low", "er", "lowest", "newer", "wider", "", "<|startoftext|>", "<|endoftext|>"] # fmt: on vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "l o", "lo w", "e r", ""] @@ -68,6 +68,9 @@ class CLIPProcessorTest(unittest.TestCase): def get_tokenizer(self, **kwargs): return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs) + def get_rust_tokenizer(self, **kwargs): + return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + def get_feature_extractor(self, **kwargs): return CLIPFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) @@ -86,19 +89,28 @@ class CLIPProcessorTest(unittest.TestCase): return image_inputs def test_save_load_pretrained_default(self): - tokenizer = self.get_tokenizer() + tokenizer_slow = self.get_tokenizer() + tokenizer_fast = self.get_rust_tokenizer() feature_extractor = self.get_feature_extractor() - processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + processor_slow = CLIPProcessor(tokenizer=tokenizer_slow, 
feature_extractor=feature_extractor) + processor_slow.save_pretrained(self.tmpdirname) + processor_slow = CLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False) - processor.save_pretrained(self.tmpdirname) - processor = CLIPProcessor.from_pretrained(self.tmpdirname) + processor_fast = CLIPProcessor(tokenizer=tokenizer_fast, feature_extractor=feature_extractor) + processor_fast.save_pretrained(self.tmpdirname) + processor_fast = CLIPProcessor.from_pretrained(self.tmpdirname) - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) - self.assertIsInstance(processor.tokenizer, CLIPTokenizer) + self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) + self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) + self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) + self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer) + self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast) - self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) - self.assertIsInstance(processor.feature_extractor, CLIPFeatureExtractor) + self.assertEqual(processor_slow.feature_extractor.to_json_string(), feature_extractor.to_json_string()) + self.assertEqual(processor_fast.feature_extractor.to_json_string(), feature_extractor.to_json_string()) + self.assertIsInstance(processor_slow.feature_extractor, CLIPFeatureExtractor) + self.assertIsInstance(processor_fast.feature_extractor, CLIPFeatureExtractor) def test_save_load_pretrained_additional_features(self): processor = CLIPProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) @@ -112,7 +124,7 @@ class CLIPProcessorTest(unittest.TestCase): ) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, CLIPTokenizer) + self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast) 
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) self.assertIsInstance(processor.feature_extractor, CLIPFeatureExtractor)