Save Processor (#27761)
* save processor * Update tests/models/auto/test_processor_auto.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update tests/test_processing_common.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
@@ -25,8 +25,9 @@ from ...configuration_utils import PretrainedConfig
|
||||
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
|
||||
from ...feature_extraction_utils import FeatureExtractionMixin
|
||||
from ...image_processing_utils import ImageProcessingMixin
|
||||
from ...processing_utils import ProcessorMixin
|
||||
from ...tokenization_utils import TOKENIZER_CONFIG_FILE
|
||||
from ...utils import FEATURE_EXTRACTOR_NAME, get_file_from_repo, logging
|
||||
from ...utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, get_file_from_repo, logging
|
||||
from .auto_factory import _LazyAutoMapping
|
||||
from .configuration_auto import (
|
||||
CONFIG_MAPPING_NAMES,
|
||||
@@ -227,27 +228,41 @@ class AutoProcessor:
|
||||
processor_class = None
|
||||
processor_auto_map = None
|
||||
|
||||
# First, let's see if we have a preprocessor config.
|
||||
# First, let's see if we have a processor or preprocessor config.
|
||||
# Filter the kwargs for `get_file_from_repo`.
|
||||
get_file_from_repo_kwargs = {
|
||||
key: kwargs[key] for key in inspect.signature(get_file_from_repo).parameters.keys() if key in kwargs
|
||||
}
|
||||
# Let's start by checking whether the processor class is saved in an image processor
|
||||
preprocessor_config_file = get_file_from_repo(
|
||||
pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, **get_file_from_repo_kwargs
|
||||
|
||||
# Let's start by checking whether the processor class is saved in a processor config
|
||||
processor_config_file = get_file_from_repo(
|
||||
pretrained_model_name_or_path, PROCESSOR_NAME, **get_file_from_repo_kwargs
|
||||
)
|
||||
if preprocessor_config_file is not None:
|
||||
config_dict, _ = ImageProcessingMixin.get_image_processor_dict(pretrained_model_name_or_path, **kwargs)
|
||||
if processor_config_file is not None:
|
||||
config_dict, _ = ProcessorMixin.get_processor_dict(pretrained_model_name_or_path, **kwargs)
|
||||
processor_class = config_dict.get("processor_class", None)
|
||||
if "AutoProcessor" in config_dict.get("auto_map", {}):
|
||||
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
|
||||
|
||||
# If not found, let's check whether the processor class is saved in a feature extractor config
|
||||
if preprocessor_config_file is not None and processor_class is None:
|
||||
config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs)
|
||||
processor_class = config_dict.get("processor_class", None)
|
||||
if "AutoProcessor" in config_dict.get("auto_map", {}):
|
||||
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
|
||||
if processor_class is None:
|
||||
# If not found, let's check whether the processor class is saved in an image processor config
|
||||
preprocessor_config_file = get_file_from_repo(
|
||||
pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, **get_file_from_repo_kwargs
|
||||
)
|
||||
if preprocessor_config_file is not None:
|
||||
config_dict, _ = ImageProcessingMixin.get_image_processor_dict(pretrained_model_name_or_path, **kwargs)
|
||||
processor_class = config_dict.get("processor_class", None)
|
||||
if "AutoProcessor" in config_dict.get("auto_map", {}):
|
||||
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
|
||||
|
||||
# If not found, let's check whether the processor class is saved in a feature extractor config
|
||||
if preprocessor_config_file is not None and processor_class is None:
|
||||
config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(
|
||||
pretrained_model_name_or_path, **kwargs
|
||||
)
|
||||
processor_class = config_dict.get("processor_class", None)
|
||||
if "AutoProcessor" in config_dict.get("auto_map", {}):
|
||||
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
|
||||
|
||||
if processor_class is None:
|
||||
# Next, let's check whether the processor class is saved in a tokenizer
|
||||
|
||||
@@ -16,14 +16,28 @@
|
||||
Processing saving/loading class for common processors.
|
||||
"""
|
||||
|
||||
import copy
|
||||
import inspect
|
||||
import json
|
||||
import os
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
from typing import Any, Dict, Optional, Tuple, Union
|
||||
|
||||
from .dynamic_module_utils import custom_object_save
|
||||
from .tokenization_utils_base import PreTrainedTokenizerBase
|
||||
from .utils import PushToHubMixin, copy_func, direct_transformers_import, logging
|
||||
from .utils import (
|
||||
PROCESSOR_NAME,
|
||||
PushToHubMixin,
|
||||
add_model_info_to_auto_map,
|
||||
cached_file,
|
||||
copy_func,
|
||||
direct_transformers_import,
|
||||
download_url,
|
||||
is_offline_mode,
|
||||
is_remote_url,
|
||||
logging,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
@@ -85,10 +99,70 @@ class ProcessorMixin(PushToHubMixin):
|
||||
|
||||
setattr(self, attribute_name, arg)
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
    """
    Serializes this instance to a Python dictionary.

    Only attributes that are keyword arguments of `__init__` (plus `auto_map`) are kept. The processor's
    components (the names listed in the class-level `attributes`, as well as `tokenizer`, `image_processor` and
    `feature_extractor`) are never included.

    Returns:
        `Dict[str, Any]`: Dictionary of all the attributes that make up this processor instance.
    """
    # Names accepted by `__init__`; the signature of the bound method does not contain `self`.
    init_parameters = inspect.signature(self.__init__).parameters

    # Don't save attributes like `tokenizer`, `image processor` etc.
    component_names = set(self.__class__.attributes)
    attrs_to_save = {name for name in init_parameters if name not in component_names}
    # extra attributes to be kept
    attrs_to_save.add("auto_map")
    attrs_to_save -= {"tokenizer", "image_processor", "feature_extractor"}

    # Some attributes have different names but contain objects that are not simple values; drop them as well.
    # The selection is done *before* copying so that discarded objects are never deep-copied.
    selected = {
        key: value
        for key, value in self.__dict__.items()
        if key in attrs_to_save
        and not (isinstance(value, PushToHubMixin) or value.__class__.__name__ == "BeamSearchDecoderCTC")
    }

    # Deep copy so that callers cannot mutate the processor's state through the returned dictionary.
    output = copy.deepcopy(selected)
    output["processor_class"] = self.__class__.__name__

    return output
|
||||
|
||||
def to_json_string(self) -> str:
    """
    Serializes this instance to a JSON string.

    Returns:
        `str`: String containing all the attributes returned by `to_dict()` for this processor instance, in JSON
        format (2-space indentation, keys sorted, terminated by a newline).
    """
    serialized = json.dumps(self.to_dict(), indent=2, sort_keys=True)
    return serialized + "\n"
|
||||
|
||||
def to_json_file(self, json_file_path: Union[str, os.PathLike]):
    """
    Save this instance to a JSON file.

    Args:
        json_file_path (`str` or `os.PathLike`):
            Path to the JSON file in which this processor instance's parameters will be saved.
    """
    # Serialize first: if serialization fails, the target file is neither created nor truncated.
    json_string = self.to_json_string()
    with open(json_file_path, "w", encoding="utf-8") as writer:
        writer.write(json_string)
|
||||
|
||||
def __repr__(self):
    """
    Return a readable description of the processor: the class name, one `- name: repr` line per entry of
    `self.attributes`, then a blank line followed by the JSON string produced by `to_json_string()`.
    """
    attributes_repr = "\n".join(f"- {name}: {repr(getattr(self, name))}" for name in self.attributes)
    return f"{self.__class__.__name__}:\n{attributes_repr}\n\n{self.to_json_string()}"
|
||||
|
||||
def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
|
||||
"""
|
||||
@@ -139,6 +213,7 @@ class ProcessorMixin(PushToHubMixin):
|
||||
if self._auto_class is not None:
|
||||
attrs = [getattr(self, attribute_name) for attribute_name in self.attributes]
|
||||
configs = [(a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a) for a in attrs]
|
||||
configs.append(self)
|
||||
custom_object_save(self, save_directory, config=configs)
|
||||
|
||||
for attribute_name in self.attributes:
|
||||
@@ -156,6 +231,12 @@ class ProcessorMixin(PushToHubMixin):
|
||||
if isinstance(attribute, PreTrainedTokenizerBase):
|
||||
del attribute.init_kwargs["auto_map"]
|
||||
|
||||
# If we save using the predefined names, we can load using `from_pretrained`
|
||||
output_processor_file = os.path.join(save_directory, PROCESSOR_NAME)
|
||||
|
||||
self.to_json_file(output_processor_file)
|
||||
logger.info(f"processor saved in {output_processor_file}")
|
||||
|
||||
if push_to_hub:
|
||||
self._upload_modified_files(
|
||||
save_directory,
|
||||
@@ -165,6 +246,150 @@ class ProcessorMixin(PushToHubMixin):
|
||||
token=kwargs.get("token"),
|
||||
)
|
||||
|
||||
return [output_processor_file]
|
||||
|
||||
@classmethod
def get_processor_dict(
    cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
    processor of type [`~processing_utils.ProcessorMixin`] using `from_args_and_dict`.

    Parameters:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    Returns:
        `Tuple[Dict, Dict]`: The dictionary loaded from the processor config file, and the keyword arguments that
        were not consumed by this method.

    Raises:
        `EnvironmentError`: If the processor file cannot be resolved or is not a valid JSON file.
    """
    # Options consumed here; whatever is left in `kwargs` is handed back to the caller.
    cache_dir = kwargs.pop("cache_dir", None)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", False)
    proxies = kwargs.pop("proxies", None)
    token = kwargs.pop("token", None)
    local_files_only = kwargs.pop("local_files_only", False)
    revision = kwargs.pop("revision", None)
    subfolder = kwargs.pop("subfolder", "")

    from_pipeline = kwargs.pop("_from_pipeline", None)
    from_auto_class = kwargs.pop("_from_auto", False)

    user_agent = {"file_type": "processor", "from_auto_class": from_auto_class}
    if from_pipeline is not None:
        user_agent["using_pipeline"] = from_pipeline

    if is_offline_mode() and not local_files_only:
        logger.info("Offline mode: forcing local_files_only=True")
        local_files_only = True

    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    # A directory counts as local; it is resolved through `cached_file` in the last branch below.
    is_local = os.path.isdir(pretrained_model_name_or_path)
    if os.path.isfile(pretrained_model_name_or_path):
        # Direct path to a processor file.
        resolved_processor_file = pretrained_model_name_or_path
        is_local = True
    elif is_remote_url(pretrained_model_name_or_path):
        processor_file = pretrained_model_name_or_path
        resolved_processor_file = download_url(pretrained_model_name_or_path)
    else:
        processor_file = PROCESSOR_NAME
        try:
            # Load from local folder or from cache or download from model Hub and cache
            resolved_processor_file = cached_file(
                pretrained_model_name_or_path,
                processor_file,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
                token=token,
                user_agent=user_agent,
                revision=revision,
                subfolder=subfolder,
            )
        except EnvironmentError:
            # Re-raise any environment error raised by `cached_file`. It will have a helpful error message
            # adapted to the original exception.
            raise
        except Exception as err:
            # For any other exception, we throw a generic error.
            raise EnvironmentError(
                f"Can't load processor for '{pretrained_model_name_or_path}'. If you were trying to load"
                " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
                f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
                f" directory containing a {PROCESSOR_NAME} file"
            ) from err

    try:
        # Load processor dict
        with open(resolved_processor_file, "r", encoding="utf-8") as reader:
            text = reader.read()
        processor_dict = json.loads(text)
    except json.JSONDecodeError as err:
        raise EnvironmentError(
            f"It looks like the config file at '{resolved_processor_file}' is not a valid JSON file."
        ) from err

    if is_local:
        logger.info(f"loading configuration file {resolved_processor_file}")
    else:
        logger.info(f"loading configuration file {processor_file} from cache at {resolved_processor_file}")

    # For non-local checkpoints, let `add_model_info_to_auto_map` rewrite the `auto_map` entries using the
    # checkpoint identifier.
    if "auto_map" in processor_dict and not is_local:
        processor_dict["auto_map"] = add_model_info_to_auto_map(
            processor_dict["auto_map"], pretrained_model_name_or_path
        )

    return processor_dict, kwargs
|
||||
|
||||
@classmethod
def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs):
    """
    Instantiates a type of [`~processing_utils.ProcessorMixin`] from a Python dictionary of parameters.

    Args:
        args:
            Positional arguments forwarded as-is to the class constructor, before the entries of
            `processor_dict`.
        processor_dict (`Dict[str, Any]`):
            Dictionary that will be used to instantiate the processor object. Such a dictionary can be
            retrieved from a pretrained checkpoint by leveraging the
            [`~processing_utils.ProcessorMixin.to_dict`] method.
        kwargs (`Dict[str, Any]`):
            Additional parameters from which to initialize the processor object. The special key
            `return_unused_kwargs` controls the return value (see below).

    Returns:
        [`~processing_utils.ProcessorMixin`]: The processor object instantiated from those parameters. If
        `return_unused_kwargs` is truthy, a tuple `(processor, unused_kwargs)` is returned instead.
    """
    # Work on a shallow copy so the caller's dictionary is left untouched.
    processor_dict = processor_dict.copy()
    return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)

    # Unlike image processors or feature extractors whose `__init__` accept `kwargs`, processors don't have
    # `kwargs`. We have to remove some unused (but specific) arguments to make it work.
    processor_dict.pop("processor_class", None)
    processor_dict.pop("auto_map", None)

    processor = cls(*args, **processor_dict)

    # Update processor with kwargs if needed. Iterate over a snapshot of the keys because entries are popped
    # from `kwargs` inside the loop.
    for key in list(kwargs):
        if hasattr(processor, key):
            setattr(processor, key, kwargs.pop(key))

    logger.info(f"Processor {processor}")
    if return_unused_kwargs:
        return processor, kwargs
    return processor
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls,
|
||||
@@ -226,7 +451,19 @@ class ProcessorMixin(PushToHubMixin):
|
||||
kwargs["token"] = token
|
||||
|
||||
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
return cls(*args)
|
||||
|
||||
# Existing processors on the Hub created before #27761 was merged don't have `processor_config.json` (if not
|
||||
# updated afterward), and we need to keep `from_pretrained` working. So here it falls back to an empty dict.
|
||||
# However, for models added in the future, we won't get the expected error if this file is missing.
|
||||
try:
|
||||
processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
|
||||
except EnvironmentError as e:
|
||||
if "does not appear to have a file named processor_config.json." in str(e):
|
||||
processor_dict, kwargs = {}, kwargs
|
||||
else:
|
||||
raise
|
||||
|
||||
return cls.from_args_and_dict(args, processor_dict, **kwargs)
|
||||
|
||||
@classmethod
|
||||
def register_for_auto_class(cls, auto_class="AutoProcessor"):
|
||||
|
||||
@@ -217,6 +217,7 @@ SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"
|
||||
CONFIG_NAME = "config.json"
|
||||
FEATURE_EXTRACTOR_NAME = "preprocessor_config.json"
|
||||
IMAGE_PROCESSOR_NAME = FEATURE_EXTRACTOR_NAME
|
||||
PROCESSOR_NAME = "processor_config.json"
|
||||
GENERATION_CONFIG_NAME = "generation_config.json"
|
||||
MODEL_CARD_NAME = "modelcard.json"
|
||||
|
||||
|
||||
@@ -42,7 +42,7 @@ from transformers import (
|
||||
)
|
||||
from transformers.testing_utils import TOKEN, USER, get_tests_dir, is_staging_test
|
||||
from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
|
||||
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_tokenizers_available
|
||||
from transformers.utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, is_tokenizers_available
|
||||
|
||||
|
||||
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
|
||||
@@ -91,6 +91,28 @@ class AutoFeatureExtractorTest(unittest.TestCase):
|
||||
|
||||
self.assertIsInstance(processor, Wav2Vec2Processor)
|
||||
|
||||
def test_processor_from_processor_class(self):
    """`AutoProcessor` must still load a `Wav2Vec2Processor` after `processor_class` is removed from the saved tokenizer config."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        feature_extractor = Wav2Vec2FeatureExtractor()
        tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")

        processor = Wav2Vec2Processor(feature_extractor, tokenizer)

        # save in new folder
        processor.save_pretrained(tmpdirname)

        # drop `processor_class` in tokenizer config
        tokenizer_config_path = os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE)
        with open(tokenizer_config_path, "r", encoding="utf-8") as f:
            config_dict = json.load(f)
        config_dict.pop("processor_class")

        with open(tokenizer_config_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(config_dict))

        processor = AutoProcessor.from_pretrained(tmpdirname)

    self.assertIsInstance(processor, Wav2Vec2Processor)
|
||||
|
||||
def test_processor_from_feat_extr_processor_class(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
feature_extractor = Wav2Vec2FeatureExtractor()
|
||||
@@ -101,6 +123,14 @@ class AutoFeatureExtractorTest(unittest.TestCase):
|
||||
# save in new folder
|
||||
processor.save_pretrained(tmpdirname)
|
||||
|
||||
# drop `processor_class` in processor
|
||||
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "r") as f:
|
||||
config_dict = json.load(f)
|
||||
config_dict.pop("processor_class")
|
||||
|
||||
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as f:
|
||||
f.write(json.dumps(config_dict))
|
||||
|
||||
# drop `processor_class` in tokenizer
|
||||
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "r") as f:
|
||||
config_dict = json.load(f)
|
||||
@@ -123,6 +153,14 @@ class AutoFeatureExtractorTest(unittest.TestCase):
|
||||
# save in new folder
|
||||
processor.save_pretrained(tmpdirname)
|
||||
|
||||
# drop `processor_class` in processor
|
||||
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "r") as f:
|
||||
config_dict = json.load(f)
|
||||
config_dict.pop("processor_class")
|
||||
|
||||
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as f:
|
||||
f.write(json.dumps(config_dict))
|
||||
|
||||
# drop `processor_class` in feature extractor
|
||||
with open(os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME), "r") as f:
|
||||
config_dict = json.load(f)
|
||||
@@ -270,6 +308,45 @@ class AutoFeatureExtractorTest(unittest.TestCase):
|
||||
if CustomConfig in PROCESSOR_MAPPING._extra_content:
|
||||
del PROCESSOR_MAPPING._extra_content[CustomConfig]
|
||||
|
||||
def test_from_pretrained_dynamic_processor_with_extra_attributes(self):
    """
    A locally registered processor with extra `__init__` keyword arguments keeps the default for an argument that
    is not passed (`processor_attr_1`) and takes the value given to `from_pretrained` for one that is
    (`processor_attr_2`).
    """

    class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
        pass

    class NewTokenizer(BertTokenizer):
        pass

    class NewProcessor(ProcessorMixin):
        feature_extractor_class = "AutoFeatureExtractor"
        tokenizer_class = "AutoTokenizer"

        def __init__(self, feature_extractor, tokenizer, processor_attr_1=1, processor_attr_2=True):
            super().__init__(feature_extractor, tokenizer)

            self.processor_attr_1 = processor_attr_1
            self.processor_attr_2 = processor_attr_2

    try:
        AutoConfig.register("custom", CustomConfig)
        AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
        AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
        AutoProcessor.register(CustomConfig, NewProcessor)
        # If remote code is not set, the default is to use local classes.
        processor = AutoProcessor.from_pretrained(
            "hf-internal-testing/test_dynamic_processor", processor_attr_2=False
        )
        self.assertEqual(processor.__class__.__name__, "NewProcessor")
        self.assertEqual(processor.processor_attr_1, 1)
        self.assertEqual(processor.processor_attr_2, False)
    finally:
        # Undo the registrations above so they do not leak into other tests.
        registrations = (
            (CONFIG_MAPPING, "custom"),
            (FEATURE_EXTRACTOR_MAPPING, CustomConfig),
            (TOKENIZER_MAPPING, CustomConfig),
            (PROCESSOR_MAPPING, CustomConfig),
        )
        for mapping, key in registrations:
            if key in mapping._extra_content:
                del mapping._extra_content[key]
|
||||
|
||||
def test_auto_processor_creates_tokenizer(self):
|
||||
processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-bert")
|
||||
self.assertEqual(processor.__class__.__name__, "BertTokenizerFast")
|
||||
|
||||
@@ -26,6 +26,8 @@ from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
|
||||
from transformers.testing_utils import require_vision
|
||||
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
@@ -34,7 +36,9 @@ if is_vision_available():
|
||||
|
||||
|
||||
@require_vision
|
||||
class CLIPProcessorTest(unittest.TestCase):
|
||||
class CLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_class = CLIPProcessor
|
||||
|
||||
def setUp(self):
|
||||
self.tmpdirname = tempfile.mkdtemp()
|
||||
|
||||
|
||||
127
tests/test_processing_common.py
Normal file
127
tests/test_processing_common.py
Normal file
@@ -0,0 +1,127 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import CLIPTokenizerFast, ProcessorMixin
|
||||
from transformers.models.auto.processing_auto import processor_class_from_name
|
||||
from transformers.testing_utils import (
|
||||
check_json_file_has_correct_format,
|
||||
require_tokenizers,
|
||||
require_torch,
|
||||
require_vision,
|
||||
)
|
||||
from transformers.utils import is_vision_available
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import CLIPImageProcessor
|
||||
|
||||
|
||||
@require_torch
class ProcessorTesterMixin:
    """
    Common save/load tests for processors.

    Classes using this mixin set `processor_class` and provide `self.tmpdirname`, a directory from which each
    component of the processor can be loaded with `from_pretrained`.
    """

    # Processor class under test; set by the concrete test class.
    processor_class = None

    def prepare_processor_dict(self):
        """Return the extra keyword arguments passed to the processor constructor (none by default)."""
        return {}

    def get_component(self, attribute, **kwargs):
        """Load the component named `attribute` of the processor from `self.tmpdirname`."""
        assert attribute in self.processor_class.attributes
        component_class_name = getattr(self.processor_class, f"{attribute}_class")
        # When several class names are declared for a component, use the first one.
        if isinstance(component_class_name, tuple):
            component_class_name = component_class_name[0]

        component_class = processor_class_from_name(component_class_name)
        component = component_class.from_pretrained(self.tmpdirname, **kwargs)  # noqa

        return component

    def prepare_components(self):
        """Return a dict mapping every name in `processor_class.attributes` to its loaded component."""
        return {attribute: self.get_component(attribute) for attribute in self.processor_class.attributes}

    def get_processor(self):
        """Build a processor from the loaded components and the extra arguments of `prepare_processor_dict`."""
        components = self.prepare_components()
        processor = self.processor_class(**components, **self.prepare_processor_dict())
        return processor

    def test_processor_to_json_string(self):
        """Every extra argument must appear both in the JSON output and as an attribute of the processor."""
        processor = self.get_processor()
        obj = json.loads(processor.to_json_string())
        for key, value in self.prepare_processor_dict().items():
            self.assertEqual(obj[key], value)
            self.assertEqual(getattr(processor, key, None), value)

    def test_processor_from_and_save_pretrained(self):
        """A processor saved with `save_pretrained` and reloaded must have the same `to_dict()` output."""
        processor_first = self.get_processor()

        with tempfile.TemporaryDirectory() as tmpdirname:
            saved_file = processor_first.save_pretrained(tmpdirname)[0]
            check_json_file_has_correct_format(saved_file)
            processor_second = self.processor_class.from_pretrained(tmpdirname)

        self.assertEqual(processor_second.to_dict(), processor_first.to_dict())
|
||||
|
||||
|
||||
class MyProcessor(ProcessorMixin):
    """Minimal processor with two extra `__init__` keyword arguments, used to exercise processor saving/loading."""

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, processor_attr_1=1, processor_attr_2=True):
        super().__init__(image_processor, tokenizer)

        # Extra attributes stored on the instance under the same names as the `__init__` arguments.
        self.processor_attr_1 = processor_attr_1
        self.processor_attr_2 = processor_attr_2
|
||||
|
||||
|
||||
@require_tokenizers
@require_vision
class ProcessorTest(unittest.TestCase):
    """Save/load tests for `MyProcessor`, a processor with extra `__init__` keyword arguments."""

    processor_class = MyProcessor

    def prepare_processor_dict(self):
        """Return the extra keyword arguments used to build the processor under test."""
        return {"processor_attr_1": 1, "processor_attr_2": False}

    def get_processor(self):
        """Build a `MyProcessor` from the `openai/clip-vit-large-patch14` image processor and fast tokenizer."""
        image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
        tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
        processor = MyProcessor(image_processor, tokenizer, **self.prepare_processor_dict())

        return processor

    def test_processor_to_json_string(self):
        """Every extra argument must appear both in the JSON output and as an attribute of the processor."""
        processor = self.get_processor()
        obj = json.loads(processor.to_json_string())
        for key, value in self.prepare_processor_dict().items():
            self.assertEqual(obj[key], value)
            self.assertEqual(getattr(processor, key, None), value)

    def test_processor_from_and_save_pretrained(self):
        """A processor saved with `save_pretrained` and reloaded must have the same `to_dict()` output."""
        processor_first = self.get_processor()

        with tempfile.TemporaryDirectory() as tmpdirname:
            saved_file = processor_first.save_pretrained(tmpdirname)[0]
            check_json_file_has_correct_format(saved_file)
            processor_second = self.processor_class.from_pretrained(tmpdirname)

        self.assertEqual(processor_second.to_dict(), processor_first.to_dict())
|
||||
Reference in New Issue
Block a user