Save Processor (#27761)
* save processor * Update tests/models/auto/test_processor_auto.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update tests/test_processing_common.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
@@ -25,8 +25,9 @@ from ...configuration_utils import PretrainedConfig
|
|||||||
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
|
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
|
||||||
from ...feature_extraction_utils import FeatureExtractionMixin
|
from ...feature_extraction_utils import FeatureExtractionMixin
|
||||||
from ...image_processing_utils import ImageProcessingMixin
|
from ...image_processing_utils import ImageProcessingMixin
|
||||||
|
from ...processing_utils import ProcessorMixin
|
||||||
from ...tokenization_utils import TOKENIZER_CONFIG_FILE
|
from ...tokenization_utils import TOKENIZER_CONFIG_FILE
|
||||||
from ...utils import FEATURE_EXTRACTOR_NAME, get_file_from_repo, logging
|
from ...utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, get_file_from_repo, logging
|
||||||
from .auto_factory import _LazyAutoMapping
|
from .auto_factory import _LazyAutoMapping
|
||||||
from .configuration_auto import (
|
from .configuration_auto import (
|
||||||
CONFIG_MAPPING_NAMES,
|
CONFIG_MAPPING_NAMES,
|
||||||
@@ -227,12 +228,24 @@ class AutoProcessor:
|
|||||||
processor_class = None
|
processor_class = None
|
||||||
processor_auto_map = None
|
processor_auto_map = None
|
||||||
|
|
||||||
# First, let's see if we have a preprocessor config.
|
# First, let's see if we have a processor or preprocessor config.
|
||||||
# Filter the kwargs for `get_file_from_repo`.
|
# Filter the kwargs for `get_file_from_repo`.
|
||||||
get_file_from_repo_kwargs = {
|
get_file_from_repo_kwargs = {
|
||||||
key: kwargs[key] for key in inspect.signature(get_file_from_repo).parameters.keys() if key in kwargs
|
key: kwargs[key] for key in inspect.signature(get_file_from_repo).parameters.keys() if key in kwargs
|
||||||
}
|
}
|
||||||
# Let's start by checking whether the processor class is saved in an image processor
|
|
||||||
|
# Let's start by checking whether the processor class is saved in a processor config
|
||||||
|
processor_config_file = get_file_from_repo(
|
||||||
|
pretrained_model_name_or_path, PROCESSOR_NAME, **get_file_from_repo_kwargs
|
||||||
|
)
|
||||||
|
if processor_config_file is not None:
|
||||||
|
config_dict, _ = ProcessorMixin.get_processor_dict(pretrained_model_name_or_path, **kwargs)
|
||||||
|
processor_class = config_dict.get("processor_class", None)
|
||||||
|
if "AutoProcessor" in config_dict.get("auto_map", {}):
|
||||||
|
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
|
||||||
|
|
||||||
|
if processor_class is None:
|
||||||
|
# If not found, let's check whether the processor class is saved in an image processor config
|
||||||
preprocessor_config_file = get_file_from_repo(
|
preprocessor_config_file = get_file_from_repo(
|
||||||
pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, **get_file_from_repo_kwargs
|
pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, **get_file_from_repo_kwargs
|
||||||
)
|
)
|
||||||
@@ -244,7 +257,9 @@ class AutoProcessor:
|
|||||||
|
|
||||||
# If not found, let's check whether the processor class is saved in a feature extractor config
|
# If not found, let's check whether the processor class is saved in a feature extractor config
|
||||||
if preprocessor_config_file is not None and processor_class is None:
|
if preprocessor_config_file is not None and processor_class is None:
|
||||||
config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs)
|
config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(
|
||||||
|
pretrained_model_name_or_path, **kwargs
|
||||||
|
)
|
||||||
processor_class = config_dict.get("processor_class", None)
|
processor_class = config_dict.get("processor_class", None)
|
||||||
if "AutoProcessor" in config_dict.get("auto_map", {}):
|
if "AutoProcessor" in config_dict.get("auto_map", {}):
|
||||||
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
|
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
|
||||||
|
|||||||
@@ -16,14 +16,28 @@
|
|||||||
Processing saving/loading class for common processors.
|
Processing saving/loading class for common processors.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import copy
|
||||||
|
import inspect
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import warnings
|
import warnings
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Union
|
from typing import Any, Dict, Optional, Tuple, Union
|
||||||
|
|
||||||
from .dynamic_module_utils import custom_object_save
|
from .dynamic_module_utils import custom_object_save
|
||||||
from .tokenization_utils_base import PreTrainedTokenizerBase
|
from .tokenization_utils_base import PreTrainedTokenizerBase
|
||||||
from .utils import PushToHubMixin, copy_func, direct_transformers_import, logging
|
from .utils import (
|
||||||
|
PROCESSOR_NAME,
|
||||||
|
PushToHubMixin,
|
||||||
|
add_model_info_to_auto_map,
|
||||||
|
cached_file,
|
||||||
|
copy_func,
|
||||||
|
direct_transformers_import,
|
||||||
|
download_url,
|
||||||
|
is_offline_mode,
|
||||||
|
is_remote_url,
|
||||||
|
logging,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -85,10 +99,70 @@ class ProcessorMixin(PushToHubMixin):
|
|||||||
|
|
||||||
setattr(self, attribute_name, arg)
|
setattr(self, attribute_name, arg)
|
||||||
|
|
||||||
|
def to_dict(self) -> Dict[str, Any]:
    """
    Serializes this instance to a Python dictionary.

    Only attributes that are parameters of `__init__` (plus `auto_map`) are kept. The components listed in the
    class-level `attributes` (and anything named `tokenizer`, `image_processor` or `feature_extractor`) are left
    out, as are values that are `PushToHubMixin` instances or `BeamSearchDecoderCTC` objects.

    Returns:
        `Dict[str, Any]`: Dictionary of all the attributes that make up this processor instance.
    """
    # Only save the attributes that are presented in the kwargs of `__init__` ...
    init_params = inspect.signature(self.__init__).parameters
    # ... but don't save attributes like `tokenizer`, `image processor` etc.
    component_names = self.__class__.attributes
    attrs_to_save = {name for name in init_params if name not in component_names}
    # extra attributes to be kept
    attrs_to_save.add("auto_map")
    # These are never serialized here, even when a subclass does not list them in `attributes`.
    attrs_to_save -= {"tokenizer", "image_processor", "feature_extractor"}

    # Select *before* copying: deep-copying the whole `__dict__` would also copy the (potentially large or
    # non-copyable) components only to throw the copies away afterwards.
    # Some attributes have different names but contain objects that are not simple strings: skip those too.
    selected = {
        k: v
        for k, v in self.__dict__.items()
        if k in attrs_to_save
        and not (isinstance(v, PushToHubMixin) or v.__class__.__name__ == "BeamSearchDecoderCTC")
    }

    output = copy.deepcopy(selected)
    output["processor_class"] = self.__class__.__name__

    return output
|
||||||
|
|
||||||
|
def to_json_string(self) -> str:
    """
    Serializes this instance to a JSON string.

    Returns:
        `str`: String containing all the attributes that make up this processor instance in JSON format
        (2-space indentation, sorted keys, terminated by a newline).
    """
    serialized = json.dumps(self.to_dict(), indent=2, sort_keys=True)
    return f"{serialized}\n"
|
||||||
|
|
||||||
|
def to_json_file(self, json_file_path: Union[str, os.PathLike]):
    """
    Save this instance to a JSON file.

    Args:
        json_file_path (`str` or `os.PathLike`):
            Path to the JSON file in which this processor instance's parameters will be saved.
    """
    # Serialize before opening the file: opening in "w" mode truncates it, so a serialization error would
    # otherwise leave an empty/corrupted file behind (and destroy a previously valid one).
    json_string = self.to_json_string()
    with open(json_file_path, "w", encoding="utf-8") as writer:
        writer.write(json_string)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes]
|
attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes]
|
||||||
attributes_repr = "\n".join(attributes_repr)
|
attributes_repr = "\n".join(attributes_repr)
|
||||||
return f"{self.__class__.__name__}:\n{attributes_repr}"
|
return f"{self.__class__.__name__}:\n{attributes_repr}\n\n{self.to_json_string()}"
|
||||||
|
|
||||||
def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
|
def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
|
||||||
"""
|
"""
|
||||||
@@ -139,6 +213,7 @@ class ProcessorMixin(PushToHubMixin):
|
|||||||
if self._auto_class is not None:
|
if self._auto_class is not None:
|
||||||
attrs = [getattr(self, attribute_name) for attribute_name in self.attributes]
|
attrs = [getattr(self, attribute_name) for attribute_name in self.attributes]
|
||||||
configs = [(a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a) for a in attrs]
|
configs = [(a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a) for a in attrs]
|
||||||
|
configs.append(self)
|
||||||
custom_object_save(self, save_directory, config=configs)
|
custom_object_save(self, save_directory, config=configs)
|
||||||
|
|
||||||
for attribute_name in self.attributes:
|
for attribute_name in self.attributes:
|
||||||
@@ -156,6 +231,12 @@ class ProcessorMixin(PushToHubMixin):
|
|||||||
if isinstance(attribute, PreTrainedTokenizerBase):
|
if isinstance(attribute, PreTrainedTokenizerBase):
|
||||||
del attribute.init_kwargs["auto_map"]
|
del attribute.init_kwargs["auto_map"]
|
||||||
|
|
||||||
|
# If we save using the predefined names, we can load using `from_pretrained`
|
||||||
|
output_processor_file = os.path.join(save_directory, PROCESSOR_NAME)
|
||||||
|
|
||||||
|
self.to_json_file(output_processor_file)
|
||||||
|
logger.info(f"processor saved in {output_processor_file}")
|
||||||
|
|
||||||
if push_to_hub:
|
if push_to_hub:
|
||||||
self._upload_modified_files(
|
self._upload_modified_files(
|
||||||
save_directory,
|
save_directory,
|
||||||
@@ -165,6 +246,150 @@ class ProcessorMixin(PushToHubMixin):
|
|||||||
token=kwargs.get("token"),
|
token=kwargs.get("token"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
return [output_processor_file]
|
||||||
|
|
||||||
|
@classmethod
def get_processor_dict(
    cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
    processor of type [`~processing_utils.ProcessorMixin`] using `from_args_and_dict`.

    Parameters:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    Returns:
        `Tuple[Dict, Dict]`: The processor dictionary loaded from the JSON file, and the keyword arguments that
        were not consumed while resolving the file.

    Raises:
        `EnvironmentError`: If the processor file cannot be resolved, or if it is not valid JSON.
    """
    # Hub / cache related options are consumed here; whatever is left in `kwargs` is returned to the caller.
    cache_dir = kwargs.pop("cache_dir", None)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", False)
    proxies = kwargs.pop("proxies", None)
    token = kwargs.pop("token", None)
    local_files_only = kwargs.pop("local_files_only", False)
    revision = kwargs.pop("revision", None)
    subfolder = kwargs.pop("subfolder", "")

    from_pipeline = kwargs.pop("_from_pipeline", None)
    from_auto_class = kwargs.pop("_from_auto", False)

    user_agent = {"file_type": "processor", "from_auto_class": from_auto_class}
    if from_pipeline is not None:
        user_agent["using_pipeline"] = from_pipeline

    if is_offline_mode() and not local_files_only:
        logger.info("Offline mode: forcing local_files_only=True")
        local_files_only = True

    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    is_local = os.path.isdir(pretrained_model_name_or_path)
    if os.path.isfile(pretrained_model_name_or_path):
        # A direct path to a JSON file: nothing to resolve.
        resolved_processor_file = pretrained_model_name_or_path
        is_local = True
    elif is_remote_url(pretrained_model_name_or_path):
        processor_file = pretrained_model_name_or_path
        resolved_processor_file = download_url(pretrained_model_name_or_path)
    else:
        # Local directory or Hub repo id: `cached_file` handles both.
        processor_file = PROCESSOR_NAME
        try:
            # Load from local folder or from cache or download from model Hub and cache
            resolved_processor_file = cached_file(
                pretrained_model_name_or_path,
                processor_file,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
                token=token,
                user_agent=user_agent,
                revision=revision,
                subfolder=subfolder,
            )
        except EnvironmentError:
            # Re-raise any environment error raised by `cached_file`. It will have a helpful error message
            # adapted to the original exception.
            raise
        except Exception as err:
            # For any other exception, we throw a generic error (keeping the original one as its cause).
            raise EnvironmentError(
                f"Can't load processor for '{pretrained_model_name_or_path}'. If you were trying to load"
                " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
                f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
                f" directory containing a {PROCESSOR_NAME} file"
            ) from err

    try:
        # Load processor dict
        with open(resolved_processor_file, "r", encoding="utf-8") as reader:
            text = reader.read()
        processor_dict = json.loads(text)

    except json.JSONDecodeError as err:
        raise EnvironmentError(
            f"It looks like the config file at '{resolved_processor_file}' is not a valid JSON file."
        ) from err

    if is_local:
        logger.info(f"loading configuration file {resolved_processor_file}")
    else:
        logger.info(f"loading configuration file {processor_file} from cache at {resolved_processor_file}")

    if "auto_map" in processor_dict and not is_local:
        processor_dict["auto_map"] = add_model_info_to_auto_map(
            processor_dict["auto_map"], pretrained_model_name_or_path
        )

    return processor_dict, kwargs
|
||||||
|
|
||||||
|
@classmethod
def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs):
    """
    Instantiates a type of [`~processing_utils.ProcessorMixin`] from a Python dictionary of parameters.

    Args:
        args:
            Positional arguments forwarded to the processor's `__init__` before the entries of
            `processor_dict`.
        processor_dict (`Dict[str, Any]`):
            Dictionary that will be used to instantiate the processor object. Such a dictionary can be
            retrieved from a pretrained checkpoint by leveraging the
            [`~processing_utils.ProcessorMixin.to_dict`] method.
        kwargs (`Dict[str, Any]`):
            Additional parameters from which to initialize the processor object. `return_unused_kwargs`
            (`bool`, defaults to `False`) controls whether the unused ones are returned.

    Returns:
        [`~processing_utils.ProcessorMixin`]: The processor object instantiated from those
        parameters. If `return_unused_kwargs` is `True`, a `(processor, unused_kwargs)` tuple instead.
    """
    # Work on a shallow copy so the caller's dictionary is not modified.
    processor_dict = processor_dict.copy()
    return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)

    # Unlike image processors or feature extractors whose `__init__` accept `kwargs`, processors don't have
    # `kwargs`. We have to pop some unused (but specific) arguments to make it work.
    processor_dict.pop("processor_class", None)
    processor_dict.pop("auto_map", None)

    processor = cls(*args, **processor_dict)

    # Update processor with kwargs if needed. Iterate over a snapshot (in insertion order) since matching
    # keys are removed from `kwargs` along the way.
    for key in list(kwargs):
        if hasattr(processor, key):
            setattr(processor, key, kwargs.pop(key))

    # Lazy %-formatting: the processor is only rendered when INFO logging is actually enabled.
    logger.info("Processor %s", processor)
    if return_unused_kwargs:
        return processor, kwargs
    return processor
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_pretrained(
|
def from_pretrained(
|
||||||
cls,
|
cls,
|
||||||
@@ -226,7 +451,19 @@ class ProcessorMixin(PushToHubMixin):
|
|||||||
kwargs["token"] = token
|
kwargs["token"] = token
|
||||||
|
|
||||||
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
|
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
return cls(*args)
|
|
||||||
|
# Existing processors on the Hub created before #27761 was merged don't have `processor_config.json` (if not
|
||||||
|
# updated afterward), and we need to keep `from_pretrained` working. So here it falls back to an empty dict.
|
||||||
|
# However, for models added in the future, we won't get the expected error if this file is missing.
|
||||||
|
try:
|
||||||
|
processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
|
||||||
|
except EnvironmentError as e:
|
||||||
|
if "does not appear to have a file named processor_config.json." in str(e):
|
||||||
|
processor_dict, kwargs = {}, kwargs
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
|
return cls.from_args_and_dict(args, processor_dict, **kwargs)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def register_for_auto_class(cls, auto_class="AutoProcessor"):
|
def register_for_auto_class(cls, auto_class="AutoProcessor"):
|
||||||
|
|||||||
@@ -217,6 +217,7 @@ SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"
|
|||||||
CONFIG_NAME = "config.json"
|
CONFIG_NAME = "config.json"
|
||||||
FEATURE_EXTRACTOR_NAME = "preprocessor_config.json"
|
FEATURE_EXTRACTOR_NAME = "preprocessor_config.json"
|
||||||
IMAGE_PROCESSOR_NAME = FEATURE_EXTRACTOR_NAME
|
IMAGE_PROCESSOR_NAME = FEATURE_EXTRACTOR_NAME
|
||||||
|
PROCESSOR_NAME = "processor_config.json"
|
||||||
GENERATION_CONFIG_NAME = "generation_config.json"
|
GENERATION_CONFIG_NAME = "generation_config.json"
|
||||||
MODEL_CARD_NAME = "modelcard.json"
|
MODEL_CARD_NAME = "modelcard.json"
|
||||||
|
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ from transformers import (
|
|||||||
)
|
)
|
||||||
from transformers.testing_utils import TOKEN, USER, get_tests_dir, is_staging_test
|
from transformers.testing_utils import TOKEN, USER, get_tests_dir, is_staging_test
|
||||||
from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
|
from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
|
||||||
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_tokenizers_available
|
from transformers.utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, is_tokenizers_available
|
||||||
|
|
||||||
|
|
||||||
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
|
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
|
||||||
@@ -91,6 +91,28 @@ class AutoFeatureExtractorTest(unittest.TestCase):
|
|||||||
|
|
||||||
self.assertIsInstance(processor, Wav2Vec2Processor)
|
self.assertIsInstance(processor, Wav2Vec2Processor)
|
||||||
|
|
||||||
|
def test_processor_from_processor_class(self):
    """`AutoProcessor` resolves the class from the processor config once the tokenizer config lost it."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        processor = Wav2Vec2Processor(
            Wav2Vec2FeatureExtractor(),
            AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h"),
        )

        # save in new folder
        processor.save_pretrained(tmpdirname)

        # drop `processor_class` in tokenizer config
        tokenizer_config_path = os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE)
        with open(tokenizer_config_path, "r") as f:
            tokenizer_config = json.load(f)
        tokenizer_config.pop("processor_class")

        with open(tokenizer_config_path, "w") as f:
            f.write(json.dumps(tokenizer_config))

        processor = AutoProcessor.from_pretrained(tmpdirname)

    self.assertIsInstance(processor, Wav2Vec2Processor)
|
||||||
|
|
||||||
def test_processor_from_feat_extr_processor_class(self):
|
def test_processor_from_feat_extr_processor_class(self):
|
||||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||||
feature_extractor = Wav2Vec2FeatureExtractor()
|
feature_extractor = Wav2Vec2FeatureExtractor()
|
||||||
@@ -101,6 +123,14 @@ class AutoFeatureExtractorTest(unittest.TestCase):
|
|||||||
# save in new folder
|
# save in new folder
|
||||||
processor.save_pretrained(tmpdirname)
|
processor.save_pretrained(tmpdirname)
|
||||||
|
|
||||||
|
# drop `processor_class` in processor
|
||||||
|
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "r") as f:
|
||||||
|
config_dict = json.load(f)
|
||||||
|
config_dict.pop("processor_class")
|
||||||
|
|
||||||
|
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as f:
|
||||||
|
f.write(json.dumps(config_dict))
|
||||||
|
|
||||||
# drop `processor_class` in tokenizer
|
# drop `processor_class` in tokenizer
|
||||||
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "r") as f:
|
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "r") as f:
|
||||||
config_dict = json.load(f)
|
config_dict = json.load(f)
|
||||||
@@ -123,6 +153,14 @@ class AutoFeatureExtractorTest(unittest.TestCase):
|
|||||||
# save in new folder
|
# save in new folder
|
||||||
processor.save_pretrained(tmpdirname)
|
processor.save_pretrained(tmpdirname)
|
||||||
|
|
||||||
|
# drop `processor_class` in processor
|
||||||
|
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "r") as f:
|
||||||
|
config_dict = json.load(f)
|
||||||
|
config_dict.pop("processor_class")
|
||||||
|
|
||||||
|
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as f:
|
||||||
|
f.write(json.dumps(config_dict))
|
||||||
|
|
||||||
# drop `processor_class` in feature extractor
|
# drop `processor_class` in feature extractor
|
||||||
with open(os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME), "r") as f:
|
with open(os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME), "r") as f:
|
||||||
config_dict = json.load(f)
|
config_dict = json.load(f)
|
||||||
@@ -270,6 +308,45 @@ class AutoFeatureExtractorTest(unittest.TestCase):
|
|||||||
if CustomConfig in PROCESSOR_MAPPING._extra_content:
|
if CustomConfig in PROCESSOR_MAPPING._extra_content:
|
||||||
del PROCESSOR_MAPPING._extra_content[CustomConfig]
|
del PROCESSOR_MAPPING._extra_content[CustomConfig]
|
||||||
|
|
||||||
|
def test_from_pretrained_dynamic_processor_with_extra_attributes(self):
    """Extra `__init__` kwargs of a locally registered processor keep their defaults unless overridden."""

    class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
        pass

    class NewTokenizer(BertTokenizer):
        pass

    class NewProcessor(ProcessorMixin):
        feature_extractor_class = "AutoFeatureExtractor"
        tokenizer_class = "AutoTokenizer"

        def __init__(self, feature_extractor, tokenizer, processor_attr_1=1, processor_attr_2=True):
            super().__init__(feature_extractor, tokenizer)

            self.processor_attr_1 = processor_attr_1
            self.processor_attr_2 = processor_attr_2

    try:
        AutoConfig.register("custom", CustomConfig)
        AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
        AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
        AutoProcessor.register(CustomConfig, NewProcessor)
        # If remote code is not set, the default is to use local classes.
        processor = AutoProcessor.from_pretrained(
            "hf-internal-testing/test_dynamic_processor", processor_attr_2=False
        )
        self.assertEqual(processor.__class__.__name__, "NewProcessor")
        self.assertEqual(processor.processor_attr_1, 1)
        self.assertEqual(processor.processor_attr_2, False)
    finally:
        # Undo every registration made above so other tests see pristine auto mappings.
        registrations = (
            (CONFIG_MAPPING, "custom"),
            (FEATURE_EXTRACTOR_MAPPING, CustomConfig),
            (TOKENIZER_MAPPING, CustomConfig),
            (PROCESSOR_MAPPING, CustomConfig),
        )
        for mapping, registered_key in registrations:
            if registered_key in mapping._extra_content:
                del mapping._extra_content[registered_key]
|
||||||
|
|
||||||
def test_auto_processor_creates_tokenizer(self):
|
def test_auto_processor_creates_tokenizer(self):
|
||||||
processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-bert")
|
processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-bert")
|
||||||
self.assertEqual(processor.__class__.__name__, "BertTokenizerFast")
|
self.assertEqual(processor.__class__.__name__, "BertTokenizerFast")
|
||||||
|
|||||||
@@ -26,6 +26,8 @@ from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
|
|||||||
from transformers.testing_utils import require_vision
|
from transformers.testing_utils import require_vision
|
||||||
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
|
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
|
||||||
|
|
||||||
|
from ...test_processing_common import ProcessorTesterMixin
|
||||||
|
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
@@ -34,7 +36,9 @@ if is_vision_available():
|
|||||||
|
|
||||||
|
|
||||||
@require_vision
|
@require_vision
|
||||||
class CLIPProcessorTest(unittest.TestCase):
|
class CLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||||
|
processor_class = CLIPProcessor
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.tmpdirname = tempfile.mkdtemp()
|
self.tmpdirname = tempfile.mkdtemp()
|
||||||
|
|
||||||
|
|||||||
127
tests/test_processing_common.py
Normal file
127
tests/test_processing_common.py
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
import json
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from transformers import CLIPTokenizerFast, ProcessorMixin
|
||||||
|
from transformers.models.auto.processing_auto import processor_class_from_name
|
||||||
|
from transformers.testing_utils import (
|
||||||
|
check_json_file_has_correct_format,
|
||||||
|
require_tokenizers,
|
||||||
|
require_torch,
|
||||||
|
require_vision,
|
||||||
|
)
|
||||||
|
from transformers.utils import is_vision_available
|
||||||
|
|
||||||
|
|
||||||
|
if is_vision_available():
|
||||||
|
from transformers import CLIPImageProcessor
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
class ProcessorTesterMixin:
    """
    Shared save/load tests for processors.

    Concrete test cases set `processor_class` and provide `self.tmpdirname`, a directory from which every
    component listed in `processor_class.attributes` can be loaded with `from_pretrained`.
    """

    processor_class = None

    def prepare_processor_dict(self):
        """Extra keyword arguments passed to the processor's `__init__` (none by default)."""
        return {}

    def get_component(self, attribute, **kwargs):
        """Load the component named `attribute` from `self.tmpdirname`."""
        assert attribute in self.processor_class.attributes
        class_name = getattr(self.processor_class, f"{attribute}_class")
        # When several classes are declared for a component, the first one is used.
        if isinstance(class_name, tuple):
            class_name = class_name[0]

        loader = processor_class_from_name(class_name)
        return loader.from_pretrained(self.tmpdirname, **kwargs)  # noqa

    def prepare_components(self):
        """Map every attribute name of the processor class to a freshly loaded component."""
        return {attribute: self.get_component(attribute) for attribute in self.processor_class.attributes}

    def get_processor(self):
        """Build the processor under test from its components and the extra kwargs."""
        components = self.prepare_components()
        extra_kwargs = self.prepare_processor_dict()
        return self.processor_class(**components, **extra_kwargs)

    def test_processor_to_json_string(self):
        processor = self.get_processor()
        decoded = json.loads(processor.to_json_string())
        for name, expected in self.prepare_processor_dict().items():
            self.assertEqual(decoded[name], expected)
            self.assertEqual(getattr(processor, name, None), expected)

    def test_processor_from_and_save_pretrained(self):
        original = self.get_processor()

        with tempfile.TemporaryDirectory() as tmpdirname:
            saved_files = original.save_pretrained(tmpdirname)
            check_json_file_has_correct_format(saved_files[0])
            reloaded = self.processor_class.from_pretrained(tmpdirname)

        self.assertEqual(reloaded.to_dict(), original.to_dict())
|
||||||
|
|
||||||
|
|
||||||
|
class MyProcessor(ProcessorMixin):
    """Test-only processor with two extra `__init__` attributes besides its components."""

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, processor_attr_1=1, processor_attr_2=True):
        super().__init__(image_processor, tokenizer)

        # Plain attributes stored under the same names as the `__init__` parameters.
        self.processor_attr_1, self.processor_attr_2 = processor_attr_1, processor_attr_2
|
||||||
|
|
||||||
|
|
||||||
|
@require_tokenizers
@require_vision
class ProcessorTest(unittest.TestCase):
    """Save/load round-trip tests for `MyProcessor`, built from the CLIP image processor and tokenizer."""

    processor_class = MyProcessor

    def prepare_processor_dict(self):
        """Extra `__init__` kwargs used to build the processor under test."""
        return {"processor_attr_1": 1, "processor_attr_2": False}

    def get_processor(self):
        """Instantiate `MyProcessor` with pretrained CLIP components and the extra kwargs."""
        checkpoint = "openai/clip-vit-large-patch14"
        image_processor = CLIPImageProcessor.from_pretrained(checkpoint)
        tokenizer = CLIPTokenizerFast.from_pretrained(checkpoint)
        return MyProcessor(image_processor, tokenizer, **self.prepare_processor_dict())

    def test_processor_to_json_string(self):
        processor = self.get_processor()
        decoded = json.loads(processor.to_json_string())
        for name, expected in self.prepare_processor_dict().items():
            self.assertEqual(decoded[name], expected)
            self.assertEqual(getattr(processor, name, None), expected)

    def test_processor_from_and_save_pretrained(self):
        original = self.get_processor()

        with tempfile.TemporaryDirectory() as tmpdirname:
            saved_files = original.save_pretrained(tmpdirname)
            check_json_file_has_correct_format(saved_files[0])
            reloaded = self.processor_class.from_pretrained(tmpdirname)

        self.assertEqual(reloaded.to_dict(), original.to_dict())
|
||||||
Reference in New Issue
Block a user