Pipeline testing - using tiny models on Hub (#20426)
* rework pipeline tests * run pipeline tests * fix * fix * fix * revert the changes in get_test_pipeline() parameter list * fix expected error message * skip a test * clean up --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -15,6 +15,7 @@
|
||||
|
||||
import argparse
|
||||
import collections.abc
|
||||
import copy
|
||||
import importlib
|
||||
import inspect
|
||||
import json
|
||||
@@ -31,6 +32,7 @@ from huggingface_hub import Repository, create_repo, upload_folder
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
FEATURE_EXTRACTOR_MAPPING,
|
||||
IMAGE_PROCESSOR_MAPPING,
|
||||
PROCESSOR_MAPPING,
|
||||
TOKENIZER_MAPPING,
|
||||
AutoTokenizer,
|
||||
@@ -74,29 +76,36 @@ def get_processor_types_from_config_class(config_class, allowed_mappings=None):
|
||||
|
||||
We use `tuple` here to include (potentially) both slow & fast tokenizers.
|
||||
"""
|
||||
|
||||
# To make a uniform return type
|
||||
def _to_tuple(x):
|
||||
if not isinstance(x, collections.abc.Sequence):
|
||||
x = (x,)
|
||||
else:
|
||||
x = tuple(x)
|
||||
return x
|
||||
|
||||
if allowed_mappings is None:
|
||||
allowed_mappings = ["processor", "tokenizer", "feature_extractor"]
|
||||
allowed_mappings = ["processor", "tokenizer", "image_processor", "feature_extractor"]
|
||||
|
||||
processor_types = ()
|
||||
|
||||
# Check first if a model has `ProcessorMixin`. Otherwise, check if it has tokenizers or a feature extractor.
|
||||
# Check first if a model has `ProcessorMixin`. Otherwise, check if it has tokenizers, and/or an image processor or
|
||||
# a feature extractor
|
||||
if config_class in PROCESSOR_MAPPING and "processor" in allowed_mappings:
|
||||
processor_types = PROCESSOR_MAPPING[config_class]
|
||||
elif config_class in TOKENIZER_MAPPING and "tokenizer" in allowed_mappings:
|
||||
processor_types = TOKENIZER_MAPPING[config_class]
|
||||
elif config_class in FEATURE_EXTRACTOR_MAPPING and "feature_extractor" in allowed_mappings:
|
||||
processor_types = FEATURE_EXTRACTOR_MAPPING[config_class]
|
||||
processor_types = _to_tuple(PROCESSOR_MAPPING[config_class])
|
||||
else:
|
||||
# Some configurations have no processor at all. For example, generic composite models like
|
||||
# `EncoderDecoderModel` is used for any (compatible) text models. Also, `DecisionTransformer` doesn't
|
||||
# require any processor.
|
||||
pass
|
||||
if config_class in TOKENIZER_MAPPING and "tokenizer" in allowed_mappings:
|
||||
processor_types = TOKENIZER_MAPPING[config_class]
|
||||
|
||||
# make a uniform return type
|
||||
if not isinstance(processor_types, collections.abc.Sequence):
|
||||
processor_types = (processor_types,)
|
||||
else:
|
||||
processor_types = tuple(processor_types)
|
||||
if config_class in IMAGE_PROCESSOR_MAPPING and "image_processor" in allowed_mappings:
|
||||
processor_types += _to_tuple(IMAGE_PROCESSOR_MAPPING[config_class])
|
||||
elif config_class in FEATURE_EXTRACTOR_MAPPING and "feature_extractor" in allowed_mappings:
|
||||
processor_types += _to_tuple(FEATURE_EXTRACTOR_MAPPING[config_class])
|
||||
|
||||
# Remark: some configurations have no processor at all. For example, generic composite models like
|
||||
# `EncoderDecoderModel` is used for any (compatible) text models. Also, `DecisionTransformer` doesn't
|
||||
# require any processor.
|
||||
|
||||
# We might get `None` for some tokenizers - remove them here.
|
||||
processor_types = tuple(p for p in processor_types if p is not None)
|
||||
@@ -154,7 +163,7 @@ def get_config_class_from_processor_class(processor_class):
|
||||
return new_config_class
|
||||
|
||||
|
||||
def build_processor(config_class, processor_class):
|
||||
def build_processor(config_class, processor_class, allow_no_checkpoint=False):
|
||||
"""Create a processor for `processor_class`.
|
||||
|
||||
If a processor is not able to be built with the original arguments, this method tries to change the arguments and
|
||||
@@ -264,6 +273,18 @@ def build_processor(config_class, processor_class):
|
||||
if config_class_from_processor_class != config_class:
|
||||
processor = build_processor(config_class_from_processor_class, processor_class)
|
||||
|
||||
# Try to create an image processor or a feature extractor without any checkpoint
|
||||
if (
|
||||
processor is None
|
||||
and allow_no_checkpoint
|
||||
and (issubclass(processor_class, BaseImageProcessor) or issubclass(processor_class, FeatureExtractionMixin))
|
||||
):
|
||||
try:
|
||||
processor = processor_class()
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
pass
|
||||
|
||||
# validation
|
||||
if processor is not None:
|
||||
if not (isinstance(processor, processor_class) or processor_class.__name__.startswith("Auto")):
|
||||
@@ -458,6 +479,18 @@ def convert_processors(processors, tiny_config, output_folder, result):
|
||||
result["warnings"].append(f"Failed to convert feature extractors: {e}")
|
||||
feature_extractors = []
|
||||
|
||||
if hasattr(tiny_config, "max_position_embeddings") and tiny_config.max_position_embeddings > 0:
|
||||
if fast_tokenizer is not None:
|
||||
if fast_tokenizer.__class__.__name__ in ["RobertaTokenizerFast", "XLMRobertaTokenizerFast"]:
|
||||
fast_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
|
||||
else:
|
||||
fast_tokenizer.model_max_length = tiny_config.max_position_embeddings
|
||||
if slow_tokenizer is not None:
|
||||
if slow_tokenizer.__class__.__name__ in ["RobertaTokenizer", "XLMRobertaTokenizer"]:
|
||||
slow_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
|
||||
else:
|
||||
slow_tokenizer.model_max_length = tiny_config.max_position_embeddings
|
||||
|
||||
processors = [fast_tokenizer, slow_tokenizer] + feature_extractors
|
||||
processors = [p for p in processors if p is not None]
|
||||
for p in processors:
|
||||
@@ -491,6 +524,12 @@ def build_model(model_arch, tiny_config, output_dir):
|
||||
if os.path.isdir(processor_output_dir):
|
||||
shutil.copytree(processor_output_dir, checkpoint_dir, dirs_exist_ok=True)
|
||||
|
||||
tiny_config = copy.deepcopy(tiny_config)
|
||||
|
||||
if any([model_arch.__name__.endswith(x) for x in ["ForCausalLM", "LMHeadModel"]]):
|
||||
tiny_config.is_encoder_decoder = False
|
||||
tiny_config.is_decoder = True
|
||||
|
||||
model = model_arch(config=tiny_config)
|
||||
model.save_pretrained(checkpoint_dir)
|
||||
model.from_pretrained(checkpoint_dir)
|
||||
@@ -819,7 +858,7 @@ def build(config_class, models_to_create, output_dir):
|
||||
|
||||
for processor_class in processor_classes:
|
||||
try:
|
||||
processor = build_processor(config_class, processor_class)
|
||||
processor = build_processor(config_class, processor_class, allow_no_checkpoint=True)
|
||||
if processor is not None:
|
||||
result["processor"][processor_class] = processor
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user