From 3d66146afcc400b01ce59fcfed9cdb1c59016e33 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Mon, 13 Dec 2021 14:13:39 +0100 Subject: [PATCH] Fixing tests for Perceiver (#14745) - Do not run image-classification pipeline (_CHECKPOINT_FOR_DOC uses the checkpoint for language, which cannot load a FeatureExtractor so current logic fails). - Add a safeguard to not run tests when `tokenizer_class` or `feature_extractor_class` **are** defined, but cannot be loaded. This happens for Perceiver for the "FastTokenizer" (which doesn't exist so None) and FeatureExtractor (which does exist but cannot be loaded because the checkpoint doesn't define one which is reasonable for the said checkpoint) - Added `get_vocab` function to `PerceiverTokenizer` since it is used by `fill-mask` pipeline when the argument `targets` is used to narrow a subset of possible values. Co-authored-by: Nicolas Patry --- .../models/auto/feature_extraction_auto.py | 1 + .../models/perceiver/tokenization_perceiver.py | 12 ++++++++++-- tests/test_pipelines_common.py | 6 ++++++ tests/test_pipelines_image_classification.py | 11 ++++++++++- 4 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index c7903c941e..45f12953f9 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -43,6 +43,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict( ("detr", "DetrFeatureExtractor"), ("layoutlmv2", "LayoutLMv2FeatureExtractor"), ("clip", "CLIPFeatureExtractor"), + ("perceiver", "PerceiverFeatureExtractor"), ] ) diff --git a/src/transformers/models/perceiver/tokenization_perceiver.py b/src/transformers/models/perceiver/tokenization_perceiver.py index 9349c5aac4..95c4ee6683 100644 --- a/src/transformers/models/perceiver/tokenization_perceiver.py +++ b/src/transformers/models/perceiver/tokenization_perceiver.py @@ -87,7 +87,7 
@@ class PerceiverTokenizer(PreTrainedTokenizer): self._utf_vocab_size = 2 ** 8 # utf is 8 bits # define special tokens dict - self.special_tokens_encoder: Dict[int, str] = { + self.special_tokens_encoder: Dict[str, int] = { self.pad_token: 0, self.bos_token: 1, self.eos_token: 2, @@ -96,7 +96,15 @@ class PerceiverTokenizer(PreTrainedTokenizer): self.sep_token: 5, } self._num_special_tokens = len(self.special_tokens_encoder) - self.special_tokens_decoder: Dict[str, int] = {v: k for k, v in self.special_tokens_encoder.items()} + self.special_tokens_decoder: Dict[int, str] = {v: k for k, v in self.special_tokens_encoder.items()} + + def get_vocab(self) -> Dict[str, int]: + vocab = self.special_tokens_encoder.copy() + vocab.update(self.added_tokens_encoder) + for i in range(self._utf_vocab_size): + token = chr(i) + vocab[token] = i + len(self.special_tokens_encoder) + return vocab @property def vocab_size(self): diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py index 62a1831dc4..eaa67579c7 100644 --- a/tests/test_pipelines_common.py +++ b/tests/test_pipelines_common.py @@ -169,6 +169,11 @@ class PipelineTestCaseMeta(type): else: tokenizer = None feature_extractor = get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config) + + if tokenizer is None and feature_extractor is None: + self.skipTest( + f"Ignoring {ModelClass}, cannot create a tokenizer or feature_extractor (PerceiverConfig with no FastTokenizer ?)" + ) pipeline, examples = self.get_test_pipeline(model, tokenizer, feature_extractor) if pipeline is None: # The test can disable itself, but it should be very marginal @@ -213,6 +218,7 @@ class PipelineTestCaseMeta(type): if not tokenizer_classes: # We need to test even if there are no tokenizers. 
tokenizer_classes = [None] + for tokenizer_class in tokenizer_classes: if tokenizer_class is not None: tokenizer_name = tokenizer_class.__name__ diff --git a/tests/test_pipelines_image_classification.py b/tests/test_pipelines_image_classification.py index a8f5cb4da3..f61ffea2df 100644 --- a/tests/test_pipelines_image_classification.py +++ b/tests/test_pipelines_image_classification.py @@ -14,7 +14,12 @@ import unittest -from transformers import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, PreTrainedTokenizer, is_vision_available +from transformers import ( + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + PerceiverConfig, + PreTrainedTokenizer, + is_vision_available, +) from transformers.pipelines import ImageClassificationPipeline, pipeline from transformers.testing_utils import ( is_pipeline_test, @@ -45,6 +50,10 @@ class ImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING def get_test_pipeline(self, model, tokenizer, feature_extractor): + if isinstance(model.config, PerceiverConfig): + self.skipTest( + "Perceiver model tester is defined with a language one, which has no feature_extractor, so the automated test cannot work here" + ) image_classifier = ImageClassificationPipeline(model=model, feature_extractor=feature_extractor) examples = [