Add Vision Transformer and ViTFeatureExtractor (#10950)

* Squash all commits into one
* Update ViTFeatureExtractor to use image_utils instead of torchvision
* Remove torchvision and add Pillow
* Small docs improvement
* Address most comments by @sgugger
* Fix tests
* Clean up conversion script
* Pooler first draft
* Fix quality
* Improve conversion script
* Make style and quality
* Make fix-copies
* Minor docs improvements
* Should use fix-copies instead of manual handling
* Revert "Should use fix-copies instead of manual handling"

  This reverts commit fd4e591bce4496d41406425c82606a8fdaf8a50b.

* Place ViT in alphabetical order

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-04-01 17:16:05 +02:00
parent af6732225c
commit 30677dc743
25 changed files with 3072 additions and 22 deletions
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -34,6 +34,7 @@ if is_torch_available():
    from transformers import (
        BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
        MODEL_FOR_CAUSAL_LM_MAPPING,
+        MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
        MODEL_FOR_MASKED_LM_MAPPING,
        MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
        MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
@@ -99,6 +100,7 @@ class ModelTesterMixin:
            elif model_class in [
                *MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.values(),
                *MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING.values(),
+                *MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.values(),
            ]:
                inputs_dict["labels"] = torch.zeros(
                    self.model_tester.batch_size, dtype=torch.long, device=torch_device