Add Vision Transformer and ViTFeatureExtractor (#10950)

* Squash all commits into one
* Update ViTFeatureExtractor to use image_utils instead of torchvision
* Remove torchvision and add Pillow
* Small docs improvement
* Address most comments by @sgugger
* Fix tests
* Clean up conversion script
* Pooler first draft
* Fix quality
* Improve conversion script
* Make style and quality
* Make fix-copies
* Minor docs improvements
* Should use fix-copies instead of manual handling
* Revert "Should use fix-copies instead of manual handling"

  This reverts commit fd4e591bce4496d41406425c82606a8fdaf8a50b.

* Place ViT in alphabetical order

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-04-01 17:16:05 +02:00
parent af6732225c
commit 30677dc743
25 changed files with 3072 additions and 22 deletions
--- a/tests/test_image_utils.py
+++ b/tests/test_image_utils.py
@@ -264,7 +264,9 @@ class ImageFeatureExtractionTester(unittest.TestCase):

        # During the conversion rescale and channel first will be applied.
        expected = array.transpose(2, 0, 1).astype(np.float32) / 255.0
-        expected = (expected - np.array(mean)[:, None, None]) / np.array(std)[:, None, None]
+        np_mean = np.array(mean).astype(np.float32)[:, None, None]
+        np_std = np.array(std).astype(np.float32)[:, None, None]
+        expected = (expected - np_mean) / np_std
        self.assertTrue(np.array_equal(normalized_image, expected))

    def test_normalize_array(self):