From 9c6f7485a6ab2364f04df5893ab0e09f3c889b5d Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Tue, 3 Jan 2023 14:17:18 +0100 Subject: [PATCH] Add GIT (GenerativeImage2Text) (#20295) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * First draft * Make model instantiation work * Fix copied from statement * More fixes * Add correct output head * Improve configuration * Add conversion script * Improve conversion script * Remove token_type_ids * Fix conversion of projection layers * Convert all weights * Use cats image * Make logits match * Generate caption on cats image * Add GITProcessor * Update conversion script * Add support for more checkpoints * Fix conversion script * Add initial tests * Remove cross-attention * More improvements * Remove is_decoder * Improve model tests * Improve tests * Improve model outputs * Fix model outputs equivalence * Fix more tests * Remove unused code * Use generate to generate text, no use of cache for now * Use generate more appropriately * Fix config tests * Fix style * Add support for use_cache Co-authored-by: Joao Gante * Fix style * Fix GIT vision encoder * Update README * Fix integration test * Set bos and eos token ids * Improve docs * Improve code * Add support for provided attention_mask * Add copied from statement * Fix gradient checkpointing test * Set model_input_names * Investigate model_input_names * Remove script * Fix model inputs * Fix docstring * Rename GIT to Git * Support more models * Add support for textvqa model * Add video support * Extend conversion script for video * Add support for large variant * Add support for more models * Fix config archive map * Update integration test * Fix README * Fix CLIP mean and std * Update processor * Fix use_cache for video, thanks @gante * Remove print statements * Remove assertion * Add processor tests * Fix model_input_names * Use Auto API for processor * Fix processor tests * Fix integration test * Fix pipeline test * Make tests faster * Update conversion script * Update conversion script * Convert more checkpoints * Update conversion script * Fix typo * Update docstrings * Improve code snippets * Fix doc tests * Add more code examplesé * Fix doc tests * Add integration tests * Fix unused variable * revert * Add GIT to Japanese README Co-authored-by: Niels Rogge Co-authored-by: Joao Gante Co-authored-by: ydshieh --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 + docs/source/en/index.mdx | 2 + docs/source/en/model_doc/git.mdx | 66 + src/transformers/__init__.py | 18 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 3 + .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 1 + .../models/clip/configuration_clip.py | 24 +- .../models/clip/image_processing_clip.py | 4 +- src/transformers/models/git/__init__.py | 64 + .../models/git/configuration_git.py | 265 +++ .../models/git/convert_git_to_pytorch.py | 404 +++++ src/transformers/models/git/modeling_git.py | 1532 +++++++++++++++++ src/transformers/models/git/processing_git.py | 113 ++ src/transformers/utils/dummy_pt_objects.py | 31 + src/transformers/utils/fx.py | 1 + tests/models/git/__init__.py | 0 tests/models/git/test_modeling_git.py | 462 +++++ tests/models/git/test_processor_git.py | 153 ++ .../test_pipelines_text_generation.py | 6 +- utils/check_repo.py | 1 + utils/documentation_tests.txt | 1 + 32 files changed, 3150 insertions(+), 16 deletions(-) create mode 100644 docs/source/en/model_doc/git.mdx create mode 100644 src/transformers/models/git/__init__.py create mode 100644 src/transformers/models/git/configuration_git.py create mode 100644 src/transformers/models/git/convert_git_to_pytorch.py create mode 100644 src/transformers/models/git/modeling_git.py create mode 100644 src/transformers/models/git/processing_git.py create mode 100644 tests/models/git/__init__.py create mode 100644 tests/models/git/test_modeling_git.py create mode 100644 tests/models/git/test_processor_git.py diff --git a/README.md b/README.md index 07c305c6bf..e04144d8c7 100644 --- a/README.md +++ b/README.md @@ -316,6 +316,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. diff --git a/README_es.md b/README_es.md index 78d9a162fe..28a480f0ec 100644 --- a/README_es.md +++ b/README_es.md @@ -316,6 +316,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. diff --git a/README_hd.md b/README_hd.md index 149e38034e..ebf16afa77 100644 --- a/README_hd.md +++ b/README_hd.md @@ -289,6 +289,7 @@ conda install -c huggingface transformers 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (FLAVA: A फाउंडेशनल लैंग्वेज एंड विजन अलाइनमेंट मॉडल) (https://arxiv) साथ वाला पेपर .org/abs/2112.04482) अमनप्रीत सिंह, रोंगहांग हू, वेदानुज गोस्वामी, गुइल्यूम कुएरॉन, वोज्शिएक गालुबा, मार्कस रोहरबैक, और डौवे कीला द्वारा। 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (गूगल रिसर्च से) साथ वाला पेपर [FNet: मिक्सिंग टोकन विद फूरियर ट्रांसफॉर्म्स](https://arxiv.org /abs/2105.03824) जेम्स ली-थॉर्प, जोशुआ आइंस्ली, इल्या एकस्टीन, सैंटियागो ओंटानन द्वारा। 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (सीएमयू/गूगल ब्रेन से) साथ में कागज [फ़नल-ट्रांसफॉर्मर: कुशल भाषा प्रसंस्करण के लिए अनुक्रमिक अतिरेक को छानना](https://arxiv.org/abs/2006.03236) जिहांग दाई, गुओकुन लाई, यिमिंग यांग, क्वोक वी. ले ​​द्वारा रिहाई। +1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST से) साथ वाला पेपर [वर्टिकल कटडेप्थ के साथ मोनोकुलर डेप्थ एस्टीमेशन के लिए ग्लोबल-लोकल पाथ नेटवर्क्स](https:/ /arxiv.org/abs/2201.07436) डोयोन किम, वूंगह्युन गा, प्युंगवान आह, डोंगग्यू जू, सेहवान चुन, जुनमो किम द्वारा। 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (OpenAI से) साथ में दिया गया पेपर [जेनरेटिव प्री-ट्रेनिंग द्वारा भाषा की समझ में सुधार](https://blog .openai.com/language-unsupervised/) एलेक रैडफोर्ड, कार्तिक नरसिम्हन, टिम सालिमन्स और इल्या सुत्स्केवर द्वारा। 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (EleutherAI से) रिपॉजिटरी के साथ [EleutherAI/gpt-neo](https://github.com/ EleutherAI /gpt-neo) रिलीज। सिड ब्लैक, स्टेला बिडरमैन, लियो गाओ, फिल वांग और कॉनर लेही द्वारा पोस्ट किया गया। diff --git a/README_ja.md b/README_ja.md index c7241723a6..ba1c93995f 100644 --- a/README_ja.md +++ b/README_ja.md @@ -351,6 +351,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (Facebook AI から) Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela から公開された研究論文: [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (Google Research から) James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon から公開された研究論文: [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (CMU/Google Brain から) Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le から公開された研究論文: [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) +1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (Microsoft Research から) Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. から公開された研究論文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST から) Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim から公開された研究論文: [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (OpenAI から) Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever から公開された研究論文: [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (EleutherAI から) Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy から公開されたレポジトリー : [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) diff --git a/README_ko.md b/README_ko.md index 4617dde6ad..01aa6aee97 100644 --- a/README_ko.md +++ b/README_ko.md @@ -266,6 +266,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. diff --git a/README_zh-hans.md b/README_zh-hans.md index 01ed683813..aafd81de6f 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -290,6 +290,7 @@ conda install -c huggingface transformers 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (来自 Facebook AI) 伴随论文 [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) 由 Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela 发布。 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。 +1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (来自 Microsoft Research) 伴随论文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) 由 Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang 发布。 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (来自 KAIST) 伴随论文 [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) 由 Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim 发布。 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布。作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 6da2a383ee..4902790af2 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -302,6 +302,7 @@ conda install -c huggingface transformers 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GIT](https://huggingface.co/docs/transformers/main/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. 1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 05bb4ebef7..ead00cea0a 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -512,6 +512,8 @@ title: Donut - local: model_doc/flava title: FLAVA + - local: model_doc/git + title: GIT - local: model_doc/groupvit title: GroupViT - local: model_doc/layoutlmv2 diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index c182189373..f6c9be4236 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -103,6 +103,7 @@ The documentation is organized into five sections: 1. **[FLAVA](model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. 1. **[Funnel Transformer](model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GIT](model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. 1. **[GPT](model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 1. **[GPT Neo](model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. @@ -276,6 +277,7 @@ Flax), PyTorch, and/or TensorFlow. | FLAVA | ❌ | ❌ | ✅ | ❌ | ❌ | | FNet | ✅ | ✅ | ✅ | ❌ | ❌ | | Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | +| GIT | ❌ | ❌ | ✅ | ❌ | ❌ | | GLPN | ❌ | ❌ | ✅ | ❌ | ❌ | | GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | | GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/git.mdx b/docs/source/en/model_doc/git.mdx new file mode 100644 index 0000000000..bc918383af --- /dev/null +++ b/docs/source/en/model_doc/git.mdx @@ -0,0 +1,66 @@ + + +# GIT + +## Overview + +The GIT model was proposed in [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by +Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. GIT is a decoder-only Transformer +that leverages [CLIP](clip)'s vision encoder to condition the model on vision inputs besides text. The model obtains state-of-the-art results on +image captioning and visual question answering benchmarks. + +The abstract from the paper is the following: + +*In this paper, we design and train a Generative Image-to-text Transformer, GIT, to unify vision-language tasks such as image/video captioning and question answering. While generative models provide a consistent network architecture between pre-training and fine-tuning, existing work typically contains complex structures (uni/multi-modal encoder/decoder) and depends on external modules such as object detectors/taggers and optical character recognition (OCR). In GIT, we simplify the architecture as one image encoder and one text decoder under a single language modeling task. We also scale up the pre-training data and the model size to boost the model performance. Without bells and whistles, our GIT establishes new state of the arts on 12 challenging benchmarks with a large margin. For instance, our model surpasses the human performance for the first time on TextCaps (138.2 vs. 125.5 in CIDEr). Furthermore, we present a new scheme of generation-based image classification and scene text recognition, achieving decent performance on standard benchmarks.* + +Tips: + +- GIT is implemented in a very similar way to GPT-2, the only difference being that the model is also conditioned on `pixel_values`. +- One can use [`GitProcessor`] to prepare images for the model, and the `generate` method for autoregressive generation. + + + + GIT architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/microsoft/GenerativeImage2Text). + +## GitVisionConfig + +[[autodoc]] GitVisionConfig + +## GitVisionModel + +[[autodoc]] GitVisionModel + - forward + +## GitConfig + +[[autodoc]] GitConfig + - all + +## GitProcessor + +[[autodoc]] GitProcessor + - __call__ + +## GitModel + +[[autodoc]] GitModel + - forward + +## GitForCausalLM + +[[autodoc]] GitForCausalLM + - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 88c23302ae..4f6b41d072 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -256,6 +256,7 @@ _import_structure = { "models.fnet": ["FNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "FNetConfig"], "models.fsmt": ["FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FSMTConfig", "FSMTTokenizer"], "models.funnel": ["FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP", "FunnelConfig", "FunnelTokenizer"], + "models.git": ["GIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "GitConfig", "GitProcessor", "GitVisionConfig"], "models.glpn": ["GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP", "GLPNConfig"], "models.gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config", "GPT2Tokenizer"], "models.gpt_neo": ["GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoConfig"], @@ -1447,6 +1448,15 @@ else: "load_tf_weights_in_funnel", ] ) + _import_structure["models.git"].extend( + [ + "GIT_PRETRAINED_MODEL_ARCHIVE_LIST", + "GitForCausalLM", + "GitModel", + "GitPreTrainedModel", + "GitVisionModel", + ] + ) _import_structure["models.glpn"].extend( [ "GLPN_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3595,6 +3605,7 @@ if TYPE_CHECKING: from .models.fnet import FNET_PRETRAINED_CONFIG_ARCHIVE_MAP, FNetConfig from .models.fsmt import FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP, FSMTConfig, FSMTTokenizer from .models.funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig, FunnelTokenizer + from .models.git import GIT_PRETRAINED_CONFIG_ARCHIVE_MAP, GitConfig, GitProcessor, GitVisionConfig from .models.glpn import GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP, GLPNConfig from .models.gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config, GPT2Tokenizer from .models.gpt_neo import GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoConfig @@ -4610,6 +4621,13 @@ if TYPE_CHECKING: FunnelPreTrainedModel, load_tf_weights_in_funnel, ) + from .models.git import ( + GIT_PRETRAINED_MODEL_ARCHIVE_LIST, + GitForCausalLM, + GitModel, + GitPreTrainedModel, + GitVisionModel, + ) from .models.glpn import ( GLPN_PRETRAINED_MODEL_ARCHIVE_LIST, GLPNForDepthEstimation, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 201b6707ff..cb3a093c2b 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -73,6 +73,7 @@ from . import ( fnet, fsmt, funnel, + git, glpn, gpt2, gpt_neo, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 4cf28be433..18bc2a6a69 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -77,6 +77,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ("fnet", "FNetConfig"), ("fsmt", "FSMTConfig"), ("funnel", "FunnelConfig"), + ("git", "GitConfig"), ("glpn", "GLPNConfig"), ("gpt-sw3", "GPT2Config"), ("gpt2", "GPT2Config"), @@ -234,6 +235,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( ("fnet", "FNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("fsmt", "FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("funnel", "FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("git", "GIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("glpn", "GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("gpt2", "GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("gpt_neo", "GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -390,6 +392,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ("fnet", "FNet"), ("fsmt", "FairSeq Machine-Translation"), ("funnel", "Funnel Transformer"), + ("git", "GIT"), ("glpn", "GLPN"), ("gpt-sw3", "GPT-Sw3"), ("gpt2", "OpenAI GPT-2"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 6b45997bb2..0fee88153a 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -54,6 +54,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( ("donut-swin", "DonutImageProcessor"), ("dpt", "DPTImageProcessor"), ("flava", "FlavaImageProcessor"), + ("git", ("CLIPImageProcessor", "VideoMAEImageProcessor")), ("glpn", "GLPNImageProcessor"), ("groupvit", "CLIPImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 4d61c4c972..b639b8f93d 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -76,6 +76,7 @@ MODEL_MAPPING_NAMES = OrderedDict( ("fnet", "FNetModel"), ("fsmt", "FSMTModel"), ("funnel", ("FunnelModel", "FunnelBaseModel")), + ("git", "GitModel"), ("glpn", "GLPNModel"), ("gpt-sw3", "GPT2Model"), ("gpt2", "GPT2Model"), @@ -264,6 +265,7 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict( ("fnet", "FNetForMaskedLM"), ("fsmt", "FSMTForConditionalGeneration"), ("funnel", "FunnelForMaskedLM"), + ("git", "GitForCausalLM"), ("gpt-sw3", "GPT2LMHeadModel"), ("gpt2", "GPT2LMHeadModel"), ("gpt_neo", "GPTNeoForCausalLM"), @@ -329,6 +331,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( ("data2vec-text", "Data2VecTextForCausalLM"), ("electra", "ElectraForCausalLM"), ("ernie", "ErnieForCausalLM"), + ("git", "GitForCausalLM"), ("gpt-sw3", "GPT2LMHeadModel"), ("gpt2", "GPT2LMHeadModel"), ("gpt_neo", "GPTNeoForCausalLM"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 16d66b44c4..31d1740ffb 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -46,6 +46,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict( ("clip", "CLIPProcessor"), ("clipseg", "CLIPSegProcessor"), ("flava", "FlavaProcessor"), + ("git", "GITProcessor"), ("groupvit", "CLIPProcessor"), ("layoutlmv2", "LayoutLMv2Processor"), ("layoutlmv3", "LayoutLMv3Processor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index b1c2ae6ecf..c0c700cdf1 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -137,6 +137,7 @@ else: ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)), ("fsmt", ("FSMTTokenizer", None)), ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)), + ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)), ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 1aeda330e4..607a86277c 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -39,15 +39,14 @@ CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = { class CLIPTextConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate an CLIP - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the CLIP + This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP + text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the text encoder of the CLIP [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - Args: vocab_size (`int`, *optional*, defaults to 49408): Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by @@ -65,7 +64,7 @@ class CLIPTextConfig(PretrainedConfig): just in case (e.g., 512 or 1024 or 2048). hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. @@ -73,7 +72,7 @@ class CLIPTextConfig(PretrainedConfig): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float``, *optional*, defaults to 1): + initializer_factor (`float`, *optional*, defaults to 1): A factor for initializing all weight matrices (should be kept to 1, used internally for initialization testing). @@ -149,15 +148,14 @@ class CLIPTextConfig(PretrainedConfig): class CLIPVisionConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate an CLIP - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the CLIP + This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a + CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - Args: hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. @@ -181,7 +179,7 @@ class CLIPVisionConfig(PretrainedConfig): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float``, *optional*, defaults to 1): + initializer_factor (`float`, *optional*, defaults to 1): A factor for initializing all weight matrices (should be kept to 1, used internally for initialization testing). @@ -258,8 +256,8 @@ class CLIPVisionConfig(PretrainedConfig): class CLIPConfig(PretrainedConfig): r""" [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate - CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating a - configuration with the defaults will yield a similar configuration to that of the CLIP + a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating + a configuration with the defaults will yield a similar configuration to that of the CLIP [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the diff --git a/src/transformers/models/clip/image_processing_clip.py b/src/transformers/models/clip/image_processing_clip.py index 2f660a622f..380411b47a 100644 --- a/src/transformers/models/clip/image_processing_clip.py +++ b/src/transformers/models/clip/image_processing_clip.py @@ -70,10 +70,10 @@ class CLIPImageProcessor(BaseImageProcessor): method. do_normalize: Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. - image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): Mean to use if normalizing the image. This is a float or list of floats the length of the number of channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. - image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): Image standard deviation. do_convert_rgb (`bool`, *optional*, defaults to `True`): Standard deviation to use if normalizing the image. This is a float or list of floats the length of the diff --git a/src/transformers/models/git/__init__.py b/src/transformers/models/git/__init__.py new file mode 100644 index 0000000000..539cd3b37a --- /dev/null +++ b/src/transformers/models/git/__init__.py @@ -0,0 +1,64 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available + + +_import_structure = { + "configuration_git": ["GIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "GitConfig", "GitVisionConfig"], + "processing_git": ["GitProcessor"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_git"] = [ + "GIT_PRETRAINED_MODEL_ARCHIVE_LIST", + "GitForCausalLM", + "GitModel", + "GitPreTrainedModel", + "GitVisionModel", + ] + +if TYPE_CHECKING: + from .configuration_git import GIT_PRETRAINED_CONFIG_ARCHIVE_MAP, GitConfig, GitVisionConfig + from .processing_git import GitProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_git import ( + GIT_PRETRAINED_MODEL_ARCHIVE_LIST, + GitForCausalLM, + GitModel, + GitPreTrainedModel, + GitVisionModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/git/configuration_git.py b/src/transformers/models/git/configuration_git.py new file mode 100644 index 0000000000..43b9c50169 --- /dev/null +++ b/src/transformers/models/git/configuration_git.py @@ -0,0 +1,265 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +from typing import Union + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +GIT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "microsoft/git-base": "https://huggingface.co/microsoft/git-base/resolve/main/config.json", +} + + +# Copied from transformers.models.clip.configuration_clip.CLIPVisionConfig with CLIPVision->GitVision, CLIP->GIT, clip->git, openai/git-vit-base-patch32->microsoft/git-base, 32->16 +class GitVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GitVisionModel`]. It is used to instantiate a GIT + vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the vision encoder of the GIT + [microsoft/git-base](https://huggingface.co/microsoft/git-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, + defaults to 1e-5): The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + + Example: + + ```python + >>> from transformers import GitVisionConfig, GitVisionModel + + >>> # Initializing a GitVisionConfig with microsoft/git-base style configuration + >>> configuration = GitVisionConfig() + + >>> # Initializing a GitVisionModel (with random weights) from the microsoft/git-base style configuration + >>> model = GitVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "git_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=16, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from GITConfig + if config_dict.get("model_type") == "git": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class GitConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GitModel`]. It is used to instantiate a GIT model + according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the GIT + [microsoft/git-base](https://huggingface.co/microsoft/git-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`GitVisionConfig`]. + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the GIT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GitModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + num_image_with_embedding (`int`, *optional*): + The number of temporal embeddings to add, in case the model is used for video captioning/VQA. + + Examples: + + ```python + >>> from transformers import GitConfig, GitModel + + >>> # Initializing a GIT microsoft/git-base style configuration + >>> configuration = GitConfig() + + >>> # Initializing a model (with random weights) from the microsoft/git-base style configuration + >>> model = GitModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "git" + + def __init__( + self, + vision_config=None, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=6, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=1024, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + tie_word_embeddings=False, + bos_token_id=101, + eos_token_id=102, + num_image_with_embedding=None, + **kwargs + ): + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs) + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the GitVisionConfig with default values.") + + self.vision_config = GitVisionConfig(**vision_config) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + self.tie_word_embeddings = tie_word_embeddings + self.num_image_with_embedding = num_image_with_embedding + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/src/transformers/models/git/convert_git_to_pytorch.py b/src/transformers/models/git/convert_git_to_pytorch.py new file mode 100644 index 0000000000..f072db7cd9 --- /dev/null +++ b/src/transformers/models/git/convert_git_to_pytorch.py @@ -0,0 +1,404 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert GIT checkpoints from the original repository. + +URL: https://github.com/microsoft/GenerativeImage2Text/tree/main""" + + +import argparse +from pathlib import Path + +import numpy as np +import torch +from PIL import Image +from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor + +import requests +from huggingface_hub import hf_hub_download +from transformers import ( + AutoTokenizer, + CLIPImageProcessor, + GitConfig, + GitForCausalLM, + GitProcessor, + GitVisionConfig, + VideoMAEImageProcessor, +) +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_git_config(model_name): + if "base" in model_name and "vqa" in model_name: + image_size = 480 + elif "large" in model_name and "vqa" in model_name: + image_size = 420 + else: + image_size = 224 + + vision_config = GitVisionConfig(image_size=image_size) + + if "large" in model_name: + vision_config.patch_size = 14 + vision_config.hidden_size = 1024 + vision_config.intermediate_size = 4096 + vision_config.num_hidden_layers = 24 + vision_config.num_attention_heads = 16 + + is_video = "vatex" in model_name or "msrvtt" in model_name + num_image_with_embedding = 6 if is_video else None + config = GitConfig(vision_config=vision_config.to_dict(), num_image_with_embedding=num_image_with_embedding) + + return config, image_size, is_video + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def create_rename_keys(config, prefix=""): + rename_keys = [] + + # image encoder + # ftm: off + rename_keys.append( + (f"{prefix}image_encoder.class_embedding", "git.image_encoder.vision_model.embeddings.class_embedding") + ) + rename_keys.append( + ( + f"{prefix}image_encoder.positional_embedding", + "git.image_encoder.vision_model.embeddings.position_embedding.weight", + ) + ) + rename_keys.append( + (f"{prefix}image_encoder.conv1.weight", "git.image_encoder.vision_model.embeddings.patch_embedding.weight") + ) + rename_keys.append((f"{prefix}image_encoder.ln_pre.weight", "git.image_encoder.vision_model.pre_layrnorm.weight")) + rename_keys.append((f"{prefix}image_encoder.ln_pre.bias", "git.image_encoder.vision_model.pre_layrnorm.bias")) + rename_keys.append( + (f"{prefix}image_encoder.ln_post.weight", "git.image_encoder.vision_model.post_layernorm.weight") + ) + rename_keys.append((f"{prefix}image_encoder.ln_post.bias", "git.image_encoder.vision_model.post_layernorm.bias")) + # fmt: on + rename_keys.append((f"{prefix}image_encoder.proj", "git.image_encoder.visual_projection.weight")) + + # fmt: off + for i in range(config.vision_config.num_hidden_layers): + # image encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.attn.out_proj.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.out_proj.weight")) + rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.attn.out_proj.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.out_proj.bias")) + rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_1.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm1.weight")) + rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_1.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm1.bias")) + rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_fc.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc1.weight")) + rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_fc.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc1.bias")) + rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_proj.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc2.weight")) + rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_proj.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc2.bias")) + rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_2.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm2.weight")) + rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_2.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm2.bias")) + # fmt: on + + # text decoder + # fmt: off + rename_keys.append((f"{prefix}textual.embedding.words.weight", "git.embeddings.word_embeddings.weight")) + rename_keys.append((f"{prefix}textual.embedding.positions.weight", "git.embeddings.position_embeddings.weight")) + rename_keys.append((f"{prefix}textual.visual_projection.0.weight", "git.visual_projection.visual_projection.0.weight")) + rename_keys.append((f"{prefix}textual.visual_projection.0.bias", "git.visual_projection.visual_projection.0.bias")) + rename_keys.append((f"{prefix}textual.visual_projection.1.weight", "git.visual_projection.visual_projection.1.weight")) + rename_keys.append((f"{prefix}textual.visual_projection.1.bias", "git.visual_projection.visual_projection.1.bias")) + + rename_keys.append((f"{prefix}textual.embedding.layer_norm.weight", "git.embeddings.LayerNorm.weight")) + rename_keys.append((f"{prefix}textual.embedding.layer_norm.bias", "git.embeddings.LayerNorm.bias")) + rename_keys.append((f"{prefix}textual.output.weight", "output.weight")) + rename_keys.append((f"{prefix}textual.output.bias", "output.bias")) + for i in range(config.num_hidden_layers): + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.query.weight", f"git.encoder.layer.{i}.attention.self.query.weight")) + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.query.bias", f"git.encoder.layer.{i}.attention.self.query.bias")) + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.key.weight", f"git.encoder.layer.{i}.attention.self.key.weight")) + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.key.bias", f"git.encoder.layer.{i}.attention.self.key.bias")) + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.value.weight", f"git.encoder.layer.{i}.attention.self.value.weight")) + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.value.bias", f"git.encoder.layer.{i}.attention.self.value.bias")) + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.dense.weight", f"git.encoder.layer.{i}.attention.output.dense.weight")) + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.dense.bias", f"git.encoder.layer.{i}.attention.output.dense.bias")) + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.LayerNorm.weight", f"git.encoder.layer.{i}.attention.output.LayerNorm.weight")) + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.LayerNorm.bias", f"git.encoder.layer.{i}.attention.output.LayerNorm.bias")) + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.intermediate.dense.weight", f"git.encoder.layer.{i}.intermediate.dense.weight")) + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.intermediate.dense.bias", f"git.encoder.layer.{i}.intermediate.dense.bias")) + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.dense.weight", f"git.encoder.layer.{i}.output.dense.weight")) + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.dense.bias", f"git.encoder.layer.{i}.output.dense.bias")) + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.LayerNorm.weight", f"git.encoder.layer.{i}.output.LayerNorm.weight")) + rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.LayerNorm.bias", f"git.encoder.layer.{i}.output.LayerNorm.bias")) + # fmt: on + + if config.num_image_with_embedding is not None: + rename_keys.append(("img_temperal_embedding.0", "git.img_temperal_embedding.0")) + rename_keys.append(("img_temperal_embedding.1", "git.img_temperal_embedding.1")) + rename_keys.append(("img_temperal_embedding.2", "git.img_temperal_embedding.2")) + rename_keys.append(("img_temperal_embedding.3", "git.img_temperal_embedding.3")) + rename_keys.append(("img_temperal_embedding.4", "git.img_temperal_embedding.4")) + rename_keys.append(("img_temperal_embedding.5", "git.img_temperal_embedding.5")) + + return rename_keys + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val.T if "image_encoder.visual_projection" in new else val + + +# we split up the matrix of each CLIP encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config, prefix=""): + dim = config.vision_config.hidden_size + for i in range(config.vision_config.num_hidden_layers): + # read in weights + bias of input projection layer (in the original implementation, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"{prefix}image_encoder.transformer.resblocks.{i}.attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"{prefix}image_encoder.transformer.resblocks.{i}.attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[ + :dim, : + ] + state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:dim] + state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ + dim : dim * 2, : + ] + state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[ + dim : dim * 2 + ] + state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[ + -dim:, : + ] + state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-dim:] + + +# We will verify our results on an image +def prepare_img(model_name): + if "textvqa" in model_name: + filepath = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset") + image = Image.open(filepath).convert("RGB") + else: + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + return image + + +def prepare_video(): + from decord import VideoReader, cpu + + # set seed for reproducability + np.random.seed(0) + + def sample_frame_indices(clip_len, frame_sample_rate, seg_len): + converted_len = int(clip_len * frame_sample_rate) + end_idx = np.random.randint(converted_len, seg_len) + start_idx = end_idx - converted_len + indices = np.linspace(start_idx, end_idx, num=clip_len) + indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) + return indices + + # video clip consists of 300 frames (10 seconds at 30 FPS) + file_path = hf_hub_download(repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset") + videoreader = VideoReader(file_path, num_threads=1, ctx=cpu(0)) + + # sample 6 frames + videoreader.seek(0) + indices = sample_frame_indices(clip_len=6, frame_sample_rate=4, seg_len=len(videoreader)) + video = videoreader.get_batch(indices).asnumpy() + + return video + + +@torch.no_grad() +def convert_git_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): + """ + Copy/paste/tweak model's weights to our GIT structure. + """ + + model_name_to_url = { + "git-base": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE/snapshot/model.pt", + "git-base-coco": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_COCO/snapshot/model.pt", + "git-base-textcaps": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_TEXTCAPS/snapshot/model.pt", + "git-base-vqav2": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_VQAv2/snapshot/model.pt", + "git-base-textvqa": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_TEXTVQA/snapshot/model.pt", # todo + "git-base-vatex": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_VATEX/snapshot/model.pt", + "git-base-msrvtt-qa": ( + "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_MSRVTT_QA/snapshot/model.pt" + ), + "git-large": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE/snapshot/model.pt", + "git-large-coco": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_COCO/snapshot/model.pt", + "git-large-textcaps": ( + "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_TEXTCAPS/snapshot/model.pt" + ), + "git-large-vqav2": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_VQAv2/snapshot/model.pt", + "git-large-textvqa": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_TEXTVQA/snapshot/model.pt", + "git-large-vatex": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_VATEX/snapshot/model.pt", + "git-large-msrvtt-qa": ( + "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_MSRVTT_QA/snapshot/model.pt" + ), + } + + model_name_to_path = { + "git-large": "/Users/nielsrogge/Documents/GIT/git_large_model.pt", + "git-large-coco": "/Users/nielsrogge/Documents/GIT/git_large_coco_model.pt", + "git-large-textcaps": "/Users/nielsrogge/Documents/GIT/git_large_textcaps_model.pt", + "git-large-vqav2": "/Users/nielsrogge/Documents/GIT/git_large_vqav2_model.pt", + "git-large-textvqa": "/Users/nielsrogge/Documents/GIT/git_large_textvqa_model.pt", + } + + # define GIT configuration based on model name + config, image_size, is_video = get_git_config(model_name) + if "large" in model_name and not is_video: + # large checkpoints take way too long to download + checkpoint_path = model_name_to_path[model_name] + state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] + else: + checkpoint_url = model_name_to_url[model_name] + state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", file_name=model_name)[ + "model" + ] + # rename keys + prefix = "module." if model_name == "git-base" else "" + rename_keys = create_rename_keys(config, prefix=prefix) + for src, dest in rename_keys: + rename_key(state_dict, src, dest) + read_in_q_k_v(state_dict, config, prefix=prefix) + + # load HuggingFace model + model = GitForCausalLM(config) + missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) + model.eval() + + print("Missing keys:", missing_keys) + print("Unexpected keys:", unexpected_keys) + + assert missing_keys == ["git.embeddings.position_ids", "git.image_encoder.vision_model.embeddings.position_ids"] + assert unexpected_keys == ["git.image_encoder.visual_projection.weight"] + + # verify results + image_processor = ( + VideoMAEImageProcessor( + size={"shortest_edge": image_size}, crop_size={"height": image_size, "width": image_size} + ) + if is_video + else CLIPImageProcessor( + size={"shortest_edge": image_size}, crop_size={"height": image_size, "width": image_size} + ) + ) + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", model_input_names=["input_ids", "attention_mask"]) + processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) + + if is_video: + video = prepare_video() + pixel_values = processor(images=list(video), return_tensors="pt").pixel_values + else: + image = prepare_img(model_name) + image_transforms = Compose( + [ + Resize(image_size, interpolation=Image.BICUBIC), + CenterCrop(image_size), + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), + ] + ) + original_pixel_values = image_transforms(image).unsqueeze(0) + pixel_values = processor(images=image, return_tensors="pt").pixel_values + + assert torch.allclose(pixel_values, original_pixel_values) + + input_ids = torch.tensor([[101]]) + outputs = model(input_ids, pixel_values=pixel_values) + logits = outputs.logits + print("Logits:", logits[0, -1, :3]) + + if model_name == "git-base": + expected_slice_logits = torch.tensor([-1.2832, -1.2835, -1.2840]) + elif model_name == "git-base-coco": + expected_slice_logits = torch.tensor([-0.9925, -0.9930, -0.9935]) + elif model_name == "git-base-textcaps": + expected_slice_logits = torch.tensor([-1.2980, -1.2983, -1.2985]) + elif model_name == "git-base-vqav2": + expected_slice_logits = torch.tensor([-0.8570, -0.8568, -0.8561]) + elif model_name == "git-base-textvqa": + expected_slice_logits = torch.tensor([-1.4085, -1.4083, -1.4082]) + elif model_name == "git-base-vatex": + expected_slice_logits = torch.tensor([-1.3451, -1.3447, -1.3447]) + elif model_name == "git-base-msrvtt-qa": + expected_slice_logits = torch.tensor([-0.8554, -0.8550, -0.8540]) + elif model_name == "git-large": + expected_slice_logits = torch.tensor([-1.1708, -1.1707, -1.1705]) + elif model_name == "git-large-coco": + expected_slice_logits = torch.tensor([-1.0425, -1.0423, -1.0422]) + elif model_name == "git-large-textcaps": + expected_slice_logits = torch.tensor([-1.2705, -1.2708, -1.2706]) + elif model_name == "git-large-vqav2": + expected_slice_logits = torch.tensor([-0.7042, -0.7043, -0.7043]) + elif model_name == "git-large-textvqa": + expected_slice_logits = torch.tensor([-0.8590, -0.8592, -0.8590]) + elif model_name == "git-large-vatex": + expected_slice_logits = torch.tensor([-1.0113, -1.0114, -1.0113]) + elif model_name == "git-large-msrvtt-qa": + expected_slice_logits = torch.tensor([0.0130, 0.0134, 0.0131]) + + assert torch.allclose(logits[0, -1, :3], expected_slice_logits, atol=1e-4) + print("Looks ok!") + + prompt = "" + if "textvqa" in model_name: + prompt = "what does the front of the bus say at the top?" + elif "msrvtt-qa" in model_name: + prompt = "what does the woman eat?" + elif "vqa" in model_name: + prompt = "what are the cats doing?" + input_ids = tokenizer(prompt, add_special_tokens=False).input_ids + input_ids = [processor.tokenizer.cls_token_id] + input_ids + input_ids = torch.tensor(input_ids).unsqueeze(0) + print("Generating caption...") + generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50) + print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True)) + + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model and processor of {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + print(f"Pushing model and processor of {model_name} to the hub...") + model.push_to_hub(f"microsoft/{model_name}") + processor.push_to_hub(f"microsoft/{model_name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="git-base", + type=str, + help="Name of the model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + help="Path to the output PyTorch model directory.", + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether to push the model to the hub.", + ) + + args = parser.parse_args() + convert_git_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py new file mode 100644 index 0000000000..861b27e3bf --- /dev/null +++ b/src/transformers/models/git/modeling_git.py @@ -0,0 +1,1532 @@ +# coding=utf-8 +# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch GIT model.""" + + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...file_utils import ModelOutput +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPast, + BaseModelOutputWithPooling, + CausalLMOutputWithPast, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from .configuration_git import GitConfig, GitVisionConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "microsoft/git-base" +_CONFIG_FOR_DOC = "GitConfig" +_TOKENIZER_FOR_DOC = "BertTokenizer" + +GIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "microsoft/git-base", + # See all GIT models at https://huggingface.co/models?filter=git +] + + +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Git +class GitVisionModelOutput(ModelOutput): + """ + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. + + Args: + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +class GitEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if inputs_embeds is None: + embeddings = self.word_embeddings(input_ids) + else: + embeddings = inputs_embeds + + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class GitSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.image_patch_tokens = int((config.vision_config.image_size / config.vision_config.patch_size) ** 2 + 1) + if config.num_image_with_embedding is not None: + self.image_patch_tokens *= config.num_image_with_embedding + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + pixel_values_present: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + cutoff = self.image_patch_tokens if pixel_values_present else 0 + if past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([key_layer[:, :, :cutoff, :], past_key_value[0], key_layer[:, :, -1:, :]], dim=2) + value_layer = torch.cat( + [value_layer[:, :, :cutoff, :], past_key_value[1], value_layer[:, :, -1:, :]], dim=2 + ) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + # NOTE: like in other caches, we store the text component. In GIT it means we discard the image component. + past_key_value = ( + key_layer[:, :, cutoff:, :], + value_layer[:, :, cutoff:, :], + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in GitModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +class GitSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class GitAttention(nn.Module): + # Copied from transformers.models.bert.modeling_bert.BertAttention.__init__ with Bert->Git + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = GitSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = GitSelfOutput(config) + self.pruned_heads = set() + + # Copied from transformers.models.bert.modeling_bert.BertAttention.prune_heads + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + pixel_values_present: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + past_key_value, + output_attentions, + pixel_values_present, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate +class GitIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput +class GitOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class GitLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = GitAttention(config) + self.intermediate = GitIntermediate(config) + self.output = GitOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + pixel_values_present: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + pixel_values_present=pixel_values_present, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class GitEncoder(nn.Module): + # Copied from transformers.models.bert.modeling_bert.BertEncoder.__init__ with Bert->Git + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([GitLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + pixel_values_present: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPast]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + past_key_value, + output_attentions, + pixel_values_present, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class GitPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = GitConfig + base_model_prefix = "git" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, GitVisionEmbeddings): + nn.init.normal_(module.class_embedding, mean=0.0, std=self.config.initializer_range) + nn.init.normal_(module.patch_embedding.weight, std=self.config.initializer_range) + nn.init.normal_(module.position_embedding.weight, std=self.config.initializer_range) + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (GitEncoder, GitVisionEncoder)): + module.gradient_checkpointing = value + + +GIT_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`GitConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +GIT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`AutoImageProcessor.__call__`] for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->Git +class GitVisionEmbeddings(nn.Module): + def __init__(self, config: GitVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP +class GitVisionMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.clip.modeling_clip.CLIPAttention +class GitVisionAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scale + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {causal_attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->GitVision +class GitVisionEncoderLayer(nn.Module): + def __init__(self, config: GitVisionConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = GitVisionAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim) + self.mlp = GitVisionMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->GitVision, CLIPConfig +class GitVisionEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`GitVisionEncoderLayer`]. + + Args: + config: GitVisionConfig + """ + + def __init__(self, config: GitVisionConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([GitVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + causal_attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +GIT_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class GitVisionTransformer(nn.Module): + # Copied from transformers.models.clip.modeling_clip.CLIPVisionTransformer.__init__ with CLIPEncoder->GitVisionEncoder, CLIP->Git + def __init__(self, config: GitVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = GitVisionEmbeddings(config) + self.pre_layrnorm = nn.LayerNorm(embed_dim) + self.encoder = GitVisionEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim) + + @add_start_docstrings_to_model_forward(GIT_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutput, config_class=GitVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + + last_hidden_state = self.post_layernorm(last_hidden_state) + + if not return_dict: + return (last_hidden_state,) + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=last_hidden_state, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """The vision model from CLIP, used in GIT, without any head or projection on top.""", + GIT_START_DOCSTRING, +) +class GitVisionModel(GitPreTrainedModel): + config_class = GitVisionConfig + main_input_name = "pixel_values" + + # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.__init__ with CLIP->Git + def __init__(self, config: GitVisionConfig): + super().__init__(config) + self.vision_model = GitVisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @add_start_docstrings_to_model_forward(GIT_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutput, config_class=GitVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, GitVisionModel + + >>> processor = AutoProcessor.from_pretrained("microsoft/git-base") + >>> model = GitVisionModel.from_pretrained("microsoft/git-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class GitProjection(nn.Module): + def __init__(self, config: GitConfig): + super().__init__() + self.config = config + self.visual_projection = nn.Sequential( + nn.Linear(config.vision_config.hidden_size, config.hidden_size), nn.LayerNorm(config.hidden_size) + ) + + def forward(self, embeddings: torch.Tensor) -> torch.Tensor: + return self.visual_projection(embeddings) + + +@add_start_docstrings( + "The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states" + " without any specific head on top.", + GIT_START_DOCSTRING, +) +class GitModel(GitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.embeddings = GitEmbeddings(config) + self.image_encoder = GitVisionModel(config.vision_config) + self.encoder = GitEncoder(config) + + self.visual_projection = GitProjection(config) + + if config.num_image_with_embedding is not None: + self.img_temperal_embedding = nn.ParameterList( + nn.Parameter(torch.zeros(1, 1, config.vision_config.hidden_size)) + for _ in range(config.num_image_with_embedding) + ) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def _generate_future_mask(self, size: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor: + # Default mask is for forward direction. Flip for backward direction. + mask = torch.triu(torch.ones(size, size, device=device, dtype=dtype), diagonal=1) + mask = mask.masked_fill(mask == 1, float("-inf")) + return mask + + def create_attention_mask(self, tgt, memory, tgt_mask, past_key_values_length, memory_key_padding_mask=None): + num_tgt = tgt.shape[1] + num_memory = memory.shape[1] + device = tgt.device + dtype = tgt.dtype + top_left = torch.zeros((num_memory, num_memory), device=device, dtype=dtype) + top_right = torch.full( + (num_memory, num_tgt + past_key_values_length), + float("-inf"), + device=tgt.device, + dtype=dtype, + ) + bottom_left = torch.zeros( + (num_tgt, num_memory), + dtype=dtype, + device=tgt_mask.device, + ) + + if past_key_values_length > 0: + tgt_mask = torch.zeros( + (tgt_mask.shape[0], tgt_mask.shape[0] + past_key_values_length), + dtype=dtype, + device=tgt_mask.device, + ) + + left = torch.cat((top_left, bottom_left), dim=0) + right = torch.cat((top_right, tgt_mask.to(dtype)), dim=0) + + full_attention_mask = torch.cat((left, right), dim=1)[None, :] + + if memory_key_padding_mask is None: + memory_key_padding_mask = torch.full((memory.shape[0], memory.shape[1]), fill_value=False, device=device) + # if it is False, it means valid. That is, it is not a padding + if memory_key_padding_mask.dtype != torch.bool: + raise ValueError("Memory key padding mask must be a boolean tensor.") + zero_negative_infinity = torch.zeros_like(memory_key_padding_mask, dtype=tgt.dtype) + zero_negative_infinity[memory_key_padding_mask] = float("-inf") + full_attention_mask = full_attention_mask.expand( + (memory_key_padding_mask.shape[0], num_memory + num_tgt, num_memory + past_key_values_length + num_tgt) + ) + full_attention_mask = full_attention_mask.clone() + origin_left = full_attention_mask[:, :, :num_memory] + update = zero_negative_infinity[:, None, :] + full_attention_mask[:, :, :num_memory] = origin_left + update + + # add axis for multi-head + full_attention_mask = full_attention_mask[:, None, :, :] + + return full_attention_mask + + @add_start_docstrings_to_model_forward(GIT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPooling]: + r""" + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + + Returns: + + Examples: + + ```python + >>> from transformers import AutoProcessor, AutoModel + >>> import requests + >>> from PIL import Image + + >>> processor = AutoProcessor.from_pretrained("microsoft/git-base") + >>> model = AutoModel.from_pretrained("microsoft/git-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> text = "this is an image of two cats" + + >>> inputs = processor(text, images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + seq_length = input_shape[1] + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + projected_visual_features = None + if pixel_values is not None: + if pixel_values.ndim == 4: + # here we assume pixel_values is of shape (batch_size, num_channels, height, width) + visual_features = self.image_encoder(pixel_values).last_hidden_state + + elif pixel_values.ndim == 5: + # here we assume pixel_values is of shape (batch_size, num_frames, num_channels, height, width) + visual_features = [] + for frame_idx in range(pixel_values.shape[1]): + visual_features_frame = self.image_encoder(pixel_values[:, frame_idx, :, :]).last_hidden_state + visual_features_frame += self.img_temperal_embedding[frame_idx] + visual_features.append(visual_features_frame) + + # finally, concatenate all features along sequence dimension + visual_features = torch.cat(visual_features, dim=1) + + else: + raise ValueError("pixel_values must be of rank 4 or 5") + + projected_visual_features = self.visual_projection(visual_features) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + if projected_visual_features is None: + projected_visual_features = torch.zeros( + (embedding_output.shape[0], 0, embedding_output.shape[2]), + dtype=embedding_output.dtype, + device=embedding_output.device, + ) + + # concatenate patch token and text token embeddings + hidden_states = torch.cat((projected_visual_features, embedding_output), dim=1) + + # By default, an additive causal mask is created + # for masking the future (one direction). + tgt_mask = self._generate_future_mask(seq_length, embedding_output.dtype, embedding_output.device) + + # Create an attention mask of shape (batch_size, 1, tgt_seq_len, src_seq_len) + combined_attention_mask = self.create_attention_mask( + tgt=embedding_output, + memory=projected_visual_features, + tgt_mask=tgt_mask, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # if the user provides an attention mask, we add it to the default one + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, embedding_output.dtype, tgt_len=input_shape[-1]).to( + embedding_output.device + ) + if past_key_values_length > 0: + expanded_attn_mask = expanded_attn_mask[:, :, -past_key_values_length:, :] + else: + combined_attention_mask[:, :, -input_shape[1] :, -input_shape[1] :] += expanded_attn_mask + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=combined_attention_mask, + head_mask=head_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + pixel_values_present=pixel_values is not None, + ) + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return BaseModelOutputWithPast( + last_hidden_state=sequence_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """GIT Model with a `language modeling` head on top for autoregressive language modeling.""", GIT_START_DOCSTRING +) +class GitForCausalLM(GitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.git = GitModel(config) + self.output = nn.Linear(config.hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.output + + def set_output_embeddings(self, new_embeddings): + self.output = new_embeddings + + @add_start_docstrings_to_model_forward(GIT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]` + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + + Returns: + + Examples: + + Image captioning example: + + ```python + >>> from transformers import AutoProcessor, AutoModelForCausalLM + >>> import requests + >>> from PIL import Image + + >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco") + >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values + + >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50) + >>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + >>> print(generated_caption) + two cats sleeping on a pink blanket next to remotes. + ``` + + Visual question answering (VQA) example: + + ```python + >>> from transformers import AutoProcessor, AutoModelForCausalLM + >>> from huggingface_hub import hf_hub_download + >>> from PIL import Image + + >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa") + >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa") + + >>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset") + >>> image = Image.open(file_path).convert("RGB") + + >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values + + >>> question = "what does the front of the bus say at the top?" + + >>> input_ids = processor(text=question, add_special_tokens=False).input_ids + >>> input_ids = [processor.tokenizer.cls_token_id] + input_ids + >>> input_ids = torch.tensor(input_ids).unsqueeze(0) + + >>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50) + >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True)) + ['what does the front of the bus say at the top? special'] + ``` + + Video captioning example: + + ```python + >>> from transformers import AutoProcessor, AutoModelForCausalLM + >>> from PIL import Image + >>> import numpy as np + >>> from huggingface_hub import hf_hub_download + >>> from decord import VideoReader, cpu + + >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex") + >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex") + + >>> # set seed for reproducability + >>> np.random.seed(45) + + + >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len): + ... converted_len = int(clip_len * frame_sample_rate) + ... end_idx = np.random.randint(converted_len, seg_len) + ... start_idx = end_idx - converted_len + ... indices = np.linspace(start_idx, end_idx, num=clip_len) + ... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) + ... return indices + + + >>> def sample_frames(file_path, num_frames): + ... videoreader = VideoReader(file_path, num_threads=1, ctx=cpu(0)) + ... videoreader.seek(0) + ... indices = sample_frame_indices(clip_len=num_frames, frame_sample_rate=4, seg_len=len(videoreader)) + ... frames = videoreader.get_batch(indices).asnumpy() + ... return list(frames) + + + >>> # load video + >>> file_path = hf_hub_download( + ... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset" + ... ) + + >>> # sample frames + >>> num_frames = model.config.num_image_with_embedding + >>> frames = sample_frames(file_path, num_frames) + + >>> pixel_values = processor(images=frames, return_tensors="pt").pixel_values + + >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50) + + >>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True)) + Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.'] + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.git( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + pixel_values=pixel_values, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.output(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_logits = logits[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithPast( + loss=lm_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, use_cache=True, **kwargs): + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + input_shape = input_ids.shape + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": kwargs.get("pixel_values", None), + "past_key_values": past, + "use_cache": use_cache, + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py new file mode 100644 index 0000000000..3e11be322b --- /dev/null +++ b/src/transformers/models/git/processing_git.py @@ -0,0 +1,113 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for GIT +""" + +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import BatchEncoding + + +class GitProcessor(ProcessorMixin): + r""" + Constructs a GIT processor which wraps a CLIP image processor and a BERT tokenizer into a single processor. + + [`GitProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`BertTokenizerFast`]. See the + [`~GitProcessor.__call__`] and [`~GitProcessor.decode`] for more information. + + Args: + image_processor ([`AutoImageProcessor`]): + The image processor is a required input. + tokenizer ([`AutoTokenizer`]): + The tokenizer is a required input. + """ + attributes = ["image_processor", "tokenizer"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__(self, image_processor, tokenizer): + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + + def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to + CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + + if text is None and images is None: + raise ValueError("You have to specify either text or images. Both cannot be none.") + + if text is not None: + encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + + if images is not None: + image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + + if text is not None and images is not None: + encoding["pixel_values"] = image_features.pixel_values + return encoding + elif text is not None: + return encoding + else: + return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + return ["input_ids", "attention_mask", "pixel_values"] diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index fe46e68934..652b78c65b 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2588,6 +2588,37 @@ def load_tf_weights_in_funnel(*args, **kwargs): requires_backends(load_tf_weights_in_funnel, ["torch"]) +GIT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class GitForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GitModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GitPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GitVisionModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + GLPN_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 4a44c15b22..c27dcdc64e 100644 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -156,6 +156,7 @@ _SPECIAL_SUPPORTED_MODELS = [ "CLIPTextModelWithProjection", "CLIPVisionModel", "CLIPVisionModelWithProjection", + "GitVisionModel", "GPT2DoubleHeadsModel", "Speech2Text2Decoder", "TrOCRDecoder", diff --git a/tests/models/git/__init__.py b/tests/models/git/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py new file mode 100644 index 0000000000..4bef577ee7 --- /dev/null +++ b/tests/models/git/test_modeling_git.py @@ -0,0 +1,462 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import unittest + +from huggingface_hub import hf_hub_download +from transformers import GitConfig, GitProcessor, GitVisionConfig, is_torch_available, is_vision_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import MODEL_FOR_PRETRAINING_MAPPING, GitForCausalLM, GitModel, GitVisionModel + from transformers.models.git.modeling_git import GIT_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + +class GitVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=32, + patch_size=16, + num_channels=3, + is_training=True, + hidden_size=32, + projection_dim=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return GitVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = GitVisionModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class GitVisionModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as GIT does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (GitVisionModel,) if is_torch_available() else () + fx_compatible = True + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = GitVisionModelTester(self) + self.config_tester = ConfigTester(self, config_class=GitVisionConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="GIT does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_training(self): + pass + + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="GitVisionModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="GitVisionModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_to_base(self): + pass + + @slow + def test_model_from_pretrained(self): + for model_name in GIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = GitVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class GitModelTester: + def __init__( + self, + parent, + num_channels=3, + image_size=32, + patch_size=16, + batch_size=13, + text_seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02, + num_labels=3, + scope=None, + ): + self.parent = parent + self.num_channels = num_channels + self.image_size = image_size + self.patch_size = patch_size + self.batch_size = batch_size + self.text_seq_length = text_seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.scope = scope + + # make sure the BOS, EOS and PAD tokens are within the vocab + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + + # for GIT, the sequence length is the sum of the text and patch tokens, + 1 due to the CLS token + self.seq_length = self.text_seq_length + int((self.image_size / self.patch_size) ** 2) + 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.text_seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.text_seq_length]) + + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + token_labels = None + if self.use_labels: + token_labels = ids_tensor([self.batch_size, self.text_seq_length], self.num_labels) + + config = self.get_config() + + return config, input_ids, input_mask, pixel_values, token_labels + + def get_config(self): + """ + Returns a tiny configuration by default. + """ + return GitConfig( + vision_config={ + "num_channels": self.num_channels, + "image_size": self.image_size, + "patch_size": self.patch_size, + }, + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + ) + + def create_and_check_model(self, config, input_ids, input_mask, pixel_values, token_labels): + model = GitModel(config=config) + model.to(torch_device) + model.eval() + + # inference with pixel values + result = model(input_ids, attention_mask=input_mask, pixel_values=pixel_values) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + # inference without pixel values + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.text_seq_length, self.hidden_size) + ) + + def create_and_check_for_causal_lm(self, config, input_ids, input_mask, pixel_values, token_labels): + model = GitForCausalLM(config=config) + model.to(torch_device) + model.eval() + + # inference with pixel values + result = model(input_ids, attention_mask=input_mask, pixel_values=pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + # inference without pixel values + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.text_seq_length, self.vocab_size)) + + # TODO training + # result = model(input_ids, attention_mask=input_mask, pixel_values=pixel_values) + # self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + # self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + pixel_values, + token_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": input_mask, + "pixel_values": pixel_values, + } + + return config, inputs_dict + + +@require_torch +class GitModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = (GitModel, GitForCausalLM) if is_torch_available() else () + all_generative_model_classes = (GitForCausalLM,) if is_torch_available() else () + fx_compatible = False + test_torchscript = False + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["next_sentence_label"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = GitModelTester(self) + self.config_tester = ConfigTester(self, config_class=GitConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in GIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = GitModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +@require_vision +@slow +class GitModelIntegrationTest(unittest.TestCase): + def test_forward_pass(self): + processor = GitProcessor.from_pretrained("microsoft/git-base") + model = GitForCausalLM.from_pretrained("microsoft/git-base") + + model.to(torch_device) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = processor(images=image, text="hello world", return_tensors="pt").to(torch_device) + + with torch.no_grad(): + outputs = model(**inputs) + + expected_shape = torch.Size((1, 201, 30522)) + self.assertEqual(outputs.logits.shape, expected_shape) + expected_slice = torch.tensor( + [[-0.9514, -0.9512, -0.9507], [-0.5454, -0.5453, -0.5453], [-0.8862, -0.8857, -0.8848]], + device=torch_device, + ) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4)) + + def test_inference_image_captioning(self): + processor = GitProcessor.from_pretrained("microsoft/git-base") + model = GitForCausalLM.from_pretrained("microsoft/git-base") + model.to(torch_device) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = processor(images=image, return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) + + outputs = model.generate( + pixel_values=pixel_values, max_length=20, output_scores=True, return_dict_in_generate=True + ) + generated_caption = processor.batch_decode(outputs.sequences, skip_special_tokens=True)[0] + + expected_shape = torch.Size((1, 9)) + self.assertEqual(outputs.sequences.shape, expected_shape) + self.assertEquals(generated_caption, "two cats laying on a pink blanket") + self.assertTrue(outputs.scores[-1].shape, expected_shape) + expected_slice = torch.tensor([[-0.8805, -0.8803, -0.8799]], device=torch_device) + self.assertTrue(torch.allclose(outputs.scores[-1][0, :3], expected_slice, atol=1e-4)) + + def test_visual_question_answering(self): + processor = GitProcessor.from_pretrained("microsoft/git-base-textvqa") + model = GitForCausalLM.from_pretrained("microsoft/git-base-textvqa") + model.to(torch_device) + + # prepare image + file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset") + image = Image.open(file_path).convert("RGB") + inputs = processor(images=image, return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) + + # prepare question + question = "what does the front of the bus say at the top?" + input_ids = processor(text=question, add_special_tokens=False).input_ids + input_ids = [processor.tokenizer.cls_token_id] + input_ids + input_ids = torch.tensor(input_ids).unsqueeze(0).to(torch_device) + + generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=20) + generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + + expected_shape = torch.Size((1, 15)) + self.assertEqual(generated_ids.shape, expected_shape) + self.assertEquals(generated_caption, "what does the front of the bus say at the top? special") diff --git a/tests/models/git/test_processor_git.py b/tests/models/git/test_processor_git.py new file mode 100644 index 0000000000..95e436d8e4 --- /dev/null +++ b/tests/models/git/test_processor_git.py @@ -0,0 +1,153 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import shutil +import tempfile +import unittest + +import numpy as np +import pytest + +from transformers.testing_utils import require_vision +from transformers.utils import is_vision_available + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoProcessor, BertTokenizer, CLIPImageProcessor, GitProcessor, PreTrainedTokenizerFast + + +@require_vision +class GitProcessorTest(unittest.TestCase): + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + image_processor = CLIPImageProcessor() + tokenizer = BertTokenizer.from_pretrained( + "hf-internal-testing/tiny-random-BertModel", model_input_names=["input_ids", "attention_mask"] + ) + + processor = GitProcessor(image_processor, tokenizer) + + processor.save_pretrained(self.tmpdirname) + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def prepare_image_inputs(self): + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. + """ + + image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] + + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + + return image_inputs + + def test_save_load_pretrained_additional_features(self): + processor = GitProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + + processor = GitProcessor.from_pretrained( + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, CLIPImageProcessor) + + def test_image_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) + + image_input = self.prepare_image_inputs() + + input_feat_extract = image_processor(image_input, return_tensors="np") + input_processor = processor(images=image_input, return_tensors="np") + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str, return_token_type_ids=False) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"]) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + def test_tokenizer_decode(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) + + def test_model_input_names(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + # For now the processor supports only ['input_ids', 'attention_mask', 'pixel_values'] + self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"]) diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py index 4de6f878dd..922a4e24b2 100644 --- a/tests/pipelines/test_pipelines_text_generation.py +++ b/tests/pipelines/test_pipelines_text_generation.py @@ -212,7 +212,11 @@ class TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM # it requires BOS token to exist. # Special case for Pegasus which will always append EOS so will # work even without BOS. - if text_generator.tokenizer.bos_token_id is not None or "Pegasus" in tokenizer.__class__.__name__: + if ( + text_generator.tokenizer.bos_token_id is not None + or "Pegasus" in tokenizer.__class__.__name__ + or "Git" in model.__class__.__name__ + ): outputs = text_generator("") self.assertEqual(outputs, [{"generated_text": ANY(str)}]) else: diff --git a/utils/check_repo.py b/utils/check_repo.py index 73efe4d388..eb342671d8 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -148,6 +148,7 @@ TEST_FILES_WITH_NO_COMMON_TESTS = [ # should **not** be the rule. IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping + "GitVisionModel", "BlipForConditionalGeneration", "BlipForImageTextRetrieval", "BlipForQuestionAnswering", diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index b0fabcfd24..7839f58a20 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -81,6 +81,7 @@ src/transformers/models/ernie/configuration_ernie.py src/transformers/models/flava/configuration_flava.py src/transformers/models/fnet/configuration_fnet.py src/transformers/models/fsmt/configuration_fsmt.py +src/transformers/models/git/modeling_git.py src/transformers/models/glpn/modeling_glpn.py src/transformers/models/gpt2/configuration_gpt2.py src/transformers/models/gpt2/modeling_gpt2.py