Compare commits
15 Commits
v4.51.3-D-
...
v4.51.3-Ll
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
63cd4c76f3 | ||
|
|
34f26e2c3e | ||
|
|
a57274466f | ||
|
|
481de7204c | ||
|
|
5f8d17268c | ||
|
|
50f8caaa48 | ||
|
|
91f3e9422f | ||
|
|
c34afa5957 | ||
|
|
66ad8b2db0 | ||
|
|
096f25ae1f | ||
|
|
da7ae467c4 | ||
|
|
aa6b79db43 | ||
|
|
517367fe9a | ||
|
|
755b0fa2fe | ||
|
|
3a1acc36ed |
@@ -28,6 +28,8 @@ COMMON_ENV_VARIABLES = {
|
||||
"TRANSFORMERS_IS_CI": True,
|
||||
"PYTEST_TIMEOUT": 120,
|
||||
"RUN_PIPELINE_TESTS": False,
|
||||
# will be adjusted in `CircleCIJob.to_dict`.
|
||||
"RUN_FLAKY": True,
|
||||
}
|
||||
# Disable the use of {"s": None} as the output is way too long, making navigation on CircleCI impractical
|
||||
COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "vvv": None, "rsfE":None}
|
||||
@@ -126,6 +128,8 @@ class CircleCIJob:
|
||||
|
||||
def to_dict(self):
|
||||
env = COMMON_ENV_VARIABLES.copy()
|
||||
# Do not run tests decorated by @is_flaky on pull requests
|
||||
env['RUN_FLAKY'] = os.environ.get("CIRCLE_PULL_REQUEST", "") == ""
|
||||
env.update(self.additional_env)
|
||||
|
||||
job = {
|
||||
|
||||
@@ -149,6 +149,8 @@
|
||||
title: TPU
|
||||
- local: perf_train_special
|
||||
title: Apple Silicon
|
||||
- local: perf_train_gaudi
|
||||
title: Intel Gaudi
|
||||
- local: perf_hardware
|
||||
title: Build your own machine
|
||||
title: Hardware
|
||||
|
||||
34
docs/source/en/perf_train_gaudi.md
Normal file
34
docs/source/en/perf_train_gaudi.md
Normal file
@@ -0,0 +1,34 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Intel Gaudi
|
||||
|
||||
The Intel Gaudi AI accelerator family includes [Intel Gaudi 1](https://habana.ai/products/gaudi/), [Intel Gaudi 2](https://habana.ai/products/gaudi2/), and [Intel Gaudi 3](https://habana.ai/products/gaudi3/). Each server is equipped with 8 devices, known as Habana Processing Units (HPUs), providing 128GB of memory on Gaudi 3, 96GB on Gaudi 2, and 32GB on the first-gen Gaudi. For more details on the underlying hardware architecture, check out the [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html) overview.
|
||||
|
||||
[`TrainingArguments`], [`Trainer`] and [`Pipeline`] detect and set the backend device to `hpu` if an Intel Gaudi device is available. No additional changes are required to enable training and inference on your device.
|
||||
|
||||
Some modeling code in Transformers is not optimized for HPU lazy mode. If you encounter any errors, set the environment variable below to use eager mode:
|
||||
```
|
||||
PT_HPU_LAZY_MODE=0
|
||||
```
|
||||
|
||||
In some cases, you'll also need to enable int64 support to avoid casting issues with long integers:
|
||||
```
|
||||
PT_ENABLE_INT64_SUPPORT=1
|
||||
```
|
||||
Refer to the [Gaudi docs](https://docs.habana.ai/en/latest/index.html) for more details.
|
||||
|
||||
> [!TIP]
|
||||
> For training and inference with Gaudi-optimized model implementations, we recommend using [Optimum for Intel Gaudi](https://huggingface.co/docs/optimum/main/en/habana/index).
|
||||
@@ -354,8 +354,8 @@
|
||||
title: (번역중) DistilBERT
|
||||
- local: in_translation
|
||||
title: (번역중) DPR
|
||||
- local: in_translation
|
||||
title: (번역중) ELECTRA
|
||||
- local: model_doc/electra
|
||||
title: ELECTRA
|
||||
- local: model_doc/encoder-decoder
|
||||
title: 인코더 디코더 모델
|
||||
- local: in_translation
|
||||
|
||||
196
docs/source/ko/model_doc/electra.md
Normal file
196
docs/source/ko/model_doc/electra.md
Normal file
@@ -0,0 +1,196 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# ELECTRA[[electra]]
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
|
||||
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuh
YkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
|
||||
">
|
||||
</div>
|
||||
|
||||
## 개요[[overview]]
|
||||
|
||||
ELECTRA 모델은 [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than
|
||||
Generators](https://openreview.net/pdf?id=r1xMH1BtvB) 논문에서 제안되었습니다. ELECTRA는 두 가지 트랜스포머 모델인 생성 모델과 판별 모델을 학습시키는 새로운 사전학습 접근법입니다. 생성 모델의 역할은 시퀀스에 있는 토큰을 대체하는 것이며 마스킹된 언어 모델로 학습됩니다. 우리가 관심을 가진 판별 모델은 시퀀스에서 어떤 토큰이 생성 모델에 의해 대체되었는지 식별합니다.
|
||||
|
||||
논문의 초록은 다음과 같습니다:
|
||||
|
||||
*BERT와 같은 마스킹된 언어 모델(MLM) 사전학습 방법은 일부 토큰을 [MASK] 토큰으로 바꿔 손상시키고 난 뒤, 모델이 다시 원본 토큰을 복원하도록 학습합니다. 이런 방식은 다운스트림 NLP 작업으로 전이할 때 좋은 성능을 내지만, 효과적으로 사용하기 위해서는 일반적으로 많은 양의 연산이 필요합니다. 따라서 대안으로, 대체 토큰 탐지라고 불리는 샘플 효율적인 사전학습을 제안합니다. 우리의 방법론은 입력에 마스킹을 하는 대신에 소형 생성 모델의 그럴듯한 대안 토큰으로 손상시킵니다. 그리고 나서, 모델이 손상된 토큰의 원래 토큰을 예측하도록 훈련시키는 대신, 각각의 토큰이 생성 모델의 샘플로 대체되었는지 아닌지를 예측하는 판별 모델을 학습합니다. 실험을 통해 이 새로운 사전학습 방식이 마스킹된 일부 토큰에만 적용되는 기존 방식과 달리 모든 입력 토큰에 대해 학습이 이뤄지기 때문에 마스킹된 언어 모델(MLM)보다 더 효율적임을 입증하였습니다. 결과적으로 소개된 방식이 같은 모델 크기, 데이터, 연산량을 가진 BERT 모델로 학습한 결과를 압도하는 문맥 표현 학습을 할 수 있다는 것을 확인했습니다. 특히 작은 모델에서 성능 향상이 두드러지며, 예를 들어 GPU 한 대로 4일간 학습한 모델이 30배 더 많은 계산 자원을 사용한 GPT보다 GLUE 자연어 이해 벤치마크에서 더 나은 성능을 보입니다. 대규모 환경에서도 유효하며 더 적은 연산량으로 RoBERTa와 XLNet과 비슷한 성능을 낼 수 있으며, 동일한 연산량을 가질 경우 이들의 성능을 능가합니다.*
|
||||
|
||||
|
||||
이 모델은 [lysandre](https://huggingface.co/lysandre)가 기여했습니다. 원본 코드는 [이곳](https://github.com/google-research/electra)에서 찾아보실 수 있습니다.
|
||||
|
||||
## 사용 팁[[usage-tips]]
|
||||
|
||||
- ELECTRA는 사전학습 방법으로 기본 모델인 BERT의 구조와 거의 차이가 없습니다. 유일한 차이는 임베딩 크기와 히든 크기를 구분했다는 점입니다. 임베딩 크기는 일반적으로 더 작고, 히든 크기는 더 큽니다. 임베딩에서 임베딩 크기를 히든 크기로 변환하기 위해 추가로 선형 변환 층이 사용됩니다. 임베딩 크기와 히든 크기가 동일할 경우에는 이 선형 변환 층이 필요하지 않습니다.
|
||||
- ELECTRA는 또 다른 (작은) 마스킹된 언어 모델을 사용해 사전학습 된 트랜스포머 모델입니다. 작은 언어 모델이 입력 텍스트의 일부를 무작위로 마스킹하고, 그 자리에 새로운 토큰을 삽입합니다. ELECTRA는 원래 토큰과 대체된 토큰을 구분하는 역할을 수행합니다. GAN 훈련과 비슷하지만, 생성 모델은 ELECTRA 모델을 속이는 것이 아니라 원래 텍스트를 복원하는 목표로 몇 단계 학습합니다. 그 후 ELECTRA가 학습을 하게 됩니다.
|
||||
- [구글 리서치의 구현](https://github.com/google-research/electra)으로 저장된 ELECTRA 체크포인트는 생성 모델과 판별 모델을 포함합니다. 변환 스크립트에서는 사용자가 어떤 모델을 어떤 아키텍처로 내보낼지 명시해야 합니다. 일단 Hugging Face 포맷으로 변환되면, 이 체크포인트들은 모든 ELECTRA 모델에서 불러올 수 있습니다. 즉, 판별 모델은 [`ElectraForMaskedLM`] 모델에, 생성 모델은 [`ElectraForPreTraining`] 모델에 불러올 수 있다는 의미입니다. (단, 생성 모델에는 분류 헤드가 존재하지 않기 때문에, 해당 부분은 무작위로 초기화됩니다.)
|
||||
|
||||
## 참고 자료[[resources]]
|
||||
|
||||
- [텍스트 분류 가이드](../tasks/sequence_classification)
|
||||
- [토큰 분류 가이드](../tasks/token_classification)
|
||||
- [질의 응답 가이드](../tasks/question_answering)
|
||||
- [인과 언어 모델링 가이드](../tasks/language_modeling)
|
||||
- [마스킹된 언어 모델링 가이드](../tasks/masked_language_modeling)
|
||||
- [객관식 문제 가이드](../tasks/multiple_choice)
|
||||
|
||||
## ElectraConfig
|
||||
|
||||
[[autodoc]] ElectraConfig
|
||||
|
||||
## ElectraTokenizer
|
||||
|
||||
[[autodoc]] ElectraTokenizer
|
||||
|
||||
## ElectraTokenizerFast
|
||||
|
||||
[[autodoc]] ElectraTokenizerFast
|
||||
|
||||
## Electra specific outputs
|
||||
|
||||
[[autodoc]] models.electra.modeling_electra.ElectraForPreTrainingOutput
|
||||
|
||||
[[autodoc]] models.electra.modeling_tf_electra.TFElectraForPreTrainingOutput
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
## ElectraModel
|
||||
|
||||
[[autodoc]] ElectraModel
|
||||
- forward
|
||||
|
||||
## ElectraForPreTraining
|
||||
|
||||
[[autodoc]] ElectraForPreTraining
|
||||
- forward
|
||||
|
||||
## ElectraForCausalLM
|
||||
|
||||
[[autodoc]] ElectraForCausalLM
|
||||
- forward
|
||||
|
||||
## ElectraForMaskedLM
|
||||
|
||||
[[autodoc]] ElectraForMaskedLM
|
||||
- forward
|
||||
|
||||
## ElectraForSequenceClassification
|
||||
|
||||
[[autodoc]] ElectraForSequenceClassification
|
||||
- forward
|
||||
|
||||
## ElectraForMultipleChoice
|
||||
|
||||
[[autodoc]] ElectraForMultipleChoice
|
||||
- forward
|
||||
|
||||
## ElectraForTokenClassification
|
||||
|
||||
[[autodoc]] ElectraForTokenClassification
|
||||
- forward
|
||||
|
||||
## ElectraForQuestionAnswering
|
||||
|
||||
[[autodoc]] ElectraForQuestionAnswering
|
||||
- forward
|
||||
|
||||
</pt>
|
||||
<tf>
|
||||
|
||||
## TFElectraModel
|
||||
|
||||
[[autodoc]] TFElectraModel
|
||||
- call
|
||||
|
||||
## TFElectraForPreTraining
|
||||
|
||||
[[autodoc]] TFElectraForPreTraining
|
||||
- call
|
||||
|
||||
## TFElectraForMaskedLM
|
||||
|
||||
[[autodoc]] TFElectraForMaskedLM
|
||||
- call
|
||||
|
||||
## TFElectraForSequenceClassification
|
||||
|
||||
[[autodoc]] TFElectraForSequenceClassification
|
||||
- call
|
||||
|
||||
## TFElectraForMultipleChoice
|
||||
|
||||
[[autodoc]] TFElectraForMultipleChoice
|
||||
- call
|
||||
|
||||
## TFElectraForTokenClassification
|
||||
|
||||
[[autodoc]] TFElectraForTokenClassification
|
||||
- call
|
||||
|
||||
## TFElectraForQuestionAnswering
|
||||
|
||||
[[autodoc]] TFElectraForQuestionAnswering
|
||||
- call
|
||||
|
||||
</tf>
|
||||
<jax>
|
||||
|
||||
## FlaxElectraModel
|
||||
|
||||
[[autodoc]] FlaxElectraModel
|
||||
- __call__
|
||||
|
||||
## FlaxElectraForPreTraining
|
||||
|
||||
[[autodoc]] FlaxElectraForPreTraining
|
||||
- __call__
|
||||
|
||||
## FlaxElectraForCausalLM
|
||||
|
||||
[[autodoc]] FlaxElectraForCausalLM
|
||||
- __call__
|
||||
|
||||
## FlaxElectraForMaskedLM
|
||||
|
||||
[[autodoc]] FlaxElectraForMaskedLM
|
||||
- __call__
|
||||
|
||||
## FlaxElectraForSequenceClassification
|
||||
|
||||
[[autodoc]] FlaxElectraForSequenceClassification
|
||||
- __call__
|
||||
|
||||
## FlaxElectraForMultipleChoice
|
||||
|
||||
[[autodoc]] FlaxElectraForMultipleChoice
|
||||
- __call__
|
||||
|
||||
## FlaxElectraForTokenClassification
|
||||
|
||||
[[autodoc]] FlaxElectraForTokenClassification
|
||||
- __call__
|
||||
|
||||
## FlaxElectraForQuestionAnswering
|
||||
|
||||
[[autodoc]] FlaxElectraForQuestionAnswering
|
||||
- __call__
|
||||
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
@@ -376,7 +376,7 @@ class DynamicCache(Cache):
|
||||
self.key_cache.append(key_states)
|
||||
self.value_cache.append(value_states)
|
||||
|
||||
def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
|
||||
def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
|
||||
sequence length.
|
||||
@@ -649,7 +649,7 @@ class OffloadedCache(DynamicCache):
|
||||
self.key_cache[prev_layer_idx] = self.key_cache[prev_layer_idx].to("cpu", non_blocking=True)
|
||||
self.value_cache[prev_layer_idx] = self.value_cache[prev_layer_idx].to("cpu", non_blocking=True)
|
||||
|
||||
def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
|
||||
def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
"Gets the cache for this layer to the device. Prefetches the next and evicts the previous layer."
|
||||
if layer_idx < len(self):
|
||||
# Evict the previous layer if necessary
|
||||
@@ -1473,7 +1473,7 @@ class EncoderDecoderCache(Cache):
|
||||
for layer_idx in range(len(cross_attention_cache.key_cache)):
|
||||
self.is_updated[layer_idx] = bool(cross_attention_cache.get_seq_length(layer_idx) > 0)
|
||||
|
||||
def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
|
||||
def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
|
||||
sequence length.
|
||||
|
||||
@@ -151,17 +151,24 @@ def get_imports(filename: Union[str, os.PathLike]) -> list[str]:
|
||||
content = f.read()
|
||||
imported_modules = set()
|
||||
|
||||
import transformers.utils
|
||||
|
||||
def recursive_look_for_imports(node):
|
||||
if isinstance(node, ast.Try):
|
||||
return # Don't recurse into Try blocks and ignore imports in them
|
||||
return # Don't recurse into Try blocks and ignore imports in them
|
||||
elif isinstance(node, ast.If):
|
||||
test = node.test
|
||||
for condition_node in ast.walk(test):
|
||||
if isinstance(condition_node, ast.Call) and getattr(condition_node.func, "id", "").startswith(
|
||||
"is_flash_attn"
|
||||
):
|
||||
# Don't recurse into "if flash_attn_available()" blocks and ignore imports in them
|
||||
return
|
||||
if isinstance(condition_node, ast.Call):
|
||||
check_function = getattr(condition_node.func, "id", "")
|
||||
if (
|
||||
check_function.endswith("available")
|
||||
and check_function.startswith("is_flash_attn")
|
||||
or hasattr(transformers.utils.import_utils, check_function)
|
||||
):
|
||||
# Don't recurse into "if flash_attn_available()" or any "if library_available" blocks
|
||||
# that appear in `transformers.utils.import_utils` and ignore imports in them
|
||||
return
|
||||
elif isinstance(node, ast.Import):
|
||||
# Handle 'import x' statements
|
||||
for alias in node.names:
|
||||
|
||||
@@ -376,7 +376,7 @@ def infer_channel_dimension_format(
|
||||
|
||||
if image.shape[first_dim] in num_channels and image.shape[last_dim] in num_channels:
|
||||
logger.warning(
|
||||
f"The channel dimension is ambiguous. Got image shape {image.shape}. Assuming channels are the first dimension."
|
||||
f"The channel dimension is ambiguous. Got image shape {image.shape}. Assuming channels are the first dimension. Use the [input_data_format](https://huggingface.co/docs/transformers/main/internal/image_processing_utils#transformers.image_transforms.rescale.input_data_format) parameter to assign the channel dimension."
|
||||
)
|
||||
return ChannelDimension.FIRST
|
||||
elif image.shape[first_dim] in num_channels:
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations under the License.
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
@@ -50,14 +51,22 @@ class TorchExportableModuleForDecoderOnlyLM(torch.nn.Module):
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
if model.config.cache_implementation == "static":
|
||||
if not hasattr(model.config, "use_cache") or model.config.use_cache is False:
|
||||
raise ValueError("The model must have caching enabled to be performant.")
|
||||
|
||||
if not hasattr(model.config, "cache_implementation"):
|
||||
# If `cache_implementation` is not specified explicitly in the config, `DynamicCache` will
|
||||
# be used by default, so export will use `StaticCache` by default.
|
||||
logging.info("Using `StaticCache` for export as `cache_implementation` is not specified in the config.")
|
||||
self.model = TorchExportableModuleWithStaticCache(model)
|
||||
elif model.config.cache_implementation == "hybrid":
|
||||
self.model = TorchExportableModuleWithHybridCache(model, max_batch_size, max_cache_len)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unsupported cache implementation in this export recipe: '{model.config.cache_implementation}'"
|
||||
)
|
||||
if model.config.cache_implementation == "hybrid":
|
||||
self.model = TorchExportableModuleWithHybridCache(model, max_batch_size, max_cache_len)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unsupported cache implementation: {model.config.cache_implementation}. "
|
||||
"Please use `hybrid` or `static`."
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@@ -462,6 +471,8 @@ def convert_and_export_with_cache(
|
||||
model: PreTrainedModel,
|
||||
example_input_ids: Optional[torch.Tensor] = None,
|
||||
example_cache_position: Optional[torch.Tensor] = None,
|
||||
dynamic_shapes: Optional[dict] = None,
|
||||
strict: Optional[bool] = None,
|
||||
):
|
||||
"""
|
||||
Convert a `PreTrainedModel` into an exportable module and export it using `torch.export`,
|
||||
@@ -469,8 +480,10 @@ def convert_and_export_with_cache(
|
||||
|
||||
Args:
|
||||
model (`PreTrainedModel`): The pretrained model to be exported.
|
||||
example_input_ids (`torch.Tensor`): Example input token id used by `torch.export`.
|
||||
example_cache_position (`torch.Tensor`): Example current cache position used by `torch.export`.
|
||||
example_input_ids (`Optional[torch.Tensor]`): Example input token id used by `torch.export`.
|
||||
example_cache_position (`Optional[torch.Tensor]`): Example current cache position used by `torch.export`.
|
||||
dynamic_shapes(`Optional[dict]`): Dynamic shapes used by `torch.export`.
|
||||
strict(`Optional[bool]`): Flag to instruct `torch.export` to use `torchdynamo`.
|
||||
|
||||
Returns:
|
||||
Exported program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`.
|
||||
@@ -489,14 +502,21 @@ def convert_and_export_with_cache(
|
||||
example_cache_position if example_cache_position is not None else torch.tensor([0], dtype=torch.long)
|
||||
)
|
||||
|
||||
if is_torch_greater_or_equal("2.5.0"):
|
||||
if is_torch_greater_or_equal("2.6.0"):
|
||||
exported_program = torch.export.export(
|
||||
TorchExportableModuleWithStaticCache(model),
|
||||
args=(example_input_ids,),
|
||||
kwargs={"cache_position": example_cache_position},
|
||||
strict=True,
|
||||
args=(example_input_ids, example_cache_position),
|
||||
kwargs={},
|
||||
dynamic_shapes=dynamic_shapes,
|
||||
strict=strict if strict is not None else True,
|
||||
)
|
||||
else:
|
||||
if dynamic_shapes is not None:
|
||||
logging.warning(
|
||||
"Dynamic shapes spec will be ignored by convert_and_export_with_cache for torch < 2.6.0."
|
||||
)
|
||||
if strict is not None:
|
||||
logging.warning("The strict flag will be ingored by convert_and_export_with_cache for torch < 2.6.0.")
|
||||
# We have to keep this path for BC.
|
||||
#
|
||||
# Due to issue https://github.com/pytorch/pytorch/issues/128394, we need to switch to use an internal
|
||||
|
||||
@@ -93,6 +93,7 @@ else:
|
||||
),
|
||||
("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("biogpt", ("BioGptTokenizer", None)),
|
||||
("bitnet", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
|
||||
("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
|
||||
("blip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
|
||||
|
||||
@@ -224,8 +224,13 @@ class Llama4TextConfig(PretrainedConfig):
|
||||
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
|
||||
<TODO>
|
||||
<TODO>
|
||||
no_rope_layers (`int`, *optional*): TODO
|
||||
no_rope_layer_interval (`int`, *optional*, defaults to 4): TODO
|
||||
no_rope_layers (`List[int]`, *optional*):
|
||||
List with at least the same length as the number of layers in the model.
|
||||
A `1` at an index position indicates that the corresponding layer will use RoPE,
|
||||
while a `0` indicates that it's a NoPE layer.
|
||||
no_rope_layer_interval (`int`, *optional*, defaults to 4):
|
||||
If `no_rope_layers` is `None`, it will be created using a NoPE layer every
|
||||
`no_rope_layer_interval` layers.
|
||||
attention_chunk_size (`int`, *optional*, defaults to 8192):
|
||||
<TODO>
|
||||
attn_temperature_tuning (`bool`, *optional*, defaults to `True`):
|
||||
@@ -339,11 +344,15 @@ class Llama4TextConfig(PretrainedConfig):
|
||||
self.output_router_logits = output_router_logits
|
||||
self.router_aux_loss_coef = router_aux_loss_coef
|
||||
self.router_jitter_noise = router_jitter_noise
|
||||
|
||||
# Backwards compatibility
|
||||
if no_rope_layers == []:
|
||||
no_rope_layers = None
|
||||
|
||||
default_no_rope_layers = [
|
||||
int((layer_idx + 1) % no_rope_layer_interval != 0) for layer_idx in range(self.num_hidden_layers)
|
||||
]
|
||||
|
||||
# no_rope_layers == [] is invalid as we cannot have 0 layers
|
||||
self.no_rope_layers = no_rope_layers if no_rope_layers else default_no_rope_layers
|
||||
|
||||
self.interleave_moe_layer_step = interleave_moe_layer_step
|
||||
|
||||
@@ -65,6 +65,7 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
|
||||
r"layers.(\d+).feed_forward.w3.weight": r"language_model.model.layers.\1.feed_forward.up_proj.weight", # might need to be fused for efficiency?
|
||||
# r"layers.(\d+).feed_forward.mlp.fc1_weight": r"language_model.model.layers.\1.feed_forward.gate_up_proj.weight",
|
||||
r"layers.(\d+).feed_forward.mlp.fc2_weight": r"language_model.model.layers.\1.feed_forward.down_proj.weight",
|
||||
r"layers.(\d+).feed_forward.w2.weight": r"language_model.model.layers.\1.feed_forward.down_proj.weight",
|
||||
r"layers.(\d+).feed_forward.mlp.layer_norm.weight": r"language_model.model.layers.\1.post_attention_layernorm.weight",
|
||||
|
||||
# Vision encoder mapping
|
||||
@@ -166,8 +167,8 @@ def get_concat_dim(key):
|
||||
return 0
|
||||
|
||||
|
||||
def compute_intermediate_size(hidden_dim, multiple_of=1024, ffn_dim_multiplier=1.3):
|
||||
hidden_dim = 4 * int(2 * hidden_dim / 3)
|
||||
def compute_intermediate_size(hidden_dim, ffn_exp=4, multiple_of=1024, ffn_dim_multiplier=1.2):
|
||||
hidden_dim = ffn_exp * int(2 * hidden_dim / 3)
|
||||
hidden_dim = int(ffn_dim_multiplier * hidden_dim)
|
||||
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
|
||||
return hidden_dim
|
||||
@@ -203,6 +204,8 @@ def max_context_length(model_path, instruct=False):
|
||||
with open(os.path.join(model_path, "params.json"), "r") as f:
|
||||
params = json.load(f)
|
||||
params = params.get("model", params)
|
||||
if params.get("moe_args") is None:
|
||||
return 8192
|
||||
num_experts = params["moe_args"]["num_experts"]
|
||||
return 10485760 if num_experts == 16 else 1048576
|
||||
|
||||
@@ -242,24 +245,40 @@ def write_model(
|
||||
# some constants from original code
|
||||
rope_scaling = {
|
||||
"rope_type": "llama3",
|
||||
"factor": 8.0,
|
||||
"factor": params.get("rope_scaling_factor", 8.0),
|
||||
"low_freq_factor": 1.0,
|
||||
"high_freq_factor": 4.0,
|
||||
"high_freq_factor": params.get("rope_high_freq_factor", 4.0),
|
||||
"original_max_position_embeddings": 8192,
|
||||
}
|
||||
config_kwargs.update({"rope_scaling": rope_scaling})
|
||||
|
||||
if attention_chunk_size is None:
|
||||
config_kwargs.update({"cache_implementation": "static"})
|
||||
|
||||
# compute additional params for weight conversion
|
||||
num_heads_per_shard = num_heads // num_shards
|
||||
dim_per_head = dim // num_heads
|
||||
# intermediate_size = compute_intermediate_size(dim, multiple_of=params["multiple_of"])
|
||||
intermediate_size_mlp = compute_intermediate_size(
|
||||
dim,
|
||||
ffn_exp=params["ffn_exp"],
|
||||
multiple_of=params["multiple_of"],
|
||||
ffn_dim_multiplier=params["ffn_dim_multiplier"],
|
||||
)
|
||||
|
||||
num_key_value_heads = params["n_kv_heads"] # for GQA / MQA
|
||||
|
||||
num_experts = params["moe_args"]["num_experts"]
|
||||
interleave_moe_layer_step = params["moe_args"].get("interleave_moe_layer_step", 1)
|
||||
if hasattr(params, "moe_args"):
|
||||
num_experts = params["moe_args"]["num_experts"]
|
||||
interleave_moe_layer_step = params["moe_args"].get("interleave_moe_layer_step", 1)
|
||||
else:
|
||||
# Dense model (possibly Llama Guard) - disable all moe layers
|
||||
num_experts = 0
|
||||
interleave_moe_layer_step = 0
|
||||
config_kwargs.update({"moe_layers": []})
|
||||
|
||||
# Ensure all layers are rope if `nope_layer_interval` is None
|
||||
no_rope_layer_interval = params["nope_layer_interval"]
|
||||
no_rope_layer_interval = num_heads * 2 if no_rope_layer_interval is None else no_rope_layer_interval
|
||||
|
||||
bos_token_id = 200000
|
||||
eos_token_id = [200001, 200007, 200008] if instruct else 200001
|
||||
@@ -273,7 +292,7 @@ def write_model(
|
||||
rope_theta=rope_theta,
|
||||
num_hidden_layers=num_layers,
|
||||
intermediate_size=8192,
|
||||
intermediate_size_mlp=16384,
|
||||
intermediate_size_mlp=intermediate_size_mlp,
|
||||
max_position_embeddings=max_context_length(input_base_path, instruct),
|
||||
num_local_experts=num_experts,
|
||||
interleave_moe_layer_step=interleave_moe_layer_step,
|
||||
@@ -336,7 +355,7 @@ def write_model(
|
||||
sharded_keys = []
|
||||
for _key in all_keys_raw:
|
||||
try:
|
||||
if (loaded[0][_key] == loaded[1][_key]).all():
|
||||
if num_shards == 1 or (loaded[0][_key] == loaded[1][_key]).all():
|
||||
repeated_keys.append(_key)
|
||||
else:
|
||||
sharded_keys.append(_key)
|
||||
@@ -354,7 +373,7 @@ def write_model(
|
||||
for key in tqdm(all_keys, desc="Renaming and processing all keys", unit="key"):
|
||||
new_key = new_keys[key]
|
||||
print(key, new_key)
|
||||
if not is_param_same_across_shards(new_key):
|
||||
if num_shards > 1 and not is_param_same_across_shards(new_key):
|
||||
current_parameter = [chunk.pop(key) for chunk in loaded if not isinstance(chunk[key], io.BytesIO)]
|
||||
else:
|
||||
print(f"{key} (now {new_key}) is the same across all shards.")
|
||||
@@ -565,8 +584,8 @@ LLAMA4_TEXT_POST_TRAIN_SPECIAL_TOKENS = [
|
||||
"<|python_end|>",
|
||||
"<|finetune_right_pad|>",
|
||||
] + get_reserved_special_tokens(
|
||||
"text_post_train", 61, 6
|
||||
) # <|text_post_train_reserved_special_token_6|>, ..., <|text_post_train_reserved_special_token_66|>
|
||||
"text_post_train", 61, 8
|
||||
) # <|text_post_train_reserved_special_token_8|>, ..., <|text_post_train_reserved_special_token_68|>
|
||||
|
||||
# 200080, ..., 201133
|
||||
LLAMA4_VISION_SPECIAL_TOKENS = [
|
||||
@@ -621,15 +640,6 @@ class Llama4Converter(TikTokenConverter):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# to check
|
||||
# import tiktoken
|
||||
# model = tiktoken.Encoding(
|
||||
# name=Path(model_path).name,
|
||||
# pat_str=self.O200K_PATTERN,
|
||||
# mergeable_ranks=mergeable_ranks,
|
||||
# special_tokens=self.special_tokens,
|
||||
# )
|
||||
|
||||
instruct = chat_template is not None
|
||||
self.update_post_processor(self.converted_tokenizer)
|
||||
# finer special_tokens_map.json
|
||||
@@ -687,12 +697,10 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--input_dir",
|
||||
type=str,
|
||||
default="/fsx/arthur/Llama-4-17B-Omni-Instruct-Original",
|
||||
help="Location of the local folder copied from the Hub.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default="llama4_hf_vision",
|
||||
type=str,
|
||||
help="Location to write HF model and tokenizer",
|
||||
)
|
||||
|
||||
@@ -20,12 +20,11 @@ from typing import Callable, List, Optional, Tuple, Union
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.checkpoint
|
||||
|
||||
from transformers.models.llama4.configuration_llama4 import Llama4VisionConfig
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache, HybridChunkedCache
|
||||
from ...cache_utils import Cache, DynamicCache, HybridChunkedCache
|
||||
from ...generation import GenerationMixin
|
||||
from ...integrations.hub_kernels import use_kernel_forward_from_hub
|
||||
from ...modeling_attn_mask_utils import AttentionMaskConverter
|
||||
@@ -287,7 +286,7 @@ class Llama4TextAttention(nn.Module):
|
||||
self.attn_temperature_tuning = config.attn_temperature_tuning
|
||||
self.attention_dropout = config.attention_dropout
|
||||
self.is_causal = True
|
||||
self.use_rope = int((layer_idx + 1) % 4 != 0) # rope unused for dense layers
|
||||
self.use_rope = config.no_rope_layers[layer_idx]
|
||||
self.q_proj = nn.Linear(
|
||||
config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
|
||||
)
|
||||
@@ -374,7 +373,7 @@ class Llama4TextDecoderLayer(nn.Module):
|
||||
super().__init__()
|
||||
self.hidden_size = config.hidden_size
|
||||
self.self_attn = Llama4TextAttention(config, layer_idx)
|
||||
self.use_chunked_attention = int((layer_idx + 1) % 4 != 0) # <=> use rope
|
||||
self.use_chunked_attention = config.attention_chunk_size is not None and bool(config.no_rope_layers[layer_idx])
|
||||
self.is_moe_layer = layer_idx in config.moe_layers
|
||||
if self.is_moe_layer: # the 128E model interleaves dense / sparse
|
||||
self.feed_forward = Llama4TextMoe(config)
|
||||
@@ -643,7 +642,10 @@ class Llama4TextModel(Llama4PreTrainedModel):
|
||||
inputs_embeds = self.embed_tokens(input_ids.to(self.embed_tokens.weight.device))
|
||||
|
||||
if use_cache and past_key_values is None:
|
||||
past_key_values = HybridChunkedCache(self.config, inputs_embeds.shape[0], inputs_embeds.shape[1])
|
||||
if self.config.get_text_config().get("attention_chunk_size") is not None:
|
||||
past_key_values = HybridChunkedCache(self.config, inputs_embeds.shape[0], inputs_embeds.shape[1])
|
||||
else:
|
||||
past_key_values = DynamicCache(self.config, inputs_embeds.shape[0], inputs_embeds.shape[1])
|
||||
|
||||
if cache_position is None:
|
||||
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
|
||||
@@ -740,6 +742,7 @@ class Llama4TextModel(Llama4PreTrainedModel):
|
||||
sequence_length = input_tensor.shape[1]
|
||||
cache_position = cache_position.to(self.device)
|
||||
attention_chunk_size = self.config.attention_chunk_size
|
||||
using_chunked_attention = attention_chunk_size is not None
|
||||
|
||||
first_cache_position = cache_position[0]
|
||||
|
||||
@@ -748,26 +751,28 @@ class Llama4TextModel(Llama4PreTrainedModel):
|
||||
else:
|
||||
full_cache_length = attention_mask.shape[-1] if attention_mask is not None else sequence_length
|
||||
|
||||
cond1 = first_cache_position >= attention_chunk_size
|
||||
cond2 = (first_cache_position < attention_chunk_size) & (
|
||||
first_cache_position + sequence_length > attention_chunk_size
|
||||
)
|
||||
key_length = (
|
||||
torch.where(
|
||||
cond1,
|
||||
attention_chunk_size + sequence_length - 1,
|
||||
torch.where(cond2, first_cache_position + sequence_length, attention_chunk_size),
|
||||
if using_chunked_attention:
|
||||
cond1 = first_cache_position >= attention_chunk_size
|
||||
cond2 = (first_cache_position < attention_chunk_size) & (
|
||||
first_cache_position + sequence_length > attention_chunk_size
|
||||
)
|
||||
key_length = (
|
||||
torch.where(
|
||||
cond1,
|
||||
attention_chunk_size + sequence_length - 1,
|
||||
torch.where(cond2, first_cache_position + sequence_length, attention_chunk_size),
|
||||
)
|
||||
if use_cache
|
||||
else full_cache_length
|
||||
)
|
||||
if use_cache
|
||||
else full_cache_length
|
||||
)
|
||||
|
||||
if self.config._attn_implementation == "flex_attention":
|
||||
if isinstance(attention_mask, torch.Tensor):
|
||||
offsets = (first_cache_position, max(first_cache_position - attention_chunk_size + 1, 0))
|
||||
chunked_attention_mask = make_flex_block_causal_mask(
|
||||
attention_mask, self.config.attention_chunk_size, sequence_length, key_length, offsets=offsets
|
||||
)
|
||||
if using_chunked_attention:
|
||||
offsets = (first_cache_position, max(first_cache_position - attention_chunk_size + 1, 0))
|
||||
chunked_attention_mask = make_flex_block_causal_mask(
|
||||
attention_mask, attention_chunk_size, sequence_length, key_length, offsets=offsets
|
||||
)
|
||||
attention_mask = make_flex_block_causal_mask(
|
||||
attention_mask,
|
||||
query_length=sequence_length,
|
||||
@@ -780,15 +785,16 @@ class Llama4TextModel(Llama4PreTrainedModel):
|
||||
|
||||
# In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
|
||||
dtype, device = input_tensor.dtype, input_tensor.device
|
||||
target_length = max(full_cache_length, attention_chunk_size) if using_chunked_attention else full_cache_length
|
||||
causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask,
|
||||
sequence_length=sequence_length,
|
||||
target_length=max(full_cache_length, attention_chunk_size),
|
||||
target_length=target_length,
|
||||
dtype=dtype,
|
||||
cache_position=cache_position,
|
||||
batch_size=input_tensor.shape[0],
|
||||
)
|
||||
if full_cache_length > self.config.attention_chunk_size:
|
||||
if using_chunked_attention and full_cache_length > attention_chunk_size:
|
||||
start_idx = max(first_cache_position - attention_chunk_size + 1, 0)
|
||||
end_idx = start_idx + key_length
|
||||
chunked_attention_mask = self.create_chunked_attention_mask(
|
||||
@@ -873,7 +879,6 @@ class Llama4TextModel(Llama4PreTrainedModel):
|
||||
sequence_length: int,
|
||||
target_length: int,
|
||||
dtype: torch.dtype,
|
||||
device: torch.device,
|
||||
cache_position: torch.Tensor,
|
||||
batch_size: int,
|
||||
**kwargs,
|
||||
@@ -906,16 +911,18 @@ class Llama4TextModel(Llama4PreTrainedModel):
|
||||
else:
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
causal_mask = torch.full(
|
||||
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
|
||||
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
|
||||
)
|
||||
if sequence_length != 1:
|
||||
causal_mask = torch.triu(causal_mask, diagonal=1)
|
||||
causal_mask *= torch.arange(target_length, device=device) > cache_position.to(device).reshape(-1, 1)
|
||||
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
|
||||
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
|
||||
if attention_mask is not None:
|
||||
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
|
||||
mask_length = attention_mask.shape[-1]
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(device)
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
|
||||
cache_position.device
|
||||
)
|
||||
padding_mask = padding_mask == 0
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
padding_mask, min_dtype
|
||||
|
||||
@@ -1440,6 +1440,9 @@ class ProcessorMixin(PushToHubMixin):
|
||||
if value is not None and not isinstance(value, dict):
|
||||
processed_kwargs[kwarg_type][key] = value
|
||||
|
||||
# Pass unprocessed custom kwargs
|
||||
processed_kwargs["template_kwargs"].update(kwargs)
|
||||
|
||||
if isinstance(conversation, (list, tuple)) and (
|
||||
isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
|
||||
):
|
||||
|
||||
@@ -240,6 +240,7 @@ def parse_int_from_env(key, default=None):
|
||||
|
||||
|
||||
_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
|
||||
_run_flaky_tests = parse_flag_from_env("RUN_FLAKY", default=True)
|
||||
_run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False)
|
||||
_run_staging = parse_flag_from_env("HUGGINGFACE_CO_STAGING", default=False)
|
||||
_run_pipeline_tests = parse_flag_from_env("RUN_PIPELINE_TESTS", default=True)
|
||||
@@ -2614,7 +2615,7 @@ def is_flaky(max_attempts: int = 5, wait_before_retry: Optional[float] = None, d
|
||||
|
||||
return test_func_ref(*args, **kwargs)
|
||||
|
||||
return wrapper
|
||||
return unittest.skipUnless(_run_flaky_tests, "test is flaky")(wrapper)
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
@@ -3704,7 +3704,10 @@ class Trainer:
|
||||
arguments, depending on the situation.
|
||||
"""
|
||||
if self.use_cpu_amp:
|
||||
ctx_manager = torch.amp.autocast("cpu", cache_enabled=cache_enabled, dtype=self.amp_dtype)
|
||||
# TODO Matt: This syntax is deprecated and the preferred version is
|
||||
# torch.amp.autocast("cpu", cache_enabled=cache_enabled, dtype=self.amp_dtype)
|
||||
# but this is unavailable on Torch 2.1 or earlier. We can change this when we stop supporting 2.1.
|
||||
ctx_manager = torch.cpu.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
|
||||
else:
|
||||
ctx_manager = contextlib.nullcontext()
|
||||
|
||||
|
||||
@@ -1097,25 +1097,18 @@ class GenerationTesterMixin:
|
||||
|
||||
# test output equality of low versus high memory
|
||||
model = model_class(config).to(torch_device).eval()
|
||||
generate_kwargs = {
|
||||
"top_k": 4,
|
||||
"penalty_alpha": 0.6,
|
||||
"max_new_tokens": self.max_new_tokens,
|
||||
"use_cache": True,
|
||||
"return_dict_in_generate": True,
|
||||
"output_scores": True,
|
||||
}
|
||||
|
||||
low_output = model.generate(
|
||||
top_k=4,
|
||||
penalty_alpha=0.6,
|
||||
low_memory=True,
|
||||
max_new_tokens=self.max_new_tokens,
|
||||
**inputs_dict,
|
||||
use_cache=True,
|
||||
)
|
||||
|
||||
high_output = model.generate(
|
||||
top_k=4,
|
||||
penalty_alpha=0.6,
|
||||
low_memory=False,
|
||||
max_new_tokens=self.max_new_tokens,
|
||||
**inputs_dict,
|
||||
use_cache=True,
|
||||
)
|
||||
self.assertListEqual(low_output.tolist(), high_output.tolist())
|
||||
low_output = model.generate(**inputs_dict, **generate_kwargs, low_memory=True)
|
||||
high_output = model.generate(**inputs_dict, **generate_kwargs, low_memory=False)
|
||||
self._check_similar_generate_outputs(low_output, high_output)
|
||||
|
||||
@parameterized.expand([("random",), ("same",)])
|
||||
@pytest.mark.generate
|
||||
@@ -1863,22 +1856,29 @@ class GenerationTesterMixin:
|
||||
|
||||
model = model_class(config).to(torch_device)
|
||||
model.eval()
|
||||
model.generation_config.pad_token_id = model.generation_config.eos_token_id = -1
|
||||
model.generation_config.forced_eos_token_id = None
|
||||
model.generation_config.encoder_no_repeat_ngram_size = 0
|
||||
model.generation_config.use_cache = True
|
||||
|
||||
# If "past_key_values" is not returned, skip the test (e.g. RWKV uses a different cache name and format)
|
||||
outputs = model(**inputs)
|
||||
if "past_key_values" not in outputs:
|
||||
self.skipTest(reason="This model doesn't return `past_key_values`")
|
||||
|
||||
generate_kwargs = {
|
||||
"pad_token_id": -1,
|
||||
"eos_token_id": -1,
|
||||
"forced_eos_token_id": None,
|
||||
"encoder_no_repeat_ngram_size": 0,
|
||||
"use_cache": True,
|
||||
"do_sample": False,
|
||||
"return_dict_in_generate": True,
|
||||
"output_scores": True,
|
||||
}
|
||||
|
||||
# Traditional way of generating text, with `return_dict_in_generate` to return the past key values
|
||||
outputs = model.generate(**inputs, do_sample=False, max_new_tokens=4, return_dict_in_generate=True)
|
||||
outputs = model.generate(**inputs, **generate_kwargs, max_new_tokens=4)
|
||||
|
||||
# Let's generate again, but passing the past key values in between (3 + 1 = 4 tokens). Note that the
|
||||
# inputs may need to be tweaked across `generate` calls (like the attention mask).
|
||||
outputs_cached = model.generate(**inputs, do_sample=False, max_new_tokens=3, return_dict_in_generate=True)
|
||||
outputs_cached = model.generate(**inputs, **generate_kwargs, max_new_tokens=3)
|
||||
|
||||
# Continue from the tokens generated above, preparing the inputs accordingly
|
||||
inputs["past_key_values"] = outputs_cached.past_key_values
|
||||
@@ -1901,10 +1901,13 @@ class GenerationTesterMixin:
|
||||
mode="constant",
|
||||
value=1,
|
||||
)
|
||||
outputs_cached = model.generate(**inputs, do_sample=False, max_new_tokens=1, return_dict_in_generate=True)
|
||||
first_caches_scores = outputs_cached.scores
|
||||
outputs_cached = model.generate(**inputs, **generate_kwargs, max_new_tokens=1)
|
||||
full_cached_scores = first_caches_scores + outputs_cached.scores
|
||||
outputs_cached.scores = full_cached_scores
|
||||
|
||||
# The two sets of generated text and past kv should be equal to each other
|
||||
self.assertListEqual(outputs.sequences.tolist(), outputs_cached.sequences.tolist())
|
||||
self._check_similar_generate_outputs(outputs, outputs_cached)
|
||||
for layer_idx in range(len(outputs_cached.past_key_values)):
|
||||
for kv_idx in range(len(outputs_cached.past_key_values[layer_idx])):
|
||||
self.assertTrue(
|
||||
@@ -1930,6 +1933,8 @@ class GenerationTesterMixin:
|
||||
|
||||
if config.is_encoder_decoder:
|
||||
self.skipTest(reason="This model is encoder-decoder")
|
||||
# TODO (joao, raushan): the correct line below is `if not hasattr(config.get_text_config(), "use_cache")`,
|
||||
# but it breaks a few models. Fix and then apply `_check_similar_generate_outputs` pattern
|
||||
if not hasattr(config, "use_cache"):
|
||||
self.skipTest(reason=f"{model_class.__name__} doesn't support caching")
|
||||
|
||||
@@ -1990,32 +1995,6 @@ class GenerationTesterMixin:
|
||||
)
|
||||
)
|
||||
|
||||
@parameterized.expand([("offloaded",)]) # ("offloaded_static",) TODO: @raushan fixme in some models (eg T5)
|
||||
@require_torch_accelerator
|
||||
@pytest.mark.generate
|
||||
def test_offloaded_cache_implementation(self, cache_implementation):
|
||||
"""Tests we can generate by indicating `cache_implementation` for each possible cache class"""
|
||||
for model_class in self.all_generative_model_classes:
|
||||
if not model_class._supports_cache_class:
|
||||
self.skipTest(reason="This model does not support the new cache format")
|
||||
|
||||
config, inputs_dict = self.prepare_config_and_inputs_for_generate()
|
||||
|
||||
model = model_class(config).to(torch_device).eval()
|
||||
generation_kwargs = {
|
||||
"max_new_tokens": 5,
|
||||
"use_cache": True,
|
||||
"cache_implementation": cache_implementation,
|
||||
}
|
||||
|
||||
legacy_results = model.generate(**generation_kwargs, **inputs_dict)
|
||||
|
||||
# Most cache classes have their own tests except for some that are tested here
|
||||
# The ones here do not need special treatment when passing `cache_implementation`
|
||||
# and are not bound to specific models only
|
||||
new_results = model.generate(**generation_kwargs, **inputs_dict)
|
||||
self.assertListEqual(legacy_results.tolist(), new_results.tolist())
|
||||
|
||||
@pytest.mark.generate
|
||||
def test_generate_with_static_cache(self):
|
||||
"""
|
||||
|
||||
@@ -27,11 +27,13 @@ from transformers import (
|
||||
is_vision_available,
|
||||
)
|
||||
from transformers.testing_utils import (
|
||||
Expectations,
|
||||
cleanup,
|
||||
require_av,
|
||||
require_bitsandbytes,
|
||||
require_deterministic_for_xpu,
|
||||
require_torch,
|
||||
require_torch_gpu,
|
||||
require_torch_accelerator,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
@@ -177,7 +179,7 @@ class InternVLVisionText2TextModelTester:
|
||||
model = InternVLForConditionalGeneration(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
with torch.autocast(device_type="cuda", dtype=torch.float16):
|
||||
with torch.autocast(device_type=torch_device, dtype=torch.float16):
|
||||
logits = model(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
@@ -279,7 +281,7 @@ class InternVLModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
|
||||
|
||||
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
@require_torch_accelerator
|
||||
class InternVLQwen2IntegrationTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.small_model_checkpoint = "OpenGVLab/InternVL3-1B-hf"
|
||||
@@ -326,7 +328,14 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
||||
output = model(**inputs)
|
||||
|
||||
actual_logits = output.logits[0, -1, :5].cpu()
|
||||
expected_logits = torch.tensor([11.9375, 14.8750, 14.0625, 10.7500, 6.9062], dtype=torch.bfloat16)
|
||||
expected_logits_all = Expectations(
|
||||
{
|
||||
("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.bfloat16),
|
||||
("cuda", 7): torch.tensor([11.9375, 14.8750, 14.0625, 10.7500, 6.9062], dtype=torch.bfloat16),
|
||||
}
|
||||
) # fmt: skip
|
||||
expected_logits = expected_logits_all.get_expectation()
|
||||
|
||||
self.assertTrue(
|
||||
torch.allclose(actual_logits, expected_logits, atol=0.1),
|
||||
f"Actual logits: {actual_logits}"
|
||||
@@ -334,6 +343,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
||||
f"\nDifference: {torch.abs(actual_logits - expected_logits)}",
|
||||
)
|
||||
|
||||
@require_deterministic_for_xpu
|
||||
def test_qwen2_small_model_integration_generate_text_only(self):
|
||||
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
||||
model = InternVLForConditionalGeneration.from_pretrained(
|
||||
@@ -346,7 +356,15 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
||||
decoded_output = processor.decode(
|
||||
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
|
||||
)
|
||||
expected_output = "Whispers of dawn,\nSilent whispers of the night,\nNew day's light begins."
|
||||
|
||||
expected_outputs = Expectations(
|
||||
{
|
||||
("xpu", 3): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
|
||||
("cuda", 7): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light begins.",
|
||||
}
|
||||
) # fmt: skip
|
||||
expected_output = expected_outputs.get_expectation()
|
||||
|
||||
self.assertEqual(decoded_output, expected_output)
|
||||
|
||||
def test_qwen2_small_model_integration_generate_chat_template(self):
|
||||
@@ -375,6 +393,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
||||
expected_output = "The image shows two cats lying on a pink blanket. The cat on the left is a tabby"
|
||||
self.assertEqual(decoded_output, expected_output)
|
||||
|
||||
@require_deterministic_for_xpu
|
||||
def test_qwen2_small_model_integration_batched_generate(self):
|
||||
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
||||
model = InternVLForConditionalGeneration.from_pretrained(
|
||||
@@ -404,7 +423,15 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
||||
)
|
||||
# Check second output
|
||||
decoded_output = processor.decode(output[1], skip_special_tokens=True)
|
||||
expected_output = 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate of' # fmt: skip
|
||||
|
||||
expected_outputs = Expectations(
|
||||
{
|
||||
("xpu", 3): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate"',
|
||||
("cuda", 7): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate of',
|
||||
}
|
||||
) # fmt: skip
|
||||
expected_output = expected_outputs.get_expectation()
|
||||
|
||||
self.assertEqual(
|
||||
decoded_output,
|
||||
expected_output,
|
||||
@@ -455,7 +482,14 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
||||
|
||||
# Check second output
|
||||
decoded_output = processor.decode(output[1], skip_special_tokens=True)
|
||||
expected_output = 'user\n\nWhat are the differences between these two images?\nassistant\nThe images show the Statue of Liberty and the Golden Gate Bridge from different angles. Here are the differences:\n\n1. **Angle' # fmt: skip
|
||||
expected_outputs = Expectations(
|
||||
{
|
||||
("xpu", 3): "user\n\nWhat are the differences between these two images?\nassistant\nThe images show the Statue of Liberty and the Golden Gate Bridge from different angles. Here are the differences:\n\n1. **Foreground",
|
||||
("cuda", 7): "user\n\nWhat are the differences between these two images?\nassistant\nThe images show the Statue of Liberty and the Golden Gate Bridge from different angles. Here are the differences:\n\n1. **Angle",
|
||||
}
|
||||
) # fmt: skip
|
||||
expected_output = expected_outputs.get_expectation()
|
||||
|
||||
self.assertEqual(
|
||||
decoded_output,
|
||||
expected_output,
|
||||
@@ -495,7 +529,13 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
||||
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
|
||||
|
||||
decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
|
||||
expected_output = 'The man is performing a forehand shot.' # fmt: skip
|
||||
expected_outputs = Expectations(
|
||||
{
|
||||
("xpu", 3): "The man is performing a volley.",
|
||||
("cuda", 7): "The man is performing a forehand shot.",
|
||||
}
|
||||
) # fmt: skip
|
||||
expected_output = expected_outputs.get_expectation()
|
||||
self.assertEqual(
|
||||
decoded_output,
|
||||
expected_output,
|
||||
@@ -503,6 +543,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
||||
)
|
||||
|
||||
@require_av
|
||||
@require_deterministic_for_xpu
|
||||
def test_qwen2_small_model_integration_interleaved_images_videos(self):
|
||||
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
|
||||
model = InternVLForConditionalGeneration.from_pretrained(
|
||||
@@ -564,7 +605,13 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
||||
|
||||
decoded_output = processor.decode(output[0], skip_special_tokens=True)
|
||||
# Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232
|
||||
expected_output = 'user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image**: This shows the Statue of Liberty on Liberty Island, with the' # fmt: skip
|
||||
expected_outputs = Expectations(
|
||||
{
|
||||
("xpu", 3): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an",
|
||||
("cuda", 7): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image**: This shows the Statue of Liberty on Liberty Island, with the",
|
||||
}
|
||||
) # fmt: skip
|
||||
expected_output = expected_outputs.get_expectation()
|
||||
self.assertEqual(
|
||||
decoded_output,
|
||||
expected_output,
|
||||
@@ -572,7 +619,13 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
||||
)
|
||||
# Check second output
|
||||
decoded_output = processor.decode(output[1], skip_special_tokens=True)
|
||||
expected_output = 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot' # fmt: skip
|
||||
expected_outputs = Expectations(
|
||||
{
|
||||
("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
|
||||
("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot",
|
||||
}
|
||||
) # fmt: skip
|
||||
expected_output = expected_outputs.get_expectation()
|
||||
self.assertEqual(
|
||||
decoded_output,
|
||||
expected_output,
|
||||
@@ -590,7 +643,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
|
||||
|
||||
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
@require_torch_accelerator
|
||||
class InternVLLlamaIntegrationTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.small_model_checkpoint = "OpenGVLab/InternVL2_5-2B-MPO-hf"
|
||||
@@ -711,7 +764,13 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
||||
|
||||
# Check first output
|
||||
decoded_output = processor.decode(output[0], skip_special_tokens=True)
|
||||
expected_output = 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.' # fmt: skip
|
||||
expected_outputs = Expectations(
|
||||
{
|
||||
("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
|
||||
("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
|
||||
}
|
||||
) # fmt: skip
|
||||
expected_output = expected_outputs.get_expectation()
|
||||
self.assertEqual(
|
||||
decoded_output,
|
||||
expected_output,
|
||||
@@ -880,7 +939,13 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
||||
|
||||
decoded_output = processor.decode(output[0], skip_special_tokens=True)
|
||||
# Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232
|
||||
expected_output = 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **' # fmt: skip
|
||||
expected_outputs = Expectations(
|
||||
{
|
||||
("xpu", 3): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
|
||||
("cuda", 7): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **",
|
||||
}
|
||||
) # fmt: skip
|
||||
expected_output = expected_outputs.get_expectation()
|
||||
self.assertEqual(
|
||||
decoded_output,
|
||||
expected_output,
|
||||
@@ -889,7 +954,13 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
||||
|
||||
# Check second output
|
||||
decoded_output = processor.decode(output[1], skip_special_tokens=True)
|
||||
expected_output = 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their' # fmt: skip
|
||||
expected_outputs = Expectations(
|
||||
{
|
||||
("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
|
||||
("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
|
||||
}
|
||||
) # fmt: skip
|
||||
expected_output = expected_outputs.get_expectation()
|
||||
self.assertEqual(
|
||||
decoded_output,
|
||||
expected_output,
|
||||
@@ -898,7 +969,13 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
|
||||
|
||||
# Check third output
|
||||
decoded_output = processor.decode(output[2], skip_special_tokens=True)
|
||||
expected_output = 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nA wooden path leads to the sea,\nPeaceful, untouched dreams.' # fmt: skip
|
||||
expected_outputs = Expectations(
|
||||
{
|
||||
("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
|
||||
("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nA wooden path leads to the sea,\nPeaceful, untouched dreams.",
|
||||
}
|
||||
) # fmt: skip
|
||||
expected_output = expected_outputs.get_expectation()
|
||||
self.assertEqual(
|
||||
decoded_output,
|
||||
expected_output,
|
||||
|
||||
@@ -80,7 +80,6 @@ from transformers.testing_utils import (
|
||||
require_bitsandbytes,
|
||||
require_deepspeed,
|
||||
require_flash_attn,
|
||||
require_non_xpu,
|
||||
require_safetensors,
|
||||
require_torch,
|
||||
require_torch_accelerator,
|
||||
@@ -2604,7 +2603,7 @@ class ModelTesterMixin:
|
||||
)[0]
|
||||
torch.testing.assert_close(out_embeds, out_ids)
|
||||
|
||||
@require_non_xpu
|
||||
@require_torch_gpu
|
||||
@require_torch_multi_gpu
|
||||
def test_multi_gpu_data_parallel_forward(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
@@ -3874,7 +3873,6 @@ class ModelTesterMixin:
|
||||
with sdpa_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
|
||||
_ = model(**inputs_dict)
|
||||
|
||||
@require_non_xpu
|
||||
@require_torch_sdpa
|
||||
@require_torch_accelerator
|
||||
@slow
|
||||
@@ -3887,8 +3885,8 @@ class ModelTesterMixin:
|
||||
self.skipTest(reason="This test requires an NVIDIA GPU with compute capability >= 8.0")
|
||||
elif device_type == "rocm" and major < 9:
|
||||
self.skipTest(reason="This test requires an AMD GPU with compute capability >= 9.0")
|
||||
else:
|
||||
self.skipTest(reason="This test requires a Nvidia or AMD GPU")
|
||||
elif device_type not in ["cuda", "rocm", "xpu"]:
|
||||
self.skipTest(reason="This test requires a Nvidia or AMD GPU, or an Intel XPU")
|
||||
|
||||
torch.compiler.reset()
|
||||
|
||||
|
||||
@@ -20,11 +20,11 @@ from parameterized import parameterized
|
||||
from transformers import set_seed
|
||||
from transformers.testing_utils import (
|
||||
CaptureStderr,
|
||||
cleanup,
|
||||
get_gpu_count,
|
||||
is_torch_available,
|
||||
require_gptq,
|
||||
require_non_xpu,
|
||||
require_read_token,
|
||||
require_torch,
|
||||
require_torch_accelerator,
|
||||
require_torch_gpu,
|
||||
@@ -53,6 +53,8 @@ if is_torch_available():
|
||||
|
||||
@require_torch
|
||||
class CacheTest(unittest.TestCase):
|
||||
"""Cache tests that don't require loading models"""
|
||||
|
||||
def test_dynamic_cache_retrocompatibility(self):
|
||||
"""Tests that we can convert back and forth between the legacy cache format and DynamicCache"""
|
||||
legacy_cache = ()
|
||||
@@ -173,120 +175,17 @@ class CacheTest(unittest.TestCase):
|
||||
self.assertTrue(cached_keys.shape == (1, 1, 10, 128))
|
||||
self.assertTrue(cached_values.shape == (1, 1, 10, 128))
|
||||
|
||||
def test_dynamic_cache_exportability(self):
|
||||
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM")
|
||||
model = model.eval()
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM")
|
||||
prompt = "What is the best way to debug python script?"
|
||||
inputs = tokenizer(prompt, return_tensors="pt")
|
||||
attention_mask = inputs.attention_mask
|
||||
input_ids = inputs.input_ids
|
||||
|
||||
past_key_values = DynamicCache()
|
||||
ep = torch.export.export(
|
||||
model,
|
||||
(),
|
||||
{
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": attention_mask,
|
||||
"past_key_values": past_key_values,
|
||||
"use_cache": True,
|
||||
},
|
||||
strict=False,
|
||||
)
|
||||
res = ep.module()(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
past_key_values=past_key_values,
|
||||
use_cache=True,
|
||||
)
|
||||
self.assertTrue(len(res.past_key_values.key_cache) == model.config.num_hidden_layers)
|
||||
self.assertEqual(2 * model.config.num_hidden_layers + 1, len(ep.graph_signature.output_specs))
|
||||
self.assertEqual(
|
||||
3,
|
||||
len(
|
||||
[
|
||||
x
|
||||
for x in ep.graph_signature.input_specs
|
||||
if x.kind == torch.export.graph_signature.InputKind.USER_INPUT
|
||||
]
|
||||
),
|
||||
)
|
||||
|
||||
past_key_values_eager = DynamicCache()
|
||||
res_eager = model(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
past_key_values=past_key_values_eager,
|
||||
use_cache=True,
|
||||
)
|
||||
self.assertTrue(torch.allclose(res.logits, res_eager.logits))
|
||||
for k1, k2 in zip(res.past_key_values.key_cache, res_eager.past_key_values.key_cache):
|
||||
self.assertTrue(torch.allclose(k1, k2))
|
||||
|
||||
for v1, v2 in zip(res.past_key_values.value_cache, res_eager.past_key_values.value_cache):
|
||||
self.assertTrue(torch.allclose(v1, v2))
|
||||
|
||||
@slow
@require_read_token
def test_static_cache_exportability(self):
    """
    Tests that static cache works with `torch.export()`

    The model is loaded with a static-cache `GenerationConfig`, exported through `convert_and_export_with_cache`,
    and the exported program's `key_cache*` / `value_cache*` buffers are checked against the configured batch size
    and maximum cache length.
    """
    if not is_torch_greater_or_equal("2.3"):
        self.skipTest(reason="This test requires torch >= 2.3 to run.")

    set_seed(0)
    device = "cpu"
    dtype = "bfloat16"
    cache_implementation = "static"
    attn_implementation = "sdpa"  # Export and ExecuTorch only works for SdpaAttention
    batch_size = 1
    max_cache_len = 1234
    model = AutoModelForCausalLM.from_pretrained(
        "google/gemma-2b",
        device_map=device,
        torch_dtype=dtype,
        attn_implementation=attn_implementation,
        generation_config=GenerationConfig(
            use_cache=True,
            cache_implementation=cache_implementation,
            max_length=max_cache_len,
            cache_config={
                "batch_size": batch_size,
                "max_cache_len": max_cache_len,
                "device": device,
            },
        ),
    )
    # Check if cache config is passed through correctly
    generation_config = model.generation_config
    self.assertEqual(generation_config.use_cache, True)
    self.assertEqual(generation_config.cache_implementation, cache_implementation)
    self.assertEqual(generation_config.max_length, max_cache_len)
    self.assertIsNotNone(generation_config.cache_config)
    self.assertEqual(generation_config.cache_config.batch_size, batch_size)
    self.assertEqual(generation_config.cache_config.max_cache_len, max_cache_len)

    exported_program = convert_and_export_with_cache(model)

    # Check if the exported model is configured with the `StaticCache` correctly
    n_static_key_caches = n_static_value_caches = 0
    for buffer_name, buffer in exported_program.named_buffers():
        # The two prefixes are mutually exclusive, hence `elif`. `assertEqual` reports the offending sizes on
        # failure, which `assertTrue(a == b)` does not.
        if buffer_name.startswith("key_cache"):
            self.assertEqual(buffer.shape[0], batch_size)
            self.assertEqual(buffer.shape[2], max_cache_len)
            n_static_key_caches += 1
        elif buffer_name.startswith("value_cache"):
            self.assertEqual(buffer.shape[0], batch_size)
            self.assertEqual(buffer.shape[2], max_cache_len)
            n_static_value_caches += 1
    self.assertEqual(n_static_key_caches, model.config.num_hidden_layers)
    self.assertEqual(n_static_value_caches, model.config.num_hidden_layers)
|
||||
|
||||
|
||||
@require_torch_accelerator
|
||||
@slow
|
||||
class CacheIntegrationTest(unittest.TestCase):
|
||||
"""Cache tests that require loading models"""
|
||||
|
||||
def tearDown(self):
    """
    Runs `cleanup` on `torch_device` (with garbage collection) after each test.

    Some tests use large models, which might result in suboptimal torch re-allocation if we run multiple tests
    in a row.
    """
    cleanup(torch_device, gc_collect=True)
|
||||
|
||||
@slow
|
||||
def test_dynamic_cache_hard(self):
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
@@ -316,6 +215,7 @@ class CacheIntegrationTest(unittest.TestCase):
|
||||
)
|
||||
self.assertEqual(decoded[0], expected_text)
|
||||
|
||||
@slow
|
||||
def test_dynamic_cache_batched(self):
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left")
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
@@ -331,6 +231,7 @@ class CacheIntegrationTest(unittest.TestCase):
|
||||
expected_text = ["A sequence: 1, 2, 3, 4, 5, 6, 7, 8,", "A sequence: A, B, C, D, E, F, G, H"]
|
||||
self.assertListEqual(decoded, expected_text)
|
||||
|
||||
@slow
|
||||
def test_dynamic_cache_beam_search(self):
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
@@ -352,6 +253,7 @@ class CacheIntegrationTest(unittest.TestCase):
|
||||
]
|
||||
self.assertListEqual(decoded, expected_text)
|
||||
|
||||
@slow
|
||||
def test_hybrid_cache_n_sequences(self):
|
||||
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
@@ -379,6 +281,7 @@ class CacheIntegrationTest(unittest.TestCase):
|
||||
|
||||
@require_non_xpu
|
||||
@require_gptq
|
||||
@slow
|
||||
def test_sink_cache_hard(self):
|
||||
tokenizer = AutoTokenizer.from_pretrained("TheBloke/LLaMa-7B-GPTQ")
|
||||
model = AutoModelForCausalLM.from_pretrained("TheBloke/LLaMa-7B-GPTQ", device_map="auto")
|
||||
@@ -392,6 +295,7 @@ class CacheIntegrationTest(unittest.TestCase):
|
||||
decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
|
||||
self.assertTrue(decoded[0].endswith("to perform a variety of tasks. The Transformer is a neural network"))
|
||||
|
||||
@slow
|
||||
def test_sink_cache_iterative_prompts(self):
|
||||
"""Tests that SinkCache supports more than one new token at once, when shifting the cache"""
|
||||
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
|
||||
@@ -434,13 +338,14 @@ class CacheIntegrationTest(unittest.TestCase):
|
||||
)
|
||||
self.assertTrue(decoded[0].endswith(last_output))
|
||||
|
||||
@require_torch_gpu
|
||||
@parameterized.expand(
|
||||
[
|
||||
("eager", "static"),
|
||||
("sdpa", "static"),
|
||||
]
|
||||
)
|
||||
@require_torch_gpu
|
||||
@slow
|
||||
def test_static_cache_greedy_decoding_pad_left(self, attn_implementation, cache_implementation):
|
||||
EXPECTED_GENERATION = [
|
||||
"The best color is the one that complements the skin tone of the",
|
||||
@@ -479,44 +384,7 @@ class CacheIntegrationTest(unittest.TestCase):
|
||||
with self.subTest(f"{attn_implementation}, static, compiled"):
|
||||
self.assertListEqual(decoded, EXPECTED_GENERATION)
|
||||
|
||||
@require_torch_gpu
@parameterized.expand(
    [
        ("eager", "static"),
        ("sdpa", "static"),
    ]
)
def test_static_cache_greedy_decoding_pad_right(self, attn_implementation, cache_implementation):
    """
    Greedy-decodes a right-padded batch twice -- first with the model's default cache settings, then with
    `cache_implementation` set on the generation config -- and checks both runs against the same expected text.
    """
    EXPECTED_GENERATION = [
        "The best color isЋ the one that complements the skin tone of",
        "We should not undermind the issues at hand.\nWe should not undermind the issues",
    ]

    checkpoint = "NousResearch/Llama-2-7b-chat-hf"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="right", pad_token="<s>")
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype=torch.bfloat16,
        attn_implementation=attn_implementation,
    ).to(torch_device)
    prompts = ["The best color is", "We should not undermind the issues at hand"]
    inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(model.device)

    def greedy_decode():
        # Re-seed before every generation call so both runs start from the same seed
        set_seed(0)
        generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=10)
        return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # First pass: generation config left untouched
    decoded = greedy_decode()
    with self.subTest(f"{attn_implementation}, dynamic"):
        self.assertListEqual(decoded, EXPECTED_GENERATION)

    # Second pass: same inputs, with the parameterized cache implementation
    model.generation_config.cache_implementation = cache_implementation
    decoded = greedy_decode()
    with self.subTest(f"{attn_implementation}, static, eager"):
        self.assertListEqual(decoded, EXPECTED_GENERATION)
|
||||
|
||||
@slow
|
||||
def test_dynamic_cache_extra_left_padding(self):
|
||||
"""Tests that adding extra left-padding does not affect the generation with the dynamic cache"""
|
||||
EXPECTED_GENERATION = [
|
||||
@@ -551,12 +419,8 @@ class CacheIntegrationTest(unittest.TestCase):
|
||||
decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
|
||||
self.assertListEqual(decoded, EXPECTED_GENERATION)
|
||||
|
||||
@parameterized.expand(
|
||||
[
|
||||
"static",
|
||||
]
|
||||
)
|
||||
def test_static_cache_extra_left_padding(self, cache_implementation):
|
||||
@slow
|
||||
def test_static_cache_extra_left_padding(self):
|
||||
"""Tests that adding extra left-padding does not affect the generation with the static cache"""
|
||||
EXPECTED_GENERATION = [
|
||||
"The best color is the one that complements the skin tone of the",
|
||||
@@ -574,7 +438,7 @@ class CacheIntegrationTest(unittest.TestCase):
|
||||
["The best color is", "We should not undermind the issues at hand"], padding=True, return_tensors="pt"
|
||||
).to(model.device)
|
||||
|
||||
model.generation_config.cache_implementation = cache_implementation
|
||||
model.generation_config.cache_implementation = "static"
|
||||
|
||||
gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10)
|
||||
decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
|
||||
@@ -597,6 +461,7 @@ class CacheIntegrationTest(unittest.TestCase):
|
||||
pass
|
||||
|
||||
@require_torch_accelerator
|
||||
@slow
|
||||
def test_offloaded_cache_equivalent_to_dynamic_cache(self):
|
||||
"""Tests that OffloadedCache produces the same result as the default DynamicCache"""
|
||||
model_name = "microsoft/Phi-3-mini-4k-instruct"
|
||||
@@ -625,6 +490,7 @@ class CacheIntegrationTest(unittest.TestCase):
|
||||
assert torch.all(original_output == offloaded_output).item()
|
||||
|
||||
@require_torch_accelerator
|
||||
@slow
|
||||
def test_offloaded_cache_uses_less_memory_than_dynamic_cache(self):
|
||||
"""Tests that OffloadedCache uses less memory than the default DynamicCache"""
|
||||
model_name = "microsoft/Phi-3-mini-4k-instruct"
|
||||
@@ -664,6 +530,7 @@ class CacheIntegrationTest(unittest.TestCase):
|
||||
assert offloaded_peak_memory < original_peak_memory
|
||||
|
||||
@require_torch_gpu
|
||||
@slow
|
||||
def test_cache_copy(self):
|
||||
model_name = "microsoft/Phi-3-mini-4k-instruct"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
@@ -745,6 +612,7 @@ class CacheIntegrationTest(unittest.TestCase):
|
||||
self.assertEqual(cap.err, "")
|
||||
|
||||
@require_torch_multi_gpu
|
||||
@slow
|
||||
def test_static_cache_multi_gpu(self):
|
||||
"""Regression test for #35164: static cache with multi-gpu"""
|
||||
|
||||
@@ -764,3 +632,173 @@ class CacheIntegrationTest(unittest.TestCase):
|
||||
inputs = tokenizer("Today is a beautiful day!", return_tensors="pt").to(0)
|
||||
_ = model(**inputs)
|
||||
_ = model.generate(**inputs, max_new_tokens=2, cache_implementation="hybrid")
|
||||
|
||||
|
||||
@require_torch
class CacheExportIntegrationTest(unittest.TestCase):
    """Cache tests that rely on `torch.export()` and model loading"""

    def test_dynamic_cache_exportability(self):
        """
        Tests that a model called with a `DynamicCache` can be exported with `torch.export.export()` (non-strict
        mode) and that the exported module yields the same logits and key/value caches as the eager model.
        """
        model_id = "hf-internal-testing/tiny-random-MistralForCausalLM"
        model = AutoModelForCausalLM.from_pretrained(model_id)
        model = model.eval()
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        prompt = "What is the best way to debug python script?"
        inputs = tokenizer(prompt, return_tensors="pt")
        attention_mask = inputs.attention_mask
        input_ids = inputs.input_ids

        past_key_values = DynamicCache()
        ep = torch.export.export(
            model,
            (),
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "past_key_values": past_key_values,
                "use_cache": True,
            },
            strict=False,
        )
        res = ep.module()(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=True,
        )
        num_layers = model.config.num_hidden_layers
        # `assertEqual` (rather than `assertTrue(a == b)`) reports both values when the check fails
        self.assertEqual(len(res.past_key_values.key_cache), num_layers)
        # NOTE(review): presumably one output for the logits plus one key and one value tensor per layer -- confirm
        self.assertEqual(2 * num_layers + 1, len(ep.graph_signature.output_specs))
        user_input_specs = [
            spec
            for spec in ep.graph_signature.input_specs
            if spec.kind == torch.export.graph_signature.InputKind.USER_INPUT
        ]
        self.assertEqual(3, len(user_input_specs))

        past_key_values_eager = DynamicCache()
        res_eager = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values_eager,
            use_cache=True,
        )
        self.assertTrue(torch.allclose(res.logits, res_eager.logits))

        # `zip` stops at the shortest iterable, so compare the lengths first: otherwise a missing layer in either
        # cache would go unnoticed
        self.assertEqual(len(res.past_key_values.key_cache), len(res_eager.past_key_values.key_cache))
        self.assertEqual(len(res.past_key_values.value_cache), len(res_eager.past_key_values.value_cache))
        for k1, k2 in zip(res.past_key_values.key_cache, res_eager.past_key_values.key_cache):
            self.assertTrue(torch.allclose(k1, k2))

        for v1, v2 in zip(res.past_key_values.value_cache, res_eager.past_key_values.value_cache):
            self.assertTrue(torch.allclose(v1, v2))

    def test_static_cache_exportability(self):
        """
        Tests that static cache works with `torch.export()`

        The model is loaded with a static-cache `GenerationConfig`, exported through
        `convert_and_export_with_cache`, and the exported program's `key_cache*` / `value_cache*` buffers are
        checked against the configured batch size and maximum cache length. A second export with dynamic shapes
        is then attempted.
        """
        if not is_torch_greater_or_equal("2.3"):
            self.skipTest(reason="This test requires torch >= 2.3 to run.")

        set_seed(0)
        device = "cpu"
        dtype = "bfloat16"
        cache_implementation = "static"
        attn_implementation = "sdpa"  # Export and ExecuTorch only works for SdpaAttention
        batch_size = 1
        max_cache_len = 1234
        model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map=device,
            torch_dtype=dtype,
            attn_implementation=attn_implementation,
            generation_config=GenerationConfig(
                use_cache=True,
                cache_implementation=cache_implementation,
                max_length=max_cache_len,
                cache_config={
                    "batch_size": batch_size,
                    "max_cache_len": max_cache_len,
                    "device": device,
                },
            ),
        )
        # Check if cache config is passed through correctly
        generation_config = model.generation_config
        self.assertEqual(generation_config.use_cache, True)
        self.assertEqual(generation_config.cache_implementation, cache_implementation)
        self.assertEqual(generation_config.max_length, max_cache_len)
        self.assertIsNotNone(generation_config.cache_config)
        self.assertEqual(generation_config.cache_config.batch_size, batch_size)
        self.assertEqual(generation_config.cache_config.max_cache_len, max_cache_len)

        exported_program = convert_and_export_with_cache(model)

        # Check if the exported model is configured with the `StaticCache` correctly
        n_static_key_caches = n_static_value_caches = 0
        for buffer_name, buffer in exported_program.named_buffers():
            # The two prefixes are mutually exclusive, hence `elif`
            if buffer_name.startswith("key_cache"):
                self.assertEqual(buffer.shape[0], batch_size)
                self.assertEqual(buffer.shape[2], max_cache_len)
                n_static_key_caches += 1
            elif buffer_name.startswith("value_cache"):
                self.assertEqual(buffer.shape[0], batch_size)
                self.assertEqual(buffer.shape[2], max_cache_len)
                n_static_value_caches += 1
        self.assertEqual(n_static_key_caches, model.config.num_hidden_layers)
        self.assertEqual(n_static_value_caches, model.config.num_hidden_layers)

        # Export with dynamic shapes using Dim.AUTO
        # No assertion is made on the result: the export call completing without raising is the check.
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        input_ids = tokenizer("Here's everything I know", return_tensors="pt").input_ids
        dynamic_shapes = {"input_ids": {1: torch.export.Dim.AUTO}, "cache_position": None}
        convert_and_export_with_cache(
            model,
            example_input_ids=input_ids,
            dynamic_shapes=dynamic_shapes,
            strict=False,
        )

    def test_hybrid_cache_exportability(self):
        """
        Tests that hybrid cache works with `torch.export()`

        The model is wrapped in `TorchExportableModuleForDecoderOnlyLM`, exported, and the exported program's
        `key_cache*` / `value_cache*` buffers are checked against the requested batch size and maximum cache
        length. A second export with dynamic shapes is then attempted.
        """
        if not is_torch_greater_or_equal("2.6"):
            self.skipTest(reason="This test requires torch >= 2.6 to run.")

        from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM

        set_seed(0)
        model_id = "hf-internal-testing/tiny-random-Gemma3ForCausalLM"
        model = AutoModelForCausalLM.from_pretrained(model_id)
        model.eval()
        self.assertEqual(model.config.use_cache, True)
        self.assertEqual(model.config.cache_implementation, "hybrid")

        # Export + HybridCache
        max_batch_size = 1
        max_cache_len = 23
        exportable_module = TorchExportableModuleForDecoderOnlyLM(model, max_batch_size, max_cache_len)
        exported_program = exportable_module.export()
        n_g_key_caches = n_g_value_caches = 0
        for buffer_name, buffer in exported_program.named_buffers():
            # The two prefixes are mutually exclusive, hence `elif`
            if buffer_name.startswith("key_cache"):
                self.assertEqual(buffer.shape[0], max_batch_size)
                self.assertEqual(buffer.shape[2], max_cache_len)
                n_g_key_caches += 1
            elif buffer_name.startswith("value_cache"):
                self.assertEqual(buffer.shape[0], max_batch_size)
                self.assertEqual(buffer.shape[2], max_cache_len)
                n_g_value_caches += 1
        self.assertEqual(n_g_key_caches, model.config.num_hidden_layers)
        self.assertEqual(n_g_value_caches, model.config.num_hidden_layers)

        # Export with dynamic shapes using Dim.AUTO
        # No assertion is made on the result: the export call completing without raising is the check.
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        input_ids = tokenizer("Here's everything I know", return_tensors="pt").input_ids
        dynamic_shapes = {"input_ids": {1: torch.export.Dim.AUTO}, "cache_position": None}
        exportable_module.export(
            input_ids=input_ids,
            dynamic_shapes=dynamic_shapes,
            strict=False,
        )
|
||||
|
||||
Reference in New Issue
Block a user