fix: manual edits

feat: nmt draft
2025-07-20 19:06:32 -07:00 · 2025-07-20 19:01:42 -07:00
1572 changed files with 103404 additions and 62112 deletions
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -18,6 +18,10 @@ jobs:
      notebook_folder: transformers_doc
      languages: ar de en es fr hi it ko pt tr zh ja te
      custom_container: huggingface/transformers-doc-builder
+      # Temporary pin to work around datasets exception in the docbuilder.Remove after docker images and main have
+      # the right dependencies (which **should** be the case by 2025-07-20). See
+      # https://github.com/huggingface/transformers/actions/runs/16365952006/job/46243081358?pr=38545
+      pre_command: uv pip install datasets>=2.15.0
    secrets:
      token: ${{ secrets.HUGGINGFACE_PUSH }}
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -15,3 +15,7 @@ jobs:
      pr_number: ${{ github.event.number }}
      package: transformers
      languages: en
+      # Temporary pin to work around datasets exception in the docbuilder. Remove after docker images and main have
+      # the right dependencies (which **should** be the case by 2025-07-20). See
+      # https://github.com/huggingface/transformers/actions/runs/16365952006/job/46243081358?pr=38545
+      pre_command: uv pip install datasets>=2.15.0
--- a/.github/workflows/doctest_job.yml
+++ b/.github/workflows/doctest_job.yml
@@ -31,7 +31,7 @@ jobs:
      group: aws-g5-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@@ -18,7 +18,7 @@ jobs:
      group: aws-g5-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      job_splits: ${{ steps.set-matrix.outputs.job_splits }}
      split_keys: ${{ steps.set-matrix.outputs.split_keys }}
--- a/.github/workflows/pr_build_doc_with_comment.yml
+++ b/.github/workflows/pr_build_doc_with_comment.yml
@@ -1,134 +0,0 @@
-name: PR - build doc via comment
-on:
-  issue_comment:
-    types:
-      - created
-    branches-ignore:
-      - main
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.issue.number }}-${{ startsWith(github.event.comment.body, 'build-doc') }}
-  cancel-in-progress: true
-permissions: {}
-
-
-jobs:
-  get-pr-number:
-    name: Get PR number
-    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam"]'), github.actor) && (startsWith(github.event.comment.body, 'build-doc')) }}
-    uses: ./.github/workflows/get-pr-number.yml
-
-  get-pr-info:
-    name: Get PR commit SHA
-    needs: get-pr-number
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
-    uses: ./.github/workflows/get-pr-info.yml
-    with:
-      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
-
-  verity_pr_commit:
-    name: Verity PR commit corresponds to a specific event by comparing timestamps
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
-    runs-on: ubuntu-22.04
-    needs: get-pr-info
-    env:
-      COMMENT_DATE: ${{ github.event.comment.created_at }}
-      PR_MERGE_COMMIT_DATE: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
-      PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
-    steps:
-      - run: |
-          COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
-          echo "COMMENT_DATE: $COMMENT_DATE"
-          echo "PR_MERGE_COMMIT_DATE: $PR_MERGE_COMMIT_DATE"
-          echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
-          echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
-          if [ $COMMENT_TIMESTAMP -le $PR_MERGE_COMMIT_TIMESTAMP ]; then
-            echo "Last commit on the pull request is newer than the issue comment triggering this run! Abort!";
-            exit -1;
-          fi
-
-  create_run:
-    name: Create run
-    needs: [get-pr-number, get-pr-info]
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != '' }}
-    permissions:
-      statuses: write
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Create Run
-        id: create_run
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          # Create a commit status (pending) for a run of this workflow. The status has to be updated later in `update_run_status`.
-          # See https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status
-          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-        run: |
-          gh api \
-            --method POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            repos/${{ github.repository }}/statuses/${{ needs.get-pr-info.outputs.PR_HEAD_SHA }} \
-            -f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Custom doc building job" -f "context=custom-doc-build"
-
-  reply_to_comment:
-    name: Reply to the comment
-    if: ${{ needs.create_run.result == 'success' }}
-    needs: [get-pr-number, create_run]
-    permissions:
-      pull-requests: write
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Reply to the comment
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-        run: |
-          gh api \
-            --method POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
-            -f "body=[Building docs for all languages...](${{ env.GITHUB_RUN_URL }})"
-
-  build-doc:
-    name: Build doc
-    needs: [get-pr-number, get-pr-info]
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != '' }}
-    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
-    with:
-      commit_sha: ${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}
-      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
-      package: transformers
-      languages: ar de en es fr hi it ko pt tr zh ja te
-
-  update_run_status:
-    name: Update Check Run Status
-    needs: [ get-pr-info, create_run, build-doc ]
-    permissions:
-      statuses: write
-    if: ${{ always() && needs.create_run.result == 'success' }}
-    runs-on: ubuntu-22.04
-    env:
-      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-      STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.create_run.result) }}
-    steps:
-      - name: Get `build-doc` job status
-        run: |
-          echo "${{ needs.build-doc.result }}"
-          echo $STATUS_OK
-          if [ "$STATUS_OK" = "true" ]; then
-            echo "STATUS=success" >> $GITHUB_ENV
-          else
-            echo "STATUS=failure" >> $GITHUB_ENV
-          fi
-
-      - name: Update PR commit statuses
-        run: |
-          echo "${{ needs.build-doc.result }}"
-          echo "${{ env.STATUS }}"
-          gh api \
-            --method POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            repos/${{ github.repository }}/statuses/${{ needs.get-pr-info.outputs.PR_HEAD_SHA }} \
-            -f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Custom doc building job" -f "context=custom-doc-build"
--- a/.github/workflows/pr_run_slow_ci.yml
+++ b/.github/workflows/pr_run_slow_ci.yml
@@ -16,6 +16,28 @@ jobs:
    with:
      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}

+  # We only need to verify the timestamp if the workflow is triggered by `issue_comment`.
+  verity_pr_commit:
+    name: Verity PR commit corresponds to a specific event by comparing timestamps
+    if: ${{ github.event.comment.created_at != '' }}
+    runs-on: ubuntu-22.04
+    needs: get-pr-info
+    env:
+      COMMENT_DATE: ${{ github.event.comment.created_at }}
+      PR_MERGE_COMMIT_DATE: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
+      PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
+    steps:
+      - run: |
+          COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
+          echo "COMMENT_DATE: $COMMENT_DATE"
+          echo "PR_MERGE_COMMIT_DATE: $PR_MERGE_COMMIT_DATE"
+          echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
+          echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
+          if [ $COMMENT_TIMESTAMP -le $PR_MERGE_COMMIT_TIMESTAMP ]; then
+            echo "Last commit on the pull request is newer than the issue comment triggering this run! Abort!";
+            exit -1;
+          fi
+
  get-jobs:
    name: Get test files to run
    runs-on: ubuntu-22.04
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@@ -29,7 +29,7 @@ jobs:
    runs-on: ubuntu-22.04
    name: Get PR number
    # For security: only allow team members to run
-    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
+    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
    outputs:
      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
    steps:
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -36,7 +36,7 @@ jobs:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-all-latest-gpu-push-ci
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      test_map: ${{ steps.set-matrix.outputs.test_map }}
@@ -136,7 +136,7 @@ jobs:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-all-latest-gpu-push-ci
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    env:
      # For the meaning of these environment variables, see the job `Setup`
      CI_BRANCH_PUSH: ${{ github.event.ref }}
@@ -362,7 +362,7 @@ jobs:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    env:
      # For the meaning of these environment variables, see the job `Setup`
      CI_BRANCH_PUSH: ${{ github.event.ref }}
--- a/.github/workflows/self-scheduled-amd-mi325-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi325-caller.yml
@@ -1,63 +0,0 @@
-name: Self-hosted runner scale set (AMD mi325 scheduled CI caller)
-
-# Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml
-# For example, 1gpu scale set: amd-mi325-ci-1gpu
-#              2gpu scale set: amd-mi325-ci-2gpu
-
-on:
-  workflow_run:
-    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
-    branches: ["main"]
-    types: [completed]
-  push:
-    branches:
-      - run_amd_scheduled_ci_caller*
-
-jobs:
-  model-ci:
-    name: Model CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:
-      job: run_models_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_scale_set: amd-mi325-ci
-      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi325
-      report_repo_id: optimum-amd/transformers_daily_ci
-    secrets: inherit
-
-  torch-pipeline:
-    name: Torch pipeline CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:
-      job: run_pipelines_torch_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_scale_set: amd-mi325-ci
-      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi325
-      report_repo_id: optimum-amd/transformers_daily_ci
-    secrets: inherit
-
-  example-ci:
-    name: Example CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:
-      job: run_examples_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_scale_set: amd-mi325-ci
-      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi325
-      report_repo_id: optimum-amd/transformers_daily_ci
-    secrets: inherit
-
-  deepspeed-ci:
-    name: DeepSpeed CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:
-      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_scale_set: amd-mi325-ci
-      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi325
-      report_repo_id: optimum-amd/transformers_daily_ci
-    secrets: inherit
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -55,7 +55,7 @@ jobs:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
@@ -219,7 +219,7 @@ jobs:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ja.md">日本語</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_hd.md">हिन्दी</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ru.md">Русский</a> |
-        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Português</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Рortuguês</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_te.md">తెలుగు</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
@@ -242,7 +242,7 @@ pipeline(

 - This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
 - The training API is optimized to work with PyTorch models provided by Transformers. For generic machine learning loops, you should use another library like [Accelerate](https://huggingface.co/docs/accelerate).
- The [example scripts](https://github.com/huggingface/transformers/tree/main/examples) are only *examples*. They may not necessarily work out-of-the-box on your specific use case and you'll need to adapt the code for it to work.
+- The [example scripts]((https://github.com/huggingface/transformers/tree/main/examples)) are only *examples*. They may not necessarily work out-of-the-box on your specific use case and you'll need to adapt the code for it to work.

 ## 100 projects using Transformers

@@ -280,8 +280,8 @@ Expand each modality below to see a few example models for various use cases.
 - Automatic mask generation with [SAM](https://huggingface.co/facebook/sam-vit-base)
 - Depth estimation with [DepthPro](https://huggingface.co/apple/DepthPro-hf)
 - Image classification with [DINO v2](https://huggingface.co/facebook/dinov2-base)
- Keypoint detection with [SuperPoint](https://huggingface.co/magic-leap-community/superpoint)
- Keypoint matching with [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor)
+- Keypoint detection with [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor)
+- Keypoint matching with [SuperGlue](https://huggingface.co/magic-leap-community/superglue)
 - Object detection with [RT-DETRv2](https://huggingface.co/PekingU/rtdetr_v2_r50vd)
 - Pose Estimation with [VitPose](https://huggingface.co/usyd-community/vitpose-base-simple)
 - Universal segmentation with [OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_swin_large)
--- a/conftest.py
+++ b/conftest.py
@@ -23,12 +23,13 @@ from os.path import abspath, dirname, join
 import _pytest
 import pytest

-from transformers.testing_utils import HfDoctestModule, HfDocTestParser, is_torch_available
+from transformers.testing_utils import HfDoctestModule, HfDocTestParser


 NOT_DEVICE_TESTS = {
    "test_tokenization",
    "test_tokenization_mistral_common",
+    "test_processor",
    "test_processing",
    "test_beam_constraints",
    "test_configuration_utils",
@@ -127,10 +128,3 @@ class CustomOutputChecker(OutputChecker):
 doctest.OutputChecker = CustomOutputChecker
 _pytest.doctest.DoctestModule = HfDoctestModule
 doctest.DocTestParser = HfDocTestParser
-
-if is_torch_available():
-    import torch
-
-    # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
-    # We set it to `False` for CI. See https://github.com/pytorch/pytorch/issues/157274#issuecomment-3090791615
-    torch.backends.cudnn.allow_tf32 = False
--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@@ -1,8 +1,11 @@
-FROM rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1
+FROM rocm/pytorch:rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.6.0
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive

+ARG TORCH_VISION='0.21.0'
+ARG TORCH_AUDIO='2.6.0'
+
 RUN apt update && \
    apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip python3-dev ffmpeg git-lfs && \
    apt clean && \
@@ -20,12 +23,9 @@ WORKDIR /
 ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

-# On ROCm, torchcodec is required to decode audio files
-# RUN python3 -m pip install --no-cache-dir torchcodec
-# Install transformers
-RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video,audio]
+RUN python3 -m pip install --no-cache-dir torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO
+RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]

-# Remove tensorflow and flax as they are no longer supported by transformers
 RUN python3 -m pip uninstall -y tensorflow flax

 # When installing in editable mode, `transformers` is not recognized as a package.
@@ -36,4 +36,4 @@ RUN cd transformers && python3 setup.py develop
 RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y

 # `kernels` may causes many failing tests
-RUN python3 -m pip uninstall -y kernels
+RUN python3 -m pip uninstall -y kernels
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@@ -78,10 +78,6 @@ RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submod
 # RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1
 # RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git

-# Add fp-quant for quantization testing
-# Requires py3.11 but our CI runs on 3.9
-# RUN python3 -m pip install --no-cache-dir "fp-quant>=0.1.6"
-
 # Add compressed-tensors for quantization testing
 RUN python3 -m pip install --no-cache-dir compressed-tensors

--- a/docs/source/ar/custom_models.md
+++ b/docs/source/ar/custom_models.md
@@ -280,7 +280,7 @@ resnet50d.model.load_state_dict(pretrained_model.state_dict())
 الآن لإرسال النموذج إلى Hub، تأكد من تسجيل الدخول. إما تشغيل في المحطة الأوامر الطرفية الخاصة بك:

 ```bash
-hf auth login
+huggingface-cli login
 ```

 أو من دفتر ملاحظات:
--- a/docs/source/ar/llm_tutorial_optimization.md
+++ b/docs/source/ar/llm_tutorial_optimization.md
@@ -13,11 +13,11 @@

 في هذا الدليل، سنستعرض التقنيات الفعالة لتُحسِّن من كفاءة نشر نماذج اللغة الكبيرة:

-1. سنتناول تقنية "دقة أقل" التي أثبتت الأبحاث فعاليتها في تحقيق مزايا حسابية دون التأثير بشكل ملحوظ على أداء النموذج عن طريق العمل بدقة رقمية أقل [8 بت و4 بت](/main_classes/quantization).
+1. سنتناول تقنية "دقة أقل" التي أثبتت الأبحاث فعاليتها في تحقيق مزايا حسابية دون التأثير بشكل ملحوظ على أداء النموذج عن طريق العمل بدقة رقمية أقل [8 بت و4 بت](/main_classes/quantization.md).

 2.  **اFlash Attention:** إن Flash Attention وهي نسخة مُعدَّلة من خوارزمية الانتباه التي لا توفر فقط نهجًا أكثر كفاءة في استخدام الذاكرة، ولكنها تحقق أيضًا كفاءة متزايدة بسبب الاستخدام الأمثل لذاكرة GPU.

-3.  **الابتكارات المعمارية:** حيث تم اقتراح هياكل متخصصة تسمح باستدلال أكثر فعالية نظرًا لأن نماذج اللغة الكبيرة يتم نشرها دائمًا بنفس الطريقة أثناء عملية الاستدلال، أي توليد النص التنبؤي التلقائي مع سياق الإدخال الطويل، فقد تم اقتراح بنيات نموذج متخصصة تسمح بالاستدلال الأكثر كفاءة. أهم تقدم في بنيات النماذج هنا هو [عذر](https://huggingface.co/papers/2108.12409)، [الترميز الدوار](https://huggingface.co/papers/2104.09864)، [الاهتمام متعدد الاستعلامات (MQA)](https://huggingface.co/papers/1911.02150) و [مجموعة الانتباه بالاستعلام (GQA)](https://huggingface.co/papers/2305.13245).
+3.  **الابتكارات المعمارية:** حيث تم اقتراح هياكل متخصصة تسمح باستدلال أكثر فعالية نظرًا لأن نماذج اللغة الكبيرة يتم نشرها دائمًا بنفس الطريقة أثناء عملية الاستدلال، أي توليد النص التنبؤي التلقائي مع سياق الإدخال الطويل، فقد تم اقتراح بنيات نموذج متخصصة تسمح بالاستدلال الأكثر كفاءة. أهم تقدم في بنيات النماذج هنا هو [عذر](https://huggingface.co/papers/2108.12409)، [الترميز الدوار](https://huggingface.co/papers/2104.09864)، [الاهتمام متعدد الاستعلامات (MQA)](https://huggingface.co/papers/1911.02150) و [مجموعة الانتباه بالاستعلام (GQA)]((https://huggingface.co/papers/2305.13245)).

 على مدار هذا الدليل، سنقدم تحليلًا للتوليد التنبؤي التلقائي من منظور المُوتِّرات. نتعمق في مزايا وعيوب استخدام دقة أقل، ونقدم استكشافًا شاملاً لخوارزميات الانتباه الأحدث، ونناقش بنيات نماذج نماذج اللغة الكبيرة المحسنة. سندعم الشرح بأمثلة عملية تُبرِز كل تحسين على حدة.

--- a/docs/source/ar/model_sharing.md
+++ b/docs/source/ar/model_sharing.md
@@ -41,7 +41,7 @@ picture-in-picture" allowfullscreen></iframe>
 قبل مشاركة نموذج على Hub، ستحتاج إلى بيانات اعتماد حساب Hugging Face الخاصة بك.  إذا كنت تستخدم منصة الأوامر، فقم بتشغيل الأمر التالي في بيئة افتراضية حيث تم تثبيت 🤗 Transformers. سيقوم هذا الأمر بتخزين رمز الدخول الخاص بك في مجلد تخزين المؤقت لـ Hugging Face (`~/.cache/` بشكل افتراضي):

 ```bash
-hf auth login
+huggingface-cli login
 ```

 إذا كنت تستخدم دفتر ملاحظات مثل Jupyter أو Colaboratory، فتأكد من تثبيت مكتبة [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library). تسمح لك هذه المكتبة بالتفاعل برمجيًا مع Hub.
--- a/docs/source/ar/run_scripts.md
+++ b/docs/source/ar/run_scripts.md
@@ -324,7 +324,7 @@ python examples/pytorch/summarization/run_summarization.py
 يمكن لجميع النصوص البرمجية رفع نموذجك النهائي إلى [مركز النماذج](https://huggingface.co/models). تأكد من تسجيل الدخول إلى Hugging Face قبل البدء:

 ```bash
-hf auth login
+huggingface-cli login
 ```

 ثم أضف المعلمة `push_to_hub` إلى النص البرمجي . ستقوم هذه المعلمة بإنشاء مستودع باستخدام اسم مستخدم Hugging Face واسم المجلد المحدد في `output_dir`.
--- a/docs/source/de/model_sharing.md
+++ b/docs/source/de/model_sharing.md
@@ -56,7 +56,7 @@ Dateien lassen sich auch in einem Repository leicht bearbeiten, und Sie können
 Bevor Sie ein Modell für den Hub freigeben, benötigen Sie Ihre Hugging Face-Anmeldedaten. Wenn Sie Zugang zu einem Terminal haben, führen Sie den folgenden Befehl in der virtuellen Umgebung aus, in der 🤗 Transformers installiert ist. Dadurch werden Ihre Zugangsdaten in Ihrem Hugging Face-Cache-Ordner (standardmäßig `~/.cache/`) gespeichert:

 ```bash
-hf auth login
+huggingface-cli login
 ```

 Wenn Sie ein Notebook wie Jupyter oder Colaboratory verwenden, stellen Sie sicher, dass Sie die [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) Bibliothek installiert haben. Diese Bibliothek ermöglicht Ihnen die programmatische Interaktion mit dem Hub.
--- a/docs/source/de/run_scripts.md
+++ b/docs/source/de/run_scripts.md
@@ -324,7 +324,7 @@ python examples/pytorch/summarization/run_summarization.py
 Alle Skripte können Ihr endgültiges Modell in den [Model Hub](https://huggingface.co/models) hochladen. Stellen Sie sicher, dass Sie bei Hugging Face angemeldet sind, bevor Sie beginnen:

 ```bash
-hf auth login
+huggingface-cli login
 ```

 Dann fügen Sie dem Skript das Argument `push_to_hub` hinzu. Mit diesem Argument wird ein Repository mit Ihrem Hugging Face-Benutzernamen und dem in `output_dir` angegebenen Ordnernamen erstellt.
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -72,6 +72,8 @@
      title: Caching
    - local: kv_cache
      title: KV cache strategies
+    - local: serving
+      title: Serving
    - local: llm_tutorial_optimization
      title: Getting the most out of LLMs
    - local: perplexity
@@ -89,18 +91,6 @@
    - local: chat_extras
      title: Tools and RAG
    title: Chat with models
-  - sections:
-      - local: serving
-        title: Serving LLMs, VLMs, and other chat-based models
-      - local: jan
-        title: Jan
-      - local: cursor
-        title: Cursor
-      - local: tiny_agents
-        title: Tiny-Agents CLI and MCP tools
-      - local: open_webui
-        title: Open WebUI
-    title: Serving
  - sections:
    - local: perf_torch_compile
      title: torch.compile
@@ -115,8 +105,6 @@
    title: Agents
  - local: tools
    title: Tools
-  - local: transformers_as_backend
-    title: Inference server backends
  title: Inference
 - isExpanded: false
  sections:
@@ -189,8 +177,6 @@
    title: FBGEMM
  - local: quantization/finegrained_fp8
    title: Fine-grained FP8
-  - local: quantization/fp_quant
-    title: FP-Quant
  - local: gguf
    title: GGUF
  - local: quantization/gptq
@@ -455,16 +441,10 @@
        title: Encoder Decoder Models
      - local: model_doc/ernie
        title: ERNIE
-      - local: model_doc/ernie4_5
-        title: Ernie4_5
-      - local: model_doc/ernie4_5_moe
-        title: Ernie4_5_MoE
      - local: model_doc/ernie_m
        title: ErnieM
      - local: model_doc/esm
        title: ESM
-      - local: model_doc/exaone4
-        title: EXAONE-4.0
      - local: model_doc/falcon
        title: Falcon
      - local: model_doc/falcon3
@@ -495,8 +475,6 @@
        title: GLM
      - local: model_doc/glm4
        title: glm4
-      - local: model_doc/glm4_moe
-        title: glm4_moe
      - local: model_doc/openai-gpt
        title: GPT
      - local: model_doc/gpt_neo
@@ -511,8 +489,6 @@
        title: GPT2
      - local: model_doc/gpt_bigcode
        title: GPTBigCode
-      - local: model_doc/gpt_oss
-        title: GptOss
      - local: model_doc/gptsan-japanese
        title: GPTSAN Japanese
      - local: model_doc/gpt-sw3
@@ -711,8 +687,6 @@
        title: XLM-V
      - local: model_doc/xlnet
        title: XLNet
-      - local: model_doc/xlstm
-        title: xLSTM
      - local: model_doc/yoso
        title: YOSO
      - local: model_doc/zamba
@@ -741,10 +715,6 @@
        title: DAB-DETR
      - local: model_doc/deepseek_v2
        title: DeepSeek-V2
-      - local: model_doc/deepseek_vl
-        title: DeepseekVL
-      - local: model_doc/deepseek_vl_hybrid
-        title: DeepseekVLHybrid
      - local: model_doc/deformable_detr
        title: Deformable DETR
      - local: model_doc/deit
@@ -771,8 +741,6 @@
        title: DPT
      - local: model_doc/efficientformer
        title: EfficientFormer
-      - local: model_doc/efficientloftr
-        title: EfficientLoFTR
      - local: model_doc/efficientnet
        title: EfficientNet
      - local: model_doc/eomt
@@ -983,8 +951,6 @@
        title: CLIPSeg
      - local: model_doc/clvp
        title: CLVP
-      - local: model_doc/cohere2_vision
-        title: Cohere2Vision
      - local: model_doc/colpali
        title: ColPali
      - local: model_doc/colqwen2
@@ -997,8 +963,6 @@
        title: Donut
      - local: model_doc/emu3
        title: Emu3
-      - local: model_doc/evolla
-        title: Evolla
      - local: model_doc/flava
        title: FLAVA
      - local: model_doc/gemma3
@@ -1063,8 +1027,6 @@
        title: Mistral3
      - local: model_doc/mllama
        title: mllama
-      - local: model_doc/mm-grounding-dino
-        title: MM Grounding DINO
      - local: model_doc/nougat
        title: Nougat
      - local: model_doc/omdet-turbo
--- a/docs/source/en/attention_interface.md
+++ b/docs/source/en/attention_interface.md
@@ -72,34 +72,6 @@ model(torch.ones(1, 5, dtype=int))
 and it will stop printing the statements, as it now uses the `sdpa` attention.  
 This allows to quickly change an attention function, without needing to reload the model!

-## Different attention per backbone in multimodal models
-
-For multimodal models different attention functions may work better for each backbone module. For example, some vision backbones perform better in fp32, but are incompatible with FlashAttention. To continue using FlashAttention while keeping the vision encoder in fp32, create a dict and map each config to an attention implementation as shown below.
-
-```python
-from transformers import AutoModelForImageTextToText
-
-model_id = "facebook/chameleon-7b"
-
-attention_implementation_per_backbone = {"vision_config": "sdpa", "text_config": "flash_attention_2"}
-model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation=attention_implementation_per_backbone)
-
-# NOTE: keys in the attention implementation have to be the same as the sub-config names
-for key in attention_implementation_per_backbone:
-    assert key in model.config.sub_configs, f"Invalid key in `attention_implementation`"
-
-# You can omit certain backbones - the default attention function (SDPA) will be used
-# This is equivalent to the previous example
-model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation={"text_config": "flash_attention_2"})
-
-
-# Set the same attention implementation for all backbones with single string, same as in non-multimodal models
-model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation="eager")
-
-# Alternatively use a dict with an empty key for global configuration
-model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation={"": "eager"})
-```
-
 ## What about new args needed in my custom attention function?

 But indeed, what if the new function requires a new arg to be properly used? It's no issue! Models supporting the
--- a/docs/source/en/cache_explanation.md
+++ b/docs/source/en/cache_explanation.md
@@ -82,18 +82,22 @@ When you use Transformers' [`Cache`] class, the self-attention module performs s

 ## Cache storage implementation

-Caches are structured as a list of layers, where each layer contains a key and value cache. The key and value caches are tensors with the shape `[batch_size, num_heads, seq_len, head_dim]`.
+The actual storage of key-value pairs varies between cache implementations. As an example, consider the [`DynamicCache`].

-Layers can be of different types (e.g. `DynamicLayer`, `StaticLayer`, `SlidingWindowLayer`), which mostly changes how sequence length is handled and how the cache is updated.

-The simplest is a `DynamicLayer` that grows as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token:
+In [`DynamicCache`], the key-value pairs are stored as two lists of tensors. Each tensor in the lists have the shape `[batch_size, num_heads, seq_len, head_dim]`.
+- `key_cache`: A list of tensors, one for each layer.
+- `value_cache`: A list of tensors, one for each layer.

+When new tokens are processed:
+
+1. For each layer, the new key and value states are concatenated with the existing cache.
 ```py
-cache.layers[idx].keys = torch.cat([cache.layers[idx].keys, key_states], dim=-2)
-cache.layers[idx].values = torch.cat([cache.layers[idx].values, value_states], dim=-2)
+self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
+self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
 ```

-Other layer types like `StaticLayer` and `SlidingWindowLayer` have a fixed sequence length that is set when the cache is created. This makes them compatible with `torch.compile`. In the case of `SlidingWindowLayer`, existing tokens are shifted out of the cache when a new token is added.
+2. The cache grows dynamically as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token.

 The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token.

@@ -128,34 +132,6 @@ for _ in range(max_new_tokens):
 print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
 "[INST] Hello, what's your name. [/INST]  Hello! My name is LLaMA,"
 ```
-
-## Cache position
-
-The cache position tracks where to insert new tokens in the attention cache. It represents the *absolute* position of each token in the context, independent of padding or batch structure. Suppose you already cached `N` tokens and are now processing `K` new tokens. The cache position for the new tokens will range from `N` to `N + K - 1`. In other words, you're processing tokens at positions - `[N, N + 1, N + 2, ..., N + K - 1]`.
-
-Cache position is used internally for two purposes:
-
-1. Selecting new tokens to process in the input sequence and ensuring only tokens that haven’t been cached yet are passed to the model's `forward`.
-2. Storing key/value pairs at the correct positions in the cache. This is especially important for fixed-size caches, like [`StaticCache`], that pre-allocates a specific cache length.
-
-The generation loop usually takes care of the cache position, but if you're writing a custom generation method, it is important that cache positions are accurate since they are used to write and read key/value states into fixed slots.
-
-
-```py
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
-
-model_id = "meta-llama/Llama-2-7b-chat-hf"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-messages = [{"role": "user", "content": "You are a helpful assistant."}]
-inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda:0")
-generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=10)
-
-```
-
-
 ## Legacy cache format

 Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format is dynamic because it grows as text is generated, similar to [`DynamicCache`].
@@ -181,4 +157,4 @@ generation_outputs = model.generate(**inputs, return_dict_in_generate=True, retu

 cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values)
 legacy_format_cache = cache.to_legacy_cache()
-```
+```
--- a/docs/source/en/chat_templating_multimodal.md
+++ b/docs/source/en/chat_templating_multimodal.md
@@ -111,7 +111,6 @@ Some vision models also support video inputs. The message format is very similar

 - The content `"type"` should be `"video"` to indicate the content is a video.
 - For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord).
- In addition to loading videos from a URL or file path, you can also pass decoded video data directly. This is useful if you’ve already preprocessed or decoded video frames elsewhere in memory (e.g., using OpenCV, decord, or torchvision). You don't need to save to files or store it in an URL.

 > [!WARNING]
 > Loading a video from `"url"` is only supported by the PyAV or Decord backends.
@@ -138,52 +137,6 @@ messages = [
 ]
 ```

-### Example: Passing decoded video objects
-```python
-import numpy as np
-
-video_object1 = np.random.randint(0, 255, size=(16, 224, 224, 3), dtype=np.uint8),
-
-messages = [
-    {
-        "role": "system",
-        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
-    },
-    {
-        "role": "user",
-        "content": [
-            {"type": "video", "video": video_object1},
-            {"type": "text", "text": "What do you see in this video?"}
-        ],
-    },
-]
-```
-You can also use existing (`"load_video()"`) function to load a video, edit the video in memory and pass it in the messages.
-```python
-
-# Make sure a video backend library (pyav, decord, or torchvision) is available.
-from transformers.video_utils import load_video
-
-# load a video file in memory for testing
-video_object2, _ = load_video(
-    "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"
-)
-
-messages = [
-    {
-        "role": "system",
-        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
-    },
-    {
-        "role": "user",
-        "content": [
-            {"type": "video", "video": video_object2},
-            {"type": "text", "text": "What do you see in this video?"}
-        ],
-    },
-]
-```
-
 Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content. There are a few extra parameters to include in [`~ProcessorMixin.apply_chat_template`] that controls the sampling process.

 The `video_load_backend` parameter refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html).
--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@@ -27,7 +27,7 @@ This guide shows you how to quickly start chatting with Transformers from the co

 ## chat CLI

-After you've [installed Transformers](./installation), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.
+After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.

 ```bash
 transformers chat Qwen/Qwen2.5-0.5B-Instruct
@@ -158,4 +158,4 @@ The easiest solution for improving generation speed is to either quantize a mode
 You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token per `forward` pass. This significantly alleviates the bandwidth bottleneck and improves generation speed.

 > [!TIP]
-> Parameters may not be active for every generated token in MoE models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe), and [DBRX](./model_doc/dbrx). As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because parameters become activated with each new speculated token.
+> Parameters may not be active for every generated token in MoE models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe.md), and [DBRX](./model_doc/dbrx). As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because parameters become activated with each new speculated token.
--- a/docs/source/en/cursor.md
+++ b/docs/source/en/cursor.md
@@ -1,42 +0,0 @@
-# Using Cursor as a client of transformers serve
-
-This example shows how to use `transformers serve` as a local LLM provider for [Cursor](https://cursor.com/), the popular IDE. In this particular case, requests to `transformers serve` will come from an external IP (Cursor's server IPs), which requires some additional setup. Furthermore, some of Cursor's requests require [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/CORS), which is disabled by default for security reasons.
-
-To launch a server with CORS enabled, run
-
-```shell
-transformers serve --enable-cors
-```
-
-You'll also need to expose your server to external IPs. A potential solution is to use [`ngrok`](https://ngrok.com/), which has a permissive free tier. After setting up your `ngrok` account and authenticating on your server machine, you run
-
-```shell
-ngrok http [port]
-```
-
-where `port` is the port used by `transformers serve` (`8000` by default). On the terminal where you launched `ngrok`, you'll see a https address in the "Forwarding" row, as in the image below. This is the address to send requests to.
-
-<h3 align="center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_ngrok.png"/>
-</h3>
-
-You're now ready to set things up on the app side! In Cursor, while you can't set a new provider, you can change the endpoint for OpenAI requests in the model selection settings. First, navigate to "Settings" > "Cursor Settings", "Models" tab, and expand the "API Keys" collapsible. To set your `transformers serve` endpoint, follow this order:
-1. Unselect ALL models in the list above (e.g. `gpt4`, ...);
-2. Add and select the model you want to use (e.g. `Qwen/Qwen3-4B`)
-3. Add some random text to OpenAI API Key. This field won't be used, but it can’t be empty;
-4. Add the https address from `ngrok` to the "Override OpenAI Base URL" field, appending `/v1` to the address (i.e. `https://(...).ngrok-free.app/v1`);
-5. Hit "Verify".
-
-After you follow these steps, your "Models" tab should look like the image below. Your server should also have received a few requests from the verification step.
-
-<h3 align="center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor.png"/>
-</h3>
-
-You are now ready to use your local model in Cursor! For instance, if you toggle the AI Pane, you can select the model you added and ask it questions about your local files.
-
-<h3 align="center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor_chat.png"/>
-</h3>
-
-
--- a/docs/source/en/custom_models.md
+++ b/docs/source/en/custom_models.md
@@ -271,7 +271,7 @@ The model is ready to be pushed to the Hub now. Log in to your Hugging Face acco
 <hfoption id="huggingface-CLI">

 ```bash
-hf auth login
+huggingface-cli login
 ```

 </hfoption>
--- a/docs/source/en/internal/generation_utils.md
+++ b/docs/source/en/internal/generation_utils.md
@@ -356,93 +356,66 @@ A [`Constraint`] can be used to force the generation to include specific tokens

 ## Caches

-[[autodoc]] CacheLayerMixin
-    - update
-    - get_seq_length
-    - get_mask_sizes
-    - get_max_cache_shape
-    - reset
-    - reorder_cache
-
-[[autodoc]] DynamicLayer
-    - update
-    - crop
-    - batch_repeat_interleave
-    - batch_select_indices
-
-[[autodoc]] StaticLayer
-    - update
-
-[[autodoc]] SlidingWindowLayer
-    - update
-
-[[autodoc]] CacheProcessor
-    - pre_update
-    - post_update
-
-[[autodoc]] OffloadedCacheProcessor
-    - pre_update
-
-[[autodoc]] QuantizedCacheProcessor
-    - post_update
-
-[[autodoc]] QuantoQuantizedCacheProcessor
-    - post_update
-
-[[autodoc]] HQQQuantizedCacheProcessor
-    - post_update
-
 [[autodoc]] Cache
    - update
-    - get_seq_length
-    - get_mask_sizes
-    - get_max_cache_shape
-    - reset
-    - reorder_cache
-    - crop
-    - batch_repeat_interleave
-    - batch_select_indices
+
+[[autodoc]] CacheConfig
+	- update
+
+[[autodoc]] QuantizedCacheConfig
+	- validate

 [[autodoc]] DynamicCache
+    - update
+    - get_seq_length
+    - reorder_cache
    - to_legacy_cache
    - from_legacy_cache

 [[autodoc]] QuantizedCache
+    - update
+    - get_seq_length

 [[autodoc]] QuantoQuantizedCache

-[[autodoc]] QuantoQuantizedCacheProcessor
-
 [[autodoc]] HQQQuantizedCache

-[[autodoc]] HQQQuantizedCacheProcessor
-
 [[autodoc]] OffloadedCache
+    - update
+    - prefetch_layer
+    - evict_previous_layer

 [[autodoc]] StaticCache
+    - update
+    - get_seq_length
+    - reset

 [[autodoc]] OffloadedStaticCache
+    - update
+    - get_seq_length
+    - reset

 [[autodoc]] HybridCache
-
-[[autodoc]] HybridChunkedCache
+    - update
+    - get_seq_length
+    - reset

 [[autodoc]] SlidingWindowCache
+    - update
+    - reset

 [[autodoc]] EncoderDecoderCache
+    - get_seq_length
    - to_legacy_cache
    - from_legacy_cache
+    - reset
+    - reorder_cache

 [[autodoc]] MambaCache
    - update_conv_state
    - update_ssm_state
    - reset

-[[autodoc]] CacheConfig
-
-[[autodoc]] QuantizedCacheConfig
-
-
 ## Watermark Utils

 [[autodoc]] WatermarkingConfig
--- a/docs/source/en/jan.md
+++ b/docs/source/en/jan.md
@@ -1,32 +0,0 @@
-# Jan: using the serving API as a local LLM provider
-
-This example shows how to use `transformers serve` as a local LLM provider for the [Jan](https://jan.ai/) app. Jan is a ChatGPT-alternative graphical interface, fully running on your machine. The requests to `transformers serve` come directly from the local app -- while this section focuses on Jan, you can extrapolate some instructions to other apps that make local requests.
-
-## Running models locally
-
-To connect `transformers serve` with Jan, you'll need to set up a new model provider ("Settings" > "Model Providers"). Click on "Add Provider", and set a new name. In your new model provider page, all you need to set is the "Base URL" to the following pattern:
-
-```shell
-http://[host]:[port]/v1
-```
-
-where `host` and `port` are the `transformers serve` CLI parameters (`localhost:8000` by default). After setting this up, you should be able to see some models in the "Models" section, hitting "Refresh". Make sure you add some text in the "API key" text field too -- this data is not actually used, but the field can't be empty. Your custom model provider page should look like this:
-
-<h3 align="center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_jan_model_providers.png"/>
-</h3>
-
-You are now ready to chat!
-
-> [!TIP]
-> You can add any `transformers`-compatible model to Jan through `transformers serve`. In the custom model provider you created, click on the "+" button in the "Models" section and add its Hub repository name, e.g. `Qwen/Qwen3-4B`.
-
-## Running models on a separate machine
-
-To conclude this example, let's look into a more advanced use-case. If you have a beefy machine to serve models with, but prefer using Jan on a different device, you need to add port forwarding. If you have `ssh` access from your Jan machine into your server, this can be accomplished by typing the following to your Jan machine's terminal
-
-```
-ssh -N -f -L 8000:localhost:8000 your_server_account@your_server_IP -p port_to_ssh_into_your_server
-```
-
-Port forwarding is not Jan-specific: you can use it to connect `transformers serve` running in a different machine with an app of your choice.
--- a/docs/source/en/kv_cache.md
+++ b/docs/source/en/kv_cache.md
@@ -134,7 +134,7 @@ The [`QuantizedCache`] reduces memory requirements by quantizing the KV values t
 > [!WARNING]
 > Quantizing the cache can harm latency if the context length is short and there is enough GPU memory available for generation without enabling cache quantization. Try to find a balance between memory efficiency and latency.

-Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [`GenerationConfig`], and the quantization backend, as well as any additional quantization related parameters should also be passed either as a dict. You should use the default values for these additional parameters unless you're running out-of-memory. In that case, consider decreasing the residual length.
+Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [`GenerationConfig`], and indicate the quantization backend in [`QuantizedCacheConfig`]. Any additional quantization related parameters should also be passed either as a dict or an instance of [`QuantizedCacheConfig`]. You should use the default values for these additional parameters unless you're running out-of-memory. In that case, consider decreasing the residual length.

 <hfoptions id="quantized-cache">
 <hfoption id="HQQQuantizedCache">
@@ -143,7 +143,7 @@ For [`HQQQuantizedCache`], we recommend setting the `axis-key` and `axis-value`

 ```py
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache
+from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache, QuantizedCacheConfig

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
@@ -161,7 +161,7 @@ For [`QuantoQuantizedCache`], we recommend setting the `axis-key` and `axis-valu

 ```py
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache
+from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache, QuantizedCacheConfig

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
@@ -275,6 +275,7 @@ from transformers.cache_utils import (
    StaticCache,
    SlidingWindowCache,
    QuantoQuantizedCache,
+    QuantizedCacheConfig,
 )

 model_id = "meta-llama/Llama-2-7b-chat-hf"
--- a/docs/source/en/llm_optims.md
+++ b/docs/source/en/llm_optims.md
@@ -341,7 +341,7 @@ A known issue with transformer models is that the self-attention mechanism grows

 FlashAttention and [FlashAttention-2](./perf_infer_gpu_one#flashattention-2) break up the attention computation into smaller chunks and reduces the number of intermediate read/write operations to the GPU memory to speed up inference. FlashAttention-2 improves on the original FlashAttention algorithm by also parallelizing over sequence length dimension and better partitioning work on the hardware to reduce synchronization and communication overhead.

-To use FlashAttention-2, set [attn_implementation](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.PreTrainedModel.from_pretrained.attn_implementation) to `"flash_attention_2"` in [`~PreTrainedModel.from_pretrained`] or set with `model.set_attention_implementation("flash_attention_2")` to dynamically update the [attention interface](./attention_interface) after the model is loaded.
+To use FlashAttention-2, set [attn_implementation](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.PreTrainedModel.from_pretrained.attn_implementation) to `"flash_attention_2"` in [`~PreTrainedModel.from_pretrained`].

 ```py
 from transformers import AutoModelForCausalLM, BitsAndBytesConfig
@@ -353,14 +353,6 @@ model = AutoModelForCausalLM.from_pretrained(
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
 )
-
-# Change the model's attention dynamically after loading
-model = AutoModelForCausalLM.from_pretrained(
-    "google/gemma-2b",
-    quantization_config=quant_config,
-    torch_dtype=torch.bfloat16
-)
-model.set_attention_implementation("flash_attention_2")
 ```

 ### PyTorch scaled dot product attention
@@ -368,7 +360,7 @@ model.set_attention_implementation("flash_attention_2")
 Scaled dot product attention (SDPA) is automatically enabled in PyTorch 2.0 and it supports FlashAttention, xFormers, and PyTorch's C++ implementation. SDPA chooses the most performant attention algorithm if you're using a CUDA backend. For other backends, SDPA defaults to the PyTorch C++ implementation.

 > [!TIP]
-> SDPA automatically supports FlashAttention-2 as long as you have the latest PyTorch version installed.
+> SDPA automaticallysupports FlashAttention-2 as long as you have the latest PyTorch version installed.

 Use the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to explicitly enable or disable any of the four attention algorithms. For example, use `SDPBackend.FLASH_ATTENTION` to enable FlashAttention.

--- a/docs/source/en/llm_tutorial.md
+++ b/docs/source/en/llm_tutorial.md
@@ -148,9 +148,9 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
 | Option name | Type | Simplified description |
 |---|---|---|
 | `max_new_tokens` | `int` | Controls the maximum generation length. Be sure to define it, as it usually defaults to a small value. |
-| `do_sample` | `bool` | Defines whether generation will sample the next token (`True`), or is greedy instead (`False`). Most use cases should set this flag to `True`. Check [this guide](./generation_strategies) for more information. |
+| `do_sample` | `bool` | Defines whether generation will sample the next token (`True`), or is greedy instead (`False`). Most use cases should set this flag to `True`. Check [this guide](./generation_strategies.md) for more information. |
 | `temperature` | `float` | How unpredictable the next selected token will be. High values (`>0.8`) are good for creative tasks, low values (e.g. `<0.4`) for tasks that require "thinking". Requires `do_sample=True`. |
-| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies) for more information. |
+| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies.md) for more information. |
 | `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
 | `eos_token_id` | `list[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |

--- a/docs/source/en/llm_tutorial_optimization.md
+++ b/docs/source/en/llm_tutorial_optimization.md
@@ -23,11 +23,11 @@ The crux of these challenges lies in augmenting the computational and memory cap

 In this guide, we will go over the effective techniques for efficient LLM deployment:

-1.  **Lower Precision:** Research has shown that operating at reduced numerical precision, namely [8-bit and 4-bit](./main_classes/quantization) can achieve computational advantages without a considerable decline in model performance.
+1.  **Lower Precision:** Research has shown that operating at reduced numerical precision, namely [8-bit and 4-bit](./main_classes/quantization.md) can achieve computational advantages without a considerable decline in model performance.

 2.  **Flash Attention:** Flash Attention is a variation of the attention algorithm that not only provides a more memory-efficient approach but also realizes increased efficiency due to optimized GPU memory utilization.

-3.  **Architectural Innovations:** Considering that LLMs are always deployed in the same way during inference, namely autoregressive text generation with a long input context, specialized model architectures have been proposed that allow for more efficient inference. The most important advancement in model architectures hereby are [Alibi](https://huggingface.co/papers/2108.12409), [Rotary embeddings](https://huggingface.co/papers/2104.09864), [Multi-Query Attention (MQA)](https://huggingface.co/papers/1911.02150) and [Grouped-Query-Attention (GQA)](https://huggingface.co/papers/2305.13245).
+3.  **Architectural Innovations:** Considering that LLMs are always deployed in the same way during inference, namely autoregressive text generation with a long input context, specialized model architectures have been proposed that allow for more efficient inference. The most important advancement in model architectures hereby are [Alibi](https://huggingface.co/papers/2108.12409), [Rotary embeddings](https://huggingface.co/papers/2104.09864), [Multi-Query Attention (MQA)](https://huggingface.co/papers/1911.02150) and [Grouped-Query-Attention (GQA)]((https://huggingface.co/papers/2305.13245)).

 Throughout this guide, we will offer an analysis of auto-regressive generation from a tensor's perspective. We delve into the pros and cons of adopting lower precision, provide a comprehensive exploration of the latest attention algorithms, and discuss improved LLM architectures. While doing so, we run practical examples showcasing each of the feature improvements.

--- a/docs/source/en/main_classes/callback.md
+++ b/docs/source/en/main_classes/callback.md
@@ -33,7 +33,6 @@ By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] wi
  it's the second one).
 - [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
  or tensorboardX).
- [`~integrations.TrackioCallback`] if [trackio](https://github.com/gradio-app/trackio) is installed.
 - [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
 - [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed.
 - [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
@@ -73,9 +72,6 @@ Here is the list of the available [`TrainerCallback`] in the library:

 [[autodoc]] integrations.TensorBoardCallback

-[[autodoc]] integrations.TrackioCallback
-    - setup
-
 [[autodoc]] integrations.WandbCallback
    - setup

--- a/docs/source/en/main_classes/quantization.md
+++ b/docs/source/en/main_classes/quantization.md
@@ -65,10 +65,6 @@ Learn how to quantize models in the [Quantization](../quantization) guide.

 [[autodoc]] HqqConfig

-## Mxfp4Config
-
-[[autodoc]] Mxfp4Config
-
 ## FbgemmFp8Config

 [[autodoc]] FbgemmFp8Config
@@ -97,10 +93,6 @@ Learn how to quantize models in the [Quantization](../quantization) guide.

 [[autodoc]] QuarkConfig

-## FPQuantConfig
-
-[[autodoc]] FPQuantConfig
-
 ## AutoRoundConfig

 [[autodoc]] AutoRoundConfig
--- a/docs/source/en/model_doc/auto.md
+++ b/docs/source/en/model_doc/auto.md
@@ -258,10 +258,6 @@ The following auto classes are available for the following computer vision tasks

 [[autodoc]] AutoModelForKeypointDetection

-### AutoModelForKeypointMatching
-
-[[autodoc]] AutoModelForKeypointMatching
-
 ### AutoModelForMaskedImageModeling

 [[autodoc]] AutoModelForMaskedImageModeling
--- a/docs/source/en/model_doc/barthez.md
+++ b/docs/source/en/model_doc/barthez.md
@@ -14,81 +14,49 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-        ">
-    </div>
-</div>
-
 # BARThez

-[BARThez](https://huggingface.co/papers/2010.12321) is a [BART](./bart) model designed for French language tasks. Unlike existing French BERT models, BARThez includes a pretrained encoder-decoder, allowing it to generate text as well. This model is also available as a multilingual variant, mBARThez, by continuing pretraining multilingual BART on a French corpus.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+">
+</div>

-You can find all of the original BARThez checkpoints under the [BARThez](https://huggingface.co/collections/dascim/barthez-670920b569a07aa53e3b6887) collection.
+## Overview

-> [!TIP]
-> This model was contributed by [moussakam](https://huggingface.co/moussakam).
-> Refer to the [BART](./bart) docs for more usage examples.
+The BARThez model was proposed in [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://huggingface.co/papers/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis on 23 Oct,
+2020.
+
+The abstract of the paper:


-The example below demonstrates how to predict the `<mask>` token with [`Pipeline`], [`AutoModel`], and from the command line.
+*Inductive transfer learning, enabled by self-supervised learning, have taken the entire Natural Language Processing
+(NLP) field by storm, with models such as BERT and BART setting new state of the art on countless natural language
+understanding tasks. While there are some notable exceptions, most of the available models and research have been
+conducted for the English language. In this work, we introduce BARThez, the first BART model for the French language
+(to the best of our knowledge). BARThez was pretrained on a very large monolingual French corpus from past research
+that we adapted to suit BART's perturbation schemes. Unlike already existing BERT-based French language models such as
+CamemBERT and FlauBERT, BARThez is particularly well-suited for generative tasks, since not only its encoder but also
+its decoder is pretrained. In addition to discriminative tasks from the FLUE benchmark, we evaluate BARThez on a novel
+summarization dataset, OrangeSum, that we release with this paper. We also continue the pretraining of an already
+pretrained multilingual BART on BARThez's corpus, and we show that the resulting model, which we call mBARTHez,
+provides a significant boost over vanilla BARThez, and is on par with or outperforms CamemBERT and FlauBERT.*

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+This model was contributed by [moussakam](https://huggingface.co/moussakam). The Authors' code can be found [here](https://github.com/moussaKam/BARThez).

-```py
-import torch
-from transformers import pipeline
+<Tip> 

-pipeline = pipeline(
-    task="fill-mask",
-    model="moussaKam/barthez",
-    torch_dtype=torch.float16,
-    device=0
-)
-pipeline("Les plantes produisent <mask> grâce à un processus appelé photosynthèse.")
-```
+BARThez implementation is the same as BART, except for tokenization. Refer to [BART documentation](bart) for information on 
+configuration classes and their parameters. BARThez-specific tokenizers are documented below.  

-</hfoption>
-<hfoption id="AutoModel">
+</Tip>

-```py
-import torch
-from transformers import AutoModelForMaskedLM, AutoTokenizer
+## Resources

-tokenizer = AutoTokenizer.from_pretrained(
-    "moussaKam/barthez",
-)
-model = AutoModelForMaskedLM.from_pretrained(
-    "moussaKam/barthez",
-    torch_dtype=torch.float16,
-    device_map="auto",
-)
-inputs = tokenizer("Les plantes produisent <mask> grâce à un processus appelé photosynthèse.", return_tensors="pt").to("cuda")
+- BARThez can be fine-tuned on sequence-to-sequence tasks in a similar way as BART, check:
+  [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md).

-with torch.no_grad():
-    outputs = model(**inputs)
-    predictions = outputs.logits
-
-masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
-predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
-predicted_token = tokenizer.decode(predicted_token_id)
-
-print(f"The predicted token is: {predicted_token}")
-```
-
-</hfoption>
-<hfoption id="transformers CLI">
-
-```bash
-echo -e "Les plantes produisent <mask> grâce à un processus appelé photosynthèse." | transformers run --task fill-mask --model moussaKam/barthez --device 0
-```
-
-</hfoption>
-</hfoptions>

 ## BarthezTokenizer

--- a/docs/source/en/model_doc/clap.md
+++ b/docs/source/en/model_doc/clap.md
@@ -14,50 +14,25 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-  <div class="flex flex-wrap space-x-1">
-    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-  </div>
-</div>
-
 # CLAP

-[CLAP (Contrastive Language-Audio Pretraining)](https://huggingface.co/papers/2211.06687) is a multimodal model that combines audio data with natural language descriptions through contrastive learning.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

-It incorporates feature fusion and keyword-to-caption augmentation to process variable-length audio inputs and to improve performance. CLAP doesn't require task-specific training data and can learn meaningful audio representations through natural language.
+## Overview

-You can find all the original CLAP checkpoints under the [CLAP](https://huggingface.co/collections/laion/clap-contrastive-language-audio-pretraining-65415c0b18373b607262a490) collection.
+The CLAP model was proposed in [Large Scale Contrastive Language-Audio pretraining with
+feature fusion and keyword-to-caption augmentation](https://huggingface.co/papers/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.

-> [!TIP]
-> This model was contributed by [ybelkada](https://huggingface.co/ybelkada) and [ArthurZ](https://huggingface.co/ArthurZ).
->
-> Click on the CLAP models in the right sidebar for more examples of how to apply CLAP to different audio retrieval and classification tasks.
+CLAP (Contrastive Language-Audio Pretraining) is a neural network trained on a variety of (audio, text) pairs. It can be instructed in to predict the most relevant text snippet, given an audio, without directly optimizing for the task. The CLAP model uses a SWINTransformer to get audio features from a log-Mel spectrogram input, and a RoBERTa model to get text features. Both the text and audio features are then projected to a latent space with identical dimension. The dot product between the projected audio and text features is then used as a similar score.

-The example below demonstrates how to extract text embeddings with the [`AutoModel`] class.
+The abstract from the paper is the following:

-<hfoptions id="usage">
-<hfoption id="AutoModel">
+*Contrastive learning has shown remarkable success in the field of multimodal representation learning. In this paper, we propose a pipeline of contrastive language-audio pretraining to develop an audio representation by combining audio data with natural language descriptions. To accomplish this target, we first release LAION-Audio-630K, a large collection of 633,526 audio-text pairs from different data sources. Second, we construct a contrastive language-audio pretraining model by considering different audio encoders and text encoders. We incorporate the feature fusion mechanism and keyword-to-caption augmentation into the model design to further enable the model to process audio inputs of variable lengths and enhance the performance. Third, we perform comprehensive experiments to evaluate our model across three tasks: text-to-audio retrieval, zero-shot audio classification, and supervised audio classification. The results demonstrate that our model achieves superior performance in text-to-audio retrieval task. In audio classification tasks, the model achieves state-of-the-art performance in the zeroshot setting and is able to obtain performance comparable to models' results in the non-zero-shot setting. LAION-Audio-6*

-```python
-import torch
-from transformers import AutoTokenizer, AutoModel
-
-model = AutoModel.from_pretrained("laion/clap-htsat-unfused", torch_dtype=torch.float16, device_map="auto")
-tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
-
-texts = ["the sound of a cat", "the sound of a dog", "music playing"]
-
-inputs = tokenizer(texts, padding=True, return_tensors="pt").to("cuda")
-
-with torch.no_grad():
-    text_features = model.get_text_features(**inputs)
-
-print(f"Text embeddings shape: {text_features.shape}")
-print(f"Text embeddings: {text_features}")
-```
-
-</hfoption>
-</hfoptions>
+This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArthurZ) .
+The original code can be found [here](https://github.com/LAION-AI/Clap).

 ## ClapConfig

--- a/docs/source/en/model_doc/cohere2.md
+++ b/docs/source/en/model_doc/cohere2.md
@@ -1,115 +1,43 @@
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
-    </div>
+# Cohere
+
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

+## Overview
+[C4AI Command R7B](https://cohere.com/blog/command-r7b) is an open weights research release of a 7B billion parameter model developed by Cohere and Cohere For AI. It has advanced capabilities optimized for various use cases, including reasoning, summarization, question answering, and code. The model is trained to perform sophisticated tasks including Retrieval Augmented Generation (RAG) and tool use. The model also has powerful agentic capabilities that can use and combine multiple tools over multiple steps to accomplish more difficult tasks. It obtains top performance on enterprise-relevant code use cases. C4AI Command R7B is a multilingual model trained on 23 languages.

-# Cohere2
+The model features three layers with sliding window attention (window size 4096) and ROPE for efficient local context modeling and relative positional encoding. A fourth layer uses global attention without positional embeddings, enabling unrestricted token interactions across the entire sequence.

-[Cohere Command R7B](https://cohere.com/blog/command-r7b) is an open weights research release of a 7B billion parameter model. It is a multilingual model trained on 23 languages and has a context window of 128k. The model features three layers with sliding window attention and ROPE for efficient local context modeling and relative positional encoding. A fourth layer uses global attention without positional embeddings, enabling unrestricted token interactions across the entire sequence.
+The model has been trained on 23 languages: English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Arabic, Chinese, Russian, Polish, Turkish, Vietnamese, Dutch, Czech, Indonesian, Ukrainian, Romanian, Greek, Hindi, Hebrew, and Persian.

-This model is optimized for speed, cost-performance, and compute resources.
-
-You can find all the original Command-R checkpoints under the [Command Models](https://huggingface.co/collections/CohereForAI/command-models-67652b401665205e17b192ad) collection.
-
-
-> [!TIP]
-> Click on the Cohere models in the right sidebar for more examples of how to apply Cohere to different language tasks.
-
-The example below demonstrates how to generate text with [`Pipeline`] or the [`AutoModel`] class, and from the command line.
-
-<hfoptions id="usage">
-<hfoption id="Pipeline">
+## Usage tips
+The model and tokenizer can be loaded via:

 ```python
-import torch
-from transformers import pipeline
-
-pipeline = pipeline(
-    task="text-generation", 
-    model="CohereLabs/c4ai-command-r7b-12-2024",
-    torch_dtype=torch.float16,
-    device_map=0
-)
-
-messages = [
-    {"role": "user", "content": "Hello, can you please help me book a hotel in Japan?"},
-]
-pipeline(messages)
-```
-
-</hfoption>
-<hfoption id="AutoModel">
-
-```python
-import torch
+# pip install transformers
 from transformers import AutoTokenizer, AutoModelForCausalLM

-tokenizer = AutoTokenizer.from_pretrained("CohereLabs/c4ai-command-r7b-12-2024")
-model = AutoModelForCausalLM.from_pretrained(
-    "CohereLabs/c4ai-command-r7b-12-2024", 
-    torch_dtype=torch.float16, 
-    device_map="auto", 
-    attn_implementation="sdpa"
-)
+model_id = "CohereForAI/c4ai-command-r7b-12-2024"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id)

-# format message with the Command-R chat template
-messages = [{"role": "user", "content": "Hello, can you please help me book a hotel in Japan?"}]
-input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
-output = model.generate(
+# Format message with the command-r chat template
+messages = [{"role": "user", "content": "Hello, how are you?"}]
+input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
+
+gen_tokens = model.generate(
    input_ids,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.3,
-    cache_implementation="static",
-)
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-
-</hfoption>
-<hfoption id="transformers CLI">
-
-```bash
-# pip install -U flash-attn --no-build-isolation
-transformers-cli chat CohereLabs/c4ai-command-r7b-12-2024 --torch_dtype auto --attn_implementation flash_attention_2
-```
-
-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview.md) overview for more available quantization backends.
-
-The example below uses [bitsandbytes](../quantization/bitsandbytes.md) to quantize the weights to 4-bits.
-
-```python
-import torch
-from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
-
-bnb_config = BitsAndBytesConfig(load_in_4bit=True)
-tokenizer = AutoTokenizer.from_pretrained("CohereLabs/c4ai-command-r7b-12-2024")
-model = AutoModelForCausalLM.from_pretrained(
-    "CohereLabs/c4ai-command-r7b-12-2024", 
-    torch_dtype=torch.float16, 
-    device_map="auto", 
-    quantization_config=bnb_config, 
-    attn_implementation="sdpa"
 )

-# format message with the Command-R chat template
-messages = [{"role": "user", "content": "Hello, can you please help me book a hotel in Japan?"}]
-input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
-output = model.generate(
-    input_ids,
-    max_new_tokens=100,
-    do_sample=True,
-    temperature=0.3,
-    cache_implementation="static",
-)
-print(tokenizer.decode(output[0], skip_special_tokens=True))
+gen_text = tokenizer.decode(gen_tokens[0])
+print(gen_text)
 ```

 ## Cohere2Config
--- a/docs/source/en/model_doc/cohere2_vision.md
+++ b/docs/source/en/model_doc/cohere2_vision.md
@@ -1,123 +0,0 @@
-# Command A Vision
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
-</div>
-
-## Overview
-
-Command A Vision is a state-of-the-art multimodal model designed to seamlessly integrate visual and textual information for a wide range of applications. By combining advanced computer vision techniques with natural language processing capabilities, Command A Vision enables users to analyze, understand, and generate insights from both visual and textual data.
-
-The model excels at tasks including image captioning, visual question answering, document understanding, and chart understanding. This makes it a versatile tool for AI practitioners. Its ability to process complex visual and textual inputs makes it useful in settings where text-only representations are imprecise or unavailable, like real-world image understanding and graphics-heavy document processing.
-
-Command A Vision is built upon a robust architecture that leverages the latest advancements in VLMs. It's highly performant and efficient, even when dealing with large-scale datasets. The model's flexibility makes it suitable for a wide range of use cases, from content moderation and image search to medical imaging analysis and robotics.
-
-## Usage tips
-
-The model and image processor can be loaded as follows:
-
-<hfoptions id="usage">
-<hfoption id="AutoModel">
-
-```python
-import torch
-
-from transformers import AutoProcessor, AutoModelForImageTextToText
-
-model_id = "CohereLabs/command-a-vision-07-2025"
-
-processor = AutoProcessor.from_pretrained(model_id)
-model = AutoModelForImageTextToText.from_pretrained(
-    model_id, device_map="auto", torch_dtype=torch.float16
-)
-
-# Format message with the Command-A-Vision chat template
-messages = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "image",
-                "url": "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg",
-            },
-            {"type": "text", "text": "what is in this image?"},
-        ],
-    },
-]
-
-inputs = processor.apply_chat_template(
-    messages,
-    padding=True,
-    add_generation_prompt=True,
-    tokenize=True,
-    return_dict=True,
-    return_tensors="pt",
-).to(model.device)
-
-gen_tokens = model.generate(
-    **inputs,
-    max_new_tokens=300,
-    do_sample=True,
-    temperature=0.3,
-)
-
-print(
-    processor.tokenizer.decode(
-        gen_tokens[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
-    )
-)
-```
-
-</hfoption>
-<hfoption id="Pipeline">
-
-```python
-from transformers import pipeline
-
-pipe = pipeline(model="CohereLabs/command-a-vision-07-2025", task="image-text-to-text", device_map="auto")
-
-messages = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "image",
-                "url": "https://media.istockphoto.com/id/458012057/photo/istanbul-turkey.jpg?s=612x612&w=0&k=20&c=qogAOVvkpfUyqLUMr_XJQyq-HkACXyYUSZbKhBlPrxo=",
-            },
-            {"type": "text", "text": "Where was this taken ?"},
-        ],
-    },
-]
-
-outputs = pipe(text=messages, max_new_tokens=300, return_full_text=False)
-
-print(outputs)
-```
-</hfoption>
-</hfoptions>
-
-## Cohere2VisionConfig
-
-[[autodoc]] Cohere2VisionConfig
-
-## Cohere2VisionForConditionalGeneration
-
-[[autodoc]] Cohere2VisionForConditionalGeneration
-    - forward
-
-## Cohere2VisionModel
-
-[[autodoc]] Cohere2VisionModel
-    - forward
-
-## Cohere2VisionImageProcessorFast
-
-[[autodoc]] Cohere2VisionImageProcessorFast
-    - preprocess
-
-## Cohere2VisionProcessor
-
-[[autodoc]] Cohere2VisionProcessor
--- a/docs/source/en/model_doc/colpali.md
+++ b/docs/source/en/model_doc/colpali.md
@@ -95,7 +95,7 @@ images = [

 Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

-The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to int4.
+The example below uses [bitsandbytes](../quantization/bitsandbytes.md) to quantize the weights to int4.

 ```python
 import requests
--- a/docs/source/en/model_doc/colqwen2.md
+++ b/docs/source/en/model_doc/colqwen2.md
@@ -99,7 +99,7 @@ images = [

 Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

-The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to int4.
+The example below uses [bitsandbytes](../quantization/bitsandbytes.md) to quantize the weights to int4.

 ```python
 import requests
--- a/docs/source/en/model_doc/csm.md
+++ b/docs/source/en/model_doc/csm.md
@@ -21,7 +21,7 @@ rendered properly in your Markdown viewer.
 The Conversational Speech Model (CSM) is the first open-source contextual text-to-speech model [released by Sesame](https://www.sesame.com/research/crossing_the_uncanny_valley_of_voice). It is designed to generate natural-sounding speech with or without conversational context. This context typically consists of multi-turn dialogue between speakers, represented as sequences of text and corresponding spoken audio.

 **Model Architecture:**
-CSM is composed of two LLaMA-style auto-regressive transformer decoders: a backbone decoder that predicts the first codebook token and a depth decoder that generates the remaining tokens. It uses the pretrained codec model [Mimi](./mimi), introduced by Kyutai, to encode speech into discrete codebook tokens and decode them back into audio.
+CSM is composed of two LLaMA-style auto-regressive transformer decoders: a backbone decoder that predicts the first codebook token and a depth decoder that generates the remaining tokens. It uses the pretrained codec model [Mimi](./mimi.md), introduced by Kyutai, to encode speech into discrete codebook tokens and decode them back into audio.

 The original csm-1b checkpoint is available under the [Sesame](https://huggingface.co/sesame/csm-1b) organization on Hugging Face.

--- a/docs/source/en/model_doc/deepseek_vl.md
+++ b/docs/source/en/model_doc/deepseek_vl.md
@@ -1,224 +0,0 @@
-<!--Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
-
-# DeepseekVL
-
-[Deepseek-VL](https://arxiv.org/abs/2403.05525) was introduced by the DeepSeek AI team. It is a vision-language model (VLM) designed to process both text and images for generating contextually relevant responses. The model leverages [LLaMA](./llama) as its text encoder, while [SigLip](./siglip) is used for encoding images.
-
-You can find all the original Deepseek-VL checkpoints under the [DeepSeek-community](https://huggingface.co/deepseek-community) organization.
-
-> [!TIP]
-> Click on the Deepseek-VL models in the right sidebar for more examples of how to apply Deepseek-VL to different vision and language tasks.
-
-The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
-
-<hfoptions id="usage">
-<hfoption id="Pipeline">
-
-```py
-import torch
-from transformers import pipeline
-
-pipe = pipeline(
-    task="image-text-to-text",
-    model="deepseek-community/deepseek-vl-1.3b-chat",
-    device=0,
-    torch_dtype=torch.float16
-)
-
-messages = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "image",
-                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
-            },
-            { "type": "text", "text": "Describe this image."},
-        ]
-    }
-]
-
-pipe(text=messages, max_new_tokens=20, return_full_text=False)
-```
-</hfoption>
-
-<hfoption id="AutoModel">
-
-```py
-import torch
-from transformers import DeepseekVLForConditionalGeneration, AutoProcessor
-
-model = DeepseekVLForConditionalGeneration.from_pretrained(
-    "deepseek-community/deepseek-vl-1.3b-chat",
-    torch_dtype=torch.float16,
-    device_map="auto",
-    attn_implementation="sdpa"
-)
-
-processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-1.3b-chat")
-
-messages = [
-    {
-        "role":"user",
-        "content":[
-            {
-                "type":"image",
-                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
-            },
-            {
-                "type":"text",
-                "text":"Describe this image."
-            }
-        ]
-    }
-
-]
-
-inputs = processor.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    tokenize=True,
-    return_dict=True,
-    return_tensors="pt"
-).to(model.device, dtype=model.dtype)
-
-generated_ids = model.generate(**inputs, max_new_tokens=128)
-generated_ids_trimmed = [
-    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-]
-output_text = processor.batch_decode(
-    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)
-
-print(output_text)
-```
-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
-The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.
-
-```python
-import torch
-from transformers import TorchAoConfig, DeepseekVLForConditionalGeneration, AutoProcessor
-
-quantization_config = TorchAoConfig(
-    "int4_weight_only",
-    group_size=128
-)
-
-model = DeepseekVLForConditionalGeneration.from_pretrained(
-    "deepseek-community/deepseek-vl-1.3b-chat",
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    quantization_config=quantization_config
-)
-```
-### Notes
-
- Do inference with multiple images in a single conversation.
-    ```py
-    import torch
-    from transformers import DeepseekVLForConditionalGeneration, AutoProcessor
-
-    model = DeepseekVLForConditionalGeneration.from_pretrained(
-        "deepseek-community/deepseek-vl-1.3b-chat",
-        torch_dtype=torch.float16,
-        device_map="auto",
-        attn_implementation="sdpa"
-    )
-
-    processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-1.3b-chat")
-
-    messages = [
-        [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "What’s the difference between"},
-                    {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
-                    {"type": "text", "text": " and "},
-                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
-                ]
-            }
-        ],
-        [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "url": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"},
-                    {"type": "text", "text": "What do you see in this image?"}
-                ]
-            }
-        ]
-    ]
-
-    inputs = processor.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        padding=True,
-        truncation=True,
-        tokenize=True,
-        return_dict=True,
-        return_tensors="pt"
-    ).to(model.device, dtype=model.dtype)
-
-    generated_ids = model.generate(**inputs, max_new_tokens=128)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-
-    print(output_text)
-    ```
-
-## DeepseekVLConfig
-
-[[autodoc]] DeepseekVLConfig
-
-## DeepseekVLProcessor
-
-[[autodoc]] DeepseekVLProcessor
-
-## DeepseekVLImageProcessor
-
-[[autodoc]] DeepseekVLImageProcessor
-
-## DeepseekVLImageProcessorFast
-
-[[autodoc]] DeepseekVLImageProcessorFast
-
-## DeepseekVLModel
-
-[[autodoc]] DeepseekVLModel
-    - forward
-
-## DeepseekVLForConditionalGeneration
-
-[[autodoc]] DeepseekVLForConditionalGeneration
-    - forward
--- a/docs/source/en/model_doc/deepseek_vl_hybrid.md
+++ b/docs/source/en/model_doc/deepseek_vl_hybrid.md
@@ -1,223 +0,0 @@
-<!--Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
-
-# DeepseekVLHybrid
-
-[Deepseek-VL-Hybrid](https://arxiv.org/abs/2403.05525) was introduced by the DeepSeek AI team. It is a vision-language model (VLM) designed to process both text and images for generating contextually relevant responses. The model leverages [LLaMA](./llama) as its text encoder, while [SigLip](./siglip) is used for encoding low-resolution images and [SAM (Segment Anything Model)](./sam) is incorporated to handle high-resolution image encoding, enhancing the model’s ability to process fine-grained visual details. Deepseek-VL-Hybrid is a variant of Deepseek-VL that uses [SAM (Segment Anything Model)](./sam) to handle high-resolution image encoding.
-
-You can find all the original Deepseek-VL-Hybrid checkpoints under the [DeepSeek-community](https://huggingface.co/deepseek-community) organization.
-
-> [!TIP]
-> Click on the Deepseek-VL-Hybrid models in the right sidebar for more examples of how to apply Deepseek-VL-Hybrid to different vision and language tasks.
-
-The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
-
-<hfoptions id="usage">
-<hfoption id="Pipeline">
-
-```py
-import torch
-from transformers import pipeline
-
-pipe = pipeline(
-    task="image-text-to-text",
-    model="deepseek-community/deepseek-vl-7b-chat",
-    device=0,
-    torch_dtype=torch.float16
-)
-
-messages = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "image",
-                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
-            },
-            { "type": "text", "text": "Describe this image."},
-        ]
-    }
-]
-
-pipe(text=messages, max_new_tokens=20, return_full_text=False)
-```
-</hfoption>
-
-<hfoption id="AutoModel">
-
-```py
-import torch
-from transformers import DeepseekVLHybridForConditionalGeneration, AutoProcessor
-
-model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
-    "deepseek-community/deepseek-vl-7b-chat",
-    torch_dtype=torch.float16,
-    device_map="auto",
-    attn_implementation="sdpa"
-)
-
-processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-7b-chat")
-
-messages = [
-    {
-        "role":"user",
-        "content":[
-            {
-                "type":"image",
-                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
-            },
-            {
-                "type":"text",
-                "text":"Describe this image."
-            }
-        ]
-    }
-
-]
-
-inputs = processor.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    tokenize=True,
-    return_dict=True,
-    return_tensors="pt"
-).to(model.device, dtype=model.dtype)
-
-generated_ids = model.generate(**inputs, max_new_tokens=128)
-generated_ids_trimmed = [
-    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-]
-output_text = processor.batch_decode(
-    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)
-
-print(output_text)
-```
-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
-The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.
-
-```python
-import torch
-from transformers import TorchAoConfig, DeepseekVLHybridForConditionalGeneration, AutoProcessor
-
-quantization_config = TorchAoConfig(
-    "int4_weight_only",
-    group_size=128
-)
-
-model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
-    "deepseek-community/deepseek-vl-7b-chat",
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    quantization_config=quantization_config
-)
-```
-### Notes
-
- Do inference with multiple images in a single conversation.
-    ```py
-    import torch
-    from transformers import DeepseekVLHybridForConditionalGeneration, AutoProcessor
-
-    model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
-        "deepseek-community/deepseek-vl-7b-chat",
-        torch_dtype=torch.float16,
-        device_map="auto",
-        attn_implementation="sdpa"
-    )
-
-    processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-7b-chat")
-
-    messages = [
-        [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "What’s the difference between"},
-                    {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
-                    {"type": "text", "text": " and "},
-                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
-                ]
-            }
-        ],
-        [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "url": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"},
-                    {"type": "text", "text": "What do you see in this image?"}
-                ]
-            }
-        ]
-    ]
-
-    inputs = processor.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        padding=True,
-        truncation=True,
-        tokenize=True,
-        return_dict=True,
-        return_tensors="pt"
-    ).to(model.device, dtype=model.dtype)
-
-    generated_ids = model.generate(**inputs, max_new_tokens=128)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-
-    print(output_text)
-    ```
-
-## DeepseekVLHybridConfig
-
-[[autodoc]] DeepseekVLHybridConfig
-
-## DeepseekVLHybridProcessor
-
-[[autodoc]] DeepseekVLHybridProcessor
-
-## DeepseekVLHybridImageProcessor
-
-[[autodoc]] DeepseekVLHybridImageProcessor
-
-## DeepseekVLHybridImageProcessorFast
-
-[[autodoc]] DeepseekVLHybridImageProcessorFast
-
-## DeepseekVLHybridModel
-
-[[autodoc]] DeepseekVLHybridModel
-    - forward
-
-## DeepseekVLHybridForConditionalGeneration
-
-[[autodoc]] DeepseekVLHybridForConditionalGeneration
-    - forward
--- a/docs/source/en/model_doc/detr.md
+++ b/docs/source/en/model_doc/detr.md
@@ -14,122 +14,132 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-	<div class="flex flex-wrap space-x-1">
-		<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-	</div>
-</div>
-
 # DETR

-[DETR](https://huggingface.co/papers/2005.12872) consists of a convolutional backbone followed by an encoder-decoder Transformer which can be trained end-to-end for object detection. It greatly simplifies a lot of the complexity of models like Faster-R-CNN and Mask-R-CNN, which use things like region proposals, non-maximum suppression procedure and anchor generation. Moreover, DETR can also be naturally extended to perform panoptic segmentation, by simply adding a mask head on top of the decoder outputs.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

-You can find all the original DETR checkpoints under the [AI at Meta](https://huggingface.co/facebook/models?search=detr) organization.
+## Overview

-> [!TIP]
-> This model was contributed by [nielsr](https://huggingface.co/nielsr).
->
-> Click on the DETR models in the right sidebar for more examples of how to apply DETR to different object detection and segmentation tasks.
+The DETR model was proposed in [End-to-End Object Detection with Transformers](https://huggingface.co/papers/2005.12872) by
+Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov and Sergey Zagoruyko. DETR
+consists of a convolutional backbone followed by an encoder-decoder Transformer which can be trained end-to-end for
+object detection. It greatly simplifies a lot of the complexity of models like Faster-R-CNN and Mask-R-CNN, which use
+things like region proposals, non-maximum suppression procedure and anchor generation. Moreover, DETR can also be
+naturally extended to perform panoptic segmentation, by simply adding a mask head on top of the decoder outputs.

-The example below demonstrates how to perform object detection with the [`Pipeline`] or the [`AutoModel`] class.
+The abstract from the paper is the following:

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+*We present a new method that views object detection as a direct set prediction problem. Our approach streamlines the
+detection pipeline, effectively removing the need for many hand-designed components like a non-maximum suppression
+procedure or anchor generation that explicitly encode our prior knowledge about the task. The main ingredients of the
+new framework, called DEtection TRansformer or DETR, are a set-based global loss that forces unique predictions via
+bipartite matching, and a transformer encoder-decoder architecture. Given a fixed small set of learned object queries,
+DETR reasons about the relations of the objects and the global image context to directly output the final set of
+predictions in parallel. The new model is conceptually simple and does not require a specialized library, unlike many
+other modern detectors. DETR demonstrates accuracy and run-time performance on par with the well-established and
+highly-optimized Faster RCNN baseline on the challenging COCO object detection dataset. Moreover, DETR can be easily
+generalized to produce panoptic segmentation in a unified manner. We show that it significantly outperforms competitive
+baselines.*

-```python
-from transformers import pipeline
-import torch
+This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/facebookresearch/detr).

-pipeline = pipeline(
-    "object-detection", 
-    model="facebook/detr-resnet-50",
-    torch_dtype=torch.float16,
-    device_map=0
-)
-
-pipeline("http://images.cocodataset.org/val2017/000000039769.jpg")
-```
-
-</hfoption>
-<hfoption id="AutoModel">
-
-```python
-from transformers import AutoImageProcessor, AutoModelForObjectDetection
-from PIL import Image
-import requests
-import torch
-
-url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-image = Image.open(requests.get(url, stream=True).raw)
-
-image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
-model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50")
-
-# prepare image for the model
-inputs = image_processor(images=image, return_tensors="pt")
-
-with torch.no_grad():
-    outputs = model(**inputs)
-
-results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3)
-
-for result in results:
-    for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
-        score, label = score.item(), label_id.item()
-        box = [round(i, 2) for i in box.tolist()]
-        print(f"{model.config.id2label[label]}: {score:.2f} {box}")
-```
-
-</hfoption>
-</hfoptions>
-
-<details>
-<summary>How DETR works</summary>
+## How DETR works

 Here's a TLDR explaining how [`~transformers.DetrForObjectDetection`] works:

-First, an image is sent through a pre-trained convolutional backbone (in the paper, the authors use ResNet-50/ResNet-101). Let's assume we also add a batch dimension. This means that the input to the backbone is a tensor of shape `(batch_size, 3, height, width)`, assuming the image has 3 color channels (RGB). The CNN backbone outputs a new lower-resolution feature map, typically of shape `(batch_size, 2048, height/32, width/32)`. This is then projected to match the hidden dimension of the Transformer of DETR, which is `256` by default, using a `nn.Conv2D` layer. So now, we have a tensor of shape `(batch_size, 256, height/32, width/32).` Next, the feature map is flattened and transposed to obtain a tensor of shape `(batch_size, seq_len, d_model)` = `(batch_size, width/32*height/32, 256)`. So a difference with NLP models is that the sequence length is actually longer than usual, but with a smaller `d_model` (which in NLP is typically 768 or higher).
+First, an image is sent through a pre-trained convolutional backbone (in the paper, the authors use
+ResNet-50/ResNet-101). Let's assume we also add a batch dimension. This means that the input to the backbone is a
+tensor of shape `(batch_size, 3, height, width)`, assuming the image has 3 color channels (RGB). The CNN backbone
+outputs a new lower-resolution feature map, typically of shape `(batch_size, 2048, height/32, width/32)`. This is
+then projected to match the hidden dimension of the Transformer of DETR, which is `256` by default, using a
+`nn.Conv2D` layer. So now, we have a tensor of shape `(batch_size, 256, height/32, width/32).` Next, the
+feature map is flattened and transposed to obtain a tensor of shape `(batch_size, seq_len, d_model)` =
+`(batch_size, width/32*height/32, 256)`. So a difference with NLP models is that the sequence length is actually
+longer than usual, but with a smaller `d_model` (which in NLP is typically 768 or higher).

-Next, this is sent through the encoder, outputting `encoder_hidden_states` of the same shape (you can consider these as image features). Next, so-called **object queries** are sent through the decoder. This is a tensor of shape `(batch_size, num_queries, d_model)`, with `num_queries` typically set to 100 and initialized with zeros. These input embeddings are learnt positional encodings that the authors refer to as object queries, and similarly to the encoder, they are added to the input of each attention layer. Each object query will look for a particular object in the image. The decoder updates these embeddings through multiple self-attention and encoder-decoder attention layers to output `decoder_hidden_states` of the same shape: `(batch_size, num_queries, d_model)`. Next, two heads are added on top for object detection: a linear layer for classifying each object query into one of the objects or "no object", and a MLP to predict bounding boxes for each query.
+Next, this is sent through the encoder, outputting `encoder_hidden_states` of the same shape (you can consider
+these as image features). Next, so-called **object queries** are sent through the decoder. This is a tensor of shape
+`(batch_size, num_queries, d_model)`, with `num_queries` typically set to 100 and initialized with zeros.
+These input embeddings are learnt positional encodings that the authors refer to as object queries, and similarly to
+the encoder, they are added to the input of each attention layer. Each object query will look for a particular object
+in the image. The decoder updates these embeddings through multiple self-attention and encoder-decoder attention layers
+to output `decoder_hidden_states` of the same shape: `(batch_size, num_queries, d_model)`. Next, two heads
+are added on top for object detection: a linear layer for classifying each object query into one of the objects or "no
+object", and a MLP to predict bounding boxes for each query.

-The model is trained using a **bipartite matching loss**: so what we actually do is compare the predicted classes + bounding boxes of each of the N = 100 object queries to the ground truth annotations, padded up to the same length N (so if an image only contains 4 objects, 96 annotations will just have a "no object" as class and "no bounding box" as bounding box). The [Hungarian matching algorithm](https://en.wikipedia.org/wiki/Hungarian_algorithm) is used to find an optimal one-to-one mapping of each of the N queries to each of the N annotations. Next, standard cross-entropy (for the classes) and a linear combination of the L1 and [generalized IoU loss](https://giou.stanford.edu/) (for the bounding boxes) are used to optimize the parameters of the model.
+The model is trained using a **bipartite matching loss**: so what we actually do is compare the predicted classes +
+bounding boxes of each of the N = 100 object queries to the ground truth annotations, padded up to the same length N
+(so if an image only contains 4 objects, 96 annotations will just have a "no object" as class and "no bounding box" as
+bounding box). The [Hungarian matching algorithm](https://en.wikipedia.org/wiki/Hungarian_algorithm) is used to find
+an optimal one-to-one mapping of each of the N queries to each of the N annotations. Next, standard cross-entropy (for
+the classes) and a linear combination of the L1 and [generalized IoU loss](https://giou.stanford.edu/) (for the
+bounding boxes) are used to optimize the parameters of the model.

-DETR can be naturally extended to perform panoptic segmentation (which unifies semantic segmentation and instance segmentation). [`~transformers.DetrForSegmentation`] adds a segmentation mask head on top of [`~transformers.DetrForObjectDetection`]. The mask head can be trained either jointly, or in a two steps process, where one first trains a [`~transformers.DetrForObjectDetection`] model to detect bounding boxes around both "things" (instances) and "stuff" (background things like trees, roads, sky), then freeze all the weights and train only the mask head for 25 epochs. Experimentally, these two approaches give similar results. Note that predicting boxes is required for the training to be possible, since the Hungarian matching is computed using distances between boxes.
+DETR can be naturally extended to perform panoptic segmentation (which unifies semantic segmentation and instance
+segmentation). [`~transformers.DetrForSegmentation`] adds a segmentation mask head on top of
+[`~transformers.DetrForObjectDetection`]. The mask head can be trained either jointly, or in a two steps process,
+where one first trains a [`~transformers.DetrForObjectDetection`] model to detect bounding boxes around both
+"things" (instances) and "stuff" (background things like trees, roads, sky), then freeze all the weights and train only
+the mask head for 25 epochs. Experimentally, these two approaches give similar results. Note that predicting boxes is
+required for the training to be possible, since the Hungarian matching is computed using distances between boxes.

-</details>
+## Usage tips

-## Notes
+- DETR uses so-called **object queries** to detect objects in an image. The number of queries determines the maximum
+  number of objects that can be detected in a single image, and is set to 100 by default (see parameter
+  `num_queries` of [`~transformers.DetrConfig`]). Note that it's good to have some slack (in COCO, the
+  authors used 100, while the maximum number of objects in a COCO image is ~70).
+- The decoder of DETR updates the query embeddings in parallel. This is different from language models like GPT-2,
+  which use autoregressive decoding instead of parallel. Hence, no causal attention mask is used.
+- DETR adds position embeddings to the hidden states at each self-attention and cross-attention layer before projecting
+  to queries and keys. For the position embeddings of the image, one can choose between fixed sinusoidal or learned
+  absolute position embeddings. By default, the parameter `position_embedding_type` of
+  [`~transformers.DetrConfig`] is set to `"sine"`.
+- During training, the authors of DETR did find it helpful to use auxiliary losses in the decoder, especially to help
+  the model output the correct number of objects of each class. If you set the parameter `auxiliary_loss` of
+  [`~transformers.DetrConfig`] to `True`, then prediction feedforward neural networks and Hungarian losses
+  are added after each decoder layer (with the FFNs sharing parameters).
+- If you want to train the model in a distributed environment across multiple nodes, then one should update the
+  _num_boxes_ variable in the _DetrLoss_ class of _modeling_detr.py_. When training on multiple nodes, this should be
+  set to the average number of target boxes across all nodes, as can be seen in the original implementation [here](https://github.com/facebookresearch/detr/blob/a54b77800eb8e64e3ad0d8237789fcbf2f8350c5/models/detr.py#L227-L232).
+- [`~transformers.DetrForObjectDetection`] and [`~transformers.DetrForSegmentation`] can be initialized with
+  any convolutional backbone available in the [timm library](https://github.com/rwightman/pytorch-image-models).
+  Initializing with a MobileNet backbone for example can be done by setting the `backbone` attribute of
+  [`~transformers.DetrConfig`] to `"tf_mobilenetv3_small_075"`, and then initializing the model with that
+  config.
+- DETR resizes the input images such that the shortest side is at least a certain amount of pixels while the longest is
+  at most 1333 pixels. At training time, scale augmentation is used such that the shortest side is randomly set to at
+  least 480 and at most 800 pixels. At inference time, the shortest side is set to 800. One can use
+  [`~transformers.DetrImageProcessor`] to prepare images (and optional annotations in COCO format) for the
+  model. Due to this resizing, images in a batch can have different sizes. DETR solves this by padding images up to the
+  largest size in a batch, and by creating a pixel mask that indicates which pixels are real/which are padding.
+  Alternatively, one can also define a custom `collate_fn` in order to batch images together, using
+  [`~transformers.DetrImageProcessor.pad_and_create_pixel_mask`].
+- The size of the images will determine the amount of memory being used, and will thus determine the `batch_size`.
+  It is advised to use a batch size of 2 per GPU. See [this Github thread](https://github.com/facebookresearch/detr/issues/150) for more info.

- DETR uses so-called **object queries** to detect objects in an image. The number of queries determines the maximum number of objects that can be detected in a single image, and is set to 100 by default (see parameter `num_queries` of [`~transformers.DetrConfig`]). Note that it's good to have some slack (in COCO, the authors used 100, while the maximum number of objects in a COCO image is ~70).
- The decoder of DETR updates the query embeddings in parallel. This is different from language models like GPT-2, which use autoregressive decoding instead of parallel. Hence, no causal attention mask is used.
- DETR adds position embeddings to the hidden states at each self-attention and cross-attention layer before projecting to queries and keys. For the position embeddings of the image, one can choose between fixed sinusoidal or learned absolute position embeddings. By default, the parameter `position_embedding_type` of [`~transformers.DetrConfig`] is set to `"sine"`.
- During training, the authors of DETR did find it helpful to use auxiliary losses in the decoder, especially to help the model output the correct number of objects of each class. If you set the parameter `auxiliary_loss` of [`~transformers.DetrConfig`] to `True`, then prediction feedforward neural networks and Hungarian losses are added after each decoder layer (with the FFNs sharing parameters).
- If you want to train the model in a distributed environment across multiple nodes, then one should update the _num_boxes_ variable in the _DetrLoss_ class of _modeling_detr.py_. When training on multiple nodes, this should be set to the average number of target boxes across all nodes, as can be seen in the original implementation [here](https://github.com/facebookresearch/detr/blob/a54b77800eb8e64e3ad0d8237789fcbf2f8350c5/models/detr.py#L227-L232).
- [`~transformers.DetrForObjectDetection`] and [`~transformers.DetrForSegmentation`] can be initialized with any convolutional backbone available in the [timm library](https://github.com/rwightman/pytorch-image-models). Initializing with a MobileNet backbone for example can be done by setting the `backbone` attribute of [`~transformers.DetrConfig`] to `"tf_mobilenetv3_small_075"`, and then initializing the model with that config.
- DETR resizes the input images such that the shortest side is at least a certain amount of pixels while the longest is at most 1333 pixels. At training time, scale augmentation is used such that the shortest side is randomly set to at least 480 and at most 800 pixels. At inference time, the shortest side is set to 800. One can use [`~transformers.DetrImageProcessor`] to prepare images (and optional annotations in COCO format) for the model. Due to this resizing, images in a batch can have different sizes. DETR solves this by padding images up to the largest size in a batch, and by creating a pixel mask that indicates which pixels are real/which are padding. Alternatively, one can also define a custom `collate_fn` in order to batch images together, using [`~transformers.DetrImageProcessor.pad_and_create_pixel_mask`].
- The size of the images will determine the amount of memory being used, and will thus determine the `batch_size`. It is advised to use a batch size of 2 per GPU. See [this Github thread](https://github.com/facebookresearch/detr/issues/150) for more info.
+There are three ways to instantiate a DETR model (depending on what you prefer):

-There are three other ways to instantiate a DETR model (depending on what you prefer):
+Option 1: Instantiate DETR with pre-trained weights for entire model
+```py
+>>> from transformers import DetrForObjectDetection

- Option 1: Instantiate DETR with pre-trained weights for entire model
-```python
-from transformers import DetrForObjectDetection
-
-model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
+>>> model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
 ```

- Option 2: Instantiate DETR with randomly initialized weights for Transformer, but pre-trained weights for backbone
-```python
-from transformers import DetrConfig, DetrForObjectDetection
+Option 2: Instantiate DETR with randomly initialized weights for Transformer, but pre-trained weights for backbone
+```py
+>>> from transformers import DetrConfig, DetrForObjectDetection

-config = DetrConfig()
-model = DetrForObjectDetection(config)
+>>> config = DetrConfig()
+>>> model = DetrForObjectDetection(config)
 ```
-
- Option 3: Instantiate DETR with randomly initialized weights for backbone + Transformer
-```python
-config = DetrConfig(use_pretrained_backbone=False)
-model = DetrForObjectDetection(config)
+Option 3: Instantiate DETR with randomly initialized weights for backbone + Transformer
+```py
+>>> config = DetrConfig(use_pretrained_backbone=False)
+>>> model = DetrForObjectDetection(config)
 ```

 As a summary, consider the following table:
@@ -143,12 +153,24 @@ As a summary, consider the following table:
 | **Postprocessing** (i.e. converting the output of the model to Pascal VOC format) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] |
 | **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_tupes="bbox"` or `"segm"`, `PanopticEvaluator` |

- In short, one should prepare the data either in COCO detection or COCO panoptic format, then use [`~transformers.DetrImageProcessor`] to create `pixel_values`, `pixel_mask` and optional `labels`, which can then be used to train (or fine-tune) a model. 
- For evaluation, one should first convert the outputs of the model using one of the postprocessing methods of [`~transformers.DetrImageProcessor`]. These can be provided to either `CocoEvaluator` or `PanopticEvaluator`, which allow you to calculate metrics like mean Average Precision (mAP) and Panoptic Quality (PQ). The latter objects are implemented in the [original repository](https://github.com/facebookresearch/detr). See the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR) for more info regarding evaluation.
+In short, one should prepare the data either in COCO detection or COCO panoptic format, then use
+[`~transformers.DetrImageProcessor`] to create `pixel_values`, `pixel_mask` and optional
+`labels`, which can then be used to train (or fine-tune) a model. For evaluation, one should first convert the
+outputs of the model using one of the postprocessing methods of [`~transformers.DetrImageProcessor`]. These can
+be provided to either `CocoEvaluator` or `PanopticEvaluator`, which allow you to calculate metrics like
+mean Average Precision (mAP) and Panoptic Quality (PQ). The latter objects are implemented in the [original repository](https://github.com/facebookresearch/detr). See the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR) for more info regarding evaluation.

 ## Resources

- Refer to these [notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR) for examples of fine-tuning [`DetrForObjectDetection`] and [`DetrForSegmentation`] on a custom dataset.
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DETR.
+
+<PipelineTag pipeline="object-detection"/>
+
+- All example notebooks illustrating fine-tuning [`DetrForObjectDetection`] and [`DetrForSegmentation`] on a custom dataset can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR).
+- Scripts for finetuning [`DetrForObjectDetection`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection).
+- See also: [Object detection task guide](../tasks/object_detection).
+
+If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.

 ## DetrConfig

--- a/docs/source/en/model_doc/dia.md
+++ b/docs/source/en/model_doc/dia.md
@@ -26,14 +26,14 @@ rendered properly in your Markdown viewer.

 ## Overview

-Dia is an open-source text-to-speech (TTS) model (1.6B parameters) developed by [Nari Labs](https://huggingface.co/nari-labs).
-It can generate highly realistic dialogue from transcript including non-verbal communications such as laughter and coughing.
+Dia is an opensource text-to-speech (TTS) model (1.6B parameters) developed by [Nari Labs](https://huggingface.co/nari-labs).
+It can generate highly realistic dialogue from transcript including nonverbal communications such as laughter and coughing.
 Furthermore, emotion and tone control is also possible via audio conditioning (voice cloning).

 **Model Architecture:**
 Dia is an encoder-decoder transformer based on the original transformer architecture. However, some more modern features such as
 rotational positional embeddings (RoPE) are also included. For its text portion (encoder), a byte tokenizer is utilized while
-for the audio portion (decoder), a pretrained codec model [DAC](./dac) is used - DAC encodes speech into discrete codebook
+for the audio portion (decoder), a pretrained codec model [DAC](./dac.md) is used - DAC encodes speech into discrete codebook
 tokens and decodes them back into audio.

 ## Usage Tips
--- a/docs/source/en/model_doc/efficientloftr.md
+++ b/docs/source/en/model_doc/efficientloftr.md
@@ -1,149 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the MIT License; you may not use this file except in compliance with
-the License.
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
-    </div>
-</div>
-
-# EfficientLoFTR
-
-[EfficientLoFTR](https://huggingface.co/papers/2403.04765) is an efficient detector-free local feature matching method that produces semi-dense matches across images with sparse-like speed. It builds upon the original [LoFTR](https://huggingface.co/papers/2104.00680) architecture but introduces significant improvements for both efficiency and accuracy. The key innovation is an aggregated attention mechanism with adaptive token selection that makes the model ~2.5× faster than LoFTR while achieving higher accuracy. EfficientLoFTR can even surpass state-of-the-art efficient sparse matching pipelines like [SuperPoint](./superpoint) + [LightGlue](./lightglue) in terms of speed, making it suitable for large-scale or latency-sensitive applications such as image retrieval and 3D reconstruction.
-
-> [!TIP]
-> This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
->
-> Click on the EfficientLoFTR models in the right sidebar for more examples of how to apply EfficientLoFTR to different computer vision tasks.
-
-The example below demonstrates how to match keypoints between two images with the [`AutoModel`] class.
-
-<hfoptions id="usage">
-<hfoption id="AutoModel">
-
-```py
-from transformers import AutoImageProcessor, AutoModelForKeypointMatching
-import torch
-from PIL import Image
-import requests
-
-url_image1 = "https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_98169888_3347710852.jpg"
-image1 = Image.open(requests.get(url_image1, stream=True).raw)
-url_image2 = "https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_26757027_6717084061.jpg"
-image2 = Image.open(requests.get(url_image2, stream=True).raw)
-
-images = [image1, image2]
-
-processor = AutoImageProcessor.from_pretrained("zju-community/efficientloftr")
-model = AutoModelForKeypointMatching.from_pretrained("zju-community/efficientloftr")
-
-inputs = processor(images, return_tensors="pt")
-with torch.no_grad():
-    outputs = model(**inputs)
-
-# Post-process to get keypoints and matches
-image_sizes = [[(image.height, image.width) for image in images]]
-processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
-```
-
-</hfoption>
-</hfoptions>
-
-## Notes
-
- EfficientLoFTR is designed for efficiency while maintaining high accuracy. It uses an aggregated attention mechanism with adaptive token selection to reduce computational overhead compared to the original LoFTR.
-
-    ```py
-    from transformers import AutoImageProcessor, AutoModelForKeypointMatching
-    import torch
-    from PIL import Image
-    import requests
-    
-    processor = AutoImageProcessor.from_pretrained("zju-community/efficientloftr")
-    model = AutoModelForKeypointMatching.from_pretrained("zju-community/efficientloftr")
-    
-    # EfficientLoFTR requires pairs of images
-    images = [image1, image2]
-    inputs = processor(images, return_tensors="pt")
-    outputs = model(**inputs)
-    
-    # Extract matching information
-    keypoints = outputs.keypoints        # Keypoints in both images
-    matches = outputs.matches            # Matching indices 
-    matching_scores = outputs.matching_scores  # Confidence scores
-    ```
-
- The model produces semi-dense matches, offering a good balance between the density of matches and computational efficiency. It excels in handling large viewpoint changes and texture-poor scenarios.
-
- For better visualization and analysis, use the [`~EfficientLoFTRImageProcessor.post_process_keypoint_matching`] method to get matches in a more readable format.
-
-    ```py
-    # Process outputs for visualization
-    image_sizes = [[(image.height, image.width) for image in images]]
-    processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
-    
-    for i, output in enumerate(processed_outputs):
-        print(f"For the image pair {i}")
-        for keypoint0, keypoint1, matching_score in zip(
-                output["keypoints0"], output["keypoints1"], output["matching_scores"]
-        ):
-            print(f"Keypoint at {keypoint0.numpy()} matches with keypoint at {keypoint1.numpy()} with score {matching_score}")
-    ```
-
- Visualize the matches between the images using the built-in plotting functionality.
-
-    ```py
-    # Easy visualization using the built-in plotting method
-    visualized_images = processor.visualize_keypoint_matching(images, processed_outputs)
-    ```
-
- EfficientLoFTR uses a novel two-stage correlation layer that achieves accurate subpixel correspondences, improving upon the original LoFTR's fine correlation module.
-
-<div class="flex justify-center">
-    <img src="https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/2nJZQlFToCYp_iLurvcZ4.png">
-</div>
-
-## Resources
-
- Refer to the [original EfficientLoFTR repository](https://github.com/zju3dv/EfficientLoFTR) for more examples and implementation details.
- [EfficientLoFTR project page](https://zju3dv.github.io/efficientloftr/) with interactive demos and additional information.
-
-## EfficientLoFTRConfig
-
-[[autodoc]] EfficientLoFTRConfig
-
-## EfficientLoFTRImageProcessor
-
-[[autodoc]] EfficientLoFTRImageProcessor
-
- preprocess
- post_process_keypoint_matching
- visualize_keypoint_matching
-
-<frameworkcontent>
-<pt>
-## EfficientLoFTRModel
-
-[[autodoc]] EfficientLoFTRModel
-
- forward
-
-## EfficientLoFTRForKeypointMatching
-
-[[autodoc]] EfficientLoFTRForKeypointMatching
-
- forward
-
-</pt>
-</frameworkcontent>
--- a/docs/source/en/model_doc/encodec.md
+++ b/docs/source/en/model_doc/encodec.md
@@ -47,8 +47,7 @@ Here is a quick example of how to encode and decode an audio using this model:
 >>> inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt")

 >>> encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])
->>> # `encoder_outputs.audio_codes` contains discrete codes
->>> audio_values = model.decode(**encoder_outputs, padding_mask=inputs["padding_mask"])[0]
+>>> audio_values = model.decode(encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs["padding_mask"])[0]
 >>> # or the equivalent with a forward pass
 >>> audio_values = model(inputs["input_values"], inputs["padding_mask"]).audio_values
 ```
--- a/docs/source/en/model_doc/ernie.md
+++ b/docs/source/en/model_doc/ernie.md
@@ -14,83 +14,29 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
-    </div>
-</div>
-
 # ERNIE

-[ERNIE1.0](https://arxiv.org/abs/1904.09223), [ERNIE2.0](https://ojs.aaai.org/index.php/AAAI/article/view/6428),
-[ERNIE3.0](https://arxiv.org/abs/2107.02137), [ERNIE-Gram](https://arxiv.org/abs/2010.12148), [ERNIE-health](https://arxiv.org/abs/2110.07244) are a series of powerful models proposed by baidu, especially in Chinese tasks.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

-ERNIE (Enhanced Representation through kNowledge IntEgration) is designed to learn language representation enhanced by knowledge masking strategies, which includes entity-level masking and phrase-level masking.
+## Overview
+ERNIE is a series of powerful models proposed by baidu, especially in Chinese tasks,
+including [ERNIE1.0](https://huggingface.co/papers/1904.09223), [ERNIE2.0](https://ojs.aaai.org/index.php/AAAI/article/view/6428),
+[ERNIE3.0](https://huggingface.co/papers/2107.02137), [ERNIE-Gram](https://huggingface.co/papers/2010.12148), [ERNIE-health](https://huggingface.co/papers/2110.07244), etc.

-Other ERNIE models released by baidu can be found at [Ernie 4.5](./ernie4_5), and [Ernie 4.5 MoE](./ernie4_5_moe).
+These models are contributed by [nghuyong](https://huggingface.co/nghuyong) and the official code can be found in [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) (in PaddlePaddle).

-> [!TIP]
-> This model was contributed by [nghuyong](https://huggingface.co/nghuyong), and the official code can be found in [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) (in PaddlePaddle).
->
-> Click on the ERNIE models in the right sidebar for more examples of how to apply ERNIE to different language tasks.
+### Usage example
+Take `ernie-1.0-base-zh` as an example:

-The example below demonstrates how to predict the `[MASK]` token with [`Pipeline`], [`AutoModel`], and from the command line.
-
-<hfoptions id="usage">
-<hfoption id="Pipeline">
-
-```py
-from transformers import pipeline
-
-pipeline = pipeline(
-    task="fill-mask",
-    model="nghuyong/ernie-3.0-xbase-zh"
-)
-
-pipeline("巴黎是[MASK]国的首都。")
+```Python
+from transformers import AutoTokenizer, AutoModel
+tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh")
+model = AutoModel.from_pretrained("nghuyong/ernie-1.0-base-zh")
 ```

-</hfoption>
-<hfoption id="AutoModel">
-
-```py
-import torch
-from transformers import AutoModelForMaskedLM, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained(
-    "nghuyong/ernie-3.0-xbase-zh",
-)
-model = AutoModelForMaskedLM.from_pretrained(
-    "nghuyong/ernie-3.0-xbase-zh",
-    torch_dtype=torch.float16,
-    device_map="auto"
-)
-inputs = tokenizer("巴黎是[MASK]国的首都。", return_tensors="pt").to("cuda")
-
-with torch.no_grad():
-    outputs = model(**inputs)
-    predictions = outputs.logits
-
-masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
-predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
-predicted_token = tokenizer.decode(predicted_token_id)
-
-print(f"The predicted token is: {predicted_token}")
-```
-
-</hfoption>
-<hfoption id="transformers CLI">
-
-```bash
-echo -e "巴黎是[MASK]国的首都。" | transformers run --task fill-mask --model nghuyong/ernie-3.0-xbase-zh --device 0
-```
-
-</hfoption>
-</hfoptions>
-
-## Notes
-
-Model variants are available in different sizes and languages.
+### Model checkpoints

 |     Model Name      | Language |           Description           |
 |:-------------------:|:--------:|:-------------------------------:|
@@ -105,11 +51,18 @@ Model variants are available in different sizes and languages.
 |   ernie-health-zh   | Chinese  | Layer:12, Heads:12, Hidden:768  |
 |    ernie-gram-zh    | Chinese  | Layer:12, Heads:12, Hidden:768  |

-## Resources
-
 You can find all the supported models from huggingface's model hub: [huggingface.co/nghuyong](https://huggingface.co/nghuyong), and model details from paddle's official
 repo: [PaddleNLP](https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers/ERNIE/contents.html)
-and [ERNIE's legacy branch](https://github.com/PaddlePaddle/ERNIE/tree/legacy/develop).
+and [ERNIE](https://github.com/PaddlePaddle/ERNIE/blob/repro).
+
+## Resources
+
+- [Text classification task guide](../tasks/sequence_classification)
+- [Token classification task guide](../tasks/token_classification)
+- [Question answering task guide](../tasks/question_answering)
+- [Causal language modeling task guide](../tasks/language_modeling)
+- [Masked language modeling task guide](../tasks/masked_language_modeling)
+- [Multiple choice task guide](../tasks/multiple_choice)

 ## ErnieConfig

@@ -163,4 +116,4 @@ and [ERNIE's legacy branch](https://github.com/PaddlePaddle/ERNIE/tree/legacy/de
 ## ErnieForQuestionAnswering

 [[autodoc]] ErnieForQuestionAnswering
-    - forward
+    - forward
--- a/docs/source/en/model_doc/ernie4_5.md
+++ b/docs/source/en/model_doc/ernie4_5.md
@@ -1,99 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
-    </div>
-</div>
-
-# Ernie 4.5
-
-## Overview
-
-The Ernie 4.5 model was released in the [Ernie 4.5 Model Family](https://ernie.baidu.com/blog/posts/ernie4.5/) release by baidu.
-This family of models contains multiple different architectures and model sizes. This model in specific targets the base text
-model without mixture of experts (moe) with 0.3B parameters in total. It uses the standard [Llama](./llama) at its core.
-
-Other models from the family can be found at [Ernie 4.5 Moe](./ernie4_5_moe).
-
-<div class="flex justify-center">
-    <img src="https://ernie.baidu.com/blog/posts/ernie4.5/overview.png"/>
-</div>
-
-
-## Usage Tips
-
-### Generate text
-
-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_name = "baidu/ERNIE-4.5-0.3B-PT"
-
-# load the tokenizer and the model
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-)
-
-# prepare the model input
-inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
-prompt = "Hey, are you conscious? Can you talk to me?"
-messages = [
-    {"role": "user", "content": prompt}
-]
-text = tokenizer.apply_chat_template(
-    messages,
-    tokenize=False,
-    add_generation_prompt=True
-)
-model_inputs = tokenizer([text], add_special_tokens=False, return_tensors="pt").to(model.device)
-
-# conduct text completion
-generated_ids = model.generate(
-    **model_inputs,
-    max_new_tokens=32,
-)
-output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
-
-# decode the generated ids
-generate_text = tokenizer.decode(output_ids, skip_special_tokens=True)
-```
-
-This model was contributed by [Anton Vlasjuk](https://huggingface.co/AntonV).
-The original code can be found [here](https://github.com/PaddlePaddle/ERNIE).
-
-
-## Ernie4_5Config
-
-[[autodoc]] Ernie4_5Config
-
-## Ernie4_5Model
-
-[[autodoc]] Ernie4_5Model
-    - forward
-
-## Ernie4_5ForCausalLM
-
-[[autodoc]] Ernie4_5ForCausalLM
-    - forward
--- a/docs/source/en/model_doc/ernie4_5_moe.md
+++ b/docs/source/en/model_doc/ernie4_5_moe.md
@@ -1,183 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
-    </div>
-</div>
-
-# Ernie 4.5 Moe
-
-## Overview
-
-The Ernie 4.5 Moe model was released in the [Ernie 4.5 Model Family](https://ernie.baidu.com/blog/posts/ernie4.5/) release by baidu.
-This family of models contains multiple different architectures and model sizes. This model in specific targets the base text
-model with mixture of experts (moe) - one with 21B total, 3B active parameters and another one with 300B total, 47B active parameters.
-It uses the standard [Llama](./llama) at its core combined with a specialized MoE based on [Mixtral](./mixtral) with additional shared
-experts.
-
-Other models from the family can be found at [Ernie 4.5](./ernie4_5).
-
-<div class="flex justify-center">
-    <img src="https://ernie.baidu.com/blog/posts/ernie4.5/overview.png"/>
-</div>
-
-
-## Usage Tips
-
-### Generate text
-
-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_name = "baidu/ERNIE-4.5-21B-A3B-PT"
-
-# load the tokenizer and the model
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-)
-
-# prepare the model input
-inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
-prompt = "Hey, are you conscious? Can you talk to me?"
-messages = [
-    {"role": "user", "content": prompt}
-]
-text = tokenizer.apply_chat_template(
-    messages,
-    tokenize=False,
-    add_generation_prompt=True
-)
-model_inputs = tokenizer([text], add_special_tokens=False, return_tensors="pt").to(model.device)
-
-# conduct text completion
-generated_ids = model.generate(
-    **model_inputs,
-    max_new_tokens=32,
-)
-output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
-
-# decode the generated ids
-generate_text = tokenizer.decode(output_ids, skip_special_tokens=True)
-```
-
-### Distributed Generation with Tensor Parallelism
-
-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_name = "baidu/ERNIE-4.5-21B-A3B-PT"
-
-# load the tokenizer and the model
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-    tp_plan="auto",
-)
-
-# prepare the model input
-inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
-prompt = "Hey, are you conscious? Can you talk to me?"
-messages = [
-    {"role": "user", "content": prompt}
-]
-text = tokenizer.apply_chat_template(
-    messages,
-    tokenize=False,
-    add_generation_prompt=True
-)
-model_inputs = tokenizer([text], add_special_tokens=False, return_tensors="pt").to(model.device)
-
-# conduct text completion
-generated_ids = model.generate(
-    **model_inputs,
-    max_new_tokens=32,
-)
-output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
-
-# decode the generated ids
-generate_text = tokenizer.decode(output_ids, skip_special_tokens=True)
-```
-
-### Quantization with Bitsandbytes
-
-```python
-import torch
-from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
-
-model_name = "baidu/ERNIE-4.5-21B-A3B-PT"
-
-# load the tokenizer and the model
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    device_map="auto",
-    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
-)
-
-# prepare the model input
-inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
-prompt = "Hey, are you conscious? Can you talk to me?"
-messages = [
-    {"role": "user", "content": prompt}
-]
-text = tokenizer.apply_chat_template(
-    messages,
-    tokenize=False,
-    add_generation_prompt=True
-)
-model_inputs = tokenizer([text], add_special_tokens=False, return_tensors="pt").to(model.device)
-
-# conduct text completion
-generated_ids = model.generate(
-    **model_inputs,
-    max_new_tokens=32,
-)
-output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
-
-# decode the generated ids
-generate_text = tokenizer.decode(output_ids, skip_special_tokens=True)
-```
-
-This model was contributed by [Anton Vlasjuk](https://huggingface.co/AntonV).
-The original code can be found [here](https://github.com/PaddlePaddle/ERNIE).
-
-
-## Ernie4_5_MoeConfig
-
-[[autodoc]] Ernie4_5_MoeConfig
-
-## Ernie4_5_MoeModel
-
-[[autodoc]] Ernie4_5_MoeModel
-    - forward
-
-## Ernie4_5_MoeForCausalLM
-
-[[autodoc]] Ernie4_5_MoeForCausalLM
-    - forward
-    - generate
--- a/docs/source/en/model_doc/evolla.md
+++ b/docs/source/en/model_doc/evolla.md
@@ -1,95 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Evolla
-
-## Overview
-
-The Evolla model was proposed in [Decoding the Molecular Language of Proteins with Evolla](https://doi.org/10.1101/2025.01.05.630192) by [Zhou et al.](https://doi.org/10.1101/2025.01.05.630192).
-
-Evolla is an advanced 80-billion-parameter protein-language generative model designed to decode the molecular language of proteins. It integrates information from protein sequences, structures, and user queries to generate precise and contextually nuanced insights into protein function. Trained on an unprecedented AI-generated dataset of 546 million protein question-answer pairs and 150 billion word tokens, Evolla significantly advances research in proteomics and functional genomics, providing expert-level insights and shedding light on the molecular logic encoded in proteins.
-
-The abstract from the paper is the following:
-
-*Proteins, nature’s intricate molecular machines, are the products of billions of years of evolution and play fundamental roles in sustaining life. Yet, deciphering their molecular language - that is, understanding how protein sequences and structures encode and determine biological functions - remains a corner-stone challenge in modern biology. Here, we introduce Evolla, an 80 billion frontier protein-language generative model designed to decode the molecular language of proteins. By integrating information from protein sequences, structures, and user queries, Evolla generates precise and contextually nuanced insights into protein function. A key innovation of Evolla lies in its training on an unprecedented AI-generated dataset: 546 million protein question-answer pairs and 150 billion word tokens, designed to reflect the immense complexity and functional diversity of proteins. Post-pretraining, Evolla integrates Direct Preference Optimization (DPO) to refine the model based on preference signals and Retrieval-Augmented Generation (RAG) for external knowledge incorporation, improving response quality and relevance. To evaluate its performance, we propose a novel framework, Instructional Response Space (IRS), demonstrating that Evolla delivers expert-level insights, advancing research in proteomics and functional genomics while shedding light on the molecular logic encoded in proteins. The online demo is available at http://www.chat-protein.com/.*
-
-Examples:
-
-```python
-processor = EvollaProcessor.from_pretrained("westlake-repl/Evolla-10B-DPO-hf")
-model = EvollaForProteinText2Text.from_pretrained("westlake-repl/Evolla-10B-DPO-hf")
-# aa_seq should have same length as foldseek
-protein_inputs = [
-    {
-        
-        "aa_seq": "MATGGRRG...",
-        "foldseek": "###lqpfd...", # hashtag means the low-confidence foldseek tokens
-    },
-    {
-        "aa_seq": "MLPGLALL...",
-        "foldseek": "dfwwkwad...",
-    }
-]
-message_list = [
-    [
-        {
-            "role": "system",
-            "content": "You are an AI expert that can answer any questions about protein.",
-        },
-        {"role": "user", "content": "What is the function of this protein?"},
-    ],
-    [
-        {
-            "role": "system",
-            "content": "You are an AI expert that can answer any questions about protein.",
-        },
-        {"role": "user", "content": "What is the function of this protein?"},
-    ]
-]
-input_dict = processor(
-    protein_informations, messages_list, return_tensors="pt", text_max_length=512, protein_max_length=1024
-)
-with torch.no_grad():
-    generated_ids = hf_model.generate(**input_dict)
-generated_texts = processor.batch_decode(
-    generated_ids, skip_special_tokens=True
-)
-```
-
-Tips:
-
- This model was contributed by [Xibin Bayes Zhou](https://huggingface.co/XibinBayesZhou).
- The original code can be found [here](https://github.com/westlake-repl/Evolla).
-
-
-## EvollaConfig
-
-[[autodoc]] EvollaConfig
-
-## EvollaModel
-
-[[autodoc]] EvollaModel
-    - forward
-
-## EvollaForProteinText2Text
-
-[[autodoc]] EvollaForProteinText2Text
-    - forward
-
-## EvollaProcessor
-
-[[autodoc]] EvollaProcessor
-    - __call__
--- a/docs/source/en/model_doc/exaone4.md
+++ b/docs/source/en/model_doc/exaone4.md
@@ -1,208 +0,0 @@
-<!--Copyright 2025 The LG AI Research and The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# EXAONE 4
-
-## Overview
-
-**[EXAONE 4.0](https://github.com/LG-AI-EXAONE/EXAONE-4.0)** model is the language model, which integrates a **Non-reasoning mode** and **Reasoning mode** to achieve both the excellent usability of [EXAONE 3.5](https://github.com/LG-AI-EXAONE/EXAONE-3.5) and the advanced reasoning abilities of [EXAONE Deep](https://github.com/LG-AI-EXAONE/EXAONE-Deep). To pave the way for the agentic AI era, EXAONE 4.0 incorporates essential features such as agentic tool use, and its multilingual capabilities are extended
-to support Spanish in addition to English and Korean. 
-
-The EXAONE 4.0 model series consists of two sizes: a mid-size **32B** model optimized for high performance, and a small-size **1.2B** model designed for on-device applications.
-
-In the EXAONE 4.0 architecture, we apply new architectural changes compared to previous EXAONE models as below:
-
-1. **Hybrid Attention**: For the 32B model, we adopt hybrid attention scheme, which combines *Local attention (sliding window attention)* with *Global attention (full attention)* in a 3:1 ratio. We do not use RoPE (Rotary Positional Embedding) for global attention for better global context understanding.
-2. **QK-Reorder-Norm**: We reorder the LayerNorm position from the traditional Pre-LN scheme by applying LayerNorm directly to the attention and MLP outputs, and we add RMS normalization right after the Q and K projection. It helps yield better performance on downstream tasks despite consuming more computation.
-
-For more details, please refer to our [technical report](https://arxiv.org/abs/2507.11407), [HuggingFace paper](https://huggingface.co/papers/2507.11407), [blog](https://www.lgresearch.ai/blog/view?seq=576), and [GitHub](https://github.com/LG-AI-EXAONE/EXAONE-4.0).
-
-All model weights including quantized versions are available at [Huggingface Collections](https://huggingface.co/collections/LGAI-EXAONE/exaone-40-686b2e0069800c835ed48375).
-
-
-## Model Details
-
-### Model Specifications
-
-| Model Configuration | 32B | 1.2B |
-|:-------------------|:-----:|:------:|
-| d_model | 5,120 | 2,048 |
-| Number of layers | 64 | 30 |
-| Normalization | QK-Reorder-LN | QK-Reorder-LN |
-| Non-linearity | SwiGLU | SwiGLU |
-| Feedforward dimension | 27,392 | 4,096 |
-| Attention type | Hybrid (3:1 Local-Global) | Global |
-| Head type | GQA | GQA |
-| Number of heads | 40 | 32 |
-| Number of KV heads | 8 | 8 |
-| Head size | 128 | 64 |
-| Max sequence length | 131,072 | 65,536 |
-| RoPE theta | 1,000,000 | 1,000,000 |
-| Tokenizer | BBPE | BBPE |
-| Vocab size | 102,400 | 102,400 |
-| Tied word embedding | False | True |
-| Knowledge cut-off | Nov. 2024 | Nov. 2024 |
-
-
-## Usage tips
-
-### Non-reasoning mode
-
-For general use, you can use the EXAONE 4.0 models with the following example:
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_name = "LGAI-EXAONE/EXAONE-4.0-32B"
-
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype="bfloat16",
-    device_map="auto"
-)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-# choose your prompt
-prompt = "Explain how wonderful you are"
-prompt = "Explica lo increíble que eres"
-prompt = "너가 얼마나 대단한지 설명해 봐"
-
-messages = [
-    {"role": "user", "content": prompt}
-]
-input_ids = tokenizer.apply_chat_template(
-    messages,
-    tokenize=True,
-    add_generation_prompt=True,
-    return_tensors="pt"
-)
-
-output = model.generate(
-    input_ids.to(model.device),
-    max_new_tokens=128,
-    do_sample=False,
-)
-print(tokenizer.decode(output[0]))
-```
-
-### Reasoning mode
-
-The EXAONE 4.0 models have reasoning capabilities for handling complex problems. You can activate reasoning mode by using the `enable_thinking=True` argument with the tokenizer, which opens a reasoning block that starts with `<think>` tag without closing it.
-
-```python
-messages = [
-    {"role": "user", "content": "Which one is bigger, 3.12 vs 3.9?"}
-]
-input_ids = tokenizer.apply_chat_template(
-    messages,
-    tokenize=True,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    enable_thinking=True,
-)
-
-output = model.generate(
-    input_ids.to(model.device),
-    max_new_tokens=128,
-    do_sample=True,
-    temperature=0.6,
-    top_p=0.95
-)
-print(tokenizer.decode(output[0]))
-```
-
-> [!IMPORTANT]
-> The model generation with reasoning mode can be affected sensitively by sampling parameters, so please refer to the [Usage Guideline](https://github.com/LG-AI-EXAONE/EXAONE-4.0#usage-guideline) on official GitHub page for better quality.
-
-### Agentic tool use
-
-The EXAONE 4.0 models can be used as agents with their tool calling capabilities. You can provide tool schemas to the model for effective tool calling.
-
-```python
-import random
-
-def roll_dice(max_num: int):
-    return random.randint(1, max_num)
-
-tools = [
-    {
-        "type": "function",
-        "function": {
-            "name": "roll_dice",
-            "description": "Roll a dice with the number 1 to N. User can select the number N.",
-            "parameters": {
-                "type": "object",
-                "required": ["max_num"],
-                "properties": {
-                    "max_num": {
-                        "type": "int",
-                        "description": "Max number of the dice"
-                    }
-                }
-            }
-        }
-    }
-]
-
-messages = [
-    {"role": "user", "content": "Roll D6 dice twice!"}
-]
-input_ids = tokenizer.apply_chat_template(
-    messages,
-    tokenize=True,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    tools=tools,
-)
-
-output = model.generate(
-    input_ids.to(model.device),
-    max_new_tokens=1024,
-    do_sample=True,
-    temperature=0.6,
-    top_p=0.95,
-)
-print(tokenizer.decode(output[0]))
-```
-
-## Exaone4Config
-
-[[autodoc]] Exaone4Config
-
-## Exaone4Model
-
-[[autodoc]] Exaone4Model
-    - forward
-
-## Exaone4ForCausalLM
-
-[[autodoc]] Exaone4ForCausalLM
-    - forward
-
-## Exaone4ForSequenceClassification
-
-[[autodoc]] Exaone4ForSequenceClassification
-    - forward
-
-## Exaone4ForTokenClassification
-
-[[autodoc]] Exaone4ForTokenClassification
-    - forward
-
-## Exaone4ForQuestionAnswering
-
-[[autodoc]] Exaone4ForQuestionAnswering
-    - forward
--- a/docs/source/en/model_doc/falcon_mamba.md
+++ b/docs/source/en/model_doc/falcon_mamba.md
@@ -110,13 +110,6 @@ outputs = model.generate(**inputs, max_new_tokens=100)
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```

-## FalconMambaCache
-
-[[autodoc]] FalconMambaCache
-    - update_conv_state
-    - update_ssm_state
-    - reset
-
 ## FalconMambaConfig

 [[autodoc]] FalconMambaConfig
--- a/docs/source/en/model_doc/gemma3.md
+++ b/docs/source/en/model_doc/gemma3.md
@@ -267,8 +267,3 @@ visualizer("<img>What is shown in this image?")

 [[autodoc]] Gemma3ForConditionalGeneration
    - forward
-
-## Gemma3ForSequenceClassification
-
-[[autodoc]] Gemma3ForSequenceClassification
-    - forward
--- a/docs/source/en/model_doc/gemma3n.md
+++ b/docs/source/en/model_doc/gemma3n.md
@@ -30,7 +30,7 @@ Gemma3n is a multimodal model with pretrained and instruction-tuned variants, av
 large portions of the language model architecture are shared with prior Gemma releases, there are many new additions in
 this model, including [Alternating Updates][altup] (AltUp), [Learned Augmented Residual Layer][laurel] (LAuReL),
 [MatFormer][matformer], Per-Layer Embeddings (PLE), [Activation Sparsity with Statistical Top-k][spark-transformer], and KV cache sharing. The language model uses
-a similar attention pattern to [Gemma 3](./gemma3) with alternating 4 local sliding window self-attention layers for
+a similar attention pattern to [Gemma 3](./gemma3.md) with alternating 4 local sliding window self-attention layers for
 every global self-attention layer with a maximum context length of 32k tokens. Gemma 3n introduces
 [MobileNet v5][mobilenetv5] as the vision encoder, using a default resolution of 768x768 pixels, and adds a newly
 trained audio encoder based on the [Universal Speech Model][usm] (USM) architecture.
--- a/docs/source/en/model_doc/glm4_moe.md
+++ b/docs/source/en/model_doc/glm4_moe.md
@@ -1,35 +0,0 @@
-<!--Copyright 2025 The ZhipuAI Inc. and The HuggingFace Inc. team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Glm4Moe
-
-## Overview
-
-This will update After model release.
-
-## Glm4MoeConfig
-
-[[autodoc]] Glm4MoeConfig
-
-## Glm4MoeModel
-
-[[autodoc]] Glm4MoeModel
-    - forward
-
-## Glm4MoeForCausalLM
-
-[[autodoc]] Glm4MoeForCausalLM
-    - forward
--- a/docs/source/en/model_doc/gpt_oss.md
+++ b/docs/source/en/model_doc/gpt_oss.md
@@ -1,58 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-        ">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
-
-# GptOss
-
-## Overview
-
-The GptOss model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
-<INSERT SHORT SUMMARY HERE>
-
-The abstract from the paper is the following:
-
-*<INSERT PAPER ABSTRACT HERE>*
-
-Tips:
-
-<INSERT TIPS ABOUT MODEL HERE>
-
-This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
-The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
-
-
-## GptOssConfig
-
-[[autodoc]] GptOssConfig
-
-## GptOssModel
-
-[[autodoc]] GptOssModel
-    - forward
-
-## GptOssForCausalLM
-
-[[autodoc]] GptOssForCausalLM
-    - forward
--- a/docs/source/en/model_doc/granitemoehybrid.md
+++ b/docs/source/en/model_doc/granitemoehybrid.md
@@ -48,32 +48,6 @@ for i in output:

 This HF implementation is contributed by [Sukriti Sharma](https://huggingface.co/SukritiSharma) and [Alexander Brooks](https://huggingface.co/abrooks9944).

-## Notes
-
- `GraniteMoeHybridForCausalLM` supports padding-free training which concatenates distinct training examples while still processing inputs as separate batches. It can significantly accelerate inference by [~2x](https://github.com/huggingface/transformers/pull/35861#issue-2807873129) (depending on model and data distribution) and reduce memory-usage if there are examples of varying lengths by avoiding unnecessary compute and memory overhead from padding tokens.
-
-  Padding-free training requires the `flash-attn`, `mamba-ssm`, and `causal-conv1d` packages and the following arguments must be passed to the model in addition to `input_ids` and `labels`.
-
-  - `position_ids: torch.LongTensor`: the position index of each token in each sequence.
-  - `seq_idx: torch.IntTensor`: the index of each sequence in the batch.
-  - Each of the [`FlashAttentionKwargs`]
-    - `cu_seq_lens_q: torch.LongTensor`: the cumulative sequence lengths of all queries.
-    - `cu_seq_lens_k: torch.LongTensor`: the cumulative sequence lengths of all keys.
-    - `max_length_q: int`: the longest query length in the batch.
-    - `max_length_k: int`: the longest key length in the batch.
-
-  The `attention_mask` inputs should not be provided. The [`DataCollatorWithFlattening`] programmatically generates the set of additional arguments above using `return_seq_idx=True` and `return_flash_attn_kwargs=True`. See the [Improving Hugging Face Training Efficiency Through Packing with Flash Attention](https://huggingface.co/blog/packing-with-FA2) blog post for additional information.
-
-  ```python
-  from transformers import DataCollatorWithFlattening
-
-  # Example of using padding-free training
-  data_collator = DataCollatorWithFlattening(
-      tokenizer=tokenizer,
-      return_seq_idx=True,
-      return_flash_attn_kwargs=True
-  )
-  ```

 ## GraniteMoeHybridConfig

@@ -87,4 +61,4 @@ This HF implementation is contributed by [Sukriti Sharma](https://huggingface.co
 ## GraniteMoeHybridForCausalLM

 [[autodoc]] GraniteMoeHybridForCausalLM
-    - forward
+    - forward
--- a/docs/source/en/model_doc/idefics2.md
+++ b/docs/source/en/model_doc/idefics2.md
@@ -169,9 +169,9 @@ model = Idefics2ForConditionalGeneration.from_pretrained(

 ## Shrinking down Idefics2 using quantization

-As the Idefics2 model has 8 billion parameters, that would require about 16GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization). If the model is quantized to 4 bits (or half a byte per parameter), that requires only about 3.5GB of RAM.
+As the Idefics2 model has 8 billion parameters, that would require about 16GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization.md). If the model is quantized to 4 bits (or half a byte per parameter), that requires only about 3.5GB of RAM.

-Quantizing a model is as simple as passing a `quantization_config` to the model. One can change the code snippet above with the changes below. We'll leverage the BitsAndyBytes quantization (but refer to [this page](../quantization) for other quantization methods):
+Quantizing a model is as simple as passing a `quantization_config` to the model. One can change the code snippet above with the changes below. We'll leverage the BitsAndyBytes quantization (but refer to [this page](../quantization.md) for other quantization methods):

 ```diff
 + from transformers import BitsAndBytesConfig
@@ -193,7 +193,7 @@ model = Idefics2ForConditionalGeneration.from_pretrained(

 A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Idefics2. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.

- A notebook on how to fine-tune Idefics2 on a custom dataset using the [Trainer](../main_classes/trainer) can be found [here](https://colab.research.google.com/drive/1NtcTgRbSBKN7pYD3Vdx1j9m8pt3fhFDB?usp=sharing). It supports both full fine-tuning as well as (quantized) LoRa.
+- A notebook on how to fine-tune Idefics2 on a custom dataset using the [Trainer](../main_classes/trainer.md) can be found [here](https://colab.research.google.com/drive/1NtcTgRbSBKN7pYD3Vdx1j9m8pt3fhFDB?usp=sharing). It supports both full fine-tuning as well as (quantized) LoRa.
 - A script regarding how to fine-tune Idefics2 using the TRL library can be found [here](https://gist.github.com/edbeeching/228652fc6c2b29a1641be5a5778223cb).
 - Demo notebook regarding fine-tuning Idefics2 for JSON extraction use cases can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Idefics2). 🌎

--- a/docs/source/en/model_doc/janus.md
+++ b/docs/source/en/model_doc/janus.md
@@ -44,11 +44,11 @@ Here is the example of visual understanding with a single image.
 > Note that the model has been trained with a specific prompt format for chatting. Use `processor.apply_chat_template(my_conversation_dict)` to correctly format your prompts.

 ```python
-import torch
-from PIL import Image
-import requests
+import torch  
+from PIL import Image  
+import requests  

-from transformers import JanusForConditionalGeneration, JanusProcessor
+from transformers import JanusForConditionalGeneration, JanusProcessor  

 model_id = "deepseek-community/Janus-Pro-1B"
 # Prepare Input for generation.
@@ -64,7 +64,7 @@ messages = [

 # Set generation mode to `text` to perform text generation.
 processor = JanusProcessor.from_pretrained(model_id)
-model = JanusForConditionalGeneration.from_pretrained(model_id,
+model = JanusForConditionalGeneration.from_pretrained(model_id,     
        torch_dtype=torch.bfloat16,
        device_map="auto")

@@ -209,10 +209,6 @@ for i, image in enumerate(images['pixel_values']):

 [[autodoc]] JanusImageProcessor

-## JanusImageProcessorFast
-
-[[autodoc]] JanusImageProcessorFast
-
 ## JanusVisionModel

 [[autodoc]] JanusVisionModel
--- a/docs/source/en/model_doc/lightglue.md
+++ b/docs/source/en/model_doc/lightglue.md
@@ -107,7 +107,7 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size

    ```py
    # Easy visualization using the built-in plotting method
-    processor.visualize_keypoint_matching(images, processed_outputs)
+    processor.plot_keypoint_matching(images, processed_outputs)
    ```

 <div class="flex justify-center">
@@ -128,7 +128,7 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size

 - preprocess
 - post_process_keypoint_matching
- visualize_keypoint_matching
+- plot_keypoint_matching

 <frameworkcontent>
 <pt>
--- a/docs/source/en/model_doc/mamba.md
+++ b/docs/source/en/model_doc/mamba.md
@@ -116,13 +116,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
  trainer.train()
   ```

-## MambaCache
-
-[[autodoc]] MambaCache
-    - update_conv_state
-    - update_ssm_state
-    - reset
-
 ## MambaConfig

 [[autodoc]] MambaConfig
--- a/docs/source/en/model_doc/mask2former.md
+++ b/docs/source/en/model_doc/mask2former.md
@@ -77,12 +77,4 @@ The resource should ideally demonstrate something new instead of duplicating an
    - encode_inputs
    - post_process_semantic_segmentation
    - post_process_instance_segmentation
-    - post_process_panoptic_segmentation
-
-## Mask2FormerImageProcessorFast
-
-[[autodoc]] Mask2FormerImageProcessorFast
-    - preprocess
-    - post_process_semantic_segmentation
-    - post_process_instance_segmentation
    - post_process_panoptic_segmentation
--- a/docs/source/en/model_doc/maskformer.md
+++ b/docs/source/en/model_doc/maskformer.md
@@ -76,14 +76,6 @@ This model was contributed by [francesco](https://huggingface.co/francesco). The
    - post_process_instance_segmentation
    - post_process_panoptic_segmentation

-## MaskFormerImageProcessorFast
-
-[[autodoc]] MaskFormerImageProcessorFast
-    - preprocess
-    - post_process_semantic_segmentation
-    - post_process_instance_segmentation
-    - post_process_panoptic_segmentation
-
 ## MaskFormerFeatureExtractor

 [[autodoc]] MaskFormerFeatureExtractor
--- a/docs/source/en/model_doc/mgp-str.md
+++ b/docs/source/en/model_doc/mgp-str.md
@@ -33,7 +33,7 @@ alt="drawing" width="600"/>

 <small> MGP-STR architecture. Taken from the <a href="https://huggingface.co/papers/2209.03592">original paper</a>. </small>

-MGP-STR is trained on two synthetic datasets [MJSynth](http://www.robots.ox.ac.uk/~vgg/data/text/) (MJ) and [SynthText](http://www.robots.ox.ac.uk/~vgg/data/scenetext/) (ST) without fine-tuning on other datasets. It achieves state-of-the-art results on six standard Latin scene text benchmarks, including 3 regular text datasets (IC13, SVT, IIIT) and 3 irregular ones (IC15, SVTP, CUTE).
+MGP-STR is trained on two synthetic datasets [MJSynth]((http://www.robots.ox.ac.uk/~vgg/data/text/)) (MJ) and [SynthText](http://www.robots.ox.ac.uk/~vgg/data/scenetext/) (ST) without fine-tuning on other datasets. It achieves state-of-the-art results on six standard Latin scene text benchmarks, including 3 regular text datasets (IC13, SVT, IIIT) and 3 irregular ones (IC15, SVTP, CUTE).
 This model was contributed by [yuekun](https://huggingface.co/yuekun). The original code can be found [here](https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/OCR/MGP-STR).

 ## Inference example
--- a/docs/source/en/model_doc/mimi.md
+++ b/docs/source/en/model_doc/mimi.md
@@ -14,29 +14,30 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
-
 # Mimi

-[Mimi](huggingface.co/papers/2410.00037) is a neural audio codec model with pretrained and quantized variants, designed for efficient speech representation and compression. The model operates at 1.1 kbps with a 12 Hz frame rate and uses a convolutional encoder-decoder architecture combined with a residual vector quantizer of 16 codebooks. Mimi outputs dual token streams i.e. semantic and acoustic to balance linguistic richness with high fidelity reconstruction. Key features include a causal streaming encoder for low-latency use, dual-path tokenization for flexible downstream generation, and integration readiness with large speech models like Moshi.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

-You can find the original Mimi checkpoints under the [Kyutai](https://huggingface.co/kyutai/models?search=mimi) organization.
+## Overview

->[!TIP]
-> This model was contributed by [ylacombe](https://huggingface.co/ylacombe).
->
-> Click on the Mimi models in the right sidebar for more examples of how to apply Mimi.
+The Mimi model was proposed in [Moshi: a speech-text foundation model for real-time dialogue](https://kyutai.org/Moshi.pdf) by Alexandre Défossez, Laurent Mazaré, Manu Orsini, Amélie Royer, Patrick Pérez, Hervé Jégou, Edouard Grave and Neil Zeghidour. Mimi is a high-fidelity audio codec model developed by the Kyutai team, that combines semantic and acoustic information into audio tokens running at 12Hz and a bitrate of 1.1kbps. In other words, it can be used to map audio waveforms into “audio tokens”, known as “codebooks”.

-The example below demonstrates how to encode and decode audio with the [`AutoModel`] class.
+The abstract from the paper is the following:

-<hfoptions id="usage">
-<hfoption id="AutoModel">
+*We introduce Moshi, a speech-text foundation model and full-duplex spoken dialogue framework. Current systems for spoken dialogue rely on pipelines of independent components, namely voice activity detection, speech recognition, textual dialogue and text-to-speech. Such frameworks cannot emulate the experience of real conversations. First, their complexity induces a latency of several seconds between interactions. Second, text being the intermediate modality for dialogue, non-linguistic information that modifies meaning— such as emotion or non-speech sounds— is lost in the interaction. Finally, they rely on a segmentation into speaker turns, which does not take into account overlapping speech, interruptions and interjections. Moshi solves these independent issues altogether by casting spoken dialogue as speech-to-speech generation. Starting from a text language model backbone, Moshi generates speech as tokens from the residual quantizer of a neural audio codec, while modeling separately its own speech and that of the user into parallel streams. This allows for the removal of explicit speaker turns, and the modeling of arbitrary conversational dynamics. We moreover extend the hierarchical semantic-to-acoustic token generation of previous work to first predict time-aligned text tokens as a prefix to audio tokens. Not only this “Inner Monologue” method significantly improves the linguistic quality of generated speech, but we also illustrate how it can provide streaming speech recognition and text-to-speech. Our resulting model is the first real-time full-duplex spoken large language model, with a theoretical latency of 160ms, 200ms in practice, and is available at github.com/kyutai-labs/moshi.* 
+
+Its architecture is based on [Encodec](model_doc/encodec) with several major differences:
+* it uses a much lower frame-rate.
+* it uses additional transformers for encoding and decoding for better latent contextualization
+* it uses a different quantization scheme: one codebook is dedicated to semantic projection.
+
+## Usage example 
+
+Here is a quick example of how to encode and decode an audio using this model:

 ```python 
 >>> from datasets import load_dataset, Audio
@@ -58,8 +59,9 @@ The example below demonstrates how to encode and decode audio with the [`AutoMod
 >>> audio_values = model(inputs["input_values"], inputs["padding_mask"]).audio_values
 ```

-</hfoption>
-</hfoptions>
+This model was contributed by [Yoach Lacombe (ylacombe)](https://huggingface.co/ylacombe).
+The original code can be found [here](https://github.com/kyutai-labs/moshi).
+

 ## MimiConfig

@@ -70,4 +72,4 @@ The example below demonstrates how to encode and decode audio with the [`AutoMod
 [[autodoc]] MimiModel
    - decode
    - encode
-    - forward
+    - forward
--- a/docs/source/en/model_doc/minimax.md
+++ b/docs/source/en/model_doc/minimax.md
@@ -115,9 +115,9 @@ The Flash Attention-2 model uses also a more memory efficient cache slicing mech

 ## Shrinking down MiniMax using quantization

-As the MiniMax model has 456 billion parameters, that would require about 912GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization). If the model is quantized to 4 bits (or half a byte per parameter), about 228 GB of RAM is required.
+As the MiniMax model has 456 billion parameters, that would require about 912GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization.md). If the model is quantized to 4 bits (or half a byte per parameter), about 228 GB of RAM is required.

-Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the bitsandbytes quantization library (but refer to [this page](../quantization) for alternative quantization methods):
+Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the bitsandbytes quantization library (but refer to [this page](../quantization.md) for alternative quantization methods):

 ```python
 >>> import torch
--- a/docs/source/en/model_doc/mistral3.md
+++ b/docs/source/en/model_doc/mistral3.md
@@ -13,125 +13,116 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.

 -->
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&amp;logo=pytorch&amp;logoColor=white">
-    </div>
-</div>

-# Mistral 3
+# Mistral3

-[Mistral 3](https://mistral.ai/news/mistral-small-3) is a latency optimized model with a lot fewer layers to reduce the time per forward pass. This model adds vision understanding and supports long context lengths of up to 128K tokens without compromising performance.
+## Overview

-You can find the original Mistral 3 checkpoints under the [Mistral AI](https://huggingface.co/mistralai/models?search=mistral-small-3) organization.
+Building upon Mistral Small 3 (2501), Mistral Small 3.1 (2503) adds state-of-the-art vision understanding and enhances long context capabilities up to 128k tokens without compromising text performance. With 24 billion parameters, this model achieves top-tier capabilities in both text and vision tasks.

+It is ideal for:
+- Fast-response conversational agents.
+- Low-latency function calling.
+- Subject matter experts via fine-tuning.
+- Local inference for hobbyists and organizations handling sensitive data.
+- Programming and math reasoning.
+- Long document understanding.
+- Visual understanding.

-> [!TIP]
-> This model was contributed by [cyrilvallez](https://huggingface.co/cyrilvallez) and [yonigozlan](https://huggingface.co/yonigozlan).
-> Click on the Mistral3 models in the right sidebar for more examples of how to apply Mistral3 to different tasks.
+This model was contributed by [cyrilvallez](https://huggingface.co/cyrilvallez) and [yonigozlan](https://huggingface.co/yonigozlan).

-The example below demonstrates how to generate text for an image with [`Pipeline`] and the [`AutoModel`] class.
+The original code can be found [here](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/pixtral.py) and [here](https://github.com/mistralai/mistral-common).

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+## Usage example

-```py
-import torch
-from transformers import pipeline
+### Inference with Pipeline

-messages = [
-    {"role": "user",
-        "content":[
-            {"type": "image",
-            "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",},
-            {"type": "text", "text": "Describe this image."}
-        ,]
-    ,}
-,]
+Here is how you can use the `image-text-to-text` pipeline to perform inference with the `Mistral3` models in just a few lines of code:
+```python
+>>> from transformers import pipeline

-pipeline = pipeline(
-    task="image-text-to-text", 
-    model="mistralai/Mistral-Small-3.1-24B-Instruct-2503", 
-    torch_dtype=torch.bfloat16,
-    device=0
-)
-outputs = pipeline(text=messages, max_new_tokens=50, return_full_text=False)
+>>> messages = [
+...     {
+...         "role": "user",
+...         "content": [
+...             {
+...                 "type": "image",
+...                 "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
+...             },
+...             {"type": "text", "text": "Describe this image."},
+...         ],
+...     },
+... ]

-outputs[0]["generated_text"]
+>>> pipe = pipeline("image-text-to-text", model="mistralai/Mistral-Small-3.1-24B-Instruct-2503", torch_dtype=torch.bfloat16)
+>>> outputs = pipe(text=messages, max_new_tokens=50, return_full_text=False)
+>>> outputs[0]["generated_text"]
 'The image depicts a vibrant and lush garden scene featuring a variety of wildflowers and plants. The central focus is on a large, pinkish-purple flower, likely a Greater Celandine (Chelidonium majus), with a'
 ```
-</hfoption>
-<hfoption id="AutoModel">
+### Inference on a single image

-```py
-import torch
-from transformers import AutoProcessor, AutoModelForImageTextToText 
+This example demonstrates how to perform inference on a single image with the Mistral3 models using chat templates.

-torch_device = "cuda"
-model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
-processor = AutoProcessor.from_pretrained(model_checkpoint)
-model = AutoModelForImageTextToText.from_pretrained(
-    model_checkpoint, 
-    device_map=torch_device, 
-    torch_dtype=torch.bfloat16
-)
+```python
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch

-messages = [
-    {"role": "user",
-        "content":[
-            {"type": "image",
-            "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",},
-            {"type": "text", "text": "Describe this image."}
-        ,]
-    ,}
-,]
+>>> torch_device = "cuda"
+>>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
+>>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)

-inputs = processor.apply_chat_template(
-    messages, 
-    add_generation_prompt=True, 
-    tokenize=True, return_dict=True, 
-    return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+>>> messages = [
+...     {
+...         "role": "user",
+...         "content": [
+...             {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+...             {"type": "text", "text": "Describe this image"},
+...         ],
+...     }
+... ]

-generate_ids = model.generate(**inputs, max_new_tokens=20)
-decoded_output = processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+>>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)

-decoded_output
-'The image depicts a vibrant and lush garden scene featuring a variety of wildflowers and plants. The central focus is on a large, pinkish-purple flower, likely a Greater Celandine (Chelidonium majus), with a'
+>>> generate_ids = model.generate(**inputs, max_new_tokens=20)
+>>> decoded_output = processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+
+>>> decoded_output
+"The image depicts two cats lying on a pink blanket. The larger cat, which appears to be an"...
 ```
-</hfoption>
-</hfoptions>

-## Notes 
+### Text-only generation
+This example shows how to generate text using the Mistral3 model without providing any image input.

- Mistral 3 supports text-only generation. 
-```py 
-from transformers import AutoProcessor, AutoModelForImageTextToText
-import torch

-torch_device = "cuda"
-model_checkpoint = ".mistralai/Mistral-Small-3.1-24B-Instruct-2503"
-processor = AutoProcessor.from_pretrained(model_checkpoint)
-model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
+````python
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch

-SYSTEM_PROMPT = "You are a conversational agent that always answers straight to the point, always end your accurate response with an ASCII drawing of a cat."
-user_prompt = "Give me 5 non-formal ways to say 'See you later' in French."
+>>> torch_device = "cuda"
+>>> model_checkpoint = ".mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
+>>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)

-messages = [
-    {"role": "system", "content": SYSTEM_PROMPT},
-    {"role": "user", "content": user_prompt},
-]
+>>> SYSTEM_PROMPT = "You are a conversational agent that always answers straight to the point, always end your accurate response with an ASCII drawing of a cat."
+>>> user_prompt = "Give me 5 non-formal ways to say 'See you later' in French."

-text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-inputs = processor(text=text, return_tensors="pt").to(0, dtype=torch.float16)
-generate_ids = model.generate(**inputs, max_new_tokens=50, do_sample=False)
-decoded_output = processor.batch_decode(generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)[0]
+>>> messages = [
+...    {"role": "system", "content": SYSTEM_PROMPT},
+...    {"role": "user", "content": user_prompt},
+... ]

-print(decoded_output)
+>>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+>>> inputs = processor(text=text, return_tensors="pt").to(0, dtype=torch.float16)
+>>> generate_ids = model.generate(**inputs, max_new_tokens=50, do_sample=False)
+>>> decoded_output = processor.batch_decode(generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)[0]
+
+>>> print(decoded_output)
 "1. À plus tard!
- 2. Salut, à plus!
- 3. À toute!
- 4. À la prochaine!
- 5. Je me casse, à plus!
+2. Salut, à plus!
+3. À toute!
+4. À la prochaine!
+5. Je me casse, à plus!

 ```
 /\_/\
@@ -140,93 +131,98 @@ print(decoded_output)
 ```"
 ````

- Mistral 3 accepts batched image and text inputs. 
-```py
-from transformers import AutoProcessor, AutoModelForImageTextToText
-import torch
+### Batched image and text inputs
+Mistral3 models also support batched image and text inputs.

-torch_device = "cuda"
-model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
-processor = AutoProcessor.from_pretrained(model_checkpoint)
-model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
+```python
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch

-messages = [
-     [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
-                 {"type": "text", "text": "Write a haiku for this image"},
-             ],
-         },
-     ],
-     [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
-                 {"type": "text", "text": "Describe this image"},
-             ],
-         },
-     ],
- ]
+>>> torch_device = "cuda"
+>>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
+>>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
+
+>>> messages = [
+...     [
+...         {
+...             "role": "user",
+...             "content": [
+...                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
+...                 {"type": "text", "text": "Write a haiku for this image"},
+...             ],
+...         },
+...     ],
+...     [
+...         {
+...             "role": "user",
+...             "content": [
+...                 {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
+...                 {"type": "text", "text": "Describe this image"},
+...             ],
+...         },
+...     ],
+... ]


- inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+>>> inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)

- output = model.generate(**inputs, max_new_tokens=25)
+>>> output = model.generate(**inputs, max_new_tokens=25)

- decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
- decoded_outputs
+>>> decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
+>>> decoded_outputs
 ["Write a haiku for this imageCalm waters reflect\nWhispers of the forest's breath\nPeace on wooden path"
 , "Describe this imageThe image depicts a vibrant street scene in what appears to be a Chinatown district. The focal point is a traditional Chinese"]
 ```

- Mistral 3 also supported batched image and text inputs with a different number of images for each text. The example below quantizes the model with bitsandbytes. 
+### Batched multi-image input and quantization with BitsAndBytes
+This implementation of the Mistral3 models supports batched text-images inputs with different number of images for each text.
+This example also how to use `BitsAndBytes` to load the model in 4bit quantization.

-```py 
-from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
-import torch
+```python
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
+>>> import torch

-torch_device = "cuda"
-model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
-processor = AutoProcessor.from_pretrained(model_checkpoint)
-quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-model = AutoModelForImageTextToText.from_pretrained(
-     model_checkpoint, quantization_config=quantization_config
- )
+>>> torch_device = "cuda"
+>>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
+>>> quantization_config = BitsAndBytesConfig(load_in_4bit=True)
+>>> model = AutoModelForImageTextToText.from_pretrained(
+...     model_checkpoint, quantization_config=quantization_config
+... )

-messages = [
-     [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
-                 {"type": "text", "text": "Write a haiku for this image"},
-             ],
-         },
-     ],
-     [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "image", "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
-                 {"type": "image", "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"},
-                 {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
-             ],
-         },
-     ],
- ]
+>>> messages = [
+...     [
+...         {
+...             "role": "user",
+...             "content": [
+...                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
+...                 {"type": "text", "text": "Write a haiku for this image"},
+...             ],
+...         },
+...     ],
+...     [
+...         {
+...             "role": "user",
+...             "content": [
+...                 {"type": "image", "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
+...                 {"type": "image", "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"},
+...                 {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
+...             ],
+...         },
+...     ],
+>>> ]

- inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+>>> inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)

- output = model.generate(**inputs, max_new_tokens=25)
+>>> output = model.generate(**inputs, max_new_tokens=25)

- decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
- decoded_outputs
+>>> decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
+>>> decoded_outputs
 ["Write a haiku for this imageSure, here is a haiku inspired by the image:\n\nCalm lake's wooden path\nSilent forest stands guard\n", "These images depict two different landmarks. Can you identify them? Certainly! The images depict two iconic landmarks:\n\n1. The first image shows the Statue of Liberty in New York City."]
 ```

+
 ## Mistral3Config

 [[autodoc]] Mistral3Config
--- a/docs/source/en/model_doc/mixtral.md
+++ b/docs/source/en/model_doc/mixtral.md
@@ -146,9 +146,9 @@ The Flash Attention-2 model uses also a more memory efficient cache slicing mech

 ## Shrinking down Mixtral using quantization

-As the Mixtral model has 45 billion parameters, that would require about 90GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization). If the model is quantized to 4 bits (or half a byte per parameter), a single A100 with 40GB of RAM is enough to fit the entire model, as in that case only about 27 GB of RAM is required.
+As the Mixtral model has 45 billion parameters, that would require about 90GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization.md). If the model is quantized to 4 bits (or half a byte per parameter), a single A100 with 40GB of RAM is enough to fit the entire model, as in that case only about 27 GB of RAM is required.

-Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the bitsandbytes quantization library (but refer to [this page](../quantization) for alternative quantization methods):
+Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the bitsandbytes quantization library (but refer to [this page](../quantization.md) for alternative quantization methods):

 ```python
 >>> import torch
--- a/docs/source/en/model_doc/mm-grounding-dino.md
+++ b/docs/source/en/model_doc/mm-grounding-dino.md
@@ -1,124 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
-
-# MM Grounding DINO
-
-[MM Grounding DINO](https://arxiv.org/abs/2401.02361) model was proposed in [An Open and Comprehensive Pipeline for Unified Object Grounding and Detection](https://arxiv.org/abs/2401.02361) by Xiangyu Zhao, Yicheng Chen, Shilin Xu, Xiangtai Li, Xinjiang Wang, Yining Li, Haian Huang>.
-
-MM Grounding DINO improves upon the [Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino) by improving the contrastive class head and removing the parameter sharing in the decoder, improving zero-shot detection performance on both COCO (50.6(+2.2) AP) and LVIS (31.9(+11.8) val AP and 41.4(+12.6) minival AP).
-
-You can find all the original MM Grounding DINO checkpoints under the [MM Grounding DINO](https://huggingface.co/collections/openmmlab-community/mm-grounding-dino-688cbde05b814c4e2832f9df) collection. This model also supports LLMDet inference. You can find LLMDet checkpoints under the [LLMDet](https://huggingface.co/collections/iSEE-Laboratory/llmdet-688475906dc235d5f1dc678e) collection.
-
-> [!TIP]
-> Click on the MM Grounding DINO models in the right sidebar for more examples of how to apply MM Grounding DINO to different MM Grounding DINO tasks.
-
-The example below demonstrates how to generate text based on an image with the [`AutoModelForZeroShotObjectDetection`] class.
-
-<hfoptions id="usage">
-<hfoption id="AutoModel">
-
-```py
-import torch
-from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor
-from transformers.image_utils import load_image
-
-
-# Prepare processor and model
-model_id = "openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-processor = AutoProcessor.from_pretrained(model_id)
-model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
-
-# Prepare inputs
-image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-image = load_image(image_url)
-text_labels = [["a cat", "a remote control"]]
-inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device)
-
-# Run inference
-with torch.no_grad():
-    outputs = model(**inputs)
-
-# Postprocess outputs
-results = processor.post_process_grounded_object_detection(
-    outputs,
-    threshold=0.4,
-    target_sizes=[(image.height, image.width)]
-)
-
-# Retrieve the first image result
-result = results[0]
-for box, score, labels in zip(result["boxes"], result["scores"], result["labels"]):
-    box = [round(x, 2) for x in box.tolist()]
-    print(f"Detected {labels} with confidence {round(score.item(), 3)} at location {box}")
-```
-
-</hfoption>
-</hfoptions>
-
-## Notes
-
- Here's a table of models and their object detection performance results on COCO (results from [official repo](https://github.com/open-mmlab/mmdetection/blob/main/configs/mm_grounding_dino/README.md)):
-
-    |                                                              Model                                                             | Backbone |      Pre-Train Data      |   Style   |  COCO mAP  |
-    | ------------------------------------------------------------------------------------------------------------------------------ | -------- | ------------------------ | --------- | ---------- |
-    |  [mm_grounding_dino_tiny_o365v1_goldg](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg)                       |  Swin-T  |        O365,GoldG        | Zero-shot | 50.4(+2.3) |
-    |  [mm_grounding_dino_tiny_o365v1_goldg_grit](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_grit)             |  Swin-T  |     O365,GoldG,GRIT      | Zero-shot | 50.5(+2.1) |
-    |  [mm_grounding_dino_tiny_o365v1_goldg_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det)           |  Swin-T  |     O365,GoldG,V3Det     | Zero-shot | 50.6(+2.2) |
-    |  [mm_grounding_dino_tiny_o365v1_goldg_grit_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_grit_v3det) |  Swin-T  |  O365,GoldG,GRIT,V3Det   | Zero-shot | 50.4(+2.0) |
-    |  [mm_grounding_dino_base_o365v1_goldg_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_base_o365v1_goldg_v3det)           |  Swin-B  |     O365,GoldG,V3Det     | Zero-shot |    52.5    |
-    |  [mm_grounding_dino_base_all](https://huggingface.co/openmmlab-community/mm_grounding_dino_base_all)                                         |  Swin-B  |         O365,ALL         |     -     |    59.5    |
-    |  [mm_grounding_dino_large_o365v2_oiv6_goldg](https://huggingface.co/openmmlab-community/mm_grounding_dino_large_o365v2_oiv6_goldg)           |  Swin-L  | O365V2,OpenImageV6,GoldG | Zero-shot |    53.0    |
-    |  [mm_grounding_dino_large_all](https://huggingface.co/openmmlab-community/mm_grounding_dino_large_all)                                       |  Swin-L  |  O365V2,OpenImageV6,ALL  |     -     |    60.3    |
-
- Here's a table of MM Grounding DINO tiny models and their object detection performance on LVIS (results from [official repo](https://github.com/open-mmlab/mmdetection/blob/main/configs/mm_grounding_dino/README.md)):
-
-    |                                                              Model                                                             |    Pre-Train Data     | MiniVal APr | MiniVal APc | MiniVal APf | MiniVal AP  | Val1.0 APr | Val1.0 APc | Val1.0 APf |  Val1.0 AP  |
-    | ------------------------------------------------------------------------------------------------------------------------------ | --------------------- | ----------- | ----------- | ----------- | ----------- | ---------- | ---------- | ---------- | ----------- |
-    |  [mm_grounding_dino_tiny_o365v1_goldg](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg)                       |      O365,GoldG       |    28.1     |    30.2     |    42.0     | 35.7(+6.9)  |    17.1    |    22.4    |    36.5    | 27.0(+6.9)  |
-    |  [mm_grounding_dino_tiny_o365v1_goldg_grit](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_grit)             |    O365,GoldG,GRIT    |    26.6     |    32.4     |    41.8     | 36.5(+7.7)  |    17.3    |    22.6    |    36.4    | 27.1(+7.0)  |
-    |  [mm_grounding_dino_tiny_o365v1_goldg_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det)           |   O365,GoldG,V3Det    |    33.0     |    36.0     |    45.9     | 40.5(+11.7) |    21.5    |    25.5    |    40.2    | 30.6(+10.5) |
-    |  [mm_grounding_dino_tiny_o365v1_goldg_grit_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_grit_v3det) | O365,GoldG,GRIT,V3Det |    34.2     |    37.4     |    46.2     | 41.4(+12.6) |    23.6    |    27.6    |    40.5    | 31.9(+11.8) |
-
-
- This implementation also supports inference for [LLMDet](https://github.com/iSEE-Laboratory/LLMDet). Here's a table of LLMDet models and their performance on LVIS (results from [official repo](https://github.com/iSEE-Laboratory/LLMDet)):
-
-    |                             Model                         | Pre-Train Data            |  MiniVal APr | MiniVal APc | MiniVal APf | MiniVal AP  | Val1.0 APr | Val1.0 APc | Val1.0 APf |  Val1.0 AP  |
-    | --------------------------------------------------------- | -------------------------------------------- | ------------ | ----------- | ----------- | ----------- | ---------- | ---------- | ---------- | ----------- |
-    | [llmdet_tiny](https://huggingface.co/iSEE-Laboratory/llmdet_tiny)   | (O365,GoldG,GRIT,V3Det) + GroundingCap-1M    | 44.7         | 37.3        | 39.5        | 50.7        | 34.9       | 26.0       | 30.1       | 44.3        |
-    | [llmdet_base](https://huggingface.co/iSEE-Laboratory/llmdet_base)   | (O365,GoldG,V3Det) + GroundingCap-1M         | 48.3         | 40.8        | 43.1        | 54.3        | 38.5       | 28.2       | 34.3       | 47.8        |
-    | [llmdet_large](https://huggingface.co/iSEE-Laboratory/llmdet_large) | (O365V2,OpenImageV6,GoldG) + GroundingCap-1M | 51.1         | 45.1        | 46.1        | 56.6        | 42.0       | 31.6       | 38.8       | 50.2        |
-
-
-## MMGroundingDinoConfig
-
-[[autodoc]] MMGroundingDinoConfig
-
-## MMGroundingDinoModel
-
-[[autodoc]] MMGroundingDinoModel
-    - forward
-
-## MMGroundingDinoForObjectDetection
-
-[[autodoc]] MMGroundingDinoForObjectDetection
-    - forward
--- a/docs/source/en/model_doc/modernbert-decoder.md
+++ b/docs/source/en/model_doc/modernbert-decoder.md
@@ -24,18 +24,14 @@ rendered properly in your Markdown viewer.

 # ModernBERT Decoder

-ModernBERT Decoder has the same architecture as [ModernBERT](https://huggingface.co/papers/2412.13663) but it is trained from scratch with a causal language modeling objective from the [Ettin paper](https://huggingface.co/papers/2507.11412). This allows for using the same architecture to compare encoders and decoders. This model is the decoder architecture implementation of ModernBERT, designed for autoregressive text generation tasks.
+ModernBERT Decoder is the same architecture as [ModernBERT](https://huggingface.co/papers/2412.13663) but trained from scratch with a causal language modeling (CLM) objective. This allows for using the same architecture for comparing encoders and decoders. This is the decoder architecture implementation of ModernBERT, designed for autoregressive text generation tasks.

-ModernBERT Decoder uses sliding window attention and rotary positional embeddings for efficiency and to handle longer sequences.
-
-You can find all the original ModernBERT Decoder checkpoints under the [jhu-clsp](https://huggingface.co/collections/jhu-clsp/encoders-vs-decoders-the-ettin-suite-686303e16142257eed8e6aeb) collection.
+Like the encoder version, ModernBERT Decoder incorporates modern architectural improvements such as rotary positional embeddings to support sequences of up to 8192 tokens, unpadding to avoid wasting compute on padding tokens, GeGLU layers, and alternating attention patterns. However, it uses causal (unidirectional) attention to enable autoregressive generation.

 > [!TIP]
-> This model was contributed by [orionw](https://huggingface.co/orionweller).
->
 > Click on the ModernBERT Decoder models in the right sidebar for more examples of how to apply ModernBERT Decoder to different text generation tasks.

-The example below demonstrates how to use ModernBERT Decoder for text generation with [`Pipeline`], [`AutoModel`] (with and without quantization), and from the command line. 
+The example below demonstrates how to use ModernBERT Decoder for text generation with [`Pipeline`], [`AutoModel`], and from the command line.

 <hfoptions id="usage">
 <hfoption id="Pipeline">
@@ -46,7 +42,7 @@ from transformers import pipeline

 generator = pipeline(
    task="text-generation",
-    model="jhu-clsp/ettin-decoder-17m",
+    model="blab-jhu/test-32m-dec",
    torch_dtype=torch.float16,
    device=0
 )
@@ -55,7 +51,7 @@ generator("The future of artificial intelligence is", max_length=50, num_return_
 # For sequence classification
 classifier = pipeline(
    task="text-classification",
-    model="jhu-clsp/ettin-decoder-17m",
+    model="blab-jhu/test-32m-dec",
    torch_dtype=torch.float16,
    device=0
 )
@@ -69,9 +65,9 @@ classifier("This movie is really great!")
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer

-tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/ettin-decoder-17m")
+tokenizer = AutoTokenizer.from_pretrained("blab-jhu/test-32m-dec")
 model = AutoModelForCausalLM.from_pretrained(
-    "jhu-clsp/ettin-decoder-17m",
+    "blab-jhu/test-32m-dec",
    torch_dtype=torch.float16,
    device_map="auto",
 )
@@ -96,7 +92,7 @@ print(f"Generated text: {generated_text}")
 from transformers import AutoModelForSequenceClassification

 classifier_model = AutoModelForSequenceClassification.from_pretrained(
-    "jhu-clsp/ettin-decoder-17m",
+    "blab-jhu/test-32m-dec",
    torch_dtype=torch.float16,
    device_map="auto",
    num_labels=2
@@ -115,53 +111,15 @@ print(f"Prediction probabilities: {predictions}")
 ```

 </hfoption>
-
-<hfoption id="AutoModel (w/quantization)">
-
-```
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(
-    load_in_8bit=True,
-)
-
-tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/ettin-decoder-1b")
-model = AutoModelForCausalLM.from_pretrained(
-    "jhu-clsp/ettin-decoder-1b",
-    torch_dtype=torch.float16,
-    device_map="auto",
-    quantization_config=quantization_config
-)
-
-prompt = "The future of artificial intelligence is"
-inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-
-with torch.no_grad():
-    outputs = model.generate(
-        **inputs,
-        max_length=50,
-        num_return_sequences=1,
-        temperature=0.7,
-        do_sample=True,
-        pad_token_id=tokenizer.eos_token_id
-    )
-
-generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-print(f"Generated text: {generated_text}")
-```
-</hfoption>
-
 <hfoption id="transformers CLI">

 ```bash
-echo "The future of artificial intelligence is" | transformers run --task text-generation --model jhu-clsp/ettin-decoder-17m --device 0
+echo "The future of artificial intelligence is" | transformers run --task text-generation --model your-username/modernbert-decoder-base --device 0
 ```

 </hfoption>
 </hfoptions>

-
 ## ModernBertDecoderConfig

 [[autodoc]] ModernBertDecoderConfig
@@ -184,5 +142,14 @@ echo "The future of artificial intelligence is" | transformers run --task text-g
 [[autodoc]] ModernBertDecoderForSequenceClassification
    - forward

+### Usage tips
+
+The ModernBertDecoder model can be fine-tuned for various text generation tasks using the HuggingFace Transformers library. It supports efficient inference with features like:
+
+- **Causal attention**: Ensures autoregressive generation by masking future tokens
+- **Sliding window attention**: Alternates between local and global attention patterns for efficiency
+- **Rotary positional embeddings**: Enables handling of longer sequences up to 8000 tokens
+- **FlashAttention support**: Optimized attention computation for faster training and inference
+
 </pt>
 </frameworkcontent>
--- a/docs/source/en/model_doc/modernbert.md
+++ b/docs/source/en/model_doc/modernbert.md
@@ -115,11 +115,6 @@ echo -e "Plants create [MASK] through a process known as photosynthesis." | tran
 [[autodoc]] ModernBertForTokenClassification
    - forward

-## ModernBertForMultipleChoice
-
-[[autodoc]] ModernBertForMultipleChoice
-    - forward
-
 ## ModernBertForQuestionAnswering

 [[autodoc]] ModernBertForQuestionAnswering
--- a/docs/source/en/model_doc/mt5.md
+++ b/docs/source/en/model_doc/mt5.md
@@ -14,115 +14,54 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC">
-    </div>
-</div>
-
 # mT5

-[mT5](https://huggingface.co/papers/2010.11934) is a multilingual variant of [T5](./t5), training on 101 languages. It also incorporates a new "accidental translation" technique to prevent the model from incorrectly translating predictions into the wrong language.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+">
+</div>

-You can find all the original [mT5] checkpoints under the [mT5](https://huggingface.co/collections/google/mt5-release-65005f1a520f8d7b4d039509) collection.
+## Overview

-> [!TIP]
-> This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten).
->
-> Click on the mT5 models in the right sidebar for more examples of how to apply mT5 to different language tasks.
+The mT5 model was presented in [mT5: A massively multilingual pre-trained text-to-text transformer](https://huggingface.co/papers/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya
+Siddhant, Aditya Barua, Colin Raffel.

-The example below demonstrates how to summarize text with [`Pipeline`], [`AutoModel`], and from the command line.
+The abstract from the paper is the following:

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+*The recent "Text-to-Text Transfer Transformer" (T5) leveraged a unified text-to-text format and scale to attain
+state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a
+multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail
+the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual
+benchmarks. We also describe a simple technique to prevent "accidental translation" in the zero-shot setting, where a
+generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model
+checkpoints used in this work are publicly available.*

-```python
-import torch
-from transformers import pipeline
+Note: mT5 was only pre-trained on [mC4](https://huggingface.co/datasets/mc4) excluding any supervised training.
+Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 model.
+Since mT5 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
+fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.

-pipeline = pipeline(
-    task="text2text-generation",
-    model="csebuetnlp/mT5_multilingual_XLSum",
-    torch_dtype=torch.float16,
-    device=0
-)
-pipeline("""Plants are remarkable organisms that produce their own food using a method called photosynthesis.
-This process involves converting sunlight, carbon dioxide, and water into glucose, which provides energy for growth.
-Plants play a crucial role in sustaining life on Earth by generating oxygen and serving as the foundation of most ecosystems.""")
-```
+Google has released the following variants:

-</hfoption>
-<hfoption id="AutoModel">
+- [google/mt5-small](https://huggingface.co/google/mt5-small)

-```python
-import torch
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+- [google/mt5-base](https://huggingface.co/google/mt5-base)

-tokenizer = AutoTokenizer.from_pretrained(
-    "csebuetnlp/mT5_multilingual_XLSum"
-)
-model = AutoModelForSeq2SeqLM.from_pretrained(
-    "csebuetnlp/mT5_multilingual_XLSum",
-    torch_dtype=torch.float16,
-    device_map="auto",
-)
+- [google/mt5-large](https://huggingface.co/google/mt5-large)

-input_text = """Plants are remarkable organisms that produce their own food using a method called photosynthesis.
-This process involves converting sunlight, carbon dioxide, and water into glucose, which provides energy for growth.
-Plants play a crucial role in sustaining life on Earth by generating oxygen and serving as the foundation of most ecosystems."""
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+- [google/mt5-xl](https://huggingface.co/google/mt5-xl)

-output = model.generate(**input_ids, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
+- [google/mt5-xxl](https://huggingface.co/google/mt5-xxl).

-</hfoption>
-<hfoption id="transformers CLI">
+This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The original code can be
+found [here](https://github.com/google-research/multilingual-t5).

-```bash
-echo -e "Plants are remarkable organisms that produce their own food using a method called photosynthesis." | transformers run --task text2text-generation --model csebuetnlp/mT5_multilingual_XLSum --device 0
-```
+## Resources

-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
-The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
-
-```python
-import torch
-from transformers import BitsAndBytesConfig, AutoModelForSeq2SeqLM, AutoTokenizer
-
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_quant_type="nf4"
-)
-model = AutoModelForSeq2SeqLM.from_pretrained(
-    "csebuetnlp/mT5_multilingual_XLSum",
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    quantization_config=quantization_config
-)
-
-tokenizer = AutoTokenizer.from_pretrained(
-    "csebuetnlp/mT5_multilingual_XLSum"
-)
-input_text = """Plants are remarkable organisms that produce their own food using a method called photosynthesis.
-This process involves converting sunlight, carbon dioxide, and water into glucose, which provides energy for growth.
-Plants play a crucial role in sustaining life on Earth by generating oxygen and serving as the foundation of most ecosystems."""
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-output = model.generate(**input_ids, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-
-## Notes
-
- mT5 must be fine-tuned for downstream tasks because it was only pretrained on the [mc4](https://huggingface.co/datasets/mc4) dataset.
+- [Translation task guide](../tasks/translation)
+- [Summarization task guide](../tasks/summarization)

 ## MT5Config

--- a/docs/source/en/model_doc/olmoe.md
+++ b/docs/source/en/model_doc/olmoe.md
@@ -14,89 +14,27 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
+# OLMoE
+
 <div class="flex flex-wrap space-x-1">
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
 </div>
-</div>

-# OLMoE
+## Overview

-[OLMoE](https://huggingface.co/papers/2409.02060) is a sparse Mixture-of-Experts (MoE) language model with 7B parameters but only 1B parameters are used per input token. It has similar inference costs as dense models but trains ~3x faster. OLMoE uses fine-grained routing with 64 small experts in each layer and uses a dropless token-based routing algorithm.
+The OLMoE model was proposed in [OLMoE: Open Mixture-of-Experts Language Models](https://huggingface.co/papers/2409.02060) by Niklas Muennighoff, Luca Soldaini, Dirk Groeneveld, Kyle Lo, Jacob Morrison, Sewon Min, Weijia Shi, Pete Walsh, Oyvind Tafjord, Nathan Lambert, Yuling Gu, Shane Arora, Akshita Bhagia, Dustin Schwenk, David Wadden, Alexander Wettig, Binyuan Hui, Tim Dettmers, Douwe Kiela, Ali Farhadi, Noah A. Smith, Pang Wei Koh, Amanpreet Singh, Hannaneh Hajishirzi.

-You can find all the original OLMoE checkpoints under the [OLMoE](https://huggingface.co/collections/allenai/olmoe-november-2024-66cf678c047657a30c8cd3da) collection.
+OLMoE is a series of **O**pen **L**anguage **Mo**dels using sparse **M**ixture-**o**f-**E**xperts designed to enable the science of language models. We release all code, checkpoints, logs, and details involved in training these models.

-> [!TIP]
-> This model was contributed by [Muennighoff](https://hf.co/Muennighoff).
->
-> Click on the OLMoE models in the right sidebar for more examples of how to apply OLMoE to different language tasks.
+The abstract from the paper is the following:

-The example below demonstrates how to generate text with [`Pipeline`] or the [`AutoModel`] class.
+*We introduce OLMoE, a fully open, state-of-the-art language model leveraging sparse Mixture-of-Experts (MoE). OLMoE-1B-7B has 7 billion (B) parameters but uses only 1B per input token. We pretrain it on 5 trillion tokens and further adapt it to create OLMoE-1B-7B-Instruct. Our models outperform all available models with similar active parameters, even surpassing larger ones like Llama2-13B-Chat and DeepSeekMoE-16B. We present various experiments on MoE training, analyze routing in our model showing high specialization, and open-source all aspects of our work: model weights, training data, code, and logs.*

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+This model was contributed by [Muennighoff](https://hf.co/Muennighoff).
+The original code can be found [here](https://github.com/allenai/OLMoE).

-```py
-import torch
-from transformers import pipeline
-
-pipe = pipeline(
-    task="text-generation",
-    model="allenai/OLMoE-1B-7B-0125",
-    torch_dtype=torch.float16,
-    device=0,
-)
-
-result = pipe("Dionysus is the god of")
-print(result)
-```
-
-</hfoption>
-<hfoption id="AutoModel">
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-model = AutoModelForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924", attn_implementation="sdpa", torch_dtype="auto", device_map="auto").to(device)
-tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0924")
-
-inputs = tokenizer("Bitcoin is", return_tensors="pt")
-inputs = {k: v.to(device) for k, v in inputs.items()}
-output = model.generate(**inputs, max_length=64)
-print(tokenizer.decode(output[0]))
-```
-
-## Quantization
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits.
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-quantization_config = BitsAndBytesConfig(
-   load_in_4bit=True,
-   bnb_4bit_compute_dtype=torch.float16,
-   bnb_4bit_use_double_quant=True,
-   bnb_4bit_quant_type="nf4"
-)
-
-model = AutoModelForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924", attn_implementation="sdpa", torch_dtype="auto", device_map="auto", quantization_config=quantization_config).to(device)
-tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0924")
-
-inputs = tokenizer("Bitcoin is", return_tensors="pt")
-inputs = {k: v.to(device) for k, v in inputs.items()}
-output = model.generate(**inputs, max_length=64)
-print(tokenizer.decode(output[0]))
-```

 ## OlmoeConfig

--- a/docs/source/en/model_doc/oneformer.md
+++ b/docs/source/en/model_doc/oneformer.md
@@ -38,7 +38,7 @@ This model was contributed by [Jitesh Jain](https://huggingface.co/praeclarumjj3

 ## Usage tips

-  OneFormer requires two inputs during inference: *image* and *task token*.
+-  OneFormer requires two inputs during inference: *image* and *task token*. 
 - During training, OneFormer only uses panoptic annotations.
 - If you want to train the model in a distributed environment across multiple nodes, then one should update the
  `get_num_masks` function inside in the `OneFormerLoss` class of `modeling_oneformer.py`. When training on multiple nodes, this should be
@@ -69,14 +69,7 @@ The resource should ideally demonstrate something new instead of duplicating an

 [[autodoc]] OneFormerImageProcessor
    - preprocess
-    - post_process_semantic_segmentation
-    - post_process_instance_segmentation
-    - post_process_panoptic_segmentation
-
-## OneFormerImageProcessorFast
-
-[[autodoc]] OneFormerImageProcessorFast
-    - preprocess
+    - encode_inputs
    - post_process_semantic_segmentation
    - post_process_instance_segmentation
    - post_process_panoptic_segmentation
@@ -94,3 +87,4 @@ The resource should ideally demonstrate something new instead of duplicating an

 [[autodoc]] OneFormerForUniversalSegmentation
    - forward
+    
--- a/docs/source/en/model_doc/opt.md
+++ b/docs/source/en/model_doc/opt.md
@@ -1,101 +1,194 @@
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-           <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-           <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N9lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFmsnSos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQsKKaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScSKSBqKCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD82gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtbREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG23nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-">
-           <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-           <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->

 # OPT

-[OPT](https://huggingface.co/papers/2205.01068) is a suite of open-source decoder-only pre-trained transformers whose parameters range from 125M to 175B. OPT models are designed for casual language modeling and aim to enable responsible and reproducible research at scale. OPT-175B is comparable in performance to GPT-3 with only 1/7th the carbon footprint.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

-You can find all the original OPT checkpoints under the [OPT](https://huggingface.co/collections/facebook/opt-66ed00e15599f02966818844) collection.
+## Overview

-> [!TIP]
-> This model was contributed by [ArthurZ](https://huggingface.co/ArthurZ), [ybelkada](https://huggingface.co/ybelkada), and [patrickvonplaten](https://huggingface.co/patrickvonplaten).
->
-> Click on the OPT models in the right sidebar for more examples of how to apply OPT to different language tasks.
+The OPT model was proposed in [Open Pre-trained Transformer Language Models](https://huggingface.co/papers/2205.01068) by Meta AI.
+OPT is a series of open-sourced large causal language models which perform similar in performance to GPT3.

-The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.
+The abstract from the paper is the following:

+*Large language models, which are often trained for hundreds of thousands of compute days, have shown remarkable capabilities for zero- and few-shot learning. Given their computational cost, these models are difficult to replicate without significant capital. For the few that are available through APIs, no access is granted to the full model weights, making them difficult to study. We present Open Pre-trained Transformers (OPT), a suite of decoder-only pre-trained transformers ranging from 125M to 175B parameters, which we aim to fully and responsibly share with interested researchers. We show that OPT-175B is comparable to GPT-3, while requiring only 1/7th the carbon footprint to develop. We are also releasing our logbook detailing the infrastructure challenges we faced, along with code for experimenting with all of the released models.*

-<hfoptions id="usage">
-<hfoption id="Pipeline">
-  
-```py  
-import torch
-from transformers import pipeline
+This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ), [Younes Belkada](https://huggingface.co/ybelkada), and [Patrick Von Platen](https://huggingface.co/patrickvonplaten).
+The original code can be found [here](https://github.com/facebookresearch/metaseq).

-pipeline = pipeline(task="text-generation", model="facebook/opt-125m", torch_dtype=torch.float16, device=0)
-pipeline("Once upon a time, in a land far, far away,", max_length=50, num_return_sequences=1)
-```
+Tips:
+- OPT has the same architecture as [`BartDecoder`].
+- Contrary to GPT2, OPT adds the EOS token `</s>` to the beginning of every prompt.

-</hfoption>
-<hfoption id="AutoModel">
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-device = "cuda"
-
-model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16, attn_implementation="sdpa")
-tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
-
-prompt = ("Once upon a time, in a land far, far away, ")
-
-model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
-model.to(device)
-
-generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
-tokenizer.batch_decode(generated_ids)[0]
-```
-</hfoption>
-<hfoption id="transformers CLI">
-
-```py
-echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model facebook/opt-125m --device 0
-```
-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
-The example below uses [bitsandbytes](..quantization/bitsandbytes) to quantize the weights to 8-bits.
-
-```py
-import torch
-from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
-
-device = "cuda"
-
-bnb_config = BitsAndBytesConfig(load_in_8bit=True)
-model = AutoModelForCausalLM.from_pretrained("facebook/opt-13b", torch_dtype=torch.float16, attn_implementation="sdpa", quantization_config=bnb_config)
-tokenizer = AutoTokenizer.from_pretrained("facebook/opt-13b")
-
-prompt = ("Once upon a time, in a land far, far away, ")
-
-model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
-model.to(device)
-
-generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
-tokenizer.batch_decode(generated_ids)[0]
-```
-
-## Notes
-
- OPT adds an `EOS` token `</s>` to the beginning of every prompt.
-
- The `head_mask` argument is ignored if the attention implementation isn't `"eager"`. Set `attn_implementation="eager"` to enable the `head_mask`.
+> [!NOTE]
+> The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`

 ## Resources

- Refer to this [notebook](https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o?usp=sharing) for an example of fine-tuning OPT with PEFT, bitsandbytes, and Transformers.
- The [How 🤗 Accelerate runs very large models thanks to PyTorch](https://huggingface.co/blog/accelerate-large-models) blog post demonstrates how to run OPT for inference.
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with OPT. If you're
+interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it.
+The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+<PipelineTag pipeline="text-generation" />
+
+- A notebook on [fine-tuning OPT with PEFT, bitsandbytes, and Transformers](https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o?usp=sharing). 🌎
+- A blog post on [decoding strategies with OPT](https://huggingface.co/blog/introducing-csearch#62-example-two---opt).
+- [Causal language modeling](https://huggingface.co/course/en/chapter7/6?fw=pt#training-a-causal-language-model-from-scratch) chapter of the 🤗 Hugging Face Course.
+- [`OPTForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb).
+- [`TFOPTForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_clmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb).
+- [`FlaxOPTForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#causal-language-modeling).
+
+<PipelineTag pipeline="text-classification" />
+
+- [Text classification task guide](sequence_classification.md)
+- [`OPTForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb).
+
+<PipelineTag pipeline="question-answering" />
+
+- [`OPTForQuestionAnswering`] is supported by this [question answering example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb).
+- [Question answering](https://huggingface.co/course/chapter7/7?fw=pt) chapter
+  of the 🤗 Hugging Face Course.
+
+⚡️ Inference
+
+- A blog post on [How 🤗 Accelerate runs very large models thanks to PyTorch](https://huggingface.co/blog/accelerate-large-models) with OPT.
+
+
+## Combining OPT and Flash Attention 2
+
+First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
+
+```bash
+pip install -U flash-attn --no-build-isolation
+```
+
+Make also sure that you have a hardware that is compatible with Flash-Attention 2. Read more about it in the official documentation of flash-attn repository. Make also sure to load your model in half-precision (e.g. `torch.float16``)
+
+To load and run a model using Flash Attention 2, refer to the snippet below:
+
+```python
+>>> import torch
+>>> from transformers import OPTForCausalLM, GPT2Tokenizer
+>>> device = "cuda" # the device to load the model onto
+
+>>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
+>>> tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350m")
+
+>>> prompt = ("A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the "
+              "Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived "
+              "there?")
+
+>>> model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
+>>> model.to(device)
+
+>>> generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
+>>> tokenizer.batch_decode(generated_ids)[0]
+'</s>A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived there?\nStatue: I have lived here for about a year.\nHuman: What is your favorite place to eat?\nStatue: I love'
+```
+
+### Expected speedups
+
+Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `facebook/opt-2.7b` checkpoint and the Flash Attention 2 version of the model using two different sequence lengths.
+
+<div style="text-align: center">
+<img src="https://user-images.githubusercontent.com/49240599/281101546-d2fca6d2-ee44-48f3-9534-ba8d5bee4531.png">
+</div>
+
+Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `facebook/opt-350m` checkpoint and the Flash Attention 2 version of the model using two different sequence lengths.
+
+<div style="text-align: center">
+<img src="https://user-images.githubusercontent.com/49240599/281101682-d1144e90-0dbc-46f4-8fc8-c6206cb793c9.png">
+</div>
+
+
+### Using Scaled Dot Product Attention (SDPA)
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
+encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
+[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
+or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
+page for more information.
+
+SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
+`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
+
+```python
+from transformers import OPTForCausalLM
+model = OPTForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16, attn_implementation="sdpa")
+...
+```
+
+For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
+
+On a local benchmark (L40S-45GB, PyTorch 2.4.0, OS Debian GNU/Linux 11) using `float16` with
+[facebook/opt-350m](https://huggingface.co/facebook/opt-350m), we saw the
+following speedups during training and inference.
+
+### Training
+
+|    batch_size |    seq_len |  Time per batch (eager - s)   |    Time per batch (sdpa - s) |  Speedup (%)   |  Eager peak mem (MB)   |    sdpa peak mem (MB) |  Mem saving (%)   |
+|--------------:|-----------:|:------------------------------|-----------------------------:|:---------------|:-----------------------|----------------------:|:------------------|
+|             1 |        128 | 0.047                         |                        0.037 | 26.360         | 1474.611               |               1474.32 | 0.019             |
+|             1 |        256 | 0.046                         |                        0.037 | 24.335         | 1498.541               |               1499.49 | -0.063            |
+|             1 |        512 | 0.046                         |                        0.037 | 24.959         | 1973.544               |               1551.35 | 27.215            |
+|             1 |       1024 | 0.062                         |                        0.038 | 65.135         | 4867.113               |               1698.35 | 186.578           |
+|             1 |       2048 | 0.230                         |                        0.039 | 483.933        | 15662.224              |               2715.75 | 476.718           |
+|             2 |        128 | 0.045                         |                        0.037 | 20.455         | 1498.164               |               1499.49 | -0.089            |
+|             2 |        256 | 0.046                         |                        0.037 | 24.027         | 1569.367               |               1551.35 | 1.161             |
+|             2 |        512 | 0.045                         |                        0.037 | 20.965         | 3257.074               |               1698.35 | 91.778            |
+|             2 |       1024 | 0.122                         |                        0.038 | 225.958        | 9054.405               |               2715.75 | 233.403           |
+|             2 |       2048 | 0.464                         |                        0.067 | 593.646        | 30572.058              |               4750.55 | 543.548           |
+|             4 |        128 | 0.045                         |                        0.037 | 21.918         | 1549.448               |               1551.35 | -0.123            |
+|             4 |        256 | 0.044                         |                        0.038 | 18.084         | 2451.768               |               1698.35 | 44.361            |
+|             4 |        512 | 0.069                         |                        0.037 | 84.421         | 5833.180               |               2715.75 | 114.791           |
+|             4 |       1024 | 0.262                         |                        0.062 | 319.475        | 17427.842              |               4750.55 | 266.860           |
+|             4 |       2048 | OOM                           |                        0.062 | Eager OOM      | OOM                    |               4750.55 | Eager OOM         |
+|             8 |        128 | 0.044                         |                        0.037 | 18.436         | 2049.115               |               1697.78 | 20.694            |
+|             8 |        256 | 0.048                         |                        0.036 | 32.887         | 4222.567               |               2715.75 | 55.484            |
+|             8 |        512 | 0.153                         |                        0.06  | 154.862        | 10985.391              |               4750.55 | 131.245           |
+|             8 |       1024 | 0.526                         |                        0.122 | 330.697        | 34175.763              |               8821.18 | 287.428           |
+|             8 |       2048 | OOM                           |                        0.122 | Eager OOM      | OOM                    |               8821.18 | Eager OOM         |
+
+### Inference
+
+|    batch_size |    seq_len |    Per token latency eager (ms) |    Per token latency SDPA (ms) |    Speedup (%) |    Mem eager (MB) |    Mem BT (MB) |    Mem saved (%) |
+|--------------:|-----------:|--------------------------------:|-------------------------------:|---------------:|------------------:|---------------:|-----------------:|
+|             1 |        128 |                          11.634 |                          8.647 |         34.546 |           717.676 |        717.674 |            0     |
+|             1 |        256 |                          11.593 |                          8.86  |         30.851 |           742.852 |        742.845 |            0.001 |
+|             1 |        512 |                          11.515 |                          8.816 |         30.614 |           798.232 |        799.593 |           -0.17  |
+|             1 |       1024 |                          11.556 |                          8.915 |         29.628 |           917.265 |        895.538 |            2.426 |
+|             2 |        128 |                          12.724 |                         11.002 |         15.659 |           762.434 |        762.431 |            0     |
+|             2 |        256 |                          12.704 |                         11.063 |         14.83  |           816.809 |        816.733 |            0.009 |
+|             2 |        512 |                          12.757 |                         10.947 |         16.535 |           917.383 |        918.339 |           -0.104 |
+|             2 |       1024 |                          13.018 |                         11.018 |         18.147 |          1162.65  |       1114.81  |            4.291 |
+|             4 |        128 |                          12.739 |                         10.959 |         16.243 |           856.335 |        856.483 |           -0.017 |
+|             4 |        256 |                          12.718 |                         10.837 |         17.355 |           957.298 |        957.674 |           -0.039 |
+|             4 |        512 |                          12.813 |                         10.822 |         18.393 |          1158.44  |       1158.45  |           -0.001 |
+|             4 |       1024 |                          13.416 |                         11.06  |         21.301 |          1653.42  |       1557.19  |            6.18  |
+|             8 |        128 |                          12.763 |                         10.891 |         17.193 |          1036.13  |       1036.51  |           -0.036 |
+|             8 |        256 |                          12.89  |                         11.104 |         16.085 |          1236.98  |       1236.87  |            0.01  |
+|             8 |        512 |                          13.327 |                         10.939 |         21.836 |          1642.29  |       1641.78  |            0.031 |
+|             8 |       1024 |                          15.181 |                         11.175 |         35.848 |          2634.98  |       2443.35  |            7.843 |

 ## OPTConfig

--- a/docs/source/en/model_doc/owlv2.md
+++ b/docs/source/en/model_doc/owlv2.md
@@ -106,13 +106,6 @@ Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image proce
    - post_process_object_detection
    - post_process_image_guided_detection

-## Owlv2ImageProcessorFast
-
-[[autodoc]] Owlv2ImageProcessorFast
-    - preprocess
-    - post_process_object_detection
-    - post_process_image_guided_detection
-
 ## Owlv2Processor

 [[autodoc]] Owlv2Processor
--- a/docs/source/en/model_doc/patchtsmixer.md
+++ b/docs/source/en/model_doc/patchtsmixer.md
@@ -38,7 +38,7 @@ This model was contributed by [ajati](https://huggingface.co/ajati), [vijaye12](

 ## Usage example

-The code snippet below shows how to randomly initialize a PatchTSMixer model. The model is compatible with the [Trainer API](../trainer).
+The code snippet below shows how to randomly initialize a PatchTSMixer model. The model is compatible with the [Trainer API](../trainer.md).

 ```python

--- a/docs/source/en/model_doc/qwen2_moe.md
+++ b/docs/source/en/model_doc/qwen2_moe.md
@@ -24,7 +24,7 @@ rendered properly in your Markdown viewer.
 # Qwen2MoE


-[Qwen2MoE](https://huggingface.co/papers/2407.10671) is a Mixture-of-Experts (MoE) variant of [Qwen2](./qwen2), available as a base model and an aligned chat model. It uses SwiGLU activation, group query attention and a mixture of sliding window attention and full attention. The tokenizer can also be adapted to multiple languages and codes.
+[Qwen2MoE]((https://huggingface.co/papers/2407.10671) ) is a Mixture-of-Experts (MoE) variant of [Qwen2](./qwen2), available as a base model and an aligned chat model. It uses SwiGLU activation, group query attention and a mixture of sliding window attention and full attention. The tokenizer can also be adapted to multiple languages and codes.

 The MoE architecture uses upcyled models from the dense language models. For example, Qwen1.5-MoE-A2.7B is upcycled from Qwen-1.8B. It has 14.3B parameters but only 2.7B parameters are activated during runtime.

--- a/docs/source/en/model_doc/segformer.md
+++ b/docs/source/en/model_doc/segformer.md
@@ -128,12 +128,6 @@ If you're interested in submitting a resource to be included here, please feel f
    - preprocess
    - post_process_semantic_segmentation

-## SegformerImageProcessorFast
-
-[[autodoc]] SegformerImageProcessorFast
-    - preprocess
-    - post_process_semantic_segmentation
-
 <frameworkcontent>
 <pt>

@@ -181,4 +175,4 @@ If you're interested in submitting a resource to be included here, please feel f
    - call

 </tf>
-</frameworkcontent>
+</frameworkcontent>
--- a/docs/source/en/model_doc/superglue.md
+++ b/docs/source/en/model_doc/superglue.md
@@ -103,11 +103,38 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
            print(f"Keypoint at {keypoint0.numpy()} matches with keypoint at {keypoint1.numpy()} with score {matching_score}")
    ```

- Visualize the matches between the images using the built-in plotting functionality.
+- The example below demonstrates how to visualize matches between two images.

    ```py
-    # Easy visualization using the built-in plotting method
-    processor.visualize_keypoint_matching(images, processed_outputs)
+    import matplotlib.pyplot as plt
+    import numpy as np
+
+    # Create side by side image
+    merged_image = np.zeros((max(image1.height, image2.height), image1.width + image2.width, 3))
+    merged_image[: image1.height, : image1.width] = np.array(image1) / 255.0
+    merged_image[: image2.height, image1.width :] = np.array(image2) / 255.0
+    plt.imshow(merged_image)
+    plt.axis("off")
+
+    # Retrieve the keypoints and matches
+    output = processed_outputs[0]
+    keypoints0 = output["keypoints0"]
+    keypoints1 = output["keypoints1"]
+    matching_scores = output["matching_scores"]
+
+    # Plot the matches
+    for keypoint0, keypoint1, matching_score in zip(keypoints0, keypoints1, matching_scores):
+        plt.plot(
+            [keypoint0[0], keypoint1[0] + image1.width],
+            [keypoint0[1], keypoint1[1]],
+            color=plt.get_cmap("RdYlGn")(matching_score.item()),
+            alpha=0.9,
+            linewidth=0.5,
+        )
+        plt.scatter(keypoint0[0], keypoint0[1], c="black", s=2)
+        plt.scatter(keypoint1[0] + image1.width, keypoint1[1], c="black", s=2)
+
+    plt.savefig("matched_image.png", dpi=300, bbox_inches='tight')
    ```

 <div class="flex justify-center">
@@ -128,7 +155,6 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size

 - preprocess
 - post_process_keypoint_matching
- visualize_keypoint_matching

 <frameworkcontent>
 <pt>
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@@ -130,11 +130,6 @@ processed_outputs = processor.post_process_keypoint_detection(outputs, [image_si

 [[autodoc]] SuperPointImageProcessor

- preprocess
-
-## SuperPointImageProcessorFast
-
-[[autodoc]] SuperPointImageProcessorFast
 - preprocess
 - post_process_keypoint_detection

--- a/docs/source/en/model_doc/voxtral.md
+++ b/docs/source/en/model_doc/voxtral.md
@@ -37,11 +37,7 @@ Voxtral builds on Ministral-3B by adding audio processing capabilities:

 ## Usage

-### Audio Instruct Mode
-
-The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
-
-➡️ audio + text instruction
+Let's first load the model!
 ```python
 from transformers import VoxtralForConditionalGeneration, AutoProcessor
 import torch
@@ -51,7 +47,14 @@ repo_id = "mistralai/Voxtral-Mini-3B-2507"

 processor = AutoProcessor.from_pretrained(repo_id)
 model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+```

+### Audio Instruct Mode
+
+The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
+
+➡️ audio + text instruction
+```python
 conversation = [
    {
        "role": "user",
@@ -79,15 +82,6 @@ print("=" * 80)

 ➡️ multi-audio + text instruction 
 ```python
-from transformers import VoxtralForConditionalGeneration, AutoProcessor
-import torch
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-repo_id = "mistralai/Voxtral-Mini-3B-2507"
-
-processor = AutoProcessor.from_pretrained(repo_id)
-model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
-
 conversation = [
    {
        "role": "user",
@@ -119,15 +113,6 @@ print("=" * 80)

 ➡️ multi-turn:
 ```python
-from transformers import VoxtralForConditionalGeneration, AutoProcessor
-import torch
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-repo_id = "mistralai/Voxtral-Mini-3B-2507"
-
-processor = AutoProcessor.from_pretrained(repo_id)
-model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
-
 conversation = [
    {
        "role": "user",
@@ -173,15 +158,6 @@ print("=" * 80)

 ➡️ text only:
 ```python
-from transformers import VoxtralForConditionalGeneration, AutoProcessor
-import torch
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-repo_id = "mistralai/Voxtral-Mini-3B-2507"
-
-processor = AutoProcessor.from_pretrained(repo_id)
-model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
-
 conversation = [
    {
        "role": "user",
@@ -208,15 +184,6 @@ print("=" * 80)

 ➡️ audio only:
 ```python
-from transformers import VoxtralForConditionalGeneration, AutoProcessor
-import torch
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-repo_id = "mistralai/Voxtral-Mini-3B-2507"
-
-processor = AutoProcessor.from_pretrained(repo_id)
-model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
-
 conversation = [
    {
        "role": "user",
@@ -243,15 +210,6 @@ print("=" * 80)

 ➡️ batched inference!
 ```python
-from transformers import VoxtralForConditionalGeneration, AutoProcessor
-import torch
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-repo_id = "mistralai/Voxtral-Mini-3B-2507"
-
-processor = AutoProcessor.from_pretrained(repo_id)
-model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
-
 conversations = [
    [
        {
@@ -304,16 +262,7 @@ for decoded_output in decoded_outputs:
 Use the model to transcribe audio (supports English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)!

 ```python
-from transformers import VoxtralForConditionalGeneration, AutoProcessor
-import torch
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-repo_id = "mistralai/Voxtral-Mini-3B-2507"
-
-processor = AutoProcessor.from_pretrained(repo_id)
-model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
-
-inputs = processor.apply_transcription_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3", model_id=repo_id)
+inputs = processor.apply_transcrition_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3")
 inputs = inputs.to(device, dtype=torch.bfloat16)

 outputs = model.generate(**inputs, max_new_tokens=500)
--- a/docs/source/en/model_doc/xlstm.md
+++ b/docs/source/en/model_doc/xlstm.md
@@ -1,47 +0,0 @@
-<!--Copyright 2025 NXAI GmbH. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-
-# xLSTM
-
-## Overview
-
-The xLSTM model was proposed in [xLSTM: Extended Long Short-Term Memory](https://openreview.net/forum?id=ARAxPPIAhq) by Maximilian Beck*, Korbinian Pöppel*, Markus Spanring, Andreas Auer, Oleksandra Prudnikova, Michael Kopp, Günter Klambauer, Johannes Brandstetter and Sepp Hochreiter.
-xLSTM updates the original LSTM architecture to be competitive with Transformer models by introducing exponential gating, matrix memory expansion, and parallelizable training and ingestion.
-
-The [7B model](https://hf.co/NX-AI/xLSTM-7b) variant was trained by the xLSTM team Maximilian Beck, Korbinian Pöppel, Phillip Lippe, Richard Kurle, Patrick Blies, Sebastian Böck and Sepp Hochreiter at NXAI.
-
-The abstract from the paper is the following:
-
-*In the 1990s, the constant error carousel and gating were introduced as the central ideas of the Long Short-Term Memory (LSTM). Since then, LSTMs have stood the test of time and contributed to numerous deep learning success stories, in particular they constituted the first Large Language Models (LLMs). However, the advent of the Transformer technology with parallelizable self-attention at its core marked the dawn of a new era, outpacing LSTMs at scale. We now raise a simple question: How far do we get in language modeling when scaling LSTMs to billions of parameters, leveraging the latest techniques from modern LLMs, but mitigating known limitations of LSTMs? Firstly, we introduce exponential gating with appropriate normalization and stabilization techniques. Secondly, we modify the LSTM memory structure, obtaining: (i) sLSTM with a scalar memory, a scalar update, and new memory mixing, (ii) mLSTM that is fully parallelizable with a matrix memory and a covariance update rule. Integrating these LSTM extensions into residual block backbones yields xLSTM blocks that are then residually stacked into xLSTM architectures. Exponential gating and modified memory structures boost xLSTM capabilities to perform favorably when compared to state-of-the-art Transformers and State Space Models, both in performance and scaling.*
-
-This model was contributed by [NX-AI](https://huggingface.co/NX-AI).
-The original code can be found [here](https://github.com/NX-AI/xlstm).
-
-
-## xLSTMConfig
-
-[[autodoc]] xLSTMConfig
-
-## xLSTMModel
-
-[[autodoc]] xLSTMModel
-    - forward
-
-## xLSTMLMHeadModel
-
-[[autodoc]] xLSTMForCausalLM
-    - forward
--- a/docs/source/en/model_doc/yolos.md
+++ b/docs/source/en/model_doc/yolos.md
@@ -13,95 +13,76 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.

 -->
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>

 # YOLOS

-[YOLOS](https://huggingface.co/papers/2106.00666) uses a [Vision Transformer (ViT)](./vit) for object detection with minimal modifications and region priors. It can achieve performance comparable to specialized object detection models and frameworks with knowledge about 2D spatial structures.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

+## Overview

-You can find all the original YOLOS checkpoints under the [HUST Vision Lab](https://huggingface.co/hustvl/models?search=yolos) organization.
+The YOLOS model was proposed in [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://huggingface.co/papers/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
+YOLOS proposes to just leverage the plain [Vision Transformer (ViT)](vit) for object detection, inspired by DETR. It turns out that a base-sized encoder-only Transformer can also achieve 42 AP on COCO, similar to DETR and much more complex frameworks such as Faster R-CNN.

-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/yolos_architecture.png" alt="drawing" width="600"/>
+The abstract from the paper is the following:
+
+*Can Transformer perform 2D object- and region-level recognition from a pure sequence-to-sequence perspective with minimal knowledge about the 2D spatial structure? To answer this question, we present You Only Look at One Sequence (YOLOS), a series of object detection models based on the vanilla Vision Transformer with the fewest possible modifications, region priors, as well as inductive biases of the target task. We find that YOLOS pre-trained on the mid-sized ImageNet-1k dataset only can already achieve quite competitive performance on the challenging COCO object detection benchmark, e.g., YOLOS-Base directly adopted from BERT-Base architecture can obtain 42.0 box AP on COCO val. We also discuss the impacts as well as limitations of current pre-train schemes and model scaling strategies for Transformer in vision through YOLOS.*
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/yolos_architecture.png"
+alt="drawing" width="600"/>

 <small> YOLOS architecture. Taken from the <a href="https://huggingface.co/papers/2106.00666">original paper</a>.</small>

+This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/hustvl/YOLOS).

-> [!TIP]
-> This model wasa contributed by [nielsr](https://huggingface.co/nielsr).
-> Click on the YOLOS models in the right sidebar for more examples of how to apply YOLOS to different object detection tasks.
+## Using Scaled Dot Product Attention (SDPA)

-The example below demonstrates how to detect objects with [`Pipeline`] or the [`AutoModel`] class.
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function 
+encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the 
+[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
+or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
+page for more information.

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set 
+`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.

-```py
-import torch
-from transformers import pipeline
-
-detector = pipeline(
-    task="object-detection",
-    model="hustvl/yolos-base",
-    torch_dtype=torch.float16,
-    device=0
-)
-detector("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
+```
+from transformers import AutoModelForObjectDetection
+model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-base", attn_implementation="sdpa", torch_dtype=torch.float16)
+...
 ```

-</hfoption>
-<hfoption id="Automodel">
+For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).

-```py
-import torch
-from PIL import Image
-import requests
-from transformers import AutoImageProcessor, AutoModelForObjectDetection
+On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `hustvl/yolos-base` model, we saw the following speedups during inference.

-processor = AutoImageProcessor.from_pretrained("hustvl/yolos-base")
-model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-base", torch_dtype=torch.float16, attn_implementation="sdpa").to("cuda")
-
-url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"
-image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
-inputs = processor(images=image, return_tensors="pt").to("cuda")
-
-with torch.no_grad():
-    outputs = model(**inputs)
-logits = outputs.logits.softmax(-1)
-scores, labels = logits[..., :-1].max(-1)
-boxes = outputs.pred_boxes
-
-threshold = 0.3
-keep = scores[0] > threshold
-
-filtered_scores = scores[0][keep]
-filtered_labels = labels[0][keep]
-filtered_boxes  = boxes[0][keep]
-
-width, height = image.size
-pixel_boxes = filtered_boxes * torch.tensor([width, height, width, height], device=boxes.device)
-
-for score, label, box in zip(filtered_scores, filtered_labels, pixel_boxes):
-    x0, y0, x1, y1 = box.tolist()
-    print(f"Label {model.config.id2label[label.item()]}: {score:.2f} at [{x0:.0f}, {y0:.0f}, {x1:.0f}, {y1:.0f}]")
-```
-
-</hfoption>
-</hfoptions>
-
-
-## Notes
- Use [`YolosImageProcessor`] for preparing images (and optional targets) for the model. Contrary to [DETR](./detr), YOLOS doesn't require a `pixel_mask`.
+|   Batch size |   Average inference time (ms), eager mode |   Average inference time (ms), sdpa model |   Speed up, Sdpa / Eager (x) |
+|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
+|            1 |                                       106 |                                        76 |                      1.39 |
+|            2 |                                       154 |                                        90 |                      1.71 |
+|            4 |                                       222 |                                       116 |                      1.91 |
+|            8 |                                       368 |                                       168 |                      2.19 |

 ## Resources

- Refer to these [notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/YOLOS) for inference and fine-tuning with [`YolosForObjectDetection`] on a custom dataset.
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with YOLOS.
+
+<PipelineTag pipeline="object-detection"/>
+
+- All example notebooks illustrating inference + fine-tuning [`YolosForObjectDetection`] on a custom dataset can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/YOLOS).
+- Scripts for finetuning [`YolosForObjectDetection`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection).
+- See also: [Object detection task guide](../tasks/object_detection)
+
+If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+<Tip>
+
+Use [`YolosImageProcessor`] for preparing images (and optional targets) for the model. Contrary to [DETR](detr), YOLOS doesn't require a `pixel_mask` to be created.
+
+</Tip>

 ## YolosConfig

--- a/docs/source/en/model_doc/zamba.md
+++ b/docs/source/en/model_doc/zamba.md
@@ -69,11 +69,11 @@ print(tokenizer.decode(outputs[0]))
 ## Model card

 The model cards can be found at:
-* [Zamba-7B](https://huggingface.co/Zyphra/Zamba-7B-v1)
+* [Zamba-7B](MODEL_CARD_ZAMBA-7B-v1.md)


 ## Issues
-For issues with model output, or community discussion, please use the Hugging Face community [forum](https://huggingface.co/Zyphra/Zamba-7B-v1/discussions)
+For issues with model output, or community discussion, please use the Hugging Face community [forum](https://huggingface.co/zyphra/zamba-7b)


 ## License
--- a/docs/source/en/model_sharing.md
+++ b/docs/source/en/model_sharing.md
@@ -28,7 +28,7 @@ To share a model to the Hub, you need a Hugging Face [account](https://hf.co/joi
 <hfoption id="huggingface-CLI">

 ```bash
-hf auth login
+huggingface-cli login
 ```

 </hfoption>
--- a/docs/source/en/modular_transformers.md
+++ b/docs/source/en/modular_transformers.md
@@ -94,7 +94,7 @@ ValueError: You defined `RobertaEmbeddings` in the modular_roberta.py, it should

 ## Implementing a modular file

-The easiest way to start is by browsing Transformers for a model similar to yours in order to inherit from it. Some good starting points are [Mistral](./model_doc/mistral), [Qwen2](./model_doc/qwen2), [Cohere](./model_doc/cohere) and [Cohere2](./model_doc/cohere2), and [Llama](./model_doc/llama). Refer to the table below for components your model might be using and where you can inherit from.
+The easiest way to start is by browsing Transformers for a model similar to yours in order to inherit from it. Some good starting points are [Mistral](./model_doc/mistral), [Qwen2](./model_doc/qwen2), [Cohere](./model_doc/cohere) and [Cohere](./model_doc/cohere2), and [Llama](./model_doc/llama). Refer to the table below for components your model might be using and where you can inherit from.

 | Component | Model |
 |---|---|
--- a/docs/source/en/open_webui.md
+++ b/docs/source/en/open_webui.md
@@ -1,22 +0,0 @@
-#  Audio transcriptions with WebUI and `transformers serve`
-
-This guide shows how to do audio transcription for chat purposes, using `transformers serve` and [Open WebUI](https://openwebui.com/). This guide assumes you have Open WebUI installed on your machine and ready to run. Please refer to the examples above to use the text functionalities of `transformer serve` with Open WebUI -- the instructions are the same.
-
-To start, let's launch the server. Some of Open WebUI's requests require [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/CORS), which is disabled by default for security reasons, so you need to enable it:
-
-```shell
-transformers serve --enable-cors
-```
-
-Before you can speak into Open WebUI, you need to update its settings to use your server for speech to text (STT) tasks. Launch Open WebUI, and navigate to the audio tab inside the admin settings. If you're using Open WebUI with the default ports, [this link (default)](http://localhost:3000/admin/settings/audio) or [this link (python deployment)](http://localhost:8080/admin/settings/audio) will take you there. Do the following changes there:
-1. Change the type of "Speech-to-Text Engine" to "OpenAI";
-2. Update the address to your server's address -- `http://localhost:8000/v1` by default;
-3. Type your model of choice into the "STT Model" field, e.g. `openai/whisper-large-v3` ([available models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending)).
-
-If you've done everything correctly, the audio tab should look like this
-
-<h3 align="center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_openwebui_stt_settings.png"/>
-</h3>
-
-You're now ready to speak! Open a new chat, utter a few words after hitting the microphone button, and you should see the corresponding text on the chat input after the model transcribes it.
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@@ -177,16 +177,10 @@ There are three supported implementations available.

 SDPA is used by default for PyTorch v2.1.1. and greater when an implementation is available. You could explicitly enable SDPA by setting `attn_implementation="sdpa"` in [`~PreTrainedModel.from_pretrained`] though. Certain attention parameters, such as `head_mask` and `output_attentions=True`, are unsupported and returns a warning that Transformers will fall back to the (slower) eager implementation.

-Refer to the [AttentionInterface](./attention_interface) guide to learn how to change the attention implementation after loading a model.
-
 ```py
 from transformers import AutoModelForCausalLM

 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto", attn_implementation="sdpa")
-
-# Change the model's attention dynamically after loading it
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto")
-model.set_attention_implementation("sdpa")
 ```

 SDPA selects the most performant implementation available, but you can also explicitly select an implementation with [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager. The example below shows how to enable the FlashAttention2 implementation with `enable_flash=True`.
@@ -240,7 +234,7 @@ FlashAttention2 support is currently limited to Instinct MI210, Instinct MI250 a
 </hfoption>
 </hfoptions>

-Enable FlashAttention2 by setting `attn_implementation="flash_attention_2"` in [`~PreTrainedModel.from_pretrained`] or by setting `model.set_attention_implementation("flash_attention_2")` to dynamically update the [attention interface](./attention_interface). FlashAttention2 is only supported for models with the fp16 or bf16 torch type. Make sure to cast your model to the appropriate data type first.
+Enable FlashAttention2 by setting `attn_implementation="flash_attention_2"` in [`~PreTrainedModel.from_pretrained`]. FlashAttention2 is only supported for models with the fp16 or bf16 torch type. Make sure to cast your model to the appropriate data type first.

 ```py
 from transformers import AutoModelForCausalLM
--- a/docs/source/en/quantization/fp_quant.md
+++ b/docs/source/en/quantization/fp_quant.md
@@ -1,66 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# FP-Quant
-
-[FP-Quant](https://github.com/IST-DASLab/FP-Quant) is a family of quantization algorithms tailored for the Blackwell generation of Nvidia GPUs. The goal is to allow for efficient post-training quantization (PTQ) and quantization-aware trainin (QAT) of LLMs in the [MXFP4 and NVFP4 data-types](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf).
-
-Currently, only PTQ with MXFP4 is supported. Models can either be quantized on the fly with `quantization_config=FPQuantConfig()`:
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer, FPQuantConfig
-import torch
-
-model = AutoModelForCausalLM.from_pretrained(
-    "qwen/Qwen3-8B",
-    quantization_config=FPQuantConfig(),
-    device_map="cuda",
-    torch_dtype=torch.bfloat16,
-)
-```
-
-or pre-processed with GPTQ for better quality (see [FP Format Quantization Harness](https://github.com/IST-DASLab/FP-Quant)).
-
-A **Blackwell-generation GPU is required** to run the kernels. Runtime support for FP-Quant is implemented through the [QuTLASS](https://github.com/IST-DASLab/qutlass) library and a lightweight PyTorch interface lib [`fp_quant`](https://github.com/IST-DASLab/FP-Quant/tree/master/inference_lib). We recommend installing the former **from source** and the latter with  `pip install fp_quant`.
-
-Users **without a Blackwell-generation GPU** , can use the method with `quantization_config=FPQuantConfig(pseudoquant=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.
-
-> [!TIP]
-> Find models pre-quantized with FP-Quant in the official ISTA-DASLab [collection](https://huggingface.co/collections/ISTA-DASLab/fp-quant-6877c186103a21d3a02568ee).
-
-## torch.compile
-
-FP-Quant is fully compatible with [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html).
-
-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, FPQuantConfig
-
-model = AutoModelForCausalLM.from_pretrained(
-    "qwen/Qwen3-8B",
-    quantization_config=FPQuantConfig(),
-    device_map="cuda",
-    torch_dtype=torch.bfloat16,
-)
-
-model.forward = torch.compile(model.forward, mode="max-autotune", fullgraph=True)
-```
-
-## Speedups
-
-FP-Quant currently performs best for very large batch size processing.
-
-See [QuTLASS README](https://github.com/IST-DASLab/qutlass/blob/main/README.md) for speedups.
--- a/docs/source/en/quantization/overview.md
+++ b/docs/source/en/quantization/overview.md
@@ -30,7 +30,6 @@ Use the Space below to help you pick a quantization method depending on your har
 | [bitsandbytes](./bitsandbytes)            | 🟢                   | 🟡 |     🟢     | 🟡 | 🔴                    | 🟡 | 🟢 | 4/8          | 🟢               | 🟢                          | 🟢                      | https://github.com/bitsandbytes-foundation/bitsandbytes |
 | [compressed-tensors](./compressed_tensors) | 🔴                   | 🟢              |     🟢     | 🟢        | 🔴                                 | 🔴              | 🔴              | 1/8          | 🟢               | 🟢                          | 🟢                      | https://github.com/neuralmagic/compressed-tensors |
 | [EETQ](./eetq)                            | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | ?               | 8            | 🟢               | 🟢                          | 🟢                      | https://github.com/NetEase-FuXi/EETQ        |
-| [FP-Quant](./fp_quant)                          | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | 🟢              | 4           | 🔴               | 🟢                          | 🟢                      | https://github.com/IST-DASLab/FP-Quant      |
 | [GGUF / GGML (llama.cpp)](../gguf)        | 🟢                   | 🟢              | 🟢        | 🔴        | 🟢                                 | 🔴              | 🔴              | 1/8          | 🔴               | [See Notes](../gguf)     | [See Notes](../gguf) | https://github.com/ggerganov/llama.cpp      |
 | [GPTQModel](./gptq)                       | 🔴                   | 🟢 | 🟢        | 🟢        | 🟢                                 | 🟢 | 🔴              | 2/3/4/8      | 🟢               | 🟢                          | 🟢                      | https://github.com/ModelCloud/GPTQModel        |
 | [AutoGPTQ](./gptq)                        | 🔴                   | 🔴              | 🟢        | 🟢        | 🔴                                 | 🔴              | 🔴              | 2/3/4/8      | 🟢               | 🟢                          | 🟢                      | https://github.com/AutoGPTQ/AutoGPTQ        |
--- a/docs/source/en/quantization/spqr.md
+++ b/docs/source/en/quantization/spqr.md
@@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

 # SpQR

-The [SpQR](https://hf.co/papers/2306.03078) quantization algorithm involves a 16x16 tiled bi-level group 3-bit quantization structure with sparse outliers.
+The [SpQR]((https://hf.co/papers/2306.03078)) quantization algorithm involves a 16x16 tiled bi-level group 3-bit quantization structure with sparse outliers.

 <div class="flex justify-center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/spqr-diagram.png">
--- a/docs/source/en/quicktour.md
+++ b/docs/source/en/quicktour.md
@@ -49,7 +49,7 @@ notebook_login()
 Make sure the [huggingface_hub[cli]](https://huggingface.co/docs/huggingface_hub/guides/cli#getting-started) package is installed and run the command below. Paste your User Access Token when prompted to log in.

 ```bash
-hf auth login
+huggingface-cli login
 ```

 </hfoption>
--- a/docs/source/en/serialization.md
+++ b/docs/source/en/serialization.md
@@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.

 [ONNX](http://onnx.ai) is an open standard that defines a common set of operators and a file format to represent deep learning models in different frameworks, including PyTorch and TensorFlow. When a model is exported to ONNX, the operators construct a computational graph (or *intermediate representation*) which represents the flow of data through the model. Standardized operators and data types makes it easy to switch between frameworks.

-The [Optimum](https://huggingface.co/docs/optimum/index) library exports a model to ONNX with configuration objects which are supported for [many architectures](https://huggingface.co/docs/optimum/exporters/onnx/overview) and can be easily extended. If a model isn't supported, feel free to make a [contribution](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute) to Optimum.
+The [Optimum](https://huggingface.co/docs/optimum/index) library exports a model to ONNX with configuration objects which are supported for [many architectures]((https://huggingface.co/docs/optimum/exporters/onnx/overview)) and can be easily extended. If a model isn't supported, feel free to make a [contribution](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute) to Optimum.

 The benefits of exporting to ONNX include the following.

--- a/docs/source/en/serving.md
+++ b/docs/source/en/serving.md
@@ -16,18 +16,54 @@ rendered properly in your Markdown viewer.

 # Serving

-Transformer models can be efficiently deployed using libraries such as vLLM, Text Generation Inference (TGI), and others. These libraries are designed for production-grade user-facing services, and can scale to multiple servers and millions of concurrent users. Refer to [Transformers as Backend for Inference Servers](./transformers_as_backends) for usage examples.
+Transformer models can be efficiently deployed using libraries such as vLLM, Text Generation Inference (TGI), and others. These libraries are designed for production-grade user-facing services, and can scale to multiple servers and millions of concurrent users.
+
+You can also serve transformer models easily using the `transformers serve` CLI. This is ideal for experimentation purposes, or to run models locally for personal and private use.
+
+## TGI
+
+[TGI](https://huggingface.co/docs/text-generation-inference/index) can serve models that aren't [natively implemented](https://huggingface.co/docs/text-generation-inference/supported_models) by falling back on the Transformers implementation of the model. Some of TGIs high-performance features aren't available in the Transformers implementation, but other features like continuous batching and streaming are still supported.

 > [!TIP]
-> Responses API is now supported as an experimental API! Read more about it [here](#responses-api).
+> Refer to the [Non-core model serving](https://huggingface.co/docs/text-generation-inference/basic_tutorials/non_core_models) guide for more details.

-Apart from that you can also serve transformer models easily using the `transformers serve` CLI. This is ideal for experimentation purposes, or to run models locally for personal and private use.
+Serve a Transformers implementation the same way you'd serve a TGI model.

-In this document, we dive into the different supported endpoints and modalities; we also cover the setup of several user interfaces that can be used on top of `transformers serve` in the following guides:
- [Jan (text and MCP user interface)](./jan.md)
- [Cursor (IDE)](./cursor.md)
- [Open WebUI (text, image, speech user interface)](./open_webui.md)
- [Tiny-Agents (text and MCP CLI tool)](./tiny_agents.md)
+```docker
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id gpt2
+```
+
+Add `--trust-remote_code` to the command to serve a custom Transformers model.
+
+```docker
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id <CUSTOM_MODEL_ID> --trust-remote-code
+```
+
+## vLLM
+
+[vLLM](https://docs.vllm.ai/en/latest/index.html) can also serve a Transformers implementation of a model if it isn't [natively implemented](https://docs.vllm.ai/en/latest/models/supported_models.html#list-of-text-only-language-models) in vLLM.
+
+Many features like quantization, LoRA adapters, and distributed inference and serving are supported for the Transformers implementation.
+
+> [!TIP]
+> Refer to the [Transformers fallback](https://docs.vllm.ai/en/latest/models/supported_models.html#transformers-fallback) section for more details.
+
+By default, vLLM serves the native implementation and if it doesn't exist, it falls back on the Transformers implementation. But you can also set `--model-impl transformers` to explicitly use the Transformers model implementation.
+
+```shell
+vllm serve Qwen/Qwen2.5-1.5B-Instruct \
+    --task generate \
+    --model-impl transformers
+```
+
+Add the `trust-remote-code` parameter to enable loading a remote code model.
+
+```shell
+vllm serve Qwen/Qwen2.5-1.5B-Instruct \
+    --task generate \
+    --model-impl transformers \
+    --trust-remote-code
+```

 ## Serve CLI

@@ -54,14 +90,7 @@ The simplest way to interact with the server is through our `transformers chat`
 transformers chat localhost:8000 --model-name-or-path Qwen/Qwen3-4B
 ```

-or by sending an HTTP request, like we'll see below.
-
-## Chat Completions - text-based
-
-See below for examples for text-based requests. Both LLMs and VLMs should handle 
-
-<hfoptions id="chat-completion-http">
-<hfoption id="curl">
+or by sending an HTTP request with `cURL`, e.g.

 ```shell
 curl -X POST http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{"messages": [{"role": "system", "content": "hello"}], "temperature": 0.9, "max_tokens": 1000, "stream": true, "model": "Qwen/Qwen2.5-0.5B-Instruct"}'
@@ -77,289 +106,7 @@ data: {"object": "chat.completion.chunk", "id": "req_0", "created": 1751377863,
 (...)
 ```

-</hfoption>
-<hfoption id="python - huggingface_hub">
-
-```python
-import asyncio
-from huggingface_hub import AsyncInferenceClient
-
-messages = [{"role": "user", "content": "What is the Transformers library known for?"}]
-client = AsyncInferenceClient("http://localhost:8000")
-
-async def responses_api_test_async():
-    async for chunk in (await client.chat_completion(messages, model="Qwen/Qwen2.5-0.5B-Instruct", max_tokens=256, stream=True)):
-        token = chunk.choices[0].delta.content
-        if token:
-            print(token, end='')
-
-asyncio.run(responses_api_test_async())
-asyncio.run(client.close())
-```
-
-From which you should get an iterative string printed:
-
-```shell
-The Transformers library is primarily known for its ability to create and manipulate large-scale language models [...]
-```
-
-</hfoption>
-<hfoption id="python - openai">
-
-```python
-from openai import OpenAI
-
-client = OpenAI(base_url="http://localhost:8000/v1", api_key="<random_string>")
-
-completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-0.5B-Instruct",
-    messages=[
-        {
-            "role": "user",
-            "content": "What is the Transformers library known for?"
-        }
-    ],
-    stream=True
-)
-
-for chunk in completion:
-    token = chunk.choices[0].delta.content
-    if token:
-        print(token, end='')
-```
-
-From which you should get an iterative string printed:
-
-```shell
-The Transformers library is primarily known for its ability to create and manipulate large-scale language models [...]
-```
-
-</hfoption>
-</hfoptions>
-
-## Chat Completions - VLMs
-
-The Chat Completion API also supports images; see below for examples for text-and-image-based requests.
-
-<hfoptions id="chat-completion-http-images">
-<hfoption id="curl">
-
-```shell
-curl http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "Qwen/Qwen2.5-VL-7B-Instruct",
-    "stream": true,
-    "messages": [
-      {
-        "role": "user",
-        "content": [
-          {
-            "type": "text",
-            "text": "What is in this image?"
-          },
-          {
-            "type": "image_url",
-            "image_url": {
-              "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-            }
-          }
-        ]
-      }
-    ],
-    "max_tokens": 300
-  }'
-
-```
-
-from which you'll receive multiple chunks in the Completions API format
-
-```shell
-data: {"id":"req_0","choices":[{"delta":{"role":"assistant"},"index":0}],"created":1753366665,"model":"Qwen/Qwen2.5-VL-7B-Instruct@main","object":"chat.completion.chunk","system_fingerprint":""}
-
-data: {"id":"req_0","choices":[{"delta":{"content":"The "},"index":0}],"created":1753366701,"model":"Qwen/Qwen2.5-VL-7B-Instruct@main","object":"chat.completion.chunk","system_fingerprint":""}
-
-data: {"id":"req_0","choices":[{"delta":{"content":"image "},"index":0}],"created":1753366701,"model":"Qwen/Qwen2.5-VL-7B-Instruct@main","object":"chat.completion.chunk","system_fingerprint":""}
-```
-
-</hfoption>
-<hfoption id="python - huggingface_hub">
-
-```python
-import asyncio
-from huggingface_hub import AsyncInferenceClient
-
-messages = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "text", "text": "What's in this image?"},
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg",
-                }
-            },
-        ],
-    }
-]
-client = AsyncInferenceClient("http://localhost:8000")
-
-async def responses_api_test_async():
-    async for chunk in (await client.chat_completion(messages, model="Qwen/Qwen2.5-VL-7B-Instruct", max_tokens=256, stream=True)):
-        token = chunk.choices[0].delta.content
-        if token:
-            print(token, end='')
-
-asyncio.run(responses_api_test_async())
-asyncio.run(client.close())
-```
-
-From which you should get an iterative string printed:
-
-```xmp
-The image depicts an astronaut in a space suit standing on what appears to be the surface of the moon, given the barren, rocky landscape and the dark sky in the background. The astronaut is holding a large egg that has cracked open, revealing a small creature inside. The scene is imaginative and playful, combining elements of space exploration with a whimsical twist involving the egg and the creature.
-```
-
-</hfoption>
-<hfoption id="python - openai">
-
-```python
-from openai import OpenAI
-
-client = OpenAI(base_url="http://localhost:8000/v1", api_key="<random_string>")
-
-completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-VL-7B-Instruct",
-    messages=[
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": "What's in this image?"},
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg",
-                    }
-                },
-            ],
-        }
-    ],
-    stream=True
-)
-
-for chunk in completion:
-    token = chunk.choices[0].delta.content
-    if token:
-        print(token, end='')
-```
-
-From which you should get an iterative string printed:
-
-```xmp
-The image depicts an astronaut in a space suit standing on what appears to be the surface of the moon, given the barren, rocky landscape and the dark sky in the background. The astronaut is holding a large egg that has cracked open, revealing a small creature inside. The scene is imaginative and playful, combining elements of space exploration with a whimsical twist involving the egg and the creature.
-```
-
-</hfoption>
-</hfoptions>
-
-## Responses API
-
-The Responses API is the newest addition to the supported APIs of `transformers serve`.
-
-> [!TIP]
-> This API is still experimental: expect bug patches and additition of new features in the coming weeks.
-> If you run into any issues, please let us know and we'll work on fixing them ASAP.
-
-Instead of the previous `/v1/chat/completions` path, the Responses API lies behind the `/v1/responses` path.
-See below for examples interacting with our Responses endpoint with `curl`, as well as the Python OpenAI client.
-
-So far, this endpoint only supports text and therefore only LLMs. VLMs to come!
-
-<hfoptions id="responses">
-<hfoption id="curl">
-
-```shell
-curl http://localhost:8000/v1/responses \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "Qwen/Qwen2.5-0.5B-Instruct",
-    "stream": true,
-    "input": "Tell me a three sentence bedtime story about a unicorn."
-  }'
-```
-
-from which you'll receive multiple chunks in the Responses API format
-
-```shell
-data: {"response":{"id":"resp_req_0","created_at":1754059817.783648,"model":"Qwen/Qwen2.5-0.5B-Instruct@main","object":"response","output":[],"parallel_tool_calls":false,"tool_choice":"auto","tools":[],"status":"queued","text":{"format":{"type":"text"}}},"sequence_number":0,"type":"response.created"}
-
-data: {"response":{"id":"resp_req_0","created_at":1754059817.783648,"model":"Qwen/Qwen2.5-0.5B-Instruct@main","object":"response","output":[],"parallel_tool_calls":false,"tool_choice":"auto","tools":[],"status":"in_progress","text":{"format":{"type":"text"}}},"sequence_number":1,"type":"response.in_progress"}
-
-data: {"item":{"id":"msg_req_0","content":[],"role":"assistant","status":"in_progress","type":"message"},"output_index":0,"sequence_number":2,"type":"response.output_item.added"}
-
-data: {"content_index":0,"item_id":"msg_req_0","output_index":0,"part":{"annotations":[],"text":"","type":"output_text"},"sequence_number":3,"type":"response.content_part.added"}
-
-data: {"content_index":0,"delta":"","item_id":"msg_req_0","output_index":0,"sequence_number":4,"type":"response.output_text.delta"}
-
-data: {"content_index":0,"delta":"Once ","item_id":"msg_req_0","output_index":0,"sequence_number":5,"type":"response.output_text.delta"}
-
-data: {"content_index":0,"delta":"upon ","item_id":"msg_req_0","output_index":0,"sequence_number":6,"type":"response.output_text.delta"}
-
-data: {"content_index":0,"delta":"a ","item_id":"msg_req_0","output_index":0,"sequence_number":7,"type":"response.output_text.delta"}
-```
-
-</hfoption>
-<hfoption id="python - openai">
-
-```python
-from openai import OpenAI
-
-client = OpenAI(base_url="http://localhost:8000/v1", api_key="<KEY>")
-
-response = client.responses.create(
-    model="Qwen/Qwen2.5-0.5B-Instruct",
-    instructions="You are a helpful assistant.",
-    input="Hello!",
-    stream=True,
-    metadata={"foo": "bar"},
-)
-
-for event in response:
-    print(event)
-```
-
-From which you should get events printed out successively.
-
-```shell
-ResponseCreatedEvent(response=Response(id='resp_req_0', created_at=1754060400.3718212, error=None, incomplete_details=None, instructions='You are a helpful assistant.', metadata={'foo': 'bar'}, model='Qwen/Qwen2.5-0.5B-Instruct@main', object='response', output=[], parallel_tool_calls=False, temperature=None, tool_choice='auto', tools=[], top_p=None, background=None, max_output_tokens=None, max_tool_calls=None, previous_response_id=None, prompt=None, reasoning=None, service_tier=None, status='queued', text=ResponseTextConfig(format=ResponseFormatText(type='text')), top_logprobs=None, truncation=None, usage=None, user=None), sequence_number=0, type='response.created')
-ResponseInProgressEvent(response=Response(id='resp_req_0', created_at=1754060400.3718212, error=None, incomplete_details=None, instructions='You are a helpful assistant.', metadata={'foo': 'bar'}, model='Qwen/Qwen2.5-0.5B-Instruct@main', object='response', output=[], parallel_tool_calls=False, temperature=None, tool_choice='auto', tools=[], top_p=None, background=None, max_output_tokens=None, max_tool_calls=None, previous_response_id=None, prompt=None, reasoning=None, service_tier=None, status='in_progress', text=ResponseTextConfig(format=ResponseFormatText(type='text')), top_logprobs=None, truncation=None, usage=None, user=None), sequence_number=1, type='response.in_progress')
-ResponseOutputItemAddedEvent(item=ResponseOutputMessage(id='msg_req_0', content=[], role='assistant', status='in_progress', type='message'), output_index=0, sequence_number=2, type='response.output_item.added')
-ResponseContentPartAddedEvent(content_index=0, item_id='msg_req_0', output_index=0, part=ResponseOutputText(annotations=[], text='', type='output_text', logprobs=None), sequence_number=3, type='response.content_part.added')
-ResponseTextDeltaEvent(content_index=0, delta='', item_id='msg_req_0', output_index=0, sequence_number=4, type='response.output_text.delta')
-ResponseTextDeltaEvent(content_index=0, delta='', item_id='msg_req_0', output_index=0, sequence_number=5, type='response.output_text.delta')
-ResponseTextDeltaEvent(content_index=0, delta='Hello! ', item_id='msg_req_0', output_index=0, sequence_number=6, type='response.output_text.delta')
-ResponseTextDeltaEvent(content_index=0, delta='How ', item_id='msg_req_0', output_index=0, sequence_number=7, type='response.output_text.delta')
-ResponseTextDeltaEvent(content_index=0, delta='can ', item_id='msg_req_0', output_index=0, sequence_number=8, type='response.output_text.delta')
-ResponseTextDeltaEvent(content_index=0, delta='I ', item_id='msg_req_0', output_index=0, sequence_number=9, type='response.output_text.delta')
-ResponseTextDeltaEvent(content_index=0, delta='assist ', item_id='msg_req_0', output_index=0, sequence_number=10, type='response.output_text.delta')
-ResponseTextDeltaEvent(content_index=0, delta='you ', item_id='msg_req_0', output_index=0, sequence_number=11, type='response.output_text.delta')
-ResponseTextDeltaEvent(content_index=0, delta='', item_id='msg_req_0', output_index=0, sequence_number=12, type='response.output_text.delta')
-ResponseTextDeltaEvent(content_index=0, delta='', item_id='msg_req_0', output_index=0, sequence_number=13, type='response.output_text.delta')
-ResponseTextDeltaEvent(content_index=0, delta='today?', item_id='msg_req_0', output_index=0, sequence_number=14, type='response.output_text.delta')
-ResponseTextDoneEvent(content_index=0, item_id='msg_req_0', output_index=0, sequence_number=15, text='Hello! How can I assist you today?', type='response.output_text.done')
-ResponseContentPartDoneEvent(content_index=0, item_id='msg_req_0', output_index=0, part=ResponseOutputText(annotations=[], text='Hello! How can I assist you today?', type='output_text', logprobs=None), sequence_number=16, type='response.content_part.done')
-ResponseOutputItemDoneEvent(item=ResponseOutputMessage(id='msg_req_0', content=[ResponseOutputText(annotations=[], text='Hello! How can I assist you today?', type='output_text', logprobs=None)], role='assistant', status='completed', type='message', annotations=[]), output_index=0, sequence_number=17, type='response.output_item.done')
-ResponseCompletedEvent(response=Response(id='resp_req_0', created_at=1754060400.3718212, error=None, incomplete_details=None, instructions='You are a helpful assistant.', metadata={'foo': 'bar'}, model='Qwen/Qwen2.5-0.5B-Instruct@main', object='response', output=[ResponseOutputMessage(id='msg_req_0', content=[ResponseOutputText(annotations=[], text='Hello! How can I assist you today?', type='output_text', logprobs=None)], role='assistant', status='completed', type='message', annotations=[])], parallel_tool_calls=False, temperature=None, tool_choice='auto', tools=[], top_p=None, background=None, max_output_tokens=None, max_tool_calls=None, previous_response_id=None, prompt=None, reasoning=None, service_tier=None, status='completed', text=ResponseTextConfig(format=ResponseFormatText(type='text')), top_logprobs=None, truncation=None, usage=None, user=None), sequence_number=18, type='response.completed')
-```
-
-</hfoption>
-</hfoptions>
-
-
-## MCP integration
-
-The `transformers serve` server is also an MCP client, so it can interact with MCP tools in agentic use cases. This, of course, requires the use of an LLM that is designed to use tools.
+The server is also an MCP client, so it can interact with MCP tools in agentic use cases. This, of course, requires the use of an LLM that is designed to use tools.

 > [!TIP]
 > At the moment, MCP tool usage in `transformers` is limited to the `qwen` family of models.
@@ -367,5 +114,142 @@ The `transformers serve` server is also an MCP client, so it can interact with M
 <!-- TODO: example with a minimal python example, and explain that it is possible to pass a full generation config in the request -->


+### Usage example 1: chat with local requests (feat. Jan)
+
+This example shows how to use `transformers serve` as a local LLM provider for the [Jan](https://jan.ai/) app. Jan is a ChatGPT-alternative graphical interface, fully running on your machine. The requests to `transformers serve` come directly from the local app -- while this section focuses on Jan, you can extrapolate some instructions to other apps that make local requests.
+
+To connect `transformers serve` with Jan, you'll need to set up a new model provider ("Settings" > "Model Providers"). Click on "Add Provider", and set a new name. In your new model provider page, all you need to set is the "Base URL" to the following pattern:
+
+```shell
+http://[host]:[port]/v1
+```
+
+where `host` and `port` are the `transformers serve` CLI parameters (`localhost:8000` by default). After setting this up, you should be able to see some models in the "Models" section, hitting "Refresh". Make sure you add some text in the "API key" text field too -- this data is not actually used, but the field can't be empty. Your custom model provider page should look like this:
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_jan_model_providers.png"/>
+</h3>
+
+You are now ready to chat!
+
+> [!TIP]
+> You can add any `transformers`-compatible model to Jan through `transformers serve`. In the custom model provider you created, click on the "+" button in the "Models" section and add its Hub repository name, e.g. `Qwen/Qwen3-4B`.
+
+To conclude this example, let's look into a more advanced use-case. If you have a beefy machine to serve models with, but prefer using Jan on a different device, you need to add port forwarding. If you have `ssh` access from your Jan machine into your server, this can be accomplished by typing the following to your Jan machine's terminal
+
+```
+ssh -N -f -L 8000:localhost:8000 your_server_account@your_server_IP -p port_to_ssh_into_your_server
+```
+
+Port forwarding is not Jan-specific: you can use it to connect `transformers serve` running in a different machine with an app of your choice.


+### Usage example 2: chat with external requests (feat. Cursor)
+
+This example shows how to use `transformers serve` as a local LLM provider for [Cursor](https://cursor.com/), the popular IDE. Unlike in the previous example, requests to `transformers serve` will come from an external IP (Cursor's server IPs), which requires some additional setup. Furthermore, some of Cursor's requests require [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/CORS), which is disabled by default for security reasons.
+
+To launch a server with CORS enabled, run
+
+```shell
+transformers serve --enable-cors
+```
+
+You'll also need to expose your server to external IPs. A potential solution is to use [`ngrok`](https://ngrok.com/), which has a permissive free tier. After setting up your `ngrok` account and authenticating on your server machine, you run
+
+```shell
+ngrok http [port]
+```
+
+where `port` is the port used by `transformers serve` (`8000` by default). On the terminal where you launched `ngrok`, you'll see an https address in the "Forwarding" row, as in the image below. This is the address to send requests to.
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_ngrok.png"/>
+</h3>
+
+You're now ready to set things up on the app side! In Cursor, while you can't set a new provider, you can change the endpoint for OpenAI requests in the model selection settings. First, navigate to "Settings" > "Cursor Settings", "Models" tab, and expand the "API Keys" collapsible. To set your `transformers serve` endpoint, follow this order:
+1. Unselect ALL models in the list above (e.g. `gpt4`, ...);
+2. Add and select the model you want to use (e.g. `Qwen/Qwen3-4B`)
+3. Add some random text to OpenAI API Key. This field won't be used, but it can’t be empty;
+4. Add the https address from `ngrok` to the "Override OpenAI Base URL" field, appending `/v1` to the address (i.e. `https://(...).ngrok-free.app/v1`);
+5. Hit "Verify".
+
+After you follow these steps, your "Models" tab should look like the image below. Your server should also have received a few requests from the verification step.
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor.png"/>
+</h3>
+
+You are now ready to use your local model in Cursor! For instance, if you toggle the AI Pane, you can select the model you added and ask it questions about your local files.
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor_chat.png"/>
+</h3>
+
+
+### Usage example 3: `tiny-agents` CLI and MCP Tools
+
+To showcase the use of MCP tools, let's see how to integrate the `transformers serve` server with the [`tiny-agents`](https://huggingface.co/blog/python-tiny-agents) CLI.
+
+> [!TIP]
+> Many Hugging Face Spaces can be used as MCP servers, as in this example. You can find all compatible Spaces [here](https://huggingface.co/spaces?filter=mcp-server).
+
+The first step to use MCP tools is to let the model know which tools are available. As an example, let's consider a `tiny-agents` configuration file with a reference to an [image generation MCP server](https://evalstate-flux1-schnell.hf.space/).
+
+```json
+{
+    "model": "Menlo/Jan-nano",
+    "endpointUrl": "http://localhost:8000",
+    "servers": [
+        {
+            "type": "sse",
+            "url": "https://evalstate-flux1-schnell.hf.space/gradio_api/mcp/sse"
+        }
+    ]
+}
+```
+
+You can then launch your `tiny-agents` chat interface with the following command.
+
+```bash
+tiny-agents run path/to/your/config.json
+```
+
+If you have `transformers serve` running in the background, you're ready to use MCP tools from a local model! For instance, here's the example of a chat session with `tiny-agents`:
+
+```bash
+Agent loaded with 1 tools:
+ • flux1_schnell_infer
+»  Generate an image of a cat on the moon
+<Tool req_0_tool_call>flux1_schnell_infer {"prompt": "a cat on the moon", "seed": 42, "randomize_seed": true, "width": 1024, "height": 1024, "num_inference_steps": 4}
+
+Tool req_0_tool_call
+[Binary Content: Image image/webp, 57732 bytes]
+The task is complete and the content accessible to the User
+Image URL: https://evalstate-flux1-schnell.hf.space/gradio_api/file=/tmp/gradio/3dbddc0e53b5a865ed56a4e3dbdd30f3f61cf3b8aabf1b456f43e5241bd968b8/image.webp
+380576952
+
+I have generated an image of a cat on the moon using the Flux 1 Schnell Image Generator. The image is 1024x1024 pixels and was created with 4 inference steps. Let me know if you would like to make any changes or need further assistance!
+```
+
+### Usage example 4: speech to text transcription (feat. Open WebUI)
+
+This guide shows how to do audio transcription for chat purposes, using `transformers serve` and [Open WebUI](https://openwebui.com/). This guide assumes you have Open WebUI installed on your machine and ready to run. Please refer to the examples above to use the text functionalities of `transformer serve` with Open WebUI -- the instructions are the same.
+
+To start, let's launch the server. Some of Open WebUI's requests require [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/CORS), which is disabled by default for security reasons, so you need to enable it:
+
+```shell
+transformers serve --enable-cors
+```
+
+Before you can speak into Open WebUI, you need to update its settings to use your server for speech to text (STT) tasks. Launch Open WebUI, and navigate to the audio tab inside the admin settings. If you're using Open WebUI with the default ports, [this link (default)](http://localhost:3000/admin/settings/audio) or [this link (python deployment)](http://localhost:8080/admin/settings/audio) will take you there. Do the following changes there:
+1. Change the type of "Speech-to-Text Engine" to "OpenAI";
+2. Update the address to your server's address -- `http://localhost:8000/v1` by default;
+3. Type your model of choice into the "STT Model" field, e.g. `openai/whisper-large-v3` ([available models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending)).
+
+If you've done everything correctly, the audio tab should look like this
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_openwebui_stt_settings.png"/>
+</h3>
+
+You're now ready to speak! Open a new chat, utter a few words after hitting the microphone button, and you should see the corresponding text on the chat input after the model transcribes it.
--- a/docs/source/en/tasks/keypoint_detection.md
+++ b/docs/source/en/tasks/keypoint_detection.md
@@ -25,7 +25,7 @@ Keypoint detection identifies and locates specific points of interest within an

 In this guide, we will show how to extract keypoints from images.

-For this tutorial, we will use [SuperPoint](./model_doc/superpoint), a foundation model for keypoint detection.
+For this tutorial, we will use [SuperPoint](./model_doc/superpoint.md), a foundation model for keypoint detection.

 ```python
 from transformers import AutoImageProcessor, SuperPointForKeypointDetection
--- a/docs/source/en/tasks/object_detection.md
+++ b/docs/source/en/tasks/object_detection.md
@@ -146,7 +146,7 @@ To get an even better understanding of the data, visualize an example in the dat
 >>> annotations = cppe5["train"][2]["objects"]
 >>> draw = ImageDraw.Draw(image)

->>> categories = cppe5["train"].features["objects"]["category"].feature.names
+>>> categories = cppe5["train"].features["objects"].feature["category"].names

 >>> id2label = {index: x for index, x in enumerate(categories, start=0)}
 >>> label2id = {v: k for k, v in id2label.items()}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
ssum21	d88745c6d7	fix: manual edits Some checks failed Secret Leaks / trufflehog (push) Has been cancelled Details	2025-07-20 19:06:32 -07:00
ssum21	3fb5d6ceda	feat: nmt draft	2025-07-20 19:01:42 -07:00