From 55b7a0404ea6bfc5f9197b1042a57304af05eb92 Mon Sep 17 00:00:00 2001
From: John B Nelson <jbn@abreka.com>
Date: Fri, 27 Sep 2024 01:33:55 -0700
Subject: [PATCH] Make siglip examples clearer and error free (#33667)

Update siglip.md

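The SigLIP examples were already partially fixed relative to the deployed docs, but the partial fix left them inconsistent: the prompts were assigned to `candidate_labels` while the following lines referenced `texts`. This change stores the prompts in `texts` throughout. Additionally, printing the full prompt text ("This is a photo of...") is likely not the desired output, so the example now prints the bare candidate label instead.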
---
 docs/source/en/model_doc/siglip.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/source/en/model_doc/siglip.md b/docs/source/en/model_doc/siglip.md
index 4f46174fb1..88e38cbb59 100644
--- a/docs/source/en/model_doc/siglip.md
+++ b/docs/source/en/model_doc/siglip.md
@@ -85,7 +85,7 @@ If you want to do the pre- and postprocessing yourself, here's how to do that:
 
 >>> candidate_labels = ["2 cats", "2 dogs"]
 # follows the pipeline prompt template to get same results
->>> candidate_labels = [f'This is a photo of {label}.' for label in candidate_labels]
+>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
 >>> # important: we pass `padding=max_length` since the model was trained with this
 >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
 
@@ -94,7 +94,7 @@ If you want to do the pre- and postprocessing yourself, here's how to do that:
 
 >>> logits_per_image = outputs.logits_per_image
 >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
->>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
+>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
 31.9% that image 0 is 'a photo of 2 cats'
 ```
 
@@ -140,9 +140,9 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
 
 >>> candidate_labels = ["2 cats", "2 dogs"]
 # follows the pipeline prompt template to get same results
->>> candidate_labels = [f'This is a photo of {label}.' for label in candidate_labels]
+>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
 # important: we pass `padding=max_length` since the model was trained with this
->>> inputs = processor(text=candidate_labels, images=image, padding="max_length", return_tensors="pt")
+>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
 >>> inputs.to(device)
 
 >>> with torch.no_grad():
@@ -240,4 +240,4 @@ Below is an expected speedup diagram that compares inference time between the na
 ## SiglipForImageClassification
 
 [[autodoc]] SiglipForImageClassification
-    - forward
\ No newline at end of file
+    - forward