From 55b7a0404ea6bfc5f9197b1042a57304af05eb92 Mon Sep 17 00:00:00 2001 From: John B Nelson Date: Fri, 27 Sep 2024 01:33:55 -0700 Subject: [PATCH] Make siglip examples clearer and error free (#33667) Update siglip.md This was already partially fixed relative to the deployed docs. But the partial fix made it inconsistent. Additionally, giving the full text ("This is a photo of...") is likely not the desired output. --- docs/source/en/model_doc/siglip.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/en/model_doc/siglip.md b/docs/source/en/model_doc/siglip.md index 4f46174fb1..88e38cbb59 100644 --- a/docs/source/en/model_doc/siglip.md +++ b/docs/source/en/model_doc/siglip.md @@ -85,7 +85,7 @@ If you want to do the pre- and postprocessing yourself, here's how to do that: >>> candidate_labels = ["2 cats", "2 dogs"] # follows the pipeline prompt template to get same results ->>> candidate_labels = [f'This is a photo of {label}.' for label in candidate_labels] +>>> texts = [f'This is a photo of {label}.' for label in candidate_labels] >>> # important: we pass `padding=max_length` since the model was trained with this >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt") @@ -94,7 +94,7 @@ If you want to do the pre- and postprocessing yourself, here's how to do that: >>> logits_per_image = outputs.logits_per_image >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities ->>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'") +>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'") 31.9% that image 0 is 'a photo of 2 cats' ``` @@ -140,9 +140,9 @@ To load and run a model using Flash Attention 2, refer to the snippet below: >>> candidate_labels = ["2 cats", "2 dogs"] # follows the pipeline prompt template to get same results ->>> candidate_labels = [f'This is a photo of {label}.' for label in candidate_labels] +>>> texts = [f'This is a photo of {label}.' for label in candidate_labels] # important: we pass `padding=max_length` since the model was trained with this ->>> inputs = processor(text=candidate_labels, images=image, padding="max_length", return_tensors="pt") +>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt") >>> inputs.to(device) >>> with torch.no_grad(): @@ -240,4 +240,4 @@ Below is an expected speedup diagram that compares inference time between the na ## SiglipForImageClassification [[autodoc]] SiglipForImageClassification - - forward \ No newline at end of file + - forward