[doctest] Fixes (#35863)

doctest fixes
2025-01-26 15:26:38 -08:00
parent fc269f77da
commit f11f57c925
15 changed files with 76 additions and 76 deletions
--- a/docs/source/en/model_doc/glm.md
+++ b/docs/source/en/model_doc/glm.md
@@ -56,7 +56,7 @@ In the following, we demonstrate how to use `glm-4-9b-chat` for the inference. N
 >>> from transformers import AutoModelForCausalLM, AutoTokenizer
 >>> device = "cuda" # the device to load the model onto

->>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto")
+>>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto", trust_remote_code=True)
 >>> tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat")

 >>> prompt = "Give me a short introduction to large language model."
--- a/docs/source/en/model_doc/grounding-dino.md
+++ b/docs/source/en/model_doc/grounding-dino.md
@@ -64,18 +64,19 @@ Here's how to use the model for zero-shot object detection:

 >>> results = processor.post_process_grounded_object_detection(
 ...     outputs,
-...     threshold=0.4,
+...     inputs.input_ids,
+...     box_threshold=0.4,
 ...     text_threshold=0.3,
-...     target_sizes=[(image.height, image.width)]
+...     target_sizes=[image.size[::-1]]
 ... )
->>> # Retrieve the first image result
+
+# Retrieve the first image result
 >>> result = results[0]
->>> for box, score, text_label in zip(result["boxes"], result["scores"], result["text_labels"]):
+>>> for box, score, labels in zip(result["boxes"], result["scores"], result["labels"]):
 ...     box = [round(x, 2) for x in box.tolist()]
-...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
-Detected a cat with confidence 0.479 at location [344.7, 23.11, 637.18, 374.28]
-Detected a cat with confidence 0.438 at location [12.27, 51.91, 316.86, 472.44]
-Detected a remote control with confidence 0.478 at location [38.57, 70.0, 176.78, 118.18]
+...     print(f"Detected {labels} with confidence {round(score.item(), 3)} at location {box}")
+Detected a cat with confidence 0.468 at location [344.78, 22.9, 637.3, 373.62]
+Detected a cat with confidence 0.426 at location [11.74, 51.55, 316.51, 473.22]
 ```

 ## Grounded SAM
--- a/docs/source/en/model_doc/llava_onevision.md
+++ b/docs/source/en/model_doc/llava_onevision.md
@@ -81,7 +81,7 @@ text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=

 # Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images
 print(text_prompt)
->>> "<|im_start|>user\n<image>What is shown in this image?<|im_end|>\n<|im_start|>assistant\nPage showing the list of options.<|im_end|>"
+'<|im_start|>user\n<image>What is shown in this image?<|im_end|>\n<|im_start|>assistant\nPage showing the list of options.<|im_end|>'
 ```

 This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay).
--- a/docs/source/en/model_doc/moshi.md
+++ b/docs/source/en/model_doc/moshi.md
@@ -110,9 +110,14 @@ To follow the example of the following image, `"Hello, I'm Moshi"` could be tran
 >>> from datasets import load_dataset, Audio
 >>> import torch, math
 >>> from transformers import MoshiForConditionalGeneration, AutoFeatureExtractor, AutoTokenizer
->>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")


+>>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/moshiko-pytorch-bf16")
+>>> tokenizer = AutoTokenizer.from_pretrained("kyutai/moshiko-pytorch-bf16")
+>>> device = "cuda"
+>>> dtype = torch.bfloat16
+
 >>> # prepare user input audio 
 >>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
 >>> audio_sample = librispeech_dummy[-1]["audio"]["array"]
--- a/docs/source/en/model_doc/phi3.md
+++ b/docs/source/en/model_doc/phi3.md
@@ -57,10 +57,7 @@ Phi-3 has been integrated in the development version (4.40.0.dev) of `transforme
 >>> outputs = model.generate(inputs, max_new_tokens=32)
 >>> text = tokenizer.batch_decode(outputs)[0]
 >>> print(text)
-<s><|user|> 
-Can you provide ways to eat combinations of bananas and dragonfruits?<|end|> 
-<|assistant|> 
-Certainly! Bananas and dragonfruits can be combined in various delicious ways. Here are some ideas for eating combinations of bananas and
+<|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits can be combined in various delicious ways. Here are some creative ideas for incorporating both fruits
 ```

 ## Phi3Config
--- a/docs/source/en/model_doc/seamless_m4t.md
+++ b/docs/source/en/model_doc/seamless_m4t.md
@@ -52,7 +52,7 @@ Here is how to use the processor to process text and audio:
 ```python
 >>> # let's load an audio sample from an Arabic speech corpus
 >>> from datasets import load_dataset
->>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True)
+>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
 >>> audio_sample = next(iter(dataset))["audio"]

 >>> # now, process it
--- a/docs/source/en/model_doc/seamless_m4t_v2.md
+++ b/docs/source/en/model_doc/seamless_m4t_v2.md
@@ -52,7 +52,7 @@ Here is how to use the processor to process text and audio:
 ```python
 >>> # let's load an audio sample from an Arabic speech corpus
 >>> from datasets import load_dataset
->>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True)
+>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
 >>> audio_sample = next(iter(dataset))["audio"]

 >>> # now, process it
--- a/docs/source/en/model_doc/siglip.md
+++ b/docs/source/en/model_doc/siglip.md
@@ -86,7 +86,7 @@ If you want to do the pre- and postprocessing yourself, here's how to do that:
 >>> candidate_labels = ["2 cats", "2 dogs"]
 # follows the pipeline prompt template to get the same results
 >>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
->>> # important: we pass `padding=max_length` since the model was trained with this
+# important: we pass `padding=max_length` since the model was trained with this
 >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

 >>> with torch.no_grad():
@@ -95,7 +95,7 @@ If you want to do the pre- and postprocessing yourself, here's how to do that:
 >>> logits_per_image = outputs.logits_per_image
 >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
 >>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
-31.9% that image 0 is 'a photo of 2 cats'
+19.8% that image 0 is '2 cats'
 ```

 ## Resources
@@ -142,8 +142,7 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
 # follows the pipeline prompt template to get the same results
 >>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
 # important: we pass `padding=max_length` since the model was trained with this
->>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
->>> inputs.to(device)
+>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to(device)

 >>> with torch.no_grad():
 ...     with torch.autocast(device):
@@ -152,7 +151,7 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
 >>> logits_per_image = outputs.logits_per_image
 >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
 >>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
-51.3% that image 0 is 'This is a photo of 2 cats.'
+19.8% that image 0 is '2 cats'
 ```


--- a/docs/source/en/model_doc/zoedepth.md
+++ b/docs/source/en/model_doc/zoedepth.md
@@ -70,7 +70,7 @@ Alternatively, one can also perform inference using the classes:
 >>> inputs = image_processor(images=image, return_tensors="pt")

 >>> with torch.no_grad():   
-...     outputs = model(pixel_values)
+...     outputs = model(**inputs)

 >>> # interpolate to original size and visualize the prediction
 >>> ## ZoeDepth dynamically pads the input image. Thus we pass the original image size as argument