From 31299670cda29f25fbc655f6f166e7b8cc21c89f Mon Sep 17 00:00:00 2001
From: Henry Hyeonmok Ko <52618631+henryhmko@users.noreply.github.com>
Date: Mon, 2 Dec 2024 07:26:34 -0800
Subject: [PATCH] Multiple typo fixes in Tutorials docs (#35035)

* Fixed typo in multi gpu docs and OLMoE version

* Fixed typos in docs for agents, agents advanced, knowledge distillation, and image feature extraction

* Fixed incorrect usage of model.image_guided_detection in zero shot object detection docs
---
 docs/source/en/agents.md                                      | 2 +-
 docs/source/en/agents_advanced.md                             | 2 +-
 docs/source/en/perf_train_gpu_many.md                         | 2 +-
 docs/source/en/tasks/image_feature_extraction.md              | 2 +-
 .../tasks/knowledge_distillation_for_image_classification.md  | 2 +-
 docs/source/en/tasks/zero_shot_object_detection.md            | 2 +-
 src/transformers/models/olmoe/configuration_olmoe.py          | 2 +-
 src/transformers/models/olmoe/modeling_olmoe.py               | 4 ++--
 8 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md
index 721e348f89..56c9184980 100644
--- a/docs/source/en/agents.md
+++ b/docs/source/en/agents.md
@@ -225,7 +225,7 @@ You have access to the following tools:
 To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
 
 At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use.
-Then in the 'Code:' sequence, you shold write the code in simple Python. The code sequence must end with '/End code' sequence.
+Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '/End code' sequence.
 During each intermediate step, you can use 'print()' to save whatever important information you will then need.
 These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step.
 
diff --git a/docs/source/en/agents_advanced.md b/docs/source/en/agents_advanced.md
index e80e402d73..c4753bf136 100644
--- a/docs/source/en/agents_advanced.md
+++ b/docs/source/en/agents_advanced.md
@@ -211,7 +211,7 @@ agent.run("How many more blocks (also denoted as layers) are in BERT base encode
 
 ## Display your agent run in a cool Gradio interface
 
-You can leverage `gradio.Chatbot`to display your agent's thoughts using `stream_to_gradio`, here is an example:
+You can leverage `gradio.Chatbot` to display your agent's thoughts using `stream_to_gradio`, here is an example:
 
 ```py
 import gradio as gr
diff --git a/docs/source/en/perf_train_gpu_many.md b/docs/source/en/perf_train_gpu_many.md
index 858da99e7b..c810a18470 100644
--- a/docs/source/en/perf_train_gpu_many.md
+++ b/docs/source/en/perf_train_gpu_many.md
@@ -553,7 +553,7 @@ It performs a sort of 4D Parallelism over Sample-Operator-Attribute-Parameter.
 Examples:
 * Sample
 
-Let's take 10 batches of sequence length 512. If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes be 5 x 2 x 512.
+Let's take 10 batches of sequence length 512. If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes 5 x 2 x 512.
 
 * Operator
 
diff --git a/docs/source/en/tasks/image_feature_extraction.md b/docs/source/en/tasks/image_feature_extraction.md
index 80b701588b..e55a9e2379 100644
--- a/docs/source/en/tasks/image_feature_extraction.md
+++ b/docs/source/en/tasks/image_feature_extraction.md
@@ -84,7 +84,7 @@ If you want to get the last hidden states before pooling, avoid passing any valu
 
 ```python
 pipe = pipeline(task="image-feature-extraction", model_name="google/vit-base-patch16-224", device=DEVICE)
-output = pipe(image_real)
+outputs = pipe(image_real)
 ```
 
 Since the outputs are unpooled, we get the last hidden states where the first dimension is the batch size, and the last two are the embedding shape.
diff --git a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md
index 17fb363df8..c1ccafb6fc 100644
--- a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md
+++ b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md
@@ -17,7 +17,7 @@ rendered properly in your Markdown viewer.
 
 [[open-in-colab]]
 
-Knowledge distillation is a technique used to transfer knowledge from a larger, more complex model (teacher) to a smaller, simpler model (student). To distill knowledge from one model to another, we take a pre-trained teacher model trained on a certain task (image classification for this case) and randomly initialize a student model to be trained on image classification. Next, we train the student model to minimize the difference between it's outputs and the teacher's outputs, thus making it mimic the behavior. It was first introduced in [Distilling the Knowledge in a Neural Network by Hinton et al](https://arxiv.org/abs/1503.02531). In this guide, we will do task-specific knowledge distillation. We will use the [beans dataset](https://huggingface.co/datasets/beans) for this.
+Knowledge distillation is a technique used to transfer knowledge from a larger, more complex model (teacher) to a smaller, simpler model (student). To distill knowledge from one model to another, we take a pre-trained teacher model trained on a certain task (image classification for this case) and randomly initialize a student model to be trained on image classification. Next, we train the student model to minimize the difference between its outputs and the teacher's outputs, thus making it mimic the behavior. It was first introduced in [Distilling the Knowledge in a Neural Network by Hinton et al](https://arxiv.org/abs/1503.02531). In this guide, we will do task-specific knowledge distillation. We will use the [beans dataset](https://huggingface.co/datasets/beans) for this.
 
 This guide demonstrates how you can distill a [fine-tuned ViT model](https://huggingface.co/merve/vit-mobilenet-beans-224) (teacher model) to a [MobileNet](https://huggingface.co/google/mobilenet_v2_1.4_224) (student model) using the [Trainer API](https://huggingface.co/docs/transformers/en/main_classes/trainer#trainer) of 🤗 Transformers.
 
diff --git a/docs/source/en/tasks/zero_shot_object_detection.md b/docs/source/en/tasks/zero_shot_object_detection.md
index 5ac4706bff..defe54b1c7 100644
--- a/docs/source/en/tasks/zero_shot_object_detection.md
+++ b/docs/source/en/tasks/zero_shot_object_detection.md
@@ -288,7 +288,7 @@ as before except now there are no labels.
 >>> scores = results["scores"].tolist()
 >>> boxes = results["boxes"].tolist()
 
->>> for box, score, label in zip(boxes, scores, labels):
+>>> for box, score in zip(boxes, scores):
 ...     xmin, ymin, xmax, ymax = box
 ...     draw.rectangle((xmin, ymin, xmax, ymax), outline="white", width=4)
 
diff --git a/src/transformers/models/olmoe/configuration_olmoe.py b/src/transformers/models/olmoe/configuration_olmoe.py
index 434d633bec..dbee3bfa0b 100644
--- a/src/transformers/models/olmoe/configuration_olmoe.py
+++ b/src/transformers/models/olmoe/configuration_olmoe.py
@@ -19,7 +19,7 @@ class OlmoeConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`OlmoeModel`]. It is used to instantiate an OLMoE
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the [allenai/OLMoE-1B-7B-0824](https://huggingface.co/allenai/OLMoE-1B-7B-0824).
+    defaults will yield a similar configuration to that of the [allenai/OLMoE-1B-7B-0924](https://huggingface.co/allenai/OLMoE-1B-7B-0924).
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py
index 169827ffd7..4398e2f5c9 100644
--- a/src/transformers/models/olmoe/modeling_olmoe.py
+++ b/src/transformers/models/olmoe/modeling_olmoe.py
@@ -1249,8 +1249,8 @@ class OlmoeForCausalLM(OlmoePreTrainedModel, GenerationMixin):
         ```python
         >>> from transformers import AutoTokenizer, OlmoeForCausalLM
 
-        >>> model = OlmoeForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0824")
-        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0824")
+        >>> model = OlmoeForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924")
+        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0924")
 
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")