Rename OLMo November to OLMo2 (#34864)

* Rename/move OLMo Nov files to OLMo2
* Rename Olmo1124 and its variants to Olmo2
2024-11-25 07:31:22 -08:00
parent 1de3598d30
commit 9121ab8fe8
17 changed files with 221 additions and 221 deletions
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -516,8 +516,8 @@
        title: Nyströmformer
      - local: model_doc/olmo
        title: OLMo
-      - local: model_doc/olmo_1124
+      - local: model_doc/olmo2
-        title: OLMo November 2024
+        title: OLMo2
      - local: model_doc/olmoe
        title: OLMoE
      - local: model_doc/open-llama
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -240,7 +240,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                        [Nougat](model_doc/nougat)                        |       ✅        |         ✅         |      ✅      |
 |                 [Nyströmformer](model_doc/nystromformer)                 |       ✅        |         ❌         |      ❌      |
 |                          [OLMo](model_doc/olmo)                          |       ✅        |         ❌         |      ❌      |
-|                [OLMo November 2024](model_doc/olmo_1124)                 |       ✅        |         ❌         |      ❌      |
+|                         [OLMo2](model_doc/olmo2)                         |       ✅        |         ❌         |      ❌      |
 |                         [OLMoE](model_doc/olmoe)                         |       ✅        |         ❌         |      ❌      |
 |                   [OmDet-Turbo](model_doc/omdet-turbo)                   |       ✅        |         ❌         |      ❌      |
 |                     [OneFormer](model_doc/oneformer)                     |       ✅        |         ❌         |      ❌      |
--- a/docs/source/en/model_doc/olmo_1124.md
+++ b/docs/source/en/model_doc/olmo_1124.md
@@ -14,11 +14,11 @@ rendered properly in your Markdown viewer.
 -->
-# OLMo November 2024
+# OLMo2
 ## Overview
-The OLMo November 2024 model is a successor of the OLMo model, which was proposed in
+The OLMo2 model is the successor of the OLMo model, which was proposed in
 [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838).
 The architectural changes from the original OLMo model to this model are:
@@ -31,16 +31,16 @@ This model was contributed by [shanearora](https://huggingface.co/shanearora).
 The original code can be found [here](https://github.com/allenai/OLMo/tree/main/olmo).
-## Olmo1124Config
+## Olmo2Config
-[[autodoc]] Olmo1124Config
+[[autodoc]] Olmo2Config
-## Olmo1124Model
+## Olmo2Model
-[[autodoc]] Olmo1124Model
+[[autodoc]] Olmo2Model
    - forward
-## Olmo1124ForCausalLM
+## Olmo2ForCausalLM
-[[autodoc]] Olmo1124ForCausalLM
+[[autodoc]] Olmo2ForCausalLM
    - forward
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@@ -77,7 +77,7 @@ FlashAttention-2 is currently supported for the following architectures:
 * [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron)
 * [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)
 * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
-* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model)
+* [OLMo2](https://huggingface.co/docs/transformers/model_doc/olmo2#transformers.Olmo2Model)
 * [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel)
 * [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel)
 * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)
@@ -261,7 +261,7 @@ For now, Transformers supports SDPA inference and training for the following arc
 * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
 * [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)
 * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
-* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model)
+* [OLMo2](https://huggingface.co/docs/transformers/model_doc/olmo2#transformers.Olmo2Model)
 * [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel)
 * [OPT](https://huggingface.co/docs/transformers/en/model_doc/opt)
 * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -620,7 +620,7 @@ _import_structure = {
    "models.nougat": ["NougatProcessor"],
    "models.nystromformer": ["NystromformerConfig"],
    "models.olmo": ["OlmoConfig"],
-    "models.olmo_1124": ["Olmo1124Config"],
+    "models.olmo2": ["Olmo2Config"],
    "models.olmoe": ["OlmoeConfig"],
    "models.omdet_turbo": [
        "OmDetTurboConfig",
@@ -2920,11 +2920,11 @@ else:
            "OlmoPreTrainedModel",
        ]
    )
-    _import_structure["models.olmo_1124"].extend(
+    _import_structure["models.olmo2"].extend(
        [
-            "Olmo1124ForCausalLM",
+            "Olmo2ForCausalLM",
-            "Olmo1124Model",
+            "Olmo2Model",
-            "Olmo1124PreTrainedModel",
+            "Olmo2PreTrainedModel",
        ]
    )
    _import_structure["models.olmoe"].extend(
@@ -5514,7 +5514,7 @@ if TYPE_CHECKING:
        NystromformerConfig,
    )
    from .models.olmo import OlmoConfig
-    from .models.olmo_1124 import Olmo1124Config
+    from .models.olmo2 import Olmo2Config
    from .models.olmoe import OlmoeConfig
    from .models.omdet_turbo import (
        OmDetTurboConfig,
@@ -7533,10 +7533,10 @@ if TYPE_CHECKING:
            OlmoModel,
            OlmoPreTrainedModel,
        )
-        from .models.olmo_1124 import (
+        from .models.olmo2 import (
-            Olmo1124ForCausalLM,
+            Olmo2ForCausalLM,
-            Olmo1124Model,
+            Olmo2Model,
-            Olmo1124PreTrainedModel,
+            Olmo2PreTrainedModel,
        )
        from .models.olmoe import (
            OlmoeForCausalLM,
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -177,7 +177,7 @@ from . import (
    nougat,
    nystromformer,
    olmo,
-    olmo_1124,
+    olmo2,
    olmoe,
    omdet_turbo,
    oneformer,
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -195,7 +195,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
        ("nougat", "VisionEncoderDecoderConfig"),
        ("nystromformer", "NystromformerConfig"),
        ("olmo", "OlmoConfig"),
-        ("olmo_1124", "Olmo1124Config"),
+        ("olmo2", "Olmo2Config"),
        ("olmoe", "OlmoeConfig"),
        ("omdet-turbo", "OmDetTurboConfig"),
        ("oneformer", "OneFormerConfig"),
@@ -511,7 +511,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
        ("nougat", "Nougat"),
        ("nystromformer", "Nyströmformer"),
        ("olmo", "OLMo"),
-        ("olmo_1124", "OLMo November 2024"),
+        ("olmo2", "OLMo2"),
        ("olmoe", "OLMoE"),
        ("omdet-turbo", "OmDet-Turbo"),
        ("oneformer", "OneFormer"),
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -184,7 +184,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
        ("nllb-moe", "NllbMoeModel"),
        ("nystromformer", "NystromformerModel"),
        ("olmo", "OlmoModel"),
-        ("olmo_1124", "Olmo1124Model"),
+        ("olmo2", "Olmo2Model"),
        ("olmoe", "OlmoeModel"),
        ("omdet-turbo", "OmDetTurboForObjectDetection"),
        ("oneformer", "OneFormerModel"),
@@ -517,7 +517,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
        ("mvp", "MvpForCausalLM"),
        ("nemotron", "NemotronForCausalLM"),
        ("olmo", "OlmoForCausalLM"),
-        ("olmo_1124", "Olmo1124ForCausalLM"),
+        ("olmo2", "Olmo2ForCausalLM"),
        ("olmoe", "OlmoeForCausalLM"),
        ("open-llama", "OpenLlamaForCausalLM"),
        ("openai-gpt", "OpenAIGPTLMHeadModel"),
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -348,7 +348,7 @@ else:
                ),
            ),
            ("olmo", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
-            ("olmo_1124", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
+            ("olmo2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("olmoe", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            (
                "omdet-turbo",
--- a/src/transformers/models/olmo_1124/__init__.py
+++ b/src/transformers/models/olmo_1124/__init__.py
@@ -18,8 +18,8 @@ from ...utils.import_utils import define_import_structure
 if TYPE_CHECKING:
-    from .configuration_olmo_1124 import *
+    from .configuration_olmo2 import *
-    from .modeling_olmo_1124 import *
+    from .modeling_olmo2 import *
 else:
    import sys
--- a/src/transformers/models/olmo_1124/configuration_olmo_1124.py
+++ b/src/transformers/models/olmo_1124/configuration_olmo_1124.py
@@ -1,18 +1,18 @@
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-#           This file was automatically generated from src/transformers/models/olmo_1124/modular_olmo_1124.py.
+#           This file was automatically generated from src/transformers/models/olmo2/modular_olmo2.py.
 #               Do NOT edit this file manually as any edits will be overwritten by the generation of
 #             the file from the modular. If any change should be done, please apply the change to the
-#                          modular_olmo_1124.py file directly. One of our CI enforces this.
+#                          modular_olmo2.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 from ...configuration_utils import PretrainedConfig
-class Olmo1124Config(PretrainedConfig):
+class Olmo2Config(PretrainedConfig):
    r"""
-    This is the configuration class to store the configuration of a [`Olmo1124Model`]. It is used to instantiate an OLMo November 2024
+    This is the configuration class to store the configuration of a [`Olmo2Model`]. It is used to instantiate an OLMo2
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the [allenai/Olmo1124-7B-hf](https://huggingface.co/allenai/Olmo1124-7B-hf).
+    defaults will yield a similar configuration to that of the [allenai/Olmo2-7B-1124-hf](https://huggingface.co/allenai/Olmo2-7B-1124-hf).
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
@@ -20,8 +20,8 @@ class Olmo1124Config(PretrainedConfig):
    Args:
        vocab_size (`int`, *optional*, defaults to 50304):
-            Vocabulary size of the Olmo1124 model. Defines the number of different tokens that can be represented by the
+            Vocabulary size of the Olmo2 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`Olmo1124Model`]
+            `inputs_ids` passed when calling [`Olmo2Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
@@ -73,20 +73,20 @@ class Olmo1124Config(PretrainedConfig):
            The epsilon used by the rms normalization layers.
    ```python
-    >>> from transformers import Olmo1124Model, Olmo1124Config
+    >>> from transformers import Olmo2Model, Olmo2Config
-    >>> # Initializing a Olmo November 2024 7B style configuration
+    >>> # Initializing a Olmo2 7B style configuration
-    >>> configuration = Olmo1124Config()
+    >>> configuration = Olmo2Config()
-    >>> # Initializing a model from the Olmo November 2024 7B style configuration
+    >>> # Initializing a model from the Olmo2 7B style configuration
-    >>> model = Olmo1124Model(configuration)
+    >>> model = Olmo2Model(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """
-    model_type = "olmo_1124"
+    model_type = "olmo2"
    keys_to_ignore_at_inference = ["past_key_values"]
    def __init__(
@@ -163,4 +163,4 @@ class Olmo1124Config(PretrainedConfig):
            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
-__all__ = ["Olmo1124Config"]
+__all__ = ["Olmo2Config"]
--- a/src/transformers/models/olmo_1124/convert_olmo_1124_weights_to_hf.py
+++ b/src/transformers/models/olmo_1124/convert_olmo_1124_weights_to_hf.py
@@ -23,7 +23,7 @@ import torch
 import yaml
 from tokenizers import Tokenizer
-from transformers import Olmo1124Config, Olmo1124ForCausalLM
+from transformers import Olmo2Config, Olmo2ForCausalLM
 from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
@@ -31,16 +31,16 @@ from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
 Sample usage:
 ```
-python src/transformers/models/olmo_1124/convert_olmo_1124_weights_to_hf.py \
+python src/transformers/models/olmo2/convert_olmo2_weights_to_hf.py \
-    --input_dir /path/to/downloaded/olmo_1124/weights --model_size 7B --output_dir /output/path
+    --input_dir /path/to/downloaded/olmo2/weights --model_size 7B --output_dir /output/path
 ```
 Thereafter, models can be loaded via:
 ```py
-from transformers import Olmo1124ForCausalLM, AutoTokenizer
+from transformers import Olmo2ForCausalLM, AutoTokenizer
-model = Olmo1124ForCausalLM.from_pretrained("/output/path")
+model = Olmo2ForCausalLM.from_pretrained("/output/path")
 tokenizer = AutoTokenizer.from_pretrained("/output/path")
 ```
@@ -77,26 +77,26 @@ def write_model(
    os.makedirs(tmp_model_path, exist_ok=True)
    config_path = Path(input_base_path) / "config.yaml"
-    olmo_1124_config = yaml.safe_load(config_path.read_text())["model"]
+    olmo2_config = yaml.safe_load(config_path.read_text())["model"]
-    if not olmo_1124_config.get("attention_layer_norm", False):
+    if not olmo2_config.get("attention_layer_norm", False):
-        raise RuntimeError("OLMo November 2024 checkpoints must have attention layer norm")
+        raise RuntimeError("OLMo2 checkpoints must have attention layer norm")
-    if not olmo_1124_config.get("norm_after", False):
+    if not olmo2_config.get("norm_after", False):
-        raise RuntimeError("OLMo November 2024 checkpoints must set norm_after to True")
+        raise RuntimeError("OLMo2 checkpoints must set norm_after to True")
-    n_layers = olmo_1124_config["n_layers"]
+    n_layers = olmo2_config["n_layers"]
-    n_heads = olmo_1124_config["n_heads"]
+    n_heads = olmo2_config["n_heads"]
-    dim = olmo_1124_config["d_model"]
+    dim = olmo2_config["d_model"]
    dims_per_head = dim // n_heads
-    base = olmo_1124_config["rope_theta"]
+    base = olmo2_config["rope_theta"]
    inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
-    max_position_embeddings = olmo_1124_config["max_sequence_length"]
+    max_position_embeddings = olmo2_config["max_sequence_length"]
-    vocab_size = olmo_1124_config.get("embedding_size", olmo_1124_config["vocab_size"])
+    vocab_size = olmo2_config.get("embedding_size", olmo2_config["vocab_size"])
-    if olmo_1124_config.get("n_kv_heads", None) is not None:
+    if olmo2_config.get("n_kv_heads", None) is not None:
-        num_key_value_heads = olmo_1124_config["n_kv_heads"]  # for GQA / MQA
+        num_key_value_heads = olmo2_config["n_kv_heads"]  # for GQA / MQA
-    elif olmo_1124_config["multi_query_attention"]:  # compatibility with other checkpoints
+    elif olmo2_config["multi_query_attention"]:  # compatibility with other checkpoints
        num_key_value_heads = 1
    else:
        num_key_value_heads = n_heads
@@ -167,17 +167,17 @@ def write_model(
    index_dict["metadata"] = {"total_size": param_count * 2}
    write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
-    if olmo_1124_config.get("mlp_hidden_size", None) is not None:
+    if olmo2_config.get("mlp_hidden_size", None) is not None:
-        intermediate_size = olmo_1124_config["mlp_hidden_size"] // 2
+        intermediate_size = olmo2_config["mlp_hidden_size"] // 2
    else:
-        intermediate_size = (dim * olmo_1124_config["mlp_ratio"]) // 2
+        intermediate_size = (dim * olmo2_config["mlp_ratio"]) // 2
-    if fix_eos_token_id and olmo_1124_config["eos_token_id"] == 0:
+    if fix_eos_token_id and olmo2_config["eos_token_id"] == 0:
        # Fixing a bug in OLMo where eos token id was incorrectly set
        print("Changing eos_token_id from 0 to 50279.")
-        olmo_1124_config["eos_token_id"] = 50279
+        olmo2_config["eos_token_id"] = 50279
-    config = Olmo1124Config(
+    config = Olmo2Config(
        vocab_size=vocab_size,
        hidden_size=dim,
        intermediate_size=intermediate_size,
@@ -185,11 +185,11 @@ def write_model(
        num_attention_heads=n_heads,
        num_key_value_heads=num_key_value_heads,
        max_position_embeddings=max_position_embeddings,
-        pad_token_id=olmo_1124_config["pad_token_id"],
+        pad_token_id=olmo2_config["pad_token_id"],
        bos_token_id=None,
-        eos_token_id=olmo_1124_config["eos_token_id"],
+        eos_token_id=olmo2_config["eos_token_id"],
-        tie_word_embeddings=olmo_1124_config["weight_tying"],
+        tie_word_embeddings=olmo2_config["weight_tying"],
-        rms_norm_eps=olmo_1124_config["layer_norm_eps"],
+        rms_norm_eps=olmo2_config["layer_norm_eps"],
        rope_theta=base,
    )
    config.save_pretrained(tmp_model_path)
@@ -202,8 +202,8 @@ def write_model(
    if include_tokenizer:
        _write_tokenizer(model_path, config, input_base_path, tokenizer_path)
-    print("Loading the checkpoint in a OLMo November 2024 model.")
+    print("Loading the checkpoint in a OLMo2 model.")
-    model = Olmo1124ForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32, low_cpu_mem_usage=True)
+    model = Olmo2ForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32, low_cpu_mem_usage=True)
    # Avoid saving this as part of the config.
    del model.config._name_or_path
    print("Saving in the Transformers format.")
@@ -216,7 +216,7 @@ def write_model(
 def _write_tokenizer(
    output_path: Path,
-    config: Olmo1124Config,
+    config: Olmo2Config,
    checkpoint_dir: str,
    input_tokenizer_path: Path | None,
 ) -> None:
@@ -251,7 +251,7 @@ def main():
    parser.add_argument(
        "--input_dir",
        required=True,
-        help="Location of OLMo November 2024 weights, which contains config.yaml and model.pt.",
+        help="Location of OLMo2 weights, which contains config.yaml and model.pt.",
    )
    parser.add_argument(
        "--no_tokenizer",
@@ -263,7 +263,7 @@ def main():
        "--tokenizer_json_path",
        type=Path,
        default=None,
-        help="Location of OLMo November 2024 tokenizer json file. Defaults to what is set in the config file.",
+        help="Location of OLMo2 tokenizer json file. Defaults to what is set in the config file.",
    )
    parser.add_argument(
        "--output_dir",
--- a/src/transformers/models/olmo_1124/modeling_olmo_1124.py
+++ b/src/transformers/models/olmo_1124/modeling_olmo_1124.py
@@ -1,8 +1,8 @@
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-#           This file was automatically generated from src/transformers/models/olmo_1124/modular_olmo_1124.py.
+#           This file was automatically generated from src/transformers/models/olmo2/modular_olmo2.py.
 #               Do NOT edit this file manually as any edits will be overwritten by the generation of
 #             the file from the modular. If any change should be done, please apply the change to the
-#                          modular_olmo_1124.py file directly. One of our CI enforces this.
+#                          modular_olmo2.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 import math
 from typing import List, Optional, Tuple, Union
@@ -25,7 +25,7 @@ from ...utils import (
    logging,
    replace_return_docstrings,
 )
-from .configuration_olmo_1124 import Olmo1124Config
+from .configuration_olmo2 import Olmo2Config
 if is_flash_attn_2_available():
@@ -34,13 +34,13 @@ if is_flash_attn_2_available():
 logger = logging.get_logger(__name__)
-_CONFIG_FOR_DOC = "Olmo1124Config"
+_CONFIG_FOR_DOC = "Olmo2Config"
-class Olmo1124RMSNorm(nn.Module):
+class Olmo2RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
-        Olmo1124RMSNorm is equivalent to T5LayerNorm
+        Olmo2RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -57,9 +57,9 @@ class Olmo1124RMSNorm(nn.Module):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
-# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmo1124
+# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmo2
 # TODO(joao): add me back asap :)
-class Olmo1124RotaryEmbedding(nn.Module):
+class Olmo2RotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        super().__init__()
        self.scaling_factor = scaling_factor
@@ -88,10 +88,10 @@ class Olmo1124RotaryEmbedding(nn.Module):
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
-# copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Olmo1124
+# copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Olmo2
 # TODO(joao): add me back asap :)
-class Olmo1124LinearScalingRotaryEmbedding(Olmo1124RotaryEmbedding):
+class Olmo2LinearScalingRotaryEmbedding(Olmo2RotaryEmbedding):
-    """Olmo1124RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+    """Olmo2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
    def forward(self, x, position_ids):
        # difference to the original RoPE: a scaling factor is applied to the position ids
@@ -100,10 +100,10 @@ class Olmo1124LinearScalingRotaryEmbedding(Olmo1124RotaryEmbedding):
        return cos, sin
-# copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Olmo1124
+# copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Olmo2
 # TODO(joao): add me back asap :)
-class Olmo1124DynamicNTKScalingRotaryEmbedding(Olmo1124RotaryEmbedding):
+class Olmo2DynamicNTKScalingRotaryEmbedding(Olmo2RotaryEmbedding):
-    """Olmo1124RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+    """Olmo2RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
    def forward(self, x, position_ids):
        # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length
@@ -167,12 +167,12 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-class Olmo1124Attention(nn.Module):
+class Olmo2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""
-    # copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ with Llama->Olmo1124
+    # copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ with Llama->Olmo2
    # TODO(joao): add me back asap :)
-    def __init__(self, config: Olmo1124Config, layer_idx: Optional[int] = None):
+    def __init__(self, config: Olmo2Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
@@ -204,12 +204,12 @@ class Olmo1124Attention(nn.Module):
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
        self._init_rope()
-        self.q_norm = Olmo1124RMSNorm(self.num_heads * self.head_dim, config.rms_norm_eps)
+        self.q_norm = Olmo2RMSNorm(self.num_heads * self.head_dim, config.rms_norm_eps)
-        self.k_norm = Olmo1124RMSNorm(self.num_key_value_heads * self.head_dim, config.rms_norm_eps)
+        self.k_norm = Olmo2RMSNorm(self.num_key_value_heads * self.head_dim, config.rms_norm_eps)
    def _init_rope(self):
        if self.config.rope_scaling is None:
-            self.rotary_emb = Olmo1124RotaryEmbedding(
+            self.rotary_emb = Olmo2RotaryEmbedding(
                self.head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
@@ -218,14 +218,14 @@ class Olmo1124Attention(nn.Module):
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
-                self.rotary_emb = Olmo1124LinearScalingRotaryEmbedding(
+                self.rotary_emb = Olmo2LinearScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
-                self.rotary_emb = Olmo1124DynamicNTKScalingRotaryEmbedding(
+                self.rotary_emb = Olmo2DynamicNTKScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
@@ -295,13 +295,13 @@ class Olmo1124Attention(nn.Module):
        return attn_output, attn_weights, past_key_value
-class Olmo1124FlashAttention2(Olmo1124Attention):
+class Olmo2FlashAttention2(Olmo2Attention):
    """
-    Olmo1124 flash attention module. This module inherits from `Olmo1124Attention` as the weights of the module stays
+    Olmo2 flash attention module. This module inherits from `Olmo2Attention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
-    OLMo November 2024 flash attention module. This module inherits from `Olmo1124Attention` as the weights of the module stays
+    OLMo2 flash attention module. This module inherits from `Olmo2Attention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """
@@ -403,14 +403,14 @@ class Olmo1124FlashAttention2(Olmo1124Attention):
        return attn_output, attn_weights, past_key_value
-class Olmo1124SdpaAttention(Olmo1124Attention):
+class Olmo2SdpaAttention(Olmo2Attention):
    """
-    Olmo1124 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    Olmo2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
-    `Olmo1124Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    `Olmo2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """
-    # Adapted from Olmo1124Attention.forward
+    # Adapted from Olmo2Attention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
@@ -424,7 +424,7 @@ class Olmo1124SdpaAttention(Olmo1124Attention):
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
-                "Olmo1124Model is using Olmo1124SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                "Olmo2Model is using Olmo2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
@@ -479,7 +479,7 @@ class Olmo1124SdpaAttention(Olmo1124Attention):
        return attn_output, None, past_key_value
-class Olmo1124MLP(nn.Module):
+class Olmo2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
@@ -494,23 +494,23 @@ class Olmo1124MLP(nn.Module):
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-OLMO_1124_ATTENTION_CLASSES = {
+OLMO2_ATTENTION_CLASSES = {
-    "eager": Olmo1124Attention,
+    "eager": Olmo2Attention,
-    "flash_attention_2": Olmo1124FlashAttention2,
+    "flash_attention_2": Olmo2FlashAttention2,
-    "sdpa": Olmo1124SdpaAttention,
+    "sdpa": Olmo2SdpaAttention,
 }
-class Olmo1124DecoderLayer(nn.Module):
+class Olmo2DecoderLayer(nn.Module):
-    def __init__(self, config: Olmo1124Config, layer_idx: int):
+    def __init__(self, config: Olmo2Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
-        self.self_attn = OLMO_1124_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+        self.self_attn = OLMO2_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
-        self.mlp = Olmo1124MLP(config)
+        self.mlp = Olmo2MLP(config)
-        self.post_attention_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_feedforward_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_feedforward_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
    # copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer.forward
    # TODO(joao): add me back asap :)
@@ -574,7 +574,7 @@ class Olmo1124DecoderLayer(nn.Module):
        return outputs
-OLMO_1124_START_DOCSTRING = r"""
+OLMO2_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
@@ -584,7 +584,7 @@ OLMO_1124_START_DOCSTRING = r"""
    and behavior.
    Parameters:
-        config ([`Olmo1124Config`]):
+        config ([`Olmo2Config`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -592,14 +592,14 @@ OLMO_1124_START_DOCSTRING = r"""
@add_start_docstrings(
-    "The bare Olmo1124 Model outputting raw hidden-states without any specific head on top.",
+    "The bare Olmo2 Model outputting raw hidden-states without any specific head on top.",
-    OLMO_1124_START_DOCSTRING,
+    OLMO2_START_DOCSTRING,
 )
-class Olmo1124PreTrainedModel(PreTrainedModel):
+class Olmo2PreTrainedModel(PreTrainedModel):
-    config_class = Olmo1124Config
+    config_class = Olmo2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
-    _no_split_modules = ["Olmo1124DecoderLayer"]
+    _no_split_modules = ["Olmo2DecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
@@ -619,7 +619,7 @@ class Olmo1124PreTrainedModel(PreTrainedModel):
                module.weight.data[module.padding_idx].zero_()
-OLMO_1124_INPUTS_DOCSTRING = r"""
+OLMO2_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -695,27 +695,27 @@ OLMO_1124_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-    "The bare Olmo1124 Model outputting raw hidden-states without any specific head on top.",
+    "The bare Olmo2 Model outputting raw hidden-states without any specific head on top.",
-    OLMO_1124_START_DOCSTRING,
+    OLMO2_START_DOCSTRING,
 )
-class Olmo1124Model(Olmo1124PreTrainedModel):
+class Olmo2Model(Olmo2PreTrainedModel):
    """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Olmo1124DecoderLayer`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Olmo2DecoderLayer`]
    Args:
-        config: Olmo1124Config
+        config: Olmo2Config
    """
-    def __init__(self, config: Olmo1124Config):
+    def __init__(self, config: Olmo2Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
-            [Olmo1124DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+            [Olmo2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
-        self.norm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
@@ -727,7 +727,7 @@ class Olmo1124Model(Olmo1124PreTrainedModel):
    def set_input_embeddings(self, value):
        self.embed_tokens = value
-    @add_start_docstrings_to_model_forward(OLMO_1124_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(OLMO2_INPUTS_DOCSTRING)
    # copied from transformers.models.llama.modeling_llama.LlamaModel.forward
    # TODO(joao): add me back asap :)
    def forward(
@@ -971,13 +971,13 @@ class Olmo1124Model(Olmo1124PreTrainedModel):
        return causal_mask
-# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->OLMO_1124,Llama->Olmo1124
+# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->OLMO2,Llama->Olmo2
-class Olmo1124ForCausalLM(Olmo1124PreTrainedModel, GenerationMixin):
+class Olmo2ForCausalLM(Olmo2PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
-    def __init__(self, config: Olmo1124Config):
+    def __init__(self, config: Olmo2Config):
        super().__init__(config)
-        self.model = Olmo1124Model(config)
+        self.model = Olmo2Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
@@ -1002,7 +1002,7 @@ class Olmo1124ForCausalLM(Olmo1124PreTrainedModel, GenerationMixin):
    def get_decoder(self):
        return self.model
-    @add_start_docstrings_to_model_forward(OLMO_1124_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(OLMO2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    # Ignore copy
    def forward(
@@ -1038,10 +1038,10 @@ class Olmo1124ForCausalLM(Olmo1124PreTrainedModel, GenerationMixin):
        Example:
        ```python
-        >>> from transformers import AutoTokenizer, Olmo1124ForCausalLM
+        >>> from transformers import AutoTokenizer, Olmo2ForCausalLM
-        >>> model = Olmo1124ForCausalLM.from_pretrained("allenai/Olmo1124-1B-hf")
+        >>> model = Olmo2ForCausalLM.from_pretrained("allenai/Olmo2-1B-hf")
-        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo1124-1B-hf")
+        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo2-1B-hf")
        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")
@@ -1093,4 +1093,4 @@ class Olmo1124ForCausalLM(Olmo1124PreTrainedModel, GenerationMixin):
        )
-__all__ = ["Olmo1124ForCausalLM", "Olmo1124Model", "Olmo1124PreTrainedModel"]
+__all__ = ["Olmo2ForCausalLM", "Olmo2Model", "Olmo2PreTrainedModel"]
--- a/src/transformers/models/olmo_1124/modular_olmo_1124.py
+++ b/src/transformers/models/olmo_1124/modular_olmo_1124.py
@@ -28,11 +28,11 @@ if is_flash_attn_2_available():
 logger = logging.get_logger(__name__)
-class Olmo1124Config(OlmoConfig):
+class Olmo2Config(OlmoConfig):
    r"""
-    This is the configuration class to store the configuration of a [`Olmo1124Model`]. It is used to instantiate an OLMo November 2024
+    This is the configuration class to store the configuration of a [`Olmo2Model`]. It is used to instantiate an OLMo2
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the [allenai/Olmo1124-7B-hf](https://huggingface.co/allenai/Olmo1124-7B-hf).
+    defaults will yield a similar configuration to that of the [allenai/Olmo2-7B-1124-hf](https://huggingface.co/allenai/Olmo2-7B-1124-hf).
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
@@ -40,8 +40,8 @@ class Olmo1124Config(OlmoConfig):
    Args:
        vocab_size (`int`, *optional*, defaults to 50304):
-            Vocabulary size of the Olmo1124 model. Defines the number of different tokens that can be represented by the
+            Vocabulary size of the Olmo2 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`Olmo1124Model`]
+            `inputs_ids` passed when calling [`Olmo2Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
@@ -93,20 +93,20 @@ class Olmo1124Config(OlmoConfig):
            The epsilon used by the rms normalization layers.
    ```python
-    >>> from transformers import Olmo1124Model, Olmo1124Config
+    >>> from transformers import Olmo2Model, Olmo2Config
-    >>> # Initializing a Olmo November 2024 7B style configuration
+    >>> # Initializing a Olmo2 7B style configuration
-    >>> configuration = Olmo1124Config()
+    >>> configuration = Olmo2Config()
-    >>> # Initializing a model from the Olmo November 2024 7B style configuration
+    >>> # Initializing a model from the Olmo2 7B style configuration
-    >>> model = Olmo1124Model(configuration)
+    >>> model = Olmo2Model(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """
-    model_type = "olmo_1124"
+    model_type = "olmo2"
    def __init__(
        self,
@@ -157,21 +157,21 @@ class Olmo1124Config(OlmoConfig):
        del self.clip_qkv
-class Olmo1124RMSNorm(LlamaRMSNorm):
+class Olmo2RMSNorm(LlamaRMSNorm):
    pass
-ALL_LAYERNORM_LAYERS.append(Olmo1124RMSNorm)
+ALL_LAYERNORM_LAYERS.append(Olmo2RMSNorm)
-# Olmo1124 attention is identical to OLMo attention except:
+# Olmo2 attention is identical to OLMo attention except:
 # - Norm is applied to attention queries and keys.
 # - No qkv clipping.
-class Olmo1124Attention(OlmoAttention):
+class Olmo2Attention(OlmoAttention):
-    def __init__(self, config: Olmo1124Config, layer_idx: Optional[int] = None):
+    def __init__(self, config: Olmo2Config, layer_idx: Optional[int] = None):
        super().__init__(config, layer_idx=layer_idx)
-        self.q_norm = Olmo1124RMSNorm(self.num_heads * self.head_dim, config.rms_norm_eps)
+        self.q_norm = Olmo2RMSNorm(self.num_heads * self.head_dim, config.rms_norm_eps)
-        self.k_norm = Olmo1124RMSNorm(self.num_key_value_heads * self.head_dim, config.rms_norm_eps)
+        self.k_norm = Olmo2RMSNorm(self.num_key_value_heads * self.head_dim, config.rms_norm_eps)
    def forward(
        self,
@@ -234,15 +234,15 @@ class Olmo1124Attention(OlmoAttention):
        return attn_output, attn_weights, past_key_value
-class Olmo1124FlashAttention2(OlmoFlashAttention2, Olmo1124Attention):
+class Olmo2FlashAttention2(OlmoFlashAttention2, Olmo2Attention):
    """
-    OLMo November 2024 flash attention module. This module inherits from `Olmo1124Attention` as the weights of the module stays
+    OLMo2 flash attention module. This module inherits from `Olmo2Attention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """
    def __init__(self, *args, **kwargs):
-        Olmo1124Attention.__init__(*args, **kwargs)
+        Olmo2Attention.__init__(*args, **kwargs)
        # TODO: Should be removed once Flash Attention for ROCm is bumped to 2.1.
        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
@@ -338,8 +338,8 @@ class Olmo1124FlashAttention2(OlmoFlashAttention2, Olmo1124Attention):
        return attn_output, attn_weights, past_key_value
-class Olmo1124SdpaAttention(OlmoSdpaAttention, Olmo1124Attention):
+class Olmo2SdpaAttention(OlmoSdpaAttention, Olmo2Attention):
-    # Adapted from Olmo1124Attention.forward
+    # Adapted from Olmo2Attention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
@@ -353,7 +353,7 @@ class Olmo1124SdpaAttention(OlmoSdpaAttention, Olmo1124Attention):
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
-                "Olmo1124Model is using Olmo1124SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                "Olmo2Model is using Olmo2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
@@ -408,14 +408,14 @@ class Olmo1124SdpaAttention(OlmoSdpaAttention, Olmo1124Attention):
        return attn_output, None, past_key_value
-# The OLMo November 2024 layers are identical to those of the OLMo model except:
+# The OLMo2 layers are identical to those of the OLMo model except:
 # - RMSNorm is used instead of standard layer norm.
 # - Norm is applied after attention/feedforward rather than before.
-class Olmo1124DecoderLayer(OlmoDecoderLayer):
+class Olmo2DecoderLayer(OlmoDecoderLayer):
-    def __init__(self, config: Olmo1124Config, layer_idx: int):
+    def __init__(self, config: Olmo2Config, layer_idx: int):
        super().__init__(config, layer_idx=layer_idx)
-        self.post_attention_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_feedforward_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_feedforward_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        del self.input_layernorm
    def forward(
@@ -459,31 +459,31 @@ class Olmo1124DecoderLayer(OlmoDecoderLayer):
        return outputs
-class Olmo1124PreTrainedModel(OlmoPreTrainedModel):
+class Olmo2PreTrainedModel(OlmoPreTrainedModel):
    pass
-# The OLMo November 2024 model is identical to the OLMo model, except RMSNorm is used instead of
+# The OLMo2 model is identical to the OLMo model, except RMSNorm is used instead of
 # standard layer norm for the output norm.
-class Olmo1124Model(OlmoModel):
+class Olmo2Model(OlmoModel):
-    def __init__(self, config: Olmo1124Config):
+    def __init__(self, config: Olmo2Config):
        super().__init__(config)
        self.layers = nn.ModuleList(
-            [Olmo1124DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+            [Olmo2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
-        self.norm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 # The heads now only need to redefine the model inside to the correct `Olmo2Model`
-class Olmo1124ForCausalLM(OlmoForCausalLM):
+class Olmo2ForCausalLM(OlmoForCausalLM):
-    def __init__(self, config: Olmo1124Config):
+    def __init__(self, config: Olmo2Config):
        super().__init__(config)
-        self.model = Olmo1124Model(config)
+        self.model = Olmo2Model(config)
 __all__ = [
-    "Olmo1124Config",
+    "Olmo2Config",
-    "Olmo1124ForCausalLM",
+    "Olmo2ForCausalLM",
-    "Olmo1124Model",
+    "Olmo2Model",
-    "Olmo1124PreTrainedModel",
+    "Olmo2PreTrainedModel",
 ]
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -6758,21 +6758,21 @@ class OlmoPreTrainedModel(metaclass=DummyObject):
        requires_backends(self, ["torch"])
-class Olmo1124ForCausalLM(metaclass=DummyObject):
+class Olmo2ForCausalLM(metaclass=DummyObject):
    _backends = ["torch"]
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
-class Olmo1124Model(metaclass=DummyObject):
+class Olmo2Model(metaclass=DummyObject):
    _backends = ["torch"]
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
-class Olmo1124PreTrainedModel(metaclass=DummyObject):
+class Olmo2PreTrainedModel(metaclass=DummyObject):
    _backends = ["torch"]
    def __init__(self, *args, **kwargs):
--- a/tests/models/olmo_1124/init.py
+++ b/tests/models/olmo_1124/init.py
--- a/tests/models/olmo_1124/test_modeling_olmo_1124.py
+++ b/tests/models/olmo_1124/test_modeling_olmo_1124.py
@@ -12,14 +12,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Testing suite for the PyTorch OLMo November 2024 model."""
+"""Testing suite for the PyTorch OLMo2 model."""
 import unittest
 from packaging import version
 from parameterized import parameterized
-from transformers import Olmo1124Config, is_torch_available, set_seed
+from transformers import Olmo2Config, is_torch_available, set_seed
 from transformers.generation.configuration_utils import GenerationConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 from transformers.testing_utils import (
@@ -39,12 +39,12 @@ if is_torch_available():
    import torch
    from transformers import (
-        Olmo1124ForCausalLM,
+        Olmo2ForCausalLM,
-        Olmo1124Model,
+        Olmo2Model,
    )
-class Olmo1124ModelTester:
+class Olmo2ModelTester:
    def __init__(
        self,
        parent,
@@ -119,7 +119,7 @@ class Olmo1124ModelTester:
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def get_config(self):
-        return Olmo1124Config(
+        return Olmo2Config(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
@@ -138,7 +138,7 @@ class Olmo1124ModelTester:
    def create_and_check_model(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
-        model = Olmo1124Model(config=config)
+        model = Olmo2Model(config=config)
        model.to(torch_device)
        model.eval()
        result = model(input_ids, attention_mask=input_mask)
@@ -158,7 +158,7 @@ class Olmo1124ModelTester:
        encoder_attention_mask,
    ):
        config.add_cross_attention = True
-        model = Olmo1124Model(config)
+        model = Olmo2Model(config)
        model.to(torch_device)
        model.eval()
        result = model(
@@ -187,7 +187,7 @@ class Olmo1124ModelTester:
        encoder_hidden_states,
        encoder_attention_mask,
    ):
-        model = Olmo1124ForCausalLM(config=config)
+        model = Olmo2ForCausalLM(config=config)
        model.to(torch_device)
        model.eval()
        result = model(input_ids, attention_mask=input_mask, labels=token_labels)
@@ -207,7 +207,7 @@ class Olmo1124ModelTester:
    ):
        config.is_decoder = True
        config.add_cross_attention = True
-        model = Olmo1124ForCausalLM(config=config)
+        model = Olmo2ForCausalLM(config=config)
        model.to(torch_device)
        model.eval()
@@ -271,13 +271,13 @@ class Olmo1124ModelTester:
@require_torch
-class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class Olmo2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (Olmo1124Model, Olmo1124ForCausalLM) if is_torch_available() else ()
+    all_model_classes = (Olmo2Model, Olmo2ForCausalLM) if is_torch_available() else ()
-    all_generative_model_classes = (Olmo1124ForCausalLM,) if is_torch_available() else ()
+    all_generative_model_classes = (Olmo2ForCausalLM,) if is_torch_available() else ()
    pipeline_model_mapping = (
        {
-            "feature-extraction": Olmo1124Model,
+            "feature-extraction": Olmo2Model,
-            "text-generation": Olmo1124ForCausalLM,
+            "text-generation": Olmo2ForCausalLM,
        }
        if is_torch_available()
        else {}
@@ -290,8 +290,8 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
    model_split_percents = [0.5, 0.7, 0.8]
    def setUp(self):
-        self.model_tester = Olmo1124ModelTester(self)
+        self.model_tester = Olmo2ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=Olmo1124Config, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=Olmo2Config, hidden_size=37)
    def test_config(self):
        self.config_tester.run_common_tests()
@@ -300,7 +300,7 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)
-    @unittest.skip(reason="OLMo November 2024 does not support head pruning.")
+    @unittest.skip(reason="OLMo2 does not support head pruning.")
    def test_headmasking(self):
        pass
@@ -310,7 +310,7 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
            config_and_inputs[0].position_embedding_type = type
            self.model_tester.create_and_check_model(*config_and_inputs)
-    @unittest.skip(reason="OLMo November 2024 buffers include complex numbers, which breaks this test")
+    @unittest.skip(reason="OLMo2 buffers include complex numbers, which breaks this test")
    def test_save_load_fast_init_from_base(self):
        pass
@@ -321,7 +321,7 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
        long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size)
        set_seed(42)  # Fixed seed at init time so the two models get the same random weights
-        original_model = Olmo1124Model(config)
+        original_model = Olmo2Model(config)
        original_model.to(torch_device)
        original_model.eval()
        original_short_output = original_model(short_input).last_hidden_state
@@ -329,7 +329,7 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
        set_seed(42)  # Fixed seed at init time so the two models get the same random weights
        config.rope_scaling = {"type": scaling_type, "factor": 10.0}
-        scaled_model = Olmo1124Model(config)
+        scaled_model = Olmo2Model(config)
        scaled_model.to(torch_device)
        scaled_model.eval()
        scaled_short_output = scaled_model(short_input).last_hidden_state
@@ -347,11 +347,11 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
@require_torch
-class Olmo1124IntegrationTest(unittest.TestCase):
+class Olmo2IntegrationTest(unittest.TestCase):
    @slow
    def test_model_7b_logits(self):
        input_ids = [[1, 306, 4658, 278, 6593, 310, 2834, 338]]
-        model = Olmo1124ForCausalLM.from_pretrained("shanearora/OLMo-7B-1124-hf", device_map="auto")
+        model = Olmo2ForCausalLM.from_pretrained("shanearora/OLMo2-7B-1124-hf", device_map="auto")
        out = model(torch.tensor(input_ids)).logits.float()
        # Expected mean on dim = -1
        EXPECTED_MEAN = torch.tensor(
@@ -366,8 +366,8 @@ class Olmo1124IntegrationTest(unittest.TestCase):
    def test_model_7b_greedy_generation(self):
        EXPECTED_TEXT_COMPLETION = """Simply put, the theory of relativity states that 1) the speed of light is constant, 2) the speed of light is the fastest speed possible, and 3) the speed of light is the same for all observers, regardless of their relative motion. The theory of relativity is based on the idea that the speed of light is constant. This means that"""
        prompt = "Simply put, the theory of relativity states that "
-        tokenizer = AutoTokenizer.from_pretrained("shanearora/OLMo-7B-1124-hf", device_map="auto")
+        tokenizer = AutoTokenizer.from_pretrained("shanearora/OLMo2-7B-1124-hf", device_map="auto")
-        model = Olmo1124ForCausalLM.from_pretrained("shanearora/OLMo-7B-1124-hf", device_map="auto")
+        model = Olmo2ForCausalLM.from_pretrained("shanearora/OLMo2-7B-1124-hf", device_map="auto")
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
        # greedy generation outputs
@@ -377,7 +377,7 @@ class Olmo1124IntegrationTest(unittest.TestCase):
    @require_tokenizers
    def test_simple_encode_decode(self):
-        rust_tokenizer = AutoTokenizer.from_pretrained("shanearora/OLMo-7B-1124-hf")
+        rust_tokenizer = AutoTokenizer.from_pretrained("shanearora/OLMo2-7B-1124-hf")
        self.assertEqual(rust_tokenizer.encode("This is a test"), [2028, 374, 264, 1296])
        self.assertEqual(rust_tokenizer.decode([2028, 374, 264, 1296], skip_special_tokens=True), "This is a test")
@@ -414,9 +414,9 @@ class Olmo1124IntegrationTest(unittest.TestCase):
            convert_and_export_with_cache,
        )
-        olmo_1124_model = "shanearora/OLMo-7B-1124-hf"
+        olmo2_model = "shanearora/OLMo2-7B-1124-hf"
-        tokenizer = AutoTokenizer.from_pretrained(olmo_1124_model, pad_token="</s>", padding_side="right")
+        tokenizer = AutoTokenizer.from_pretrained(olmo2_model, pad_token="</s>", padding_side="right")
        EXPECTED_TEXT_COMPLETION = [
            "Simply put, the theory of relativity states that 1) the speed of light is constant, 2) the speed of light",
        ]
@@ -439,8 +439,8 @@ class Olmo1124IntegrationTest(unittest.TestCase):
                "max_cache_len": max_generation_length,
            },
        )
-        model = Olmo1124ForCausalLM.from_pretrained(
+        model = Olmo2ForCausalLM.from_pretrained(
-            olmo_1124_model,
+            olmo2_model,
            device_map=device,
            torch_dtype=dtype,
            attn_implementation=attn_implementation,