From 9121ab8fe87a296e57f9846e70153b1a3c555d75 Mon Sep 17 00:00:00 2001 From: Shane A Date: Mon, 25 Nov 2024 07:31:22 -0800 Subject: [PATCH] Rename OLMo November to OLMo2 (#34864) * Rename/move OLMo Nov files to OLMo2 * Rename Olmo1124 and its variants to Olmo2 --- docs/source/en/_toctree.yml | 4 +- docs/source/en/index.md | 2 +- .../en/model_doc/{olmo_1124.md => olmo2.md} | 16 +-- docs/source/en/perf_infer_gpu_one.md | 4 +- src/transformers/__init__.py | 20 +-- src/transformers/models/__init__.py | 2 +- .../models/auto/configuration_auto.py | 4 +- src/transformers/models/auto/modeling_auto.py | 4 +- .../models/auto/tokenization_auto.py | 2 +- .../models/{olmo_1124 => olmo2}/__init__.py | 4 +- .../configuration_olmo2.py} | 28 ++-- .../convert_olmo2_weights_to_hf.py} | 68 ++++----- .../modeling_olmo2.py} | 134 +++++++++--------- .../modular_olmo2.py} | 84 +++++------ src/transformers/utils/dummy_pt_objects.py | 6 +- tests/models/{olmo_1124 => olmo2}/__init__.py | 0 .../test_modeling_olmo2.py} | 60 ++++---- 17 files changed, 221 insertions(+), 221 deletions(-) rename docs/source/en/model_doc/{olmo_1124.md => olmo2.md} (84%) rename src/transformers/models/{olmo_1124 => olmo2}/__init__.py (92%) rename src/transformers/models/{olmo_1124/configuration_olmo_1124.py => olmo2/configuration_olmo2.py} (90%) rename src/transformers/models/{olmo_1124/convert_olmo_1124_weights_to_hf.py => olmo2/convert_olmo2_weights_to_hf.py} (80%) rename src/transformers/models/{olmo_1124/modeling_olmo_1124.py => olmo2/modeling_olmo2.py} (91%) rename src/transformers/models/{olmo_1124/modular_olmo_1124.py => olmo2/modular_olmo2.py} (88%) rename tests/models/{olmo_1124 => olmo2}/__init__.py (100%) rename tests/models/{olmo_1124/test_modeling_olmo_1124.py => olmo2/test_modeling_olmo2.py} (91%) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index ca7ee4557f..0d2b752d5a 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -516,8 +516,8 @@ title: Nyströmformer - local: model_doc/olmo title: OLMo - - local: model_doc/olmo_1124 - title: OLMo November 2024 + - local: model_doc/olmo2 + title: OLMo2 - local: model_doc/olmoe title: OLMoE - local: model_doc/open-llama diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 341cb417c7..8a9ccf45b6 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -240,7 +240,7 @@ Flax), PyTorch, and/or TensorFlow. | [Nougat](model_doc/nougat) | ✅ | ✅ | ✅ | | [Nyströmformer](model_doc/nystromformer) | ✅ | ❌ | ❌ | | [OLMo](model_doc/olmo) | ✅ | ❌ | ❌ | -| [OLMo November 2024](model_doc/olmo_1124) | ✅ | ❌ | ❌ | +| [OLMo2](model_doc/olmo2) | ✅ | ❌ | ❌ | | [OLMoE](model_doc/olmoe) | ✅ | ❌ | ❌ | | [OmDet-Turbo](model_doc/omdet-turbo) | ✅ | ❌ | ❌ | | [OneFormer](model_doc/oneformer) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/olmo_1124.md b/docs/source/en/model_doc/olmo2.md similarity index 84% rename from docs/source/en/model_doc/olmo_1124.md rename to docs/source/en/model_doc/olmo2.md index f36ec438e5..8ca3326660 100644 --- a/docs/source/en/model_doc/olmo_1124.md +++ b/docs/source/en/model_doc/olmo2.md @@ -14,11 +14,11 @@ rendered properly in your Markdown viewer. --> -# OLMo November 2024 +# OLMo2 ## Overview -The OLMo November 2024 model is a successor of the OLMo model, which was proposed in +The OLMo2 model is the successor of the OLMo model, which was proposed in [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838). The architectural changes from the original OLMo model to this model are: @@ -31,16 +31,16 @@ This model was contributed by [shanearora](https://huggingface.co/shanearora). The original code can be found [here](https://github.com/allenai/OLMo/tree/main/olmo). -## Olmo1124Config +## Olmo2Config -[[autodoc]] Olmo1124Config +[[autodoc]] Olmo2Config -## Olmo1124Model +## Olmo2Model -[[autodoc]] Olmo1124Model +[[autodoc]] Olmo2Model - forward -## Olmo1124ForCausalLM +## Olmo2ForCausalLM -[[autodoc]] Olmo1124ForCausalLM +[[autodoc]] Olmo2ForCausalLM - forward diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 84109746f9..8a106cae73 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -77,7 +77,7 @@ FlashAttention-2 is currently supported for the following architectures: * [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron) * [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb) * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel) -* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model) +* [OLMo2](https://huggingface.co/docs/transformers/model_doc/olmo2#transformers.Olmo2Model) * [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel) * [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel) * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration) @@ -261,7 +261,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel) * [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb) * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel) -* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model) +* [OLMo2](https://huggingface.co/docs/transformers/model_doc/olmo2#transformers.Olmo2Model) * [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel) * [OPT](https://huggingface.co/docs/transformers/en/model_doc/opt) * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e56959928b..ce2a2553d6 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -620,7 +620,7 @@ _import_structure = { "models.nougat": ["NougatProcessor"], "models.nystromformer": ["NystromformerConfig"], "models.olmo": ["OlmoConfig"], - "models.olmo_1124": ["Olmo1124Config"], + "models.olmo2": ["Olmo2Config"], "models.olmoe": ["OlmoeConfig"], "models.omdet_turbo": [ "OmDetTurboConfig", @@ -2920,11 +2920,11 @@ else: "OlmoPreTrainedModel", ] ) - _import_structure["models.olmo_1124"].extend( + _import_structure["models.olmo2"].extend( [ - "Olmo1124ForCausalLM", - "Olmo1124Model", - "Olmo1124PreTrainedModel", + "Olmo2ForCausalLM", + "Olmo2Model", + "Olmo2PreTrainedModel", ] ) _import_structure["models.olmoe"].extend( @@ -5514,7 +5514,7 @@ if TYPE_CHECKING: NystromformerConfig, ) from .models.olmo import OlmoConfig - from .models.olmo_1124 import Olmo1124Config + from .models.olmo2 import Olmo2Config from .models.olmoe import OlmoeConfig from .models.omdet_turbo import ( OmDetTurboConfig, @@ -7533,10 +7533,10 @@ if TYPE_CHECKING: OlmoModel, OlmoPreTrainedModel, ) - from .models.olmo_1124 import ( - Olmo1124ForCausalLM, - Olmo1124Model, - Olmo1124PreTrainedModel, + from .models.olmo2 import ( + Olmo2ForCausalLM, + Olmo2Model, + Olmo2PreTrainedModel, ) from .models.olmoe import ( OlmoeForCausalLM, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 0d4b9f2f94..2d2a3b41d4 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -177,7 +177,7 @@ from . import ( nougat, nystromformer, olmo, - olmo_1124, + olmo2, olmoe, omdet_turbo, oneformer, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 7f0182b500..4ab6d39228 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -195,7 +195,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ("nougat", "VisionEncoderDecoderConfig"), ("nystromformer", "NystromformerConfig"), ("olmo", "OlmoConfig"), - ("olmo_1124", "Olmo1124Config"), + ("olmo2", "Olmo2Config"), ("olmoe", "OlmoeConfig"), ("omdet-turbo", "OmDetTurboConfig"), ("oneformer", "OneFormerConfig"), @@ -511,7 +511,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ("nougat", "Nougat"), ("nystromformer", "Nyströmformer"), ("olmo", "OLMo"), - ("olmo_1124", "OLMo November 2024"), + ("olmo2", "OLMo2"), ("olmoe", "OLMoE"), ("omdet-turbo", "OmDet-Turbo"), ("oneformer", "OneFormer"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 5206972b72..2c519a7dc4 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -184,7 +184,7 @@ MODEL_MAPPING_NAMES = OrderedDict( ("nllb-moe", "NllbMoeModel"), ("nystromformer", "NystromformerModel"), ("olmo", "OlmoModel"), - ("olmo_1124", "Olmo1124Model"), + ("olmo2", "Olmo2Model"), ("olmoe", "OlmoeModel"), ("omdet-turbo", "OmDetTurboForObjectDetection"), ("oneformer", "OneFormerModel"), @@ -517,7 +517,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( ("mvp", "MvpForCausalLM"), ("nemotron", "NemotronForCausalLM"), ("olmo", "OlmoForCausalLM"), - ("olmo_1124", "Olmo1124ForCausalLM"), + ("olmo2", "Olmo2ForCausalLM"), ("olmoe", "OlmoeForCausalLM"), ("open-llama", "OpenLlamaForCausalLM"), ("openai-gpt", "OpenAIGPTLMHeadModel"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 4ed67df0e8..e246bf3094 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -348,7 +348,7 @@ else: ), ), ("olmo", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), - ("olmo_1124", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ("olmo2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("olmoe", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ( "omdet-turbo", diff --git a/src/transformers/models/olmo_1124/__init__.py b/src/transformers/models/olmo2/__init__.py similarity index 92% rename from src/transformers/models/olmo_1124/__init__.py rename to src/transformers/models/olmo2/__init__.py index 5d4127766c..e2161a4948 100644 --- a/src/transformers/models/olmo_1124/__init__.py +++ b/src/transformers/models/olmo2/__init__.py @@ -18,8 +18,8 @@ from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .configuration_olmo_1124 import * - from .modeling_olmo_1124 import * + from .configuration_olmo2 import * + from .modeling_olmo2 import * else: import sys diff --git a/src/transformers/models/olmo_1124/configuration_olmo_1124.py b/src/transformers/models/olmo2/configuration_olmo2.py similarity index 90% rename from src/transformers/models/olmo_1124/configuration_olmo_1124.py rename to src/transformers/models/olmo2/configuration_olmo2.py index b7f6c57ae4..144520f87e 100644 --- a/src/transformers/models/olmo_1124/configuration_olmo_1124.py +++ b/src/transformers/models/olmo2/configuration_olmo2.py @@ -1,18 +1,18 @@ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from src/transformers/models/olmo_1124/modular_olmo_1124.py. +# This file was automatically generated from src/transformers/models/olmo2/modular_olmo2.py. # Do NOT edit this file manually as any edits will be overwritten by the generation of # the file from the modular. If any change should be done, please apply the change to the -# modular_olmo_1124.py file directly. One of our CI enforces this. +# modular_olmo2.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 from ...configuration_utils import PretrainedConfig -class Olmo1124Config(PretrainedConfig): +class Olmo2Config(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Olmo1124Model`]. It is used to instantiate an OLMo November 2024 + This is the configuration class to store the configuration of a [`Olmo2Model`]. It is used to instantiate an OLMo2 model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the [allenai/Olmo1124-7B-hf](https://huggingface.co/allenai/Olmo1124-7B-hf). + defaults will yield a similar configuration to that of the [allenai/Olmo2-7B-1124-hf](https://huggingface.co/allenai/Olmo2-7B-1124-hf). Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -20,8 +20,8 @@ class Olmo1124Config(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 50304): - Vocabulary size of the Olmo1124 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Olmo1124Model`] + Vocabulary size of the Olmo2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Olmo2Model`] hidden_size (`int`, *optional*, defaults to 4096): Dimension of the hidden representations. intermediate_size (`int`, *optional*, defaults to 11008): @@ -73,20 +73,20 @@ class Olmo1124Config(PretrainedConfig): The epsilon used by the rms normalization layers. ```python - >>> from transformers import Olmo1124Model, Olmo1124Config + >>> from transformers import Olmo2Model, Olmo2Config - >>> # Initializing a Olmo November 2024 7B style configuration - >>> configuration = Olmo1124Config() + >>> # Initializing a Olmo2 7B style configuration + >>> configuration = Olmo2Config() - >>> # Initializing a model from the Olmo November 2024 7B style configuration - >>> model = Olmo1124Model(configuration) + >>> # Initializing a model from the Olmo2 7B style configuration + >>> model = Olmo2Model(configuration) >>> # Accessing the model configuration >>> configuration = model.config ``` """ - model_type = "olmo_1124" + model_type = "olmo2" keys_to_ignore_at_inference = ["past_key_values"] def __init__( @@ -163,4 +163,4 @@ class Olmo1124Config(PretrainedConfig): raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") -__all__ = ["Olmo1124Config"] +__all__ = ["Olmo2Config"] diff --git a/src/transformers/models/olmo_1124/convert_olmo_1124_weights_to_hf.py b/src/transformers/models/olmo2/convert_olmo2_weights_to_hf.py similarity index 80% rename from src/transformers/models/olmo_1124/convert_olmo_1124_weights_to_hf.py rename to src/transformers/models/olmo2/convert_olmo2_weights_to_hf.py index da35fc8301..43837fc14c 100644 --- a/src/transformers/models/olmo_1124/convert_olmo_1124_weights_to_hf.py +++ b/src/transformers/models/olmo2/convert_olmo2_weights_to_hf.py @@ -23,7 +23,7 @@ import torch import yaml from tokenizers import Tokenizer -from transformers import Olmo1124Config, Olmo1124ForCausalLM +from transformers import Olmo2Config, Olmo2ForCausalLM from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast @@ -31,16 +31,16 @@ from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast Sample usage: ``` -python src/transformers/models/olmo_1124/convert_olmo_1124_weights_to_hf.py \ - --input_dir /path/to/downloaded/olmo_1124/weights --model_size 7B --output_dir /output/path +python src/transformers/models/olmo2/convert_olmo2_weights_to_hf.py \ + --input_dir /path/to/downloaded/olmo2/weights --model_size 7B --output_dir /output/path ``` Thereafter, models can be loaded via: ```py -from transformers import Olmo1124ForCausalLM, AutoTokenizer +from transformers import Olmo2ForCausalLM, AutoTokenizer -model = Olmo1124ForCausalLM.from_pretrained("/output/path") +model = Olmo2ForCausalLM.from_pretrained("/output/path") tokenizer = AutoTokenizer.from_pretrained("/output/path") ``` @@ -77,26 +77,26 @@ def write_model( os.makedirs(tmp_model_path, exist_ok=True) config_path = Path(input_base_path) / "config.yaml" - olmo_1124_config = yaml.safe_load(config_path.read_text())["model"] + olmo2_config = yaml.safe_load(config_path.read_text())["model"] - if not olmo_1124_config.get("attention_layer_norm", False): - raise RuntimeError("OLMo November 2024 checkpoints must have attention layer norm") - if not olmo_1124_config.get("norm_after", False): - raise RuntimeError("OLMo November 2024 checkpoints must set norm_after to True") + if not olmo2_config.get("attention_layer_norm", False): + raise RuntimeError("OLMo2 checkpoints must have attention layer norm") + if not olmo2_config.get("norm_after", False): + raise RuntimeError("OLMo2 checkpoints must set norm_after to True") - n_layers = olmo_1124_config["n_layers"] - n_heads = olmo_1124_config["n_heads"] - dim = olmo_1124_config["d_model"] + n_layers = olmo2_config["n_layers"] + n_heads = olmo2_config["n_heads"] + dim = olmo2_config["d_model"] dims_per_head = dim // n_heads - base = olmo_1124_config["rope_theta"] + base = olmo2_config["rope_theta"] inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) - max_position_embeddings = olmo_1124_config["max_sequence_length"] + max_position_embeddings = olmo2_config["max_sequence_length"] - vocab_size = olmo_1124_config.get("embedding_size", olmo_1124_config["vocab_size"]) + vocab_size = olmo2_config.get("embedding_size", olmo2_config["vocab_size"]) - if olmo_1124_config.get("n_kv_heads", None) is not None: - num_key_value_heads = olmo_1124_config["n_kv_heads"] # for GQA / MQA - elif olmo_1124_config["multi_query_attention"]: # compatibility with other checkpoints + if olmo2_config.get("n_kv_heads", None) is not None: + num_key_value_heads = olmo2_config["n_kv_heads"] # for GQA / MQA + elif olmo2_config["multi_query_attention"]: # compatibility with other checkpoints num_key_value_heads = 1 else: num_key_value_heads = n_heads @@ -167,17 +167,17 @@ def write_model( index_dict["metadata"] = {"total_size": param_count * 2} write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) - if olmo_1124_config.get("mlp_hidden_size", None) is not None: - intermediate_size = olmo_1124_config["mlp_hidden_size"] // 2 + if olmo2_config.get("mlp_hidden_size", None) is not None: + intermediate_size = olmo2_config["mlp_hidden_size"] // 2 else: - intermediate_size = (dim * olmo_1124_config["mlp_ratio"]) // 2 + intermediate_size = (dim * olmo2_config["mlp_ratio"]) // 2 - if fix_eos_token_id and olmo_1124_config["eos_token_id"] == 0: + if fix_eos_token_id and olmo2_config["eos_token_id"] == 0: # Fixing a bug in OLMo where eos token id was incorrectly set print("Changing eos_token_id from 0 to 50279.") - olmo_1124_config["eos_token_id"] = 50279 + olmo2_config["eos_token_id"] = 50279 - config = Olmo1124Config( + config = Olmo2Config( vocab_size=vocab_size, hidden_size=dim, intermediate_size=intermediate_size, @@ -185,11 +185,11 @@ def write_model( num_attention_heads=n_heads, num_key_value_heads=num_key_value_heads, max_position_embeddings=max_position_embeddings, - pad_token_id=olmo_1124_config["pad_token_id"], + pad_token_id=olmo2_config["pad_token_id"], bos_token_id=None, - eos_token_id=olmo_1124_config["eos_token_id"], - tie_word_embeddings=olmo_1124_config["weight_tying"], - rms_norm_eps=olmo_1124_config["layer_norm_eps"], + eos_token_id=olmo2_config["eos_token_id"], + tie_word_embeddings=olmo2_config["weight_tying"], + rms_norm_eps=olmo2_config["layer_norm_eps"], rope_theta=base, ) config.save_pretrained(tmp_model_path) @@ -202,8 +202,8 @@ def write_model( if include_tokenizer: _write_tokenizer(model_path, config, input_base_path, tokenizer_path) - print("Loading the checkpoint in a OLMo November 2024 model.") - model = Olmo1124ForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32, low_cpu_mem_usage=True) + print("Loading the checkpoint in a OLMo2 model.") + model = Olmo2ForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32, low_cpu_mem_usage=True) # Avoid saving this as part of the config. del model.config._name_or_path print("Saving in the Transformers format.") @@ -216,7 +216,7 @@ def write_model( def _write_tokenizer( output_path: Path, - config: Olmo1124Config, + config: Olmo2Config, checkpoint_dir: str, input_tokenizer_path: Path | None, ) -> None: @@ -251,7 +251,7 @@ def main(): parser.add_argument( "--input_dir", required=True, - help="Location of OLMo November 2024 weights, which contains config.yaml and model.pt.", + help="Location of OLMo2 weights, which contains config.yaml and model.pt.", ) parser.add_argument( "--no_tokenizer", @@ -263,7 +263,7 @@ def main(): "--tokenizer_json_path", type=Path, default=None, - help="Location of OLMo November 2024 tokenizer json file. Defaults to what is set in the config file.", + help="Location of OLMo2 tokenizer json file. Defaults to what is set in the config file.", ) parser.add_argument( "--output_dir", diff --git a/src/transformers/models/olmo_1124/modeling_olmo_1124.py b/src/transformers/models/olmo2/modeling_olmo2.py similarity index 91% rename from src/transformers/models/olmo_1124/modeling_olmo_1124.py rename to src/transformers/models/olmo2/modeling_olmo2.py index 5a9cca39b8..bdf53376a1 100644 --- a/src/transformers/models/olmo_1124/modeling_olmo_1124.py +++ b/src/transformers/models/olmo2/modeling_olmo2.py @@ -1,8 +1,8 @@ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from src/transformers/models/olmo_1124/modular_olmo_1124.py. +# This file was automatically generated from src/transformers/models/olmo2/modular_olmo2.py. # Do NOT edit this file manually as any edits will be overwritten by the generation of # the file from the modular. If any change should be done, please apply the change to the -# modular_olmo_1124.py file directly. One of our CI enforces this. +# modular_olmo2.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 import math from typing import List, Optional, Tuple, Union @@ -25,7 +25,7 @@ from ...utils import ( logging, replace_return_docstrings, ) -from .configuration_olmo_1124 import Olmo1124Config +from .configuration_olmo2 import Olmo2Config if is_flash_attn_2_available(): @@ -34,13 +34,13 @@ if is_flash_attn_2_available(): logger = logging.get_logger(__name__) -_CONFIG_FOR_DOC = "Olmo1124Config" +_CONFIG_FOR_DOC = "Olmo2Config" -class Olmo1124RMSNorm(nn.Module): +class Olmo2RMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ - Olmo1124RMSNorm is equivalent to T5LayerNorm + Olmo2RMSNorm is equivalent to T5LayerNorm """ super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) @@ -57,9 +57,9 @@ class Olmo1124RMSNorm(nn.Module): return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmo1124 +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmo2 # TODO(joao): add me back asap :) -class Olmo1124RotaryEmbedding(nn.Module): +class Olmo2RotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): super().__init__() self.scaling_factor = scaling_factor @@ -88,10 +88,10 @@ class Olmo1124RotaryEmbedding(nn.Module): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -# copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Olmo1124 +# copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Olmo2 # TODO(joao): add me back asap :) -class Olmo1124LinearScalingRotaryEmbedding(Olmo1124RotaryEmbedding): - """Olmo1124RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" +class Olmo2LinearScalingRotaryEmbedding(Olmo2RotaryEmbedding): + """Olmo2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" def forward(self, x, position_ids): # difference to the original RoPE: a scaling factor is aplied to the position ids @@ -100,10 +100,10 @@ class Olmo1124LinearScalingRotaryEmbedding(Olmo1124RotaryEmbedding): return cos, sin -# copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Olmo1124 +# copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Olmo2 # TODO(joao): add me back asap :) -class Olmo1124DynamicNTKScalingRotaryEmbedding(Olmo1124RotaryEmbedding): - """Olmo1124RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" +class Olmo2DynamicNTKScalingRotaryEmbedding(Olmo2RotaryEmbedding): + """Olmo2RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" def forward(self, x, position_ids): # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length @@ -167,12 +167,12 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) -class Olmo1124Attention(nn.Module): +class Olmo2Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - # copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ with Llama->Olmo1124 + # copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ with Llama->Olmo2 # TODO(joao): add me back asap :) - def __init__(self, config: Olmo1124Config, layer_idx: Optional[int] = None): + def __init__(self, config: Olmo2Config, layer_idx: Optional[int] = None): super().__init__() self.config = config self.layer_idx = layer_idx @@ -204,12 +204,12 @@ class Olmo1124Attention(nn.Module): self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias) self._init_rope() - self.q_norm = Olmo1124RMSNorm(self.num_heads * self.head_dim, config.rms_norm_eps) - self.k_norm = Olmo1124RMSNorm(self.num_key_value_heads * self.head_dim, config.rms_norm_eps) + self.q_norm = Olmo2RMSNorm(self.num_heads * self.head_dim, config.rms_norm_eps) + self.k_norm = Olmo2RMSNorm(self.num_key_value_heads * self.head_dim, config.rms_norm_eps) def _init_rope(self): if self.config.rope_scaling is None: - self.rotary_emb = Olmo1124RotaryEmbedding( + self.rotary_emb = Olmo2RotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, @@ -218,14 +218,14 @@ class Olmo1124Attention(nn.Module): scaling_type = self.config.rope_scaling["type"] scaling_factor = self.config.rope_scaling["factor"] if scaling_type == "linear": - self.rotary_emb = Olmo1124LinearScalingRotaryEmbedding( + self.rotary_emb = Olmo2LinearScalingRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor, base=self.rope_theta, ) elif scaling_type == "dynamic": - self.rotary_emb = Olmo1124DynamicNTKScalingRotaryEmbedding( + self.rotary_emb = Olmo2DynamicNTKScalingRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor, @@ -295,13 +295,13 @@ class Olmo1124Attention(nn.Module): return attn_output, attn_weights, past_key_value -class Olmo1124FlashAttention2(Olmo1124Attention): +class Olmo2FlashAttention2(Olmo2Attention): """ - Olmo1124 flash attention module. This module inherits from `Olmo1124Attention` as the weights of the module stays + Olmo2 flash attention module. This module inherits from `Olmo2Attention` as the weights of the module stays untouched. The only required change would be on the forward pass where it needs to correctly call the public API of flash attention and deal with padding tokens in case the input contains any of them. - OLMo November 2024 flash attention module. This module inherits from `Olmo1124Attention` as the weights of the module stays + OLMo2 flash attention module. This module inherits from `Olmo2Attention` as the weights of the module stays untouched. The only required change would be on the forward pass where it needs to correctly call the public API of flash attention and deal with padding tokens in case the input contains any of them. """ @@ -403,14 +403,14 @@ class Olmo1124FlashAttention2(Olmo1124Attention): return attn_output, attn_weights, past_key_value -class Olmo1124SdpaAttention(Olmo1124Attention): +class Olmo2SdpaAttention(Olmo2Attention): """ - Olmo1124 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `Olmo1124Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + Olmo2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Olmo2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to SDPA API. """ - # Adapted from Olmo1124Attention.forward + # Adapted from Olmo2Attention.forward def forward( self, hidden_states: torch.Tensor, @@ -424,7 +424,7 @@ class Olmo1124SdpaAttention(Olmo1124Attention): if output_attentions: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. logger.warning_once( - "Olmo1124Model is using Olmo1124SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + "Olmo2Model is using Olmo2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' ) return super().forward( @@ -479,7 +479,7 @@ class Olmo1124SdpaAttention(Olmo1124Attention): return attn_output, None, past_key_value -class Olmo1124MLP(nn.Module): +class Olmo2MLP(nn.Module): def __init__(self, config): super().__init__() self.config = config @@ -494,23 +494,23 @@ class Olmo1124MLP(nn.Module): return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) -OLMO_1124_ATTENTION_CLASSES = { - "eager": Olmo1124Attention, - "flash_attention_2": Olmo1124FlashAttention2, - "sdpa": Olmo1124SdpaAttention, +OLMO2_ATTENTION_CLASSES = { + "eager": Olmo2Attention, + "flash_attention_2": Olmo2FlashAttention2, + "sdpa": Olmo2SdpaAttention, } -class Olmo1124DecoderLayer(nn.Module): - def __init__(self, config: Olmo1124Config, layer_idx: int): +class Olmo2DecoderLayer(nn.Module): + def __init__(self, config: Olmo2Config, layer_idx: int): super().__init__() self.hidden_size = config.hidden_size - self.self_attn = OLMO_1124_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) + self.self_attn = OLMO2_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) - self.mlp = Olmo1124MLP(config) - self.post_attention_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_feedforward_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.mlp = Olmo2MLP(config) + self.post_attention_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_feedforward_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) # copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer.forward # TODO(joao): add me back asap :) @@ -574,7 +574,7 @@ class Olmo1124DecoderLayer(nn.Module): return outputs -OLMO_1124_START_DOCSTRING = r""" +OLMO2_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) @@ -584,7 +584,7 @@ OLMO_1124_START_DOCSTRING = r""" and behavior. Parameters: - config ([`Olmo1124Config`]): + config ([`Olmo2Config`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. @@ -592,14 +592,14 @@ OLMO_1124_START_DOCSTRING = r""" @add_start_docstrings( - "The bare Olmo1124 Model outputting raw hidden-states without any specific head on top.", - OLMO_1124_START_DOCSTRING, + "The bare Olmo2 Model outputting raw hidden-states without any specific head on top.", + OLMO2_START_DOCSTRING, ) -class Olmo1124PreTrainedModel(PreTrainedModel): - config_class = Olmo1124Config +class Olmo2PreTrainedModel(PreTrainedModel): + config_class = Olmo2Config base_model_prefix = "model" supports_gradient_checkpointing = True - _no_split_modules = ["Olmo1124DecoderLayer"] + _no_split_modules = ["Olmo2DecoderLayer"] _skip_keys_device_placement = ["past_key_values"] _supports_flash_attn_2 = True _supports_sdpa = True @@ -619,7 +619,7 @@ class Olmo1124PreTrainedModel(PreTrainedModel): module.weight.data[module.padding_idx].zero_() -OLMO_1124_INPUTS_DOCSTRING = r""" +OLMO2_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide @@ -695,27 +695,27 @@ OLMO_1124_INPUTS_DOCSTRING = r""" @add_start_docstrings( - "The bare Olmo1124 Model outputting raw hidden-states without any specific head on top.", - OLMO_1124_START_DOCSTRING, + "The bare Olmo2 Model outputting raw hidden-states without any specific head on top.", + OLMO2_START_DOCSTRING, ) -class Olmo1124Model(Olmo1124PreTrainedModel): +class Olmo2Model(Olmo2PreTrainedModel): """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Olmo1124DecoderLayer`] + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Olmo2DecoderLayer`] Args: - config: Olmo1124Config + config: Olmo2Config """ - def __init__(self, config: Olmo1124Config): + def __init__(self, config: Olmo2Config): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) self.layers = nn.ModuleList( - [Olmo1124DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + [Olmo2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) - self.norm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.norm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -727,7 +727,7 @@ class Olmo1124Model(Olmo1124PreTrainedModel): def set_input_embeddings(self, value): self.embed_tokens = value - @add_start_docstrings_to_model_forward(OLMO_1124_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(OLMO2_INPUTS_DOCSTRING) # copied from transformers.models.llama.modeling_llama.LlamaModel.forward # TODO(joao): add me back asap :) def forward( @@ -971,13 +971,13 @@ class Olmo1124Model(Olmo1124PreTrainedModel): return causal_mask -# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->OLMO_1124,Llama->Olmo1124 -class Olmo1124ForCausalLM(Olmo1124PreTrainedModel, GenerationMixin): +# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->OLMO2,Llama->Olmo2 +class Olmo2ForCausalLM(Olmo2PreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] - def __init__(self, config: Olmo1124Config): + def __init__(self, config: Olmo2Config): super().__init__(config) - self.model = Olmo1124Model(config) + self.model = Olmo2Model(config) self.vocab_size = config.vocab_size self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) @@ -1002,7 +1002,7 @@ class Olmo1124ForCausalLM(Olmo1124PreTrainedModel, GenerationMixin): def get_decoder(self): return self.model - @add_start_docstrings_to_model_forward(OLMO_1124_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(OLMO2_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) # Ignore copy def forward( @@ -1038,10 +1038,10 @@ class Olmo1124ForCausalLM(Olmo1124PreTrainedModel, GenerationMixin): Example: ```python - >>> from transformers import AutoTokenizer, Olmo1124ForCausalLM + >>> from transformers import AutoTokenizer, Olmo2ForCausalLM - >>> model = Olmo1124ForCausalLM.from_pretrained("allenai/Olmo1124-1B-hf") - >>> tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo1124-1B-hf") + >>> model = Olmo2ForCausalLM.from_pretrained("allenai/Olmo2-1B-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo2-1B-hf") >>> prompt = "Hey, are you conscious? Can you talk to me?" >>> inputs = tokenizer(prompt, return_tensors="pt") @@ -1093,4 +1093,4 @@ class Olmo1124ForCausalLM(Olmo1124PreTrainedModel, GenerationMixin): ) -__all__ = ["Olmo1124ForCausalLM", "Olmo1124Model", "Olmo1124PreTrainedModel"] +__all__ = ["Olmo2ForCausalLM", "Olmo2Model", "Olmo2PreTrainedModel"] diff --git a/src/transformers/models/olmo_1124/modular_olmo_1124.py b/src/transformers/models/olmo2/modular_olmo2.py similarity index 88% rename from src/transformers/models/olmo_1124/modular_olmo_1124.py rename to src/transformers/models/olmo2/modular_olmo2.py index 2305b1f400..393d17c59c 100644 --- a/src/transformers/models/olmo_1124/modular_olmo_1124.py +++ b/src/transformers/models/olmo2/modular_olmo2.py @@ -28,11 +28,11 @@ if is_flash_attn_2_available(): logger = logging.get_logger(__name__) -class Olmo1124Config(OlmoConfig): +class Olmo2Config(OlmoConfig): r""" - This is the configuration class to store the configuration of a [`Olmo1124Model`]. It is used to instantiate an OLMo November 2024 + This is the configuration class to store the configuration of a [`Olmo2Model`]. It is used to instantiate an OLMo2 model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the [allenai/Olmo1124-7B-hf](https://huggingface.co/allenai/Olmo1124-7B-hf). + defaults will yield a similar configuration to that of the [allenai/Olmo2-7B-1124-hf](https://huggingface.co/allenai/Olmo2-7B-1124-hf). Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -40,8 +40,8 @@ class Olmo1124Config(OlmoConfig): Args: vocab_size (`int`, *optional*, defaults to 50304): - Vocabulary size of the Olmo1124 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Olmo1124Model`] + Vocabulary size of the Olmo2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Olmo2Model`] hidden_size (`int`, *optional*, defaults to 4096): Dimension of the hidden representations. intermediate_size (`int`, *optional*, defaults to 11008): @@ -93,20 +93,20 @@ class Olmo1124Config(OlmoConfig): The epsilon used by the rms normalization layers. ```python - >>> from transformers import Olmo1124Model, Olmo1124Config + >>> from transformers import Olmo2Model, Olmo2Config - >>> # Initializing a Olmo November 2024 7B style configuration - >>> configuration = Olmo1124Config() + >>> # Initializing a Olmo2 7B style configuration + >>> configuration = Olmo2Config() - >>> # Initializing a model from the Olmo November 2024 7B style configuration - >>> model = Olmo1124Model(configuration) + >>> # Initializing a model from the Olmo2 7B style configuration + >>> model = Olmo2Model(configuration) >>> # Accessing the model configuration >>> configuration = model.config ``` """ - model_type = "olmo_1124" + model_type = "olmo2" def __init__( self, @@ -157,21 +157,21 @@ class Olmo1124Config(OlmoConfig): del self.clip_qkv -class Olmo1124RMSNorm(LlamaRMSNorm): +class Olmo2RMSNorm(LlamaRMSNorm): pass -ALL_LAYERNORM_LAYERS.append(Olmo1124RMSNorm) +ALL_LAYERNORM_LAYERS.append(Olmo2RMSNorm) -# Olmo1124 attention is identical to OLMo attention except: +# Olmo2 attention is identical to OLMo attention except: # - Norm is applied to attention queries and keys. # - No qkv clipping. -class Olmo1124Attention(OlmoAttention): - def __init__(self, config: Olmo1124Config, layer_idx: Optional[int] = None): +class Olmo2Attention(OlmoAttention): + def __init__(self, config: Olmo2Config, layer_idx: Optional[int] = None): super().__init__(config, layer_idx=layer_idx) - self.q_norm = Olmo1124RMSNorm(self.num_heads * self.head_dim, config.rms_norm_eps) - self.k_norm = Olmo1124RMSNorm(self.num_key_value_heads * self.head_dim, config.rms_norm_eps) + self.q_norm = Olmo2RMSNorm(self.num_heads * self.head_dim, config.rms_norm_eps) + self.k_norm = Olmo2RMSNorm(self.num_key_value_heads * self.head_dim, config.rms_norm_eps) def forward( self, @@ -234,15 +234,15 @@ class Olmo1124Attention(OlmoAttention): return attn_output, attn_weights, past_key_value -class Olmo1124FlashAttention2(OlmoFlashAttention2, Olmo1124Attention): +class Olmo2FlashAttention2(OlmoFlashAttention2, Olmo2Attention): """ - OLMo November 2024 flash attention module. This module inherits from `Olmo1124Attention` as the weights of the module stays + OLMo2 flash attention module. This module inherits from `Olmo2Attention` as the weights of the module stays untouched. The only required change would be on the forward pass where it needs to correctly call the public API of flash attention and deal with padding tokens in case the input contains any of them. """ def __init__(self, *args, **kwargs): - Olmo1124Attention.__init__(*args, **kwargs) + Olmo2Attention.__init__(*args, **kwargs) # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. @@ -338,8 +338,8 @@ class Olmo1124FlashAttention2(OlmoFlashAttention2, Olmo1124Attention): return attn_output, attn_weights, past_key_value -class Olmo1124SdpaAttention(OlmoSdpaAttention, Olmo1124Attention): - # Adapted from Olmo1124Attention.forward +class Olmo2SdpaAttention(OlmoSdpaAttention, Olmo2Attention): + # Adapted from Olmo2Attention.forward def forward( self, hidden_states: torch.Tensor, @@ -353,7 +353,7 @@ class Olmo1124SdpaAttention(OlmoSdpaAttention, Olmo1124Attention): if output_attentions: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. logger.warning_once( - "Olmo1124Model is using Olmo1124SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + "Olmo2Model is using Olmo2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' ) return super().forward( @@ -408,14 +408,14 @@ class Olmo1124SdpaAttention(OlmoSdpaAttention, Olmo1124Attention): return attn_output, None, past_key_value -# The OLMo November 2024 layers are identical to those of the OLMo model except: +# The OLMo2 layers are identical to those of the OLMo model except: # - RMSNorm is used instead of standard layer norm. # - Norm is applied after attention/feedforward rather than before. -class Olmo1124DecoderLayer(OlmoDecoderLayer): - def __init__(self, config: Olmo1124Config, layer_idx: int): +class Olmo2DecoderLayer(OlmoDecoderLayer): + def __init__(self, config: Olmo2Config, layer_idx: int): super().__init__(config, layer_idx=layer_idx) - self.post_attention_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_feedforward_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_feedforward_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) del self.input_layernorm def forward( @@ -459,31 +459,31 @@ class Olmo1124DecoderLayer(OlmoDecoderLayer): return outputs -class Olmo1124PreTrainedModel(OlmoPreTrainedModel): +class Olmo2PreTrainedModel(OlmoPreTrainedModel): pass -# The OLMo November 2024 model is identical to the OLMo model, except RMSNorm is used instead of +# The OLMo2 model is identical to the OLMo model, except RMSNorm is used instead of # standard layer norm for the output norm. -class Olmo1124Model(OlmoModel): - def __init__(self, config: Olmo1124Config): +class Olmo2Model(OlmoModel): + def __init__(self, config: Olmo2Config): super().__init__(config) self.layers = nn.ModuleList( - [Olmo1124DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + [Olmo2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) - self.norm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.norm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) # The heads now only need to redefine the model inside to the correct `RobertaModel` -class Olmo1124ForCausalLM(OlmoForCausalLM): - def __init__(self, config: Olmo1124Config): +class Olmo2ForCausalLM(OlmoForCausalLM): + def __init__(self, config: Olmo2Config): super().__init__(config) - self.model = Olmo1124Model(config) + self.model = Olmo2Model(config) __all__ = [ - "Olmo1124Config", - "Olmo1124ForCausalLM", - "Olmo1124Model", - "Olmo1124PreTrainedModel", + "Olmo2Config", + "Olmo2ForCausalLM", + "Olmo2Model", + "Olmo2PreTrainedModel", ] diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 3bf6d6eb28..1238f05878 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -6758,21 +6758,21 @@ class OlmoPreTrainedModel(metaclass=DummyObject): requires_backends(self, ["torch"]) -class Olmo1124ForCausalLM(metaclass=DummyObject): +class Olmo2ForCausalLM(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Olmo1124Model(metaclass=DummyObject): +class Olmo2Model(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class Olmo1124PreTrainedModel(metaclass=DummyObject): +class Olmo2PreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/models/olmo_1124/__init__.py b/tests/models/olmo2/__init__.py similarity index 100% rename from tests/models/olmo_1124/__init__.py rename to tests/models/olmo2/__init__.py diff --git a/tests/models/olmo_1124/test_modeling_olmo_1124.py b/tests/models/olmo2/test_modeling_olmo2.py similarity index 91% rename from tests/models/olmo_1124/test_modeling_olmo_1124.py rename to tests/models/olmo2/test_modeling_olmo2.py index 9dad7932f1..fe6dcfdb54 100644 --- a/tests/models/olmo_1124/test_modeling_olmo_1124.py +++ b/tests/models/olmo2/test_modeling_olmo2.py @@ -12,14 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Testing suite for the PyTorch OLMo November 2024 model.""" +"""Testing suite for the PyTorch OLMo2 model.""" import unittest from packaging import version from parameterized import parameterized -from transformers import Olmo1124Config, is_torch_available, set_seed +from transformers import Olmo2Config, is_torch_available, set_seed from transformers.generation.configuration_utils import GenerationConfig from transformers.models.auto.tokenization_auto import AutoTokenizer from transformers.testing_utils import ( @@ -39,12 +39,12 @@ if is_torch_available(): import torch from transformers import ( - Olmo1124ForCausalLM, - Olmo1124Model, + Olmo2ForCausalLM, + Olmo2Model, ) -class Olmo1124ModelTester: +class Olmo2ModelTester: def __init__( self, parent, @@ -119,7 +119,7 @@ class Olmo1124ModelTester: return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels def get_config(self): - return Olmo1124Config( + return Olmo2Config( vocab_size=self.vocab_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, @@ -138,7 +138,7 @@ class Olmo1124ModelTester: def create_and_check_model( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = Olmo1124Model(config=config) + model = Olmo2Model(config=config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=input_mask) @@ -158,7 +158,7 @@ class Olmo1124ModelTester: encoder_attention_mask, ): config.add_cross_attention = True - model = Olmo1124Model(config) + model = Olmo2Model(config) model.to(torch_device) model.eval() result = model( @@ -187,7 +187,7 @@ class Olmo1124ModelTester: encoder_hidden_states, encoder_attention_mask, ): - model = Olmo1124ForCausalLM(config=config) + model = Olmo2ForCausalLM(config=config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=input_mask, labels=token_labels) @@ -207,7 +207,7 @@ class Olmo1124ModelTester: ): config.is_decoder = True config.add_cross_attention = True - model = Olmo1124ForCausalLM(config=config) + model = Olmo2ForCausalLM(config=config) model.to(torch_device) model.eval() @@ -271,13 +271,13 @@ class Olmo1124ModelTester: @require_torch -class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (Olmo1124Model, Olmo1124ForCausalLM) if is_torch_available() else () - all_generative_model_classes = (Olmo1124ForCausalLM,) if is_torch_available() else () +class Olmo2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (Olmo2Model, Olmo2ForCausalLM) if is_torch_available() else () + all_generative_model_classes = (Olmo2ForCausalLM,) if is_torch_available() else () pipeline_model_mapping = ( { - "feature-extraction": Olmo1124Model, - "text-generation": Olmo1124ForCausalLM, + "feature-extraction": Olmo2Model, + "text-generation": Olmo2ForCausalLM, } if is_torch_available() else {} @@ -290,8 +290,8 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM model_split_percents = [0.5, 0.7, 0.8] def setUp(self): - self.model_tester = Olmo1124ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Olmo1124Config, hidden_size=37) + self.model_tester = Olmo2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=Olmo2Config, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() @@ -300,7 +300,7 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="OLMo November 2024 does not support head pruning.") + @unittest.skip(reason="OLMo2 does not support head pruning.") def test_headmasking(self): pass @@ -310,7 +310,7 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="OLMo November 2024 buffers include complex numbers, which breaks this test") + @unittest.skip(reason="OLMo2 buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass @@ -321,7 +321,7 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = Olmo1124Model(config) + original_model = Olmo2Model(config) original_model.to(torch_device) original_model.eval() original_short_output = original_model(short_input).last_hidden_state @@ -329,7 +329,7 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM set_seed(42) # Fixed seed at init time so the two models get the same random weights config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = Olmo1124Model(config) + scaled_model = Olmo2Model(config) scaled_model.to(torch_device) scaled_model.eval() scaled_short_output = scaled_model(short_input).last_hidden_state @@ -347,11 +347,11 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM @require_torch -class Olmo1124IntegrationTest(unittest.TestCase): +class Olmo2IntegrationTest(unittest.TestCase): @slow def test_model_7b_logits(self): input_ids = [[1, 306, 4658, 278, 6593, 310, 2834, 338]] - model = Olmo1124ForCausalLM.from_pretrained("shanearora/OLMo-7B-1124-hf", device_map="auto") + model = Olmo2ForCausalLM.from_pretrained("shanearora/OLMo2-7B-1124-hf", device_map="auto") out = model(torch.tensor(input_ids)).logits.float() # Expected mean on dim = -1 EXPECTED_MEAN = torch.tensor( @@ -366,8 +366,8 @@ class Olmo1124IntegrationTest(unittest.TestCase): def test_model_7b_greedy_generation(self): EXPECTED_TEXT_COMPLETION = """Simply put, the theory of relativity states that 1) the speed of light is constant, 2) the speed of light is the fastest speed possible, and 3) the speed of light is the same for all observers, regardless of their relative motion. The theory of relativity is based on the idea that the speed of light is constant. This means that""" prompt = "Simply put, the theory of relativity states that " - tokenizer = AutoTokenizer.from_pretrained("shanearora/OLMo-7B-1124-hf", device_map="auto") - model = Olmo1124ForCausalLM.from_pretrained("shanearora/OLMo-7B-1124-hf", device_map="auto") + tokenizer = AutoTokenizer.from_pretrained("shanearora/OLMo2-7B-1124-hf", device_map="auto") + model = Olmo2ForCausalLM.from_pretrained("shanearora/OLMo2-7B-1124-hf", device_map="auto") input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device) # greedy generation outputs @@ -377,7 +377,7 @@ class Olmo1124IntegrationTest(unittest.TestCase): @require_tokenizers def test_simple_encode_decode(self): - rust_tokenizer = AutoTokenizer.from_pretrained("shanearora/OLMo-7B-1124-hf") + rust_tokenizer = AutoTokenizer.from_pretrained("shanearora/OLMo2-7B-1124-hf") self.assertEqual(rust_tokenizer.encode("This is a test"), [2028, 374, 264, 1296]) self.assertEqual(rust_tokenizer.decode([2028, 374, 264, 1296], skip_special_tokens=True), "This is a test") @@ -414,9 +414,9 @@ class Olmo1124IntegrationTest(unittest.TestCase): convert_and_export_with_cache, ) - olmo_1124_model = "shanearora/OLMo-7B-1124-hf" + olmo2_model = "shanearora/OLMo2-7B-1124-hf" - tokenizer = AutoTokenizer.from_pretrained(olmo_1124_model, pad_token="", padding_side="right") + tokenizer = AutoTokenizer.from_pretrained(olmo2_model, pad_token="", padding_side="right") EXPECTED_TEXT_COMPLETION = [ "Simply put, the theory of relativity states that 1) the speed of light is constant, 2) the speed of light", ] @@ -439,8 +439,8 @@ class Olmo1124IntegrationTest(unittest.TestCase): "max_cache_len": max_generation_length, }, ) - model = Olmo1124ForCausalLM.from_pretrained( - olmo_1124_model, + model = Olmo2ForCausalLM.from_pretrained( + olmo2_model, device_map=device, torch_dtype=dtype, attn_implementation=attn_implementation,