Rename OLMo November to OLMo2 (#34864)

* Rename/move OLMo Nov files to OLMo2

* Rename Olmo1124 and its variants to Olmo2
This commit is contained in:
Shane A
2024-11-25 07:31:22 -08:00
committed by GitHub
parent 1de3598d30
commit 9121ab8fe8
17 changed files with 221 additions and 221 deletions

View File

@@ -516,8 +516,8 @@
title: Nyströmformer
- local: model_doc/olmo
title: OLMo
- local: model_doc/olmo_1124
title: OLMo November 2024
- local: model_doc/olmo2
title: OLMo2
- local: model_doc/olmoe
title: OLMoE
- local: model_doc/open-llama

View File

@@ -240,7 +240,7 @@ Flax), PyTorch, and/or TensorFlow.
| [Nougat](model_doc/nougat) | ✅ | ✅ | ✅ |
| [Nyströmformer](model_doc/nystromformer) | ✅ | ❌ | ❌ |
| [OLMo](model_doc/olmo) | ✅ | ❌ | ❌ |
| [OLMo November 2024](model_doc/olmo_1124) | ✅ | ❌ | ❌ |
| [OLMo2](model_doc/olmo2) | ✅ | ❌ | ❌ |
| [OLMoE](model_doc/olmoe) | ✅ | ❌ | ❌ |
| [OmDet-Turbo](model_doc/omdet-turbo) | ✅ | ❌ | ❌ |
| [OneFormer](model_doc/oneformer) | ✅ | ❌ | ❌ |

View File

@@ -14,11 +14,11 @@ rendered properly in your Markdown viewer.
-->
# OLMo November 2024
# OLMo2
## Overview
The OLMo November 2024 model is a successor of the OLMo model, which was proposed in
The OLMo2 model is the successor of the OLMo model, which was proposed in
[OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838).
The architectural changes from the original OLMo model to this model are:
@@ -31,16 +31,16 @@ This model was contributed by [shanearora](https://huggingface.co/shanearora).
The original code can be found [here](https://github.com/allenai/OLMo/tree/main/olmo).
## Olmo1124Config
## Olmo2Config
[[autodoc]] Olmo1124Config
[[autodoc]] Olmo2Config
## Olmo1124Model
## Olmo2Model
[[autodoc]] Olmo1124Model
[[autodoc]] Olmo2Model
- forward
## Olmo1124ForCausalLM
## Olmo2ForCausalLM
[[autodoc]] Olmo1124ForCausalLM
[[autodoc]] Olmo2ForCausalLM
- forward

View File

@@ -77,7 +77,7 @@ FlashAttention-2 is currently supported for the following architectures:
* [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron)
* [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)
* [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model)
* [OLMo2](https://huggingface.co/docs/transformers/model_doc/olmo2#transformers.Olmo2Model)
* [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel)
* [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel)
* [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)
@@ -261,7 +261,7 @@ For now, Transformers supports SDPA inference and training for the following arc
* [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
* [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)
* [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model)
* [OLMo2](https://huggingface.co/docs/transformers/model_doc/olmo2#transformers.Olmo2Model)
* [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel)
* [OPT](https://huggingface.co/docs/transformers/en/model_doc/opt)
* [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)

View File

@@ -620,7 +620,7 @@ _import_structure = {
"models.nougat": ["NougatProcessor"],
"models.nystromformer": ["NystromformerConfig"],
"models.olmo": ["OlmoConfig"],
"models.olmo_1124": ["Olmo1124Config"],
"models.olmo2": ["Olmo2Config"],
"models.olmoe": ["OlmoeConfig"],
"models.omdet_turbo": [
"OmDetTurboConfig",
@@ -2920,11 +2920,11 @@ else:
"OlmoPreTrainedModel",
]
)
_import_structure["models.olmo_1124"].extend(
_import_structure["models.olmo2"].extend(
[
"Olmo1124ForCausalLM",
"Olmo1124Model",
"Olmo1124PreTrainedModel",
"Olmo2ForCausalLM",
"Olmo2Model",
"Olmo2PreTrainedModel",
]
)
_import_structure["models.olmoe"].extend(
@@ -5514,7 +5514,7 @@ if TYPE_CHECKING:
NystromformerConfig,
)
from .models.olmo import OlmoConfig
from .models.olmo_1124 import Olmo1124Config
from .models.olmo2 import Olmo2Config
from .models.olmoe import OlmoeConfig
from .models.omdet_turbo import (
OmDetTurboConfig,
@@ -7533,10 +7533,10 @@ if TYPE_CHECKING:
OlmoModel,
OlmoPreTrainedModel,
)
from .models.olmo_1124 import (
Olmo1124ForCausalLM,
Olmo1124Model,
Olmo1124PreTrainedModel,
from .models.olmo2 import (
Olmo2ForCausalLM,
Olmo2Model,
Olmo2PreTrainedModel,
)
from .models.olmoe import (
OlmoeForCausalLM,

View File

@@ -177,7 +177,7 @@ from . import (
nougat,
nystromformer,
olmo,
olmo_1124,
olmo2,
olmoe,
omdet_turbo,
oneformer,

View File

@@ -195,7 +195,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("nougat", "VisionEncoderDecoderConfig"),
("nystromformer", "NystromformerConfig"),
("olmo", "OlmoConfig"),
("olmo_1124", "Olmo1124Config"),
("olmo2", "Olmo2Config"),
("olmoe", "OlmoeConfig"),
("omdet-turbo", "OmDetTurboConfig"),
("oneformer", "OneFormerConfig"),
@@ -511,7 +511,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("nougat", "Nougat"),
("nystromformer", "Nyströmformer"),
("olmo", "OLMo"),
("olmo_1124", "OLMo November 2024"),
("olmo2", "OLMo2"),
("olmoe", "OLMoE"),
("omdet-turbo", "OmDet-Turbo"),
("oneformer", "OneFormer"),

View File

@@ -184,7 +184,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("nllb-moe", "NllbMoeModel"),
("nystromformer", "NystromformerModel"),
("olmo", "OlmoModel"),
("olmo_1124", "Olmo1124Model"),
("olmo2", "Olmo2Model"),
("olmoe", "OlmoeModel"),
("omdet-turbo", "OmDetTurboForObjectDetection"),
("oneformer", "OneFormerModel"),
@@ -517,7 +517,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("mvp", "MvpForCausalLM"),
("nemotron", "NemotronForCausalLM"),
("olmo", "OlmoForCausalLM"),
("olmo_1124", "Olmo1124ForCausalLM"),
("olmo2", "Olmo2ForCausalLM"),
("olmoe", "OlmoeForCausalLM"),
("open-llama", "OpenLlamaForCausalLM"),
("openai-gpt", "OpenAIGPTLMHeadModel"),

View File

@@ -348,7 +348,7 @@ else:
),
),
("olmo", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
("olmo_1124", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
("olmo2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
("olmoe", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
(
"omdet-turbo",

View File

@@ -18,8 +18,8 @@ from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_olmo_1124 import *
from .modeling_olmo_1124 import *
from .configuration_olmo2 import *
from .modeling_olmo2 import *
else:
import sys

View File

@@ -1,18 +1,18 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/olmo_1124/modular_olmo_1124.py.
# This file was automatically generated from src/transformers/models/olmo2/modular_olmo2.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_olmo_1124.py file directly. One of our CI enforces this.
# modular_olmo2.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from ...configuration_utils import PretrainedConfig
class Olmo1124Config(PretrainedConfig):
class Olmo2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Olmo1124Model`]. It is used to instantiate an OLMo November 2024
This is the configuration class to store the configuration of an [`Olmo2Model`]. It is used to instantiate an OLMo2
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the [allenai/Olmo1124-7B-hf](https://huggingface.co/allenai/Olmo1124-7B-hf).
defaults will yield a similar configuration to that of the [allenai/Olmo2-7B-1124-hf](https://huggingface.co/allenai/Olmo2-7B-1124-hf).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
@@ -20,8 +20,8 @@ class Olmo1124Config(PretrainedConfig):
Args:
vocab_size (`int`, *optional*, defaults to 50304):
Vocabulary size of the Olmo1124 model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`Olmo1124Model`]
Vocabulary size of the Olmo2 model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`Olmo2Model`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
@@ -73,20 +73,20 @@ class Olmo1124Config(PretrainedConfig):
The epsilon used by the rms normalization layers.
```python
>>> from transformers import Olmo1124Model, Olmo1124Config
>>> from transformers import Olmo2Model, Olmo2Config
>>> # Initializing a Olmo November 2024 7B style configuration
>>> configuration = Olmo1124Config()
>>> # Initializing a Olmo2 7B style configuration
>>> configuration = Olmo2Config()
>>> # Initializing a model from the Olmo November 2024 7B style configuration
>>> model = Olmo1124Model(configuration)
>>> # Initializing a model from the Olmo2 7B style configuration
>>> model = Olmo2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "olmo_1124"
model_type = "olmo2"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
@@ -163,4 +163,4 @@ class Olmo1124Config(PretrainedConfig):
raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
__all__ = ["Olmo1124Config"]
__all__ = ["Olmo2Config"]

View File

@@ -23,7 +23,7 @@ import torch
import yaml
from tokenizers import Tokenizer
from transformers import Olmo1124Config, Olmo1124ForCausalLM
from transformers import Olmo2Config, Olmo2ForCausalLM
from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
@@ -31,16 +31,16 @@ from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
Sample usage:
```
python src/transformers/models/olmo_1124/convert_olmo_1124_weights_to_hf.py \
--input_dir /path/to/downloaded/olmo_1124/weights --model_size 7B --output_dir /output/path
python src/transformers/models/olmo2/convert_olmo2_weights_to_hf.py \
--input_dir /path/to/downloaded/olmo2/weights --model_size 7B --output_dir /output/path
```
Thereafter, models can be loaded via:
```py
from transformers import Olmo1124ForCausalLM, AutoTokenizer
from transformers import Olmo2ForCausalLM, AutoTokenizer
model = Olmo1124ForCausalLM.from_pretrained("/output/path")
model = Olmo2ForCausalLM.from_pretrained("/output/path")
tokenizer = AutoTokenizer.from_pretrained("/output/path")
```
@@ -77,26 +77,26 @@ def write_model(
os.makedirs(tmp_model_path, exist_ok=True)
config_path = Path(input_base_path) / "config.yaml"
olmo_1124_config = yaml.safe_load(config_path.read_text())["model"]
olmo2_config = yaml.safe_load(config_path.read_text())["model"]
if not olmo_1124_config.get("attention_layer_norm", False):
raise RuntimeError("OLMo November 2024 checkpoints must have attention layer norm")
if not olmo_1124_config.get("norm_after", False):
raise RuntimeError("OLMo November 2024 checkpoints must set norm_after to True")
if not olmo2_config.get("attention_layer_norm", False):
raise RuntimeError("OLMo2 checkpoints must have attention layer norm")
if not olmo2_config.get("norm_after", False):
raise RuntimeError("OLMo2 checkpoints must set norm_after to True")
n_layers = olmo_1124_config["n_layers"]
n_heads = olmo_1124_config["n_heads"]
dim = olmo_1124_config["d_model"]
n_layers = olmo2_config["n_layers"]
n_heads = olmo2_config["n_heads"]
dim = olmo2_config["d_model"]
dims_per_head = dim // n_heads
base = olmo_1124_config["rope_theta"]
base = olmo2_config["rope_theta"]
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
max_position_embeddings = olmo_1124_config["max_sequence_length"]
max_position_embeddings = olmo2_config["max_sequence_length"]
vocab_size = olmo_1124_config.get("embedding_size", olmo_1124_config["vocab_size"])
vocab_size = olmo2_config.get("embedding_size", olmo2_config["vocab_size"])
if olmo_1124_config.get("n_kv_heads", None) is not None:
num_key_value_heads = olmo_1124_config["n_kv_heads"] # for GQA / MQA
elif olmo_1124_config["multi_query_attention"]: # compatibility with other checkpoints
if olmo2_config.get("n_kv_heads", None) is not None:
num_key_value_heads = olmo2_config["n_kv_heads"] # for GQA / MQA
elif olmo2_config["multi_query_attention"]: # compatibility with other checkpoints
num_key_value_heads = 1
else:
num_key_value_heads = n_heads
@@ -167,17 +167,17 @@ def write_model(
index_dict["metadata"] = {"total_size": param_count * 2}
write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
if olmo_1124_config.get("mlp_hidden_size", None) is not None:
intermediate_size = olmo_1124_config["mlp_hidden_size"] // 2
if olmo2_config.get("mlp_hidden_size", None) is not None:
intermediate_size = olmo2_config["mlp_hidden_size"] // 2
else:
intermediate_size = (dim * olmo_1124_config["mlp_ratio"]) // 2
intermediate_size = (dim * olmo2_config["mlp_ratio"]) // 2
if fix_eos_token_id and olmo_1124_config["eos_token_id"] == 0:
if fix_eos_token_id and olmo2_config["eos_token_id"] == 0:
# Fixing a bug in OLMo where eos token id was incorrectly set
print("Changing eos_token_id from 0 to 50279.")
olmo_1124_config["eos_token_id"] = 50279
olmo2_config["eos_token_id"] = 50279
config = Olmo1124Config(
config = Olmo2Config(
vocab_size=vocab_size,
hidden_size=dim,
intermediate_size=intermediate_size,
@@ -185,11 +185,11 @@ def write_model(
num_attention_heads=n_heads,
num_key_value_heads=num_key_value_heads,
max_position_embeddings=max_position_embeddings,
pad_token_id=olmo_1124_config["pad_token_id"],
pad_token_id=olmo2_config["pad_token_id"],
bos_token_id=None,
eos_token_id=olmo_1124_config["eos_token_id"],
tie_word_embeddings=olmo_1124_config["weight_tying"],
rms_norm_eps=olmo_1124_config["layer_norm_eps"],
eos_token_id=olmo2_config["eos_token_id"],
tie_word_embeddings=olmo2_config["weight_tying"],
rms_norm_eps=olmo2_config["layer_norm_eps"],
rope_theta=base,
)
config.save_pretrained(tmp_model_path)
@@ -202,8 +202,8 @@ def write_model(
if include_tokenizer:
_write_tokenizer(model_path, config, input_base_path, tokenizer_path)
print("Loading the checkpoint in a OLMo November 2024 model.")
model = Olmo1124ForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32, low_cpu_mem_usage=True)
print("Loading the checkpoint in a OLMo2 model.")
model = Olmo2ForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32, low_cpu_mem_usage=True)
# Avoid saving this as part of the config.
del model.config._name_or_path
print("Saving in the Transformers format.")
@@ -216,7 +216,7 @@ def write_model(
def _write_tokenizer(
output_path: Path,
config: Olmo1124Config,
config: Olmo2Config,
checkpoint_dir: str,
input_tokenizer_path: Path | None,
) -> None:
@@ -251,7 +251,7 @@ def main():
parser.add_argument(
"--input_dir",
required=True,
help="Location of OLMo November 2024 weights, which contains config.yaml and model.pt.",
help="Location of OLMo2 weights, which contains config.yaml and model.pt.",
)
parser.add_argument(
"--no_tokenizer",
@@ -263,7 +263,7 @@ def main():
"--tokenizer_json_path",
type=Path,
default=None,
help="Location of OLMo November 2024 tokenizer json file. Defaults to what is set in the config file.",
help="Location of OLMo2 tokenizer json file. Defaults to what is set in the config file.",
)
parser.add_argument(
"--output_dir",

View File

@@ -1,8 +1,8 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/olmo_1124/modular_olmo_1124.py.
# This file was automatically generated from src/transformers/models/olmo2/modular_olmo2.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_olmo_1124.py file directly. One of our CI enforces this.
# modular_olmo2.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import math
from typing import List, Optional, Tuple, Union
@@ -25,7 +25,7 @@ from ...utils import (
logging,
replace_return_docstrings,
)
from .configuration_olmo_1124 import Olmo1124Config
from .configuration_olmo2 import Olmo2Config
if is_flash_attn_2_available():
@@ -34,13 +34,13 @@ if is_flash_attn_2_available():
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "Olmo1124Config"
_CONFIG_FOR_DOC = "Olmo2Config"
class Olmo1124RMSNorm(nn.Module):
class Olmo2RMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
Olmo1124RMSNorm is equivalent to T5LayerNorm
Olmo2RMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -57,9 +57,9 @@ class Olmo1124RMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmo1124
# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmo2
# TODO(joao): add me back asap :)
class Olmo1124RotaryEmbedding(nn.Module):
class Olmo2RotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
super().__init__()
self.scaling_factor = scaling_factor
@@ -88,10 +88,10 @@ class Olmo1124RotaryEmbedding(nn.Module):
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
# copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Olmo1124
# copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Olmo2
# TODO(joao): add me back asap :)
class Olmo1124LinearScalingRotaryEmbedding(Olmo1124RotaryEmbedding):
"""Olmo1124RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
class Olmo2LinearScalingRotaryEmbedding(Olmo2RotaryEmbedding):
"""Olmo2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
def forward(self, x, position_ids):
# difference to the original RoPE: a scaling factor is applied to the position ids
@@ -100,10 +100,10 @@ class Olmo1124LinearScalingRotaryEmbedding(Olmo1124RotaryEmbedding):
return cos, sin
# copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Olmo1124
# copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Olmo2
# TODO(joao): add me back asap :)
class Olmo1124DynamicNTKScalingRotaryEmbedding(Olmo1124RotaryEmbedding):
"""Olmo1124RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
class Olmo2DynamicNTKScalingRotaryEmbedding(Olmo2RotaryEmbedding):
"""Olmo2RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
def forward(self, x, position_ids):
# difference to the original RoPE: inv_freq is recomputed when the sequence length > original length
@@ -167,12 +167,12 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
class Olmo1124Attention(nn.Module):
class Olmo2Attention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
# copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ with Llama->Olmo1124
# copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ with Llama->Olmo2
# TODO(joao): add me back asap :)
def __init__(self, config: Olmo1124Config, layer_idx: Optional[int] = None):
def __init__(self, config: Olmo2Config, layer_idx: Optional[int] = None):
super().__init__()
self.config = config
self.layer_idx = layer_idx
@@ -204,12 +204,12 @@ class Olmo1124Attention(nn.Module):
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
self._init_rope()
self.q_norm = Olmo1124RMSNorm(self.num_heads * self.head_dim, config.rms_norm_eps)
self.k_norm = Olmo1124RMSNorm(self.num_key_value_heads * self.head_dim, config.rms_norm_eps)
self.q_norm = Olmo2RMSNorm(self.num_heads * self.head_dim, config.rms_norm_eps)
self.k_norm = Olmo2RMSNorm(self.num_key_value_heads * self.head_dim, config.rms_norm_eps)
def _init_rope(self):
if self.config.rope_scaling is None:
self.rotary_emb = Olmo1124RotaryEmbedding(
self.rotary_emb = Olmo2RotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
@@ -218,14 +218,14 @@ class Olmo1124Attention(nn.Module):
scaling_type = self.config.rope_scaling["type"]
scaling_factor = self.config.rope_scaling["factor"]
if scaling_type == "linear":
self.rotary_emb = Olmo1124LinearScalingRotaryEmbedding(
self.rotary_emb = Olmo2LinearScalingRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
elif scaling_type == "dynamic":
self.rotary_emb = Olmo1124DynamicNTKScalingRotaryEmbedding(
self.rotary_emb = Olmo2DynamicNTKScalingRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
@@ -295,13 +295,13 @@ class Olmo1124Attention(nn.Module):
return attn_output, attn_weights, past_key_value
class Olmo1124FlashAttention2(Olmo1124Attention):
class Olmo2FlashAttention2(Olmo2Attention):
"""
Olmo1124 flash attention module. This module inherits from `Olmo1124Attention` as the weights of the module stays
Olmo2 flash attention module. This module inherits from `Olmo2Attention` as the weights of the module stay
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
OLMo November 2024 flash attention module. This module inherits from `Olmo1124Attention` as the weights of the module stays
OLMo2 flash attention module. This module inherits from `Olmo2Attention` as the weights of the module stay
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
@@ -403,14 +403,14 @@ class Olmo1124FlashAttention2(Olmo1124Attention):
return attn_output, attn_weights, past_key_value
class Olmo1124SdpaAttention(Olmo1124Attention):
class Olmo2SdpaAttention(Olmo2Attention):
"""
Olmo1124 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`Olmo1124Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
Olmo2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`Olmo2Attention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
SDPA API.
"""
# Adapted from Olmo1124Attention.forward
# Adapted from Olmo2Attention.forward
def forward(
self,
hidden_states: torch.Tensor,
@@ -424,7 +424,7 @@ class Olmo1124SdpaAttention(Olmo1124Attention):
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
logger.warning_once(
"Olmo1124Model is using Olmo1124SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
"Olmo2Model is using Olmo2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
@@ -479,7 +479,7 @@ class Olmo1124SdpaAttention(Olmo1124Attention):
return attn_output, None, past_key_value
class Olmo1124MLP(nn.Module):
class Olmo2MLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
@@ -494,23 +494,23 @@ class Olmo1124MLP(nn.Module):
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
OLMO_1124_ATTENTION_CLASSES = {
"eager": Olmo1124Attention,
"flash_attention_2": Olmo1124FlashAttention2,
"sdpa": Olmo1124SdpaAttention,
OLMO2_ATTENTION_CLASSES = {
"eager": Olmo2Attention,
"flash_attention_2": Olmo2FlashAttention2,
"sdpa": Olmo2SdpaAttention,
}
class Olmo1124DecoderLayer(nn.Module):
def __init__(self, config: Olmo1124Config, layer_idx: int):
class Olmo2DecoderLayer(nn.Module):
def __init__(self, config: Olmo2Config, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
self.self_attn = OLMO_1124_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
self.self_attn = OLMO2_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
self.mlp = Olmo1124MLP(config)
self.post_attention_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.mlp = Olmo2MLP(config)
self.post_attention_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
# copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer.forward
# TODO(joao): add me back asap :)
@@ -574,7 +574,7 @@ class Olmo1124DecoderLayer(nn.Module):
return outputs
OLMO_1124_START_DOCSTRING = r"""
OLMO2_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
@@ -584,7 +584,7 @@ OLMO_1124_START_DOCSTRING = r"""
and behavior.
Parameters:
config ([`Olmo1124Config`]):
config ([`Olmo2Config`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -592,14 +592,14 @@ OLMO_1124_START_DOCSTRING = r"""
@add_start_docstrings(
"The bare Olmo1124 Model outputting raw hidden-states without any specific head on top.",
OLMO_1124_START_DOCSTRING,
"The bare Olmo2 Model outputting raw hidden-states without any specific head on top.",
OLMO2_START_DOCSTRING,
)
class Olmo1124PreTrainedModel(PreTrainedModel):
config_class = Olmo1124Config
class Olmo2PreTrainedModel(PreTrainedModel):
config_class = Olmo2Config
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["Olmo1124DecoderLayer"]
_no_split_modules = ["Olmo2DecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
_supports_flash_attn_2 = True
_supports_sdpa = True
@@ -619,7 +619,7 @@ class Olmo1124PreTrainedModel(PreTrainedModel):
module.weight.data[module.padding_idx].zero_()
OLMO_1124_INPUTS_DOCSTRING = r"""
OLMO2_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -695,27 +695,27 @@ OLMO_1124_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
"The bare Olmo1124 Model outputting raw hidden-states without any specific head on top.",
OLMO_1124_START_DOCSTRING,
"The bare Olmo2 Model outputting raw hidden-states without any specific head on top.",
OLMO2_START_DOCSTRING,
)
class Olmo1124Model(Olmo1124PreTrainedModel):
class Olmo2Model(Olmo2PreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Olmo1124DecoderLayer`]
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Olmo2DecoderLayer`]
Args:
config: Olmo1124Config
config: Olmo2Config
"""
def __init__(self, config: Olmo1124Config):
def __init__(self, config: Olmo2Config):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
self.layers = nn.ModuleList(
[Olmo1124DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
[Olmo2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.norm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
@@ -727,7 +727,7 @@ class Olmo1124Model(Olmo1124PreTrainedModel):
def set_input_embeddings(self, value):
self.embed_tokens = value
@add_start_docstrings_to_model_forward(OLMO_1124_INPUTS_DOCSTRING)
@add_start_docstrings_to_model_forward(OLMO2_INPUTS_DOCSTRING)
# copied from transformers.models.llama.modeling_llama.LlamaModel.forward
# TODO(joao): add me back asap :)
def forward(
@@ -971,13 +971,13 @@ class Olmo1124Model(Olmo1124PreTrainedModel):
return causal_mask
# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->OLMO_1124,Llama->Olmo1124
class Olmo1124ForCausalLM(Olmo1124PreTrainedModel, GenerationMixin):
# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->OLMO2,Llama->Olmo2
class Olmo2ForCausalLM(Olmo2PreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: Olmo1124Config):
def __init__(self, config: Olmo2Config):
super().__init__(config)
self.model = Olmo1124Model(config)
self.model = Olmo2Model(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
@@ -1002,7 +1002,7 @@ class Olmo1124ForCausalLM(Olmo1124PreTrainedModel, GenerationMixin):
def get_decoder(self):
return self.model
@add_start_docstrings_to_model_forward(OLMO_1124_INPUTS_DOCSTRING)
@add_start_docstrings_to_model_forward(OLMO2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
# Ignore copy
def forward(
@@ -1038,10 +1038,10 @@ class Olmo1124ForCausalLM(Olmo1124PreTrainedModel, GenerationMixin):
Example:
```python
>>> from transformers import AutoTokenizer, Olmo1124ForCausalLM
>>> from transformers import AutoTokenizer, Olmo2ForCausalLM
>>> model = Olmo1124ForCausalLM.from_pretrained("allenai/Olmo1124-1B-hf")
>>> tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo1124-1B-hf")
>>> model = Olmo2ForCausalLM.from_pretrained("allenai/Olmo2-1B-hf")
>>> tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo2-1B-hf")
>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")
@@ -1093,4 +1093,4 @@ class Olmo1124ForCausalLM(Olmo1124PreTrainedModel, GenerationMixin):
)
__all__ = ["Olmo1124ForCausalLM", "Olmo1124Model", "Olmo1124PreTrainedModel"]
__all__ = ["Olmo2ForCausalLM", "Olmo2Model", "Olmo2PreTrainedModel"]

View File

@@ -28,11 +28,11 @@ if is_flash_attn_2_available():
logger = logging.get_logger(__name__)
class Olmo1124Config(OlmoConfig):
class Olmo2Config(OlmoConfig):
r"""
This is the configuration class to store the configuration of a [`Olmo1124Model`]. It is used to instantiate an OLMo November 2024
This is the configuration class to store the configuration of an [`Olmo2Model`]. It is used to instantiate an OLMo2
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the [allenai/Olmo1124-7B-hf](https://huggingface.co/allenai/Olmo1124-7B-hf).
defaults will yield a similar configuration to that of the [allenai/Olmo2-7B-1124-hf](https://huggingface.co/allenai/Olmo2-7B-1124-hf).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
@@ -40,8 +40,8 @@ class Olmo1124Config(OlmoConfig):
Args:
vocab_size (`int`, *optional*, defaults to 50304):
Vocabulary size of the Olmo1124 model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`Olmo1124Model`]
Vocabulary size of the Olmo2 model. Defines the number of different tokens that can be represented by the
`input_ids` passed when calling [`Olmo2Model`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
@@ -93,20 +93,20 @@ class Olmo1124Config(OlmoConfig):
The epsilon used by the rms normalization layers.
```python
>>> from transformers import Olmo1124Model, Olmo1124Config
>>> from transformers import Olmo2Model, Olmo2Config
>>> # Initializing a Olmo November 2024 7B style configuration
>>> configuration = Olmo1124Config()
>>> # Initializing a Olmo2 7B style configuration
>>> configuration = Olmo2Config()
>>> # Initializing a model from the Olmo November 2024 7B style configuration
>>> model = Olmo1124Model(configuration)
>>> # Initializing a model from the Olmo2 7B style configuration
>>> model = Olmo2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "olmo_1124"
model_type = "olmo2"
def __init__(
self,
@@ -157,21 +157,21 @@ class Olmo1124Config(OlmoConfig):
del self.clip_qkv
class Olmo1124RMSNorm(LlamaRMSNorm):
class Olmo2RMSNorm(LlamaRMSNorm):
pass
ALL_LAYERNORM_LAYERS.append(Olmo1124RMSNorm)
ALL_LAYERNORM_LAYERS.append(Olmo2RMSNorm)
# Olmo1124 attention is identical to OLMo attention except:
# Olmo2 attention is identical to OLMo attention except:
# - Norm is applied to attention queries and keys.
# - No qkv clipping.
class Olmo1124Attention(OlmoAttention):
def __init__(self, config: Olmo1124Config, layer_idx: Optional[int] = None):
class Olmo2Attention(OlmoAttention):
def __init__(self, config: Olmo2Config, layer_idx: Optional[int] = None):
super().__init__(config, layer_idx=layer_idx)
self.q_norm = Olmo1124RMSNorm(self.num_heads * self.head_dim, config.rms_norm_eps)
self.k_norm = Olmo1124RMSNorm(self.num_key_value_heads * self.head_dim, config.rms_norm_eps)
self.q_norm = Olmo2RMSNorm(self.num_heads * self.head_dim, config.rms_norm_eps)
self.k_norm = Olmo2RMSNorm(self.num_key_value_heads * self.head_dim, config.rms_norm_eps)
def forward(
self,
@@ -234,15 +234,15 @@ class Olmo1124Attention(OlmoAttention):
return attn_output, attn_weights, past_key_value
class Olmo1124FlashAttention2(OlmoFlashAttention2, Olmo1124Attention):
class Olmo2FlashAttention2(OlmoFlashAttention2, Olmo2Attention):
"""
OLMo November 2024 flash attention module. This module inherits from `Olmo1124Attention` as the weights of the module stays
OLMo2 flash attention module. This module inherits from `Olmo2Attention` as the weights of the module stay
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
Olmo1124Attention.__init__(*args, **kwargs)
Olmo2Attention.__init__(*args, **kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
@@ -338,8 +338,8 @@ class Olmo1124FlashAttention2(OlmoFlashAttention2, Olmo1124Attention):
return attn_output, attn_weights, past_key_value
class Olmo1124SdpaAttention(OlmoSdpaAttention, Olmo1124Attention):
# Adapted from Olmo1124Attention.forward
class Olmo2SdpaAttention(OlmoSdpaAttention, Olmo2Attention):
# Adapted from Olmo2Attention.forward
def forward(
self,
hidden_states: torch.Tensor,
@@ -353,7 +353,7 @@ class Olmo1124SdpaAttention(OlmoSdpaAttention, Olmo1124Attention):
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
logger.warning_once(
"Olmo1124Model is using Olmo1124SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
"Olmo2Model is using Olmo2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
@@ -408,14 +408,14 @@ class Olmo1124SdpaAttention(OlmoSdpaAttention, Olmo1124Attention):
return attn_output, None, past_key_value
# The OLMo November 2024 layers are identical to those of the OLMo model except:
# The OLMo2 layers are identical to those of the OLMo model except:
# - RMSNorm is used instead of standard layer norm.
# - Norm is applied after attention/feedforward rather than before.
class Olmo1124DecoderLayer(OlmoDecoderLayer):
def __init__(self, config: Olmo1124Config, layer_idx: int):
class Olmo2DecoderLayer(OlmoDecoderLayer):
def __init__(self, config: Olmo2Config, layer_idx: int):
super().__init__(config, layer_idx=layer_idx)
self.post_attention_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_attention_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
del self.input_layernorm
def forward(
@@ -459,31 +459,31 @@ class Olmo1124DecoderLayer(OlmoDecoderLayer):
return outputs
class Olmo1124PreTrainedModel(OlmoPreTrainedModel):
class Olmo2PreTrainedModel(OlmoPreTrainedModel):
pass
# The OLMo November 2024 model is identical to the OLMo model, except RMSNorm is used instead of
# The OLMo2 model is identical to the OLMo model, except RMSNorm is used instead of
# standard layer norm for the output norm.
class Olmo1124Model(OlmoModel):
def __init__(self, config: Olmo1124Config):
class Olmo2Model(OlmoModel):
def __init__(self, config: Olmo2Config):
super().__init__(config)
self.layers = nn.ModuleList(
[Olmo1124DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
[Olmo2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.norm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
# The heads now only need to redefine the model inside to the correct `Olmo2Model`
class Olmo1124ForCausalLM(OlmoForCausalLM):
def __init__(self, config: Olmo1124Config):
class Olmo2ForCausalLM(OlmoForCausalLM):
def __init__(self, config: Olmo2Config):
super().__init__(config)
self.model = Olmo1124Model(config)
self.model = Olmo2Model(config)
__all__ = [
"Olmo1124Config",
"Olmo1124ForCausalLM",
"Olmo1124Model",
"Olmo1124PreTrainedModel",
"Olmo2Config",
"Olmo2ForCausalLM",
"Olmo2Model",
"Olmo2PreTrainedModel",
]

View File

@@ -6758,21 +6758,21 @@ class OlmoPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
class Olmo1124ForCausalLM(metaclass=DummyObject):
class Olmo2ForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Olmo1124Model(metaclass=DummyObject):
class Olmo2Model(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class Olmo1124PreTrainedModel(metaclass=DummyObject):
class Olmo2PreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):

View File

@@ -12,14 +12,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch OLMo November 2024 model."""
"""Testing suite for the PyTorch OLMo2 model."""
import unittest
from packaging import version
from parameterized import parameterized
from transformers import Olmo1124Config, is_torch_available, set_seed
from transformers import Olmo2Config, is_torch_available, set_seed
from transformers.generation.configuration_utils import GenerationConfig
from transformers.models.auto.tokenization_auto import AutoTokenizer
from transformers.testing_utils import (
@@ -39,12 +39,12 @@ if is_torch_available():
import torch
from transformers import (
Olmo1124ForCausalLM,
Olmo1124Model,
Olmo2ForCausalLM,
Olmo2Model,
)
class Olmo1124ModelTester:
class Olmo2ModelTester:
def __init__(
self,
parent,
@@ -119,7 +119,7 @@ class Olmo1124ModelTester:
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def get_config(self):
return Olmo1124Config(
return Olmo2Config(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
@@ -138,7 +138,7 @@ class Olmo1124ModelTester:
def create_and_check_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = Olmo1124Model(config=config)
model = Olmo2Model(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask)
@@ -158,7 +158,7 @@ class Olmo1124ModelTester:
encoder_attention_mask,
):
config.add_cross_attention = True
model = Olmo1124Model(config)
model = Olmo2Model(config)
model.to(torch_device)
model.eval()
result = model(
@@ -187,7 +187,7 @@ class Olmo1124ModelTester:
encoder_hidden_states,
encoder_attention_mask,
):
model = Olmo1124ForCausalLM(config=config)
model = Olmo2ForCausalLM(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, labels=token_labels)
@@ -207,7 +207,7 @@ class Olmo1124ModelTester:
):
config.is_decoder = True
config.add_cross_attention = True
model = Olmo1124ForCausalLM(config=config)
model = Olmo2ForCausalLM(config=config)
model.to(torch_device)
model.eval()
@@ -271,13 +271,13 @@ class Olmo1124ModelTester:
@require_torch
class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (Olmo1124Model, Olmo1124ForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (Olmo1124ForCausalLM,) if is_torch_available() else ()
class Olmo2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (Olmo2Model, Olmo2ForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (Olmo2ForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": Olmo1124Model,
"text-generation": Olmo1124ForCausalLM,
"feature-extraction": Olmo2Model,
"text-generation": Olmo2ForCausalLM,
}
if is_torch_available()
else {}
@@ -290,8 +290,8 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
model_split_percents = [0.5, 0.7, 0.8]
def setUp(self):
self.model_tester = Olmo1124ModelTester(self)
self.config_tester = ConfigTester(self, config_class=Olmo1124Config, hidden_size=37)
self.model_tester = Olmo2ModelTester(self)
self.config_tester = ConfigTester(self, config_class=Olmo2Config, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
@@ -300,7 +300,7 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@unittest.skip(reason="OLMo November 2024 does not support head pruning.")
@unittest.skip(reason="OLMo2 does not support head pruning.")
def test_headmasking(self):
pass
@@ -310,7 +310,7 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
config_and_inputs[0].position_embedding_type = type
self.model_tester.create_and_check_model(*config_and_inputs)
@unittest.skip(reason="OLMo November 2024 buffers include complex numbers, which breaks this test")
@unittest.skip(reason="OLMo2 buffers include complex numbers, which breaks this test")
def test_save_load_fast_init_from_base(self):
pass
@@ -321,7 +321,7 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size)
set_seed(42) # Fixed seed at init time so the two models get the same random weights
original_model = Olmo1124Model(config)
original_model = Olmo2Model(config)
original_model.to(torch_device)
original_model.eval()
original_short_output = original_model(short_input).last_hidden_state
@@ -329,7 +329,7 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
set_seed(42) # Fixed seed at init time so the two models get the same random weights
config.rope_scaling = {"type": scaling_type, "factor": 10.0}
scaled_model = Olmo1124Model(config)
scaled_model = Olmo2Model(config)
scaled_model.to(torch_device)
scaled_model.eval()
scaled_short_output = scaled_model(short_input).last_hidden_state
@@ -347,11 +347,11 @@ class Olmo1124ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
@require_torch
class Olmo1124IntegrationTest(unittest.TestCase):
class Olmo2IntegrationTest(unittest.TestCase):
@slow
def test_model_7b_logits(self):
input_ids = [[1, 306, 4658, 278, 6593, 310, 2834, 338]]
model = Olmo1124ForCausalLM.from_pretrained("shanearora/OLMo-7B-1124-hf", device_map="auto")
model = Olmo2ForCausalLM.from_pretrained("shanearora/OLMo2-7B-1124-hf", device_map="auto")
out = model(torch.tensor(input_ids)).logits.float()
# Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor(
@@ -366,8 +366,8 @@ class Olmo1124IntegrationTest(unittest.TestCase):
def test_model_7b_greedy_generation(self):
EXPECTED_TEXT_COMPLETION = """Simply put, the theory of relativity states that 1) the speed of light is constant, 2) the speed of light is the fastest speed possible, and 3) the speed of light is the same for all observers, regardless of their relative motion. The theory of relativity is based on the idea that the speed of light is constant. This means that"""
prompt = "Simply put, the theory of relativity states that "
tokenizer = AutoTokenizer.from_pretrained("shanearora/OLMo-7B-1124-hf", device_map="auto")
model = Olmo1124ForCausalLM.from_pretrained("shanearora/OLMo-7B-1124-hf", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("shanearora/OLMo2-7B-1124-hf", device_map="auto")
model = Olmo2ForCausalLM.from_pretrained("shanearora/OLMo2-7B-1124-hf", device_map="auto")
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
# greedy generation outputs
@@ -377,7 +377,7 @@ class Olmo1124IntegrationTest(unittest.TestCase):
@require_tokenizers
def test_simple_encode_decode(self):
rust_tokenizer = AutoTokenizer.from_pretrained("shanearora/OLMo-7B-1124-hf")
rust_tokenizer = AutoTokenizer.from_pretrained("shanearora/OLMo2-7B-1124-hf")
self.assertEqual(rust_tokenizer.encode("This is a test"), [2028, 374, 264, 1296])
self.assertEqual(rust_tokenizer.decode([2028, 374, 264, 1296], skip_special_tokens=True), "This is a test")
@@ -414,9 +414,9 @@ class Olmo1124IntegrationTest(unittest.TestCase):
convert_and_export_with_cache,
)
olmo_1124_model = "shanearora/OLMo-7B-1124-hf"
olmo2_model = "shanearora/OLMo2-7B-1124-hf"
tokenizer = AutoTokenizer.from_pretrained(olmo_1124_model, pad_token="</s>", padding_side="right")
tokenizer = AutoTokenizer.from_pretrained(olmo2_model, pad_token="</s>", padding_side="right")
EXPECTED_TEXT_COMPLETION = [
"Simply put, the theory of relativity states that 1) the speed of light is constant, 2) the speed of light",
]
@@ -439,8 +439,8 @@ class Olmo1124IntegrationTest(unittest.TestCase):
"max_cache_len": max_generation_length,
},
)
model = Olmo1124ForCausalLM.from_pretrained(
olmo_1124_model,
model = Olmo2ForCausalLM.from_pretrained(
olmo2_model,
device_map=device,
torch_dtype=dtype,
attn_implementation=attn_implementation,