[WIP] add deepseek-v3 (#35926)
* init commit * style * take comments into account * add deepseekv3 modeling * remove redundant code * apply make style * apply fix-copies * make format * add init files * rename deepseekv3 into deepseek_v3 based on its model_type * rename deepseekv3 into deepseek_v3 based on its model_type * deepseek-v3 not deepseek_v3 * set model_type as deepseek_v3 * use default docs * apply make * fill type and docstring * add rope_config_validation * use custom DeepseekV3MLP * hold code only for checkpoints congifuration; remove redundant * revise rope yarn for DeepSeek variation * rename DeepSeek-V3 * some refactoring * revise load_hook to work properly; make moe func trainable; use llama instead of mixtral * fix attention forward * use -1 for not-changing dim when to use exapnd * refactor DeepseekV3TopkRouter * use reshape_for_rope instead of load_hook; revise attention forward for TP; rename q_head_dim with qk_head_dim * register pre_hook and hook both * make style * use n_shared_experts * Update src/transformers/models/deepseek_v3/configuration_deepseek_v3.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * add test file * update modeling_file according to modular file * make style * add mapping for DeepseekV3ForSequenceClassification * remove aux_loss_alpha * add deepseek_v3 for perf * add deepseek_v3 * rename test as deepseekv3 * use tiny-deepseek-v3 * remove DeepseekV3ForSequenceClassification * cache before padding * remote output_router_logits * Revert "remote output_router_logits" This reverts commit f264f800d04950390db8413b9efb24cef8186330. 
* remove output_router_logits * make e_score_correction_bias as buffer * skip tests not compatible * make style * make e_score_correction_bias as buffer * use rope_interleave instead of load_hook * skip tests not compatible with MLA * add doc for rope_interleave * fix typo * remove torch.no_grad for selecting topk * fix post merge issue * mrege with main and simplify * nits * final * small fixes * fix * support TP better * stash * changes currently requires * remove synch * more fixes for TP * temp fix for TP : some attention layers's FP8 scales are too small + shared is local colwise and anything is local if FP8 because weights are used * updates to have generation work! * push most of the changes * reorder functions + call for contributions! * update readme * nits * update * ruff was updated on main * merge with main and fix copies * revert unrelated changes * route all tokens to all experts when testing to avoid no gradient iddues * finish fixing all tests * fixup * nit * clean config * last readme changes * nit * do cnit * typo * last nit * one more one more --------- Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: arthur@huggingface.co <arthur@ip-26-0-165-131.ec2.internal>
This commit is contained in:
@@ -345,6 +345,7 @@ _import_structure = {
|
||||
],
|
||||
"models.deberta_v2": ["DebertaV2Config"],
|
||||
"models.decision_transformer": ["DecisionTransformerConfig"],
|
||||
"models.deepseek_v3": ["DeepseekV3Config"],
|
||||
"models.deformable_detr": ["DeformableDetrConfig"],
|
||||
"models.deit": ["DeiTConfig"],
|
||||
"models.deprecated": [],
|
||||
@@ -2023,6 +2024,13 @@ else:
|
||||
"DecisionTransformerPreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.deepseek_v3"].extend(
|
||||
[
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekV3Model",
|
||||
"DeepseekV3PreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.deformable_detr"].extend(
|
||||
[
|
||||
"DeformableDetrForObjectDetection",
|
||||
@@ -5546,6 +5554,9 @@ if TYPE_CHECKING:
|
||||
from .models.decision_transformer import (
|
||||
DecisionTransformerConfig,
|
||||
)
|
||||
from .models.deepseek_v3 import (
|
||||
DeepseekV3Config,
|
||||
)
|
||||
from .models.deformable_detr import (
|
||||
DeformableDetrConfig,
|
||||
)
|
||||
@@ -7175,6 +7186,11 @@ if TYPE_CHECKING:
|
||||
DecisionTransformerModel,
|
||||
DecisionTransformerPreTrainedModel,
|
||||
)
|
||||
from .models.deepseek_v3 import (
|
||||
DeepseekV3ForCausalLM,
|
||||
DeepseekV3Model,
|
||||
DeepseekV3PreTrainedModel,
|
||||
)
|
||||
from .models.deformable_detr import (
|
||||
DeformableDetrForObjectDetection,
|
||||
DeformableDetrModel,
|
||||
|
||||
@@ -291,7 +291,7 @@ def w8a8_block_fp8_matmul_compile(
|
||||
return output.to(output_dtype)
|
||||
|
||||
|
||||
class FP8Linear(nn.Module):
|
||||
class FP8Linear(nn.Linear):
|
||||
dtype = torch.float8_e4m3fn
|
||||
|
||||
def __init__(
|
||||
@@ -304,17 +304,20 @@ class FP8Linear(nn.Module):
|
||||
device=None,
|
||||
activation_scheme="dynamic",
|
||||
):
|
||||
super().__init__()
|
||||
super().__init__(in_features, out_features)
|
||||
self.in_features = in_features
|
||||
self.out_features = out_features
|
||||
|
||||
self.register_buffer("weight", torch.empty(out_features, in_features, dtype=FP8Linear.dtype, device=device))
|
||||
self.weight = torch.nn.Parameter(torch.empty(out_features, in_features, dtype=FP8Linear.dtype, device=device))
|
||||
|
||||
scale_out_features = (out_features + block_size[0] - 1) // block_size[0]
|
||||
scale_in_features = (in_features + block_size[1] - 1) // block_size[1]
|
||||
self.register_buffer(
|
||||
"weight_scale_inv", torch.empty(scale_out_features, scale_in_features, dtype=torch.float32, device=device)
|
||||
)
|
||||
if self.weight.element_size() == 1:
|
||||
scale_out_features = (out_features + block_size[0] - 1) // block_size[0]
|
||||
scale_in_features = (in_features + block_size[1] - 1) // block_size[1]
|
||||
self.weight_scale_inv = nn.Parameter(
|
||||
torch.empty(scale_out_features, scale_in_features, dtype=torch.float32, device=device)
|
||||
)
|
||||
else:
|
||||
self.register_parameter("weight_scale_inv", None)
|
||||
|
||||
self.block_size = block_size
|
||||
|
||||
@@ -330,11 +333,11 @@ class FP8Linear(nn.Module):
|
||||
return F.linear(input, self.weight, self.bias)
|
||||
else:
|
||||
# Context manager used to switch among the available cuda devices
|
||||
with torch.cuda.device(input.device):
|
||||
qinput, scale = act_quant(input, self.block_size[1])
|
||||
# with torch.cuda.device(input.device):
|
||||
qinput, scale = act_quant(input, self.block_size[1])
|
||||
# Blocks the CPU until all CUDA operations on the specified device are complete. It is used to ensure that the results of the
|
||||
# preceding operations are ready before proceeding
|
||||
torch.cuda.synchronize(device=input.device)
|
||||
# torch.cuda.synchronize(device=self.weight.device)
|
||||
with torch.cuda.device(input.device):
|
||||
output = w8a8_block_fp8_matmul_triton(
|
||||
qinput,
|
||||
@@ -344,7 +347,7 @@ class FP8Linear(nn.Module):
|
||||
self.block_size,
|
||||
output_dtype=input.dtype,
|
||||
)
|
||||
torch.cuda.synchronize(device=input.device)
|
||||
torch.cuda.synchronize()
|
||||
if self.bias is not None:
|
||||
output = output + self.bias
|
||||
return output.to(dtype=input.dtype)
|
||||
@@ -352,6 +355,7 @@ class FP8Linear(nn.Module):
|
||||
|
||||
def _replace_with_fp8_linear(
|
||||
model,
|
||||
tp_plan=None,
|
||||
modules_to_not_convert=None,
|
||||
current_key_name=None,
|
||||
quantization_config=None,
|
||||
@@ -378,10 +382,12 @@ def _replace_with_fp8_linear(
|
||||
block_size=quantization_config.weight_block_size,
|
||||
)
|
||||
has_been_replaced = True
|
||||
# when changing a layer the TP PLAN for that layer should be updated. TODO
|
||||
|
||||
if len(list(module.children())) > 0:
|
||||
_, has_been_replaced = _replace_with_fp8_linear(
|
||||
module,
|
||||
tp_plan,
|
||||
modules_to_not_convert,
|
||||
current_key_name,
|
||||
quantization_config,
|
||||
@@ -404,9 +410,9 @@ def replace_with_fp8_linear(
|
||||
if quantization_config.modules_to_not_convert is not None:
|
||||
modules_to_not_convert.extend(quantization_config.modules_to_not_convert)
|
||||
modules_to_not_convert = list(set(modules_to_not_convert))
|
||||
|
||||
model, has_been_replaced = _replace_with_fp8_linear(
|
||||
model,
|
||||
tp_plan=model._tp_plan,
|
||||
modules_to_not_convert=modules_to_not_convert,
|
||||
quantization_config=quantization_config,
|
||||
)
|
||||
|
||||
@@ -231,8 +231,8 @@ class IsolatedParallel(TensorParallelLayer):
|
||||
distribute_module(
|
||||
module,
|
||||
device_mesh,
|
||||
partial(self._prepare_input_fn),
|
||||
partial(self._prepare_output_fn),
|
||||
partial(self._prepare_input_fn, None, None),
|
||||
partial(self._prepare_output_fn, None, None),
|
||||
)
|
||||
|
||||
|
||||
@@ -484,7 +484,12 @@ def add_tensor_parallel_hooks_to_module(model, module, tp_plan, layer_name, curr
|
||||
# 1. We add hooks to the layer being loaded:
|
||||
if current_module_plan is not None:
|
||||
tp_layer = translate_to_torch_parallel_style(current_module_plan)
|
||||
tp_layer.prepare_module_tp(module, device_mesh)
|
||||
try:
|
||||
tp_layer.prepare_module_tp(module, device_mesh)
|
||||
except NotImplementedError as e:
|
||||
print(
|
||||
f"Trying to prepare {layer_name}, but it's not supported. Corresponding module: {module} Fix it's TP plan: {e}"
|
||||
)
|
||||
|
||||
# 2. We add hooks to the parent module if needed
|
||||
if "." in layer_name:
|
||||
@@ -531,6 +536,7 @@ def shard_and_distribute_module(
|
||||
param, empty_param, param_type, param_casting_dtype, is_contiguous, rank, device_mesh
|
||||
)
|
||||
else:
|
||||
# TODO log no plan modules in set
|
||||
param = param[...].to(param_casting_dtype)
|
||||
if is_contiguous:
|
||||
param = param.contiguous()
|
||||
|
||||
@@ -189,13 +189,31 @@ def _compute_yarn_parameters(
|
||||
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
|
||||
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
|
||||
dim = int(head_dim * partial_rotary_factor)
|
||||
max_position_embeddings = config.max_position_embeddings
|
||||
factor = config.rope_scaling["factor"]
|
||||
attention_factor = config.rope_scaling.get("attention_factor")
|
||||
mscale = config.rope_scaling.get("mscale")
|
||||
mscale_all_dim = config.rope_scaling.get("mscale_all_dim")
|
||||
|
||||
# NOTE: DeepSeek-V3 (and potentially other models) modify `max_position_embeddings` and have a
|
||||
# `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
|
||||
# values to compute the default attention scaling factor, instead of using `factor`.
|
||||
if "original_max_position_embeddings" in config.rope_scaling:
|
||||
original_max_position_embeddings = config.rope_scaling["original_max_position_embeddings"]
|
||||
factor = config.max_position_embeddings / original_max_position_embeddings
|
||||
else:
|
||||
original_max_position_embeddings = config.max_position_embeddings
|
||||
|
||||
def get_mscale(scale, mscale=1):
|
||||
if scale <= 1:
|
||||
return 1.0
|
||||
return 0.1 * mscale * math.log(scale) + 1.0
|
||||
|
||||
# Sets the attention factor as suggested in the paper
|
||||
attention_factor = config.rope_scaling.get("attention_factor")
|
||||
if attention_factor is None:
|
||||
attention_factor = 0.1 * math.log(factor) + 1.0
|
||||
if mscale and mscale_all_dim:
|
||||
attention_factor = float(get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dim))
|
||||
else:
|
||||
attention_factor = get_mscale(factor)
|
||||
|
||||
# Optional config options
|
||||
# beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
|
||||
@@ -227,7 +245,7 @@ def _compute_yarn_parameters(
|
||||
inv_freq_extrapolation = 1.0 / pos_freqs
|
||||
inv_freq_interpolation = 1.0 / (factor * pos_freqs)
|
||||
|
||||
low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings)
|
||||
low, high = find_correction_range(beta_fast, beta_slow, dim, base, original_max_position_embeddings)
|
||||
|
||||
# Get n-dimensional rotational scaling corrected for extrapolation
|
||||
inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(device)
|
||||
@@ -235,7 +253,6 @@ def _compute_yarn_parameters(
|
||||
inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
|
||||
+ inv_freq_extrapolation * inv_freq_extrapolation_factor
|
||||
)
|
||||
|
||||
return inv_freq, attention_factor
|
||||
|
||||
|
||||
@@ -425,7 +442,14 @@ def _validate_yarn_parameters(config: PretrainedConfig, ignore_keys: Optional[se
|
||||
rope_scaling = config.rope_scaling
|
||||
rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
|
||||
required_keys = {"rope_type", "factor"}
|
||||
optional_keys = {"attention_factor", "beta_fast", "beta_slow", "original_max_position_embeddings"}
|
||||
optional_keys = {
|
||||
"attention_factor",
|
||||
"beta_fast",
|
||||
"beta_slow",
|
||||
"original_max_position_embeddings",
|
||||
"mscale",
|
||||
"mscale_all_dim",
|
||||
}
|
||||
received_keys = set(rope_scaling.keys())
|
||||
_check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys)
|
||||
|
||||
|
||||
@@ -779,8 +779,7 @@ def _load_state_dict_into_meta_model(
|
||||
device_map_regex = "|".join([re.escape(k) for k in sorted(device_map.keys(), reverse=True)])
|
||||
|
||||
is_quantized = hf_quantizer is not None
|
||||
is_meta_state_dict = shard_file.endswith(".safetensors") and not is_quantized
|
||||
|
||||
is_meta_state_dict = shard_file.endswith(".safetensors")
|
||||
file_pointer = None
|
||||
if is_meta_state_dict:
|
||||
file_pointer = safe_open(shard_file, framework="pt", device=tensor_device)
|
||||
@@ -795,7 +794,7 @@ def _load_state_dict_into_meta_model(
|
||||
serialized_param_name = reverse_renaming_mapping[param_name]
|
||||
param = file_pointer.get_slice(serialized_param_name)
|
||||
else:
|
||||
param = empty_param # It is actually not empty!
|
||||
param = empty_param.to(tensor_device) # It is actually not empty!
|
||||
|
||||
to_contiguous, casting_dtype = _infer_parameter_dtype(
|
||||
model,
|
||||
@@ -813,7 +812,7 @@ def _load_state_dict_into_meta_model(
|
||||
param_name,
|
||||
casting_dtype,
|
||||
to_contiguous,
|
||||
tensor_device, # the rank
|
||||
int(os.environ["RANK"]), # the rank
|
||||
device_mesh,
|
||||
)
|
||||
else:
|
||||
@@ -4102,11 +4101,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
|
||||
raise EnvironmentError("tensor parallel is only supported for `torch>=2.5`.")
|
||||
if not torch.distributed.is_initialized():
|
||||
try:
|
||||
logger.warning("Tensor Parallel requires torch.distributed to be initialized first.")
|
||||
rank = int(os.environ["RANK"])
|
||||
world_size = int(os.environ["WORLD_SIZE"])
|
||||
torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size)
|
||||
torch.cuda.set_device(rank)
|
||||
torch.distributed.init_process_group(
|
||||
"nccl", rank=rank, world_size=world_size, init_method="env://"
|
||||
)
|
||||
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
|
||||
except Exception as e:
|
||||
raise EnvironmentError(
|
||||
"We tried to initialize torch.distributed for you, but it failed, make"
|
||||
@@ -4115,12 +4115,13 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
|
||||
|
||||
# Detect the accelerator on the machine. If no accelerator is available, it returns CPU.
|
||||
device_type = torch._C._get_accelerator().type
|
||||
device_module = torch.get_device_module(device_type)
|
||||
# Get device with index assuming equal number of devices per host
|
||||
tp_device = torch.device(device_type, torch.distributed.get_rank() % device_module.device_count())
|
||||
tp_device = torch.device(device_type, torch.cuda.current_device())
|
||||
if tp_device.index > 0:
|
||||
import sys
|
||||
|
||||
sys.stdout = open(os.devnull, "w")
|
||||
# This is the easiest way to dispatch to the current process device
|
||||
device_map = tp_device
|
||||
|
||||
# Assuming sharding the model onto the world
|
||||
world_size = torch.distributed.get_world_size()
|
||||
device_mesh = torch.distributed.init_device_mesh(tp_device.type, (world_size,))
|
||||
@@ -4871,9 +4872,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
|
||||
expected_keys = hf_quantizer.update_expected_keys(model_to_load, expected_keys, checkpoint_keys)
|
||||
|
||||
# Warmup cuda to load the weights much faster on devices
|
||||
if device_map is not None and hf_quantizer is None:
|
||||
if device_map is not None: # and hf_quantizer is None:
|
||||
expanded_device_map = expand_device_map(device_map, expected_keys)
|
||||
caching_allocator_warmup(model_to_load, expanded_device_map)
|
||||
caching_allocator_warmup(model_to_load, expanded_device_map, factor=2 if hf_quantizer is None else 4)
|
||||
|
||||
error_msgs = []
|
||||
mismatched_keys = []
|
||||
@@ -5834,7 +5835,7 @@ def expand_device_map(device_map, param_names):
|
||||
return new_device_map
|
||||
|
||||
|
||||
def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict):
|
||||
def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict, factor=2):
|
||||
"""This function warm-ups the caching allocator based on the size of the model tensors that will reside on each
|
||||
device. It allows to have one large call to Malloc, instead of recursively calling it later when loading
|
||||
the model, which is actually the loading speed bottleneck.
|
||||
@@ -5865,7 +5866,6 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict):
|
||||
if _torch_distributed_available and torch.distributed.is_initialized()
|
||||
else None
|
||||
)
|
||||
|
||||
total_byte_count = defaultdict(lambda: 0)
|
||||
for param_name, device in accelerator_device_map.items():
|
||||
param = model.get_parameter_or_buffer(param_name)
|
||||
@@ -5886,7 +5886,7 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict):
|
||||
# Allow up to 95% of max device memory
|
||||
byte_count = min(byte_count, int(0.95 * device_memory))
|
||||
# Allocate memory
|
||||
_ = torch.empty(byte_count // 2, dtype=torch.float16, device=device, requires_grad=False)
|
||||
_ = torch.empty(byte_count // factor, dtype=torch.float16, device=device, requires_grad=False)
|
||||
|
||||
|
||||
def get_disk_only_shard_files(device_map, weight_map):
|
||||
|
||||
@@ -71,6 +71,7 @@ from . import (
|
||||
deberta,
|
||||
deberta_v2,
|
||||
decision_transformer,
|
||||
deepseek_v3,
|
||||
deformable_detr,
|
||||
deit,
|
||||
deprecated,
|
||||
|
||||
@@ -89,6 +89,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
|
||||
("deberta", "DebertaConfig"),
|
||||
("deberta-v2", "DebertaV2Config"),
|
||||
("decision_transformer", "DecisionTransformerConfig"),
|
||||
("deepseek_v3", "DeepseekV3Config"),
|
||||
("deformable_detr", "DeformableDetrConfig"),
|
||||
("deit", "DeiTConfig"),
|
||||
("depth_anything", "DepthAnythingConfig"),
|
||||
@@ -423,6 +424,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
|
||||
("deberta", "DeBERTa"),
|
||||
("deberta-v2", "DeBERTa-v2"),
|
||||
("decision_transformer", "Decision Transformer"),
|
||||
("deepseek_v3", "DeepSeek-V3"),
|
||||
("deformable_detr", "Deformable DETR"),
|
||||
("deit", "DeiT"),
|
||||
("deplot", "DePlot"),
|
||||
|
||||
@@ -88,6 +88,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("deberta", "DebertaModel"),
|
||||
("deberta-v2", "DebertaV2Model"),
|
||||
("decision_transformer", "DecisionTransformerModel"),
|
||||
("deepseek_v3", "DeepseekV3Model"),
|
||||
("deformable_detr", "DeformableDetrModel"),
|
||||
("deit", "DeiTModel"),
|
||||
("depth_pro", "DepthProModel"),
|
||||
@@ -514,6 +515,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
|
||||
("ctrl", "CTRLLMHeadModel"),
|
||||
("data2vec-text", "Data2VecTextForCausalLM"),
|
||||
("dbrx", "DbrxForCausalLM"),
|
||||
("deepseek_v3", "DeepseekV3ForCausalLM"),
|
||||
("diffllama", "DiffLlamaForCausalLM"),
|
||||
("electra", "ElectraForCausalLM"),
|
||||
("emu3", "Emu3ForCausalLM"),
|
||||
|
||||
@@ -171,6 +171,13 @@ else:
|
||||
"DebertaV2TokenizerFast" if is_tokenizers_available() else None,
|
||||
),
|
||||
),
|
||||
(
|
||||
"deepseek_v3",
|
||||
(
|
||||
"LlamaTokenizer" if is_sentencepiece_available() else None,
|
||||
"LlamaTokenizerFast" if is_tokenizers_available() else None,
|
||||
),
|
||||
),
|
||||
(
|
||||
"diffllama",
|
||||
(
|
||||
|
||||
27
src/transformers/models/deepseek_v3/__init__.py
Normal file
27
src/transformers/models/deepseek_v3/__init__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ...utils import _LazyModule
|
||||
from ...utils.import_utils import define_import_structure
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .configuration_deepseek_v3 import *
|
||||
from .modeling_deepseek_v3 import *
|
||||
else:
|
||||
import sys
|
||||
|
||||
_file = globals()["__file__"]
|
||||
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
|
||||
247
src/transformers/models/deepseek_v3/configuration_deepseek_v3.py
Normal file
247
src/transformers/models/deepseek_v3/configuration_deepseek_v3.py
Normal file
@@ -0,0 +1,247 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 bzantium and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on the DeepSeekV3 implementations from the DeepSeek AI team. (https://huggingface.co/deepseek-ai/DeepSeek-V3)
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""DeepSeekV3 model configuration"""
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...modeling_rope_utils import rope_config_validation
|
||||
|
||||
|
||||
DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
|
||||
|
||||
|
||||
class DeepseekV3Config(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a DeepSeek
|
||||
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
||||
defaults will yield a similar configuration to that of the DeepSeek-V3.
|
||||
e.g. [bzantium/tiny-deepseek-v3](https://huggingface.co/bzantium/tiny-deepseek-v3)
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 129280):
|
||||
Vocabulary size of the DeepSeek-V3 model. Defines the number of different tokens that can be represented by the
|
||||
`input_ids` passed when calling [`DeepseekV3Model`]
|
||||
hidden_size (`int`, *optional*, defaults to 7168):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 18432):
|
||||
Dimension of the MLP representations.
|
||||
moe_intermediate_size (`int`, *optional*, defaults to 2048):
|
||||
Dimension of the MoE representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 61):
|
||||
Number of hidden layers in the Transformer decoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 128):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
num_key_value_heads (`int`, *optional*, defaults to 128):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1`, the model will use Multi Query Attention (MQA), otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||
`num_attention_heads`.
|
||||
n_shared_experts (`int`, *optional*, defaults to 1):
|
||||
Number of shared experts.
|
||||
n_routed_experts (`int`, *optional*, defaults to 256):
|
||||
Number of routed experts.
|
||||
routed_scaling_factor (`float`, *optional*, defaults to 2.5):
|
||||
Scaling factor for routed experts.
|
||||
kv_lora_rank (`int`, *optional*, defaults to 512):
|
||||
Rank of the LoRA matrices for key and value projections.
|
||||
q_lora_rank (`int`, *optional*, defaults to 1536):
|
||||
Rank of the LoRA matrices for query projections.
|
||||
qk_rope_head_dim (`int`, *optional*, defaults to 64):
|
||||
Dimension of the query/key heads that use rotary position embeddings.
|
||||
v_head_dim (`int`, *optional*, defaults to 128):
|
||||
Dimension of the value heads.
|
||||
qk_nope_head_dim (`int`, *optional*, defaults to 128):
|
||||
Dimension of the query/key heads that don't use rotary position embeddings.
|
||||
n_group (`int`, *optional*, defaults to 8):
|
||||
Number of groups for routed experts.
|
||||
topk_group (`int`, *optional*, defaults to 4):
|
||||
Number of selected groups for each token (ensuring that the selected experts are only within `topk_group` groups).
|
||||
num_experts_per_tok (`int`, *optional*, defaults to 8):
|
||||
Number of selected experts, None means dense model.
|
||||
first_k_dense_replace (`int`, *optional*, defaults to 3):
|
||||
Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
|
||||
\--k dense layers--/
|
||||
norm_topk_prob (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the weights of the routed experts.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||
The non-linear activation function (function or string) in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 4096):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
|
||||
The epsilon used by the rms normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
pad_token_id (`int`, *optional*):
|
||||
Padding token id.
|
||||
bos_token_id (`int`, *optional*, defaults to 0):
|
||||
Beginning of stream token id.
|
||||
eos_token_id (`int`, *optional*, defaults to 1):
|
||||
End of stream token id.
|
||||
pretraining_tp (`int`, *optional*, defaults to 1):
|
||||
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
|
||||
document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
|
||||
necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
|
||||
issue](https://github.com/pytorch/pytorch/issues/76232).
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether to tie weight embeddings
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
rope_scaling (`Dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
|
||||
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
|
||||
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
|
||||
`max_position_embeddings` to the expected new maximum.
|
||||
rope_interleave (`bool`, *optional*, defaults to `True`):
|
||||
Whether to interleave the rotary position embeddings.
|
||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
|
||||
```python
|
||||
>>> from transformers import DeepseekV3Model, DeepseekV3Config
|
||||
|
||||
>>> # Initializing a Deepseek-V3 style configuration
|
||||
>>> configuration = DeepseekV3Config()
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "deepseek_v3"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
base_model_tp_plan = { # TODO: only replicate attention layers when > first_k_dense_replace
|
||||
"layers.*.mlp.experts.*.gate_proj": "local_colwise",
|
||||
"layers.*.mlp.experts.*.up_proj": "local_colwise",
|
||||
"layers.*.mlp.experts.*.down_proj": "local_rowwise",
|
||||
"layers.*.mlp.experts.*": "local", # each expert is wrapped in a module list
|
||||
"layers.*.mlp.shared_experts.gate_proj": "local_colwise",
|
||||
"layers.*.mlp.shared_experts.up_proj": "local_colwise",
|
||||
"layers.*.mlp.shared_experts.down_proj": "local_rowwise",
|
||||
"layers.*.mlp.shared_experts": "local",
|
||||
"layers.*.mlp.gate_proj": "local_colwise",
|
||||
"layers.*.mlp.up_proj": "local_colwise",
|
||||
"layers.*.mlp.down_proj": "local_rowwise",
|
||||
"layers.*.mlp": "gather", # This is the only moment where results are gathered
|
||||
}
|
||||
base_model_pp_plan = {
|
||||
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
|
||||
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
|
||||
"norm": (["hidden_states"], ["hidden_states"]),
|
||||
}
|
||||
|
||||
def __init__(
    self,
    vocab_size=129280,
    hidden_size=7168,
    intermediate_size=18432,
    moe_intermediate_size=2048,
    num_hidden_layers=61,
    num_attention_heads=128,
    num_key_value_heads=128,
    n_shared_experts=1,
    n_routed_experts=256,
    routed_scaling_factor=2.5,
    kv_lora_rank=512,
    q_lora_rank=1536,
    qk_rope_head_dim=64,
    v_head_dim=128,
    qk_nope_head_dim=128,
    n_group=8,
    topk_group=4,
    num_experts_per_tok=8,
    first_k_dense_replace=3,
    norm_topk_prob=True,
    hidden_act="silu",
    max_position_embeddings=4096,
    initializer_range=0.02,
    rms_norm_eps=1e-6,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=0,
    eos_token_id=1,
    pretraining_tp=1,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    rope_interleave=True,
    attention_bias=False,
    attention_dropout=0.0,
    **kwargs,
):
    """Store every hyper-parameter on the instance, validate the rotary settings, then delegate the special
    token ids, `tie_word_embeddings` and any remaining `kwargs` to the parent initializer."""
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.moe_intermediate_size = moe_intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.n_shared_experts = n_shared_experts
    self.n_routed_experts = n_routed_experts
    self.routed_scaling_factor = routed_scaling_factor
    self.kv_lora_rank = kv_lora_rank
    self.q_lora_rank = q_lora_rank
    self.qk_rope_head_dim = qk_rope_head_dim
    self.v_head_dim = v_head_dim
    self.qk_nope_head_dim = qk_nope_head_dim
    # Derived: full per-head query/key width = non-rotary part + rotary part.
    self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
    # Derived: `head_dim` is set to the rotary width only, not the full query/key width.
    # NOTE(review): presumably so the rotary-embedding module sizes its frequencies for the rotary slice - confirm.
    self.head_dim = qk_rope_head_dim
    self.n_group = n_group
    self.topk_group = topk_group
    self.num_experts_per_tok = num_experts_per_tok
    self.first_k_dense_replace = first_k_dense_replace
    self.norm_topk_prob = norm_topk_prob
    self.rope_interleave = rope_interleave

    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.rms_norm_eps = rms_norm_eps
    self.pretraining_tp = pretraining_tp
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout
    # Validate the correctness of rotary position embeddings parameters
    # BC: if there is a 'type' field, copy it to 'rope_type'.
    # Note: this writes into the `rope_scaling` dict object that was passed in.
    if self.rope_scaling is not None and "type" in self.rope_scaling:
        self.rope_scaling["rope_type"] = self.rope_scaling["type"]
    # Runs after all attributes above are set, since it receives the whole config object.
    rope_config_validation(self)

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
|
||||
|
||||
|
||||
# Public names exported by the configuration module.
__all__ = ["DeepseekV3Config"]
|
||||
1061
src/transformers/models/deepseek_v3/modeling_deepseek_v3.py
Normal file
1061
src/transformers/models/deepseek_v3/modeling_deepseek_v3.py
Normal file
File diff suppressed because it is too large
Load Diff
368
src/transformers/models/deepseek_v3/modular_deepseek_v3.py
Normal file
368
src/transformers/models/deepseek_v3/modular_deepseek_v3.py
Normal file
@@ -0,0 +1,368 @@
|
||||
import math
|
||||
from typing import Callable, Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.checkpoint
|
||||
from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import logging
|
||||
from ..llama.modeling_llama import (
|
||||
LlamaDecoderLayer,
|
||||
LlamaForCausalLM,
|
||||
LlamaModel,
|
||||
LlamaPreTrainedModel,
|
||||
LlamaRMSNorm,
|
||||
LlamaRotaryEmbedding,
|
||||
apply_rotary_pos_emb,
|
||||
eager_attention_forward,
|
||||
rotate_half,
|
||||
)
|
||||
from .configuration_deepseek_v3 import DeepseekV3Config
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class DeepseekV3RMSNorm(LlamaRMSNorm):
    # DeepSeek-V3 name for the Llama RMSNorm layer; nothing is overridden here.
    pass
|
||||
|
||||
|
||||
class DeepseekV3RotaryEmbedding(LlamaRotaryEmbedding):
    # DeepSeek-V3 name for the Llama rotary-embedding module; nothing is overridden here.
    pass
|
||||
|
||||
|
||||
def apply_rotary_pos_emb_interleave(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    r"""
    Apply rotary position embeddings to query/key tensors whose last dimension is stored interleaved.

    The last dimension of `q` and `k` is first reordered from `(x0, y0, x1, y1, ...)` to
    `(x0, x1, ..., y0, y1, ...)` and the usual `x * cos + rotate_half(x) * sin` formula is applied afterwards.

    TODO: use the original freqs_cis computation instead, so that the view + transpose + reshape below is not
    needed. This path is not optimized.

    Args:
        q (`torch.Tensor`): The query tensor; must be 4-dimensional with an even last dimension.
        k (`torch.Tensor`): The key tensor; must be 4-dimensional with an even last dimension.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Accepted for signature compatibility; it is not read by this function.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension along which `cos` and `sin` are unsqueezed so that they broadcast against `q` and `k`.
            For example, with `cos`/`sin` of shape [batch_size, seq_len, head_dim] and `q`/`k` of shape
            [batch_size, heads, seq_len, head_dim], use `unsqueeze_dim=1`.

    Returns:
        `tuple(torch.Tensor)`: the rotated query and key tensors.
    """

    def _group_pair_members(x):
        # (..., d) read as d/2 pairs -> all first pair members, followed by all second pair members
        batch, heads, seq_len, dim = x.shape
        pairs = x.view(batch, heads, seq_len, dim // 2, 2)
        return pairs.transpose(4, 3).reshape(batch, heads, seq_len, dim)

    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    q = _group_pair_members(q)
    k = _group_pair_members(k)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
|
||||
|
||||
|
||||
def yarn_get_mscale(scale=1, mscale=1):
    """Return the YaRN magnitude correction `0.1 * mscale * ln(scale) + 1.0`, or `1.0` when `scale <= 1`."""
    return 1.0 if scale <= 1 else 0.1 * mscale * math.log(scale) + 1.0
|
||||
|
||||
|
||||
class DeepseekV3MLP(nn.Module):
    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free.

    `hidden_size` / `intermediate_size` override the corresponding `config` values when given.
    """

    def __init__(self, config, hidden_size=None, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = hidden_size if hidden_size is not None else config.hidden_size
        self.intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size

        model_dim, inner_dim = self.hidden_size, self.intermediate_size
        self.gate_proj = nn.Linear(model_dim, inner_dim, bias=False)
        self.up_proj = nn.Linear(model_dim, inner_dim, bias=False)
        self.down_proj = nn.Linear(inner_dim, model_dim, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        gate = self.act_fn(self.gate_proj(x))
        return self.down_proj(gate * self.up_proj(x))
|
||||
|
||||
|
||||
class DeepseekV3TopkRouter(nn.Module):
    """Group-limited top-k router.

    Each token is scored against every routed expert with a sigmoid gate. Experts are split into `n_group`
    equal groups, only the best `topk_group` groups stay eligible, and the `top_k` best eligible experts are
    selected. The returned weights are the un-biased sigmoid scores of the selected experts, optionally
    normalized to sum to one, then multiplied by `routed_scaling_factor`.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts
        self.routed_scaling_factor = config.routed_scaling_factor
        self.n_group = config.n_group
        self.topk_group = config.topk_group
        self.norm_topk_prob = config.norm_topk_prob

        # The weight is allocated uninitialized here; it has to be filled by weight init or a checkpoint.
        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size)))
        # Added to the scores for expert *selection* only; kept as a buffer, not a parameter.
        self.register_buffer("e_score_correction_bias", torch.zeros((self.n_routed_experts)))

    @torch.no_grad()
    def get_topk_indices(self, scores):
        """Return, for every row of `scores`, the indices of the `top_k` selected experts (unsorted)."""
        experts_per_group = self.n_routed_experts // self.n_group
        biased_scores = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0)

        # A group is ranked by the sum of its two best biased scores.
        best_two, _ = biased_scores.view(-1, self.n_group, experts_per_group).topk(2, dim=-1)
        group_scores = best_two.sum(dim=-1)
        _, kept_groups = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)

        group_mask = torch.zeros_like(group_scores)
        group_mask.scatter_(1, kept_groups, 1)
        # Broadcast the per-group flag to every expert of that group.
        expert_mask = group_mask.unsqueeze(-1).expand(-1, self.n_group, experts_per_group)
        expert_mask = expert_mask.reshape(-1, self.n_routed_experts)

        # Experts of discarded groups get a score of 0.0 before the final top-k.
        biased_scores = biased_scores.masked_fill(~expert_mask.bool(), 0.0)
        _, topk_indices = torch.topk(biased_scores, k=self.top_k, dim=-1, sorted=False)
        return topk_indices

    def forward(self, hidden_states):
        """Return `(topk_indices, topk_weights)`, each with one row per token and `top_k` columns."""
        tokens = hidden_states.view(-1, self.config.hidden_size)
        # Routing logits are always computed in float32.
        router_logits = F.linear(tokens.type(torch.float32), self.weight.type(torch.float32))
        scores = router_logits.sigmoid()

        topk_indices = self.get_topk_indices(scores)
        topk_weights = scores.gather(1, topk_indices)
        if self.norm_topk_prob:
            topk_weights /= topk_weights.sum(dim=-1, keepdim=True) + 1e-20
        topk_weights = topk_weights * self.routed_scaling_factor
        return topk_indices, topk_weights
|
||||
|
||||
|
||||
class DeepseekV3MoE(nn.Module):
    """
    Mixture-of-experts feed-forward block: `n_routed_experts` routed MLPs selected per token by a top-k router,
    plus a shared-experts MLP that is applied to every token.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        routed_experts = [
            DeepseekV3MLP(config, intermediate_size=config.moe_intermediate_size)
            for _ in range(config.n_routed_experts)
        ]
        self.experts = nn.ModuleList(routed_experts)
        self.gate = DeepseekV3TopkRouter(config)
        # The shared experts are one MLP whose width is n_shared_experts times the per-expert width.
        self.shared_experts = DeepseekV3MLP(
            config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts
        )

    def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
        r"""
        Run every routed expert on the tokens assigned to it and accumulate the weighted outputs per token.

        `hidden_states` is indexed per token along dim 0; `topk_indices` / `topk_weights` hold the selected
        expert ids and their weights for each token.

        CALL FOR CONTRIBUTION! This loops over the experts one by one and is not optimised: the expert weights
        would need to be fused to avoid the loop (DeepSeek has 256 experts).
        """
        num_experts = len(self.experts)
        # Accumulate in the dtype of the routing weights; cast back to the input dtype on return.
        accumulated = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
        # one_hot gives (tokens, top_k, experts); move the expert axis first so it can be indexed per expert.
        assignment = torch.nn.functional.one_hot(topk_indices, num_classes=num_experts).permute(2, 0, 1)

        for expert_idx, expert in enumerate(self.experts):
            token_indices, weight_indices = torch.where(assignment[expert_idx])
            if token_indices.numel() == 0:
                # No token was routed to this expert.
                continue
            routed_weights = topk_weights[token_indices, weight_indices]
            expert_output = expert(hidden_states[token_indices])
            accumulated.index_add_(0, token_indices, expert_output * routed_weights.unsqueeze(-1))

        # in original deepseek, the output of the experts are gathered once we leave this module
        # thus the moe module is itself an IsolatedParallel module
        # and all expert are "local" meaning we shard but we don't gather
        return accumulated.type(hidden_states.dtype)

    def forward(self, hidden_states):
        input_shape = hidden_states.shape
        topk_indices, topk_weights = self.gate(hidden_states)
        flat_tokens = hidden_states.view(-1, input_shape[-1])
        routed_output = self.moe(flat_tokens, topk_indices, topk_weights).view(*input_shape)
        # The shared experts see the original (unflattened) input.
        return routed_output + self.shared_experts(hidden_states)
|
||||
|
||||
|
||||
class DeepseekV3Attention(nn.Module):
    """Attention with low-rank query and key/value projections (MLA).

    Queries go through `q_a_proj` -> RMSNorm -> `q_b_proj`. Keys and values come from one compressed
    projection (`kv_a_proj_with_mqa`) that is split into a latent of width `kv_lora_rank`, expanded per head by
    `kv_b_proj`, and a single rotary key of width `qk_rope_head_dim` that is shared by all heads. Only the
    rotary slices of the queries and keys receive rotary position embeddings.
    """

    def __init__(self, config: DeepseekV3Config, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.attention_dropout = config.attention_dropout
        self.num_heads = config.num_attention_heads
        self.rope_theta = config.rope_theta
        self.q_lora_rank = config.q_lora_rank
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.kv_lora_rank = config.kv_lora_rank
        self.v_head_dim = config.v_head_dim
        self.qk_nope_head_dim = config.qk_nope_head_dim
        self.qk_head_dim = config.qk_head_dim

        self.is_causal = True
        # Query path: hidden -> q_lora_rank -> norm -> num_heads * qk_head_dim
        self.q_a_proj = nn.Linear(config.hidden_size, config.q_lora_rank, bias=config.attention_bias)
        self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank)
        self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False)

        # Key/value path: one projection yields the KV latent plus the shared rotary key.
        self.kv_a_proj_with_mqa = nn.Linear(
            config.hidden_size,
            self.kv_lora_rank + self.qk_rope_head_dim,
            bias=config.attention_bias,
        )
        self.kv_a_layernorm = DeepseekV3RMSNorm(self.kv_lora_rank)
        # Expands the latent into, per head, the non-rotary key part and the value.
        self.kv_b_proj = nn.Linear(
            self.kv_lora_rank,
            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
            bias=False,
        )

        self.o_proj = nn.Linear(
            self.num_heads * self.v_head_dim,
            config.hidden_size,
            bias=config.attention_bias,
        )

        # Softmax scale; multiplied by mscale**2 when `rope_scaling` carries a non-zero "mscale_all_dim".
        # Note: "factor" is read unconditionally, so a `rope_scaling` dict without it raises KeyError here.
        self.scaling = self.qk_head_dim ** (-0.5)
        if self.config.rope_scaling is not None:
            mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
            scaling_factor = self.config.rope_scaling["factor"]
            if mscale_all_dim:
                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
                self.scaling = self.scaling * mscale * mscale

    # NOTE(review): the return annotation lists three elements, but two values are returned below.
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Compute attention for `hidden_states` and return `(attn_output, attn_weights)`.

        `position_embeddings` is the `(cos, sin)` pair applied to the rotary slices. When `past_key_value` is
        given, the new keys/values are passed through its `update` method and the returned tensors are used.
        """
        batch_size, seq_length = hidden_states.shape[:-1]
        query_shape = (batch_size, seq_length, -1, self.qk_head_dim)
        key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim)

        # (batch, heads, seq, qk_head_dim), then split into non-rotary ("pass") and rotary ("rot") parts
        q_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))).view(query_shape).transpose(1, 2)
        q_pass, q_rot = torch.split(q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        k_pass, k_rot = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)

        # Expand the normalized latent per head, then separate the non-rotary key part from the values.
        k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass)).view(key_shape).transpose(1, 2)
        k_pass, value_states = torch.split(k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)

        # The rotary key has a single head at this point.
        k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim)

        cos, sin = position_embeddings
        if self.config.rope_interleave:  # support using interleaved weights for efficiency
            q_rot, k_rot = apply_rotary_pos_emb_interleave(q_rot, k_rot, cos, sin)
        else:
            q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin)
        # Broadcast the single rotary key head to every head (expand, no copy).
        k_rot = k_rot.expand(*k_pass.shape[:-1], -1)

        query_states = torch.cat((q_pass, q_rot), dim=-1)
        key_states = torch.cat((k_pass, k_rot), dim=-1)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # With flash attention, values are zero-padded up to the query/key head width; the padding is sliced
        # off the output again further below. This happens after the cache update, so unpadded values are cached.
        if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
            value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim])

        # Eager attention is the default; any other configured implementation is looked up in the registry,
        # except "sdpa" with `output_attentions=True`, which stays on eager after a one-time warning.
        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        # Drop the zero padding added to the values for flash attention.
        if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
            attn_output = attn_output[:, :, :, : self.v_head_dim]

        # Merge the head and feature dimensions before the output projection.
        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
|
||||
|
||||
|
||||
class DeepseekV3DecoderLayer(LlamaDecoderLayer, nn.Module):
    # Reuses everything except `__init__` from `LlamaDecoderLayer`, with DeepSeek-V3 attention and either a
    # dense MLP (first `first_k_dense_replace` layers) or a mixture-of-experts block (all later layers).

    def __init__(self, config: DeepseekV3Config, layer_idx: int):
        # Initialize the nn.Module state of *this* instance without running LlamaDecoderLayer.__init__,
        # so the Llama attention/MLP submodules are never built. `nn.Module().__init__()` would instead
        # initialize a throwaway module and leave `self` unable to accept the submodules assigned below.
        nn.Module.__init__(self)
        self.hidden_size = config.hidden_size

        self.self_attn = DeepseekV3Attention(config=config, layer_idx=layer_idx)

        if layer_idx >= config.first_k_dense_replace:
            self.mlp = DeepseekV3MoE(config)
        else:
            self.mlp = DeepseekV3MLP(config)

        self.input_layernorm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
|
||||
|
||||
class DeepseekV3PreTrainedModel(LlamaPreTrainedModel):
    # Same as the Llama base class except for weight initialization, which also covers the MoE router.

    def _init_weights(self, module):
        """Initialize `module` in place with a normal distribution of std `config.initializer_range`.

        Linear biases and the embedding row at `padding_idx` are zeroed. Modules of any other type are left
        untouched.
        """
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, DeepseekV3TopkRouter):
            # The router allocates its weight with `torch.empty`, so it must be filled here.
            module.weight.data.normal_(mean=0.0, std=std)
        elif isinstance(module, nn.Parameter):
            # A bare parameter is itself the tensor and has no `.weight` attribute.
            module.data.normal_(mean=0.0, std=std)
|
||||
|
||||
|
||||
class DeepseekV3Model(LlamaModel):
    # Regex of checkpoint keys that may be present without a matching module: anything under `model.layers.61`.
    # With the default `num_hidden_layers=61` the decoder layers are indexed 0-60, so index 61 is one past the end.
    # NOTE(review): presumably an extra layer shipped in the released checkpoint that this model does not use - confirm.
    _keys_to_ignore_on_load_unexpected = [r"model\.layers\.61.*"]
|
||||
|
||||
|
||||
class DeepseekV3ForCausalLM(LlamaForCausalLM):
    # DeepSeek-V3 name for the Llama causal-LM wrapper; nothing is overridden here.
    pass
|
||||
|
||||
|
||||
# Public names exported by the modeling module.
__all__ = [
    "DeepseekV3PreTrainedModel",
    "DeepseekV3Model",
    "DeepseekV3ForCausalLM",
]
|
||||
@@ -2812,6 +2812,27 @@ class DecisionTransformerPreTrainedModel(metaclass=DummyObject):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class DeepseekV3ForCausalLM(metaclass=DummyObject):
    # Placeholder with the same name as the real class; it declares "torch" as its required backend.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        # Accepts any arguments and only runs the backend check.
        # NOTE(review): presumably `requires_backends` raises an informative error when torch is missing - confirm.
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class DeepseekV3Model(metaclass=DummyObject):
    # Placeholder with the same name as the real class; it declares "torch" as its required backend.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        # Accepts any arguments and only runs the backend check.
        # NOTE(review): presumably `requires_backends` raises an informative error when torch is missing - confirm.
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class DeepseekV3PreTrainedModel(metaclass=DummyObject):
    # Placeholder with the same name as the real class; it declares "torch" as its required backend.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        # Accepts any arguments and only runs the backend check.
        # NOTE(review): presumably `requires_backends` raises an informative error when torch is missing - confirm.
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class DeformableDetrForObjectDetection(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user