Fix Mamba2 Grouped SSD Support in the torch_forward Path (#37533)

* Fix mamba2 grouped support in bamba torch path

* patch zamba2 and mamba2

* Add a unit test for grouped SSD

* add comment for the new unit test

* add output_size arg value to repeat_interleave calls

* Add comment
This commit is contained in:
Chih-Chieh Yang
2025-04-16 16:16:01 -04:00
committed by GitHub
parent a7d2bbaaa8
commit 4005730044
6 changed files with 18 additions and 10 deletions

View File

@@ -783,8 +783,8 @@ class BambaMixer(nn.Module):
hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
B = B.repeat(1, 1, self.num_heads // self.n_groups, 1) B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
C = C.repeat(1, 1, self.num_heads // self.n_groups, 1) C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size
D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)

View File

@@ -580,8 +580,8 @@ class BambaMixer(nn.Module):
hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
B = B.repeat(1, 1, self.num_heads // self.n_groups, 1) B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
C = C.repeat(1, 1, self.num_heads // self.n_groups, 1) C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size
D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)

View File

@@ -572,8 +572,8 @@ class Mamba2Mixer(nn.Module):
hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
B = B.repeat(1, 1, self.num_heads // self.n_groups, 1) B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
C = C.repeat(1, 1, self.num_heads // self.n_groups, 1) C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size
D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)

View File

@@ -860,8 +860,8 @@ class Zamba2MambaMixer(nn.Module):
hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
B = B.repeat(1, 1, self.num_heads // self.n_groups, 1) B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
C = C.repeat(1, 1, self.num_heads // self.n_groups, 1) C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size
D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)

View File

@@ -630,8 +630,8 @@ class Zamba2MambaMixer(nn.Module):
hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
B = B.repeat(1, 1, self.num_heads // self.n_groups, 1) B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
C = C.repeat(1, 1, self.num_heads // self.n_groups, 1) C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size
D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)

View File

@@ -238,6 +238,14 @@ class Mamba2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
config_and_inputs = self.model_tester.prepare_config_and_inputs() config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_mamba2_slow_vs_fast_forward(*config_and_inputs) self.model_tester.create_and_check_mamba2_slow_vs_fast_forward(*config_and_inputs)
# Grouped-SSD variant of the slow-vs-fast forward check.
# Halves the tester config's n_groups (integer division, in place on the config object
# returned first by prepare_config_and_inputs) and then runs the same
# create_and_check_mamba2_slow_vs_fast_forward comparison, which effectively
# creates a grouped SSD configuration in the mamba2 layers.
# NOTE(review): presumably the tester's default n_groups is >= 2; if it were 1 the
# halving would produce 0 — confirm against the model tester's defaults.
# See https://github.com/huggingface/transformers/pull/37533/
def test_mamba2_slow_vs_fast_forward_grouped(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
config_and_inputs[0].n_groups //= 2
self.model_tester.create_and_check_mamba2_slow_vs_fast_forward(*config_and_inputs)
def test_initialization(self): def test_initialization(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common() config, _ = self.model_tester.prepare_config_and_inputs_for_common()