[Mixtral] Fix loss + nits (#28115)

* default config should not use sliding window
* update the doc
* nits
* add a proper test
* update
* update
* update expected value
* Update src/transformers/tokenization_utils_fast.py
  Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
* convert to float
* average then N**2
* comment
* revert nit
* good to go
* fixup
* Update tests/models/mixtral/test_modeling_mixtral.py
  Co-authored-by: Lysandre Debut <hi@lysand.re>
* revert unrelated change

---------

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Co-authored-by: Lysandre Debut <hi@lysand.re>
2023-12-19 17:31:54 +01:00
parent ac974199c8
commit 4a04b4ccca
3 changed files with 20 additions and 22 deletions
--- a/tests/models/mixtral/test_modeling_mixtral.py
+++ b/tests/models/mixtral/test_modeling_mixtral.py
@@ -469,6 +469,7 @@ class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi

        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.num_labels = 3
+        config.num_local_experts = 8
        config.output_router_logits = True
        input_ids = input_dict["input_ids"]
        attention_mask = input_ids.ne(1).to(torch_device)
@@ -476,8 +477,8 @@ class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
        model.to(torch_device)
        model.eval()
        result = model(input_ids, attention_mask=attention_mask)
-        self.assertEqual(result.router_logits[0].shape, (91, config.num_experts_per_tok))
-        torch.testing.assert_close(result.aux_loss.cpu(), torch.tensor(1, dtype=torch.float32))
+        self.assertEqual(result.router_logits[0].shape, (91, config.num_local_experts))
+        torch.testing.assert_close(result.aux_loss.cpu(), torch.tensor(8, dtype=torch.float32))


@require_torch