[CSM] update model id (#38211)

* update model id
* codec_model eval
* add processor img
* use ungated repo for processor tests
2025-05-27 17:03:55 +02:00
parent 07dd6b2495
commit b9f8f863d9
8 changed files with 17 additions and 14 deletions
--- a/docs/source/en/model_doc/csm.md
+++ b/docs/source/en/model_doc/csm.md
@@ -39,7 +39,7 @@ CSM can be used to simply generate speech from a text prompt:
 import torch
 from transformers import CsmForConditionalGeneration, AutoProcessor

-model_id = "eustlb/csm-1b"
+model_id = "sesame/csm-1b"
 device = "cuda" if torch.cuda.is_available() else "cpu"

 # load the model and the processor
@@ -74,7 +74,7 @@ import torch
 from transformers import CsmForConditionalGeneration, AutoProcessor
 from datasets import load_dataset, Audio

-model_id = "eustlb/csm-1b"
+model_id = "sesame/csm-1b"
 device = "cuda" if torch.cuda.is_available() else "cpu"

 # load the model and the processor
@@ -119,7 +119,7 @@ import torch
 from transformers import CsmForConditionalGeneration, AutoProcessor
 from datasets import load_dataset, Audio

-model_id = "eustlb/csm-1b"
+model_id = "sesame/csm-1b"
 device = "cuda" if torch.cuda.is_available() else "cpu"

 # load the model and the processor
@@ -176,7 +176,7 @@ import copy
 from transformers import CsmForConditionalGeneration, AutoProcessor
 from datasets import load_dataset

-model_id = "eustlb/csm-1b"
+model_id = "sesame/csm-1b"
 device = "cuda"

 # set logs to ensure no recompilation and graph breaks
@@ -308,7 +308,7 @@ CSM Transformers integration supports training!
 from transformers import CsmForConditionalGeneration, AutoProcessor
 from datasets import load_dataset, Audio

-model_id = "eustlb/csm-1b"
+model_id = "sesame/csm-1b"
 device = "cuda"

 # load the model and the processor
@@ -356,6 +356,10 @@ The original code can be found [here](https://github.com/SesameAILabs/csm).

 ## CsmProcessor

+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/eustlb/documentation-images/resolve/main/fig1.jpg"/>
+</div>
+
 [[autodoc]] CsmProcessor
    - __call__

--- a/src/transformers/models/csm/configuration_csm.py
+++ b/src/transformers/models/csm/configuration_csm.py
@@ -28,7 +28,7 @@ class CsmDepthDecoderConfig(PretrainedConfig):
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
    a similar configuration to that of the csm-1b.

-    e.g. [eustlb/csm-1b](https://huggingface.co/eustlb/csm-1b)
+    e.g. [sesame/csm-1b](https://huggingface.co/sesame/csm-1b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
@@ -210,7 +210,7 @@ class CsmConfig(PretrainedConfig):
    model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the csm-1b.

-    e.g. [eustlb/csm-1b](https://huggingface.co/eustlb/csm-1b)
+    e.g. [sesame/csm-1b](https://huggingface.co/sesame/csm-1b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
--- a/src/transformers/models/csm/generation_csm.py
+++ b/src/transformers/models/csm/generation_csm.py
@@ -415,7 +415,7 @@ class CsmGenerationMixin(GenerationMixin):
        >>> from transformers import CsmProcessor, CsmForConditionalGeneration
        >>> from datasets import load_dataset, Audio

-        >>> model_id = "eustlb/csm-1b"
+        >>> model_id = "sesame/csm-1b"
        >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

        >>> processor = AutoProcessor.from_pretrained(model_id)
--- a/src/transformers/models/csm/modeling_csm.py
+++ b/src/transformers/models/csm/modeling_csm.py
@@ -1113,7 +1113,7 @@ class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin):
        >>> from transformers import CsmForConditionalGeneration, AutoProcessor
        >>> from datasets import load_dataset, Audio

-        >>> model_id = "eustlb/csm-1b"
+        >>> model_id = "sesame/csm-1b"
        >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

        >>> processor = AutoProcessor.from_pretrained(model_id)
--- a/src/transformers/models/csm/modular_csm.py
+++ b/src/transformers/models/csm/modular_csm.py
@@ -727,7 +727,7 @@ class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin):
        >>> from transformers import CsmForConditionalGeneration, AutoProcessor
        >>> from datasets import load_dataset, Audio

-        >>> model_id = "eustlb/csm-1b"
+        >>> model_id = "sesame/csm-1b"
        >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

        >>> processor = AutoProcessor.from_pretrained(model_id)
--- a/src/transformers/models/csm/processing_csm.py
+++ b/src/transformers/models/csm/processing_csm.py
@@ -76,7 +76,7 @@ class CsmProcessor(ProcessorMixin):
        ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
        audio = ds[0]["audio"]["array"]

-        processor = CsmProcessor.from_pretrained("eustlb/csm-1b")
+        processor = CsmProcessor.from_pretrained("sesame/csm-1b")

        processor(
            text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|><|begin_of_text|>[1]I'm figuring out my budget.<|end_of_text|>"],
--- a/tests/models/csm/test_modeling_csm.py
+++ b/tests/models/csm/test_modeling_csm.py
@@ -417,7 +417,7 @@ class CsmForConditionalGenerationTest(ModelTesterMixin, GenerationTesterMixin, u
 class CsmForConditionalGenerationIntegrationTest(unittest.TestCase):
    def setUp(self):
        # TODO: @eustlb, update with correct sesame's repo
-        self.model_checkpoint = "eustlb/csm-1b"
+        self.model_checkpoint = "sesame/csm-1b"

    def tearDown(self):
        cleanup(torch_device, gc_collect=True)
--- a/tests/models/csm/test_processor_csm.py
+++ b/tests/models/csm/test_processor_csm.py
@@ -37,8 +37,7 @@ class CsmProcessorTest(ProcessorTesterMixin, unittest.TestCase):

    @classmethod
    def setUpClass(cls):
-        # TODO: @eustlb, change for hf-internal-testing/csm-1b
-        cls.checkpoint = "eustlb/csm-1b"
+        cls.checkpoint = "hf-internal-testing/namespace-sesame-repo_name_csm-1b"
        processor = CsmProcessor.from_pretrained(cls.checkpoint)
        cls.audio_token = processor.audio_token
        cls.audio_token_id = processor.audio_token_id