From b9f8f863d9f0cb31e64f7cb5f1a11e1441cea9e5 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Tue, 27 May 2025 17:03:55 +0200 Subject: [PATCH] [CSM] update model id (#38211) * update model id * codec_model eval * add processor img * use ungated repo for processor tests --- docs/source/en/model_doc/csm.md | 14 +++++++++----- src/transformers/models/csm/configuration_csm.py | 4 ++-- src/transformers/models/csm/generation_csm.py | 2 +- src/transformers/models/csm/modeling_csm.py | 2 +- src/transformers/models/csm/modular_csm.py | 2 +- src/transformers/models/csm/processing_csm.py | 2 +- tests/models/csm/test_modeling_csm.py | 2 +- tests/models/csm/test_processor_csm.py | 3 +-- 8 files changed, 17 insertions(+), 14 deletions(-) diff --git a/docs/source/en/model_doc/csm.md b/docs/source/en/model_doc/csm.md index 53c24a5eba..833ddb697b 100644 --- a/docs/source/en/model_doc/csm.md +++ b/docs/source/en/model_doc/csm.md @@ -39,7 +39,7 @@ CSM can be used to simply generate speech from a text prompt: import torch from transformers import CsmForConditionalGeneration, AutoProcessor -model_id = "eustlb/csm-1b" +model_id = "sesame/csm-1b" device = "cuda" if torch.cuda.is_available() else "cpu" # load the model and the processor @@ -74,7 +74,7 @@ import torch from transformers import CsmForConditionalGeneration, AutoProcessor from datasets import load_dataset, Audio -model_id = "eustlb/csm-1b" +model_id = "sesame/csm-1b" device = "cuda" if torch.cuda.is_available() else "cpu" # load the model and the processor @@ -119,7 +119,7 @@ import torch from transformers import CsmForConditionalGeneration, AutoProcessor from datasets import load_dataset, Audio -model_id = "eustlb/csm-1b" +model_id = "sesame/csm-1b" device = "cuda" if torch.cuda.is_available() else "cpu" # load the model and the processor @@ -176,7 +176,7 @@ import copy from transformers import CsmForConditionalGeneration, AutoProcessor from datasets import load_dataset -model_id = "eustlb/csm-1b" +model_id = "sesame/csm-1b" device = "cuda" # set logs to ensure no recompilation and graph breaks @@ -308,7 +308,7 @@ CSM Transformers integration supports training! from transformers import CsmForConditionalGeneration, AutoProcessor from datasets import load_dataset, Audio -model_id = "eustlb/csm-1b" +model_id = "sesame/csm-1b" device = "cuda" # load the model and the processor @@ -356,6 +356,10 @@ The original code can be found [here](https://github.com/SesameAILabs/csm). ## CsmProcessor +
+ +
+ [[autodoc]] CsmProcessor - __call__ diff --git a/src/transformers/models/csm/configuration_csm.py b/src/transformers/models/csm/configuration_csm.py index e6d6d2e27c..b13b9d2a87 100644 --- a/src/transformers/models/csm/configuration_csm.py +++ b/src/transformers/models/csm/configuration_csm.py @@ -28,7 +28,7 @@ class CsmDepthDecoderConfig(PretrainedConfig): model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the csm-1b. - e.g. [eustlb/csm-1b](https://huggingface.co/eustlb/csm-1b) + e.g. [sesame/csm-1b](https://huggingface.co/sesame/csm-1b) Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -210,7 +210,7 @@ class CsmConfig(PretrainedConfig): model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the csm-1b. - e.g. [eustlb/csm-1b](https://huggingface.co/eustlb/csm-1b) + e.g. [sesame/csm-1b](https://huggingface.co/sesame/csm-1b) Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. diff --git a/src/transformers/models/csm/generation_csm.py b/src/transformers/models/csm/generation_csm.py index 2fec3ea891..7afc7c2d60 100644 --- a/src/transformers/models/csm/generation_csm.py +++ b/src/transformers/models/csm/generation_csm.py @@ -415,7 +415,7 @@ class CsmGenerationMixin(GenerationMixin): >>> from transformers import CsmProcessor, CsmForConditionalGeneration >>> from datasets import load_dataset, Audio - >>> model_id = "eustlb/csm-1b" + >>> model_id = "sesame/csm-1b" >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu" >>> processor = AutoProcessor.from_pretrained(model_id) diff --git a/src/transformers/models/csm/modeling_csm.py b/src/transformers/models/csm/modeling_csm.py index 7771695654..c0c4f5927a 100644 --- a/src/transformers/models/csm/modeling_csm.py +++ b/src/transformers/models/csm/modeling_csm.py @@ -1113,7 +1113,7 @@ class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin): >>> from transformers import CsmForConditionalGeneration, AutoProcessor >>> from datasets import load_dataset, Audio - >>> model_id = "eustlb/csm-1b" + >>> model_id = "sesame/csm-1b" >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu" >>> processor = AutoProcessor.from_pretrained(model_id) diff --git a/src/transformers/models/csm/modular_csm.py b/src/transformers/models/csm/modular_csm.py index 86483076d3..4322a2a07f 100644 --- a/src/transformers/models/csm/modular_csm.py +++ b/src/transformers/models/csm/modular_csm.py @@ -727,7 +727,7 @@ class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin): >>> from transformers import CsmForConditionalGeneration, AutoProcessor >>> from datasets import load_dataset, Audio - >>> model_id = "eustlb/csm-1b" + >>> model_id = "sesame/csm-1b" >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu" >>> processor = AutoProcessor.from_pretrained(model_id) diff --git a/src/transformers/models/csm/processing_csm.py b/src/transformers/models/csm/processing_csm.py index a0f91a1c3d..ca516d8264 100644 --- a/src/transformers/models/csm/processing_csm.py +++ b/src/transformers/models/csm/processing_csm.py @@ -76,7 +76,7 @@ class CsmProcessor(ProcessorMixin): ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train") audio = ds[0]["audio"]["array"] - processor = CsmProcessor.from_pretrained("eustlb/csm-1b") + processor = CsmProcessor.from_pretrained("sesame/csm-1b") processor( text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|><|begin_of_text|>[1]I'm figuring out my budget.<|end_of_text|>"], diff --git a/tests/models/csm/test_modeling_csm.py b/tests/models/csm/test_modeling_csm.py index 26442ef845..2425ff35ec 100644 --- a/tests/models/csm/test_modeling_csm.py +++ b/tests/models/csm/test_modeling_csm.py @@ -417,7 +417,7 @@ class CsmForConditionalGenerationTest(ModelTesterMixin, GenerationTesterMixin, u class CsmForConditionalGenerationIntegrationTest(unittest.TestCase): def setUp(self): # TODO: @eustlb, update with correct sesame's repo - self.model_checkpoint = "eustlb/csm-1b" + self.model_checkpoint = "sesame/csm-1b" def tearDown(self): cleanup(torch_device, gc_collect=True) diff --git a/tests/models/csm/test_processor_csm.py b/tests/models/csm/test_processor_csm.py index da96381246..dcd344d120 100644 --- a/tests/models/csm/test_processor_csm.py +++ b/tests/models/csm/test_processor_csm.py @@ -37,8 +37,7 @@ class CsmProcessorTest(ProcessorTesterMixin, unittest.TestCase): @classmethod def setUpClass(cls): - # TODO: @eustlb, change for hf-internal-testing/csm-1b - cls.checkpoint = "eustlb/csm-1b" + cls.checkpoint = "hf-internal-testing/namespace-sesame-repo_name_csm-1b" processor = CsmProcessor.from_pretrained(cls.checkpoint) cls.audio_token = processor.audio_token cls.audio_token_id = processor.audio_token_id