[CSM] update model id (#38211)
* update model id
* codec_model eval
* add processor img
* use ungated repo for processor tests
This commit is contained in:
@@ -39,7 +39,7 @@ CSM can be used to simply generate speech from a text prompt:
|
|||||||
import torch
|
import torch
|
||||||
from transformers import CsmForConditionalGeneration, AutoProcessor
|
from transformers import CsmForConditionalGeneration, AutoProcessor
|
||||||
|
|
||||||
model_id = "eustlb/csm-1b"
|
model_id = "sesame/csm-1b"
|
||||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
|
||||||
# load the model and the processor
|
# load the model and the processor
|
||||||
@@ -74,7 +74,7 @@ import torch
|
|||||||
from transformers import CsmForConditionalGeneration, AutoProcessor
|
from transformers import CsmForConditionalGeneration, AutoProcessor
|
||||||
from datasets import load_dataset, Audio
|
from datasets import load_dataset, Audio
|
||||||
|
|
||||||
model_id = "eustlb/csm-1b"
|
model_id = "sesame/csm-1b"
|
||||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
|
||||||
# load the model and the processor
|
# load the model and the processor
|
||||||
@@ -119,7 +119,7 @@ import torch
|
|||||||
from transformers import CsmForConditionalGeneration, AutoProcessor
|
from transformers import CsmForConditionalGeneration, AutoProcessor
|
||||||
from datasets import load_dataset, Audio
|
from datasets import load_dataset, Audio
|
||||||
|
|
||||||
model_id = "eustlb/csm-1b"
|
model_id = "sesame/csm-1b"
|
||||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
|
||||||
# load the model and the processor
|
# load the model and the processor
|
||||||
@@ -176,7 +176,7 @@ import copy
|
|||||||
from transformers import CsmForConditionalGeneration, AutoProcessor
|
from transformers import CsmForConditionalGeneration, AutoProcessor
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
|
||||||
model_id = "eustlb/csm-1b"
|
model_id = "sesame/csm-1b"
|
||||||
device = "cuda"
|
device = "cuda"
|
||||||
|
|
||||||
# set logs to ensure no recompilation and graph breaks
|
# set logs to ensure no recompilation and graph breaks
|
||||||
@@ -308,7 +308,7 @@ CSM Transformers integration supports training!
|
|||||||
from transformers import CsmForConditionalGeneration, AutoProcessor
|
from transformers import CsmForConditionalGeneration, AutoProcessor
|
||||||
from datasets import load_dataset, Audio
|
from datasets import load_dataset, Audio
|
||||||
|
|
||||||
model_id = "eustlb/csm-1b"
|
model_id = "sesame/csm-1b"
|
||||||
device = "cuda"
|
device = "cuda"
|
||||||
|
|
||||||
# load the model and the processor
|
# load the model and the processor
|
||||||
@@ -356,6 +356,10 @@ The original code can be found [here](https://github.com/SesameAILabs/csm).
|
|||||||
|
|
||||||
## CsmProcessor
|
## CsmProcessor
|
||||||
|
|
||||||
|
<div class="flex justify-center">
|
||||||
|
<img src="https://huggingface.co/datasets/eustlb/documentation-images/resolve/main/fig1.jpg"/>
|
||||||
|
</div>
|
||||||
|
|
||||||
[[autodoc]] CsmProcessor
|
[[autodoc]] CsmProcessor
|
||||||
- __call__
|
- __call__
|
||||||
|
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ class CsmDepthDecoderConfig(PretrainedConfig):
|
|||||||
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
|
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
|
||||||
a similar configuration to that of the csm-1b.
|
a similar configuration to that of the csm-1b.
|
||||||
|
|
||||||
e.g. [eustlb/csm-1b](https://huggingface.co/eustlb/csm-1b)
|
e.g. [sesame/csm-1b](https://huggingface.co/sesame/csm-1b)
|
||||||
|
|
||||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||||
documentation from [`PretrainedConfig`] for more information.
|
documentation from [`PretrainedConfig`] for more information.
|
||||||
@@ -210,7 +210,7 @@ class CsmConfig(PretrainedConfig):
|
|||||||
model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
||||||
with the defaults will yield a similar configuration to that of the csm-1b.
|
with the defaults will yield a similar configuration to that of the csm-1b.
|
||||||
|
|
||||||
e.g. [eustlb/csm-1b](https://huggingface.co/eustlb/csm-1b)
|
e.g. [sesame/csm-1b](https://huggingface.co/sesame/csm-1b)
|
||||||
|
|
||||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||||
documentation from [`PretrainedConfig`] for more information.
|
documentation from [`PretrainedConfig`] for more information.
|
||||||
|
|||||||
@@ -415,7 +415,7 @@ class CsmGenerationMixin(GenerationMixin):
|
|||||||
>>> from transformers import CsmProcessor, CsmForConditionalGeneration
|
>>> from transformers import CsmProcessor, CsmForConditionalGeneration
|
||||||
>>> from datasets import load_dataset, Audio
|
>>> from datasets import load_dataset, Audio
|
||||||
|
|
||||||
>>> model_id = "eustlb/csm-1b"
|
>>> model_id = "sesame/csm-1b"
|
||||||
>>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"
|
>>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
|
||||||
>>> processor = AutoProcessor.from_pretrained(model_id)
|
>>> processor = AutoProcessor.from_pretrained(model_id)
|
||||||
|
|||||||
@@ -1113,7 +1113,7 @@ class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin):
|
|||||||
>>> from transformers import CsmForConditionalGeneration, AutoProcessor
|
>>> from transformers import CsmForConditionalGeneration, AutoProcessor
|
||||||
>>> from datasets import load_dataset, Audio
|
>>> from datasets import load_dataset, Audio
|
||||||
|
|
||||||
>>> model_id = "eustlb/csm-1b"
|
>>> model_id = "sesame/csm-1b"
|
||||||
>>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"
|
>>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
|
||||||
>>> processor = AutoProcessor.from_pretrained(model_id)
|
>>> processor = AutoProcessor.from_pretrained(model_id)
|
||||||
|
|||||||
@@ -727,7 +727,7 @@ class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin):
|
|||||||
>>> from transformers import CsmForConditionalGeneration, AutoProcessor
|
>>> from transformers import CsmForConditionalGeneration, AutoProcessor
|
||||||
>>> from datasets import load_dataset, Audio
|
>>> from datasets import load_dataset, Audio
|
||||||
|
|
||||||
>>> model_id = "eustlb/csm-1b"
|
>>> model_id = "sesame/csm-1b"
|
||||||
>>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"
|
>>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
|
||||||
>>> processor = AutoProcessor.from_pretrained(model_id)
|
>>> processor = AutoProcessor.from_pretrained(model_id)
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ class CsmProcessor(ProcessorMixin):
|
|||||||
ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
|
ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
|
||||||
audio = ds[0]["audio"]["array"]
|
audio = ds[0]["audio"]["array"]
|
||||||
|
|
||||||
processor = CsmProcessor.from_pretrained("eustlb/csm-1b")
|
processor = CsmProcessor.from_pretrained("sesame/csm-1b")
|
||||||
|
|
||||||
processor(
|
processor(
|
||||||
text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|><|begin_of_text|>[1]I'm figuring out my budget.<|end_of_text|>"],
|
text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|><|begin_of_text|>[1]I'm figuring out my budget.<|end_of_text|>"],
|
||||||
|
|||||||
@@ -417,7 +417,7 @@ class CsmForConditionalGenerationTest(ModelTesterMixin, GenerationTesterMixin, u
|
|||||||
class CsmForConditionalGenerationIntegrationTest(unittest.TestCase):
|
class CsmForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
# TODO: @eustlb, update with correct sesame's repo
|
# TODO: @eustlb, update with correct sesame's repo
|
||||||
self.model_checkpoint = "eustlb/csm-1b"
|
self.model_checkpoint = "sesame/csm-1b"
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
cleanup(torch_device, gc_collect=True)
|
cleanup(torch_device, gc_collect=True)
|
||||||
|
|||||||
@@ -37,8 +37,7 @@ class CsmProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def setUpClass(cls):
|
def setUpClass(cls):
|
||||||
# TODO: @eustlb, change for hf-internal-testing/csm-1b
|
cls.checkpoint = "hf-internal-testing/namespace-sesame-repo_name_csm-1b"
|
||||||
cls.checkpoint = "eustlb/csm-1b"
|
|
||||||
processor = CsmProcessor.from_pretrained(cls.checkpoint)
|
processor = CsmProcessor.from_pretrained(cls.checkpoint)
|
||||||
cls.audio_token = processor.audio_token
|
cls.audio_token = processor.audio_token
|
||||||
cls.audio_token_id = processor.audio_token_id
|
cls.audio_token_id = processor.audio_token_id
|
||||||
|
|||||||
Reference in New Issue
Block a user