diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 65038e7e24..a3c6981861 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -847,7 +847,7 @@ title: GraniteSpeech - local: model_doc/hubert title: Hubert - - local: model_doc/stt + - local: model_doc/kyutai_speech_to_text title: Kyutai Speech-To-Text - local: model_doc/mctct title: MCTCT diff --git a/docs/source/en/model_doc/stt.md b/docs/source/en/model_doc/kyutai_speech_to_text.md similarity index 95% rename from docs/source/en/model_doc/stt.md rename to docs/source/en/model_doc/kyutai_speech_to_text.md index 02428899df..1c7d93e2af 100644 --- a/docs/source/en/model_doc/stt.md +++ b/docs/source/en/model_doc/kyutai_speech_to_text.md @@ -36,10 +36,10 @@ from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForCondi # 1. load the model and the processor torch_device = "cuda" if torch.cuda.is_available() else "cpu" -model_id = "kyutai/stt-2.6b-en" +model_id = "kyutai/stt-2.6b-en-trfs" processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id) -model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device) +model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device, torch_dtype="auto") # 2. load audio samples ds = load_dataset( @@ -69,10 +69,10 @@ from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForCondi # 1. load the model and the processor torch_device = "cuda" if torch.cuda.is_available() else "cpu" -model_id = "kyutai/stt-2.6b-en" +model_id = "kyutai/stt-2.6b-en-trfs" processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id) -model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device) +model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device, torch_dtype="auto") # 2. load audio samples ds = load_dataset( diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 6d2c5affad..3c0e649f8a 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -158,6 +158,7 @@ if TYPE_CHECKING: from .janus import * from .jetmoe import * from .kosmos2 import * + from .kyutai_speech_to_text import * from .layoutlm import * from .layoutlmv2 import * from .layoutlmv3 import * @@ -286,7 +287,6 @@ if TYPE_CHECKING: from .squeezebert import * from .stablelm import * from .starcoder2 import * - from .stt import * from .superglue import * from .superpoint import * from .swiftformer import * diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6e8a123518..8d2109759d 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -184,6 +184,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str]( ("jetmoe", "JetMoeConfig"), ("jukebox", "JukeboxConfig"), ("kosmos-2", "Kosmos2Config"), + ("kyutai_speech_to_text", "KyutaiSpeechToTextConfig"), ("layoutlm", "LayoutLMConfig"), ("layoutlmv2", "LayoutLMv2Config"), ("layoutlmv3", "LayoutLMv3Config"), @@ -326,7 +327,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str]( ("squeezebert", "SqueezeBertConfig"), ("stablelm", "StableLmConfig"), ("starcoder2", "Starcoder2Config"), - ("stt", "KyutaiSpeechToTextConfig"), ("superglue", "SuperGlueConfig"), ("superpoint", "SuperPointConfig"), ("swiftformer", "SwiftFormerConfig"), @@ -562,6 +562,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str]( ("jetmoe", "JetMoe"), ("jukebox", "Jukebox"), ("kosmos-2", "KOSMOS-2"), + ("kyutai_speech_to_text", "KyutaiSpeechToText"), ("layoutlm", "LayoutLM"), ("layoutlmv2", "LayoutLMv2"), ("layoutlmv3", "LayoutLMv3"), @@ -717,7 +718,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str]( ("squeezebert", "SqueezeBERT"), ("stablelm", "StableLm"), ("starcoder2", "Starcoder2"), - ("stt", "KyutaiSpeechToText"), ("superglue", "SuperGlue"), ("superpoint", "SuperPoint"), ("swiftformer", "SwiftFormer"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 5754b3bc1b..cf806f39a6 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -65,6 +65,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict( ("groupvit", "CLIPFeatureExtractor"), ("hubert", "Wav2Vec2FeatureExtractor"), ("imagegpt", "ImageGPTFeatureExtractor"), + ("kyutai_speech_to_text", "KyutaiSpeechToTextFeatureExtractor"), ("layoutlmv2", "LayoutLMv2FeatureExtractor"), ("layoutlmv3", "LayoutLMv3FeatureExtractor"), ("levit", "LevitFeatureExtractor"), @@ -91,7 +92,6 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict( ("sew-d", "Wav2Vec2FeatureExtractor"), ("speech_to_text", "Speech2TextFeatureExtractor"), ("speecht5", "SpeechT5FeatureExtractor"), - ("stt", "KyutaiSpeechToTextFeatureExtractor"), ("swiftformer", "ViTFeatureExtractor"), ("swin", "ViTFeatureExtractor"), ("swinv2", "ViTFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index b631e38828..51a3c3fbbc 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -174,6 +174,7 @@ MODEL_MAPPING_NAMES = OrderedDict( ("jetmoe", "JetMoeModel"), ("jukebox", "JukeboxModel"), ("kosmos-2", "Kosmos2Model"), + ("kyutai_speech_to_text", "KyutaiSpeechToTextModel"), ("layoutlm", "LayoutLMModel"), ("layoutlmv2", "LayoutLMv2Model"), ("layoutlmv3", "LayoutLMv3Model"), @@ -304,7 +305,6 @@ MODEL_MAPPING_NAMES = OrderedDict( ("squeezebert", "SqueezeBertModel"), ("stablelm", "StableLmModel"), ("starcoder2", "Starcoder2Model"), - ("stt", "KyutaiSpeechToTextModel"), ("superglue", "SuperGlueForKeypointMatching"), ("swiftformer", "SwiftFormerModel"), ("swin", "SwinModel"), @@ -1060,6 +1060,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict( MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict( [ ("granite_speech", "GraniteSpeechForConditionalGeneration"), + ("kyutai_speech_to_text", "KyutaiSpeechToTextForConditionalGeneration"), ("moonshine", "MoonshineForConditionalGeneration"), ("pop2piano", "Pop2PianoForConditionalGeneration"), ("seamless_m4t", "SeamlessM4TForSpeechToText"), @@ -1067,7 +1068,6 @@ MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict( ("speech-encoder-decoder", "SpeechEncoderDecoderModel"), ("speech_to_text", "Speech2TextForConditionalGeneration"), ("speecht5", "SpeechT5ForSpeechToText"), - ("stt", "KyutaiSpeechToTextForConditionalGeneration"), ("whisper", "WhisperForConditionalGeneration"), ] ) diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index a6bd873b88..372c0b249b 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -80,6 +80,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict( ("internvl", "InternVLProcessor"), ("janus", "JanusProcessor"), ("kosmos-2", "Kosmos2Processor"), + ("kyutai_speech_to_text", "KyutaiSpeechToTextProcessor"), ("layoutlmv2", "LayoutLMv2Processor"), ("layoutlmv3", "LayoutLMv3Processor"), ("llama4", "Llama4Processor"), @@ -117,7 +118,6 @@ PROCESSOR_MAPPING_NAMES = OrderedDict( ("speech_to_text", "Speech2TextProcessor"), ("speech_to_text_2", "Speech2Text2Processor"), ("speecht5", "SpeechT5Processor"), - ("stt", "KyutaiSpeechToTextProcessor"), ("trocr", "TrOCRProcessor"), ("tvlt", "TvltProcessor"), ("tvp", "TvpProcessor"), diff --git a/src/transformers/models/stt/__init__.py b/src/transformers/models/kyutai_speech_to_text/__init__.py similarity index 100% rename from src/transformers/models/stt/__init__.py rename to src/transformers/models/kyutai_speech_to_text/__init__.py diff --git a/src/transformers/models/stt/configuration_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py similarity index 97% rename from src/transformers/models/stt/configuration_kyutai_speech_to_text.py rename to src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py index f9ea11a5f4..40bfcf0937 100644 --- a/src/transformers/models/stt/configuration_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py @@ -28,7 +28,7 @@ class KyutaiSpeechToTextConfig(PretrainedConfig): architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the 2.6b-en model. - e.g. [kyutai/stt-2.6b-en](https://huggingface.co/kyutai/stt-2.6b-en) + e.g. [kyutai/stt-2.6b-en-trfs](https://huggingface.co/kyutai/stt-2.6b-en-trfs) Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -110,8 +110,7 @@ class KyutaiSpeechToTextConfig(PretrainedConfig): >>> configuration = model.config ```""" - # not the best naming here for `model_type`, but original codebase already uses model type:`stt` for in the config so we keep it to simplify - model_type = "stt" + model_type = "kyutai_speech_to_text" keys_to_ignore_at_inference = ["past_key_values"] sub_configs = {"codec_config": AutoConfig} diff --git a/src/transformers/models/stt/convert_kyutai_speech_to_text_to_hf.py b/src/transformers/models/kyutai_speech_to_text/convert_kyutai_speech_to_text_to_hf.py similarity index 98% rename from src/transformers/models/stt/convert_kyutai_speech_to_text_to_hf.py rename to src/transformers/models/kyutai_speech_to_text/convert_kyutai_speech_to_text_to_hf.py index fe4a5a6bc6..d08550fa94 100644 --- a/src/transformers/models/stt/convert_kyutai_speech_to_text_to_hf.py +++ b/src/transformers/models/kyutai_speech_to_text/convert_kyutai_speech_to_text_to_hf.py @@ -190,7 +190,14 @@ def write_model( print("Converting the model.") os.makedirs(output_dir, exist_ok=True) - config = KyutaiSpeechToTextConfig() + config = KyutaiSpeechToTextConfig( + vocab_size=8001, + max_position_embeddings=375, + num_hidden_layers=16, + num_attention_heads=16, + num_key_value_heads=16, + head_dim=128, + ) config.use_cache = True config.codec_config.sliding_window = 250 diff --git a/src/transformers/models/stt/feature_extraction_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py similarity index 99% rename from src/transformers/models/stt/feature_extraction_kyutai_speech_to_text.py rename to src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py index 94ddb15daa..bde1736f9d 100644 --- a/src/transformers/models/stt/feature_extraction_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py @@ -1,5 +1,5 @@ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from src/transformers/models/stt/modular_kyutai_speech_to_text.py. +# This file was automatically generated from src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py. # Do NOT edit this file manually as any edits will be overwritten by the generation of # the file from the modular. If any change should be done, please apply the change to the # modular_kyutai_speech_to_text.py file directly. One of our CI enforces this. diff --git a/src/transformers/models/stt/modeling_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py similarity index 99% rename from src/transformers/models/stt/modeling_kyutai_speech_to_text.py rename to src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py index 7a86cd440c..67c4dac4cc 100644 --- a/src/transformers/models/stt/modeling_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py @@ -1,5 +1,5 @@ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from src/transformers/models/stt/modular_kyutai_speech_to_text.py. +# This file was automatically generated from src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py. # Do NOT edit this file manually as any edits will be overwritten by the generation of # the file from the modular. If any change should be done, please apply the change to the # modular_kyutai_speech_to_text.py file directly. One of our CI enforces this. @@ -713,7 +713,7 @@ class KyutaiSpeechToTextSdpaAttention(KyutaiSpeechToTextAttention): return attn_output, None, past_key_value -STT_ATTENTION_CLASSES = { +KYUTAI_SPEECH_TO_TEXT_ATTENTION_CLASSES = { "eager": KyutaiSpeechToTextAttention, "flash_attention_2": KyutaiSpeechToTextFlashAttention2, "sdpa": KyutaiSpeechToTextSdpaAttention, @@ -726,7 +726,7 @@ class KyutaiSpeechToTextDecoderLayer(GradientCheckpointingLayer): self.hidden_size = config.hidden_size self.use_flexible_linear = use_flexible_linear - self.self_attn = STT_ATTENTION_CLASSES[config._attn_implementation]( + self.self_attn = KYUTAI_SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation]( config=config, layer_idx=layer_idx, use_flexible_linear=use_flexible_linear, use_rope=use_rope ) @@ -1169,7 +1169,7 @@ class KyutaiSpeechToTextForConditionalGeneration(KyutaiSpeechToTextPreTrainedMod >>> from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu" - >>> model_id = "kyutai/stt-2.6b-en" + >>> model_id = "kyutai/stt-2.6b-en-trfs" >>> processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id) >>> model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device) diff --git a/src/transformers/models/stt/modular_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py similarity index 99% rename from src/transformers/models/stt/modular_kyutai_speech_to_text.py rename to src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py index 8cc0c9d2a7..a9b86c6e2c 100644 --- a/src/transformers/models/stt/modular_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py @@ -278,7 +278,7 @@ class KyutaiSpeechToTextForConditionalGeneration(LlamaForCausalLM, GenerationMix >>> from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu" - >>> model_id = "kyutai/stt-2.6b-en" + >>> model_id = "kyutai/stt-2.6b-en-trfs" >>> processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id) >>> model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device) diff --git a/src/transformers/models/stt/processing_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py similarity index 100% rename from src/transformers/models/stt/processing_kyutai_speech_to_text.py rename to src/transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py diff --git a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py index a6e08f714f..822bc872bc 100644 --- a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py +++ b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py @@ -619,7 +619,7 @@ class KyutaiSpeechToTextForConditionalGenerationIntegrationTests(unittest.TestCa _dataset = None def setUp(self): - self.model_checkpoint = "kyutai/stt-2.6b-en" + self.model_checkpoint = "kyutai/stt-2.6b-en-trfs" def tearDown(self): cleanup(torch_device, gc_collect=True)