diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 65038e7e24..a3c6981861 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -847,7 +847,7 @@
         title: GraniteSpeech
       - local: model_doc/hubert
         title: Hubert
-      - local: model_doc/stt
+      - local: model_doc/kyutai_speech_to_text
         title: Kyutai Speech-To-Text
       - local: model_doc/mctct
         title: MCTCT
diff --git a/docs/source/en/model_doc/stt.md b/docs/source/en/model_doc/kyutai_speech_to_text.md
similarity index 95%
rename from docs/source/en/model_doc/stt.md
rename to docs/source/en/model_doc/kyutai_speech_to_text.md
index 02428899df..1c7d93e2af 100644
--- a/docs/source/en/model_doc/stt.md
+++ b/docs/source/en/model_doc/kyutai_speech_to_text.md
@@ -36,10 +36,10 @@ from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForCondi
 
 # 1. load the model and the processor
 torch_device = "cuda" if torch.cuda.is_available() else "cpu"
-model_id = "kyutai/stt-2.6b-en"
+model_id = "kyutai/stt-2.6b-en-trfs"
 
 processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
-model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
+model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device, torch_dtype="auto")
 
 # 2. load audio samples
 ds = load_dataset(
@@ -69,10 +69,10 @@ from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForCondi
 
 # 1. load the model and the processor
 torch_device = "cuda" if torch.cuda.is_available() else "cpu"
-model_id = "kyutai/stt-2.6b-en"
+model_id = "kyutai/stt-2.6b-en-trfs"
 
 processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
-model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
+model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device, torch_dtype="auto")
 
 # 2. load audio samples
 ds = load_dataset(
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 6d2c5affad..3c0e649f8a 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -158,6 +158,7 @@ if TYPE_CHECKING:
     from .janus import *
     from .jetmoe import *
     from .kosmos2 import *
+    from .kyutai_speech_to_text import *
     from .layoutlm import *
     from .layoutlmv2 import *
     from .layoutlmv3 import *
@@ -286,7 +287,6 @@ if TYPE_CHECKING:
     from .squeezebert import *
     from .stablelm import *
     from .starcoder2 import *
-    from .stt import *
     from .superglue import *
     from .superpoint import *
     from .swiftformer import *
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 6e8a123518..8d2109759d 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -184,6 +184,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
         ("jetmoe", "JetMoeConfig"),
         ("jukebox", "JukeboxConfig"),
         ("kosmos-2", "Kosmos2Config"),
+        ("kyutai_speech_to_text", "KyutaiSpeechToTextConfig"),
         ("layoutlm", "LayoutLMConfig"),
         ("layoutlmv2", "LayoutLMv2Config"),
         ("layoutlmv3", "LayoutLMv3Config"),
@@ -326,7 +327,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
         ("squeezebert", "SqueezeBertConfig"),
         ("stablelm", "StableLmConfig"),
         ("starcoder2", "Starcoder2Config"),
-        ("stt", "KyutaiSpeechToTextConfig"),
         ("superglue", "SuperGlueConfig"),
         ("superpoint", "SuperPointConfig"),
         ("swiftformer", "SwiftFormerConfig"),
@@ -562,6 +562,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
         ("jetmoe", "JetMoe"),
         ("jukebox", "Jukebox"),
         ("kosmos-2", "KOSMOS-2"),
+        ("kyutai_speech_to_text", "KyutaiSpeechToText"),
         ("layoutlm", "LayoutLM"),
         ("layoutlmv2", "LayoutLMv2"),
         ("layoutlmv3", "LayoutLMv3"),
@@ -717,7 +718,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
         ("squeezebert", "SqueezeBERT"),
         ("stablelm", "StableLm"),
         ("starcoder2", "Starcoder2"),
-        ("stt", "KyutaiSpeechToText"),
         ("superglue", "SuperGlue"),
         ("superpoint", "SuperPoint"),
         ("swiftformer", "SwiftFormer"),
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
index 5754b3bc1b..cf806f39a6 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -65,6 +65,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
         ("groupvit", "CLIPFeatureExtractor"),
         ("hubert", "Wav2Vec2FeatureExtractor"),
         ("imagegpt", "ImageGPTFeatureExtractor"),
+        ("kyutai_speech_to_text", "KyutaiSpeechToTextFeatureExtractor"),
         ("layoutlmv2", "LayoutLMv2FeatureExtractor"),
         ("layoutlmv3", "LayoutLMv3FeatureExtractor"),
         ("levit", "LevitFeatureExtractor"),
@@ -91,7 +92,6 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
         ("sew-d", "Wav2Vec2FeatureExtractor"),
         ("speech_to_text", "Speech2TextFeatureExtractor"),
         ("speecht5", "SpeechT5FeatureExtractor"),
-        ("stt", "KyutaiSpeechToTextFeatureExtractor"),
         ("swiftformer", "ViTFeatureExtractor"),
         ("swin", "ViTFeatureExtractor"),
         ("swinv2", "ViTFeatureExtractor"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index b631e38828..51a3c3fbbc 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -174,6 +174,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
         ("jetmoe", "JetMoeModel"),
         ("jukebox", "JukeboxModel"),
         ("kosmos-2", "Kosmos2Model"),
+        ("kyutai_speech_to_text", "KyutaiSpeechToTextModel"),
         ("layoutlm", "LayoutLMModel"),
         ("layoutlmv2", "LayoutLMv2Model"),
         ("layoutlmv3", "LayoutLMv3Model"),
@@ -304,7 +305,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
         ("squeezebert", "SqueezeBertModel"),
         ("stablelm", "StableLmModel"),
         ("starcoder2", "Starcoder2Model"),
-        ("stt", "KyutaiSpeechToTextModel"),
         ("superglue", "SuperGlueForKeypointMatching"),
         ("swiftformer", "SwiftFormerModel"),
         ("swin", "SwinModel"),
@@ -1060,6 +1060,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
 MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict(
     [
         ("granite_speech", "GraniteSpeechForConditionalGeneration"),
+        ("kyutai_speech_to_text", "KyutaiSpeechToTextForConditionalGeneration"),
         ("moonshine", "MoonshineForConditionalGeneration"),
         ("pop2piano", "Pop2PianoForConditionalGeneration"),
         ("seamless_m4t", "SeamlessM4TForSpeechToText"),
@@ -1067,7 +1068,6 @@ MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict(
         ("speech-encoder-decoder", "SpeechEncoderDecoderModel"),
         ("speech_to_text", "Speech2TextForConditionalGeneration"),
         ("speecht5", "SpeechT5ForSpeechToText"),
-        ("stt", "KyutaiSpeechToTextForConditionalGeneration"),
         ("whisper", "WhisperForConditionalGeneration"),
     ]
 )
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index a6bd873b88..372c0b249b 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -80,6 +80,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
         ("internvl", "InternVLProcessor"),
         ("janus", "JanusProcessor"),
         ("kosmos-2", "Kosmos2Processor"),
+        ("kyutai_speech_to_text", "KyutaiSpeechToTextProcessor"),
         ("layoutlmv2", "LayoutLMv2Processor"),
         ("layoutlmv3", "LayoutLMv3Processor"),
         ("llama4", "Llama4Processor"),
@@ -117,7 +118,6 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
         ("speech_to_text", "Speech2TextProcessor"),
         ("speech_to_text_2", "Speech2Text2Processor"),
         ("speecht5", "SpeechT5Processor"),
-        ("stt", "KyutaiSpeechToTextProcessor"),
         ("trocr", "TrOCRProcessor"),
         ("tvlt", "TvltProcessor"),
         ("tvp", "TvpProcessor"),
diff --git a/src/transformers/models/stt/__init__.py b/src/transformers/models/kyutai_speech_to_text/__init__.py
similarity index 100%
rename from src/transformers/models/stt/__init__.py
rename to src/transformers/models/kyutai_speech_to_text/__init__.py
diff --git a/src/transformers/models/stt/configuration_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py
similarity index 97%
rename from src/transformers/models/stt/configuration_kyutai_speech_to_text.py
rename to src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py
index f9ea11a5f4..40bfcf0937 100644
--- a/src/transformers/models/stt/configuration_kyutai_speech_to_text.py
+++ b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py
@@ -28,7 +28,7 @@ class KyutaiSpeechToTextConfig(PretrainedConfig):
     architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the
     2.6b-en model.
 
-    e.g. [kyutai/stt-2.6b-en](https://huggingface.co/kyutai/stt-2.6b-en)
+    e.g. [kyutai/stt-2.6b-en-trfs](https://huggingface.co/kyutai/stt-2.6b-en-trfs)
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -110,8 +110,7 @@ class KyutaiSpeechToTextConfig(PretrainedConfig):
     >>> configuration = model.config
     ```"""
 
-    # not the best naming here for `model_type`, but original codebase already uses model type:`stt` for in the config so we keep it to simplify
-    model_type = "stt"
+    model_type = "kyutai_speech_to_text"
     keys_to_ignore_at_inference = ["past_key_values"]
     sub_configs = {"codec_config": AutoConfig}
 
diff --git a/src/transformers/models/stt/convert_kyutai_speech_to_text_to_hf.py b/src/transformers/models/kyutai_speech_to_text/convert_kyutai_speech_to_text_to_hf.py
similarity index 98%
rename from src/transformers/models/stt/convert_kyutai_speech_to_text_to_hf.py
rename to src/transformers/models/kyutai_speech_to_text/convert_kyutai_speech_to_text_to_hf.py
index fe4a5a6bc6..d08550fa94 100644
--- a/src/transformers/models/stt/convert_kyutai_speech_to_text_to_hf.py
+++ b/src/transformers/models/kyutai_speech_to_text/convert_kyutai_speech_to_text_to_hf.py
@@ -190,7 +190,14 @@ def write_model(
     print("Converting the model.")
     os.makedirs(output_dir, exist_ok=True)
 
-    config = KyutaiSpeechToTextConfig()
+    config = KyutaiSpeechToTextConfig(
+        vocab_size=8001,
+        max_position_embeddings=375,
+        num_hidden_layers=16,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        head_dim=128,
+    )
     config.use_cache = True
     config.codec_config.sliding_window = 250
 
diff --git a/src/transformers/models/stt/feature_extraction_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py
similarity index 99%
rename from src/transformers/models/stt/feature_extraction_kyutai_speech_to_text.py
rename to src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py
index 94ddb15daa..bde1736f9d 100644
--- a/src/transformers/models/stt/feature_extraction_kyutai_speech_to_text.py
+++ b/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py
@@ -1,5 +1,5 @@
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-#           This file was automatically generated from src/transformers/models/stt/modular_kyutai_speech_to_text.py.
+#           This file was automatically generated from src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py.
 #               Do NOT edit this file manually as any edits will be overwritten by the generation of
 #             the file from the modular. If any change should be done, please apply the change to the
 #                          modular_kyutai_speech_to_text.py file directly. One of our CI enforces this.
diff --git a/src/transformers/models/stt/modeling_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py
similarity index 99%
rename from src/transformers/models/stt/modeling_kyutai_speech_to_text.py
rename to src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py
index 7a86cd440c..67c4dac4cc 100644
--- a/src/transformers/models/stt/modeling_kyutai_speech_to_text.py
+++ b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py
@@ -1,5 +1,5 @@
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-#           This file was automatically generated from src/transformers/models/stt/modular_kyutai_speech_to_text.py.
+#           This file was automatically generated from src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py.
 #               Do NOT edit this file manually as any edits will be overwritten by the generation of
 #             the file from the modular. If any change should be done, please apply the change to the
 #                          modular_kyutai_speech_to_text.py file directly. One of our CI enforces this.
@@ -713,7 +713,7 @@ class KyutaiSpeechToTextSdpaAttention(KyutaiSpeechToTextAttention):
         return attn_output, None, past_key_value
 
 
-STT_ATTENTION_CLASSES = {
+KYUTAI_SPEECH_TO_TEXT_ATTENTION_CLASSES = {
     "eager": KyutaiSpeechToTextAttention,
     "flash_attention_2": KyutaiSpeechToTextFlashAttention2,
     "sdpa": KyutaiSpeechToTextSdpaAttention,
@@ -726,7 +726,7 @@ class KyutaiSpeechToTextDecoderLayer(GradientCheckpointingLayer):
         self.hidden_size = config.hidden_size
         self.use_flexible_linear = use_flexible_linear
 
-        self.self_attn = STT_ATTENTION_CLASSES[config._attn_implementation](
+        self.self_attn = KYUTAI_SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation](
             config=config, layer_idx=layer_idx, use_flexible_linear=use_flexible_linear, use_rope=use_rope
         )
 
@@ -1169,7 +1169,7 @@ class KyutaiSpeechToTextForConditionalGeneration(KyutaiSpeechToTextPreTrainedMod
         >>> from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration
 
         >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"
-        >>> model_id = "kyutai/stt-2.6b-en"
+        >>> model_id = "kyutai/stt-2.6b-en-trfs"
 
         >>> processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
         >>> model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
diff --git a/src/transformers/models/stt/modular_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py
similarity index 99%
rename from src/transformers/models/stt/modular_kyutai_speech_to_text.py
rename to src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py
index 8cc0c9d2a7..a9b86c6e2c 100644
--- a/src/transformers/models/stt/modular_kyutai_speech_to_text.py
+++ b/src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py
@@ -278,7 +278,7 @@ class KyutaiSpeechToTextForConditionalGeneration(LlamaForCausalLM, GenerationMix
         >>> from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration
 
         >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"
-        >>> model_id = "kyutai/stt-2.6b-en"
+        >>> model_id = "kyutai/stt-2.6b-en-trfs"
 
         >>> processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
         >>> model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
diff --git a/src/transformers/models/stt/processing_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py
similarity index 100%
rename from src/transformers/models/stt/processing_kyutai_speech_to_text.py
rename to src/transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py
diff --git a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py
index a6e08f714f..822bc872bc 100644
--- a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py
+++ b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py
@@ -619,7 +619,7 @@ class KyutaiSpeechToTextForConditionalGenerationIntegrationTests(unittest.TestCa
     _dataset = None
 
     def setUp(self):
-        self.model_checkpoint = "kyutai/stt-2.6b-en"
+        self.model_checkpoint = "kyutai/stt-2.6b-en-trfs"
 
     def tearDown(self):
         cleanup(torch_device, gc_collect=True)