@@ -110,9 +110,14 @@ To follow the example of the following image, `"Hello, I'm Moshi"` could be tran
|
||||
>>> from datasets import load_dataset, Audio
|
||||
>>> import torch, math
|
||||
>>> from transformers import MoshiForConditionalGeneration, AutoFeatureExtractor, AutoTokenizer
|
||||
>>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
|
||||
|
||||
>>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/moshiko-pytorch-bf16")
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("kyutai/moshiko-pytorch-bf16")
|
||||
>>> device = "cuda"
|
||||
>>> dtype = torch.bfloat16
|
||||
|
||||
>>> # prepare user input audio
|
||||
>>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
|
||||
>>> audio_sample = librispeech_dummy[-1]["audio"]["array"]
|
||||
|
||||
Reference in New Issue
Block a user