diff --git a/examples/pytorch/_tests_requirements.txt b/examples/pytorch/_tests_requirements.txt index f5b4b5170d..9483d3a750 100644 --- a/examples/pytorch/_tests_requirements.txt +++ b/examples/pytorch/_tests_requirements.txt @@ -13,7 +13,7 @@ streamlit elasticsearch nltk pandas -datasets >= 1.1.3 +datasets >= 1.13.3 fire pytest conllu @@ -21,3 +21,4 @@ sentencepiece != 0.1.92 protobuf torchvision jiwer +librosa diff --git a/examples/pytorch/speech-pretraining/README.md b/examples/pytorch/speech-pretraining/README.md index 0e6795a61a..3efac8dd2b 100644 --- a/examples/pytorch/speech-pretraining/README.md +++ b/examples/pytorch/speech-pretraining/README.md @@ -94,7 +94,7 @@ To pre-train `"large-sized"` Wav2Vec2 model, *e.g.* [facebook/wav2vec2-large-lv6 on [librispeech_asr](https://huggingface.co/datasets/librispeech_asr), the following command can be run: ```bash -accelerate launch run_pretrain_no_trainer.py \ +accelerate launch run_wav2vec2_pretraining_no_trainer.py \ --dataset_name=librispeech_asr \ --dataset_config_names clean clean other \ --dataset_split_names train.100 train.360 train.500 \ diff --git a/examples/pytorch/speech-pretraining/requirements.txt b/examples/pytorch/speech-pretraining/requirements.txt index ea09a02c2d..64a48c3967 100644 --- a/examples/pytorch/speech-pretraining/requirements.txt +++ b/examples/pytorch/speech-pretraining/requirements.txt @@ -2,3 +2,4 @@ datasets >= 1.12.0 torch >= 1.5 torchaudio accelerate >= 0.5.0 +librosa diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py index 657c6e844b..e56a3dcb3d 100755 --- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py +++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py @@ -25,7 +25,6 @@ from typing import Dict, List, Optional, Union import datasets import torch -import torchaudio from datasets import DatasetDict, concatenate_datasets, load_dataset from torch.utils.data.dataloader import DataLoader from tqdm.auto import tqdm @@ -113,7 +112,7 @@ def parse_args(): parser.add_argument( "--audio_column_name", type=str, - default="file", + default="audio", help="Column in the dataset that contains speech file path. Defaults to 'file'", ) parser.add_argument( @@ -128,6 +127,18 @@ def parse_args(): default=None, help="Pretrained config name or path if not the same as model_name", ) + parser.add_argument( + "--train_cache_file_name", + type=str, + default=None, + help="Path to the train cached file name", + ) + parser.add_argument( + "--validation_cache_file_name", + type=str, + default=None, + help="Path to the validation cached file name", + ) parser.add_argument( "--per_device_train_batch_size", type=int, @@ -414,9 +425,17 @@ def main(): raw_datasets["validation"] = raw_datasets["train"].select(range(num_validation_samples)) raw_datasets["train"] = raw_datasets["train"].select(range(num_validation_samples, raw_datasets["train"].num_rows)) - # 2. Preprocess audio: load, resample, normalize and truncate + # 2. Now we preprocess the datasets including loading the audio, resampling and normalization + # Thankfully, `datasets` takes care of automatically loading and resampling the audio, + # so that we just need to set the correct target sampling rate and normalize the input + # via the `feature_extractor` feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(args.model_name_or_path) + # make sure that dataset decodes audio with correct samlping rate + raw_datasets = raw_datasets.cast_column( + "audio", datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate) + ) + # only normalized-inputs-training is supported if not feature_extractor.do_normalize: raise ValueError( @@ -427,38 +446,40 @@ def main(): max_length = int(args.max_duration_in_seconds * feature_extractor.sampling_rate) min_length = int(args.min_duration_in_seconds * feature_extractor.sampling_rate) - resampler = None - if raw_datasets["train"][args.audio_column_name][0].split(".")[-1] == "mp3": - # TODO(PVP) - remove hard-coded 48_000 after audio feature is merged - resampler = torchaudio.transforms.Resample(48_000, feature_extractor.sampling_rate) - def prepare_dataset(batch): - speech_array, sampling_rate = torchaudio.load(batch[args.audio_column_name]) - speech_array = speech_array.squeeze() + sample = batch[args.audio_column_name] - # if necessary resample audio - if resampler is not None: - # TODO(PVP) - remove hard-coded 48_000 after audio feature is merged - speech_array = resampler(speech_array) - sampling_rate = resampler.new_freq - - speech_array = speech_array.numpy() - inputs = feature_extractor(speech_array, sampling_rate=sampling_rate, max_length=max_length, truncation=True) + inputs = feature_extractor( + sample["array"], sampling_rate=sample["sampling_rate"], max_length=max_length, truncation=True + ) batch["input_values"] = inputs.input_values[0] + batch["input_length"] = len(inputs.input_values[0]) + return batch + # load via mapped files via path + cache_file_names = None + if args.train_cache_file_name is not None: + cache_file_names = {"train": args.train_cache_file_name, "validation": args.validation_cache_file_name} + # load audio files into numpy arrays with accelerator.main_process_first(): vectorized_datasets = raw_datasets.map( prepare_dataset, num_proc=args.preprocessing_num_workers, remove_columns=raw_datasets["train"].column_names, - load_from_cache_file=not args.overwrite_cache, - ) - vectorized_datasets = vectorized_datasets.filter( - lambda x: len(x["input_values"]) > min_length, load_from_cache_file=not args.overwrite_cache + cache_file_names=cache_file_names, ) + if min_length > 0.0: + vectorized_datasets = vectorized_datasets.filter( + lambda x: x > min_length, + num_proc=args.preprocessing_num_workers, + input_columns=["input_length"], + ) + + vectorized_datasets = vectorized_datasets.remove_columns("input_length") + # for large datasets it is advised to run the preprocessing on a # single machine first with ``args.preprocessing_only`` since there will mostly likely # be a timeout when running the script in distributed mode. diff --git a/examples/pytorch/speech-recognition/README.md b/examples/pytorch/speech-recognition/README.md index 4b618dd0b2..c6434a6fe7 100644 --- a/examples/pytorch/speech-recognition/README.md +++ b/examples/pytorch/speech-recognition/README.md @@ -58,7 +58,6 @@ python run_speech_recognition_ctc.py \ --learning_rate="3e-4" \ --warmup_steps="500" \ --evaluation_strategy="steps" \ - --audio_column_name="path" \ --text_column_name="sentence" \ --save_steps="400" \ --eval_steps="100" \ @@ -87,7 +86,6 @@ python -m torch.distributed.launch \ --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \ --dataset_config_name="tr" \ --output_dir="./wav2vec2-common_voice-tr-demo-dist" \ - --preprocessing_num_workers="16" \ --overwrite_output_dir \ --num_train_epochs="15" \ --per_device_train_batch_size="4" \ diff --git a/examples/pytorch/speech-recognition/requirements.txt b/examples/pytorch/speech-recognition/requirements.txt index e417719315..b5c58171d3 100644 --- a/examples/pytorch/speech-recognition/requirements.txt +++ b/examples/pytorch/speech-recognition/requirements.txt @@ -1,3 +1,4 @@ -datasets >= 1.12.0 +datasets >= 1.13.3 torch >= 1.5 torchaudio +librosa diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index ac1f24dad0..b839552438 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -24,9 +24,9 @@ import sys from dataclasses import dataclass, field from typing import Dict, List, Optional, Union +import datasets import numpy as np import torch -import torchaudio from datasets import DatasetDict, load_dataset, load_metric import transformers @@ -49,8 +49,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.12.0.dev0") -# TODO(Patrick) Bump up as soon as audio features are merged -require_version("datasets>=1.12.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") +require_version("datasets>=1.13.3", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") logger = logging.getLogger(__name__) @@ -179,12 +178,12 @@ class DataTrainingArguments: min_duration_in_seconds: Optional[float] = field( default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"} ) - only_data_preprocessing: Optional[bool] = field( + preprocessing_only: Optional[bool] = field( default=False, metadata={ "help": "Whether to only do data preprocessing and skip training. " "This is especially useful when data preprocessing errors out in distributed training due to timeout. " - "In this case, one should run the preprocessing in a non-distributed setup with `only_data_preprocessing=True` " + "In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` " "so that the cached datasets can consequently be loaded in distributed training" }, ) @@ -450,41 +449,30 @@ def main(): if model_args.freeze_feature_extractor: model.freeze_feature_extractor() - # 5. Now we preprocess the datasets which includes loading the audio, resampling and padding + # 5. Now we preprocess the datasets including loading the audio, resampling and normalization + # Thankfully, `datasets` takes care of automatically loading and resampling the audio, + # so that we just need to set the correct target sampling rate and normalize the input + # via the `feature_extractor` - # The following code should be cleaned up as soon as - # https://github.com/huggingface/datasets/pull/2324 is merged - - # Preprocessing the datasets. - # We need to read the audio files as arrays and tokenize the targets. + # make sure that dataset decodes audio with correct samlping rate + raw_datasets = raw_datasets.cast_column( + "audio", datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate) + ) # derive max & min input length for sample rate & max duration max_input_length = data_args.max_duration_in_seconds * processor.feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * processor.feature_extractor.sampling_rate - resampler = None - if raw_datasets["train"][data_args.audio_column_name][0].split(".")[-1] == "mp3": - # TODO(PVP) - remove hard-coded 48_000 after audio feature is merged - resampler = torchaudio.transforms.Resample(48_000, processor.feature_extractor.sampling_rate) - # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. def prepare_dataset(batch): # load audio - speech_array, sampling_rate = torchaudio.load(batch[data_args.audio_column_name]) - speech_array = speech_array.squeeze() - - # if necessary resample audio - if resampler is not None: - # TODO(PVP) - remove hard-coded 48_000 after audio feature is merged - speech_array = resampler(speech_array) - sampling_rate = resampler.new_freq - - speech_array = speech_array.numpy() + sample = batch[data_args.audio_column_name] batch["input_values"] = processor( - speech_array, sampling_rate=sampling_rate, truncate=True, max_length=max_input_length + sample["array"], sampling_rate=sample["sampling_rate"], truncate=True, max_length=max_input_length ).input_values[0] + batch["input_length"] = len(batch["input_values"]) # Setup the processor for targets with processor.as_target_processor(): @@ -502,10 +490,13 @@ def main(): if min_input_length > 0.0: # filter data that is shorter than min_input_length vectorized_datasets = vectorized_datasets.filter( - lambda data: len(data["input_values"]) > min_input_length, + lambda x: x > min_input_length, num_proc=data_args.preprocessing_num_workers, + input_columns=["input_length"], ) + vectorized_datasets = vectorized_datasets.remove_columns("input_length") + # 6. Next, we can prepare the training. # Let's use word error rate (WER) as our evaluation metric, # instantiate a data collator and the trainer @@ -513,8 +504,13 @@ def main(): # Define Metric during training wer_metric = load_metric("wer") - if data_args.only_data_preprocessing: - logger.info("Data preprocessing finished.") + # for large datasets it is advised to run the preprocessing on a + # single machine first with ``args.preprocessing_only`` since there will mostly likely + # be a timeout when running the script in distributed mode. + # In a second step ``args.preprocessing_only`` can then be set to `False` to load the + # cached dataset + if data_args.preprocessing_only: + logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}") return def compute_metrics(pred): diff --git a/examples/pytorch/test_examples.py b/examples/pytorch/test_examples.py index 9692a6eb46..4ef574f90a 100644 --- a/examples/pytorch/test_examples.py +++ b/examples/pytorch/test_examples.py @@ -395,7 +395,6 @@ class ExamplesTests(TestCasePlus): --dataset_config_name clean --train_split_name validation --eval_split_name validation - --audio_column_name file --do_train --do_eval --learning_rate 1e-4