Broken links fixed related to datasets docs (#27569)
Fixed the broken links that belong to the Datasets library documentation referenced in transformers
This commit is contained in:
@@ -10,7 +10,7 @@ way which enables simple and efficient model parallelism.
|
||||
`run_image_captioning_flax.py` is a lightweight example of how to download and preprocess a dataset from the 🤗 Datasets
|
||||
library or use your own files (jsonlines or csv), then fine-tune one of the architectures above on it.
|
||||
|
||||
For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets.html#json-files and you also will find examples of these below.
|
||||
For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets#json-files and you also will find examples of these below.
|
||||
|
||||
### Download COCO dataset (2017)
|
||||
This example uses COCO dataset (2017) through a custom dataset script, which requires users to manually download the
|
||||
|
||||
@@ -494,7 +494,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
model = FlaxVisionEncoderDecoderModel.from_pretrained(
|
||||
|
||||
@@ -589,7 +589,7 @@ def main():
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
|
||||
|
||||
@@ -484,7 +484,7 @@ def main():
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
|
||||
|
||||
@@ -516,7 +516,7 @@ def main():
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
|
||||
|
||||
@@ -630,7 +630,7 @@ def main():
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
|
||||
|
||||
@@ -536,7 +536,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
# endregion
|
||||
|
||||
# region Load pretrained model and tokenizer
|
||||
|
||||
@@ -9,7 +9,7 @@ way which enables simple and efficient model parallelism.
|
||||
|
||||
`run_summarization_flax.py` is a lightweight example of how to download and preprocess a dataset from the 🤗 Datasets library or use your own files (jsonlines or csv), then fine-tune one of the architectures above on it.
|
||||
|
||||
For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets.html#json-files and you also will find examples of these below.
|
||||
For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets#json-files and you also will find examples of these below.
|
||||
|
||||
### Train the model
|
||||
Next we can run the example script to train the model:
|
||||
|
||||
@@ -521,7 +521,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
|
||||
|
||||
@@ -410,7 +410,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Labels
|
||||
if data_args.task_name is not None:
|
||||
@@ -427,7 +427,7 @@ def main():
|
||||
num_labels = 1
|
||||
else:
|
||||
# A useful fast method:
|
||||
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
|
||||
# https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.unique
|
||||
label_list = raw_datasets["train"].unique("label")
|
||||
label_list.sort() # Let's sort it for determinism
|
||||
num_labels = len(label_list)
|
||||
|
||||
@@ -465,7 +465,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
if raw_datasets["train"] is not None:
|
||||
column_names = raw_datasets["train"].column_names
|
||||
|
||||
@@ -340,7 +340,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# 5. Load pretrained model, tokenizer, and image processor
|
||||
if model_args.tokenizer_name:
|
||||
|
||||
@@ -388,7 +388,7 @@ def main():
|
||||
)
|
||||
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -368,7 +368,7 @@ def main():
|
||||
)
|
||||
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -382,7 +382,7 @@ def main():
|
||||
)
|
||||
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -371,7 +371,7 @@ def main():
|
||||
)
|
||||
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -352,7 +352,7 @@ def main():
|
||||
)
|
||||
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -329,7 +329,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
|
||||
|
||||
@@ -366,7 +366,7 @@ def main():
|
||||
for split in raw_datasets.keys():
|
||||
raw_datasets[split] = raw_datasets[split].select(range(100))
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
if raw_datasets["train"] is not None:
|
||||
column_names = raw_datasets["train"].column_names
|
||||
|
||||
@@ -337,7 +337,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -325,7 +325,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -369,7 +369,7 @@ def main():
|
||||
extension = args.train_file.split(".")[-1]
|
||||
raw_datasets = load_dataset(extension, data_files=data_files, field="data")
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -417,7 +417,7 @@ def main():
|
||||
extension = args.train_file.split(".")[-1]
|
||||
raw_datasets = load_dataset(extension, data_files=data_files, field="data")
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -382,7 +382,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -134,7 +134,7 @@ of **0.36**.
|
||||
|
||||
### Multi GPU CTC with Dataset Streaming
|
||||
|
||||
The following command shows how to use [Dataset Streaming mode](https://huggingface.co/docs/datasets/dataset_streaming.html)
|
||||
The following command shows how to use [Dataset Streaming mode](https://huggingface.co/docs/datasets/dataset_streaming)
|
||||
to fine-tune [XLS-R](https://huggingface.co/transformers/main/model_doc/xls_r.html)
|
||||
on [Common Voice](https://huggingface.co/datasets/common_voice) using 4 GPUs in half-precision.
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ For the old `finetune_trainer.py` and related utils, see [`examples/legacy/seq2s
|
||||
|
||||
`run_summarization.py` is a lightweight example of how to download and preprocess a dataset from the [🤗 Datasets](https://github.com/huggingface/datasets) library or use your own files (jsonlines or csv), then fine-tune one of the architectures above on it.
|
||||
|
||||
For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets.html#json-files
|
||||
For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets#json-files
|
||||
and you also will find examples of these below.
|
||||
|
||||
## With Trainer
|
||||
|
||||
@@ -432,7 +432,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -409,7 +409,7 @@ def main():
|
||||
extension = args.train_file.split(".")[-1]
|
||||
raw_datasets = load_dataset(extension, data_files=data_files)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -396,7 +396,7 @@ def main():
|
||||
)
|
||||
|
||||
# See more about loading any type of standard or custom dataset at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
if data_args.remove_splits is not None:
|
||||
for split in data_args.remove_splits.split(","):
|
||||
|
||||
@@ -355,7 +355,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Labels
|
||||
if data_args.task_name is not None:
|
||||
@@ -372,7 +372,7 @@ def main():
|
||||
num_labels = 1
|
||||
else:
|
||||
# A useful fast method:
|
||||
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
|
||||
# https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.unique
|
||||
label_list = raw_datasets["train"].unique("label")
|
||||
label_list.sort() # Let's sort it for determinism
|
||||
num_labels = len(label_list)
|
||||
|
||||
@@ -293,7 +293,7 @@ def main():
|
||||
extension = (args.train_file if args.train_file is not None else args.validation_file).split(".")[-1]
|
||||
raw_datasets = load_dataset(extension, data_files=data_files)
|
||||
# See more about loading any type of standard or custom dataset at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Labels
|
||||
if args.task_name is not None:
|
||||
|
||||
@@ -318,7 +318,7 @@ def main():
|
||||
extension = data_args.train_file.split(".")[-1]
|
||||
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
if training_args.do_train:
|
||||
column_names = raw_datasets["train"].column_names
|
||||
|
||||
@@ -348,7 +348,7 @@ def main():
|
||||
for split in raw_datasets.keys():
|
||||
raw_datasets[split] = raw_datasets[split].select(range(100))
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
if raw_datasets["train"] is not None:
|
||||
column_names = raw_datasets["train"].column_names
|
||||
|
||||
@@ -33,7 +33,7 @@ For the old `finetune_trainer.py` and related utils, see [`examples/legacy/seq2s
|
||||
|
||||
`run_translation.py` is a lightweight example of how to download and preprocess a dataset from the [🤗 Datasets](https://github.com/huggingface/datasets) library or use your own files (jsonlines or csv), then fine-tune one of the architectures above on it.
|
||||
|
||||
For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets.html#json-files
|
||||
For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets#json-files
|
||||
and you also will find examples of these below.
|
||||
|
||||
|
||||
|
||||
@@ -389,7 +389,7 @@ def main():
|
||||
extension = args.train_file.split(".")[-1]
|
||||
raw_datasets = load_dataset(extension, data_files=data_files)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -227,7 +227,7 @@ the forum and making use of the [🤗 hub](http://huggingface.co/) to have a ver
|
||||
control for your models and training logs.
|
||||
- When debugging, it is important that the debugging cycle is kept as short as possible to
|
||||
be able to effectively debug. *E.g.* if there is a problem with your training script,
|
||||
you should run it with just a couple of hundreds of examples and not the whole dataset script. This can be done by either making use of [datasets streaming](https://huggingface.co/docs/datasets/master/dataset_streaming.html?highlight=streaming) or by selecting just the first
|
||||
you should run it with just a couple of hundreds of examples and not the whole dataset script. This can be done by either making use of [datasets streaming](https://huggingface.co/docs/datasets/master/dataset_streaming?highlight=streaming) or by selecting just the first
|
||||
X number of data samples after loading:
|
||||
|
||||
```python
|
||||
|
||||
@@ -23,7 +23,7 @@ JAX/Flax allows you to trace pure functions and compile them into efficient, fus
|
||||
Models written in JAX/Flax are **immutable** and updated in a purely functional
|
||||
way which enables simple and efficient model parallelism.
|
||||
|
||||
All of the following examples make use of [dataset streaming](https://huggingface.co/docs/datasets/master/dataset_streaming.html), therefore allowing to train models on massive datasets\
|
||||
All of the following examples make use of [dataset streaming](https://huggingface.co/docs/datasets/master/dataset_streaming), therefore allowing to train models on massive datasets\
|
||||
without ever having to download the full dataset.
|
||||
|
||||
## Masked language modeling
|
||||
|
||||
@@ -304,7 +304,7 @@ def main():
|
||||
extension = "text"
|
||||
dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained config and tokenizer
|
||||
if model_args.config_name:
|
||||
|
||||
@@ -10,7 +10,7 @@ way which enables simple and efficient model parallelism.
|
||||
|
||||
`run_wav2vec2_pretrain_flax.py` is a lightweight example of how to download and preprocess a dataset from the 🤗 Datasets library or use your own files (jsonlines or csv), then pretrain the wav2vec2 architectures above on it.
|
||||
|
||||
For custom datasets in `jsonlines` format please see: [the Datasets documentation](https://huggingface.co/docs/datasets/loading_datasets.html#json-files) and you also will find examples of these below.
|
||||
For custom datasets in `jsonlines` format please see: [the Datasets documentation](https://huggingface.co/docs/datasets/loading_datasets#json-files) and you also will find examples of these below.
|
||||
|
||||
Let's start by creating a model repository to save the trained model and logs.
|
||||
Here we call the model `"wav2vec2-base-robust"`, but you can change the model name as you like.
|
||||
|
||||
@@ -294,7 +294,7 @@ def main():
|
||||
for split in raw_datasets.keys():
|
||||
raw_datasets[split] = raw_datasets[split].select(range(100))
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
if raw_datasets["train"] is not None:
|
||||
column_names = raw_datasets["train"].column_names
|
||||
|
||||
@@ -278,7 +278,7 @@ def main():
|
||||
extension = "text"
|
||||
datasets = load_dataset(extension, data_files=data_files)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -524,7 +524,7 @@ if __name__ == "__main__":
|
||||
extension = "text"
|
||||
datasets = load_dataset(extension, data_files=data_files)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
|
||||
|
||||
@@ -272,7 +272,7 @@ if args.dataset_name is not None:
|
||||
else:
|
||||
raise ValueError("Evaluation requires a dataset name")
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Preprocessing the datasets.
|
||||
# Preprocessing is slightly different for training and evaluation.
|
||||
|
||||
@@ -308,7 +308,7 @@ def main():
|
||||
extension = data_args.test_file.split(".")[-1]
|
||||
raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# set default quantization parameters before building model
|
||||
quant_trainer.set_default_quantizers(quant_trainer_args)
|
||||
|
||||
@@ -65,7 +65,7 @@ def main(
|
||||
"csv", data_files=[rag_example_args.csv_path], split="train", delimiter="\t", column_names=["title", "text"]
|
||||
)
|
||||
|
||||
# More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files
|
||||
# More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets?highlight=csv#csv-files
|
||||
|
||||
# Then split the documents into passages of 100 words
|
||||
dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)
|
||||
|
||||
@@ -73,7 +73,7 @@ def main(
|
||||
"csv", data_files=[rag_example_args.csv_path], split="train", delimiter="\t", column_names=["title", "text"]
|
||||
)
|
||||
|
||||
# More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files
|
||||
# More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets?highlight=csv#csv-files
|
||||
|
||||
# Then split the documents into passages of 100 words
|
||||
dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)
|
||||
|
||||
@@ -112,7 +112,7 @@ Hugging Face Hub for additional audio data, for example by selecting the categor
|
||||
["speech-processing"](https://huggingface.co/datasets?task_categories=task_categories:speech-processing&sort=downloads).
|
||||
All datasets that are available on the Hub can be downloaded via the 🤗 Datasets library in the same way Common Voice is downloaded.
|
||||
If one wants to combine multiple datasets for training, it might make sense to take a look at
|
||||
the [`interleave_datasets`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=interleave#datasets.interleave_datasets) function.
|
||||
the [`interleave_datasets`](https://huggingface.co/docs/datasets/package_reference/main_classes?highlight=interleave#datasets.interleave_datasets) function.
|
||||
|
||||
In addition, participants can also make use of their audio data. Here, please make sure that you **are allowed to use the audio data**. E.g., if audio data
|
||||
is taken from media platforms, such as YouTube, it should be verified that the media platform and the owner of the data have given her/his approval to use the audio
|
||||
|
||||
@@ -277,7 +277,7 @@ def main():
|
||||
# Loading a dataset from local json files
|
||||
raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
|
||||
# See more about loading any type of standard or custom dataset at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Labels
|
||||
label_list = raw_datasets["train"].features["label"].names
|
||||
|
||||
@@ -317,7 +317,7 @@ def main():
|
||||
datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
|
||||
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -315,7 +315,7 @@ def main():
|
||||
datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
|
||||
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
|
||||
@@ -361,7 +361,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# 5. Load pretrained model, tokenizer, and image processor
|
||||
if model_args.tokenizer_name:
|
||||
|
||||
@@ -316,7 +316,7 @@ def main():
|
||||
task="image-classification",
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Prepare label mappings.
|
||||
# We'll include these in the model's config to get human readable labels in the Inference API.
|
||||
|
||||
@@ -371,7 +371,7 @@ def main():
|
||||
**dataset_args,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
# endregion
|
||||
|
||||
# region Load pretrained model and tokenizer
|
||||
|
||||
@@ -353,7 +353,7 @@ def main():
|
||||
)
|
||||
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
# endregion
|
||||
|
||||
# region Load pretrained model and tokenizer
|
||||
|
||||
@@ -338,7 +338,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# When using your own dataset or a different dataset from swag, you will probably need to change this.
|
||||
ending_names = [f"ending{i}" for i in range(4)]
|
||||
|
||||
@@ -352,7 +352,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
# endregion
|
||||
|
||||
# region Load pretrained model and tokenizer
|
||||
|
||||
@@ -401,7 +401,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
# endregion
|
||||
|
||||
# region Load model config and tokenizer
|
||||
|
||||
@@ -271,7 +271,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
is_regression = data_args.task_name == "stsb"
|
||||
if not is_regression:
|
||||
|
||||
@@ -290,7 +290,7 @@ def main():
|
||||
# Loading a dataset from local json files
|
||||
datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
|
||||
# See more about loading any type of standard or custom dataset at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
# endregion
|
||||
|
||||
# region Label preprocessing
|
||||
|
||||
@@ -269,7 +269,7 @@ def main():
|
||||
token=model_args.token,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
if raw_datasets["train"] is not None:
|
||||
column_names = raw_datasets["train"].column_names
|
||||
|
||||
Reference in New Issue
Block a user