From 24a85cca61fda92b9376fe45da1dcb10c8853066 Mon Sep 17 00:00:00 2001 From: Karim Foda <35491698+KMFODA@users.noreply.github.com> Date: Mon, 4 Apr 2022 15:27:45 +0100 Subject: [PATCH] Add use_auth to load_datasets for private datasets to PT and TF examples (#16521) * fix formatting and remove use_auth * Add use_auth_token to Flax examples --- .../run_image_captioning_flax.py | 25 +++++++- .../flax/language-modeling/run_clm_flax.py | 58 +++++++++++++++--- .../flax/language-modeling/run_mlm_flax.py | 58 +++++++++++++++--- .../flax/language-modeling/run_t5_mlm_flax.py | 59 ++++++++++++++++--- examples/flax/question-answering/run_qa.py | 13 +++- .../summarization/run_summarization_flax.py | 53 ++++++++++++++--- .../flax/text-classification/run_flax_glue.py | 27 +++++++-- .../flax/token-classification/run_flax_ner.py | 12 +++- .../flax/vision/run_image_classification.py | 20 ++++++- .../run_audio_classification.py | 10 +++- .../contrastive-image-text/run_clip.py | 8 ++- .../run_image_classification.py | 1 + examples/pytorch/image-pretraining/run_mae.py | 1 + examples/pytorch/image-pretraining/run_mim.py | 1 + examples/pytorch/language-modeling/run_clm.py | 17 +++++- examples/pytorch/language-modeling/run_mlm.py | 16 ++++- examples/pytorch/language-modeling/run_plm.py | 9 ++- examples/pytorch/multiple-choice/run_swag.py | 14 ++++- examples/pytorch/question-answering/run_qa.py | 13 +++- .../question-answering/run_qa_beam_search.py | 13 +++- .../run_wav2vec2_pretraining_no_trainer.py | 5 +- .../run_speech_recognition_seq2seq.py | 10 +++- .../summarization/run_summarization.py | 12 +++- .../pytorch/text-classification/run_glue.py | 26 ++++++-- .../pytorch/text-classification/run_xnli.py | 30 ++++++++-- .../pytorch/token-classification/run_ner.py | 5 +- .../pytorch/translation/run_translation.py | 12 +++- .../tensorflow/language-modeling/run_clm.py | 15 ++++- .../tensorflow/language-modeling/run_mlm.py | 14 ++++- .../tensorflow/multiple-choice/run_swag.py | 14 ++++- .../tensorflow/question-answering/run_qa.py | 15 ++++- .../summarization/run_summarization.py | 12 +++- .../text-classification/run_glue.py | 7 ++- .../run_text_classification.py | 7 ++- .../token-classification/run_ner.py | 12 +++- .../tensorflow/translation/run_translation.py | 12 +++- 36 files changed, 544 insertions(+), 92 deletions(-) diff --git a/examples/flax/image-captioning/run_image_captioning_flax.py b/examples/flax/image-captioning/run_image_captioning_flax.py index b4b9afe0d3..b1c9012777 100644 --- a/examples/flax/image-captioning/run_image_captioning_flax.py +++ b/examples/flax/image-captioning/run_image_captioning_flax.py @@ -178,6 +178,13 @@ class ModelArguments: "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`." }, ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) @dataclass @@ -418,6 +425,7 @@ def main(): cache_dir=model_args.cache_dir, keep_in_memory=False, data_dir=data_args.data_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -430,7 +438,12 @@ def main(): if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] - dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + dataset = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. @@ -439,12 +452,18 @@ def main(): model_args.model_name_or_path, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype), + use_auth_token=True if model_args.use_auth_token else None, ) feature_extractor = AutoFeatureExtractor.from_pretrained( - model_args.model_name_or_path, cache_dir=model_args.cache_dir + model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( - model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer + model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + use_auth_token=True if model_args.use_auth_token else None, ) tokenizer.pad_token = tokenizer.convert_ids_to_tokens(model.config.pad_token_id) diff --git a/examples/flax/language-modeling/run_clm_flax.py b/examples/flax/language-modeling/run_clm_flax.py index 82a9757d5c..afb6d75b38 100755 --- a/examples/flax/language-modeling/run_clm_flax.py +++ b/examples/flax/language-modeling/run_clm_flax.py @@ -165,6 +165,13 @@ class ModelArguments: "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`." }, ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) @dataclass @@ -363,7 +370,11 @@ def main(): if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. dataset = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + keep_in_memory=False, + use_auth_token=True if model_args.use_auth_token else None, ) if "validation" not in dataset.keys(): @@ -372,12 +383,14 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) dataset["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -390,7 +403,13 @@ def main(): if extension == "txt": extension = "text" dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) + dataset = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + **dataset_args, + use_auth_token=True if model_args.use_auth_token else None, + ) if "validation" not in dataset.keys(): dataset["validation"] = load_dataset( @@ -399,6 +418,7 @@ def main(): split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, **dataset_args, + use_auth_token=True if model_args.use_auth_token else None, ) dataset["train"] = load_dataset( extension, @@ -406,6 +426,7 @@ def main(): split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, **dataset_args, + use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. @@ -416,20 +437,34 @@ def main(): # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) + config = AutoConfig.from_pretrained( + model_args.config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) + config = AutoConfig.from_pretrained( + model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning("You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer + model_args.tokenizer_name, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + use_auth_token=True if model_args.use_auth_token else None, ) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( - model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer + model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + use_auth_token=True if model_args.use_auth_token else None, ) else: raise ValueError( @@ -439,11 +474,18 @@ def main(): if model_args.model_name_or_path: model = FlaxAutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype) + model_args.model_name_or_path, + config=config, + seed=training_args.seed, + dtype=getattr(jnp, model_args.dtype), + use_auth_token=True if model_args.use_auth_token else None, ) else: model = FlaxAutoModelForCausalLM.from_config( - config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype) + config, + seed=training_args.seed, + dtype=getattr(jnp, model_args.dtype), + use_auth_token=True if model_args.use_auth_token else None, ) # Preprocessing the datasets. diff --git a/examples/flax/language-modeling/run_mlm_flax.py b/examples/flax/language-modeling/run_mlm_flax.py index daa247ecaa..6ea0f6e156 100755 --- a/examples/flax/language-modeling/run_mlm_flax.py +++ b/examples/flax/language-modeling/run_mlm_flax.py @@ -163,6 +163,13 @@ class ModelArguments: "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`." }, ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) @dataclass @@ -396,7 +403,12 @@ def main(): # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. - datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) + datasets = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) if "validation" not in datasets.keys(): datasets["validation"] = load_dataset( @@ -404,12 +416,14 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -420,7 +434,12 @@ def main(): extension = data_args.train_file.split(".")[-1] if extension == "txt": extension = "text" - datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + datasets = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) if "validation" not in datasets.keys(): datasets["validation"] = load_dataset( @@ -428,12 +447,14 @@ def main(): data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) datasets["train"] = load_dataset( extension, data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. @@ -444,20 +465,34 @@ def main(): # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) + config = AutoConfig.from_pretrained( + model_args.config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) + config = AutoConfig.from_pretrained( + model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning("You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer + model_args.tokenizer_name, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + use_auth_token=True if model_args.use_auth_token else None, ) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( - model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer + model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + use_auth_token=True if model_args.use_auth_token else None, ) else: raise ValueError( @@ -572,11 +607,18 @@ def main(): if model_args.model_name_or_path: model = FlaxAutoModelForMaskedLM.from_pretrained( - model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype) + model_args.model_name_or_path, + config=config, + seed=training_args.seed, + dtype=getattr(jnp, model_args.dtype), + use_auth_token=True if model_args.use_auth_token else None, ) else: model = FlaxAutoModelForMaskedLM.from_config( - config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype) + config, + seed=training_args.seed, + dtype=getattr(jnp, model_args.dtype), + use_auth_token=True if model_args.use_auth_token else None, ) # Store some constant diff --git a/examples/flax/language-modeling/run_t5_mlm_flax.py b/examples/flax/language-modeling/run_t5_mlm_flax.py index 622f11f5de..5b1067cd99 100755 --- a/examples/flax/language-modeling/run_t5_mlm_flax.py +++ b/examples/flax/language-modeling/run_t5_mlm_flax.py @@ -162,6 +162,13 @@ class ModelArguments: "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`." }, ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) @dataclass @@ -525,7 +532,12 @@ def main(): # 'text' is found. You can easily tweak this behavior (see below). if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. - datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) + datasets = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) if "validation" not in datasets.keys(): datasets["validation"] = load_dataset( @@ -533,12 +545,14 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -549,7 +563,12 @@ def main(): extension = data_args.train_file.split(".")[-1] if extension == "txt": extension = "text" - datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + datasets = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) if "validation" not in datasets.keys(): datasets["validation"] = load_dataset( @@ -557,12 +576,14 @@ def main(): data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) datasets["train"] = load_dataset( extension, data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. @@ -571,11 +592,17 @@ def main(): if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer + model_args.tokenizer_name, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + use_auth_token=True if model_args.use_auth_token else None, ) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( - model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer + model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + use_auth_token=True if model_args.use_auth_token else None, ) else: raise ValueError( @@ -585,10 +612,17 @@ def main(): if model_args.config_name: config = T5Config.from_pretrained( - model_args.config_name, cache_dir=model_args.cache_dir, vocab_size=len(tokenizer) + model_args.config_name, + cache_dir=model_args.cache_dir, + vocab_size=len(tokenizer), + use_auth_token=True if model_args.use_auth_token else None, ) elif model_args.model_name_or_path: - config = T5Config.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) + config = T5Config.from_pretrained( + model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning("You are instantiating a new config instance from scratch.") @@ -678,11 +712,20 @@ def main(): if model_args.model_name_or_path: model = FlaxT5ForConditionalGeneration.from_pretrained( - model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype) + model_args.model_name_or_path, + config=config, + seed=training_args.seed, + dtype=getattr(jnp, model_args.dtype), + use_auth_token=True if model_args.use_auth_token else None, ) else: config.vocab_size = len(tokenizer) - model = FlaxT5ForConditionalGeneration(config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)) + model = FlaxT5ForConditionalGeneration( + config, + seed=training_args.seed, + dtype=getattr(jnp, model_args.dtype), + use_auth_token=True if model_args.use_auth_token else None, + ) # Data collator # This one will take care of randomly masking the tokens. diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index a15cca6607..6ab150a762 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -448,7 +448,10 @@ def main(): if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: # Loading the dataset from local csv or json file. @@ -463,7 +466,13 @@ def main(): if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + extension, + data_files=data_files, + field="data", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # endregion diff --git a/examples/flax/summarization/run_summarization_flax.py b/examples/flax/summarization/run_summarization_flax.py index effe3b5883..3ebff73b98 100644 --- a/examples/flax/summarization/run_summarization_flax.py +++ b/examples/flax/summarization/run_summarization_flax.py @@ -176,6 +176,13 @@ class ModelArguments: "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`." }, ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) @dataclass @@ -421,7 +428,11 @@ def main(): if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. dataset = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + keep_in_memory=False, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -434,27 +445,46 @@ def main(): if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] - dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + dataset = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) + config = AutoConfig.from_pretrained( + model_args.config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) + config = AutoConfig.from_pretrained( + model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning("You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer + model_args.tokenizer_name, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + use_auth_token=True if model_args.use_auth_token else None, ) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( - model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer + model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + use_auth_token=True if model_args.use_auth_token else None, ) else: raise ValueError( @@ -464,11 +494,18 @@ def main(): if model_args.model_name_or_path: model = FlaxAutoModelForSeq2SeqLM.from_pretrained( - model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype) + model_args.model_name_or_path, + config=config, + seed=training_args.seed, + dtype=getattr(jnp, model_args.dtype), + use_auth_token=True if model_args.use_auth_token else None, ) else: model = FlaxAutoModelForSeq2SeqLM.from_config( - config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype) + config, + seed=training_args.seed, + dtype=getattr(jnp, model_args.dtype), + use_auth_token=True if model_args.use_auth_token else None, ) if model.config.decoder_start_token_id is None: diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index d56d23d273..06f9caba89 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -337,7 +337,11 @@ def main(): # download the dataset. if data_args.task_name is not None: # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset("glue", data_args.task_name) + raw_datasets = load_dataset( + "glue", + data_args.task_name, + use_auth_token=True if model_args.use_auth_token else None, + ) else: # Loading the dataset from local csv or json file. data_files = {} @@ -346,7 +350,11 @@ def main(): if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = (data_args.train_file if data_args.train_file is not None else data_args.valid_file).split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files) + raw_datasets = load_dataset( + extension, + data_files=data_files, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. @@ -372,12 +380,21 @@ def main(): # Load pretrained model and tokenizer config = AutoConfig.from_pretrained( - model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name + model_args.model_name_or_path, + num_labels=num_labels, + finetuning_task=data_args.task_name, + use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( - model_args.model_name_or_path, use_fast=not model_args.use_slow_tokenizer + model_args.model_name_or_path, + use_fast=not model_args.use_slow_tokenizer, + use_auth_token=True if model_args.use_auth_token else None, + ) + model = FlaxAutoModelForSequenceClassification.from_pretrained( + model_args.model_name_or_path, + config=config, + use_auth_token=True if model_args.use_auth_token else None, ) - model = FlaxAutoModelForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config) # Preprocessing the datasets if data_args.task_name is not None: diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index abf1b8d0c1..32f0104b89 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -391,7 +391,10 @@ def main(): if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: # Loading the dataset from local csv or json file. @@ -401,7 +404,12 @@ def main(): if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = (data_args.train_file if data_args.train_file is not None else data_args.valid_file).split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/flax/vision/run_image_classification.py b/examples/flax/vision/run_image_classification.py index 7459d24c63..0dc7b2f957 100644 --- a/examples/flax/vision/run_image_classification.py +++ b/examples/flax/vision/run_image_classification.py @@ -154,6 +154,13 @@ class ModelArguments: "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`." }, ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) @dataclass @@ -315,6 +322,7 @@ def main(): num_labels=len(train_dataset.classes), image_size=data_args.image_size, cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained( @@ -322,6 +330,7 @@ def main(): num_labels=len(train_dataset.classes), image_size=data_args.image_size, cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: config = CONFIG_MAPPING[model_args.model_type]() @@ -329,11 +338,18 @@ def main(): if model_args.model_name_or_path: model = FlaxAutoModelForImageClassification.from_pretrained( - model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype) + model_args.model_name_or_path, + config=config, + seed=training_args.seed, + dtype=getattr(jnp, model_args.dtype), + use_auth_token=True if model_args.use_auth_token else None, ) else: model = FlaxAutoModelForImageClassification.from_config( - config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype) + config, + seed=training_args.seed, + dtype=getattr(jnp, model_args.dtype), + use_auth_token=True if model_args.use_auth_token else None, ) # Store some constant diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index 14c0a026fd..c0eb755b6a 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -227,10 +227,16 @@ def main(): # Initialize our dataset and prepare it for the audio classification task. raw_datasets = DatasetDict() raw_datasets["train"] = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name + data_args.dataset_name, + data_args.dataset_config_name, + split=data_args.train_split_name, + use_auth_token=True if model_args.use_auth_token else None, ) raw_datasets["eval"] = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name + data_args.dataset_name, + data_args.dataset_config_name, + split=data_args.eval_split_name, + use_auth_token=True if model_args.use_auth_token else None, ) if data_args.audio_column_name not in raw_datasets["train"].column_names: diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index 79fd123064..02f2093687 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -276,6 +276,7 @@ def main(): cache_dir=model_args.cache_dir, keep_in_memory=False, data_dir=data_args.data_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -288,7 +289,12 @@ def main(): if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] - dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + dataset = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index b7de0f5f7b..fef52c4bf5 100644 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -207,6 +207,7 @@ def main(): data_files=data_args.data_files, cache_dir=model_args.cache_dir, task="image-classification", + use_auth_token=True if model_args.use_auth_token else None, ) # If we don't have a validation split, split off a percentage of train as validation. diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index 3b634d6918..e2182ec783 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -207,6 +207,7 @@ def main(): data_args.dataset_config_name, data_files=data_args.data_files, cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) # If we don't have a validation split, split off a percentage of train as validation. diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 0377a505e0..323c384895 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -266,6 +266,7 @@ def main(): data_args.dataset_config_name, data_files=data_args.data_files, cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) # If we don't have a validation split, split off a percentage of train as validation. diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index a1cdcf9ee4..3d2af72cca 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -254,7 +254,10 @@ def main(): if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -262,12 +265,14 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -284,7 +289,13 @@ def main(): if extension == "txt": extension = "text" dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) + raw_datasets = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + **dataset_args, + ) # If no validation data is there, validation_split_percentage will be used to divide the dataset. if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -292,6 +303,7 @@ def main(): data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, **dataset_args, ) raw_datasets["train"] = load_dataset( @@ -299,6 +311,7 @@ def main(): data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, **dataset_args, ) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 6ea3c2c934..f829e86781 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -263,7 +263,10 @@ def main(): if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -271,12 +274,14 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -288,7 +293,12 @@ def main(): extension = data_args.validation_file.split(".")[-1] if extension == "txt": extension = "text" - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # If no validation data is there, validation_split_percentage will be used to divide the dataset. if "validation" not in raw_datasets.keys(): @@ -297,12 +307,14 @@ def main(): data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) raw_datasets["train"] = load_dataset( extension, data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index d1c09896d8..cc4ad60232 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -256,7 +256,10 @@ def main(): if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -264,12 +267,14 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -288,12 +293,14 @@ def main(): data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) raw_datasets["train"] = load_dataset( extension, data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index 01c9e8bcf7..4578e4570a 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -269,10 +269,20 @@ def main(): if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) else: # Downloading and loading the swag dataset from the hub. - raw_datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + "swag", + "regular", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 67aaf1d84f..90d199b14d 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -262,7 +262,10 @@ def main(): if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -276,7 +279,13 @@ def main(): if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + extension, + data_files=data_files, + field="data", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 4c79be08b9..96aa07a808 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -260,7 +260,10 @@ def main(): if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -273,7 +276,13 @@ def main(): if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + extension, + data_files=data_files, + field="data", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py index 51ac519118..88021a4285 100755 --- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py +++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py @@ -403,7 +403,10 @@ def main(): for dataset_config_name, train_split_name in zip(args.dataset_config_names, args.dataset_split_names): # load dataset dataset_split = load_dataset( - args.dataset_name, dataset_config_name, split=train_split_name, cache_dir=args.cache_dir + args.dataset_name, + dataset_config_name, + split=train_split_name, + cache_dir=args.cache_dir, ) datasets_splits.append(dataset_split) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 695a5b24fd..46d4785fa8 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -278,12 +278,18 @@ def main(): if training_args.do_train: raw_datasets["train"] = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name + data_args.dataset_name, + data_args.dataset_config_name, + split=data_args.train_split_name, + use_auth_token=True if model_args.use_auth_token else None, ) if training_args.do_eval: raw_datasets["eval"] = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name + data_args.dataset_name, + data_args.dataset_config_name, + split=data_args.eval_split_name, + use_auth_token=True if model_args.use_auth_token else None, ) if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names: diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index 66aeb981bd..7b39cb8e48 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -341,7 +341,10 @@ def main(): if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -354,7 +357,12 @@ def main(): if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 88be878fae..a0730f6098 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -252,11 +252,19 @@ def main(): # download the dataset. if data_args.task_name is not None: # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + "glue", + data_args.task_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) elif data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: # Loading a dataset from your local files. @@ -281,10 +289,20 @@ def main(): if data_args.train_file.endswith(".csv"): # Loading a dataset from local csv files - raw_datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + "csv", + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) else: # Loading a dataset from local json files - raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + "json", + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index f54b1ec2aa..4a17a5d702 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -213,19 +213,41 @@ def main(): # Downloading and loading xnli dataset from the hub. if training_args.do_train: if model_args.train_language is None: - train_dataset = load_dataset("xnli", model_args.language, split="train", cache_dir=model_args.cache_dir) + train_dataset = load_dataset( + "xnli", + model_args.language, + split="train", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) else: train_dataset = load_dataset( - "xnli", model_args.train_language, split="train", cache_dir=model_args.cache_dir + "xnli", + model_args.train_language, + split="train", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) label_list = train_dataset.features["label"].names if training_args.do_eval: - eval_dataset = load_dataset("xnli", model_args.language, split="validation", cache_dir=model_args.cache_dir) + eval_dataset = load_dataset( + "xnli", + model_args.language, + split="validation", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) label_list = eval_dataset.features["label"].names if training_args.do_predict: - predict_dataset = load_dataset("xnli", model_args.language, split="test", cache_dir=model_args.cache_dir) + predict_dataset = load_dataset( + "xnli", + model_args.language, + split="test", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) label_list = predict_dataset.features["label"].names # Labels diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 9ff64b3797..5545b35862 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -249,7 +249,10 @@ def main(): if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index b458a3f0cd..f7e98276dc 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -306,7 +306,10 @@ def main(): if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -319,7 +322,12 @@ def main(): if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py index 4cbc00b3cd..84e71efe50 100755 --- a/examples/tensorflow/language-modeling/run_clm.py +++ b/examples/tensorflow/language-modeling/run_clm.py @@ -280,17 +280,23 @@ def main(): # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) + raw_datasets = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + use_auth_token=True if model_args.use_auth_token else None, + ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", + use_auth_token=True if model_args.use_auth_token else None, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -303,7 +309,12 @@ def main(): if extension == "txt": extension = "text" dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args) + raw_datasets = load_dataset( + extension, + data_files=data_files, + use_auth_token=True if model_args.use_auth_token else None, + **dataset_args, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # endregion diff --git a/examples/tensorflow/language-modeling/run_mlm.py b/examples/tensorflow/language-modeling/run_mlm.py index 44c5d23031..8b32070b2d 100755 --- a/examples/tensorflow/language-modeling/run_mlm.py +++ b/examples/tensorflow/language-modeling/run_mlm.py @@ -292,17 +292,23 @@ def main(): # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) + raw_datasets = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + use_auth_token=True if model_args.use_auth_token else None, + ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", + use_auth_token=True if model_args.use_auth_token else None, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -313,7 +319,11 @@ def main(): extension = data_args.train_file.split(".")[-1] if extension == "txt": extension = "text" - raw_datasets = load_dataset(extension, data_files=data_files) + raw_datasets = load_dataset( + extension, + data_files=data_files, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index e14815cf81..2c78ab39fa 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -290,10 +290,20 @@ def main(): if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) else: # Downloading and loading the swag dataset from the hub. - raw_datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + "swag", + "regular", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index 50e8c7f50d..891219d3a1 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -278,7 +278,12 @@ def main(): # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. - datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) + datasets = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) else: data_files = {} if data_args.train_file is not None: @@ -291,7 +296,13 @@ def main(): if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] - datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) + datasets = load_dataset( + extension, + data_files=data_files, + field="data", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # endregion diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index e40c763530..09aa8f90de 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -391,7 +391,10 @@ def main(): if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -404,7 +407,12 @@ def main(): if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # endregion diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index 03d7df675b..fa8cb98a5a 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -236,7 +236,12 @@ def main(): # Downloading and loading a dataset from the hub. In distributed training, the load_dataset function guarantee # that only one local process can concurrently download the dataset. - datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir) + datasets = load_dataset( + "glue", + data_args.task_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/tensorflow/text-classification/run_text_classification.py b/examples/tensorflow/text-classification/run_text_classification.py index 114caacdbf..3f3d64b623 100644 --- a/examples/tensorflow/text-classification/run_text_classification.py +++ b/examples/tensorflow/text-classification/run_text_classification.py @@ -236,7 +236,12 @@ def main(): if data_args.input_file_extension == "csv": # Loading a dataset from local csv files - datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) + datasets = load_dataset( + "csv", + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) else: # Loading a dataset from local json files datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) diff --git a/examples/tensorflow/token-classification/run_ner.py b/examples/tensorflow/token-classification/run_ner.py index acb7285566..e580ed94b0 100644 --- a/examples/tensorflow/token-classification/run_ner.py +++ b/examples/tensorflow/token-classification/run_ner.py @@ -266,7 +266,11 @@ def main(): # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) + raw_datasets = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + use_auth_token=True if model_args.use_auth_token else None, + ) else: data_files = {} if data_args.train_file is not None: @@ -274,7 +278,11 @@ def main(): if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files) + raw_datasets = load_dataset( + extension, + data_files=data_files, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index fce150b712..c6921bbf3c 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -347,7 +347,10 @@ def main(): if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} @@ -357,7 +360,12 @@ def main(): if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + raw_datasets = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # endregion