Add token argument in example scripts (#25172)

* fix * fix * fix * fix * fix * fix * fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2023-08-02 11:17:31 +02:00
parent c6a8768dab
commit 149cb0cce2
43 changed files with 987 additions and 420 deletions
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -229,15 +229,21 @@ class DataTrainingArguments:
            )
        },
    )
-    use_auth_token: bool = field(
-        default=False,
+    token: str = field(
+        default=None,
        metadata={
            "help": (
-                "If :obj:`True`, will use the token generated when running"
-                ":obj:`huggingface-cli login` as HTTP bearer authorization for remote files."
+                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
+    use_auth_token: bool = field(
+        default=None,
+        metadata={
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+        },
+    )
    unk_token: str = field(
        default="[UNK]",
        metadata={"help": "The unk token for the tokenizer"},
@@ -379,6 +385,12 @@ def main():
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

+    if data_args.use_auth_token is not None:
+        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        if data_args.token is not None:
+            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
+        data_args.token = data_args.use_auth_token
+
    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_speech_recognition_ctc", model_args, data_args)
@@ -427,7 +439,7 @@ def main():
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=data_args.train_split_name,
-            use_auth_token=data_args.use_auth_token,
+            token=data_args.token,
        )

        if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -452,7 +464,7 @@ def main():
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=data_args.eval_split_name,
-            use_auth_token=data_args.use_auth_token,
+            token=data_args.token,
        )

        if data_args.max_eval_samples is not None:
@@ -490,7 +502,9 @@ def main():
    # the tokenizer
    # load config
    config = AutoConfig.from_pretrained(
-        model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
+        model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
    )

    # 4. Next, if no tokenizer file is defined,
@@ -546,11 +560,13 @@ def main():
    # load feature_extractor and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name_or_path,
-        use_auth_token=data_args.use_auth_token,
+        token=data_args.token,
        **tokenizer_kwargs,
    )
    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
+        model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
    )

    # adapt config
@@ -578,7 +594,7 @@ def main():
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        config=config,
-        use_auth_token=data_args.use_auth_token,
+        token=data_args.token,
    )

    # freeze encoder
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
@@ -232,15 +232,21 @@ class DataTrainingArguments:
            )
        },
    )
-    use_auth_token: bool = field(
-        default=False,
+    token: str = field(
+        default=None,
        metadata={
            "help": (
-                "If :obj:`True`, will use the token generated when running"
-                ":obj:`huggingface-cli login` as HTTP bearer authorization for remote files."
+                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
+    use_auth_token: bool = field(
+        default=None,
+        metadata={
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+        },
+    )
    unk_token: str = field(
        default="[UNK]",
        metadata={"help": "The unk token for the tokenizer"},
@@ -375,6 +381,12 @@ def main():
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

+    if data_args.use_auth_token is not None:
+        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        if data_args.token is not None:
+            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
+        data_args.token = data_args.use_auth_token
+
    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_speech_recognition_ctc_adapter", model_args, data_args)
@@ -423,7 +435,7 @@ def main():
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=data_args.train_split_name,
-            use_auth_token=data_args.use_auth_token,
+            token=data_args.token,
        )

        if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -448,7 +460,7 @@ def main():
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=data_args.eval_split_name,
-            use_auth_token=data_args.use_auth_token,
+            token=data_args.token,
        )

        if data_args.max_eval_samples is not None:
@@ -486,7 +498,9 @@ def main():
    # the tokenizer
    # load config
    config = AutoConfig.from_pretrained(
-        model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
+        model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
    )

    # 4. Next, if no tokenizer file is defined,
@@ -500,7 +514,10 @@ def main():
    vocab_dict = {}
    if tokenizer_name_or_path is not None:
        # load vocabulary of other adapter languages so that new language can be appended
-        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_auth_token=data_args.use_auth_token)
+        tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_name_or_path,
+            token=data_args.token,
+        )
        vocab_dict = tokenizer.vocab.copy()
        if tokenizer.target_lang is None:
            raise ValueError("Make sure to load a multi-lingual tokenizer with a set target language.")
@@ -566,11 +583,13 @@ def main():
    # load feature_extractor and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name_or_path,
-        use_auth_token=data_args.use_auth_token,
+        token=data_args.token,
        **tokenizer_kwargs,
    )
    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
+        model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
    )

    # adapt config
@@ -595,7 +614,7 @@ def main():
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        config=config,
-        use_auth_token=data_args.use_auth_token,
+        token=data_args.token,
        ignore_mismatched_sizes=True,
    )

--- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
@@ -22,6 +22,7 @@ Fine-tuning the library models for sequence to sequence speech recognition.
 import logging
 import os
 import sys
+import warnings
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Union

@@ -85,15 +86,21 @@ class ModelArguments:
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
-    use_auth_token: bool = field(
-        default=False,
+    token: str = field(
+        default=None,
        metadata={
            "help": (
-                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
-                "with private models)."
+                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
+    use_auth_token: bool = field(
+        default=None,
+        metadata={
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+        },
+    )
    freeze_feature_encoder: bool = field(
        default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
    )
@@ -278,6 +285,12 @@ def main():
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

+    if model_args.use_auth_token is not None:
+        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        if model_args.token is not None:
+            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
+        model_args.token = model_args.use_auth_token
+
    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_speech_recognition_seq2seq", model_args, data_args)
@@ -336,7 +349,7 @@ def main():
            data_args.dataset_config_name,
            split=data_args.train_split_name,
            cache_dir=model_args.cache_dir,
-            use_auth_token=True if model_args.use_auth_token else None,
+            token=model_args.token,
        )

    if training_args.do_eval:
@@ -345,7 +358,7 @@ def main():
            data_args.dataset_config_name,
            split=data_args.eval_split_name,
            cache_dir=model_args.cache_dir,
-            use_auth_token=True if model_args.use_auth_token else None,
+            token=model_args.token,
        )

    if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names:
@@ -370,7 +383,7 @@ def main():
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
-        token=True if model_args.use_auth_token else None,
+        token=model_args.token,
    )

    config.update({"forced_decoder_ids": model_args.forced_decoder_ids, "suppress_tokens": model_args.suppress_tokens})
@@ -383,21 +396,21 @@ def main():
        model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
-        token=True if model_args.use_auth_token else None,
+        token=model_args.token,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
-        token=True if model_args.use_auth_token else None,
+        token=model_args.token,
    )
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
-        token=True if model_args.use_auth_token else None,
+        token=model_args.token,
    )

    if model.config.decoder_start_token_id is None: