Add token argument in example scripts (#25172)

* fix

* fix

* fix

* fix

* fix

* fix

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Yih-Dar
2023-08-02 11:17:31 +02:00
committed by GitHub
parent c6a8768dab
commit 149cb0cce2
43 changed files with 987 additions and 420 deletions

View File

@@ -229,15 +229,21 @@ class DataTrainingArguments:
)
},
)
use_auth_token: bool = field(
default=False,
token: str = field(
default=None,
metadata={
"help": (
"If :obj:`True`, will use the token generated when running"
":obj:`huggingface-cli login` as HTTP bearer authorization for remote files."
"The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
"generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
)
},
)
use_auth_token: bool = field(
default=None,
metadata={
"help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
},
)
unk_token: str = field(
default="[UNK]",
metadata={"help": "The unk token for the tokenizer"},
@@ -379,6 +385,12 @@ def main():
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
if data_args.use_auth_token is not None:
warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
if data_args.token is not None:
raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
data_args.token = data_args.use_auth_token
# Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
# information sent is the one passed as arguments along with your Python/PyTorch versions.
send_example_telemetry("run_speech_recognition_ctc", model_args, data_args)
@@ -427,7 +439,7 @@ def main():
data_args.dataset_name,
data_args.dataset_config_name,
split=data_args.train_split_name,
use_auth_token=data_args.use_auth_token,
token=data_args.token,
)
if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -452,7 +464,7 @@ def main():
data_args.dataset_name,
data_args.dataset_config_name,
split=data_args.eval_split_name,
use_auth_token=data_args.use_auth_token,
token=data_args.token,
)
if data_args.max_eval_samples is not None:
@@ -490,7 +502,9 @@ def main():
# the tokenizer
# load config
config = AutoConfig.from_pretrained(
model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
token=data_args.token,
)
# 4. Next, if no tokenizer file is defined,
@@ -546,11 +560,13 @@ def main():
# load feature_extractor and tokenizer
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name_or_path,
use_auth_token=data_args.use_auth_token,
token=data_args.token,
**tokenizer_kwargs,
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
token=data_args.token,
)
# adapt config
@@ -578,7 +594,7 @@ def main():
model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
config=config,
use_auth_token=data_args.use_auth_token,
token=data_args.token,
)
# freeze encoder

View File

@@ -232,15 +232,21 @@ class DataTrainingArguments:
)
},
)
use_auth_token: bool = field(
default=False,
token: str = field(
default=None,
metadata={
"help": (
"If :obj:`True`, will use the token generated when running"
":obj:`huggingface-cli login` as HTTP bearer authorization for remote files."
"The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
"generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
)
},
)
use_auth_token: bool = field(
default=None,
metadata={
"help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
},
)
unk_token: str = field(
default="[UNK]",
metadata={"help": "The unk token for the tokenizer"},
@@ -375,6 +381,12 @@ def main():
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
if data_args.use_auth_token is not None:
warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
if data_args.token is not None:
raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
data_args.token = data_args.use_auth_token
# Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
# information sent is the one passed as arguments along with your Python/PyTorch versions.
send_example_telemetry("run_speech_recognition_ctc_adapter", model_args, data_args)
@@ -423,7 +435,7 @@ def main():
data_args.dataset_name,
data_args.dataset_config_name,
split=data_args.train_split_name,
use_auth_token=data_args.use_auth_token,
token=data_args.token,
)
if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -448,7 +460,7 @@ def main():
data_args.dataset_name,
data_args.dataset_config_name,
split=data_args.eval_split_name,
use_auth_token=data_args.use_auth_token,
token=data_args.token,
)
if data_args.max_eval_samples is not None:
@@ -486,7 +498,9 @@ def main():
# the tokenizer
# load config
config = AutoConfig.from_pretrained(
model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
token=data_args.token,
)
# 4. Next, if no tokenizer file is defined,
@@ -500,7 +514,10 @@ def main():
vocab_dict = {}
if tokenizer_name_or_path is not None:
# load vocabulary of other adapter languages so that new language can be appended
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_auth_token=data_args.use_auth_token)
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name_or_path,
token=data_args.token,
)
vocab_dict = tokenizer.vocab.copy()
if tokenizer.target_lang is None:
raise ValueError("Make sure to load a multi-lingual tokenizer with a set target language.")
@@ -566,11 +583,13 @@ def main():
# load feature_extractor and tokenizer
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name_or_path,
use_auth_token=data_args.use_auth_token,
token=data_args.token,
**tokenizer_kwargs,
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
token=data_args.token,
)
# adapt config
@@ -595,7 +614,7 @@ def main():
model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
config=config,
use_auth_token=data_args.use_auth_token,
token=data_args.token,
ignore_mismatched_sizes=True,
)

View File

@@ -22,6 +22,7 @@ Fine-tuning the library models for sequence to sequence speech recognition.
import logging
import os
import sys
import warnings
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
@@ -85,15 +86,21 @@ class ModelArguments:
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
use_auth_token: bool = field(
default=False,
token: str = field(
default=None,
metadata={
"help": (
"Will use the token generated when running `huggingface-cli login` (necessary to use this script "
"with private models)."
"The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
"generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
)
},
)
use_auth_token: bool = field(
default=None,
metadata={
"help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
},
)
freeze_feature_encoder: bool = field(
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
)
@@ -278,6 +285,12 @@ def main():
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
if model_args.use_auth_token is not None:
warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
if model_args.token is not None:
raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
model_args.token = model_args.use_auth_token
# Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
# information sent is the one passed as arguments along with your Python/PyTorch versions.
send_example_telemetry("run_speech_recognition_seq2seq", model_args, data_args)
@@ -336,7 +349,7 @@ def main():
data_args.dataset_config_name,
split=data_args.train_split_name,
cache_dir=model_args.cache_dir,
use_auth_token=True if model_args.use_auth_token else None,
token=model_args.token,
)
if training_args.do_eval:
@@ -345,7 +358,7 @@ def main():
data_args.dataset_config_name,
split=data_args.eval_split_name,
cache_dir=model_args.cache_dir,
use_auth_token=True if model_args.use_auth_token else None,
token=model_args.token,
)
if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names:
@@ -370,7 +383,7 @@ def main():
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
token=True if model_args.use_auth_token else None,
token=model_args.token,
)
config.update({"forced_decoder_ids": model_args.forced_decoder_ids, "suppress_tokens": model_args.suppress_tokens})
@@ -383,21 +396,21 @@ def main():
model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
token=True if model_args.use_auth_token else None,
token=model_args.token,
)
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
token=True if model_args.use_auth_token else None,
token=model_args.token,
)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
model_args.model_name_or_path,
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
token=True if model_args.use_auth_token else None,
token=model_args.token,
)
if model.config.decoder_start_token_id is None: