Update examples after ruff was updated (#36972)
* update
* update

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -154,7 +154,7 @@ jobs:
|
|||||||
path: ~/transformers/installed.txt
|
path: ~/transformers/installed.txt
|
||||||
- run: python -c "from transformers import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)
|
- run: python -c "from transformers import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)
|
||||||
- run: ruff check examples tests src utils
|
- run: ruff check examples tests src utils
|
||||||
- run: ruff format tests src utils --check
|
- run: ruff format examples tests src utils --check
|
||||||
- run: python utils/custom_init_isort.py --check_only
|
- run: python utils/custom_init_isort.py --check_only
|
||||||
- run: python utils/sort_auto_mappings.py --check_only
|
- run: python utils/sort_auto_mappings.py --check_only
|
||||||
- run: python utils/check_doc_toc.py
|
- run: python utils/check_doc_toc.py
|
||||||
|
|||||||
@@ -53,4 +53,4 @@ for _ in range(nbenchmark):
|
|||||||
func()
|
func()
|
||||||
end = time.time()
|
end = time.time()
|
||||||
print(end - start)
|
print(end - start)
|
||||||
print(f"Throughput: {((nbenchmark * BS)/(end-start)):.3f} examples/sec")
|
print(f"Throughput: {((nbenchmark * BS) / (end - start)):.3f} examples/sec")
|
||||||
|
|||||||
@@ -231,9 +231,9 @@ def main():
|
|||||||
|
|
||||||
# set decoder_start_token_id for MBart
|
# set decoder_start_token_id for MBart
|
||||||
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
|
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
|
||||||
assert (
|
assert data_args.tgt_lang is not None and data_args.src_lang is not None, (
|
||||||
data_args.tgt_lang is not None and data_args.src_lang is not None
|
"mBart requires --tgt_lang and --src_lang"
|
||||||
), "mBart requires --tgt_lang and --src_lang"
|
)
|
||||||
if isinstance(tokenizer, MBartTokenizer):
|
if isinstance(tokenizer, MBartTokenizer):
|
||||||
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.tgt_lang]
|
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.tgt_lang]
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -128,7 +128,7 @@ def run_search():
|
|||||||
|
|
||||||
results_sorted = sorted(results, key=operator.itemgetter(*task_score_names[task]), reverse=True)
|
results_sorted = sorted(results, key=operator.itemgetter(*task_score_names[task]), reverse=True)
|
||||||
print(" | ".join([f"{col:{col_widths[col]}}" for col in col_names]))
|
print(" | ".join([f"{col:{col_widths[col]}}" for col in col_names]))
|
||||||
print(" | ".join([f"{'-'*col_widths[col]}" for col in col_names]))
|
print(" | ".join([f"{'-' * col_widths[col]}" for col in col_names]))
|
||||||
for row in results_sorted:
|
for row in results_sorted:
|
||||||
print(" | ".join([f"{row[col]:{col_widths[col]}}" for col in col_names]))
|
print(" | ".join([f"{row[col]:{col_widths[col]}}" for col in col_names]))
|
||||||
|
|
||||||
|
|||||||
@@ -282,9 +282,9 @@ class Seq2SeqDataCollator:
|
|||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
self.pad_token_id = tokenizer.pad_token_id
|
self.pad_token_id = tokenizer.pad_token_id
|
||||||
self.decoder_start_token_id = decoder_start_token_id
|
self.decoder_start_token_id = decoder_start_token_id
|
||||||
assert (
|
assert self.pad_token_id is not None, (
|
||||||
self.pad_token_id is not None
|
f"pad_token_id is not defined for ({self.tokenizer.__class__.__name__}), it must be defined."
|
||||||
), f"pad_token_id is not defined for ({self.tokenizer.__class__.__name__}), it must be defined."
|
)
|
||||||
self.data_args = data_args
|
self.data_args = data_args
|
||||||
self.tpu_num_cores = tpu_num_cores
|
self.tpu_num_cores = tpu_num_cores
|
||||||
self.dataset_kwargs = {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {}
|
self.dataset_kwargs = {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {}
|
||||||
@@ -593,7 +593,7 @@ def assert_all_frozen(model):
|
|||||||
model_grads: List[bool] = list(grad_status(model))
|
model_grads: List[bool] = list(grad_status(model))
|
||||||
n_require_grad = sum(lmap(int, model_grads))
|
n_require_grad = sum(lmap(int, model_grads))
|
||||||
npars = len(model_grads)
|
npars = len(model_grads)
|
||||||
assert not any(model_grads), f"{n_require_grad/npars:.1%} of {npars} weights require grad"
|
assert not any(model_grads), f"{n_require_grad / npars:.1%} of {npars} weights require grad"
|
||||||
|
|
||||||
|
|
||||||
def assert_not_all_frozen(model):
|
def assert_not_all_frozen(model):
|
||||||
|
|||||||
@@ -131,7 +131,7 @@ class POS(TokenClassificationTask):
|
|||||||
s_p = preds_list[example_id]
|
s_p = preds_list[example_id]
|
||||||
out = ""
|
out = ""
|
||||||
for token in sentence:
|
for token in sentence:
|
||||||
out += f'{token["form"]} ({token["upos"]}|{s_p.pop(0)}) '
|
out += f"{token['form']} ({token['upos']}|{s_p.pop(0)}) "
|
||||||
out += "\n"
|
out += "\n"
|
||||||
writer.write(out)
|
writer.write(out)
|
||||||
example_id += 1
|
example_id += 1
|
||||||
|
|||||||
@@ -534,7 +534,7 @@ class Multimodal2VisionEmbeddings(nn.Module):
|
|||||||
batch_size, _, height, width = pixel_values.shape
|
batch_size, _, height, width = pixel_values.shape
|
||||||
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
|
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
|
f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
|
||||||
)
|
)
|
||||||
target_dtype = self.patch_embedding.weight.dtype
|
target_dtype = self.patch_embedding.weight.dtype
|
||||||
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
|
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
|
||||||
|
|||||||
@@ -438,7 +438,7 @@ def main():
|
|||||||
else:
|
else:
|
||||||
model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)
|
model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)
|
||||||
n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
|
n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
|
||||||
logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
|
logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params")
|
||||||
|
|
||||||
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
|
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
|
||||||
# on a small vocab and want a smaller embedding size, remove this test.
|
# on a small vocab and want a smaller embedding size, remove this test.
|
||||||
|
|||||||
@@ -265,8 +265,7 @@ class DataTrainingArguments:
|
|||||||
default="<fim_pad>",
|
default="<fim_pad>",
|
||||||
metadata={
|
metadata={
|
||||||
"help": (
|
"help": (
|
||||||
"Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True. "
|
"Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True. Defaults to '<fim_pad>'."
|
||||||
"Defaults to '<fim_pad>'."
|
|
||||||
)
|
)
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
@@ -514,7 +513,7 @@ def main():
|
|||||||
attn_implementation=model_args.attn_implementation,
|
attn_implementation=model_args.attn_implementation,
|
||||||
)
|
)
|
||||||
n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
|
n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
|
||||||
logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
|
logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params")
|
||||||
|
|
||||||
# Add the new FIM tokens to the tokenizer and resize model's vocab embeddings
|
# Add the new FIM tokens to the tokenizer and resize model's vocab embeddings
|
||||||
special_tokens = [data_args.fim_prefix_token, data_args.fim_middle_token, data_args.fim_suffix_token]
|
special_tokens = [data_args.fim_prefix_token, data_args.fim_middle_token, data_args.fim_suffix_token]
|
||||||
|
|||||||
@@ -234,9 +234,7 @@ def parse_args():
|
|||||||
"--fim_pad_token",
|
"--fim_pad_token",
|
||||||
type=str,
|
type=str,
|
||||||
default="<fim_pad>",
|
default="<fim_pad>",
|
||||||
help=(
|
help=("Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True. Defaults to '<fim_pad>'."),
|
||||||
"Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True." " Defaults to '<fim_pad>'."
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--preprocessing_num_workers",
|
"--preprocessing_num_workers",
|
||||||
|
|||||||
@@ -491,7 +491,7 @@ def main():
|
|||||||
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
||||||
# that could be easily picked up by the model
|
# that could be easily picked up by the model
|
||||||
chars_to_ignore_regex = (
|
chars_to_ignore_regex = (
|
||||||
f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
|
f"[{''.join(data_args.chars_to_ignore)}]" if data_args.chars_to_ignore is not None else None
|
||||||
)
|
)
|
||||||
text_column_name = data_args.text_column_name
|
text_column_name = data_args.text_column_name
|
||||||
|
|
||||||
|
|||||||
@@ -471,7 +471,7 @@ def main():
|
|||||||
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
||||||
# that could be easily picked up by the model
|
# that could be easily picked up by the model
|
||||||
chars_to_ignore_regex = (
|
chars_to_ignore_regex = (
|
||||||
f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
|
f"[{''.join(data_args.chars_to_ignore)}]" if data_args.chars_to_ignore is not None else None
|
||||||
)
|
)
|
||||||
text_column_name = data_args.text_column_name
|
text_column_name = data_args.text_column_name
|
||||||
|
|
||||||
|
|||||||
@@ -505,9 +505,9 @@ def main():
|
|||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
|
if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
|
||||||
assert (
|
assert data_args.lang is not None, (
|
||||||
data_args.lang is not None
|
f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
|
||||||
), f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
|
)
|
||||||
|
|
||||||
tokenizer.src_lang = data_args.lang
|
tokenizer.src_lang = data_args.lang
|
||||||
tokenizer.tgt_lang = data_args.lang
|
tokenizer.tgt_lang = data_args.lang
|
||||||
|
|||||||
@@ -199,9 +199,9 @@ class DataTrainingArguments:
|
|||||||
train_extension = self.train_file.split(".")[-1]
|
train_extension = self.train_file.split(".")[-1]
|
||||||
assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
|
assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
|
||||||
validation_extension = self.validation_file.split(".")[-1]
|
validation_extension = self.validation_file.split(".")[-1]
|
||||||
assert (
|
assert validation_extension == train_extension, (
|
||||||
validation_extension == train_extension
|
"`validation_file` should have the same extension (csv or json) as `train_file`."
|
||||||
), "`validation_file` should have the same extension (csv or json) as `train_file`."
|
)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -357,9 +357,9 @@ def main():
|
|||||||
if data_args.test_file is not None:
|
if data_args.test_file is not None:
|
||||||
train_extension = data_args.train_file.split(".")[-1]
|
train_extension = data_args.train_file.split(".")[-1]
|
||||||
test_extension = data_args.test_file.split(".")[-1]
|
test_extension = data_args.test_file.split(".")[-1]
|
||||||
assert (
|
assert test_extension == train_extension, (
|
||||||
test_extension == train_extension
|
"`test_file` should have the same extension (csv or json) as `train_file`."
|
||||||
), "`test_file` should have the same extension (csv or json) as `train_file`."
|
)
|
||||||
data_files["test"] = data_args.test_file
|
data_files["test"] = data_args.test_file
|
||||||
else:
|
else:
|
||||||
raise ValueError("Need either a dataset name or a test file for `do_predict`.")
|
raise ValueError("Need either a dataset name or a test file for `do_predict`.")
|
||||||
|
|||||||
@@ -156,9 +156,9 @@ class DataTrainingArguments:
|
|||||||
train_extension = self.train_file.split(".")[-1]
|
train_extension = self.train_file.split(".")[-1]
|
||||||
assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
|
assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
|
||||||
validation_extension = self.validation_file.split(".")[-1]
|
validation_extension = self.validation_file.split(".")[-1]
|
||||||
assert (
|
assert validation_extension == train_extension, (
|
||||||
validation_extension == train_extension
|
"`validation_file` should have the same extension (csv or json) as `train_file`."
|
||||||
), "`validation_file` should have the same extension (csv or json) as `train_file`."
|
)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -313,9 +313,9 @@ def main():
|
|||||||
if data_args.test_file is not None:
|
if data_args.test_file is not None:
|
||||||
train_extension = data_args.train_file.split(".")[-1]
|
train_extension = data_args.train_file.split(".")[-1]
|
||||||
test_extension = data_args.test_file.split(".")[-1]
|
test_extension = data_args.test_file.split(".")[-1]
|
||||||
assert (
|
assert test_extension == train_extension, (
|
||||||
test_extension == train_extension
|
"`test_file` should have the same extension (csv or json) as `train_file`."
|
||||||
), "`test_file` should have the same extension (csv or json) as `train_file`."
|
)
|
||||||
data_files["test"] = data_args.test_file
|
data_files["test"] = data_args.test_file
|
||||||
else:
|
else:
|
||||||
raise ValueError("Need either a GLUE task or a test file for `do_predict`.")
|
raise ValueError("Need either a GLUE task or a test file for `do_predict`.")
|
||||||
|
|||||||
@@ -322,7 +322,7 @@ def main():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--use_cpu",
|
"--use_cpu",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available",
|
help="Whether or not to use cpu. If set to False, we will use gpu/npu or mps device if available",
|
||||||
)
|
)
|
||||||
parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.")
|
parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ def main():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--use_cpu",
|
"--use_cpu",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available",
|
help="Whether or not to use cpu. If set to False, we will use gpu/npu or mps device if available",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--fp16",
|
"--fp16",
|
||||||
|
|||||||
@@ -436,9 +436,9 @@ def main():
|
|||||||
|
|
||||||
# Set decoder_start_token_id
|
# Set decoder_start_token_id
|
||||||
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
|
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
|
||||||
assert (
|
assert args.target_lang is not None and args.source_lang is not None, (
|
||||||
args.target_lang is not None and args.source_lang is not None
|
"mBart requires --target_lang and --source_lang"
|
||||||
), "mBart requires --target_lang and --source_lang"
|
)
|
||||||
if isinstance(tokenizer, MBartTokenizer):
|
if isinstance(tokenizer, MBartTokenizer):
|
||||||
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[args.target_lang]
|
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[args.target_lang]
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ if __name__ == "__main__":
|
|||||||
cluster.run(["pip install torch --upgrade --extra-index-url https://download.pytorch.org/whl/cu117"])
|
cluster.run(["pip install torch --upgrade --extra-index-url https://download.pytorch.org/whl/cu117"])
|
||||||
|
|
||||||
# Run example. You can bypass the CLI wrapper and paste your own code here.
|
# Run example. You can bypass the CLI wrapper and paste your own code here.
|
||||||
cluster.run([f'python transformers/examples/{args.example} {" ".join(shlex.quote(arg) for arg in unknown)}'])
|
cluster.run([f"python transformers/examples/{args.example} {' '.join(shlex.quote(arg) for arg in unknown)}"])
|
||||||
|
|
||||||
# Alternatively, we can just import and run a training function (especially if there's no wrapper CLI):
|
# Alternatively, we can just import and run a training function (especially if there's no wrapper CLI):
|
||||||
# from my_script... import train
|
# from my_script... import train
|
||||||
|
|||||||
@@ -501,9 +501,9 @@ def main():
|
|||||||
|
|
||||||
# region Set decoder_start_token_id
|
# region Set decoder_start_token_id
|
||||||
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
|
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
|
||||||
assert (
|
assert data_args.target_lang is not None and data_args.source_lang is not None, (
|
||||||
data_args.target_lang is not None and data_args.source_lang is not None
|
"mBart requires --target_lang and --source_lang"
|
||||||
), "mBart requires --target_lang and --source_lang"
|
)
|
||||||
if isinstance(tokenizer, MBartTokenizer):
|
if isinstance(tokenizer, MBartTokenizer):
|
||||||
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang]
|
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang]
|
||||||
else:
|
else:
|
||||||
|
|||||||
Reference in New Issue
Block a user