Use Python 3.9 syntax in examples (#37279)

Signed-off-by: cyy <cyyever@outlook.com>
This commit is contained in:
cyyever
2025-04-07 19:52:21 +08:00
committed by GitHub
parent 08f36771b3
commit 0fb8d49e88
123 changed files with 358 additions and 451 deletions

View File

@@ -15,7 +15,7 @@
import csv
from collections import defaultdict
from dataclasses import dataclass, field
from typing import List, Optional
from typing import Optional
import matplotlib.pyplot as plt
import numpy as np
@@ -59,7 +59,7 @@ class PlotArguments:
default=None,
metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
)
short_model_names: Optional[List[str]] = list_field(
short_model_names: Optional[list[str]] = list_field(
default=None, metadata={"help": "List of model names that are used instead of the ones in the csv file."}
)

View File

@@ -1,5 +1,4 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#

View File

@@ -1,4 +1,3 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
@@ -18,7 +17,7 @@
import logging
import os
from dataclasses import dataclass, field
from typing import Dict, Optional
from typing import Optional
import numpy as np
from utils_multiple_choice import MultipleChoiceDataset, Split, processors
@@ -187,7 +186,7 @@ def main():
else None
)
def compute_metrics(p: EvalPrediction) -> Dict:
def compute_metrics(p: EvalPrediction) -> dict:
preds = np.argmax(p.predictions, axis=1)
return {"acc": simple_accuracy(preds, p.label_ids)}
@@ -228,7 +227,7 @@ def main():
logger.info("***** Eval results *****")
for key, value in result.items():
logger.info(" %s = %s", key, value)
writer.write("%s = %s\n" % (key, value))
writer.write("{} = {}\n".format(key, value))
results.update(result)

View File

@@ -1,4 +1,3 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
@@ -22,7 +21,7 @@ import logging
import os
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional
from typing import Optional
import tqdm
from filelock import FileLock
@@ -49,8 +48,8 @@ class InputExample:
example_id: str
question: str
contexts: List[str]
endings: List[str]
contexts: list[str]
endings: list[str]
label: Optional[str]
@@ -62,9 +61,9 @@ class InputFeatures:
"""
example_id: str
input_ids: List[List[int]]
attention_mask: Optional[List[List[int]]]
token_type_ids: Optional[List[List[int]]]
input_ids: list[list[int]]
attention_mask: Optional[list[list[int]]]
token_type_ids: Optional[list[list[int]]]
label: Optional[int]
@@ -84,7 +83,7 @@ if is_torch_available():
soon.
"""
features: List[InputFeatures]
features: list[InputFeatures]
def __init__(
self,
@@ -149,7 +148,7 @@ if is_tf_available():
soon.
"""
features: List[InputFeatures]
features: list[InputFeatures]
def __init__(
self,
@@ -253,7 +252,7 @@ class RaceProcessor(DataProcessor):
def get_train_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} train".format(data_dir))
logger.info(f"LOOKING AT {data_dir} train")
high = os.path.join(data_dir, "train/high")
middle = os.path.join(data_dir, "train/middle")
high = self._read_txt(high)
@@ -262,7 +261,7 @@ class RaceProcessor(DataProcessor):
def get_dev_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} dev".format(data_dir))
logger.info(f"LOOKING AT {data_dir} dev")
high = os.path.join(data_dir, "dev/high")
middle = os.path.join(data_dir, "dev/middle")
high = self._read_txt(high)
@@ -271,7 +270,7 @@ class RaceProcessor(DataProcessor):
def get_test_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} test".format(data_dir))
logger.info(f"LOOKING AT {data_dir} test")
high = os.path.join(data_dir, "test/high")
middle = os.path.join(data_dir, "test/middle")
high = self._read_txt(high)
@@ -286,7 +285,7 @@ class RaceProcessor(DataProcessor):
lines = []
files = glob.glob(input_dir + "/*txt")
for file in tqdm.tqdm(files, desc="read files"):
with open(file, "r", encoding="utf-8") as fin:
with open(file, encoding="utf-8") as fin:
data_raw = json.load(fin)
data_raw["race_id"] = file
lines.append(data_raw)
@@ -296,7 +295,7 @@ class RaceProcessor(DataProcessor):
"""Creates examples for the training and dev sets."""
examples = []
for _, data_raw in enumerate(lines):
race_id = "%s-%s" % (set_type, data_raw["race_id"])
race_id = "{}-{}".format(set_type, data_raw["race_id"])
article = data_raw["article"]
for i in range(len(data_raw["answers"])):
truth = str(ord(data_raw["answers"][i]) - ord("A"))
@@ -320,17 +319,17 @@ class SynonymProcessor(DataProcessor):
def get_train_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} train".format(data_dir))
logger.info(f"LOOKING AT {data_dir} train")
return self._create_examples(self._read_csv(os.path.join(data_dir, "mctrain.csv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} dev".format(data_dir))
logger.info(f"LOOKING AT {data_dir} dev")
return self._create_examples(self._read_csv(os.path.join(data_dir, "mchp.csv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} dev".format(data_dir))
logger.info(f"LOOKING AT {data_dir} dev")
return self._create_examples(self._read_csv(os.path.join(data_dir, "mctest.csv")), "test")
@@ -339,10 +338,10 @@ class SynonymProcessor(DataProcessor):
return ["0", "1", "2", "3", "4"]
def _read_csv(self, input_file):
with open(input_file, "r", encoding="utf-8") as f:
with open(input_file, encoding="utf-8") as f:
return list(csv.reader(f))
def _create_examples(self, lines: List[List[str]], type: str):
def _create_examples(self, lines: list[list[str]], type: str):
"""Creates examples for the training and dev sets."""
examples = [
@@ -366,17 +365,17 @@ class SwagProcessor(DataProcessor):
def get_train_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} train".format(data_dir))
logger.info(f"LOOKING AT {data_dir} train")
return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} dev".format(data_dir))
logger.info(f"LOOKING AT {data_dir} dev")
return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} dev".format(data_dir))
logger.info(f"LOOKING AT {data_dir} dev")
raise ValueError(
"For swag testing, the input file does not contain a label column. It can not be tested in current code "
"setting!"
@@ -388,10 +387,10 @@ class SwagProcessor(DataProcessor):
return ["0", "1", "2", "3"]
def _read_csv(self, input_file):
with open(input_file, "r", encoding="utf-8") as f:
with open(input_file, encoding="utf-8") as f:
return list(csv.reader(f))
def _create_examples(self, lines: List[List[str]], type: str):
def _create_examples(self, lines: list[list[str]], type: str):
"""Creates examples for the training and dev sets."""
if type == "train" and lines[0][-1] != "label":
raise ValueError("For training, the input file must contain a label column.")
@@ -417,16 +416,16 @@ class ArcProcessor(DataProcessor):
def get_train_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} train".format(data_dir))
logger.info(f"LOOKING AT {data_dir} train")
return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} dev".format(data_dir))
logger.info(f"LOOKING AT {data_dir} dev")
return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")
def get_test_examples(self, data_dir):
logger.info("LOOKING AT {} test".format(data_dir))
logger.info(f"LOOKING AT {data_dir} test")
return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")
def get_labels(self):
@@ -434,7 +433,7 @@ class ArcProcessor(DataProcessor):
return ["0", "1", "2", "3"]
def _read_json(self, input_file):
with open(input_file, "r", encoding="utf-8") as fin:
with open(input_file, encoding="utf-8") as fin:
lines = fin.readlines()
return lines
@@ -504,11 +503,11 @@ class ArcProcessor(DataProcessor):
def convert_examples_to_features(
examples: List[InputExample],
label_list: List[str],
examples: list[InputExample],
label_list: list[str],
max_length: int,
tokenizer: PreTrainedTokenizer,
) -> List[InputFeatures]:
) -> list[InputFeatures]:
"""
Loads a data file into a list of `InputFeatures`
"""

View File

@@ -2,7 +2,7 @@ import argparse
import logging
import os
from pathlib import Path
from typing import Any, Dict
from typing import Any
import pytorch_lightning as pl
from pytorch_lightning.utilities import rank_zero_info
@@ -201,7 +201,7 @@ class BaseTransformer(pl.LightningModule):
)
@pl.utilities.rank_zero_only
def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
def on_save_checkpoint(self, checkpoint: dict[str, Any]) -> None:
save_path = self.output_dir.joinpath("best_tfmr")
self.model.config.save_step = self.step_count
self.model.save_pretrained(save_path)
@@ -282,7 +282,7 @@ class LoggingCallback(pl.Callback):
# Log results
for key in sorted(metrics):
if key not in ["log", "progress_bar"]:
rank_zero_info("{} = {}\n".format(key, str(metrics[key])))
rank_zero_info(f"{key} = {str(metrics[key])}\n")
def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
rank_zero_info("***** Test results *****")
@@ -292,8 +292,8 @@ class LoggingCallback(pl.Callback):
with open(output_test_results_file, "w") as writer:
for key in sorted(metrics):
if key not in ["log", "progress_bar"]:
rank_zero_info("{} = {}\n".format(key, str(metrics[key])))
writer.write("{} = {}\n".format(key, str(metrics[key])))
rank_zero_info(f"{key} = {str(metrics[key])}\n")
writer.write(f"{key} = {str(metrics[key])}\n")
def add_generic_args(parser, root_dir) -> None:

View File

@@ -1,4 +1,3 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
@@ -231,14 +230,14 @@ def train(args, train_dataset, model, tokenizer):
if args.local_rank == -1 and args.evaluate_during_training:
results = evaluate(args, model, tokenizer)
for key, value in results.items():
tb_writer.add_scalar("eval_{}".format(key), value, global_step)
tb_writer.add_scalar(f"eval_{key}", value, global_step)
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
logging_loss = tr_loss
# Save model checkpoint
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}")
# Take care of distributed/parallel training
model_to_save = model.module if hasattr(model, "module") else model
model_to_save.save_pretrained(output_dir)
@@ -281,7 +280,7 @@ def evaluate(args, model, tokenizer, prefix=""):
model = torch.nn.DataParallel(model)
# Eval!
logger.info("***** Running evaluation {} *****".format(prefix))
logger.info(f"***** Running evaluation {prefix} *****")
logger.info(" Num examples = %d", len(dataset))
logger.info(" Batch size = %d", args.eval_batch_size)
@@ -348,11 +347,11 @@ def evaluate(args, model, tokenizer, prefix=""):
logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))
# Compute predictions
output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
output_prediction_file = os.path.join(args.output_dir, f"predictions_{prefix}.json")
output_nbest_file = os.path.join(args.output_dir, f"nbest_predictions_{prefix}.json")
if args.version_2_with_negative:
output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
output_null_log_odds_file = os.path.join(args.output_dir, f"null_odds_{prefix}.json")
else:
output_null_log_odds_file = None
@@ -828,10 +827,10 @@ def main():
# Evaluate
result = evaluate(args, model, tokenizer, prefix=global_step)
result = {k + ("_{}".format(global_step) if global_step else ""): v for k, v in result.items()}
result = {k + (f"_{global_step}" if global_step else ""): v for k, v in result.items()}
results.update(result)
logger.info("Results: {}".format(results))
logger.info(f"Results: {results}")
return results

View File

@@ -1,4 +1,3 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#

View File

@@ -20,10 +20,10 @@ def fill_mask(masked_input, model, tokenizer, topk=5):
topk_filled_outputs = []
for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")):
predicted_token = predicted_token_bpe.replace("\u2581", " ")
if " {0}".format(masked_token) in masked_input:
if f" {masked_token}" in masked_input:
topk_filled_outputs.append(
(
masked_input.replace(" {0}".format(masked_token), predicted_token),
masked_input.replace(f" {masked_token}", predicted_token),
values[index].item(),
predicted_token,
)

View File

@@ -1,7 +1,6 @@
#!/usr/bin/env python
import argparse
import json
from typing import List
from ltp import LTP
@@ -42,7 +41,7 @@ def is_chinese(word: str):
return 1
def get_chinese_word(tokens: List[str]):
def get_chinese_word(tokens: list[str]):
word_set = set()
for token in tokens:
@@ -53,7 +52,7 @@ def get_chinese_word(tokens: List[str]):
return word_list
def add_sub_symbol(bert_tokens: List[str], chinese_word_set: set()):
def add_sub_symbol(bert_tokens: list[str], chinese_word_set: set()):
if not chinese_word_set:
return bert_tokens
max_word_len = max([len(w) for w in chinese_word_set])
@@ -77,7 +76,7 @@ def add_sub_symbol(bert_tokens: List[str], chinese_word_set: set()):
return bert_word
def prepare_ref(lines: List[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokenizer):
def prepare_ref(lines: list[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokenizer):
ltp_res = []
for i in range(0, len(lines), 100):
@@ -117,7 +116,7 @@ def prepare_ref(lines: List[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokeni
def main(args):
# For Chinese (Ro)Bert, the best result is from : RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm)
# If we want to fine-tune these model, we have to use same tokenizer : LTP (https://github.com/HIT-SCIR/ltp)
with open(args.file_name, "r", encoding="utf-8") as f:
with open(args.file_name, encoding="utf-8") as f:
data = f.readlines()
data = [line.strip() for line in data if len(line) > 0 and not line.isspace()] # avoid delimiter like '\u2029'
ltp_tokenizer = LTP(args.ltp) # faster in GPU device

View File

@@ -1,5 +1,4 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
@@ -358,7 +357,7 @@ def main():
logger.info("***** Eval results *****")
for key in sorted(result.keys()):
logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
writer.write("{} = {}\n".format(key, str(result[key])))
results.update(result)

View File

@@ -1,5 +1,4 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
@@ -163,7 +162,7 @@ def main():
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
logger.info("device: {}, n_gpu {}".format(device, n_gpu))
logger.info(f"device: {device}, n_gpu {n_gpu}")
if not args.do_train and not args.do_eval:
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
@@ -261,7 +260,7 @@ def main():
loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
)
nb_tr_steps += 1
tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])
tqdm_bar.desc = f"Training loss: {exp_average_loss:.2e} lr: {scheduler.get_lr()[0]:.2e}"
# Save a trained model
if args.do_train:
@@ -313,7 +312,7 @@ def main():
logger.info("***** Eval results *****")
for key in sorted(result.keys()):
logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
writer.write("{} = {}\n".format(key, str(result[key])))
if __name__ == "__main__":

View File

@@ -1,5 +1,4 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
@@ -51,7 +50,7 @@ except ImportError:
logger = logging.getLogger(__name__)
class SwagExample(object):
class SwagExample:
"""A single training/test example for the SWAG dataset."""
def __init__(self, swag_id, context_sentence, start_ending, ending_0, ending_1, ending_2, ending_3, label=None):
@@ -71,22 +70,22 @@ class SwagExample(object):
def __repr__(self):
attributes = [
"swag_id: {}".format(self.swag_id),
"context_sentence: {}".format(self.context_sentence),
"start_ending: {}".format(self.start_ending),
"ending_0: {}".format(self.endings[0]),
"ending_1: {}".format(self.endings[1]),
"ending_2: {}".format(self.endings[2]),
"ending_3: {}".format(self.endings[3]),
f"swag_id: {self.swag_id}",
f"context_sentence: {self.context_sentence}",
f"start_ending: {self.start_ending}",
f"ending_0: {self.endings[0]}",
f"ending_1: {self.endings[1]}",
f"ending_2: {self.endings[2]}",
f"ending_3: {self.endings[3]}",
]
if self.label is not None:
attributes.append("label: {}".format(self.label))
attributes.append(f"label: {self.label}")
return ", ".join(attributes)
class InputFeatures(object):
class InputFeatures:
def __init__(self, example_id, choices_features, label):
self.example_id = example_id
self.choices_features = [
@@ -97,7 +96,7 @@ class InputFeatures(object):
def read_swag_examples(input_file, is_training=True):
with open(input_file, "r", encoding="utf-8") as f:
with open(input_file, encoding="utf-8") as f:
lines = list(csv.reader(f))
if is_training and lines[0][-1] != "label":
@@ -179,15 +178,15 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, is_trainin
label = example.label
if example_index < 5:
logger.info("*** Example ***")
logger.info("swag_id: {}".format(example.swag_id))
logger.info(f"swag_id: {example.swag_id}")
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
logger.info("choice: {}".format(choice_idx))
logger.info(f"choice: {choice_idx}")
logger.info("tokens: {}".format(" ".join(tokens)))
logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
logger.info("input_mask: {}".format(" ".join(map(str, input_mask))))
logger.info("segment_ids: {}".format(" ".join(map(str, segment_ids))))
if is_training:
logger.info("label: {}".format(label))
logger.info(f"label: {label}")
features.append(InputFeatures(example_id=example.swag_id, choices_features=choices_features, label=label))
@@ -382,14 +381,14 @@ def train(args, train_dataset, model, tokenizer):
): # Only evaluate when single GPU otherwise metrics may not average well
results = evaluate(args, model, tokenizer)
for key, value in results.items():
tb_writer.add_scalar("eval_{}".format(key), value, global_step)
tb_writer.add_scalar(f"eval_{key}", value, global_step)
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
logging_loss = tr_loss
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
# Save model checkpoint
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}")
model_to_save = (
model.module if hasattr(model, "module") else model
) # Take care of distributed/parallel training
@@ -423,7 +422,7 @@ def evaluate(args, model, tokenizer, prefix=""):
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
# Eval!
logger.info("***** Running evaluation {} *****".format(prefix))
logger.info(f"***** Running evaluation {prefix} *****")
logger.info(" Num examples = %d", len(dataset))
logger.info(" Batch size = %d", args.eval_batch_size)
@@ -466,7 +465,7 @@ def evaluate(args, model, tokenizer, prefix=""):
logger.info("***** Eval results *****")
for key in sorted(result.keys()):
logger.info("%s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
writer.write("{} = {}\n".format(key, str(result[key])))
return result
@@ -710,10 +709,10 @@ def main():
# Evaluate
result = evaluate(args, model, tokenizer, prefix=global_step)
result = {k + ("_{}".format(global_step) if global_step else ""): v for k, v in result.items()}
result = {k + (f"_{global_step}" if global_step else ""): v for k, v in result.items()}
results.update(result)
logger.info("Results: {}".format(results))
logger.info(f"Results: {results}")
return results

View File

@@ -1,5 +1,4 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
@@ -66,7 +65,7 @@ def main():
ptvsd.wait_for_attach()
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
logger.info("device: {}".format(device))
logger.info(f"device: {device}")
# Load a pre-processed dataset
# You can also build the corpus yourself using TransfoXLCorpus methods
@@ -111,7 +110,7 @@ def main():
total_loss += seq_len * loss.item()
total_len += seq_len
total_time = time.time() - start_time
logger.info("Time : {:.2f}s, {:.2f}ms/segment".format(total_time, 1000 * total_time / (idx + 1)))
logger.info(f"Time : {total_time:.2f}s, {1000 * total_time / (idx + 1):.2f}ms/segment")
return total_loss / total_len
# Run on test data.

View File

@@ -1,4 +1,3 @@
# coding=utf-8
# Copyright 2020 Huggingface
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import json
import unittest
@@ -25,7 +23,7 @@ from utils import calculate_bleu
filename = get_tests_dir() + "/test_data/fsmt/fsmt_val_data.json"
with io.open(filename, "r", encoding="utf-8") as f:
with open(filename, encoding="utf-8") as f:
bleu_data = json.load(f)

View File

@@ -19,7 +19,6 @@ import time
from json import JSONDecodeError
from logging import getLogger
from pathlib import Path
from typing import Dict, List
import torch
from torch.utils.data import DataLoader
@@ -55,10 +54,10 @@ def eval_data_dir(
task="summarization",
local_rank=None,
num_return_sequences=1,
dataset_kwargs: Dict = None,
dataset_kwargs: dict = None,
prefix="",
**generate_kwargs,
) -> Dict:
) -> dict:
"""Run evaluation on part of the data for one gpu and save to {save_dir}/rank_{rank}_output.json"""
model_name = str(model_name)
assert local_rank is not None
@@ -211,7 +210,7 @@ def run_generate():
calc_bleu = "translation" in args.task
score_fn = calculate_bleu if calc_bleu else calculate_rouge
metric_name = "bleu" if calc_bleu else "rouge"
metrics: Dict = score_fn(preds, labels)
metrics: dict = score_fn(preds, labels)
metrics["n_obs"] = len(preds)
runtime = time.time() - start_time
metrics["seconds_per_sample"] = round(runtime / metrics["n_obs"], 4)
@@ -227,7 +226,7 @@ def run_generate():
shutil.rmtree(json_save_dir)
def combine_partial_results(partial_results) -> List:
def combine_partial_results(partial_results) -> list:
"""Concatenate partial results into one file, then sort it by id."""
records = []
for partial_result in partial_results:
@@ -237,7 +236,7 @@ def combine_partial_results(partial_results) -> List:
return preds
def gather_results_from_each_node(num_replicas, save_dir, timeout) -> List[Dict[str, List]]:
def gather_results_from_each_node(num_replicas, save_dir, timeout) -> list[dict[str, list]]:
# WAIT FOR lots of .json files
start_wait = time.time()
logger.info("waiting for all nodes to finish")

View File

@@ -20,7 +20,6 @@ import time
import warnings
from logging import getLogger
from pathlib import Path
from typing import Dict, List
import torch
from tqdm import tqdm
@@ -36,7 +35,7 @@ DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def generate_summaries_or_translations(
examples: List[str],
examples: list[str],
out_file: str,
model_name: str,
batch_size: int = 8,
@@ -45,7 +44,7 @@ def generate_summaries_or_translations(
task="summarization",
prefix=None,
**generate_kwargs,
) -> Dict:
) -> dict:
"""Save model.generate results to <out_file>, and return how long it took."""
fout = Path(out_file).open("w", encoding="utf-8")
model_name = str(model_name)

View File

@@ -34,7 +34,7 @@ task_score_names = {
def parse_search_arg(search):
groups = search.split()
entries = dict((g.split("=") for g in groups))
entries = dict(g.split("=") for g in groups)
entry_names = list(entries.keys())
sets = [[f"--{k} {v}" for v in vs.split(":")] for k, vs in entries.items()]
matrix = [list(x) for x in itertools.product(*sets)]
@@ -105,7 +105,7 @@ def run_search():
col_widths = {col: len(str(col)) for col in col_names}
results = []
for r in matrix:
hparams = dict((x.replace("--", "").split() for x in r))
hparams = dict(x.replace("--", "").split() for x in r)
args_exp = " ".join(r).split()
args_exp.extend(["--bs", str(args.bs)]) # in case we need to reduce its size due to CUDA OOM
sys.argv = args_normal + args_exp

View File

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any, Optional, Union
import torch
from torch import nn
@@ -172,10 +172,10 @@ class Seq2SeqTrainer(Trainer):
def prediction_step(
self,
model: nn.Module,
inputs: Dict[str, Union[torch.Tensor, Any]],
inputs: dict[str, Union[torch.Tensor, Any]],
prediction_loss_only: bool,
ignore_keys: Optional[List[str]] = None,
) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
ignore_keys: Optional[list[str]] = None,
) -> tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
"""
Perform an evaluation step on :obj:`model` using obj:`inputs`.

View File

@@ -1,6 +1,5 @@
#!/usr/bin/env python
import io
import json
import subprocess
@@ -29,5 +28,5 @@ def get_all_data(pairs, n_objs):
text = get_all_data(pairs, n_objs)
filename = "./fsmt_val_data.json"
with io.open(filename, "w", encoding="utf-8") as f:
with open(filename, "w", encoding="utf-8") as f:
bleu_data = json.dump(text, f, indent=2, ensure_ascii=False)

View File

@@ -19,9 +19,10 @@ import math
import os
import pickle
import socket
from collections.abc import Iterable
from logging import getLogger
from pathlib import Path
from typing import Callable, Dict, Iterable, List, Tuple, Union
from typing import Callable, Union
import git
import numpy as np
@@ -67,7 +68,7 @@ def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=-100):
return loss, nll_loss
def lmap(f: Callable, x: Iterable) -> List:
def lmap(f: Callable, x: Iterable) -> list:
"""list(map(f, x))"""
return list(map(f, x))
@@ -77,11 +78,11 @@ def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict:
return {"bleu": round(corpus_bleu(output_lns, [refs_lns], **kwargs).score, 4)}
def build_compute_metrics_fn(task_name: str, tokenizer: PreTrainedTokenizer) -> Callable[[EvalPrediction], Dict]:
def build_compute_metrics_fn(task_name: str, tokenizer: PreTrainedTokenizer) -> Callable[[EvalPrediction], dict]:
def non_pad_len(tokens: np.ndarray) -> int:
return np.count_nonzero(tokens != tokenizer.pad_token_id)
def decode_pred(pred: EvalPrediction) -> Tuple[List[str], List[str]]:
def decode_pred(pred: EvalPrediction) -> tuple[list[str], list[str]]:
pred_ids = pred.predictions
label_ids = pred.label_ids
pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
@@ -91,16 +92,16 @@ def build_compute_metrics_fn(task_name: str, tokenizer: PreTrainedTokenizer) ->
label_str = lmap(str.strip, label_str)
return pred_str, label_str
def summarization_metrics(pred: EvalPrediction) -> Dict:
def summarization_metrics(pred: EvalPrediction) -> dict:
pred_str, label_str = decode_pred(pred)
rouge: Dict = calculate_rouge(pred_str, label_str)
rouge: dict = calculate_rouge(pred_str, label_str)
summ_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1)
rouge.update({"gen_len": summ_len})
return rouge
def translation_metrics(pred: EvalPrediction) -> Dict:
def translation_metrics(pred: EvalPrediction) -> dict:
pred_str, label_str = decode_pred(pred)
bleu: Dict = calculate_bleu(pred_str, label_str)
bleu: dict = calculate_bleu(pred_str, label_str)
gen_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1)
bleu.update({"gen_len": gen_len})
return bleu
@@ -183,7 +184,7 @@ class AbstractSeq2SeqDataset(Dataset):
return min(self.src_lens[i], self.max_target_length)
# call fairseq cython function
batch_sampler: List[List[int]] = batch_by_size(
batch_sampler: list[list[int]] = batch_by_size(
sorted_indices,
num_tokens_fn=num_tokens_in_example,
max_tokens=max_tokens_per_batch,
@@ -207,7 +208,7 @@ class AbstractSeq2SeqDataset(Dataset):
class LegacySeq2SeqDataset(AbstractSeq2SeqDataset):
def __getitem__(self, index) -> Dict[str, torch.Tensor]:
def __getitem__(self, index) -> dict[str, torch.Tensor]:
"""Call tokenizer on src and tgt_lines"""
index = index + 1 # linecache starts at 1
source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
@@ -237,7 +238,7 @@ class LegacySeq2SeqDataset(AbstractSeq2SeqDataset):
**self.dataset_kwargs,
)
def collate_fn(self, batch) -> Dict[str, torch.Tensor]:
def collate_fn(self, batch) -> dict[str, torch.Tensor]:
input_ids = torch.stack([x["input_ids"] for x in batch])
masks = torch.stack([x["attention_mask"] for x in batch])
target_ids = torch.stack([x["labels"] for x in batch])
@@ -255,7 +256,7 @@ class LegacySeq2SeqDataset(AbstractSeq2SeqDataset):
class Seq2SeqDataset(AbstractSeq2SeqDataset):
"""A dataset that calls prepare_seq2seq_batch."""
def __getitem__(self, index) -> Dict[str, str]:
def __getitem__(self, index) -> dict[str, str]:
index = index + 1 # linecache starts at 1
source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n")
@@ -263,9 +264,9 @@ class Seq2SeqDataset(AbstractSeq2SeqDataset):
assert tgt_line, f"empty tgt line for index {index}"
return {"tgt_texts": tgt_line, "src_texts": source_line, "id": index - 1}
def collate_fn(self, batch) -> Dict[str, torch.Tensor]:
def collate_fn(self, batch) -> dict[str, torch.Tensor]:
"""Call prepare_seq2seq_batch."""
batch_encoding: Dict[str, torch.Tensor] = self.tokenizer.prepare_seq2seq_batch(
batch_encoding: dict[str, torch.Tensor] = self.tokenizer.prepare_seq2seq_batch(
[x["src_texts"] for x in batch],
tgt_texts=[x["tgt_texts"] for x in batch],
max_length=self.max_source_length,
@@ -293,7 +294,7 @@ class Seq2SeqDataCollator:
if data_args.tgt_lang is not None:
self.dataset_kwargs["tgt_lang"] = data_args.tgt_lang
def __call__(self, batch) -> Dict[str, torch.Tensor]:
def __call__(self, batch) -> dict[str, torch.Tensor]:
if hasattr(self.tokenizer, "prepare_seq2seq_batch"):
batch = self._encode(batch)
input_ids, attention_mask, labels = (
@@ -329,7 +330,7 @@ class Seq2SeqDataCollator:
shifted_input_ids[..., 0] = self.pad_token_id
return shifted_input_ids
def _encode(self, batch) -> Dict[str, torch.Tensor]:
def _encode(self, batch) -> dict[str, torch.Tensor]:
batch_encoding = self.tokenizer.prepare_seq2seq_batch(
[x["src_texts"] for x in batch],
tgt_texts=[x["tgt_texts"] for x in batch],
@@ -355,7 +356,7 @@ class SortishSampler(Sampler):
return iter(sortish_sampler_indices(self.data, self.bs, shuffle=self.shuffle))
def sortish_sampler_indices(data: List, bs: int, shuffle=True) -> np.array:
def sortish_sampler_indices(data: list, bs: int, shuffle=True) -> np.array:
"Go through the text data by order of src length with a bit of randomness. From fastai repo."
if not shuffle:
return np.argsort(np.array(data) * -1)
@@ -455,7 +456,7 @@ def pickle_save(obj, path):
return pickle.dump(obj, f)
def flatten_list(summary_ids: List[List]):
def flatten_list(summary_ids: list[list]):
return list(itertools.chain.from_iterable(summary_ids))
@@ -506,14 +507,14 @@ def extract_rouge_mid_statistics(dct):
def calculate_rouge(
pred_lns: List[str],
tgt_lns: List[str],
pred_lns: list[str],
tgt_lns: list[str],
use_stemmer=True,
rouge_keys=ROUGE_KEYS,
return_precision_and_recall=False,
bootstrap_aggregation=True,
newline_sep=True,
) -> Dict:
) -> dict:
"""Calculate rouge using rouge_scorer package.
Args:
@@ -590,19 +591,19 @@ def any_requires_grad(model: nn.Module) -> bool:
def assert_all_frozen(model):
model_grads: List[bool] = list(grad_status(model))
model_grads: list[bool] = list(grad_status(model))
n_require_grad = sum(lmap(int, model_grads))
npars = len(model_grads)
assert not any(model_grads), f"{n_require_grad / npars:.1%} of {npars} weights require grad"
def assert_not_all_frozen(model):
model_grads: List[bool] = list(grad_status(model))
model_grads: list[bool] = list(grad_status(model))
npars = len(model_grads)
assert any(model_grads), f"none of {npars} weights require grad"
def parse_numeric_n_bool_cl_kwargs(unparsed_args: List[str]) -> Dict[str, Union[int, float, bool]]:
def parse_numeric_n_bool_cl_kwargs(unparsed_args: list[str]) -> dict[str, Union[int, float, bool]]:
"""
Parse an argv list of unspecified command line args to a dict.
Assumes all values are either numeric or boolean in the form of true/false.

View File

@@ -1,4 +1,3 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
@@ -20,7 +19,7 @@ import os
import sys
from dataclasses import dataclass, field
from importlib import import_module
from typing import Dict, List, Optional, Tuple
from typing import Optional
import numpy as np
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
@@ -159,7 +158,7 @@ def main():
# Prepare CONLL-2003 task
labels = token_classification_task.get_labels(data_args.labels)
label_map: Dict[int, str] = dict(enumerate(labels))
label_map: dict[int, str] = dict(enumerate(labels))
num_labels = len(labels)
# Load pretrained model and tokenizer
@@ -217,7 +216,7 @@ def main():
else None
)
def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]:
def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> tuple[list[int], list[int]]:
preds = np.argmax(predictions, axis=2)
batch_size, seq_len = preds.shape
@@ -233,7 +232,7 @@ def main():
return preds_list, out_label_list
def compute_metrics(p: EvalPrediction) -> Dict:
def compute_metrics(p: EvalPrediction) -> dict:
preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
return {
"accuracy_score": accuracy_score(out_label_list, preds_list),
@@ -279,7 +278,7 @@ def main():
logger.info("***** Eval results *****")
for key, value in result.items():
logger.info(" %s = %s", key, value)
writer.write("%s = %s\n" % (key, value))
writer.write("{} = {}\n".format(key, value))
results.update(result)
@@ -304,13 +303,13 @@ def main():
with open(output_test_results_file, "w") as writer:
for key, value in metrics.items():
logger.info(" %s = %s", key, value)
writer.write("%s = %s\n" % (key, value))
writer.write("{} = {}\n".format(key, value))
# Save predictions
output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
if trainer.is_world_process_zero():
with open(output_test_predictions_file, "w") as writer:
with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
with open(os.path.join(data_args.data_dir, "test.txt")) as f:
token_classification_task.write_predictions_to_file(writer, f, preds_list)
return results

View File

@@ -12,7 +12,7 @@ subword_len_counter = 0
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
max_len -= tokenizer.num_special_tokens_to_add()
with open(dataset, "rt") as f_p:
with open(dataset) as f_p:
for line in f_p:
line = line.rstrip()

View File

@@ -1,6 +1,6 @@
import logging
import os
from typing import List, TextIO, Union
from typing import TextIO, Union
from conllu import parse_incr
from utils_ner import InputExample, Split, TokenClassificationTask
@@ -14,7 +14,7 @@ class NER(TokenClassificationTask):
# in NER datasets, the last column is usually reserved for NER label
self.label_idx = label_idx
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> list[InputExample]:
if isinstance(mode, Split):
mode = mode.value
file_path = os.path.join(data_dir, f"{mode}.txt")
@@ -42,7 +42,7 @@ class NER(TokenClassificationTask):
examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
return examples
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: list):
example_id = 0
for line in test_input_reader:
if line.startswith("-DOCSTART-") or line == "" or line == "\n":
@@ -55,9 +55,9 @@ class NER(TokenClassificationTask):
else:
logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
def get_labels(self, path: str) -> List[str]:
def get_labels(self, path: str) -> list[str]:
if path:
with open(path, "r") as f:
with open(path) as f:
labels = f.read().splitlines()
if "O" not in labels:
labels = ["O"] + labels
@@ -71,9 +71,9 @@ class Chunk(NER):
# in CONLL2003 dataset chunk column is second-to-last
super().__init__(label_idx=-2)
def get_labels(self, path: str) -> List[str]:
def get_labels(self, path: str) -> list[str]:
if path:
with open(path, "r") as f:
with open(path) as f:
labels = f.read().splitlines()
if "O" not in labels:
labels = ["O"] + labels
@@ -105,7 +105,7 @@ class Chunk(NER):
class POS(TokenClassificationTask):
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> list[InputExample]:
if isinstance(mode, Split):
mode = mode.value
file_path = os.path.join(data_dir, f"{mode}.txt")
@@ -125,7 +125,7 @@ class POS(TokenClassificationTask):
guid_index += 1
return examples
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: list):
example_id = 0
for sentence in parse_incr(test_input_reader):
s_p = preds_list[example_id]
@@ -136,9 +136,9 @@ class POS(TokenClassificationTask):
writer.write(out)
example_id += 1
def get_labels(self, path: str) -> List[str]:
def get_labels(self, path: str) -> list[str]:
if path:
with open(path, "r") as f:
with open(path) as f:
return f.read().splitlines()
else:
return [

View File

@@ -1,4 +1,3 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
@@ -19,7 +18,7 @@ import logging
import os
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional, Union
from typing import Optional, Union
from filelock import FileLock
@@ -42,8 +41,8 @@ class InputExample:
"""
guid: str
words: List[str]
labels: Optional[List[str]]
words: list[str]
labels: Optional[list[str]]
@dataclass
@@ -53,10 +52,10 @@ class InputFeatures:
Property names are the same names as the corresponding inputs to a model.
"""
input_ids: List[int]
attention_mask: List[int]
token_type_ids: Optional[List[int]] = None
label_ids: Optional[List[int]] = None
input_ids: list[int]
attention_mask: list[int]
token_type_ids: Optional[list[int]] = None
label_ids: Optional[list[int]] = None
class Split(Enum):
@@ -67,17 +66,17 @@ class Split(Enum):
class TokenClassificationTask:
@staticmethod
def read_examples_from_file(data_dir, mode: Union[Split, str]) -> List[InputExample]:
def read_examples_from_file(data_dir, mode: Union[Split, str]) -> list[InputExample]:
raise NotImplementedError
@staticmethod
def get_labels(path: str) -> List[str]:
def get_labels(path: str) -> list[str]:
raise NotImplementedError
@staticmethod
def convert_examples_to_features(
examples: List[InputExample],
label_list: List[str],
examples: list[InputExample],
label_list: list[str],
max_seq_length: int,
tokenizer: PreTrainedTokenizer,
cls_token_at_end=False,
@@ -91,7 +90,7 @@ class TokenClassificationTask:
pad_token_label_id=-100,
sequence_a_segment_id=0,
mask_padding_with_zero=True,
) -> List[InputFeatures]:
) -> list[InputFeatures]:
"""Loads a data file into a list of `InputFeatures`
`cls_token_at_end` define the location of the CLS token:
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
@@ -214,7 +213,7 @@ if is_torch_available():
soon.
"""
features: List[InputFeatures]
features: list[InputFeatures]
pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index
# Use cross entropy ignore_index as padding label id so that only
# real label ids contribute to the loss later.
@@ -224,7 +223,7 @@ if is_torch_available():
token_classification_task: TokenClassificationTask,
data_dir: str,
tokenizer: PreTrainedTokenizer,
labels: List[str],
labels: list[str],
model_type: str,
max_seq_length: Optional[int] = None,
overwrite_cache=False,
@@ -233,7 +232,7 @@ if is_torch_available():
# Load data features from cache or dataset file
cached_features_file = os.path.join(
data_dir,
"cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
f"cached_{mode.value}_{tokenizer.__class__.__name__}_{str(max_seq_length)}",
)
# Make sure only the first process in distributed training processes the dataset,
@@ -283,7 +282,7 @@ if is_tf_available():
soon.
"""
features: List[InputFeatures]
features: list[InputFeatures]
pad_token_label_id: int = -100
# Use cross entropy ignore_index as padding label id so that only
# real label ids contribute to the loss later.
@@ -293,7 +292,7 @@ if is_tf_available():
token_classification_task: TokenClassificationTask,
data_dir: str,
tokenizer: PreTrainedTokenizer,
labels: List[str],
labels: list[str],
model_type: str,
max_seq_length: Optional[int] = None,
overwrite_cache=False,