Use Python 3.9 syntax in examples (#37279)
Signed-off-by: cyy <cyyever@outlook.com>
This commit is contained in:
@@ -15,7 +15,7 @@
|
||||
import csv
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional
|
||||
from typing import Optional
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
@@ -59,7 +59,7 @@ class PlotArguments:
|
||||
default=None,
|
||||
metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
|
||||
)
|
||||
short_model_names: Optional[List[str]] = list_field(
|
||||
short_model_names: Optional[list[str]] = list_field(
|
||||
default=None, metadata={"help": "List of model names that are used instead of the ones in the csv file."}
|
||||
)
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
@@ -18,7 +17,7 @@
|
||||
import logging
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, Optional
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
from utils_multiple_choice import MultipleChoiceDataset, Split, processors
|
||||
@@ -187,7 +186,7 @@ def main():
|
||||
else None
|
||||
)
|
||||
|
||||
def compute_metrics(p: EvalPrediction) -> Dict:
|
||||
def compute_metrics(p: EvalPrediction) -> dict:
|
||||
preds = np.argmax(p.predictions, axis=1)
|
||||
return {"acc": simple_accuracy(preds, p.label_ids)}
|
||||
|
||||
@@ -228,7 +227,7 @@ def main():
|
||||
logger.info("***** Eval results *****")
|
||||
for key, value in result.items():
|
||||
logger.info(" %s = %s", key, value)
|
||||
writer.write("%s = %s\n" % (key, value))
|
||||
writer.write("{} = {}\n".format(key, value))
|
||||
|
||||
results.update(result)
|
||||
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
@@ -22,7 +21,7 @@ import logging
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import List, Optional
|
||||
from typing import Optional
|
||||
|
||||
import tqdm
|
||||
from filelock import FileLock
|
||||
@@ -49,8 +48,8 @@ class InputExample:
|
||||
|
||||
example_id: str
|
||||
question: str
|
||||
contexts: List[str]
|
||||
endings: List[str]
|
||||
contexts: list[str]
|
||||
endings: list[str]
|
||||
label: Optional[str]
|
||||
|
||||
|
||||
@@ -62,9 +61,9 @@ class InputFeatures:
|
||||
"""
|
||||
|
||||
example_id: str
|
||||
input_ids: List[List[int]]
|
||||
attention_mask: Optional[List[List[int]]]
|
||||
token_type_ids: Optional[List[List[int]]]
|
||||
input_ids: list[list[int]]
|
||||
attention_mask: Optional[list[list[int]]]
|
||||
token_type_ids: Optional[list[list[int]]]
|
||||
label: Optional[int]
|
||||
|
||||
|
||||
@@ -84,7 +83,7 @@ if is_torch_available():
|
||||
soon.
|
||||
"""
|
||||
|
||||
features: List[InputFeatures]
|
||||
features: list[InputFeatures]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -149,7 +148,7 @@ if is_tf_available():
|
||||
soon.
|
||||
"""
|
||||
|
||||
features: List[InputFeatures]
|
||||
features: list[InputFeatures]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -253,7 +252,7 @@ class RaceProcessor(DataProcessor):
|
||||
|
||||
def get_train_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info("LOOKING AT {} train".format(data_dir))
|
||||
logger.info(f"LOOKING AT {data_dir} train")
|
||||
high = os.path.join(data_dir, "train/high")
|
||||
middle = os.path.join(data_dir, "train/middle")
|
||||
high = self._read_txt(high)
|
||||
@@ -262,7 +261,7 @@ class RaceProcessor(DataProcessor):
|
||||
|
||||
def get_dev_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info("LOOKING AT {} dev".format(data_dir))
|
||||
logger.info(f"LOOKING AT {data_dir} dev")
|
||||
high = os.path.join(data_dir, "dev/high")
|
||||
middle = os.path.join(data_dir, "dev/middle")
|
||||
high = self._read_txt(high)
|
||||
@@ -271,7 +270,7 @@ class RaceProcessor(DataProcessor):
|
||||
|
||||
def get_test_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info("LOOKING AT {} test".format(data_dir))
|
||||
logger.info(f"LOOKING AT {data_dir} test")
|
||||
high = os.path.join(data_dir, "test/high")
|
||||
middle = os.path.join(data_dir, "test/middle")
|
||||
high = self._read_txt(high)
|
||||
@@ -286,7 +285,7 @@ class RaceProcessor(DataProcessor):
|
||||
lines = []
|
||||
files = glob.glob(input_dir + "/*txt")
|
||||
for file in tqdm.tqdm(files, desc="read files"):
|
||||
with open(file, "r", encoding="utf-8") as fin:
|
||||
with open(file, encoding="utf-8") as fin:
|
||||
data_raw = json.load(fin)
|
||||
data_raw["race_id"] = file
|
||||
lines.append(data_raw)
|
||||
@@ -296,7 +295,7 @@ class RaceProcessor(DataProcessor):
|
||||
"""Creates examples for the training and dev sets."""
|
||||
examples = []
|
||||
for _, data_raw in enumerate(lines):
|
||||
race_id = "%s-%s" % (set_type, data_raw["race_id"])
|
||||
race_id = "{}-{}".format(set_type, data_raw["race_id"])
|
||||
article = data_raw["article"]
|
||||
for i in range(len(data_raw["answers"])):
|
||||
truth = str(ord(data_raw["answers"][i]) - ord("A"))
|
||||
@@ -320,17 +319,17 @@ class SynonymProcessor(DataProcessor):
|
||||
|
||||
def get_train_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info("LOOKING AT {} train".format(data_dir))
|
||||
logger.info(f"LOOKING AT {data_dir} train")
|
||||
return self._create_examples(self._read_csv(os.path.join(data_dir, "mctrain.csv")), "train")
|
||||
|
||||
def get_dev_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info("LOOKING AT {} dev".format(data_dir))
|
||||
logger.info(f"LOOKING AT {data_dir} dev")
|
||||
return self._create_examples(self._read_csv(os.path.join(data_dir, "mchp.csv")), "dev")
|
||||
|
||||
def get_test_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info("LOOKING AT {} dev".format(data_dir))
|
||||
logger.info(f"LOOKING AT {data_dir} dev")
|
||||
|
||||
return self._create_examples(self._read_csv(os.path.join(data_dir, "mctest.csv")), "test")
|
||||
|
||||
@@ -339,10 +338,10 @@ class SynonymProcessor(DataProcessor):
|
||||
return ["0", "1", "2", "3", "4"]
|
||||
|
||||
def _read_csv(self, input_file):
|
||||
with open(input_file, "r", encoding="utf-8") as f:
|
||||
with open(input_file, encoding="utf-8") as f:
|
||||
return list(csv.reader(f))
|
||||
|
||||
def _create_examples(self, lines: List[List[str]], type: str):
|
||||
def _create_examples(self, lines: list[list[str]], type: str):
|
||||
"""Creates examples for the training and dev sets."""
|
||||
|
||||
examples = [
|
||||
@@ -366,17 +365,17 @@ class SwagProcessor(DataProcessor):
|
||||
|
||||
def get_train_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info("LOOKING AT {} train".format(data_dir))
|
||||
logger.info(f"LOOKING AT {data_dir} train")
|
||||
return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")
|
||||
|
||||
def get_dev_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info("LOOKING AT {} dev".format(data_dir))
|
||||
logger.info(f"LOOKING AT {data_dir} dev")
|
||||
return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")
|
||||
|
||||
def get_test_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info("LOOKING AT {} dev".format(data_dir))
|
||||
logger.info(f"LOOKING AT {data_dir} dev")
|
||||
raise ValueError(
|
||||
"For swag testing, the input file does not contain a label column. It can not be tested in current code "
|
||||
"setting!"
|
||||
@@ -388,10 +387,10 @@ class SwagProcessor(DataProcessor):
|
||||
return ["0", "1", "2", "3"]
|
||||
|
||||
def _read_csv(self, input_file):
|
||||
with open(input_file, "r", encoding="utf-8") as f:
|
||||
with open(input_file, encoding="utf-8") as f:
|
||||
return list(csv.reader(f))
|
||||
|
||||
def _create_examples(self, lines: List[List[str]], type: str):
|
||||
def _create_examples(self, lines: list[list[str]], type: str):
|
||||
"""Creates examples for the training and dev sets."""
|
||||
if type == "train" and lines[0][-1] != "label":
|
||||
raise ValueError("For training, the input file must contain a label column.")
|
||||
@@ -417,16 +416,16 @@ class ArcProcessor(DataProcessor):
|
||||
|
||||
def get_train_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info("LOOKING AT {} train".format(data_dir))
|
||||
logger.info(f"LOOKING AT {data_dir} train")
|
||||
return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")
|
||||
|
||||
def get_dev_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info("LOOKING AT {} dev".format(data_dir))
|
||||
logger.info(f"LOOKING AT {data_dir} dev")
|
||||
return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")
|
||||
|
||||
def get_test_examples(self, data_dir):
|
||||
logger.info("LOOKING AT {} test".format(data_dir))
|
||||
logger.info(f"LOOKING AT {data_dir} test")
|
||||
return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")
|
||||
|
||||
def get_labels(self):
|
||||
@@ -434,7 +433,7 @@ class ArcProcessor(DataProcessor):
|
||||
return ["0", "1", "2", "3"]
|
||||
|
||||
def _read_json(self, input_file):
|
||||
with open(input_file, "r", encoding="utf-8") as fin:
|
||||
with open(input_file, encoding="utf-8") as fin:
|
||||
lines = fin.readlines()
|
||||
return lines
|
||||
|
||||
@@ -504,11 +503,11 @@ class ArcProcessor(DataProcessor):
|
||||
|
||||
|
||||
def convert_examples_to_features(
|
||||
examples: List[InputExample],
|
||||
label_list: List[str],
|
||||
examples: list[InputExample],
|
||||
label_list: list[str],
|
||||
max_length: int,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
) -> List[InputFeatures]:
|
||||
) -> list[InputFeatures]:
|
||||
"""
|
||||
Loads a data file into a list of `InputFeatures`
|
||||
"""
|
||||
|
||||
@@ -2,7 +2,7 @@ import argparse
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
from typing import Any
|
||||
|
||||
import pytorch_lightning as pl
|
||||
from pytorch_lightning.utilities import rank_zero_info
|
||||
@@ -201,7 +201,7 @@ class BaseTransformer(pl.LightningModule):
|
||||
)
|
||||
|
||||
@pl.utilities.rank_zero_only
|
||||
def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
|
||||
def on_save_checkpoint(self, checkpoint: dict[str, Any]) -> None:
|
||||
save_path = self.output_dir.joinpath("best_tfmr")
|
||||
self.model.config.save_step = self.step_count
|
||||
self.model.save_pretrained(save_path)
|
||||
@@ -282,7 +282,7 @@ class LoggingCallback(pl.Callback):
|
||||
# Log results
|
||||
for key in sorted(metrics):
|
||||
if key not in ["log", "progress_bar"]:
|
||||
rank_zero_info("{} = {}\n".format(key, str(metrics[key])))
|
||||
rank_zero_info(f"{key} = {str(metrics[key])}\n")
|
||||
|
||||
def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
|
||||
rank_zero_info("***** Test results *****")
|
||||
@@ -292,8 +292,8 @@ class LoggingCallback(pl.Callback):
|
||||
with open(output_test_results_file, "w") as writer:
|
||||
for key in sorted(metrics):
|
||||
if key not in ["log", "progress_bar"]:
|
||||
rank_zero_info("{} = {}\n".format(key, str(metrics[key])))
|
||||
writer.write("{} = {}\n".format(key, str(metrics[key])))
|
||||
rank_zero_info(f"{key} = {str(metrics[key])}\n")
|
||||
writer.write(f"{key} = {str(metrics[key])}\n")
|
||||
|
||||
|
||||
def add_generic_args(parser, root_dir) -> None:
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
@@ -231,14 +230,14 @@ def train(args, train_dataset, model, tokenizer):
|
||||
if args.local_rank == -1 and args.evaluate_during_training:
|
||||
results = evaluate(args, model, tokenizer)
|
||||
for key, value in results.items():
|
||||
tb_writer.add_scalar("eval_{}".format(key), value, global_step)
|
||||
tb_writer.add_scalar(f"eval_{key}", value, global_step)
|
||||
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
|
||||
logging_loss = tr_loss
|
||||
|
||||
# Save model checkpoint
|
||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
|
||||
output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}")
|
||||
# Take care of distributed/parallel training
|
||||
model_to_save = model.module if hasattr(model, "module") else model
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
@@ -281,7 +280,7 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Eval!
|
||||
logger.info("***** Running evaluation {} *****".format(prefix))
|
||||
logger.info(f"***** Running evaluation {prefix} *****")
|
||||
logger.info(" Num examples = %d", len(dataset))
|
||||
logger.info(" Batch size = %d", args.eval_batch_size)
|
||||
|
||||
@@ -348,11 +347,11 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))
|
||||
|
||||
# Compute predictions
|
||||
output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
|
||||
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
|
||||
output_prediction_file = os.path.join(args.output_dir, f"predictions_{prefix}.json")
|
||||
output_nbest_file = os.path.join(args.output_dir, f"nbest_predictions_{prefix}.json")
|
||||
|
||||
if args.version_2_with_negative:
|
||||
output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
|
||||
output_null_log_odds_file = os.path.join(args.output_dir, f"null_odds_{prefix}.json")
|
||||
else:
|
||||
output_null_log_odds_file = None
|
||||
|
||||
@@ -828,10 +827,10 @@ def main():
|
||||
# Evaluate
|
||||
result = evaluate(args, model, tokenizer, prefix=global_step)
|
||||
|
||||
result = {k + ("_{}".format(global_step) if global_step else ""): v for k, v in result.items()}
|
||||
result = {k + (f"_{global_step}" if global_step else ""): v for k, v in result.items()}
|
||||
results.update(result)
|
||||
|
||||
logger.info("Results: {}".format(results))
|
||||
logger.info(f"Results: {results}")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
|
||||
@@ -20,10 +20,10 @@ def fill_mask(masked_input, model, tokenizer, topk=5):
|
||||
topk_filled_outputs = []
|
||||
for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")):
|
||||
predicted_token = predicted_token_bpe.replace("\u2581", " ")
|
||||
if " {0}".format(masked_token) in masked_input:
|
||||
if f" {masked_token}" in masked_input:
|
||||
topk_filled_outputs.append(
|
||||
(
|
||||
masked_input.replace(" {0}".format(masked_token), predicted_token),
|
||||
masked_input.replace(f" {masked_token}", predicted_token),
|
||||
values[index].item(),
|
||||
predicted_token,
|
||||
)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#!/usr/bin/env python
|
||||
import argparse
|
||||
import json
|
||||
from typing import List
|
||||
|
||||
from ltp import LTP
|
||||
|
||||
@@ -42,7 +41,7 @@ def is_chinese(word: str):
|
||||
return 1
|
||||
|
||||
|
||||
def get_chinese_word(tokens: List[str]):
|
||||
def get_chinese_word(tokens: list[str]):
|
||||
word_set = set()
|
||||
|
||||
for token in tokens:
|
||||
@@ -53,7 +52,7 @@ def get_chinese_word(tokens: List[str]):
|
||||
return word_list
|
||||
|
||||
|
||||
def add_sub_symbol(bert_tokens: List[str], chinese_word_set: set()):
|
||||
def add_sub_symbol(bert_tokens: list[str], chinese_word_set: set()):
|
||||
if not chinese_word_set:
|
||||
return bert_tokens
|
||||
max_word_len = max([len(w) for w in chinese_word_set])
|
||||
@@ -77,7 +76,7 @@ def add_sub_symbol(bert_tokens: List[str], chinese_word_set: set()):
|
||||
return bert_word
|
||||
|
||||
|
||||
def prepare_ref(lines: List[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokenizer):
|
||||
def prepare_ref(lines: list[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokenizer):
|
||||
ltp_res = []
|
||||
|
||||
for i in range(0, len(lines), 100):
|
||||
@@ -117,7 +116,7 @@ def prepare_ref(lines: List[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokeni
|
||||
def main(args):
|
||||
# For Chinese (Ro)Bert, the best result is from : RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm)
|
||||
# If we want to fine-tune these model, we have to use same tokenizer : LTP (https://github.com/HIT-SCIR/ltp)
|
||||
with open(args.file_name, "r", encoding="utf-8") as f:
|
||||
with open(args.file_name, encoding="utf-8") as f:
|
||||
data = f.readlines()
|
||||
data = [line.strip() for line in data if len(line) > 0 and not line.isspace()] # avoid delimiter like '\u2029'
|
||||
ltp_tokenizer = LTP(args.ltp) # faster in GPU device
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
@@ -358,7 +357,7 @@ def main():
|
||||
logger.info("***** Eval results *****")
|
||||
for key in sorted(result.keys()):
|
||||
logger.info(" %s = %s", key, str(result[key]))
|
||||
writer.write("%s = %s\n" % (key, str(result[key])))
|
||||
writer.write("{} = {}\n".format(key, str(result[key])))
|
||||
|
||||
results.update(result)
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
@@ -163,7 +162,7 @@ def main():
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
n_gpu = torch.cuda.device_count()
|
||||
logger.info("device: {}, n_gpu {}".format(device, n_gpu))
|
||||
logger.info(f"device: {device}, n_gpu {n_gpu}")
|
||||
|
||||
if not args.do_train and not args.do_eval:
|
||||
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
|
||||
@@ -261,7 +260,7 @@ def main():
|
||||
loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
|
||||
)
|
||||
nb_tr_steps += 1
|
||||
tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])
|
||||
tqdm_bar.desc = f"Training loss: {exp_average_loss:.2e} lr: {scheduler.get_lr()[0]:.2e}"
|
||||
|
||||
# Save a trained model
|
||||
if args.do_train:
|
||||
@@ -313,7 +312,7 @@ def main():
|
||||
logger.info("***** Eval results *****")
|
||||
for key in sorted(result.keys()):
|
||||
logger.info(" %s = %s", key, str(result[key]))
|
||||
writer.write("%s = %s\n" % (key, str(result[key])))
|
||||
writer.write("{} = {}\n".format(key, str(result[key])))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
@@ -51,7 +50,7 @@ except ImportError:
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SwagExample(object):
|
||||
class SwagExample:
|
||||
"""A single training/test example for the SWAG dataset."""
|
||||
|
||||
def __init__(self, swag_id, context_sentence, start_ending, ending_0, ending_1, ending_2, ending_3, label=None):
|
||||
@@ -71,22 +70,22 @@ class SwagExample(object):
|
||||
|
||||
def __repr__(self):
|
||||
attributes = [
|
||||
"swag_id: {}".format(self.swag_id),
|
||||
"context_sentence: {}".format(self.context_sentence),
|
||||
"start_ending: {}".format(self.start_ending),
|
||||
"ending_0: {}".format(self.endings[0]),
|
||||
"ending_1: {}".format(self.endings[1]),
|
||||
"ending_2: {}".format(self.endings[2]),
|
||||
"ending_3: {}".format(self.endings[3]),
|
||||
f"swag_id: {self.swag_id}",
|
||||
f"context_sentence: {self.context_sentence}",
|
||||
f"start_ending: {self.start_ending}",
|
||||
f"ending_0: {self.endings[0]}",
|
||||
f"ending_1: {self.endings[1]}",
|
||||
f"ending_2: {self.endings[2]}",
|
||||
f"ending_3: {self.endings[3]}",
|
||||
]
|
||||
|
||||
if self.label is not None:
|
||||
attributes.append("label: {}".format(self.label))
|
||||
attributes.append(f"label: {self.label}")
|
||||
|
||||
return ", ".join(attributes)
|
||||
|
||||
|
||||
class InputFeatures(object):
|
||||
class InputFeatures:
|
||||
def __init__(self, example_id, choices_features, label):
|
||||
self.example_id = example_id
|
||||
self.choices_features = [
|
||||
@@ -97,7 +96,7 @@ class InputFeatures(object):
|
||||
|
||||
|
||||
def read_swag_examples(input_file, is_training=True):
|
||||
with open(input_file, "r", encoding="utf-8") as f:
|
||||
with open(input_file, encoding="utf-8") as f:
|
||||
lines = list(csv.reader(f))
|
||||
|
||||
if is_training and lines[0][-1] != "label":
|
||||
@@ -179,15 +178,15 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, is_trainin
|
||||
label = example.label
|
||||
if example_index < 5:
|
||||
logger.info("*** Example ***")
|
||||
logger.info("swag_id: {}".format(example.swag_id))
|
||||
logger.info(f"swag_id: {example.swag_id}")
|
||||
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
|
||||
logger.info("choice: {}".format(choice_idx))
|
||||
logger.info(f"choice: {choice_idx}")
|
||||
logger.info("tokens: {}".format(" ".join(tokens)))
|
||||
logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
|
||||
logger.info("input_mask: {}".format(" ".join(map(str, input_mask))))
|
||||
logger.info("segment_ids: {}".format(" ".join(map(str, segment_ids))))
|
||||
if is_training:
|
||||
logger.info("label: {}".format(label))
|
||||
logger.info(f"label: {label}")
|
||||
|
||||
features.append(InputFeatures(example_id=example.swag_id, choices_features=choices_features, label=label))
|
||||
|
||||
@@ -382,14 +381,14 @@ def train(args, train_dataset, model, tokenizer):
|
||||
): # Only evaluate when single GPU otherwise metrics may not average well
|
||||
results = evaluate(args, model, tokenizer)
|
||||
for key, value in results.items():
|
||||
tb_writer.add_scalar("eval_{}".format(key), value, global_step)
|
||||
tb_writer.add_scalar(f"eval_{key}", value, global_step)
|
||||
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
|
||||
logging_loss = tr_loss
|
||||
|
||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||
# Save model checkpoint
|
||||
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
|
||||
output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}")
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
@@ -423,7 +422,7 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||
|
||||
# Eval!
|
||||
logger.info("***** Running evaluation {} *****".format(prefix))
|
||||
logger.info(f"***** Running evaluation {prefix} *****")
|
||||
logger.info(" Num examples = %d", len(dataset))
|
||||
logger.info(" Batch size = %d", args.eval_batch_size)
|
||||
|
||||
@@ -466,7 +465,7 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
logger.info("***** Eval results *****")
|
||||
for key in sorted(result.keys()):
|
||||
logger.info("%s = %s", key, str(result[key]))
|
||||
writer.write("%s = %s\n" % (key, str(result[key])))
|
||||
writer.write("{} = {}\n".format(key, str(result[key])))
|
||||
|
||||
return result
|
||||
|
||||
@@ -710,10 +709,10 @@ def main():
|
||||
# Evaluate
|
||||
result = evaluate(args, model, tokenizer, prefix=global_step)
|
||||
|
||||
result = {k + ("_{}".format(global_step) if global_step else ""): v for k, v in result.items()}
|
||||
result = {k + (f"_{global_step}" if global_step else ""): v for k, v in result.items()}
|
||||
results.update(result)
|
||||
|
||||
logger.info("Results: {}".format(results))
|
||||
logger.info(f"Results: {results}")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
@@ -66,7 +65,7 @@ def main():
|
||||
ptvsd.wait_for_attach()
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
logger.info("device: {}".format(device))
|
||||
logger.info(f"device: {device}")
|
||||
|
||||
# Load a pre-processed dataset
|
||||
# You can also build the corpus yourself using TransfoXLCorpus methods
|
||||
@@ -111,7 +110,7 @@ def main():
|
||||
total_loss += seq_len * loss.item()
|
||||
total_len += seq_len
|
||||
total_time = time.time() - start_time
|
||||
logger.info("Time : {:.2f}s, {:.2f}ms/segment".format(total_time, 1000 * total_time / (idx + 1)))
|
||||
logger.info(f"Time : {total_time:.2f}s, {1000 * total_time / (idx + 1):.2f}ms/segment")
|
||||
return total_loss / total_len
|
||||
|
||||
# Run on test data.
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 Huggingface
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@@ -13,7 +12,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import io
|
||||
import json
|
||||
import unittest
|
||||
|
||||
@@ -25,7 +23,7 @@ from utils import calculate_bleu
|
||||
|
||||
|
||||
filename = get_tests_dir() + "/test_data/fsmt/fsmt_val_data.json"
|
||||
with io.open(filename, "r", encoding="utf-8") as f:
|
||||
with open(filename, encoding="utf-8") as f:
|
||||
bleu_data = json.load(f)
|
||||
|
||||
|
||||
|
||||
@@ -19,7 +19,6 @@ import time
|
||||
from json import JSONDecodeError
|
||||
from logging import getLogger
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
@@ -55,10 +54,10 @@ def eval_data_dir(
|
||||
task="summarization",
|
||||
local_rank=None,
|
||||
num_return_sequences=1,
|
||||
dataset_kwargs: Dict = None,
|
||||
dataset_kwargs: dict = None,
|
||||
prefix="",
|
||||
**generate_kwargs,
|
||||
) -> Dict:
|
||||
) -> dict:
|
||||
"""Run evaluation on part of the data for one gpu and save to {save_dir}/rank_{rank}_output.json"""
|
||||
model_name = str(model_name)
|
||||
assert local_rank is not None
|
||||
@@ -211,7 +210,7 @@ def run_generate():
|
||||
calc_bleu = "translation" in args.task
|
||||
score_fn = calculate_bleu if calc_bleu else calculate_rouge
|
||||
metric_name = "bleu" if calc_bleu else "rouge"
|
||||
metrics: Dict = score_fn(preds, labels)
|
||||
metrics: dict = score_fn(preds, labels)
|
||||
metrics["n_obs"] = len(preds)
|
||||
runtime = time.time() - start_time
|
||||
metrics["seconds_per_sample"] = round(runtime / metrics["n_obs"], 4)
|
||||
@@ -227,7 +226,7 @@ def run_generate():
|
||||
shutil.rmtree(json_save_dir)
|
||||
|
||||
|
||||
def combine_partial_results(partial_results) -> List:
|
||||
def combine_partial_results(partial_results) -> list:
|
||||
"""Concatenate partial results into one file, then sort it by id."""
|
||||
records = []
|
||||
for partial_result in partial_results:
|
||||
@@ -237,7 +236,7 @@ def combine_partial_results(partial_results) -> List:
|
||||
return preds
|
||||
|
||||
|
||||
def gather_results_from_each_node(num_replicas, save_dir, timeout) -> List[Dict[str, List]]:
|
||||
def gather_results_from_each_node(num_replicas, save_dir, timeout) -> list[dict[str, list]]:
|
||||
# WAIT FOR lots of .json files
|
||||
start_wait = time.time()
|
||||
logger.info("waiting for all nodes to finish")
|
||||
|
||||
@@ -20,7 +20,6 @@ import time
|
||||
import warnings
|
||||
from logging import getLogger
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
@@ -36,7 +35,7 @@ DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
|
||||
|
||||
def generate_summaries_or_translations(
|
||||
examples: List[str],
|
||||
examples: list[str],
|
||||
out_file: str,
|
||||
model_name: str,
|
||||
batch_size: int = 8,
|
||||
@@ -45,7 +44,7 @@ def generate_summaries_or_translations(
|
||||
task="summarization",
|
||||
prefix=None,
|
||||
**generate_kwargs,
|
||||
) -> Dict:
|
||||
) -> dict:
|
||||
"""Save model.generate results to <out_file>, and return how long it took."""
|
||||
fout = Path(out_file).open("w", encoding="utf-8")
|
||||
model_name = str(model_name)
|
||||
|
||||
@@ -34,7 +34,7 @@ task_score_names = {
|
||||
|
||||
def parse_search_arg(search):
|
||||
groups = search.split()
|
||||
entries = dict((g.split("=") for g in groups))
|
||||
entries = dict(g.split("=") for g in groups)
|
||||
entry_names = list(entries.keys())
|
||||
sets = [[f"--{k} {v}" for v in vs.split(":")] for k, vs in entries.items()]
|
||||
matrix = [list(x) for x in itertools.product(*sets)]
|
||||
@@ -105,7 +105,7 @@ def run_search():
|
||||
col_widths = {col: len(str(col)) for col in col_names}
|
||||
results = []
|
||||
for r in matrix:
|
||||
hparams = dict((x.replace("--", "").split() for x in r))
|
||||
hparams = dict(x.replace("--", "").split() for x in r)
|
||||
args_exp = " ".join(r).split()
|
||||
args_exp.extend(["--bs", str(args.bs)]) # in case we need to reduce its size due to CUDA OOM
|
||||
sys.argv = args_normal + args_exp
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@@ -172,10 +172,10 @@ class Seq2SeqTrainer(Trainer):
|
||||
def prediction_step(
|
||||
self,
|
||||
model: nn.Module,
|
||||
inputs: Dict[str, Union[torch.Tensor, Any]],
|
||||
inputs: dict[str, Union[torch.Tensor, Any]],
|
||||
prediction_loss_only: bool,
|
||||
ignore_keys: Optional[List[str]] = None,
|
||||
) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
|
||||
ignore_keys: Optional[list[str]] = None,
|
||||
) -> tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
|
||||
"""
|
||||
Perform an evaluation step on :obj:`model` using obj:`inputs`.
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import io
|
||||
import json
|
||||
import subprocess
|
||||
|
||||
@@ -29,5 +28,5 @@ def get_all_data(pairs, n_objs):
|
||||
|
||||
text = get_all_data(pairs, n_objs)
|
||||
filename = "./fsmt_val_data.json"
|
||||
with io.open(filename, "w", encoding="utf-8") as f:
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
bleu_data = json.dump(text, f, indent=2, ensure_ascii=False)
|
||||
|
||||
@@ -19,9 +19,10 @@ import math
|
||||
import os
|
||||
import pickle
|
||||
import socket
|
||||
from collections.abc import Iterable
|
||||
from logging import getLogger
|
||||
from pathlib import Path
|
||||
from typing import Callable, Dict, Iterable, List, Tuple, Union
|
||||
from typing import Callable, Union
|
||||
|
||||
import git
|
||||
import numpy as np
|
||||
@@ -67,7 +68,7 @@ def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=-100):
|
||||
return loss, nll_loss
|
||||
|
||||
|
||||
def lmap(f: Callable, x: Iterable) -> List:
|
||||
def lmap(f: Callable, x: Iterable) -> list:
|
||||
"""list(map(f, x))"""
|
||||
return list(map(f, x))
|
||||
|
||||
@@ -77,11 +78,11 @@ def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict:
|
||||
return {"bleu": round(corpus_bleu(output_lns, [refs_lns], **kwargs).score, 4)}
|
||||
|
||||
|
||||
def build_compute_metrics_fn(task_name: str, tokenizer: PreTrainedTokenizer) -> Callable[[EvalPrediction], Dict]:
|
||||
def build_compute_metrics_fn(task_name: str, tokenizer: PreTrainedTokenizer) -> Callable[[EvalPrediction], dict]:
|
||||
def non_pad_len(tokens: np.ndarray) -> int:
|
||||
return np.count_nonzero(tokens != tokenizer.pad_token_id)
|
||||
|
||||
def decode_pred(pred: EvalPrediction) -> Tuple[List[str], List[str]]:
|
||||
def decode_pred(pred: EvalPrediction) -> tuple[list[str], list[str]]:
|
||||
pred_ids = pred.predictions
|
||||
label_ids = pred.label_ids
|
||||
pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
|
||||
@@ -91,16 +92,16 @@ def build_compute_metrics_fn(task_name: str, tokenizer: PreTrainedTokenizer) ->
|
||||
label_str = lmap(str.strip, label_str)
|
||||
return pred_str, label_str
|
||||
|
||||
def summarization_metrics(pred: EvalPrediction) -> Dict:
|
||||
def summarization_metrics(pred: EvalPrediction) -> dict:
|
||||
pred_str, label_str = decode_pred(pred)
|
||||
rouge: Dict = calculate_rouge(pred_str, label_str)
|
||||
rouge: dict = calculate_rouge(pred_str, label_str)
|
||||
summ_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1)
|
||||
rouge.update({"gen_len": summ_len})
|
||||
return rouge
|
||||
|
||||
def translation_metrics(pred: EvalPrediction) -> Dict:
|
||||
def translation_metrics(pred: EvalPrediction) -> dict:
|
||||
pred_str, label_str = decode_pred(pred)
|
||||
bleu: Dict = calculate_bleu(pred_str, label_str)
|
||||
bleu: dict = calculate_bleu(pred_str, label_str)
|
||||
gen_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1)
|
||||
bleu.update({"gen_len": gen_len})
|
||||
return bleu
|
||||
@@ -183,7 +184,7 @@ class AbstractSeq2SeqDataset(Dataset):
|
||||
return min(self.src_lens[i], self.max_target_length)
|
||||
|
||||
# call fairseq cython function
|
||||
batch_sampler: List[List[int]] = batch_by_size(
|
||||
batch_sampler: list[list[int]] = batch_by_size(
|
||||
sorted_indices,
|
||||
num_tokens_fn=num_tokens_in_example,
|
||||
max_tokens=max_tokens_per_batch,
|
||||
@@ -207,7 +208,7 @@ class AbstractSeq2SeqDataset(Dataset):
|
||||
|
||||
|
||||
class LegacySeq2SeqDataset(AbstractSeq2SeqDataset):
|
||||
def __getitem__(self, index) -> Dict[str, torch.Tensor]:
|
||||
def __getitem__(self, index) -> dict[str, torch.Tensor]:
|
||||
"""Call tokenizer on src and tgt_lines"""
|
||||
index = index + 1 # linecache starts at 1
|
||||
source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
|
||||
@@ -237,7 +238,7 @@ class LegacySeq2SeqDataset(AbstractSeq2SeqDataset):
|
||||
**self.dataset_kwargs,
|
||||
)
|
||||
|
||||
def collate_fn(self, batch) -> Dict[str, torch.Tensor]:
|
||||
def collate_fn(self, batch) -> dict[str, torch.Tensor]:
|
||||
input_ids = torch.stack([x["input_ids"] for x in batch])
|
||||
masks = torch.stack([x["attention_mask"] for x in batch])
|
||||
target_ids = torch.stack([x["labels"] for x in batch])
|
||||
@@ -255,7 +256,7 @@ class LegacySeq2SeqDataset(AbstractSeq2SeqDataset):
|
||||
class Seq2SeqDataset(AbstractSeq2SeqDataset):
|
||||
"""A dataset that calls prepare_seq2seq_batch."""
|
||||
|
||||
def __getitem__(self, index) -> Dict[str, str]:
|
||||
def __getitem__(self, index) -> dict[str, str]:
|
||||
index = index + 1 # linecache starts at 1
|
||||
source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
|
||||
tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n")
|
||||
@@ -263,9 +264,9 @@ class Seq2SeqDataset(AbstractSeq2SeqDataset):
|
||||
assert tgt_line, f"empty tgt line for index {index}"
|
||||
return {"tgt_texts": tgt_line, "src_texts": source_line, "id": index - 1}
|
||||
|
||||
def collate_fn(self, batch) -> Dict[str, torch.Tensor]:
|
||||
def collate_fn(self, batch) -> dict[str, torch.Tensor]:
|
||||
"""Call prepare_seq2seq_batch."""
|
||||
batch_encoding: Dict[str, torch.Tensor] = self.tokenizer.prepare_seq2seq_batch(
|
||||
batch_encoding: dict[str, torch.Tensor] = self.tokenizer.prepare_seq2seq_batch(
|
||||
[x["src_texts"] for x in batch],
|
||||
tgt_texts=[x["tgt_texts"] for x in batch],
|
||||
max_length=self.max_source_length,
|
||||
@@ -293,7 +294,7 @@ class Seq2SeqDataCollator:
|
||||
if data_args.tgt_lang is not None:
|
||||
self.dataset_kwargs["tgt_lang"] = data_args.tgt_lang
|
||||
|
||||
def __call__(self, batch) -> Dict[str, torch.Tensor]:
|
||||
def __call__(self, batch) -> dict[str, torch.Tensor]:
|
||||
if hasattr(self.tokenizer, "prepare_seq2seq_batch"):
|
||||
batch = self._encode(batch)
|
||||
input_ids, attention_mask, labels = (
|
||||
@@ -329,7 +330,7 @@ class Seq2SeqDataCollator:
|
||||
shifted_input_ids[..., 0] = self.pad_token_id
|
||||
return shifted_input_ids
|
||||
|
||||
def _encode(self, batch) -> Dict[str, torch.Tensor]:
|
||||
def _encode(self, batch) -> dict[str, torch.Tensor]:
|
||||
batch_encoding = self.tokenizer.prepare_seq2seq_batch(
|
||||
[x["src_texts"] for x in batch],
|
||||
tgt_texts=[x["tgt_texts"] for x in batch],
|
||||
@@ -355,7 +356,7 @@ class SortishSampler(Sampler):
|
||||
return iter(sortish_sampler_indices(self.data, self.bs, shuffle=self.shuffle))
|
||||
|
||||
|
||||
def sortish_sampler_indices(data: List, bs: int, shuffle=True) -> np.array:
|
||||
def sortish_sampler_indices(data: list, bs: int, shuffle=True) -> np.array:
|
||||
"Go through the text data by order of src length with a bit of randomness. From fastai repo."
|
||||
if not shuffle:
|
||||
return np.argsort(np.array(data) * -1)
|
||||
@@ -455,7 +456,7 @@ def pickle_save(obj, path):
|
||||
return pickle.dump(obj, f)
|
||||
|
||||
|
||||
def flatten_list(summary_ids: List[List]):
|
||||
def flatten_list(summary_ids: list[list]):
|
||||
return list(itertools.chain.from_iterable(summary_ids))
|
||||
|
||||
|
||||
@@ -506,14 +507,14 @@ def extract_rouge_mid_statistics(dct):
|
||||
|
||||
|
||||
def calculate_rouge(
|
||||
pred_lns: List[str],
|
||||
tgt_lns: List[str],
|
||||
pred_lns: list[str],
|
||||
tgt_lns: list[str],
|
||||
use_stemmer=True,
|
||||
rouge_keys=ROUGE_KEYS,
|
||||
return_precision_and_recall=False,
|
||||
bootstrap_aggregation=True,
|
||||
newline_sep=True,
|
||||
) -> Dict:
|
||||
) -> dict:
|
||||
"""Calculate rouge using rouge_scorer package.
|
||||
|
||||
Args:
|
||||
@@ -590,19 +591,19 @@ def any_requires_grad(model: nn.Module) -> bool:
|
||||
|
||||
|
||||
def assert_all_frozen(model):
|
||||
model_grads: List[bool] = list(grad_status(model))
|
||||
model_grads: list[bool] = list(grad_status(model))
|
||||
n_require_grad = sum(lmap(int, model_grads))
|
||||
npars = len(model_grads)
|
||||
assert not any(model_grads), f"{n_require_grad / npars:.1%} of {npars} weights require grad"
|
||||
|
||||
|
||||
def assert_not_all_frozen(model):
|
||||
model_grads: List[bool] = list(grad_status(model))
|
||||
model_grads: list[bool] = list(grad_status(model))
|
||||
npars = len(model_grads)
|
||||
assert any(model_grads), f"none of {npars} weights require grad"
|
||||
|
||||
|
||||
def parse_numeric_n_bool_cl_kwargs(unparsed_args: List[str]) -> Dict[str, Union[int, float, bool]]:
|
||||
def parse_numeric_n_bool_cl_kwargs(unparsed_args: list[str]) -> dict[str, Union[int, float, bool]]:
|
||||
"""
|
||||
Parse an argv list of unspecified command line args to a dict.
|
||||
Assumes all values are either numeric or boolean in the form of true/false.
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
@@ -20,7 +19,7 @@ import os
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from importlib import import_module
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
|
||||
@@ -159,7 +158,7 @@ def main():
|
||||
|
||||
# Prepare CONLL-2003 task
|
||||
labels = token_classification_task.get_labels(data_args.labels)
|
||||
label_map: Dict[int, str] = dict(enumerate(labels))
|
||||
label_map: dict[int, str] = dict(enumerate(labels))
|
||||
num_labels = len(labels)
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
@@ -217,7 +216,7 @@ def main():
|
||||
else None
|
||||
)
|
||||
|
||||
def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]:
|
||||
def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> tuple[list[int], list[int]]:
|
||||
preds = np.argmax(predictions, axis=2)
|
||||
|
||||
batch_size, seq_len = preds.shape
|
||||
@@ -233,7 +232,7 @@ def main():
|
||||
|
||||
return preds_list, out_label_list
|
||||
|
||||
def compute_metrics(p: EvalPrediction) -> Dict:
|
||||
def compute_metrics(p: EvalPrediction) -> dict:
|
||||
preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
|
||||
return {
|
||||
"accuracy_score": accuracy_score(out_label_list, preds_list),
|
||||
@@ -279,7 +278,7 @@ def main():
|
||||
logger.info("***** Eval results *****")
|
||||
for key, value in result.items():
|
||||
logger.info(" %s = %s", key, value)
|
||||
writer.write("%s = %s\n" % (key, value))
|
||||
writer.write("{} = {}\n".format(key, value))
|
||||
|
||||
results.update(result)
|
||||
|
||||
@@ -304,13 +303,13 @@ def main():
|
||||
with open(output_test_results_file, "w") as writer:
|
||||
for key, value in metrics.items():
|
||||
logger.info(" %s = %s", key, value)
|
||||
writer.write("%s = %s\n" % (key, value))
|
||||
writer.write("{} = {}\n".format(key, value))
|
||||
|
||||
# Save predictions
|
||||
output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
|
||||
if trainer.is_world_process_zero():
|
||||
with open(output_test_predictions_file, "w") as writer:
|
||||
with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
|
||||
with open(os.path.join(data_args.data_dir, "test.txt")) as f:
|
||||
token_classification_task.write_predictions_to_file(writer, f, preds_list)
|
||||
|
||||
return results
|
||||
|
||||
@@ -12,7 +12,7 @@ subword_len_counter = 0
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
|
||||
max_len -= tokenizer.num_special_tokens_to_add()
|
||||
|
||||
with open(dataset, "rt") as f_p:
|
||||
with open(dataset) as f_p:
|
||||
for line in f_p:
|
||||
line = line.rstrip()
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import logging
|
||||
import os
|
||||
from typing import List, TextIO, Union
|
||||
from typing import TextIO, Union
|
||||
|
||||
from conllu import parse_incr
|
||||
from utils_ner import InputExample, Split, TokenClassificationTask
|
||||
@@ -14,7 +14,7 @@ class NER(TokenClassificationTask):
|
||||
# in NER datasets, the last column is usually reserved for NER label
|
||||
self.label_idx = label_idx
|
||||
|
||||
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
|
||||
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> list[InputExample]:
|
||||
if isinstance(mode, Split):
|
||||
mode = mode.value
|
||||
file_path = os.path.join(data_dir, f"{mode}.txt")
|
||||
@@ -42,7 +42,7 @@ class NER(TokenClassificationTask):
|
||||
examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
|
||||
return examples
|
||||
|
||||
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
|
||||
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: list):
|
||||
example_id = 0
|
||||
for line in test_input_reader:
|
||||
if line.startswith("-DOCSTART-") or line == "" or line == "\n":
|
||||
@@ -55,9 +55,9 @@ class NER(TokenClassificationTask):
|
||||
else:
|
||||
logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
|
||||
|
||||
def get_labels(self, path: str) -> List[str]:
|
||||
def get_labels(self, path: str) -> list[str]:
|
||||
if path:
|
||||
with open(path, "r") as f:
|
||||
with open(path) as f:
|
||||
labels = f.read().splitlines()
|
||||
if "O" not in labels:
|
||||
labels = ["O"] + labels
|
||||
@@ -71,9 +71,9 @@ class Chunk(NER):
|
||||
# in CONLL2003 dataset chunk column is second-to-last
|
||||
super().__init__(label_idx=-2)
|
||||
|
||||
def get_labels(self, path: str) -> List[str]:
|
||||
def get_labels(self, path: str) -> list[str]:
|
||||
if path:
|
||||
with open(path, "r") as f:
|
||||
with open(path) as f:
|
||||
labels = f.read().splitlines()
|
||||
if "O" not in labels:
|
||||
labels = ["O"] + labels
|
||||
@@ -105,7 +105,7 @@ class Chunk(NER):
|
||||
|
||||
|
||||
class POS(TokenClassificationTask):
|
||||
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
|
||||
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> list[InputExample]:
|
||||
if isinstance(mode, Split):
|
||||
mode = mode.value
|
||||
file_path = os.path.join(data_dir, f"{mode}.txt")
|
||||
@@ -125,7 +125,7 @@ class POS(TokenClassificationTask):
|
||||
guid_index += 1
|
||||
return examples
|
||||
|
||||
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
|
||||
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: list):
|
||||
example_id = 0
|
||||
for sentence in parse_incr(test_input_reader):
|
||||
s_p = preds_list[example_id]
|
||||
@@ -136,9 +136,9 @@ class POS(TokenClassificationTask):
|
||||
writer.write(out)
|
||||
example_id += 1
|
||||
|
||||
def get_labels(self, path: str) -> List[str]:
|
||||
def get_labels(self, path: str) -> list[str]:
|
||||
if path:
|
||||
with open(path, "r") as f:
|
||||
with open(path) as f:
|
||||
return f.read().splitlines()
|
||||
else:
|
||||
return [
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
@@ -19,7 +18,7 @@ import logging
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import List, Optional, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
from filelock import FileLock
|
||||
|
||||
@@ -42,8 +41,8 @@ class InputExample:
|
||||
"""
|
||||
|
||||
guid: str
|
||||
words: List[str]
|
||||
labels: Optional[List[str]]
|
||||
words: list[str]
|
||||
labels: Optional[list[str]]
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -53,10 +52,10 @@ class InputFeatures:
|
||||
Property names are the same names as the corresponding inputs to a model.
|
||||
"""
|
||||
|
||||
input_ids: List[int]
|
||||
attention_mask: List[int]
|
||||
token_type_ids: Optional[List[int]] = None
|
||||
label_ids: Optional[List[int]] = None
|
||||
input_ids: list[int]
|
||||
attention_mask: list[int]
|
||||
token_type_ids: Optional[list[int]] = None
|
||||
label_ids: Optional[list[int]] = None
|
||||
|
||||
|
||||
class Split(Enum):
|
||||
@@ -67,17 +66,17 @@ class Split(Enum):
|
||||
|
||||
class TokenClassificationTask:
|
||||
@staticmethod
|
||||
def read_examples_from_file(data_dir, mode: Union[Split, str]) -> List[InputExample]:
|
||||
def read_examples_from_file(data_dir, mode: Union[Split, str]) -> list[InputExample]:
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def get_labels(path: str) -> List[str]:
|
||||
def get_labels(path: str) -> list[str]:
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def convert_examples_to_features(
|
||||
examples: List[InputExample],
|
||||
label_list: List[str],
|
||||
examples: list[InputExample],
|
||||
label_list: list[str],
|
||||
max_seq_length: int,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
cls_token_at_end=False,
|
||||
@@ -91,7 +90,7 @@ class TokenClassificationTask:
|
||||
pad_token_label_id=-100,
|
||||
sequence_a_segment_id=0,
|
||||
mask_padding_with_zero=True,
|
||||
) -> List[InputFeatures]:
|
||||
) -> list[InputFeatures]:
|
||||
"""Loads a data file into a list of `InputFeatures`
|
||||
`cls_token_at_end` define the location of the CLS token:
|
||||
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
|
||||
@@ -214,7 +213,7 @@ if is_torch_available():
|
||||
soon.
|
||||
"""
|
||||
|
||||
features: List[InputFeatures]
|
||||
features: list[InputFeatures]
|
||||
pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index
|
||||
# Use cross entropy ignore_index as padding label id so that only
|
||||
# real label ids contribute to the loss later.
|
||||
@@ -224,7 +223,7 @@ if is_torch_available():
|
||||
token_classification_task: TokenClassificationTask,
|
||||
data_dir: str,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
labels: List[str],
|
||||
labels: list[str],
|
||||
model_type: str,
|
||||
max_seq_length: Optional[int] = None,
|
||||
overwrite_cache=False,
|
||||
@@ -233,7 +232,7 @@ if is_torch_available():
|
||||
# Load data features from cache or dataset file
|
||||
cached_features_file = os.path.join(
|
||||
data_dir,
|
||||
"cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
|
||||
f"cached_{mode.value}_{tokenizer.__class__.__name__}_{str(max_seq_length)}",
|
||||
)
|
||||
|
||||
# Make sure only the first process in distributed training processes the dataset,
|
||||
@@ -283,7 +282,7 @@ if is_tf_available():
|
||||
soon.
|
||||
"""
|
||||
|
||||
features: List[InputFeatures]
|
||||
features: list[InputFeatures]
|
||||
pad_token_label_id: int = -100
|
||||
# Use cross entropy ignore_index as padding label id so that only
|
||||
# real label ids contribute to the loss later.
|
||||
@@ -293,7 +292,7 @@ if is_tf_available():
|
||||
token_classification_task: TokenClassificationTask,
|
||||
data_dir: str,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
labels: List[str],
|
||||
labels: list[str],
|
||||
model_type: str,
|
||||
max_seq_length: Optional[int] = None,
|
||||
overwrite_cache=False,
|
||||
|
||||
Reference in New Issue
Block a user