Use Python 3.9 syntax in examples (#37279)
Signed-off-by: cyy <cyyever@outlook.com>
This commit is contained in:
@@ -1,4 +1,3 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
@@ -20,7 +19,7 @@ import os
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from importlib import import_module
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
|
||||
@@ -159,7 +158,7 @@ def main():
|
||||
|
||||
# Prepare CONLL-2003 task
|
||||
labels = token_classification_task.get_labels(data_args.labels)
|
||||
label_map: Dict[int, str] = dict(enumerate(labels))
|
||||
label_map: dict[int, str] = dict(enumerate(labels))
|
||||
num_labels = len(labels)
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
@@ -217,7 +216,7 @@ def main():
|
||||
else None
|
||||
)
|
||||
|
||||
def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]:
|
||||
def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> tuple[list[int], list[int]]:
|
||||
preds = np.argmax(predictions, axis=2)
|
||||
|
||||
batch_size, seq_len = preds.shape
|
||||
@@ -233,7 +232,7 @@ def main():
|
||||
|
||||
return preds_list, out_label_list
|
||||
|
||||
def compute_metrics(p: EvalPrediction) -> Dict:
|
||||
def compute_metrics(p: EvalPrediction) -> dict:
|
||||
preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
|
||||
return {
|
||||
"accuracy_score": accuracy_score(out_label_list, preds_list),
|
||||
@@ -279,7 +278,7 @@ def main():
|
||||
logger.info("***** Eval results *****")
|
||||
for key, value in result.items():
|
||||
logger.info(" %s = %s", key, value)
|
||||
writer.write("%s = %s\n" % (key, value))
|
||||
writer.write("{} = {}\n".format(key, value))
|
||||
|
||||
results.update(result)
|
||||
|
||||
@@ -304,13 +303,13 @@ def main():
|
||||
with open(output_test_results_file, "w") as writer:
|
||||
for key, value in metrics.items():
|
||||
logger.info(" %s = %s", key, value)
|
||||
writer.write("%s = %s\n" % (key, value))
|
||||
writer.write("{} = {}\n".format(key, value))
|
||||
|
||||
# Save predictions
|
||||
output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
|
||||
if trainer.is_world_process_zero():
|
||||
with open(output_test_predictions_file, "w") as writer:
|
||||
with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
|
||||
with open(os.path.join(data_args.data_dir, "test.txt")) as f:
|
||||
token_classification_task.write_predictions_to_file(writer, f, preds_list)
|
||||
|
||||
return results
|
||||
|
||||
@@ -12,7 +12,7 @@ subword_len_counter = 0
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
|
||||
max_len -= tokenizer.num_special_tokens_to_add()
|
||||
|
||||
with open(dataset, "rt") as f_p:
|
||||
with open(dataset) as f_p:
|
||||
for line in f_p:
|
||||
line = line.rstrip()
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import logging
|
||||
import os
|
||||
from typing import List, TextIO, Union
|
||||
from typing import TextIO, Union
|
||||
|
||||
from conllu import parse_incr
|
||||
from utils_ner import InputExample, Split, TokenClassificationTask
|
||||
@@ -14,7 +14,7 @@ class NER(TokenClassificationTask):
|
||||
# in NER datasets, the last column is usually reserved for NER label
|
||||
self.label_idx = label_idx
|
||||
|
||||
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
|
||||
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> list[InputExample]:
|
||||
if isinstance(mode, Split):
|
||||
mode = mode.value
|
||||
file_path = os.path.join(data_dir, f"{mode}.txt")
|
||||
@@ -42,7 +42,7 @@ class NER(TokenClassificationTask):
|
||||
examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
|
||||
return examples
|
||||
|
||||
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
|
||||
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: list):
|
||||
example_id = 0
|
||||
for line in test_input_reader:
|
||||
if line.startswith("-DOCSTART-") or line == "" or line == "\n":
|
||||
@@ -55,9 +55,9 @@ class NER(TokenClassificationTask):
|
||||
else:
|
||||
logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
|
||||
|
||||
def get_labels(self, path: str) -> List[str]:
|
||||
def get_labels(self, path: str) -> list[str]:
|
||||
if path:
|
||||
with open(path, "r") as f:
|
||||
with open(path) as f:
|
||||
labels = f.read().splitlines()
|
||||
if "O" not in labels:
|
||||
labels = ["O"] + labels
|
||||
@@ -71,9 +71,9 @@ class Chunk(NER):
|
||||
# in CONLL2003 dataset chunk column is second-to-last
|
||||
super().__init__(label_idx=-2)
|
||||
|
||||
def get_labels(self, path: str) -> List[str]:
|
||||
def get_labels(self, path: str) -> list[str]:
|
||||
if path:
|
||||
with open(path, "r") as f:
|
||||
with open(path) as f:
|
||||
labels = f.read().splitlines()
|
||||
if "O" not in labels:
|
||||
labels = ["O"] + labels
|
||||
@@ -105,7 +105,7 @@ class Chunk(NER):
|
||||
|
||||
|
||||
class POS(TokenClassificationTask):
|
||||
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
|
||||
def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> list[InputExample]:
|
||||
if isinstance(mode, Split):
|
||||
mode = mode.value
|
||||
file_path = os.path.join(data_dir, f"{mode}.txt")
|
||||
@@ -125,7 +125,7 @@ class POS(TokenClassificationTask):
|
||||
guid_index += 1
|
||||
return examples
|
||||
|
||||
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
|
||||
def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: list):
|
||||
example_id = 0
|
||||
for sentence in parse_incr(test_input_reader):
|
||||
s_p = preds_list[example_id]
|
||||
@@ -136,9 +136,9 @@ class POS(TokenClassificationTask):
|
||||
writer.write(out)
|
||||
example_id += 1
|
||||
|
||||
def get_labels(self, path: str) -> List[str]:
|
||||
def get_labels(self, path: str) -> list[str]:
|
||||
if path:
|
||||
with open(path, "r") as f:
|
||||
with open(path) as f:
|
||||
return f.read().splitlines()
|
||||
else:
|
||||
return [
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
@@ -19,7 +18,7 @@ import logging
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import List, Optional, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
from filelock import FileLock
|
||||
|
||||
@@ -42,8 +41,8 @@ class InputExample:
|
||||
"""
|
||||
|
||||
guid: str
|
||||
words: List[str]
|
||||
labels: Optional[List[str]]
|
||||
words: list[str]
|
||||
labels: Optional[list[str]]
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -53,10 +52,10 @@ class InputFeatures:
|
||||
Property names are the same names as the corresponding inputs to a model.
|
||||
"""
|
||||
|
||||
input_ids: List[int]
|
||||
attention_mask: List[int]
|
||||
token_type_ids: Optional[List[int]] = None
|
||||
label_ids: Optional[List[int]] = None
|
||||
input_ids: list[int]
|
||||
attention_mask: list[int]
|
||||
token_type_ids: Optional[list[int]] = None
|
||||
label_ids: Optional[list[int]] = None
|
||||
|
||||
|
||||
class Split(Enum):
|
||||
@@ -67,17 +66,17 @@ class Split(Enum):
|
||||
|
||||
class TokenClassificationTask:
|
||||
@staticmethod
|
||||
def read_examples_from_file(data_dir, mode: Union[Split, str]) -> List[InputExample]:
|
||||
def read_examples_from_file(data_dir, mode: Union[Split, str]) -> list[InputExample]:
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def get_labels(path: str) -> List[str]:
|
||||
def get_labels(path: str) -> list[str]:
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def convert_examples_to_features(
|
||||
examples: List[InputExample],
|
||||
label_list: List[str],
|
||||
examples: list[InputExample],
|
||||
label_list: list[str],
|
||||
max_seq_length: int,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
cls_token_at_end=False,
|
||||
@@ -91,7 +90,7 @@ class TokenClassificationTask:
|
||||
pad_token_label_id=-100,
|
||||
sequence_a_segment_id=0,
|
||||
mask_padding_with_zero=True,
|
||||
) -> List[InputFeatures]:
|
||||
) -> list[InputFeatures]:
|
||||
"""Loads a data file into a list of `InputFeatures`
|
||||
`cls_token_at_end` defines the location of the CLS token:
|
||||
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
|
||||
@@ -214,7 +213,7 @@ if is_torch_available():
|
||||
soon.
|
||||
"""
|
||||
|
||||
features: List[InputFeatures]
|
||||
features: list[InputFeatures]
|
||||
pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index
|
||||
# Use cross entropy ignore_index as padding label id so that only
|
||||
# real label ids contribute to the loss later.
|
||||
@@ -224,7 +223,7 @@ if is_torch_available():
|
||||
token_classification_task: TokenClassificationTask,
|
||||
data_dir: str,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
labels: List[str],
|
||||
labels: list[str],
|
||||
model_type: str,
|
||||
max_seq_length: Optional[int] = None,
|
||||
overwrite_cache=False,
|
||||
@@ -233,7 +232,7 @@ if is_torch_available():
|
||||
# Load data features from cache or dataset file
|
||||
cached_features_file = os.path.join(
|
||||
data_dir,
|
||||
"cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
|
||||
f"cached_{mode.value}_{tokenizer.__class__.__name__}_{str(max_seq_length)}",
|
||||
)
|
||||
|
||||
# Make sure only the first process in distributed training processes the dataset,
|
||||
@@ -283,7 +282,7 @@ if is_tf_available():
|
||||
soon.
|
||||
"""
|
||||
|
||||
features: List[InputFeatures]
|
||||
features: list[InputFeatures]
|
||||
pad_token_label_id: int = -100
|
||||
# Use cross entropy ignore_index as padding label id so that only
|
||||
# real label ids contribute to the loss later.
|
||||
@@ -293,7 +292,7 @@ if is_tf_available():
|
||||
token_classification_task: TokenClassificationTask,
|
||||
data_dir: str,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
labels: List[str],
|
||||
labels: list[str],
|
||||
model_type: str,
|
||||
max_seq_length: Optional[int] = None,
|
||||
overwrite_cache=False,
|
||||
|
||||
Reference in New Issue
Block a user