From f875fb0e5fb29c39436cfe7f6fedf2e3f5f0ad66 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Wed, 20 Oct 2021 07:55:14 -0400 Subject: [PATCH] Fix label attribution in token classification examples (#14055) --- examples/pytorch/token-classification/run_ner.py | 13 ++++++++++++- .../token-classification/run_ner_no_trainer.py | 13 ++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index e2d2bcba9a..f944a819e3 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -303,6 +303,14 @@ def main(): label_to_id = {l: i for i, l in enumerate(label_list)} num_labels = len(label_list) + # Map that sends B-Xxx label to its I-Xxx counterpart + b_to_i_label = [] + for idx, label in enumerate(label_list): + if label.startswith("B-") and label.replace("B-", "I-") in label_list: + b_to_i_label.append(label_list.index(label.replace("B-", "I-"))) + else: + b_to_i_label.append(idx) + # Load pretrained model and tokenizer # # Distributed training: @@ -385,7 +393,10 @@ def main(): # For the other tokens in a word, we set the label to either the current label or -100, depending on # the label_all_tokens flag. else: - label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100) + if data_args.label_all_tokens: + label_ids.append(b_to_i_label[label_to_id[label[word_idx]]]) + else: + label_ids.append(-100) previous_word_idx = word_idx labels.append(label_ids) diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 756bdff55b..d78486d380 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -328,6 +328,14 @@ def main(): label_to_id = {l: i for i, l in enumerate(label_list)} num_labels = len(label_list) + # Map that sends B-Xxx label to its I-Xxx counterpart + b_to_i_label = [] + for idx, label in enumerate(label_list): + if label.startswith("B-") and label.replace("B-", "I-") in label_list: + b_to_i_label.append(label_list.index(label.replace("B-", "I-"))) + else: + b_to_i_label.append(idx) + # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently @@ -396,7 +404,10 @@ def main(): # For the other tokens in a word, we set the label to either the current label or -100, depending on # the label_all_tokens flag. else: - label_ids.append(label_to_id[label[word_idx]] if args.label_all_tokens else -100) + if args.label_all_tokens: + label_ids.append(b_to_i_label[label_to_id[label[word_idx]]]) + else: + label_ids.append(-100) previous_word_idx = word_idx labels.append(label_ids)