From 7f9ccffc5b9da6b2eb5631fef81b85fc52269f6f Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 7 Dec 2020 14:26:36 -0500 Subject: [PATCH] Use word_ids to get labels in run_ner (#8962) * Use word_ids to get labels in run_ner * Add sanity check --- examples/token-classification/run_ner.py | 36 ++++++++++++++---------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/examples/token-classification/run_ner.py b/examples/token-classification/run_ner.py index 0021a71974..050e9ae5da 100644 --- a/examples/token-classification/run_ner.py +++ b/examples/token-classification/run_ner.py @@ -35,6 +35,7 @@ from transformers import ( AutoTokenizer, DataCollatorForTokenClassification, HfArgumentParser, + PreTrainedTokenizerFast, Trainer, TrainingArguments, set_seed, @@ -250,6 +251,14 @@ def main(): cache_dir=model_args.cache_dir, ) + # Tokenizer check: this script requires a fast tokenizer. + if not isinstance(tokenizer, PreTrainedTokenizerFast): + raise ValueError( + "This example script only works for models that have a fast tokenizer. Checkout the big table of models " + "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this " + "requirement" + ) + # Preprocessing the dataset # Padding strategy padding = "max_length" if data_args.pad_to_max_length else False @@ -262,28 +271,25 @@ def main(): truncation=True, # We use this argument because the texts in our dataset are lists of words (with a label for each word). is_split_into_words=True, - return_offsets_mapping=True, ) - offset_mappings = tokenized_inputs.pop("offset_mapping") labels = [] - for label, offset_mapping in zip(examples[label_column_name], offset_mappings): - label_index = 0 - current_label = -100 + for i, label in enumerate(examples[label_column_name]): + word_ids = tokenized_inputs.word_ids(batch_index=i) + previous_word_idx = None label_ids = [] - for offset in offset_mapping: - # We set the label for the first token of each word. Special characters will have an offset of (0, 0) - # so the test ignores them. - if offset[0] == 0 and offset[1] != 0: - current_label = label_to_id[label[label_index]] - label_index += 1 - label_ids.append(current_label) - # For special tokens, we set the label to -100 so it's automatically ignored in the loss function. - elif offset[0] == 0 and offset[1] == 0: + for word_idx in word_ids: + # Special tokens have a word id that is None. We set the label to -100 so they are automatically + # ignored in the loss function. + if word_idx is None: label_ids.append(-100) + # We set the label for the first token of each word. + elif word_idx != previous_word_idx: + label_ids.append(label_to_id[label[word_idx]]) # For the other tokens in a word, we set the label to either the current label or -100, depending on # the label_all_tokens flag. else: - label_ids.append(current_label if data_args.label_all_tokens else -100) + label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100) + previous_word_idx = word_idx labels.append(label_ids) tokenized_inputs["labels"] = labels