From d812e6d76e39bd3a5e037524ccb1c3bc9b6c2420 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 15 Jun 2020 14:30:40 +0200 Subject: [PATCH] NER: fix construction of input examples for RoBERTa (#4943) * utils_ner: do not add extra sep token for RoBERTa model * run_pl_ner: do not add extra sep token for RoBERTa model --- examples/token-classification/run_pl_ner.py | 2 +- examples/token-classification/utils_ner.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/token-classification/run_pl_ner.py b/examples/token-classification/run_pl_ner.py index f015dad947..eca8ffd5f5 100644 --- a/examples/token-classification/run_pl_ner.py +++ b/examples/token-classification/run_pl_ner.py @@ -65,7 +65,7 @@ class NERTransformer(BaseTransformer): cls_token=self.tokenizer.cls_token, cls_token_segment_id=2 if self.config.model_type in ["xlnet"] else 0, sep_token=self.tokenizer.sep_token, - sep_token_extra=bool(self.config.model_type in ["roberta"]), + sep_token_extra=False, pad_on_left=bool(self.config.model_type in ["xlnet"]), pad_token=self.tokenizer.pad_token_id, pad_token_segment_id=self.tokenizer.pad_token_type_id, diff --git a/examples/token-classification/utils_ner.py b/examples/token-classification/utils_ner.py index ef58904332..42e07f642a 100644 --- a/examples/token-classification/utils_ner.py +++ b/examples/token-classification/utils_ner.py @@ -119,7 +119,7 @@ if is_torch_available(): cls_token=tokenizer.cls_token, cls_token_segment_id=2 if model_type in ["xlnet"] else 0, sep_token=tokenizer.sep_token, - sep_token_extra=bool(model_type in ["roberta"]), + sep_token_extra=False, # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left=bool(tokenizer.padding_side == "left"), pad_token=tokenizer.pad_token_id, @@ -172,7 +172,7 @@ if is_tf_available(): cls_token=tokenizer.cls_token, cls_token_segment_id=2 if model_type in ["xlnet"] else 0, sep_token=tokenizer.sep_token, - sep_token_extra=bool(model_type in ["roberta"]), + sep_token_extra=False, # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left=bool(tokenizer.padding_side == "left"), pad_token=tokenizer.pad_token_id,