NER: fix construction of input examples for RoBERTa (#4943)

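* utils_ner: do not add an extra separator token for RoBERTa models (pass sep_token_extra=False); RoBERTa uses the extra separator only between pairs of sentences, which does not apply to the NER input examples built here

* run_pl_ner: do not add an extra separator token for RoBERTa models (pass sep_token_extra=False), for the same reason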
This commit is contained in:
Stefan Schweter
2020-06-15 14:30:40 +02:00
committed by GitHub
parent ebab096e86
commit d812e6d76e
2 changed files with 3 additions and 3 deletions

View File

@@ -65,7 +65,7 @@ class NERTransformer(BaseTransformer):
cls_token=self.tokenizer.cls_token, cls_token=self.tokenizer.cls_token,
cls_token_segment_id=2 if self.config.model_type in ["xlnet"] else 0, cls_token_segment_id=2 if self.config.model_type in ["xlnet"] else 0,
sep_token=self.tokenizer.sep_token, sep_token=self.tokenizer.sep_token,
sep_token_extra=bool(self.config.model_type in ["roberta"]), sep_token_extra=False,
pad_on_left=bool(self.config.model_type in ["xlnet"]), pad_on_left=bool(self.config.model_type in ["xlnet"]),
pad_token=self.tokenizer.pad_token_id, pad_token=self.tokenizer.pad_token_id,
pad_token_segment_id=self.tokenizer.pad_token_type_id, pad_token_segment_id=self.tokenizer.pad_token_type_id,

View File

@@ -119,7 +119,7 @@ if is_torch_available():
cls_token=tokenizer.cls_token, cls_token=tokenizer.cls_token,
cls_token_segment_id=2 if model_type in ["xlnet"] else 0, cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
sep_token=tokenizer.sep_token, sep_token=tokenizer.sep_token,
sep_token_extra=bool(model_type in ["roberta"]), sep_token_extra=False,
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=bool(tokenizer.padding_side == "left"), pad_on_left=bool(tokenizer.padding_side == "left"),
pad_token=tokenizer.pad_token_id, pad_token=tokenizer.pad_token_id,
@@ -172,7 +172,7 @@ if is_tf_available():
cls_token=tokenizer.cls_token, cls_token=tokenizer.cls_token,
cls_token_segment_id=2 if model_type in ["xlnet"] else 0, cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
sep_token=tokenizer.sep_token, sep_token=tokenizer.sep_token,
sep_token_extra=bool(model_type in ["roberta"]), sep_token_extra=False,
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=bool(tokenizer.padding_side == "left"), pad_on_left=bool(tokenizer.padding_side == "left"),
pad_token=tokenizer.pad_token_id, pad_token=tokenizer.pad_token_id,