NER: fix construction of input examples for RoBERTa (#4943)
* utils_ner: do not add extra sep token for RoBERTa model
* run_pl_ner: do not add extra sep token for RoBERTa model
This commit is contained in:
@@ -65,7 +65,7 @@ class NERTransformer(BaseTransformer):
|
|||||||
cls_token=self.tokenizer.cls_token,
|
cls_token=self.tokenizer.cls_token,
|
||||||
cls_token_segment_id=2 if self.config.model_type in ["xlnet"] else 0,
|
cls_token_segment_id=2 if self.config.model_type in ["xlnet"] else 0,
|
||||||
sep_token=self.tokenizer.sep_token,
|
sep_token=self.tokenizer.sep_token,
|
||||||
sep_token_extra=bool(self.config.model_type in ["roberta"]),
|
sep_token_extra=False,
|
||||||
pad_on_left=bool(self.config.model_type in ["xlnet"]),
|
pad_on_left=bool(self.config.model_type in ["xlnet"]),
|
||||||
pad_token=self.tokenizer.pad_token_id,
|
pad_token=self.tokenizer.pad_token_id,
|
||||||
pad_token_segment_id=self.tokenizer.pad_token_type_id,
|
pad_token_segment_id=self.tokenizer.pad_token_type_id,
|
||||||
|
|||||||
@@ -119,7 +119,7 @@ if is_torch_available():
|
|||||||
cls_token=tokenizer.cls_token,
|
cls_token=tokenizer.cls_token,
|
||||||
cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
|
cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
|
||||||
sep_token=tokenizer.sep_token,
|
sep_token=tokenizer.sep_token,
|
||||||
sep_token_extra=bool(model_type in ["roberta"]),
|
sep_token_extra=False,
|
||||||
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
||||||
pad_on_left=bool(tokenizer.padding_side == "left"),
|
pad_on_left=bool(tokenizer.padding_side == "left"),
|
||||||
pad_token=tokenizer.pad_token_id,
|
pad_token=tokenizer.pad_token_id,
|
||||||
@@ -172,7 +172,7 @@ if is_tf_available():
|
|||||||
cls_token=tokenizer.cls_token,
|
cls_token=tokenizer.cls_token,
|
||||||
cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
|
cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
|
||||||
sep_token=tokenizer.sep_token,
|
sep_token=tokenizer.sep_token,
|
||||||
sep_token_extra=bool(model_type in ["roberta"]),
|
sep_token_extra=False,
|
||||||
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
||||||
pad_on_left=bool(tokenizer.padding_side == "left"),
|
pad_on_left=bool(tokenizer.padding_side == "left"),
|
||||||
pad_token=tokenizer.pad_token_id,
|
pad_token=tokenizer.pad_token_id,
|
||||||
|
|||||||
Reference in New Issue
Block a user