Tokenizers: Start cleaning examples a little (#3455)

* Start cleaning examples

* Fixup
This commit is contained in:
Julien Chaumond
2020-04-01 07:13:40 -04:00
committed by GitHub
parent b38d552a92
commit 50e15c825c
8 changed files with 14 additions and 14 deletions

View File

@@ -348,8 +348,8 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=bool(args.model_type in ["xlnet"]),
# pad on the left for xlnet
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
pad_token=tokenizer.pad_token_id,
pad_token_segment_id=tokenizer.pad_token_type_id,
pad_token_label_id=pad_token_label_id,
)
if args.local_rank in [-1, 0]:

View File

@@ -64,8 +64,8 @@ class NERTransformer(BaseTransformer):
sep_token=self.tokenizer.sep_token,
sep_token_extra=bool(args.model_type in ["roberta"]),
pad_on_left=bool(args.model_type in ["xlnet"]),
pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0],
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
pad_token=self.tokenizer.pad_token_id,
pad_token_segment_id=self.tokenizer.pad_token_type_id,
pad_token_label_id=self.pad_token_label_id,
)
logger.info("Saving features into cached file %s", cached_features_file)

View File

@@ -434,8 +434,8 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_s
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=bool(args["model_type"] in ["xlnet"]),
# pad on the left for xlnet
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
pad_token=tokenizer.pad_token_id,
pad_token_segment_id=tokenizer.pad_token_type_id,
pad_token_label_id=pad_token_label_id,
)
logging.info("Saving features into cached file %s", cached_features_file)