Tokenizers: Start cleaning examples a little (#3455)
* Start cleaning examples
* Fixup
This commit is contained in:
@@ -68,7 +68,7 @@ class GLUETransformer(BaseTransformer):
|
|||||||
output_mode=args.glue_output_mode,
|
output_mode=args.glue_output_mode,
|
||||||
pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
|
pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
|
||||||
pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0],
|
pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0],
|
||||||
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
|
pad_token_segment_id=self.tokenizer.pad_token_type_id,
|
||||||
)
|
)
|
||||||
logger.info("Saving features into cached file %s", cached_features_file)
|
logger.info("Saving features into cached file %s", cached_features_file)
|
||||||
torch.save(features, cached_features_file)
|
torch.save(features, cached_features_file)
|
||||||
|
|||||||
@@ -342,8 +342,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
|||||||
max_length=args.max_seq_length,
|
max_length=args.max_seq_length,
|
||||||
output_mode=output_mode,
|
output_mode=output_mode,
|
||||||
pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
|
pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
|
||||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
pad_token=tokenizer.pad_token_id,
|
||||||
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
|
pad_token_segment_id=tokenizer.pad_token_type_id,
|
||||||
)
|
)
|
||||||
if args.local_rank in [-1, 0]:
|
if args.local_rank in [-1, 0]:
|
||||||
logger.info("Saving features into cached file %s", cached_features_file)
|
logger.info("Saving features into cached file %s", cached_features_file)
|
||||||
|
|||||||
@@ -348,8 +348,8 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
|
|||||||
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
||||||
pad_on_left=bool(args.model_type in ["xlnet"]),
|
pad_on_left=bool(args.model_type in ["xlnet"]),
|
||||||
# pad on the left for xlnet
|
# pad on the left for xlnet
|
||||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
pad_token=tokenizer.pad_token_id,
|
||||||
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
|
pad_token_segment_id=tokenizer.pad_token_type_id,
|
||||||
pad_token_label_id=pad_token_label_id,
|
pad_token_label_id=pad_token_label_id,
|
||||||
)
|
)
|
||||||
if args.local_rank in [-1, 0]:
|
if args.local_rank in [-1, 0]:
|
||||||
|
|||||||
@@ -64,8 +64,8 @@ class NERTransformer(BaseTransformer):
|
|||||||
sep_token=self.tokenizer.sep_token,
|
sep_token=self.tokenizer.sep_token,
|
||||||
sep_token_extra=bool(args.model_type in ["roberta"]),
|
sep_token_extra=bool(args.model_type in ["roberta"]),
|
||||||
pad_on_left=bool(args.model_type in ["xlnet"]),
|
pad_on_left=bool(args.model_type in ["xlnet"]),
|
||||||
pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0],
|
pad_token=self.tokenizer.pad_token_id,
|
||||||
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
|
pad_token_segment_id=self.tokenizer.pad_token_type_id,
|
||||||
pad_token_label_id=self.pad_token_label_id,
|
pad_token_label_id=self.pad_token_label_id,
|
||||||
)
|
)
|
||||||
logger.info("Saving features into cached file %s", cached_features_file)
|
logger.info("Saving features into cached file %s", cached_features_file)
|
||||||
|
|||||||
@@ -434,8 +434,8 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_s
|
|||||||
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
||||||
pad_on_left=bool(args["model_type"] in ["xlnet"]),
|
pad_on_left=bool(args["model_type"] in ["xlnet"]),
|
||||||
# pad on the left for xlnet
|
# pad on the left for xlnet
|
||||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
pad_token=tokenizer.pad_token_id,
|
||||||
pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
|
pad_token_segment_id=tokenizer.pad_token_type_id,
|
||||||
pad_token_label_id=pad_token_label_id,
|
pad_token_label_id=pad_token_label_id,
|
||||||
)
|
)
|
||||||
logging.info("Saving features into cached file %s", cached_features_file)
|
logging.info("Saving features into cached file %s", cached_features_file)
|
||||||
|
|||||||
@@ -360,8 +360,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
|||||||
max_length=args.max_seq_length,
|
max_length=args.max_seq_length,
|
||||||
output_mode=output_mode,
|
output_mode=output_mode,
|
||||||
pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
|
pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
|
||||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
pad_token=tokenizer.pad_token_id,
|
||||||
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
|
pad_token_segment_id=tokenizer.pad_token_type_id,
|
||||||
)
|
)
|
||||||
if args.local_rank in [-1, 0]:
|
if args.local_rank in [-1, 0]:
|
||||||
logger.info("Saving features into cached file %s", cached_features_file)
|
logger.info("Saving features into cached file %s", cached_features_file)
|
||||||
|
|||||||
@@ -361,7 +361,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
|
|||||||
args.max_seq_length,
|
args.max_seq_length,
|
||||||
tokenizer,
|
tokenizer,
|
||||||
pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
|
pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
|
||||||
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
|
pad_token_segment_id=tokenizer.pad_token_type_id,
|
||||||
)
|
)
|
||||||
if args.local_rank in [-1, 0]:
|
if args.local_rank in [-1, 0]:
|
||||||
logger.info("Saving features into cached file %s", cached_features_file)
|
logger.info("Saving features into cached file %s", cached_features_file)
|
||||||
|
|||||||
@@ -350,8 +350,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
|||||||
max_length=args.max_seq_length,
|
max_length=args.max_seq_length,
|
||||||
output_mode=output_mode,
|
output_mode=output_mode,
|
||||||
pad_on_left=False,
|
pad_on_left=False,
|
||||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
pad_token=tokenizer.pad_token_id,
|
||||||
pad_token_segment_id=0,
|
pad_token_segment_id=tokenizer.pad_token_type_id,
|
||||||
)
|
)
|
||||||
if args.local_rank in [-1, 0]:
|
if args.local_rank in [-1, 0]:
|
||||||
logger.info("Saving features into cached file %s", cached_features_file)
|
logger.info("Saving features into cached file %s", cached_features_file)
|
||||||
|
|||||||
Reference in New Issue
Block a user