From 50e15c825cc6cb488561aa911dc81deb765d3ad7 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 1 Apr 2020 07:13:40 -0400 Subject: [PATCH] Tokenizers: Start cleaning examples a little (#3455) * Start cleaning examples * Fixup --- examples/glue/run_pl_glue.py | 2 +- examples/hans/test_hans.py | 4 ++-- examples/ner/run_ner.py | 4 ++-- examples/ner/run_pl_ner.py | 4 ++-- examples/ner/run_tf_ner.py | 4 ++-- examples/run_glue.py | 4 ++-- examples/run_multiple_choice.py | 2 +- examples/run_xnli.py | 4 ++-- 8 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/glue/run_pl_glue.py b/examples/glue/run_pl_glue.py index 44031cce5f..80cc1f8124 100644 --- a/examples/glue/run_pl_glue.py +++ b/examples/glue/run_pl_glue.py @@ -68,7 +68,7 @@ class GLUETransformer(BaseTransformer): output_mode=args.glue_output_mode, pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, + pad_token_segment_id=self.tokenizer.pad_token_type_id, ) logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) diff --git a/examples/hans/test_hans.py b/examples/hans/test_hans.py index a5d4e76149..56416b28bd 100644 --- a/examples/hans/test_hans.py +++ b/examples/hans/test_hans.py @@ -342,8 +342,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): max_length=args.max_seq_length, output_mode=output_mode, pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) diff --git a/examples/ner/run_ner.py b/examples/ner/run_ner.py index 818ff91136..ba08e51da8 100644 --- a/examples/ner/run_ner.py +++ b/examples/ner/run_ner.py @@ -348,8 +348,8 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode): # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, pad_token_label_id=pad_token_label_id, ) if args.local_rank in [-1, 0]: diff --git a/examples/ner/run_pl_ner.py b/examples/ner/run_pl_ner.py index f5cbf5bd3f..6b84697891 100644 --- a/examples/ner/run_pl_ner.py +++ b/examples/ner/run_pl_ner.py @@ -64,8 +64,8 @@ class NERTransformer(BaseTransformer): sep_token=self.tokenizer.sep_token, sep_token_extra=bool(args.model_type in ["roberta"]), pad_on_left=bool(args.model_type in ["xlnet"]), - pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, + pad_token=self.tokenizer.pad_token_id, + pad_token_segment_id=self.tokenizer.pad_token_type_id, pad_token_label_id=self.pad_token_label_id, ) logger.info("Saving features into cached file %s", cached_features_file) diff --git a/examples/ner/run_tf_ner.py b/examples/ner/run_tf_ner.py index 88b235d99e..cc76989cd7 100644 --- a/examples/ner/run_tf_ner.py +++ b/examples/ner/run_tf_ner.py @@ -434,8 +434,8 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_s # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left=bool(args["model_type"] in ["xlnet"]), # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0, + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, pad_token_label_id=pad_token_label_id, ) logging.info("Saving features into cached file %s", cached_features_file) diff --git a/examples/run_glue.py b/examples/run_glue.py index 72fdc2b497..818223bf80 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -360,8 +360,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): max_length=args.max_seq_length, output_mode=output_mode, pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index c4f90bbad7..dbeae2b689 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -361,7 +361,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False): args.max_seq_length, tokenizer, pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet - pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, + pad_token_segment_id=tokenizer.pad_token_type_id, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) diff --git a/examples/run_xnli.py b/examples/run_xnli.py index 9dcae8568f..e51a8408b8 100644 --- a/examples/run_xnli.py +++ b/examples/run_xnli.py @@ -350,8 +350,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): max_length=args.max_seq_length, output_mode=output_mode, pad_on_left=False, - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=0, + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file)