diff --git a/examples/glue/run_pl_glue.py b/examples/glue/run_pl_glue.py index 80cc1f8124..0ed00821b3 100644 --- a/examples/glue/run_pl_glue.py +++ b/examples/glue/run_pl_glue.py @@ -63,12 +63,8 @@ class GLUETransformer(BaseTransformer): examples, self.tokenizer, max_length=args.max_seq_length, - task=args.task, label_list=self.labels, output_mode=args.glue_output_mode, - pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet - pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0], - pad_token_segment_id=self.tokenizer.pad_token_type_id, ) logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) diff --git a/examples/run_glue.py b/examples/run_glue.py index 818223bf80..130bb19a82 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -354,14 +354,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) ) features = convert_examples_to_features( - examples, - tokenizer, - label_list=label_list, - max_length=args.max_seq_length, - output_mode=output_mode, - pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet - pad_token=tokenizer.pad_token_id, - pad_token_segment_id=tokenizer.pad_token_type_id, + examples, tokenizer, max_length=args.max_seq_length, label_list=label_list, output_mode=output_mode, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py index dae11d22b3..7b3e924bbd 100644 --- a/examples/run_tf_glue.py +++ b/examples/run_tf_glue.py @@ -48,10 +48,10 @@ train_examples = info.splits["train"].num_examples valid_examples = info.splits["validation"].num_examples # Prepare dataset for GLUE as a tf.data.Dataset instance -train_dataset = glue_convert_examples_to_features(data["train"], tokenizer, 128, TASK) +train_dataset = glue_convert_examples_to_features(data["train"], tokenizer, max_length=128, task=TASK) # MNLI expects either validation_matched or validation_mismatched -valid_dataset = glue_convert_examples_to_features(data["validation"], tokenizer, 128, TASK) +valid_dataset = glue_convert_examples_to_features(data["validation"], tokenizer, max_length=128, task=TASK) train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1) valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE) diff --git a/examples/run_xnli.py b/examples/run_xnli.py index e51a8408b8..9b64d17dfd 100644 --- a/examples/run_xnli.py +++ b/examples/run_xnli.py @@ -344,14 +344,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) ) features = convert_examples_to_features( - examples, - tokenizer, - label_list=label_list, - max_length=args.max_seq_length, - output_mode=output_mode, - pad_on_left=False, - pad_token=tokenizer.pad_token_id, - pad_token_segment_id=tokenizer.pad_token_type_id, + examples, tokenizer, max_length=args.max_seq_length, label_list=label_list, output_mode=output_mode, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) diff --git a/src/transformers/data/processors/glue.py b/src/transformers/data/processors/glue.py index 1310e5897e..22c18ad577 100644 --- a/src/transformers/data/processors/glue.py +++ b/src/transformers/data/processors/glue.py @@ -17,8 +17,10 @@ import logging import os +from typing import List, Optional, Union from ...file_utils import is_tf_available +from ...tokenization_utils import PreTrainedTokenizer from .utils import DataProcessor, InputExample, InputFeatures @@ -29,16 +31,12 @@ logger = logging.getLogger(__name__) def glue_convert_examples_to_features( - examples, - tokenizer, - max_length=512, + examples: Union[List[InputExample], "tf.data.Dataset"], + tokenizer: PreTrainedTokenizer, + max_length: Optional[int] = None, task=None, label_list=None, output_mode=None, - pad_on_left=False, - pad_token=0, - pad_token_segment_id=0, - mask_padding_with_zero=True, ): """ Loads a data file into a list of ``InputFeatures`` @@ -46,16 +44,10 @@ def glue_convert_examples_to_features( Args: examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples. tokenizer: Instance of a tokenizer that will tokenize the examples - max_length: Maximum example length + max_length: Maximum example length. Defaults to the tokenizer's max_len task: GLUE task label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method output_mode: String indicating the output mode. Either ``regression`` or ``classification`` - pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) - pad_token: Padding token - pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4) - mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values - and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for - actual values) Returns: If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` @@ -63,83 +55,28 @@ def glue_convert_examples_to_features( a list of task-specific ``InputFeatures`` which can be fed to the model. """ - is_tf_dataset = False if is_tf_available() and isinstance(examples, tf.data.Dataset): - is_tf_dataset = True + if task is None: + raise ValueError("When calling glue_convert_examples_to_features from TF, the task parameter is required.") + return _tf_glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task) + return _glue_convert_examples_to_features( + examples, tokenizer, max_length=max_length, task=task, label_list=label_list, output_mode=output_mode + ) - if task is not None: + +if is_tf_available(): + + def _tf_glue_convert_examples_to_features( + examples: tf.data.Dataset, tokenizer: PreTrainedTokenizer, task=str, max_length: Optional[int] = None, + ) -> tf.data.Dataset: + """ + Returns: + A ``tf.data.Dataset`` containing the task-specific features. + + """ processor = glue_processors[task]() - if label_list is None: - label_list = processor.get_labels() - logger.info("Using label list %s for task %s" % (label_list, task)) - if output_mode is None: - output_mode = glue_output_modes[task] - logger.info("Using output mode %s for task %s" % (output_mode, task)) - - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for (ex_index, example) in enumerate(examples): - len_examples = 0 - if is_tf_dataset: - example = processor.get_example_from_tensor_dict(example) - example = processor.tfds_map(example) - len_examples = tf.data.experimental.cardinality(examples) - else: - len_examples = len(examples) - if ex_index % 10000 == 0: - logger.info("Writing example %d/%d" % (ex_index, len_examples)) - - inputs = tokenizer.encode_plus( - example.text_a, example.text_b, add_special_tokens=True, max_length=max_length, return_token_type_ids=True, - ) - input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. - padding_length = max_length - len(input_ids) - if pad_on_left: - input_ids = ([pad_token] * padding_length) + input_ids - attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask - token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids - else: - input_ids = input_ids + ([pad_token] * padding_length) - attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) - - assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) - assert len(attention_mask) == max_length, "Error with input length {} vs {}".format( - len(attention_mask), max_length - ) - assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format( - len(token_type_ids), max_length - ) - - if output_mode == "classification": - label = label_map[example.label] - elif output_mode == "regression": - label = float(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - logger.info("*** Example ***") - logger.info("guid: %s" % (example.guid)) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) - logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) - logger.info("label: %s (id = %d)" % (example.label, label)) - - features.append( - InputFeatures( - input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label - ) - ) - - if is_tf_available() and is_tf_dataset: + examples = [processor.tfds_map(processor.get_example_from_tensor_dict(example)) for example in examples] + features = glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task) def gen(): for ex in features: @@ -165,6 +102,54 @@ def glue_convert_examples_to_features( ), ) + +def _glue_convert_examples_to_features( + examples: List[InputExample], + tokenizer: PreTrainedTokenizer, + max_length: Optional[int] = None, + task=None, + label_list=None, + output_mode=None, +): + if max_length is None: + max_length = tokenizer.max_len + + if task is not None: + processor = glue_processors[task]() + if label_list is None: + label_list = processor.get_labels() + logger.info("Using label list %s for task %s" % (label_list, task)) + if output_mode is None: + output_mode = glue_output_modes[task] + logger.info("Using output mode %s for task %s" % (output_mode, task)) + + label_map = {label: i for i, label in enumerate(label_list)} + + def label_from_example(example: InputExample) -> Union[int, float]: + if output_mode == "classification": + return label_map[example.label] + elif output_mode == "regression": + return float(example.label) + raise KeyError(output_mode) + + labels = [label_from_example(example) for example in examples] + + batch_encoding = tokenizer.batch_encode_plus( + [(example.text_a, example.text_b) for example in examples], max_length=max_length, pad_to_max_length=True, + ) + + features = [] + for i in range(len(examples)): + inputs = {k: batch_encoding[k][i] for k in batch_encoding} + + feature = InputFeatures(**inputs, label=labels[i]) + features.append(feature) + + for i, example in enumerate(examples[:5]): + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("features: %s" % features[i]) + return features diff --git a/src/transformers/data/processors/utils.py b/src/transformers/data/processors/utils.py index ac929decbb..73998cc1c7 100644 --- a/src/transformers/data/processors/utils.py +++ b/src/transformers/data/processors/utils.py @@ -82,7 +82,7 @@ class InputFeatures(object): def to_json_string(self): """Serializes this instance to a JSON string.""" - return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + return json.dumps(self.to_dict(), sort_keys=True) + "\n" class DataProcessor(object):