From ad4a393e2e59951a0edbec0b9b3be852dd086cc7 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 25 Sep 2019 08:30:07 -0400 Subject: [PATCH] Changed processor documentation architecture. Added documentation for GLUE --- docs/source/main_classes/processors.rst | 62 +++++++++++++------------ transformers/data/processors/glue.py | 23 ++++++++- 2 files changed, 55 insertions(+), 30 deletions(-) diff --git a/docs/source/main_classes/processors.rst b/docs/source/main_classes/processors.rst index 12e5339ddb..d65f48af83 100644 --- a/docs/source/main_classes/processors.rst +++ b/docs/source/main_classes/processors.rst @@ -4,42 +4,46 @@ Processors This library includes processors for several traditional tasks. These processors can be used to process a dataset into examples that can be fed to a model. -``GLUE`` +Processors ~~~~~~~~~~~~~~~~~~~~~ -`General Language Understanding Evaluation (GLUE)`__ is a benchmark that evaluates +All processors follow the same architecture which is that of the +:class:`~pytorch_transformers.data.processors.utils.DataProcessor`. The processor returns a list +of :class:`~pytorch_transformers.data.processors.utils.InputExample`. + +.. autoclass:: pytorch_transformers.data.processors.utils.DataProcessor + :members: + + +.. autoclass:: pytorch_transformers.data.processors.utils.InputExample + :members: + + +GLUE +~~~~~~~~~~~~~~~~~~~~~ + +`General Language Understanding Evaluation (GLUE) <https://gluebenchmark.com/>`__ is a benchmark that evaluates the performance of models across a diverse set of existing NLU tasks. It was released together with the paper -`GLUE: A multi-task benchmark and analysis platform for natural language understanding`__ +`GLUE: A multi-task benchmark and analysis platform for natural language understanding <https://openreview.net/pdf?id=rJ4km2R5t7>`__ This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched), CoLA, SST2, STSB, QQP, QNLI, RTE and WNLI. -.. 
autoclass:: pytorch_transformers.data.processors.glue.MrpcProcessor - :members: +Those processors are: + - :class:`~pytorch_transformers.data.processors.glue.MrpcProcessor` + - :class:`~pytorch_transformers.data.processors.glue.MnliProcessor` + - :class:`~pytorch_transformers.data.processors.glue.MnliMismatchedProcessor` + - :class:`~pytorch_transformers.data.processors.glue.Sst2Processor` + - :class:`~pytorch_transformers.data.processors.glue.StsbProcessor` + - :class:`~pytorch_transformers.data.processors.glue.QqpProcessor` + - :class:`~pytorch_transformers.data.processors.glue.QnliProcessor` + - :class:`~pytorch_transformers.data.processors.glue.RteProcessor` + - :class:`~pytorch_transformers.data.processors.glue.WnliProcessor` -.. autoclass:: pytorch_transformers.data.processors.glue.MnliProcessor - :members: +Additionally, the following method can be used to load values from a data file and convert them to a list of +:class:`~pytorch_transformers.data.processors.utils.InputFeatures`. -.. autoclass:: pytorch_transformers.data.processors.glue.MnliMismatchedProcessor - :members: +.. automethod:: pytorch_transformers.data.processors.glue.glue_convert_examples_to_features -.. autoclass:: pytorch_transformers.data.processors.glue.ColaProcessor - :members: - -.. autoclass:: pytorch_transformers.data.processors.glue.Sst2Processor - :members: - -.. autoclass:: pytorch_transformers.data.processors.glue.StsbProcessor - :members: - -.. autoclass:: pytorch_transformers.data.processors.glue.QqpProcessor - :members: - -.. autoclass:: pytorch_transformers.data.processors.glue.QnliProcessor - :members: - -.. autoclass:: pytorch_transformers.data.processors.glue.RteProcessor - :members: - -.. 
autoclass:: pytorch_transformers.data.processors.glue.WnliProcessor - :members: +Example usage +^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/transformers/data/processors/glue.py b/transformers/data/processors/glue.py index 3010ce9840..2322f58604 100644 --- a/transformers/data/processors/glue.py +++ b/transformers/data/processors/glue.py @@ -26,6 +26,7 @@ if is_tf_available(): logger = logging.getLogger(__name__) + def glue_convert_examples_to_features(examples, tokenizer, max_length=512, task=None, @@ -36,7 +37,27 @@ def glue_convert_examples_to_features(examples, tokenizer, pad_token_segment_id=0, mask_padding_with_zero=True): """ - Loads a data file into a list of `InputBatch`s + Loads a data file into a list of ``InputFeatures`` + + Args: + examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples. + tokenizer: Instance of a tokenizer that will tokenize the examples + max_length: Maximum example length + task: GLUE task + label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method + output_mode: String indicating the output mode. Either ``regression`` or ``classification`` + pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) + pad_token: Padding token + pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4) + mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values + and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for + actual values) + + Returns: + If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` + containing the task-specific features. If the input is a list of ``InputExamples``, will return + a list of task-specific ``InputFeatures`` which can be fed to the model. 
+ """ is_tf_dataset = False if is_tf_available() and isinstance(examples, tf.data.Dataset):