From 36f592cc828c77cb651dc1c17a2c5d6ad41451aa Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 25 Sep 2019 08:39:33 -0400 Subject: [PATCH] Updated doc for `InputExample` and `InputFeatures` --- docs/source/main_classes/processors.rst | 8 +++++- transformers/data/processors/utils.py | 36 ++++++++++++++++--------- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/docs/source/main_classes/processors.rst b/docs/source/main_classes/processors.rst index d65f48af83..ab4b91143b 100644 --- a/docs/source/main_classes/processors.rst +++ b/docs/source/main_classes/processors.rst @@ -9,7 +9,9 @@ Processors All processors follow the same architecture which is that of the :class:`~pytorch_transformers.data.processors.utils.DataProcessor`. The processor returns a list -of :class:`~pytorch_transformers.data.processors.utils.InputExample`. +of :class:`~pytorch_transformers.data.processors.utils.InputExample`. These +:class:`~pytorch_transformers.data.processors.utils.InputExample` can be converted to +:class:`~pytorch_transformers.data.processors.utils.InputFeatures` in order to be fed to the model. .. autoclass:: pytorch_transformers.data.processors.utils.DataProcessor :members: @@ -19,6 +21,10 @@ of :class:`~pytorch_transformers.data.processors.utils.InputExample`. :members: +.. autoclass:: pytorch_transformers.data.processors.utils.InputFeatures + :members: + + GLUE ~~~~~~~~~~~~~~~~~~~~~ diff --git a/transformers/data/processors/utils.py b/transformers/data/processors/utils.py index a616372054..d16ea786a0 100644 --- a/transformers/data/processors/utils.py +++ b/transformers/data/processors/utils.py @@ -20,19 +20,19 @@ import copy import json class InputExample(object): - """A single training/test example for simple sequence classification.""" - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. + """ + A single training/test example for simple sequence classification. - Args: - guid: Unique id for the example. 
- text_a: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Must only be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + def __init__(self, guid, text_a, text_b=None, label=None): self.guid = guid self.text_a = text_a self.text_b = text_b @@ -52,7 +52,17 @@ class InputExample(object): class InputFeatures(object): - """A single set of features of data.""" + """ + A single set of features of data. + + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens. + token_type_ids: Segment token indices to indicate first and second portions of the inputs. + label: Label corresponding to the input. + """ def __init__(self, input_ids, attention_mask, token_type_ids, label): self.input_ids = input_ids