From 7a03519975e4f0b6698bf1221c2263ed0f8d795c Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 17:24:35 -0500 Subject: [PATCH] Documentation --- docs/source/main_classes/processors.rst | 79 +++++++++++++++++- transformers/data/processors/squad.py | 104 ++++++++++++++++++++---- 2 files changed, 164 insertions(+), 19 deletions(-) diff --git a/docs/source/main_classes/processors.rst b/docs/source/main_classes/processors.rst index a85c126956..ce0eeb553a 100644 --- a/docs/source/main_classes/processors.rst +++ b/docs/source/main_classes/processors.rst @@ -55,4 +55,81 @@ Example usage ^^^^^^^^^^^^^^^^^^^^^^^^^ An example using these processors is given in the -`run_glue.py `__ script. \ No newline at end of file +`run_glue.py `__ script. + + + +SQuAD +~~~~~~~~~~~~~~~~~~~~~ + +`The Stanford Question Answering Dataset (SQuAD) `__ is a benchmark that evaluates +the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version (v1.1) was released together with the paper +`SQuAD: 100,000+ Questions for Machine Comprehension of Text `__. The second version (v2.0) was released alongside +the paper `Know What You Don't Know: Unanswerable Questions for SQuAD `__. + +This library hosts a processor for each of the two versions: + +Processors +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Those processors are: + - :class:`~transformers.data.processors.squad.SquadV1Processor` + - :class:`~transformers.data.processors.squad.SquadV2Processor` + +They both inherit from the abstract class :class:`~transformers.data.processors.squad.SquadProcessor` + +.. autoclass:: transformers.data.processors.squad.SquadProcessor + :members: + +Additionally, the following method can be used to convert SQuAD examples into :class:`~transformers.data.processors.squad.SquadFeatures` +that can be used as model inputs. + +.. 
automethod:: transformers.data.processors.squad.squad_convert_examples_to_features + +These processors as well as the aforementioned method can be used with files containing the data as well as with the `tensorflow_datasets` package. +Examples are given below. + +Example usage +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Here is an example using the processors as well as the conversion method using data files: + +Example:: + + # Loading a V2 processor + processor = SquadV2Processor() + examples = processor.get_dev_examples(squad_v2_data_dir) + + # Loading a V1 processor + processor = SquadV1Processor() + examples = processor.get_dev_examples(squad_v1_data_dir) + + features = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=max_seq_length, + doc_stride=args.doc_stride, + max_query_length=max_query_length, + is_training=not evaluate, + ) + +Using `tensorflow_datasets` is as easy as using a data file: + +Example:: + + # tensorflow_datasets only handles Squad V1. + tfds_examples = tfds.load("squad") + examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate) + + features = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=max_seq_length, + doc_stride=args.doc_stride, + max_query_length=max_query_length, + is_training=not evaluate, + ) + + +Another example using these processors is given in the +`run_squad.py `__ script. 
\ No newline at end of file diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 09a79db471..b17e626c98 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -74,7 +74,35 @@ def _is_whitespace(c): def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training): - """Loads a data file into a list of `InputBatch`s.""" + """ + Converts a list of examples into a list of features that can be directly given as input to a model. + It is model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs. + + Args: + examples: list of :class:`~transformers.data.processors.squad.SquadExample` + tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer` + max_seq_length: The maximum sequence length of the inputs. + doc_stride: The stride used when the context is too large and is split across several features. + max_query_length: The maximum length of the query. + is_training: whether to create features for model evaluation or model training. + + Returns: + list of :class:`~transformers.data.processors.squad.SquadFeatures` + + Example:: + + processor = SquadV2Processor() + examples = processor.get_dev_examples(data_dir) + + features = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + ) + """ # Defining helper methods unique_id = 1000000000 @@ -240,12 +268,14 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, class SquadProcessor(DataProcessor): - """Processor for the SQuAD data set.""" + """ + Processor for the SQuAD data set. + Overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively. 
+ """ train_file = None dev_file = None - def get_example_from_tensor_dict(self, tensor_dict, evaluate=False): - + def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False): if not evaluate: answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8') answer_start = tensor_dict['answers']['answer_start'][0].numpy() @@ -296,35 +326,44 @@ class SquadProcessor(DataProcessor): examples = [] for tensor_dict in tqdm(dataset): - examples.append(self.get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) + examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) return examples - def get_train_examples(self, data_dir): - """See base class.""" + def get_train_examples(self, data_dir, filename=None): + """ + Returns the training examples from the data directory. + + Args: + data_dir: Directory containing the data files used for training and evaluating. + filename: None by default, specify this if the training file has a different name than the original one + which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. + + """ if self.train_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") - with open(os.path.join(data_dir, self.train_file), "r", encoding='utf-8') as reader: + with open(os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "train") - def get_dev_examples(self, data_dir): - """See base class.""" + def get_dev_examples(self, data_dir, filename=None): + """ + Returns the evaluation examples from the data directory. + + Args: + data_dir: Directory containing the data files used for training and evaluating. 
+ filename: None by default, specify this if the evaluation file has a different name than the original one + which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively. + """ if self.dev_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") - with open(os.path.join(data_dir, self.dev_file), "r", encoding='utf-8') as reader: + with open(os.path.join(data_dir, self.dev_file if filename is not None else filename), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "dev") - def get_labels(self): - """See base class.""" - return ["0", "1"] - def _create_examples(self, input_data, set_type): - """Creates examples for the training and dev sets.""" - is_training = set_type == "train" examples = [] for entry in tqdm(input_data): @@ -378,6 +417,16 @@ class SquadV2Processor(SquadProcessor): class SquadExample(object): """ A single training/test example for the Squad dataset, as loaded from disk. + + Args: + qas_id: The example's unique identifier + question_text: The question string + context_text: The context string + answer_text: The answer string + start_position_character: The character position of the start of the answer + title: The title of the example + answers: None by default, this is used during evaluation. Holds answers as well as their start positions. + is_impossible: False by default, set to True if the example has no possible answer. """ def __init__(self, @@ -427,7 +476,26 @@ class SquadExample(object): class SquadFeatures(object): """ Single squad example features to be fed to a model. - Those features are model-specific. + Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample` + using the :func:`~transformers.data.processors.squad.squad_convert_examples_to_features` function. 
+ + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + token_type_ids: Segment token indices to indicate first and second portions of the inputs. + cls_index: the index of the CLS token. + p_mask: Mask identifying tokens that can be answers vs. tokens that cannot. + Mask with 1 for tokens that cannot be in the answer and 0 for tokens that can be in an answer + example_index: the index of the example + unique_id: The unique Feature identifier + paragraph_len: The length of the context + token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object. + If a token does not have its maximum context in this feature object, it means that another feature object + has more information related to that token and should be prioritized over this feature for that token. + tokens: list of tokens corresponding to the input ids + token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer. + start_position: start of the answer token index + end_position: end of the answer token index """ def __init__(self,