Documentation
This commit is contained in:
@@ -56,3 +56,80 @@ Example usage
|
||||
|
||||
An example using these processors is given in the
|
||||
`run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
|
||||
|
||||
|
||||
|
||||
SQuAD
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
`The Stanford Question Answering Dataset (SQuAD) <https://rajpurkar.github.io/SQuAD-explorer/>`__ is a benchmark that evaluates
|
||||
the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version (v1.1) was released together with the paper
|
||||
`SQuAD: 100,000+ Questions for Machine Comprehension of Text <https://arxiv.org/abs/1606.05250>`__. The second version (v2.0) was released alongside
|
||||
the paper `Know What You Don't Know: Unanswerable Questions for SQuAD <https://arxiv.org/abs/1806.03822>`__.
|
||||
|
||||
This library hosts a processor for each of the two versions:
|
||||
|
||||
Processors
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Those processors are:
|
||||
- :class:`~transformers.data.processors.squad.SquadV1Processor`
|
||||
- :class:`~transformers.data.processors.squad.SquadV2Processor`
|
||||
|
||||
They both inherit from the abstract class :class:`~transformers.data.processors.squad.SquadProcessor`.
|
||||
|
||||
.. autoclass:: transformers.data.processors.squad.SquadProcessor
|
||||
:members:
|
||||
|
||||
Additionally, the following method can be used to convert SQuAD examples into :class:`~transformers.data.processors.squad.SquadFeatures`
|
||||
that can be used as model inputs.
|
||||
|
||||
.. automethod:: transformers.data.processors.squad.squad_convert_examples_to_features
|
||||
|
||||
These processors as well as the aforementioned method can be used with files containing the data as well as with the ``tensorflow_datasets`` package.
|
||||
Examples are given below.
|
||||
|
||||
Example usage
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Here is an example using the processors as well as the conversion method using data files:
|
||||
|
||||
Example::
|
||||
|
||||
# Loading a V2 processor
|
||||
processor = SquadV2Processor()
|
||||
examples = processor.get_dev_examples(squad_v2_data_dir)
|
||||
|
||||
# Loading a V1 processor
|
||||
processor = SquadV1Processor()
|
||||
examples = processor.get_dev_examples(squad_v1_data_dir)
|
||||
|
||||
features = squad_convert_examples_to_features(
|
||||
examples=examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=max_query_length,
|
||||
is_training=not evaluate,
|
||||
)
|
||||
|
||||
Using ``tensorflow_datasets`` is as easy as using a data file:
|
||||
|
||||
Example::
|
||||
|
||||
# tensorflow_datasets only handles SQuAD v1.1.
|
||||
tfds_examples = tfds.load("squad")
|
||||
examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
|
||||
|
||||
features = squad_convert_examples_to_features(
|
||||
examples=examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=max_query_length,
|
||||
is_training=not evaluate,
|
||||
)
|
||||
|
||||
|
||||
Another example using these processors is given in the
|
||||
`run_squad.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py>`__ script.
|
||||
@@ -74,7 +74,35 @@ def _is_whitespace(c):
|
||||
|
||||
def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
|
||||
doc_stride, max_query_length, is_training):
|
||||
"""Loads a data file into a list of `InputBatch`s."""
|
||||
"""
|
||||
Converts a list of examples into a list of features that can be directly given as input to a model.
|
||||
It is model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.
|
||||
|
||||
Args:
|
||||
examples: list of :class:`~transformers.data.processors.squad.SquadExample`
|
||||
tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
|
||||
max_seq_length: The maximum sequence length of the inputs.
|
||||
doc_stride: The stride used when the context is too large and is split across several features.
|
||||
max_query_length: The maximum length of the query.
|
||||
is_training: whether to create features for model training (True) or for model evaluation (False).
|
||||
|
||||
Returns:
|
||||
list of :class:`~transformers.data.processors.squad.SquadFeatures`
|
||||
|
||||
Example::
|
||||
|
||||
processor = SquadV2Processor()
|
||||
examples = processor.get_dev_examples(data_dir)
|
||||
|
||||
features = squad_convert_examples_to_features(
|
||||
examples=examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=args.max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=args.max_query_length,
|
||||
is_training=not evaluate,
|
||||
)
|
||||
"""
|
||||
|
||||
# Defining helper methods
|
||||
unique_id = 1000000000
|
||||
@@ -240,12 +268,14 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
|
||||
|
||||
|
||||
class SquadProcessor(DataProcessor):
|
||||
"""Processor for the SQuAD data set."""
|
||||
"""
|
||||
Processor for the SQuAD data set.
|
||||
Overridden by SquadV1Processor and SquadV2Processor, used by version 1.1 and version 2.0 of SQuAD, respectively.
|
||||
"""
|
||||
train_file = None
|
||||
dev_file = None
|
||||
|
||||
def get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
|
||||
|
||||
def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
|
||||
if not evaluate:
|
||||
answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8')
|
||||
answer_start = tensor_dict['answers']['answer_start'][0].numpy()
|
||||
@@ -296,35 +326,44 @@ class SquadProcessor(DataProcessor):
|
||||
|
||||
examples = []
|
||||
for tensor_dict in tqdm(dataset):
|
||||
examples.append(self.get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))
|
||||
examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))
|
||||
|
||||
return examples
|
||||
|
||||
def get_train_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
def get_train_examples(self, data_dir, filename=None):
|
||||
"""
|
||||
Returns the training examples from the data directory.
|
||||
|
||||
Args:
|
||||
data_dir: Directory containing the data files used for training and evaluating.
|
||||
filename: None by default, specify this if the training file has a different name than the original one
|
||||
which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
|
||||
|
||||
"""
|
||||
if self.train_file is None:
|
||||
raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
|
||||
|
||||
with open(os.path.join(data_dir, self.train_file), "r", encoding='utf-8') as reader:
|
||||
with open(os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding='utf-8') as reader:
|
||||
input_data = json.load(reader)["data"]
|
||||
return self._create_examples(input_data, "train")
|
||||
|
||||
def get_dev_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
def get_dev_examples(self, data_dir, filename=None):
|
||||
"""
|
||||
Returns the evaluation examples from the data directory.
|
||||
|
||||
Args:
|
||||
data_dir: Directory containing the data files used for training and evaluating.
|
||||
filename: None by default, specify this if the evaluation file has a different name than the original one
|
||||
which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.
|
||||
"""
|
||||
if self.dev_file is None:
|
||||
raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
|
||||
|
||||
with open(os.path.join(data_dir, self.dev_file), "r", encoding='utf-8') as reader:
|
||||
with open(os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding='utf-8') as reader:
|
||||
input_data = json.load(reader)["data"]
|
||||
return self._create_examples(input_data, "dev")
|
||||
|
||||
def get_labels(self):
|
||||
"""See base class."""
|
||||
return ["0", "1"]
|
||||
|
||||
def _create_examples(self, input_data, set_type):
|
||||
"""Creates examples for the training and dev sets."""
|
||||
|
||||
is_training = set_type == "train"
|
||||
examples = []
|
||||
for entry in tqdm(input_data):
|
||||
@@ -378,6 +417,16 @@ class SquadV2Processor(SquadProcessor):
|
||||
class SquadExample(object):
|
||||
"""
|
||||
A single training/test example for the Squad dataset, as loaded from disk.
|
||||
|
||||
Args:
|
||||
qas_id: The example's unique identifier
|
||||
question_text: The question string
|
||||
context_text: The context string
|
||||
answer_text: The answer string
|
||||
start_position_character: The character position of the start of the answer
|
||||
title: The title of the example
|
||||
answers: None by default, this is used during evaluation. Holds answers as well as their start positions.
|
||||
is_impossible: False by default, set to True if the example has no possible answer.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@@ -427,7 +476,26 @@ class SquadExample(object):
|
||||
class SquadFeatures(object):
|
||||
"""
|
||||
Single squad example features to be fed to a model.
|
||||
Those features are model-specific.
|
||||
Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample`
|
||||
using the :func:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.
|
||||
|
||||
Args:
|
||||
input_ids: Indices of input sequence tokens in the vocabulary.
|
||||
attention_mask: Mask to avoid performing attention on padding token indices.
|
||||
token_type_ids: Segment token indices to indicate first and second portions of the inputs.
|
||||
cls_index: the index of the CLS token.
|
||||
p_mask: Mask identifying tokens that can be answers vs. tokens that cannot.
|
||||
Mask with 1 for tokens that cannot be in the answer and 0 for tokens that can be in the answer.
|
||||
example_index: the index of the example
|
||||
unique_id: The unique Feature identifier
|
||||
paragraph_len: The length of the context
|
||||
token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object.
|
||||
If a token does not have its maximum context in this feature object, it means that another feature object
|
||||
has more information related to that token and should be prioritized over this feature for that token.
|
||||
tokens: list of tokens corresponding to the input ids
|
||||
token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
|
||||
start_position: start of the answer token index
|
||||
end_position: end of the answer token index
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
||||
Reference in New Issue
Block a user