Doc styling (#8067)
* Important files * Styling them all * Revert "Styling them all" This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e. * Syling them for realsies * Fix syntax error * Fix benchmark_utils * More fixes * Fix modeling auto and script * Remove new line * Fixes * More fixes * Fix more files * Style * Add FSMT * More fixes * More fixes * More fixes * More fixes * Fixes * More fixes * More fixes * Last fixes * Make sphinx happy
This commit is contained in:
@@ -11,21 +11,22 @@ from ..tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTrained
|
||||
InputDataClass = NewType("InputDataClass", Any)
|
||||
|
||||
"""
|
||||
A DataCollator is a function that takes a list of samples from a Dataset
|
||||
and collate them into a batch, as a dictionary of Tensors.
|
||||
A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
|
||||
of Tensors.
|
||||
"""
|
||||
DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, torch.Tensor]])
|
||||
|
||||
|
||||
def default_data_collator(features: List[InputDataClass]) -> Dict[str, torch.Tensor]:
|
||||
"""
|
||||
Very simple data collator that simply collates batches of dict-like objects and performs special handling for potential keys named:
|
||||
Very simple data collator that simply collates batches of dict-like objects and performs special handling for
|
||||
potential keys named:
|
||||
|
||||
- ``label``: handles a single value (int or float) per object
|
||||
- ``label_ids``: handles a list of values per object
|
||||
|
||||
Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs to the model.
|
||||
See glue and ner for example of how it's useful.
|
||||
Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs to
|
||||
the model. See glue and ner for example of how it's useful.
|
||||
"""
|
||||
|
||||
# In this function we'll make the assumption that all `features` in the batch
|
||||
@@ -73,11 +74,11 @@ class DataCollatorWithPadding:
|
||||
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
|
||||
The tokenizer used for encoding the data.
|
||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
||||
Select a strategy to pad the returned sequences (according to the model's padding side and padding
|
||||
index) among:
|
||||
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
|
||||
among:
|
||||
|
||||
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
||||
single sequence is provided).
|
||||
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
|
||||
sequence is provided).
|
||||
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
|
||||
maximum acceptable input length for the model if that argument is not provided.
|
||||
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
|
||||
@@ -87,8 +88,8 @@ class DataCollatorWithPadding:
|
||||
pad_to_multiple_of (:obj:`int`, `optional`):
|
||||
If set will pad the sequence to a multiple of the provided value.
|
||||
|
||||
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
|
||||
>= 7.5 (Volta).
|
||||
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
|
||||
7.5 (Volta).
|
||||
"""
|
||||
|
||||
tokenizer: PreTrainedTokenizerBase
|
||||
@@ -117,6 +118,7 @@ class DataCollatorWithPadding:
|
||||
class DataCollatorForLanguageModeling:
|
||||
"""
|
||||
Data collator used for language modeling.
|
||||
|
||||
- collates batches of tensors, honoring their tokenizer's pad_token
|
||||
- preprocesses batches for masked language modeling
|
||||
"""
|
||||
@@ -198,6 +200,7 @@ class DataCollatorForLanguageModeling:
|
||||
class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
|
||||
"""
|
||||
Data collator used for language modeling.
|
||||
|
||||
- collates batches of tensors, honoring their tokenizer's pad_token
|
||||
- preprocesses batches for masked language modeling
|
||||
"""
|
||||
@@ -275,8 +278,8 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
|
||||
|
||||
def mask_tokens(self, inputs: torch.Tensor, mask_labels: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
|
||||
Set 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to its ref.
|
||||
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
|
||||
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to its ref.
|
||||
"""
|
||||
|
||||
if self.tokenizer.mask_token is None:
|
||||
@@ -316,6 +319,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
|
||||
class DataCollatorForSOP(DataCollatorForLanguageModeling):
|
||||
"""
|
||||
Data collator used for sentence order prediction task.
|
||||
|
||||
- collates batches of tensors, honoring their tokenizer's pad_token
|
||||
- preprocesses batches for both masked language modeling and sentence order prediction
|
||||
"""
|
||||
@@ -342,8 +346,8 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
|
||||
|
||||
def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10% original.
|
||||
N-gram not applied yet.
|
||||
Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10%
|
||||
original. N-gram not applied yet.
|
||||
"""
|
||||
if self.tokenizer.mask_token is None:
|
||||
raise ValueError(
|
||||
@@ -385,6 +389,7 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
|
||||
class DataCollatorForPermutationLanguageModeling:
|
||||
"""
|
||||
Data collator used for permutation language modeling.
|
||||
|
||||
- collates batches of tensors, honoring their tokenizer's pad_token
|
||||
- preprocesses batches for permutation language modeling with procedures specific to XLNet
|
||||
"""
|
||||
@@ -425,10 +430,14 @@ class DataCollatorForPermutationLanguageModeling:
|
||||
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
|
||||
|
||||
0. Start from the beginning of the sequence by setting ``cur_len = 0`` (number of tokens processed so far).
|
||||
1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be masked)
|
||||
2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be masked
|
||||
3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length - span_length]`` and mask tokens ``start_index:start_index + span_length``
|
||||
4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in the sequence to be processed), repeat from Step 1.
|
||||
1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be
|
||||
masked)
|
||||
2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be
|
||||
masked
|
||||
3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length -
|
||||
span_length]`` and mask tokens ``start_index:start_index + span_length``
|
||||
4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in
|
||||
the sequence to be processed), repeat from Step 1.
|
||||
"""
|
||||
|
||||
if self.tokenizer.mask_token is None:
|
||||
@@ -517,8 +526,7 @@ class DataCollatorForPermutationLanguageModeling:
|
||||
@dataclass
|
||||
class DataCollatorForNextSentencePrediction:
|
||||
"""
|
||||
Data collator used for next sentence prediction.
|
||||
- collates examples which contain pre-generated negative examples
|
||||
Data collator used for next sentence prediction. - collates examples which contain pre-generated negative examples
|
||||
- preprocesses batches for masked language modeling
|
||||
"""
|
||||
|
||||
@@ -531,9 +539,12 @@ class DataCollatorForNextSentencePrediction:
|
||||
|
||||
def __call__(self, examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
|
||||
"""
|
||||
The input should contain negative examples, :class:`~transformers.DataCollatorForNextSentencePrediction` will not generate any negative examples.
|
||||
The input should contain negative examples, :class:`~transformers.DataCollatorForNextSentencePrediction` will
|
||||
not generate any negative examples
|
||||
|
||||
Args:
|
||||
examples (:obj:`List[Dict]`): Each dictionary should have the following keys:
|
||||
|
||||
- ``tokens_a``: A sequence of tokens, which should appear before ``tokens_b`` in the text.
|
||||
- ``tokens_b``: A sequence of tokens, which should appear after ``tokens_a`` in the text.
|
||||
- ``is_random_next``: 1 if this pair is generated randomly, else 0.
|
||||
|
||||
@@ -23,9 +23,8 @@ class GlueDataTrainingArguments:
|
||||
"""
|
||||
Arguments pertaining to what data we are going to input our model for training and eval.
|
||||
|
||||
Using `HfArgumentParser` we can turn this class
|
||||
into argparse arguments to be able to specify them on
|
||||
the command line.
|
||||
Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command
|
||||
line.
|
||||
"""
|
||||
|
||||
task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
|
||||
@@ -55,8 +54,7 @@ class Split(Enum):
|
||||
|
||||
class GlueDataset(Dataset):
|
||||
"""
|
||||
This will be superseded by a framework-agnostic approach
|
||||
soon.
|
||||
This will be superseded by a framework-agnostic approach soon.
|
||||
"""
|
||||
|
||||
args: GlueDataTrainingArguments
|
||||
|
||||
@@ -19,8 +19,7 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
class TextDataset(Dataset):
|
||||
"""
|
||||
This will be superseded by a framework-agnostic approach
|
||||
soon.
|
||||
This will be superseded by a framework-agnostic approach soon.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -91,8 +90,7 @@ class TextDataset(Dataset):
|
||||
|
||||
class LineByLineTextDataset(Dataset):
|
||||
"""
|
||||
This will be superseded by a framework-agnostic approach
|
||||
soon.
|
||||
This will be superseded by a framework-agnostic approach soon.
|
||||
"""
|
||||
|
||||
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
|
||||
@@ -118,8 +116,7 @@ class LineByLineTextDataset(Dataset):
|
||||
|
||||
class LineByLineWithRefDataset(Dataset):
|
||||
"""
|
||||
This will be superseded by a framework-agnostic approach
|
||||
soon.
|
||||
This will be superseded by a framework-agnostic approach soon.
|
||||
"""
|
||||
|
||||
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, ref_path: str):
|
||||
@@ -294,8 +291,7 @@ class LineByLineWithSOPTextDataset(Dataset):
|
||||
|
||||
class TextDatasetForNextSentencePrediction(Dataset):
|
||||
"""
|
||||
This will be superseded by a framework-agnostic approach
|
||||
soon.
|
||||
This will be superseded by a framework-agnostic approach soon.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -86,8 +86,7 @@ class Split(Enum):
|
||||
|
||||
class SquadDataset(Dataset):
|
||||
"""
|
||||
This will be superseded by a framework-agnostic approach
|
||||
soon.
|
||||
This will be superseded by a framework-agnostic approach soon.
|
||||
"""
|
||||
|
||||
args: SquadDataTrainingArguments
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was
|
||||
modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
|
||||
"""
|
||||
Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was modified by XLNet authors to
|
||||
update `find_best_threshold` scripts for SQuAD V2.0
|
||||
|
||||
In addition to basic functionality, we also compute additional statistics and
|
||||
plot precision-recall curves if an additional na_prob.json file is provided.
|
||||
This file is expected to map question ID's to the model's predicted probability
|
||||
that a question is unanswerable.
|
||||
In addition to basic functionality, we also compute additional statistics and plot precision-recall curves if an
|
||||
additional na_prob.json file is provided. This file is expected to map question ID's to the model's predicted
|
||||
probability that a question is unanswerable.
|
||||
"""
|
||||
|
||||
|
||||
@@ -589,8 +589,9 @@ def compute_predictions_log_probs(
|
||||
tokenizer,
|
||||
verbose_logging,
|
||||
):
|
||||
"""XLNet write prediction logic (more complex than Bert's).
|
||||
Write final predictions to the json file and log-odds of null if needed.
|
||||
"""
|
||||
XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of
|
||||
null if needed.
|
||||
|
||||
Requires utils_squad_evaluate.py
|
||||
"""
|
||||
|
||||
@@ -52,9 +52,9 @@ def glue_convert_examples_to_features(
|
||||
output_mode: String indicating the output mode. Either ``regression`` or ``classification``
|
||||
|
||||
Returns:
|
||||
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
|
||||
containing the task-specific features. If the input is a list of ``InputExamples``, will return
|
||||
a list of task-specific ``InputFeatures`` which can be fed to the model.
|
||||
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the
|
||||
task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific
|
||||
``InputFeatures`` which can be fed to the model.
|
||||
|
||||
"""
|
||||
if is_tf_available() and isinstance(examples, tf.data.Dataset):
|
||||
|
||||
@@ -314,8 +314,8 @@ def squad_convert_examples_to_features(
|
||||
tqdm_enabled=True,
|
||||
):
|
||||
"""
|
||||
Converts a list of examples into a list of features that can be directly given as input to a model.
|
||||
It is model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.
|
||||
Converts a list of examples into a list of features that can be directly given as input to a model. It is
|
||||
model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.
|
||||
|
||||
Args:
|
||||
examples: list of :class:`~transformers.data.processors.squad.SquadExample`
|
||||
@@ -326,8 +326,7 @@ def squad_convert_examples_to_features(
|
||||
is_training: whether to create features for model evaluation or model training.
|
||||
padding_strategy: Default to "max_length". Which padding strategy to use
|
||||
return_dataset: Default False. Either 'pt' or 'tf'.
|
||||
if 'pt': returns a torch.data.TensorDataset,
|
||||
if 'tf': returns a tf.data.Dataset
|
||||
if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset
|
||||
threads: multiple processing threads
|
||||
|
||||
|
||||
@@ -528,8 +527,8 @@ def squad_convert_examples_to_features(
|
||||
|
||||
class SquadProcessor(DataProcessor):
|
||||
"""
|
||||
Processor for the SQuAD data set.
|
||||
Overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively.
|
||||
Processor for the SQuAD data set. Overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and
|
||||
version 2.0 of SQuAD, respectively.
|
||||
"""
|
||||
|
||||
train_file = None
|
||||
@@ -745,9 +744,9 @@ class SquadExample:
|
||||
|
||||
class SquadFeatures:
|
||||
"""
|
||||
Single squad example features to be fed to a model.
|
||||
Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample`
|
||||
using the :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.
|
||||
Single squad example features to be fed to a model. Those features are model-specific and can be crafted from
|
||||
:class:`~transformers.data.processors.squad.SquadExample` using the
|
||||
:method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.
|
||||
|
||||
Args:
|
||||
input_ids: Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
@@ -55,14 +55,13 @@ class InputExample:
|
||||
@dataclass(frozen=True)
|
||||
class InputFeatures:
|
||||
"""
|
||||
A single set of features of data.
|
||||
Property names are the same names as the corresponding inputs to a model.
|
||||
A single set of features of data. Property names are the same names as the corresponding inputs to a model.
|
||||
|
||||
Args:
|
||||
input_ids: Indices of input sequence tokens in the vocabulary.
|
||||
attention_mask: Mask to avoid performing attention on padding token indices.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
|
||||
Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded)
|
||||
tokens.
|
||||
token_type_ids: (Optional) Segment token indices to indicate first and second
|
||||
portions of the inputs. Only some models use them.
|
||||
label: (Optional) Label corresponding to the input. Int for classification problems,
|
||||
@@ -83,7 +82,8 @@ class DataProcessor:
|
||||
"""Base class for data converters for sequence classification data sets."""
|
||||
|
||||
def get_example_from_tensor_dict(self, tensor_dict):
|
||||
"""Gets an example from a dict with tensorflow tensors.
|
||||
"""
|
||||
Gets an example from a dict with tensorflow tensors.
|
||||
|
||||
Args:
|
||||
tensor_dict: Keys and values should match the corresponding Glue
|
||||
@@ -108,8 +108,10 @@ class DataProcessor:
|
||||
raise NotImplementedError()
|
||||
|
||||
def tfds_map(self, example):
|
||||
"""Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are.
|
||||
This method converts examples to the correct format."""
|
||||
"""
|
||||
Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
|
||||
examples to the correct format.
|
||||
"""
|
||||
if len(self.get_labels()) > 1:
|
||||
example.label = self.get_labels()[int(example.label)]
|
||||
return example
|
||||
@@ -253,9 +255,9 @@ class SingleSentenceClassificationProcessor(DataProcessor):
|
||||
actual values)
|
||||
|
||||
Returns:
|
||||
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
|
||||
containing the task-specific features. If the input is a list of ``InputExamples``, will return
|
||||
a list of task-specific ``InputFeatures`` which can be fed to the model.
|
||||
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the
|
||||
task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific
|
||||
``InputFeatures`` which can be fed to the model.
|
||||
|
||||
"""
|
||||
if max_length is None:
|
||||
|
||||
@@ -26,8 +26,10 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class XnliProcessor(DataProcessor):
|
||||
"""Processor for the XNLI dataset.
|
||||
Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207"""
|
||||
"""
|
||||
Processor for the XNLI dataset. Adapted from
|
||||
https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207
|
||||
"""
|
||||
|
||||
def __init__(self, language, train_language=None):
|
||||
self.language = language
|
||||
|
||||
Reference in New Issue
Block a user