Doc styling (#8067)

* Important files

* Styling them all

* Revert "Styling them all"

This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e.

* Syling them for realsies

* Fix syntax error

* Fix benchmark_utils

* More fixes

* Fix modeling auto and script

* Remove new line

* Fixes

* More fixes

* Fix more files

* Style

* Add FSMT

* More fixes

* More fixes

* More fixes

* More fixes

* Fixes

* More fixes

* More fixes

* Last fixes

* Make sphinx happy
This commit is contained in:
Sylvain Gugger
2020-10-26 18:26:02 -04:00
committed by GitHub
parent 04a17f8550
commit 08f534d2da
271 changed files with 9726 additions and 8991 deletions

View File

@@ -11,21 +11,22 @@ from ..tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTrained
InputDataClass = NewType("InputDataClass", Any)
"""
A DataCollator is a function that takes a list of samples from a Dataset
and collate them into a batch, as a dictionary of Tensors.
A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
of Tensors.
"""
DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, torch.Tensor]])
def default_data_collator(features: List[InputDataClass]) -> Dict[str, torch.Tensor]:
"""
Very simple data collator that simply collates batches of dict-like objects and erforms special handling for potential keys named:
Very simple data collator that simply collates batches of dict-like objects and erforms special handling for
potential keys named:
- ``label``: handles a single value (int or float) per object
- ``label_ids``: handles a list of values per object
Des not do any additional preprocessing: property names of the input object will be used as corresponding inputs to the model.
See glue and ner for example of how it's useful.
Des not do any additional preprocessing: property names of the input object will be used as corresponding inputs to
the model. See glue and ner for example of how it's useful.
"""
# In this function we'll make the assumption that all `features` in the batch
@@ -73,11 +74,11 @@ class DataCollatorWithPadding:
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
The tokenizer used for encoding the data.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence if provided).
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
@@ -87,8 +88,8 @@ class DataCollatorWithPadding:
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
>= 7.5 (Volta).
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
"""
tokenizer: PreTrainedTokenizerBase
@@ -117,6 +118,7 @@ class DataCollatorWithPadding:
class DataCollatorForLanguageModeling:
"""
Data collator used for language modeling.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for masked language modeling
"""
@@ -198,6 +200,7 @@ class DataCollatorForLanguageModeling:
class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
"""
Data collator used for language modeling.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for masked language modeling
"""
@@ -275,8 +278,8 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
def mask_tokens(self, inputs: torch.Tensor, mask_labels: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
Set 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
"""
if self.tokenizer.mask_token is None:
@@ -316,6 +319,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
class DataCollatorForSOP(DataCollatorForLanguageModeling):
"""
Data collator used for sentence order prediction task.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for both masked language modeling and sentence order prediction
"""
@@ -342,8 +346,8 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10% original.
N-gram not applied yet.
Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10%
original. N-gram not applied yet.
"""
if self.tokenizer.mask_token is None:
raise ValueError(
@@ -385,6 +389,7 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
class DataCollatorForPermutationLanguageModeling:
"""
Data collator used for permutation language modeling.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for permutation language modeling with procedures specific to XLNet
"""
@@ -425,10 +430,14 @@ class DataCollatorForPermutationLanguageModeling:
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
0. Start from the beginning of the sequence by setting ``cur_len = 0`` (number of tokens processed so far).
1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be masked)
2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be masked
3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length - span_length]`` and mask tokens ``start_index:start_index + span_length``
4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in the sequence to be processed), repeat from Step 1.
1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be
masked)
2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be
masked
3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length -
span_length]`` and mask tokens ``start_index:start_index + span_length``
4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in
the sequence to be processed), repeat from Step 1.
"""
if self.tokenizer.mask_token is None:
@@ -517,8 +526,7 @@ class DataCollatorForPermutationLanguageModeling:
@dataclass
class DataCollatorForNextSentencePrediction:
"""
Data collator used for next sentence prediction.
- collates examples which contains pre-generated negative examples
Data collator used for next sentence prediction. - collates examples which contains pre-generated negative examples
- preprocesses batches for masked language modeling
"""
@@ -531,9 +539,12 @@ class DataCollatorForNextSentencePrediction:
def __call__(self, examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
"""
The input should contain negative examples, :class:`~transformers.DataCollatorForNextSentencePrediction` will not generate any negative examples.
The input should contain negative examples, :class:`~transformers.DataCollatorForNextSentencePrediction` will
not generate any negative examples
Args:
examples (:obj:`List[Dict]`): Each dictionary should have the following keys:
- ``tokens_a``: A sequence of tokens, which should appear before ``tokens_b`` in the text.
- ``tokens_b``: A sequence of tokens, which should appear after ``tokens_a`` in the text.
- ``is_random_next``: 1 if this pair is generated randomly, else 0.

View File

@@ -23,9 +23,8 @@ class GlueDataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
Using `HfArgumentParser` we can turn this class
into argparse arguments to be able to specify them on
the command line.
Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command
line.
"""
task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
@@ -55,8 +54,7 @@ class Split(Enum):
class GlueDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
This will be superseded by a framework-agnostic approach soon.
"""
args: GlueDataTrainingArguments

View File

@@ -19,8 +19,7 @@ logger = logging.get_logger(__name__)
class TextDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(
@@ -91,8 +90,7 @@ class TextDataset(Dataset):
class LineByLineTextDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
@@ -118,8 +116,7 @@ class LineByLineTextDataset(Dataset):
class LineByLineWithRefDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, ref_path: str):
@@ -294,8 +291,7 @@ class LineByLineWithSOPTextDataset(Dataset):
class TextDatasetForNextSentencePrediction(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(

View File

@@ -86,8 +86,7 @@ class Split(Enum):
class SquadDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach
soon.
This will be superseded by a framework-agnostic approach soon.
"""
args: SquadDataTrainingArguments

View File

@@ -1,10 +1,10 @@
""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was
modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
"""
Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was modified by XLNet authors to
update `find_best_threshold` scripts for SQuAD V2.0
In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question ID's to the model's predicted probability
that a question is unanswerable.
In addition to basic functionality, we also compute additional statistics and plot precision-recall curves if an
additional na_prob.json file is provided. This file is expected to map question ID's to the model's predicted
probability that a question is unanswerable.
"""
@@ -589,8 +589,9 @@ def compute_predictions_log_probs(
tokenizer,
verbose_logging,
):
"""XLNet write prediction logic (more complex than Bert's).
Write final predictions to the json file and log-odds of null if needed.
"""
XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of
null if needed.
Requires utils_squad_evaluate.py
"""

View File

@@ -52,9 +52,9 @@ def glue_convert_examples_to_features(
output_mode: String indicating the output mode. Either ``regression`` or ``classification``
Returns:
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
containing the task-specific features. If the input is a list of ``InputExamples``, will return
a list of task-specific ``InputFeatures`` which can be fed to the model.
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the
task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific
``InputFeatures`` which can be fed to the model.
"""
if is_tf_available() and isinstance(examples, tf.data.Dataset):

View File

@@ -314,8 +314,8 @@ def squad_convert_examples_to_features(
tqdm_enabled=True,
):
"""
Converts a list of examples into a list of features that can be directly given as input to a model.
It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
Converts a list of examples into a list of features that can be directly given as input to a model. It is
model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
Args:
examples: list of :class:`~transformers.data.processors.squad.SquadExample`
@@ -326,8 +326,7 @@ def squad_convert_examples_to_features(
is_training: whether to create features for model evaluation or model training.
padding_strategy: Default to "max_length". Which padding strategy to use
return_dataset: Default False. Either 'pt' or 'tf'.
if 'pt': returns a torch.data.TensorDataset,
if 'tf': returns a tf.data.Dataset
if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset
threads: multiple processing threadsa-smi
@@ -528,8 +527,8 @@ def squad_convert_examples_to_features(
class SquadProcessor(DataProcessor):
"""
Processor for the SQuAD data set.
Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively.
Processor for the SQuAD data set. Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and
version 2.0 of SQuAD, respectively.
"""
train_file = None
@@ -745,9 +744,9 @@ class SquadExample:
class SquadFeatures:
"""
Single squad example features to be fed to a model.
Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample`
using the :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.
Single squad example features to be fed to a model. Those features are model-specific and can be crafted from
:class:`~transformers.data.processors.squad.SquadExample` using the
:method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.
Args:
input_ids: Indices of input sequence tokens in the vocabulary.

View File

@@ -55,14 +55,13 @@ class InputExample:
@dataclass(frozen=True)
class InputFeatures:
"""
A single set of features of data.
Property names are the same names as the corresponding inputs to a model.
A single set of features of data. Property names are the same names as the corresponding inputs to a model.
Args:
input_ids: Indices of input sequence tokens in the vocabulary.
attention_mask: Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded)
tokens.
token_type_ids: (Optional) Segment token indices to indicate first and second
portions of the inputs. Only some models use them.
label: (Optional) Label corresponding to the input. Int for classification problems,
@@ -83,7 +82,8 @@ class DataProcessor:
"""Base class for data converters for sequence classification data sets."""
def get_example_from_tensor_dict(self, tensor_dict):
"""Gets an example from a dict with tensorflow tensors.
"""
Gets an example from a dict with tensorflow tensors.
Args:
tensor_dict: Keys and values should match the corresponding Glue
@@ -108,8 +108,10 @@ class DataProcessor:
raise NotImplementedError()
def tfds_map(self, example):
"""Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are.
This method converts examples to the correct format."""
"""
Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
examples to the correct format.
"""
if len(self.get_labels()) > 1:
example.label = self.get_labels()[int(example.label)]
return example
@@ -253,9 +255,9 @@ class SingleSentenceClassificationProcessor(DataProcessor):
actual values)
Returns:
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
containing the task-specific features. If the input is a list of ``InputExamples``, will return
a list of task-specific ``InputFeatures`` which can be fed to the model.
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the
task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific
``InputFeatures`` which can be fed to the model.
"""
if max_length is None:

View File

@@ -26,8 +26,10 @@ logger = logging.get_logger(__name__)
class XnliProcessor(DataProcessor):
"""Processor for the XNLI dataset.
Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207"""
"""
Processor for the XNLI dataset. Adapted from
https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207
"""
def __init__(self, language, train_language=None):
self.language = language