Fix doc errors and typos across the board (#8139)

* Fix doc errors and typos across the board
* Fix a typo
* Fix the CI
* Fix more typos
* Fix CI
* More fixes
* Fix CI
* More fixes
* More fixes
2020-10-29 10:33:33 -04:00
parent 4731a00c3e
commit 969859d5f6
160 changed files with 342 additions and 364 deletions
--- a/examples/adversarial/utils_hans.py
+++ b/examples/adversarial/utils_hans.py
@@ -291,10 +291,9 @@ def hans_convert_examples_to_features(

    Args:
        examples: List of ``InputExamples`` containing the examples.
-        tokenizer: Instance of a tokenizer that will tokenize the examples.
-        max_length: Maximum example length.
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method.
-        output_mode: String indicating the output mode. Either ``regression`` or ``classification``.
+        max_length: Maximum example length.
+        tokenizer: Instance of a tokenizer that will tokenize the examples.

    Returns:
        A list of task-specific ``InputFeatures`` which can be fed to the model.
--- a/examples/bert-loses-patience/pabee/modeling_pabee_bert.py
+++ b/examples/bert-loses-patience/pabee/modeling_pabee_bert.py
@@ -155,7 +155,7 @@ class BertModelWithPabee(BertModel):
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
--- a/examples/deebert/src/modeling_highway_bert.py
+++ b/examples/deebert/src/modeling_highway_bert.py
@@ -198,7 +198,7 @@ class DeeBertModel(BertPreTrainedModel):
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if encoder_attention_mask.dim() == 3:
            encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
        if encoder_attention_mask.dim() == 2:
@@ -260,7 +260,7 @@ class BertHighway(nn.Module):

        # BertModel
        bmodel_output = (pooler_input, pooler_output) + encoder_outputs[1:]
-        # "return" bodel_output
+        # "return" bmodel_output

        # Dropout and classification
        pooled_output = bmodel_output[1]
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -265,7 +265,7 @@ class Distiller:
        -------
            token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
            attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-            clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -100 where there is nothing to predict.
+            clm_labels: `torch.tensor(bs, seq_length)` - The causal language modeling labels. There is a -100 where there is nothing to predict.
        """
        token_ids, lengths = batch
        token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
@@ -401,9 +401,9 @@ class Distiller:
        # https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
        # https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
        if self.params.restrict_ce_to_mask:
-            mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits)  # (bs, seq_lenth, voc_size)
+            mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits)  # (bs, seq_length, voc_size)
        else:
-            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)  # (bs, seq_lenth, voc_size)
+            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)  # (bs, seq_length, voc_size)
        s_logits_slct = torch.masked_select(s_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
        s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1))  # (bs * seq_length, voc_size) modulo the 1s in mask
        t_logits_slct = torch.masked_select(t_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
--- a/examples/distillation/lm_seqs_dataset.py
+++ b/examples/distillation/lm_seqs_dataset.py
@@ -61,7 +61,7 @@ class LmSeqsDataset(Dataset):

    def remove_long_sequences(self):
        """
-        Sequences that are too long are splitted by chunk of max_model_input_size.
+        Sequences that are too long are split into chunks of max_model_input_size.
        """
        max_len = self.params.max_model_input_size
        indices = self.lengths > max_len
@@ -138,8 +138,8 @@ class LmSeqsDataset(Dataset):
        # logger.info(f'{data_len} tokens ({nb_unique_tokens} unique)')

        # unk_idx = self.params.special_tok_ids['unk_token']
-        # nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
-        # logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')
+        # nb_unknown = sum([(t==unk_idx).sum() for t in self.token_ids])
+        # logger.info(f'{nb_unknown} unknown tokens (covering {100*nb_unknown/data_len:.2f}% of the data)')

    def batch_sequences(self, batch):
        """
--- a/examples/distillation/scripts/extract.py
+++ b/examples/distillation/scripts/extract.py
@@ -96,7 +96,7 @@ if __name__ == "__main__":
        compressed_sd["lm_head.weight"] = state_dict["lm_head.weight"]

    print(f"N layers selected for distillation: {std_idx}")
-    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
+    print(f"Number of params transferred for distillation: {len(compressed_sd.keys())}")

-    print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
+    print(f"Save transferred checkpoint to {args.dump_checkpoint}.")
    torch.save(compressed_sd, args.dump_checkpoint)
--- a/examples/lxmert/modeling_frcnn.py
+++ b/examples/lxmert/modeling_frcnn.py
@@ -266,14 +266,14 @@ def find_top_rpn_proposals(
 ):
    """Args:
        proposals (list[Tensor]): (L, N, Hi*Wi*A, 4).
-        pred_objectness_logits: tensors of lenngth L.
+        pred_objectness_logits: tensors of length L.
        nms_thresh (float): IoU threshold to use for NMS
        pre_nms_topk (int): before nms
        post_nms_topk (int): after nms
        min_box_side_len (float): minimum proposal box side
        training (bool): True if proposals are to be used in training,
    Returns:
-        resuls (List[Dict]): stores post_nms_topk object proposals for image i.
+        results (List[Dict]): stores post_nms_topk object proposals for image i.
    """
    num_images = len(images)
    device = proposals[0].device
@@ -648,7 +648,7 @@ class RPNOutputs(object):
            images (ImageList): :class:`ImageList` instance representing N input images
            pred_objectness_logits (list[Tensor]): A list of L elements. Element i is a tensor of shape (N, A, Hi, W)
            pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape (N, A*4, Hi, Wi)
-            anchors (list[torch.Tensor]): nested list ofboxes. anchors[i][j] at (n, l) stores anchor array for feature map l
+            anchors (list[torch.Tensor]): nested list of boxes. anchors[i][j] at (n, l) stores anchor array for feature map l
            boundary_threshold (int): if >= 0, then anchors that extend beyond the image boundary by more than boundary_thresh are not used in training.
            gt_boxes (list[Boxes], optional): A list of N elements.
            smooth_l1_beta (float): The transition point between L1 and L2 lossn. When set to 0, the loss becomes L1. When +inf, it is ignored
@@ -1186,7 +1186,7 @@ class ROIOutputs(object):
        attr_probs_all, attrs_all = self._predict_attrs(attr_logits, preds_per_image)
        features = features.split(preds_per_image, dim=0)

-        # fun for each image too, also I can expirement and do multiple images
+        # run for each image too, also I can experiment and do multiple images
        final_results = []
        zipped = zip(boxes_all, obj_scores_all, attr_probs_all, attrs_all, sizes)
        for i, (boxes, obj_scores, attr_probs, attrs, size) in enumerate(zipped):
@@ -1412,7 +1412,7 @@ class AnchorGenerator(nn.Module):

    def generate_cell_anchors(self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)):
        """
-        anchors are continious geometric rectangles
+        anchors are continuous geometric rectangles
        centered on one feature map point sample.
        We can later build the set of anchors
        for the entire feature map by tiling these tensors
@@ -1865,7 +1865,7 @@ class GeneralizedRCNN(nn.Module):
        scales_yx=None,
        **kwargs,
    ):
-        # run images through bacbone
+        # run images through backbone
        original_sizes = image_shapes * scales_yx
        features = self.backbone(images)

--- a/examples/lxmert/processing_image.py
+++ b/examples/lxmert/processing_image.py
@@ -116,7 +116,7 @@ class Preprocess:
            images = self.aug(images)
            # transpose images and convert to torch tensors
            # images = [torch.as_tensor(i.astype("float32")).permute(2, 0, 1).to(self.device) for i in images]
-            # now normalize before pad to aoid useless arithmatic
+            # now normalize before pad to avoid useless arithmetic
            images = [self.normalizer(x) for x in images]
            # now pad them to do the following operations
            images, sizes = self.pad(images)
--- a/examples/lxmert/utils.py
+++ b/examples/lxmert/utils.py
@@ -236,7 +236,7 @@ def compare(in_tensor):
    ), f"{sum([1 for x in np.isclose(n1, n2, rtol=0.01, atol=0.1).flatten() if x == False])/len(n1.flatten())*100:.4f} % element-wise mismatch"
    raise Exception("tensors are all good")

-    # Hugging face functiions below
+    # Hugging face functions below


 def is_remote_url(url_or_filename):
@@ -520,7 +520,7 @@ def get_image_from_url(url):
    return img


-# to load legace frcnn checkpoint from detectron
+# to load legacy frcnn checkpoint from detectron
 def load_frcnn_pkl_from_url(url):
    fn = url.split("/")[-1]
    if fn not in os.listdir(os.getcwd()):
--- a/examples/movement-pruning/counts_parameters.py
+++ b/examples/movement-pruning/counts_parameters.py
@@ -33,7 +33,7 @@ def main(args):
    remaining_count = 0  # Number of remaining (not pruned) params in the encoder
    encoder_count = 0  # Number of params in the encoder

-    print("name".ljust(60, " "), "Remaining Weights %", "Remaning Weight")
+    print("name".ljust(60, " "), "Remaining Weights %", "Remaining Weight")
    for name, param in st.items():
        if "encoder" not in name:
            continue
--- a/examples/movement-pruning/emmental/modeling_bert_masked.py
+++ b/examples/movement-pruning/emmental/modeling_bert_masked.py
@@ -591,7 +591,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
@@ -631,7 +631,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
                )  # We can specify head_mask for each layer
            head_mask = head_mask.to(
                dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
+            )  # switch to float if needed + fp16 compatibility
        else:
            head_mask = [None] * self.config.num_hidden_layers

--- a/examples/movement-pruning/masked_run_glue.py
+++ b/examples/movement-pruning/masked_run_glue.py
@@ -225,7 +225,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
-    set_seed(args)  # Added here for reproductibility
+    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
@@ -705,7 +705,7 @@ def main():
        "--final_lambda",
        default=0.0,
        type=float,
-        help="Regularization intensity (used in conjunction with `regulariation`.",
+        help="Regularization intensity (used in conjunction with `regularization`.",
    )

    parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")
@@ -816,7 +816,7 @@ def main():
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
--- a/examples/movement-pruning/masked_run_squad.py
+++ b/examples/movement-pruning/masked_run_squad.py
@@ -231,7 +231,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
-    # Added here for reproductibility
+    # Added here for reproducibility
    set_seed(args)

    for _ in train_iterator:
@@ -824,7 +824,7 @@ def main():
        "--final_lambda",
        default=0.0,
        type=float,
-        help="Regularization intensity (used in conjunction with `regulariation`.",
+        help="Regularization intensity (used in conjunction with `regularization`.",
    )

    parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")
@@ -977,7 +977,7 @@ def main():
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
--- a/examples/rag/distributed_retriever.py
+++ b/examples/rag/distributed_retriever.py
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
 class RagPyTorchDistributedRetriever(RagRetriever):
    """
    A distributed retriever built on top of the ``torch.distributed`` communication package. During training all workers
-    initalize their own instance of the retriever, however, only the main worker loads the index into memory. The index is stored
+    initialize their own instance of the retriever, however, only the main worker loads the index into memory. The index is stored
    in cpu memory. The index will also work well in a non-distributed setup.

    Args:
@@ -45,7 +45,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):

    def init_retrieval(self, distributed_port: int):
        """
-        Retriever initalization function, needs to be called from the training process. The function sets some common parameters
+        Retriever initialization function, needs to be called from the training process. The function sets some common parameters
        and environment variables. On top of that, (only) the main process in the process group loads the index into memory.

        Args:
@@ -56,7 +56,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):

        logger.info("initializing retrieval")

-        # initializing a separate process group for retrievel as the default
+        # initializing a separate process group for retrieval as the default
        # nccl backend doesn't support gather/scatter operations while gloo
        # is too slow to replace nccl for the core gpu communication
        if dist.is_initialized():
@@ -101,7 +101,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):
            n_docs (:obj:`int`):
                The number of docs retrieved per query.

-        Ouput:
+        Output:
            retrieved_doc_embeds (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs, dim)`
                The retrieval embeddings of the retrieved docs per query.
            doc_ids (:obj:`np.ndarray` of shape :obj:`batch_size, n_docs`)
--- a/examples/rag/eval_rag.py
+++ b/examples/rag/eval_rag.py
@@ -176,7 +176,7 @@ def get_args():
        choices=["e2e", "retrieval"],
        default="e2e",
        type=str,
-        help="Evaluation mode, e2e calculates exact match and F1 of the downstream task, retrieval calulates precision@k.",
+        help="Evaluation mode, e2e calculates exact match and F1 of the downstream task, retrieval calculates precision@k.",
    )
    parser.add_argument("--k", default=1, type=int, help="k for the precision@k calculation")
    parser.add_argument(
@@ -206,7 +206,7 @@ def get_args():
        "--predictions_path",
        type=str,
        default="predictions.txt",
-        help="Name of the predictions file, to be stored in the checkpoints directry",
+        help="Name of the predictions file, to be stored in the checkpoints directory",
    )
    parser.add_argument(
        "--eval_all_checkpoints",
--- a/examples/rag/use_own_knowledge_dataset.py
+++ b/examples/rag/use_own_knowledge_dataset.py
@@ -26,7 +26,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"


 def split_text(text: str, n=100, character=" ") -> List[str]:
-    """Split the text every ``n``-th occurence of ``character``"""
+    """Split the text every ``n``-th occurrence of ``character``"""
    text = text.split(character)
    return [character.join(text[i : i + n]).strip() for i in range(0, len(text), n)]

--- a/examples/seq2seq/bertabs/configuration_bertabs.py
+++ b/examples/seq2seq/bertabs/configuration_bertabs.py
@@ -44,7 +44,7 @@ class BertAbsConfig(PretrainedConfig):
        enc_ff_size: int
            The size of the encoder's feed-forward layers.
        enc_dropout: int
-            The dropout probabilitiy for all fully connected layers in the
+            The dropout probability for all fully connected layers in the
            embeddings, layers, pooler and also the attention probabilities in
            the encoder.
        dec_layer: int
@@ -56,7 +56,7 @@ class BertAbsConfig(PretrainedConfig):
        dec_ff_size: int
            The size of the decoder's feed-forward layers.
        dec_dropout: int
-            The dropout probabilitiy for all fully connected layers in the
+            The dropout probability for all fully connected layers in the
            embeddings, layers, pooler and also the attention probabilities in
            the decoder.
    """
--- a/examples/seq2seq/bertabs/modeling_bertabs.py
+++ b/examples/seq2seq/bertabs/modeling_bertabs.py
@@ -152,7 +152,7 @@ class TransformerDecoder(nn.Module):
       dropout (float): dropout parameters
       embeddings (:obj:`onmt.modules.Embeddings`):
          embeddings to use, should have positional encodings
-       attn_type (str): if using a seperate copy attention
+       attn_type (str): if using a separate copy attention
    """

    def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings, vocab_size):
@@ -817,11 +817,7 @@ class Translator(object):

        Args:
           batch (:obj:`Batch`): a batch from a dataset object
-           data (:obj:`Dataset`): the dataset object
           fast (bool): enables fast beam search (may not support all features)
-
-        Todo:
-           Shouldn't need the original dataset.
        """
        with torch.no_grad():
            return self._fast_translate_batch(batch, self.max_length, min_length=self.min_length)
--- a/examples/seq2seq/convert_model_to_fp16.py
+++ b/examples/seq2seq/convert_model_to_fp16.py
@@ -12,7 +12,7 @@ def convert(src_path: str, map_location: str = "cpu", save_path: Union[str, None
    state_dict = torch.load(src_path, map_location=map_location)
    for k, v in tqdm(state_dict.items()):
        if not isinstance(v, torch.Tensor):
-            raise TypeError("FP16 conversion only works on paths that are saved state dics, like pytorch_model.bin")
+            raise TypeError("FP16 conversion only works on paths that are saved state dicts, like pytorch_model.bin")
        state_dict[k] = v.half()
    if save_path is None:  # overwrite src_path
        save_path = src_path