Fix doc errors and typos across the board (#8139)

* Fix doc errors and typos across the board * Fix a typo * Fix the CI * Fix more typos * Fix CI * More fixes * Fix CI * More fixes * More fixes
2020-10-29 10:33:33 -04:00
parent 4731a00c3e
commit 969859d5f6
160 changed files with 342 additions and 364 deletions
--- a/examples/movement-pruning/counts_parameters.py
+++ b/examples/movement-pruning/counts_parameters.py
@@ -33,7 +33,7 @@ def main(args):
    remaining_count = 0  # Number of remaining (not pruned) params in the encoder
    encoder_count = 0  # Number of params in the encoder

-    print("name".ljust(60, " "), "Remaining Weights %", "Remaning Weight")
+    print("name".ljust(60, " "), "Remaining Weights %", "Remaining Weight")
    for name, param in st.items():
        if "encoder" not in name:
            continue
--- a/examples/movement-pruning/emmental/modeling_bert_masked.py
+++ b/examples/movement-pruning/emmental/modeling_bert_masked.py
@@ -591,7 +591,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
@@ -631,7 +631,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
                )  # We can specify head_mask for each layer
            head_mask = head_mask.to(
                dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
+            )  # switch to float if need + fp16 compatibility
        else:
            head_mask = [None] * self.config.num_hidden_layers

--- a/examples/movement-pruning/masked_run_glue.py
+++ b/examples/movement-pruning/masked_run_glue.py
@@ -225,7 +225,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
-    set_seed(args)  # Added here for reproductibility
+    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
@@ -705,7 +705,7 @@ def main():
        "--final_lambda",
        default=0.0,
        type=float,
-        help="Regularization intensity (used in conjunction with `regulariation`.",
+        help="Regularization intensity (used in conjunction with `regularization`.",
    )

    parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")
@@ -816,7 +816,7 @@ def main():
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
--- a/examples/movement-pruning/masked_run_squad.py
+++ b/examples/movement-pruning/masked_run_squad.py
@@ -231,7 +231,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
-    # Added here for reproductibility
+    # Added here for reproducibility
    set_seed(args)

    for _ in train_iterator:
@@ -824,7 +824,7 @@ def main():
        "--final_lambda",
        default=0.0,
        type=float,
-        help="Regularization intensity (used in conjunction with `regulariation`.",
+        help="Regularization intensity (used in conjunction with `regularization`.",
    )

    parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")
@@ -977,7 +977,7 @@ def main():
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")