Fix doc errors and typos across the board (#8139)

* Fix doc errors and typos across the board

* Fix a typo

* Fix the CI

* Fix more typos

* Fix CI

* More fixes

* Fix CI

* More fixes

* More fixes
This commit is contained in:
Santiago Castro
2020-10-29 10:33:33 -04:00
committed by GitHub
parent 4731a00c3e
commit 969859d5f6
160 changed files with 342 additions and 364 deletions

View File

@@ -33,7 +33,7 @@ def main(args):
remaining_count = 0 # Number of remaining (not pruned) params in the encoder
encoder_count = 0 # Number of params in the encoder
print("name".ljust(60, " "), "Remaining Weights %", "Remaning Weight")
print("name".ljust(60, " "), "Remaining Weights %", "Remaining Weight")
for name, param in st.items():
if "encoder" not in name:
continue

View File

@@ -591,7 +591,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
# If a 2D ou 3D attention mask is provided for the cross-attention
# we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
if self.config.is_decoder and encoder_hidden_states is not None:
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
@@ -631,7 +631,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to fload if need + fp16 compatibility
) # switch to float if need + fp16 compatibility
else:
head_mask = [None] * self.config.num_hidden_layers

View File

@@ -225,7 +225,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
desc="Epoch",
disable=args.local_rank not in [-1, 0],
)
set_seed(args) # Added here for reproductibility
set_seed(args) # Added here for reproducibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
@@ -705,7 +705,7 @@ def main():
"--final_lambda",
default=0.0,
type=float,
help="Regularization intensity (used in conjunction with `regulariation`.",
help="Regularization intensity (used in conjunction with `regularization`.",
)
parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")
@@ -816,7 +816,7 @@ def main():
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend="nccl")

View File

@@ -231,7 +231,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
train_iterator = trange(
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
)
# Added here for reproductibility
# Added here for reproducibility
set_seed(args)
for _ in train_iterator:
@@ -824,7 +824,7 @@ def main():
"--final_lambda",
default=0.0,
type=float,
help="Regularization intensity (used in conjunction with `regulariation`.",
help="Regularization intensity (used in conjunction with `regularization`.",
)
parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")
@@ -977,7 +977,7 @@ def main():
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend="nccl")