Fix doc errors and typos across the board (#8139)
* Fix doc errors and typos across the board * Fix a typo * Fix the CI * Fix more typos * Fix CI * More fixes * Fix CI * More fixes * More fixes
This commit is contained in:
@@ -33,7 +33,7 @@ def main(args):
|
||||
remaining_count = 0 # Number of remaining (not pruned) params in the encoder
|
||||
encoder_count = 0 # Number of params in the encoder
|
||||
|
||||
print("name".ljust(60, " "), "Remaining Weights %", "Remaning Weight")
|
||||
print("name".ljust(60, " "), "Remaining Weights %", "Remaining Weight")
|
||||
for name, param in st.items():
|
||||
if "encoder" not in name:
|
||||
continue
|
||||
|
||||
@@ -591,7 +591,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
|
||||
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
|
||||
|
||||
# If a 2D ou 3D attention mask is provided for the cross-attention
|
||||
# we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
|
||||
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||
if self.config.is_decoder and encoder_hidden_states is not None:
|
||||
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
|
||||
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
|
||||
@@ -631,7 +631,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
|
||||
) # We can specify head_mask for each layer
|
||||
head_mask = head_mask.to(
|
||||
dtype=next(self.parameters()).dtype
|
||||
) # switch to fload if need + fp16 compatibility
|
||||
) # switch to float if need + fp16 compatibility
|
||||
else:
|
||||
head_mask = [None] * self.config.num_hidden_layers
|
||||
|
||||
|
||||
@@ -225,7 +225,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
||||
desc="Epoch",
|
||||
disable=args.local_rank not in [-1, 0],
|
||||
)
|
||||
set_seed(args) # Added here for reproductibility
|
||||
set_seed(args) # Added here for reproducibility
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
@@ -705,7 +705,7 @@ def main():
|
||||
"--final_lambda",
|
||||
default=0.0,
|
||||
type=float,
|
||||
help="Regularization intensity (used in conjunction with `regulariation`.",
|
||||
help="Regularization intensity (used in conjunction with `regularization`.",
|
||||
)
|
||||
|
||||
parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")
|
||||
@@ -816,7 +816,7 @@ def main():
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
|
||||
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
|
||||
@@ -231,7 +231,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
||||
train_iterator = trange(
|
||||
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
|
||||
)
|
||||
# Added here for reproductibility
|
||||
# Added here for reproducibility
|
||||
set_seed(args)
|
||||
|
||||
for _ in train_iterator:
|
||||
@@ -824,7 +824,7 @@ def main():
|
||||
"--final_lambda",
|
||||
default=0.0,
|
||||
type=float,
|
||||
help="Regularization intensity (used in conjunction with `regulariation`.",
|
||||
help="Regularization intensity (used in conjunction with `regularization`.",
|
||||
)
|
||||
|
||||
parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")
|
||||
@@ -977,7 +977,7 @@ def main():
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
|
||||
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
|
||||
Reference in New Issue
Block a user