[Docs] Fix spelling and grammar mistakes (#28825)
* Fix typos and grammar mistakes in docs and examples
* Fix typos in docstrings and comments
* Fix spelling of `tokenizer` in model tests
* Remove erroneous spaces in decorators
* Remove extra spaces in Markdown link texts
This commit is contained in:
@@ -148,7 +148,7 @@ def train(args, train_dataset, model, tokenizer):
|
||||
steps_trained_in_current_epoch = 0
|
||||
# Check if continuing training from a checkpoint
|
||||
if os.path.exists(args.model_name_or_path):
|
||||
# set global_step to gobal_step of last saved checkpoint from model path
|
||||
# set global_step to global_step of last saved checkpoint from model path
|
||||
global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
|
||||
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
@@ -169,7 +169,7 @@ def train(args, train_dataset, model, tokenizer):
|
||||
desc="Epoch",
|
||||
disable=args.local_rank not in [-1, 0],
|
||||
)
|
||||
set_seed(args) # Added here for reproductibility
|
||||
set_seed(args) # Added here for reproducibility
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
@@ -614,7 +614,7 @@ def main():
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
args.n_gpu = torch.cuda.device_count()
|
||||
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
|
||||
@@ -60,7 +60,7 @@ def is_autogenerated(example, scan_width=5):
|
||||
def is_config_or_test(example, scan_width=5, coeff=0.05):
|
||||
"""Check if file is a configuration file or a unit test by :
|
||||
1- looking for keywords in the first few lines of the file.
|
||||
2- counting number of occurence of the words 'config' and 'test' with respect to number of lines.
|
||||
2- counting number of occurrence of the words 'config' and 'test' with respect to number of lines.
|
||||
"""
|
||||
|
||||
keywords = ["unit tests", "test file", "configuration file"]
|
||||
|
||||
@@ -162,7 +162,7 @@ def train(args, train_dataset, model, tokenizer, train_highway=False):
|
||||
tr_loss, logging_loss = 0.0, 0.0
|
||||
model.zero_grad()
|
||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
|
||||
set_seed(args) # Added here for reproducibility (even between python 2 and 3)
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
@@ -491,7 +491,7 @@ def main():
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
@@ -566,7 +566,7 @@ def main():
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
args.n_gpu = torch.cuda.device_count()
|
||||
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
|
||||
@@ -165,7 +165,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
||||
# Check if continuing training from a checkpoint
|
||||
if os.path.exists(args.model_name_or_path):
|
||||
try:
|
||||
# set global_step to gobal_step of last saved checkpoint from model path
|
||||
# set global_step to global_step of last saved checkpoint from model path
|
||||
checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
|
||||
global_step = int(checkpoint_suffix)
|
||||
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
@@ -183,7 +183,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
||||
train_iterator = trange(
|
||||
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
|
||||
)
|
||||
# Added here for reproductibility
|
||||
# Added here for reproducibility
|
||||
set_seed(args)
|
||||
|
||||
for _ in train_iterator:
|
||||
@@ -731,7 +731,7 @@ def main():
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
|
||||
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
|
||||
@@ -134,7 +134,7 @@ def train(args, train_dataset, model, tokenizer, criterion):
|
||||
best_f1, n_no_improve = 0, 0
|
||||
model.zero_grad()
|
||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||
set_seed(args) # Added here for reproductibility
|
||||
set_seed(args) # Added here for reproducibility
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
@@ -384,7 +384,7 @@ def main():
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
@@ -460,7 +460,7 @@ def main():
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
|
||||
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
|
||||
@@ -275,7 +275,7 @@ else:
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||
|
||||
# Preprocessing the datasets.
|
||||
# Preprocessing is slighlty different for training and evaluation.
|
||||
# Preprocessing is slightly different for training and evaluation.
|
||||
|
||||
column_names = raw_datasets["validation"].column_names
|
||||
|
||||
|
||||
@@ -349,7 +349,7 @@ def main():
|
||||
)
|
||||
|
||||
# Preprocessing the datasets.
|
||||
# Preprocessing is slighlty different for training and evaluation.
|
||||
# Preprocessing is slightly different for training and evaluation.
|
||||
if training_args.do_train or model_args.do_calib:
|
||||
column_names = raw_datasets["train"].column_names
|
||||
elif training_args.do_eval or model_args.save_onnx:
|
||||
@@ -448,7 +448,7 @@ def main():
|
||||
raise ValueError("--do_train requires a train dataset")
|
||||
train_dataset = raw_datasets["train"]
|
||||
if data_args.max_train_samples is not None:
|
||||
# We will select sample from whole data if agument is specified
|
||||
# We will select sample from whole data if argument is specified
|
||||
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||
train_dataset = train_dataset.select(range(max_train_samples))
|
||||
# Create train feature from dataset
|
||||
|
||||
@@ -239,7 +239,7 @@ For example,
|
||||
./save_len_file.py Helsinki-NLP/opus-mt-en-ro wmt_en_ro
|
||||
./dynamic_bs_example.sh --max_tokens_per_batch=2000 --output_dir benchmark_dynamic_bs
|
||||
```
|
||||
splits `wmt_en_ro/train` into 11,197 uneven lengthed batches and can finish 1 epoch in 8 minutes on a v100.
|
||||
splits `wmt_en_ro/train` into 11,197 uneven length batches and can finish 1 epoch in 8 minutes on a v100.
|
||||
|
||||
For comparison,
|
||||
```bash
|
||||
|
||||
@@ -94,7 +94,7 @@ def run_generate(verbose=True):
|
||||
parser.add_argument("--score_path", type=str, required=False, default="metrics.json", help="where to save metrics")
|
||||
parser.add_argument("--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.")
|
||||
parser.add_argument(
|
||||
"--prefix", type=str, required=False, default=None, help="will be added to the begininng of src examples"
|
||||
"--prefix", type=str, required=False, default=None, help="will be added to the beginning of src examples"
|
||||
)
|
||||
parser.add_argument("--task", type=str, default="summarization", help="used for task_specific_params + metrics")
|
||||
parser.add_argument("--bs", type=int, default=8, required=False, help="batch size")
|
||||
|
||||
@@ -69,12 +69,12 @@ class ModelArguments:
|
||||
hidden_dropout: Optional[float] = field(
|
||||
default=0.1,
|
||||
metadata={
|
||||
"help": "The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler."
|
||||
"help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
|
||||
},
|
||||
)
|
||||
feat_proj_dropout: Optional[float] = field(
|
||||
default=0.1,
|
||||
metadata={"help": "The dropout probabilitiy for all 1D convolutional layers in feature extractor."},
|
||||
metadata={"help": "The dropout probability for all 1D convolutional layers in feature extractor."},
|
||||
)
|
||||
mask_time_prob: Optional[float] = field(
|
||||
default=0.05,
|
||||
|
||||
Reference in New Issue
Block a user