[Docs] Fix spelling and grammar mistakes (#28825)

* Fix typos and grammar mistakes in docs and examples

* Fix typos in docstrings and comments

* Fix spelling of `tokenizer` in model tests

* Remove erroneous spaces in decorators

* Remove extra spaces in Markdown link texts
This commit is contained in:
Klaus Hipp
2024-02-02 08:45:00 +01:00
committed by GitHub
parent 2418c64a1c
commit 721ee783ca
134 changed files with 185 additions and 186 deletions

View File

@@ -148,7 +148,7 @@ def train(args, train_dataset, model, tokenizer):
steps_trained_in_current_epoch = 0
# Check if continuing training from a checkpoint
if os.path.exists(args.model_name_or_path):
# set global_step to gobal_step of last saved checkpoint from model path
# set global_step to global_step of last saved checkpoint from model path
global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
@@ -169,7 +169,7 @@ def train(args, train_dataset, model, tokenizer):
desc="Epoch",
disable=args.local_rank not in [-1, 0],
)
set_seed(args) # Added here for reproductibility
set_seed(args) # Added here for reproducibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
@@ -614,7 +614,7 @@ def main():
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend="nccl")

View File

@@ -60,7 +60,7 @@ def is_autogenerated(example, scan_width=5):
def is_config_or_test(example, scan_width=5, coeff=0.05):
"""Check if file is a configuration file or a unit test by :
1- looking for keywords in the first few lines of the file.
2- counting number of occurence of the words 'config' and 'test' with respect to number of lines.
2- counting number of occurrence of the words 'config' and 'test' with respect to number of lines.
"""
keywords = ["unit tests", "test file", "configuration file"]

View File

@@ -162,7 +162,7 @@ def train(args, train_dataset, model, tokenizer, train_highway=False):
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
set_seed(args) # Added here for reproducibility (even between python 2 and 3)
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
@@ -491,7 +491,7 @@ def main():
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
@@ -566,7 +566,7 @@ def main():
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend="nccl")

View File

@@ -165,7 +165,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
# Check if continuing training from a checkpoint
if os.path.exists(args.model_name_or_path):
try:
# set global_step to gobal_step of last saved checkpoint from model path
# set global_step to global_step of last saved checkpoint from model path
checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
global_step = int(checkpoint_suffix)
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
@@ -183,7 +183,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
train_iterator = trange(
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
)
# Added here for reproductibility
# Added here for reproducibility
set_seed(args)
for _ in train_iterator:
@@ -731,7 +731,7 @@ def main():
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend="nccl")

View File

@@ -134,7 +134,7 @@ def train(args, train_dataset, model, tokenizer, criterion):
best_f1, n_no_improve = 0, 0
model.zero_grad()
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args) # Added here for reproductibility
set_seed(args) # Added here for reproducibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
@@ -384,7 +384,7 @@ def main():
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
@@ -460,7 +460,7 @@ def main():
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend="nccl")

View File

@@ -275,7 +275,7 @@ else:
# https://huggingface.co/docs/datasets/loading_datasets.
# Preprocessing the datasets.
# Preprocessing is slighlty different for training and evaluation.
# Preprocessing is slightly different for training and evaluation.
column_names = raw_datasets["validation"].column_names

View File

@@ -349,7 +349,7 @@ def main():
)
# Preprocessing the datasets.
# Preprocessing is slighlty different for training and evaluation.
# Preprocessing is slightly different for training and evaluation.
if training_args.do_train or model_args.do_calib:
column_names = raw_datasets["train"].column_names
elif training_args.do_eval or model_args.save_onnx:
@@ -448,7 +448,7 @@ def main():
raise ValueError("--do_train requires a train dataset")
train_dataset = raw_datasets["train"]
if data_args.max_train_samples is not None:
# We will select sample from whole data if agument is specified
# We will select sample from whole data if argument is specified
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
train_dataset = train_dataset.select(range(max_train_samples))
# Create train feature from dataset

View File

@@ -239,7 +239,7 @@ For example,
./save_len_file.py Helsinki-NLP/opus-mt-en-ro wmt_en_ro
./dynamic_bs_example.sh --max_tokens_per_batch=2000 --output_dir benchmark_dynamic_bs
```
splits `wmt_en_ro/train` into 11,197 uneven lengthed batches and can finish 1 epoch in 8 minutes on a v100.
splits `wmt_en_ro/train` into 11,197 uneven length batches and can finish 1 epoch in 8 minutes on a v100.
For comparison,
```bash

View File

@@ -94,7 +94,7 @@ def run_generate(verbose=True):
parser.add_argument("--score_path", type=str, required=False, default="metrics.json", help="where to save metrics")
parser.add_argument("--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.")
parser.add_argument(
"--prefix", type=str, required=False, default=None, help="will be added to the begininng of src examples"
"--prefix", type=str, required=False, default=None, help="will be added to the beginning of src examples"
)
parser.add_argument("--task", type=str, default="summarization", help="used for task_specific_params + metrics")
parser.add_argument("--bs", type=int, default=8, required=False, help="batch size")

View File

@@ -69,12 +69,12 @@ class ModelArguments:
hidden_dropout: Optional[float] = field(
default=0.1,
metadata={
"help": "The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler."
"help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
},
)
feat_proj_dropout: Optional[float] = field(
default=0.1,
metadata={"help": "The dropout probabilitiy for all 1D convolutional layers in feature extractor."},
metadata={"help": "The dropout probability for all 1D convolutional layers in feature extractor."},
)
mask_time_prob: Optional[float] = field(
default=0.05,