[Docs] Fix spelling and grammar mistakes (#28825)
* Fix typos and grammar mistakes in docs and examples * Fix typos in docstrings and comments * Fix spelling of `tokenizer` in model tests * Remove erroneous spaces in decorators * Remove extra spaces in Markdown link texts
This commit is contained in:
@@ -289,7 +289,7 @@ def main():
|
||||
)
|
||||
logger.info(f"Training/evaluation parameters {training_args}")
|
||||
|
||||
# 3. Detecting last checkpoint and eventualy continue from last checkpoint
|
||||
# 3. Detecting last checkpoint and eventually continue from last checkpoint
|
||||
last_checkpoint = None
|
||||
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
|
||||
last_checkpoint = get_last_checkpoint(training_args.output_dir)
|
||||
@@ -528,7 +528,7 @@ def main():
|
||||
# Transform images on the fly as doing it on the whole dataset takes too much time.
|
||||
test_dataset.set_transform(transform_images)
|
||||
|
||||
# 8. Initalize our trainer
|
||||
# 8. Initialize our trainer
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
|
||||
@@ -114,10 +114,10 @@ from datasets import load_dataset
|
||||
# example 1: local folder
|
||||
dataset = load_dataset("imagefolder", data_dir="path_to_your_folder")
|
||||
|
||||
# example 2: local files (suppoted formats are tar, gzip, zip, xz, rar, zstd)
|
||||
# example 2: local files (supported formats are tar, gzip, zip, xz, rar, zstd)
|
||||
dataset = load_dataset("imagefolder", data_files="path_to_zip_file")
|
||||
|
||||
# example 3: remote files (suppoted formats are tar, gzip, zip, xz, rar, zstd)
|
||||
# example 3: remote files (supported formats are tar, gzip, zip, xz, rar, zstd)
|
||||
dataset = load_dataset("imagefolder", data_files="https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip")
|
||||
|
||||
# example 4: providing several splits
|
||||
|
||||
@@ -404,7 +404,7 @@ def main():
|
||||
# Set the validation transforms
|
||||
dataset["validation"].set_transform(val_transforms)
|
||||
|
||||
# Initalize our trainer
|
||||
# Initialize our trainer
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
|
||||
@@ -25,7 +25,7 @@ NOTE: If you encounter problems/have suggestions for improvement, open an issue
|
||||
|
||||
## SimMIM
|
||||
|
||||
The `run_mim.py` script can be used to pre-train any Transformer-based vision model in the library (concretly, any model supported by the `AutoModelForMaskedImageModeling` API) for masked image modeling as proposed in [SimMIM: A Simple Framework for Masked Image Modeling](https://arxiv.org/abs/2111.09886) using PyTorch.
|
||||
The `run_mim.py` script can be used to pre-train any Transformer-based vision model in the library (concretely, any model supported by the `AutoModelForMaskedImageModeling` API) for masked image modeling as proposed in [SimMIM: A Simple Framework for Masked Image Modeling](https://arxiv.org/abs/2111.09886) using PyTorch.
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/simmim_architecture.jpg"
|
||||
alt="drawing" width="300"/>
|
||||
|
||||
@@ -90,7 +90,7 @@ def parse_args():
|
||||
default=128,
|
||||
help=(
|
||||
"The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
|
||||
" sequences shorter will be padded if `--pad_to_max_lengh` is passed."
|
||||
" sequences shorter will be padded if `--pad_to_max_length` is passed."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
|
||||
@@ -378,7 +378,7 @@ def main():
|
||||
)
|
||||
|
||||
# Preprocessing the datasets.
|
||||
# Preprocessing is slighlty different for training and evaluation.
|
||||
# Preprocessing is slightly different for training and evaluation.
|
||||
if training_args.do_train:
|
||||
column_names = raw_datasets["train"].column_names
|
||||
elif training_args.do_eval:
|
||||
|
||||
@@ -354,7 +354,7 @@ def main():
|
||||
)
|
||||
|
||||
# Preprocessing the datasets.
|
||||
# Preprocessing is slighlty different for training and evaluation.
|
||||
# Preprocessing is slightly different for training and evaluation.
|
||||
if training_args.do_train:
|
||||
column_names = raw_datasets["train"].column_names
|
||||
elif training_args.do_eval:
|
||||
|
||||
@@ -119,7 +119,7 @@ def parse_args():
|
||||
default=384,
|
||||
help=(
|
||||
"The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
|
||||
" sequences shorter will be padded if `--pad_to_max_lengh` is passed."
|
||||
" sequences shorter will be padded if `--pad_to_max_length` is passed."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -385,7 +385,7 @@ def main():
|
||||
)
|
||||
|
||||
# Preprocessing the datasets.
|
||||
# Preprocessing is slighlty different for training and evaluation.
|
||||
# Preprocessing is slightly different for training and evaluation.
|
||||
column_names = raw_datasets["train"].column_names
|
||||
|
||||
question_column_name = "question" if "question" in column_names else column_names[0]
|
||||
@@ -508,7 +508,7 @@ def main():
|
||||
raise ValueError("--do_train requires a train dataset")
|
||||
train_dataset = raw_datasets["train"]
|
||||
if args.max_train_samples is not None:
|
||||
# We will select sample from whole data if agument is specified
|
||||
# We will select sample from whole data if argument is specified
|
||||
train_dataset = train_dataset.select(range(args.max_train_samples))
|
||||
# Create train feature from dataset
|
||||
with accelerator.main_process_first():
|
||||
@@ -877,7 +877,7 @@ def main():
|
||||
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
||||
)
|
||||
|
||||
# intialize all lists to collect the batches
|
||||
# initialize all lists to collect the batches
|
||||
all_start_top_log_probs = []
|
||||
all_start_top_index = []
|
||||
all_end_top_log_probs = []
|
||||
@@ -936,7 +936,7 @@ def main():
|
||||
logger.info(f"Evaluation metrics: {eval_metric}")
|
||||
|
||||
if args.do_predict:
|
||||
# intialize all lists to collect the batches
|
||||
# initialize all lists to collect the batches
|
||||
|
||||
all_start_top_log_probs = []
|
||||
all_start_top_index = []
|
||||
|
||||
@@ -123,7 +123,7 @@ def parse_args():
|
||||
default=384,
|
||||
help=(
|
||||
"The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
|
||||
" sequences shorter will be padded if `--pad_to_max_lengh` is passed."
|
||||
" sequences shorter will be padded if `--pad_to_max_length` is passed."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -460,7 +460,7 @@ def main():
|
||||
model = AutoModelForQuestionAnswering.from_config(config, trust_remote_code=args.trust_remote_code)
|
||||
|
||||
# Preprocessing the datasets.
|
||||
# Preprocessing is slighlty different for training and evaluation.
|
||||
# Preprocessing is slightly different for training and evaluation.
|
||||
|
||||
column_names = raw_datasets["train"].column_names
|
||||
|
||||
@@ -561,7 +561,7 @@ def main():
|
||||
raise ValueError("--do_train requires a train dataset")
|
||||
train_dataset = raw_datasets["train"]
|
||||
if args.max_train_samples is not None:
|
||||
# We will select sample from whole data if agument is specified
|
||||
# We will select sample from whole data if argument is specified
|
||||
train_dataset = train_dataset.select(range(args.max_train_samples))
|
||||
|
||||
# Create train feature from dataset
|
||||
|
||||
@@ -559,7 +559,7 @@ def main():
|
||||
raise ValueError("--do_train requires a train dataset")
|
||||
train_dataset = raw_datasets["train"]
|
||||
if data_args.max_train_samples is not None:
|
||||
# We will select sample from whole data if agument is specified
|
||||
# We will select sample from whole data if argument is specified
|
||||
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||
train_dataset = train_dataset.select(range(max_train_samples))
|
||||
# Create train feature from dataset
|
||||
|
||||
@@ -503,7 +503,7 @@ def main():
|
||||
# Set the validation transforms
|
||||
dataset["validation"].set_transform(preprocess_val)
|
||||
|
||||
# Initalize our trainer
|
||||
# Initialize our trainer
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
|
||||
@@ -446,7 +446,7 @@ A very common use case is to leverage a pretrained speech encoder model,
|
||||
|
||||
By pairing a pretrained speech model with a pretrained text model, the warm-started model has prior knowledge of both the source audio and target text domains. However, the cross-attention weights between the encoder and decoder are randomly initialised. Thus, the model requires fine-tuning to learn the cross-attention weights and align the encoder mapping with that of the decoder. We can perform this very fine-tuning procedure using the example script.
|
||||
|
||||
As an example, let's instantiate a *Wav2Vec2-2-Bart* model with the `SpeechEnocderDecoderModel` framework. First create an empty repo on `hf.co`:
|
||||
As an example, let's instantiate a *Wav2Vec2-2-Bart* model with the `SpeechEncoderDecoderModel` framework. First create an empty repo on `hf.co`:
|
||||
|
||||
```bash
|
||||
huggingface-cli repo create wav2vec2-2-bart-base
|
||||
@@ -506,7 +506,7 @@ Having warm-started the speech-encoder-decoder model under `<your-user-name>/wav
|
||||
In the script [`run_speech_recognition_seq2seq`], we load the warm-started model,
|
||||
feature extractor, and tokenizer, process a speech recognition dataset,
|
||||
and subsequently make use of the [`Seq2SeqTrainer`](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainer) to train our system.
|
||||
Note that it is important to align the target transcriptions with the decoder's vocabulary. For example, the [`Librispeech`](https://huggingface.co/datasets/librispeech_asr) dataset only contains captilized letters in the transcriptions,
|
||||
Note that it is important to align the target transcriptions with the decoder's vocabulary. For example, the [`Librispeech`](https://huggingface.co/datasets/librispeech_asr) dataset only contains capitalized letters in the transcriptions,
|
||||
whereas BART was pretrained mostly on normalized text. Thus, it is recommended to add the argument
|
||||
`--do_lower_case` to the fine-tuning script when using a warm-started `SpeechEncoderDecoderModel`.
|
||||
The model is fine-tuned on the standard cross-entropy language modeling
|
||||
|
||||
@@ -146,7 +146,7 @@ class DataTrainingArguments:
|
||||
" should be trained on in ISO 693-3 code, e.g. `tur` for Turkish"
|
||||
" Wav2Vec2's MMS ISO codes can be looked up here: https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html"
|
||||
" If you are not training the adapter layers on a language, simply choose"
|
||||
" another accronym that fits your data."
|
||||
" another acronym that fits your data."
|
||||
)
|
||||
},
|
||||
)
|
||||
|
||||
@@ -129,7 +129,7 @@ python run_classification.py \
|
||||
--num_train_epochs 15 \
|
||||
--output_dir /tmp/${dataset}_${subset}/
|
||||
```
|
||||
It results in a Micro F1 score of around 0.82 without any text and label filtering. Note that you have to explictly remove the "unused" split from the dataset, since it is not used for classification.
|
||||
It results in a Micro F1 score of around 0.82 without any text and label filtering. Note that you have to explicitly remove the "unused" split from the dataset, since it is not used for classification.
|
||||
|
||||
### Mixed precision training
|
||||
|
||||
|
||||
@@ -83,7 +83,7 @@ class DataTrainingArguments:
|
||||
metadata={
|
||||
"help": (
|
||||
"The name of the text column in the input dataset or a CSV/JSON file. "
|
||||
'If not specified, will use the "sentence" column for single/multi-label classifcation task.'
|
||||
'If not specified, will use the "sentence" column for single/multi-label classification task.'
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -121,7 +121,7 @@ class DataTrainingArguments:
|
||||
metadata={
|
||||
"help": (
|
||||
"The name of the label column in the input dataset or a CSV/JSON file. "
|
||||
'If not specified, will use the "label" column for single/multi-label classifcation task'
|
||||
'If not specified, will use the "label" column for single/multi-label classification task'
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -260,7 +260,7 @@ class ModelArguments:
|
||||
|
||||
|
||||
def get_label_list(raw_dataset, split="train") -> List[str]:
|
||||
"""Get the list of labels from a mutli-label dataset"""
|
||||
"""Get the list of labels from a multi-label dataset"""
|
||||
|
||||
if isinstance(raw_dataset[split]["label"][0], list):
|
||||
label_list = [label for sample in raw_dataset[split]["label"] for label in sample]
|
||||
@@ -343,7 +343,7 @@ def main():
|
||||
|
||||
# Get the datasets: you can either provide your own CSV/JSON training and evaluation files, or specify a dataset name
|
||||
# to load from huggingface/datasets. In either case, you can specify the key of the column(s) containing the text and
|
||||
# the key of the column containing the label. If multiple columns are specified for the text, they will be joined togather
|
||||
# the key of the column containing the label. If multiple columns are specified for the text, they will be joined together
|
||||
# for the actual text value.
|
||||
# In distributed training, the load_dataset function guarantees that only one local process can concurrently
|
||||
# download the dataset.
|
||||
|
||||
@@ -18,7 +18,7 @@ limitations under the License.
|
||||
|
||||
Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-generation/run_generation.py).
|
||||
|
||||
Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, GPTJ, Transformer-XL, XLNet, CTRL, BLOOM, LLAMA, OPT.
|
||||
Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, GPT-J, Transformer-XL, XLNet, CTRL, BLOOM, LLAMA, OPT.
|
||||
A similar script is used for our official demo [Write With Transformer](https://transformer.huggingface.co), where you
|
||||
can try out the different models available in the library.
|
||||
|
||||
|
||||
@@ -175,7 +175,7 @@ def parse_args():
|
||||
default=128,
|
||||
help=(
|
||||
"The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
|
||||
" sequences shorter will be padded if `--pad_to_max_lengh` is passed."
|
||||
" sequences shorter will be padded if `--pad_to_max_length` is passed."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
|
||||
Reference in New Issue
Block a user