From 238449414f88d94ded35e80459bb6412d8ab42cf Mon Sep 17 00:00:00 2001 From: Maria Khalusova Date: Wed, 25 Jan 2023 11:33:39 -0500 Subject: [PATCH] Documentation code sample fixes (#21302) * Fixed the following: pipe -> pipeline; out in pipe(data()) is a list of dicts, not a dict * Fixed the TypeError: __init__() missing 1 required positional argument: 'key' * Added a tip: code sample requires additional libraries to run * Fixed custom config's name * added seqeval to the required libraries * fixed a missing dependency, fixed metric naming, added checkpoint to fix the datacollator * added checkpoint to fix the datacollator, added missing dependency --- docs/source/en/create_a_model.mdx | 4 ++-- docs/source/en/pipeline_tutorial.mdx | 18 +++++++++++++++--- docs/source/en/tasks/summarization.mdx | 13 +++++++------ docs/source/en/tasks/token_classification.mdx | 2 +- docs/source/en/tasks/translation.mdx | 15 ++++++++------- 5 files changed, 33 insertions(+), 19 deletions(-) diff --git a/docs/source/en/create_a_model.mdx b/docs/source/en/create_a_model.mdx index b0bafa4589..5c736f1d79 100644 --- a/docs/source/en/create_a_model.mdx +++ b/docs/source/en/create_a_model.mdx @@ -95,7 +95,7 @@ Once you are satisfied with your model configuration, you can save it with [`~Pr To reuse the configuration file, load it with [`~PretrainedConfig.from_pretrained`]: ```py ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") ``` @@ -115,7 +115,7 @@ Load your custom configuration attributes into the model: ```py >>> from transformers import DistilBertModel ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") >>> model = DistilBertModel(my_config) ``` diff --git a/docs/source/en/pipeline_tutorial.mdx b/docs/source/en/pipeline_tutorial.mdx 
index 4be43484e0..8560d856f3 100644 --- a/docs/source/en/pipeline_tutorial.mdx +++ b/docs/source/en/pipeline_tutorial.mdx @@ -156,10 +156,10 @@ def data(): yield f"My example {i}" -pipe = pipe(model="gpt2", device=0) +pipe = pipeline(model="gpt2", device=0) generated_characters = 0 for out in pipe(data()): - generated_characters += len(out["generated_text"]) + generated_characters += len(out[0]["generated_text"]) ``` The iterator `data()` yields each result, and the pipeline automatically @@ -175,11 +175,12 @@ The simplest way to iterate over a dataset is to just load one from 🤗 [Datase ```py # KeyDataset is a util that will just output the item we're interested in. from transformers.pipelines.pt_utils import KeyDataset +from datasets import load_dataset pipe = pipeline(model="hf-internal-testing/tiny-random-wav2vec2", device=0) dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:10]") -for out in pipe(KeyDataset(dataset["audio"])): +for out in pipe(KeyDataset(dataset, "audio")): print(out) ``` @@ -246,3 +247,14 @@ For example, if you use this [invoice image](https://huggingface.co/spaces/impir ... 
) [{'score': 0.42514941096305847, 'answer': 'us-001', 'start': 16, 'end': 16}] ``` + +<Tip> + +To run the example above you need to have [`pytesseract`](https://pypi.org/project/pytesseract/) installed in addition to 🤗 Transformers: + +```bash +sudo apt install -y tesseract-ocr +pip install pytesseract +``` + +</Tip> \ No newline at end of file diff --git a/docs/source/en/tasks/summarization.mdx b/docs/source/en/tasks/summarization.mdx index 1663c1f713..879077d5cc 100644 --- a/docs/source/en/tasks/summarization.mdx +++ b/docs/source/en/tasks/summarization.mdx @@ -33,7 +33,7 @@ See the summarization [task page](https://huggingface.co/tasks/summarization) fo Before you begin, make sure you have all the necessary libraries installed: ```bash -pip install transformers datasets evaluate +pip install transformers datasets evaluate rouge_score ``` We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: @@ -81,7 +81,8 @@ The next step is to load a T5 tokenizer to process `text` and `summary`: ```py >>> from transformers import AutoTokenizer ->>> tokenizer = AutoTokenizer.from_pretrained("t5-small") +>>> checkpoint = "t5-small" +>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ``` The preprocessing function you want to create needs to: @@ -117,14 +118,14 @@ Now create a batch of examples using [`DataCollatorForSeq2Seq`]. 
It's more effic ```py >>> from transformers import DataCollatorForSeq2Seq ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint) ``` ```py >>> from transformers import DataCollatorForSeq2Seq ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf") +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf") ``` @@ -175,7 +176,7 @@ You're ready to start training your model now! Load T5 with [`AutoModelForSeq2Se ```py >>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer ->>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") +>>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) ``` At this point, only three steps remain: @@ -237,7 +238,7 @@ Then you can load T5 with [`TFAutoModelForSeq2SeqLM`]: ```py >>> from transformers import TFAutoModelForSeq2SeqLM ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small") +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint) ``` Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: diff --git a/docs/source/en/tasks/token_classification.mdx b/docs/source/en/tasks/token_classification.mdx index 8c7ceac48f..64ad1d2543 100644 --- a/docs/source/en/tasks/token_classification.mdx +++ b/docs/source/en/tasks/token_classification.mdx @@ -32,7 +32,7 @@ See the token classification [task page](https://huggingface.co/tasks/token-clas Before you begin, make sure you have all the necessary libraries installed: ```bash -pip install transformers datasets evaluate +pip install transformers datasets evaluate seqeval ``` We encourage you to login to your Hugging Face account so you can upload and share your model with the community. 
When prompted, enter your token to login: diff --git a/docs/source/en/tasks/translation.mdx b/docs/source/en/tasks/translation.mdx index 318cb2b1a9..5f0a7fe385 100644 --- a/docs/source/en/tasks/translation.mdx +++ b/docs/source/en/tasks/translation.mdx @@ -30,7 +30,7 @@ See the translation [task page](https://huggingface.co/tasks/translation) for mo Before you begin, make sure you have all the necessary libraries installed: ```bash -pip install transformers datasets evaluate +pip install transformers datasets evaluate sacrebleu ``` We encourage you to login to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to login: @@ -77,7 +77,8 @@ The next step is to load a T5 tokenizer to process the English-French language p ```py >>> from transformers import AutoTokenizer ->>> tokenizer = AutoTokenizer.from_pretrained("t5-small") +>>> checkpoint = "t5-small" +>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ``` The preprocessing function you want to create needs to: @@ -112,7 +113,7 @@ Now create a batch of examples using [`DataCollatorForSeq2Seq`]. It's more effic ```py >>> from transformers import DataCollatorForSeq2Seq ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint) ``` @@ -120,7 +121,7 @@ Now create a batch of examples using [`DataCollatorForSeq2Seq`]. 
It's more effic ```py >>> from transformers import DataCollatorForSeq2Seq ->>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf") +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf") ``` @@ -132,7 +133,7 @@ Including a metric during training is often helpful for evaluating your model's ```py >>> import evaluate ->>> sacrebleu = evaluate.load("sacrebleu") +>>> metric = evaluate.load("sacrebleu") ``` Then create a function that passes your predictions and labels to [`~evaluate.EvaluationModule.compute`] to calculate the SacreBLEU score: @@ -184,7 +185,7 @@ You're ready to start training your model now! Load T5 with [`AutoModelForSeq2Se ```py >>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer ->>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") +>>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) ``` At this point, only three steps remain: @@ -246,7 +247,7 @@ Then you can load T5 with [`TFAutoModelForSeq2SeqLM`]: ```py >>> from transformers import TFAutoModelForSeq2SeqLM ->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small") +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint) ``` Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: