diff --git a/docs/source/en/task_summary.mdx b/docs/source/en/task_summary.mdx index 1a740668d4..e02ad6da68 100644 --- a/docs/source/en/task_summary.mdx +++ b/docs/source/en/task_summary.mdx @@ -34,8 +34,14 @@ Audio classification is a task that labels audio data from a predefined set of c ```py >>> from transformers import pipeline ->>> classifier = pipeline(task="audio-classification") ->>> classifier("path/to/audio/file.mp3") +>>> classifier = pipeline(task="audio-classification", model="superb/hubert-base-superb-er") +>>> preds = classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.4532, 'label': 'hap'}, + {'score': 0.3622, 'label': 'sad'}, + {'score': 0.0943, 'label': 'neu'}, + {'score': 0.0903, 'label': 'ang'}] ``` ### Automatic speech recognition @@ -47,8 +53,9 @@ But one of the key challenges Transformer architectures have helped with is in l ```py >>> from transformers import pipeline ->>> transcriber = pipeline(task="automatic-speech-recognition") ->>> transcriber("path/to/audio/file.mp3") +>>> transcriber = pipeline(task="automatic-speech-recognition", model="openai/whisper-small") +>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'} ``` ## Computer vision @@ -73,7 +80,16 @@ Image classification labels an entire image from a predefined set of classes. Li >>> from transformers import pipeline >>> classifier = pipeline(task="image-classification") ->>> classifier("path/to/image/file.jpg") +>>> preds = classifier( +... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... 
) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> print(*preds, sep="\n") +{'score': 0.4403, 'label': 'lynx, catamount'} +{'score': 0.0343, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'} +{'score': 0.0321, 'label': 'snow leopard, ounce, Panthera uncia'} +{'score': 0.0235, 'label': 'Egyptian cat'} +{'score': 0.023, 'label': 'tiger cat'} ``` ### Object detection @@ -88,7 +104,14 @@ Unlike image classification, object detection identifies multiple objects within >>> from transformers import pipeline >>> detector = pipeline(task="object-detection") ->>> detector("path/to/image/file.jpg") +>>> preds = detector( +... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"], "box": pred["box"]} for pred in preds] +>>> preds +[{'score': 0.9865, + 'label': 'cat', + 'box': {'xmin': 178, 'ymin': 154, 'xmax': 882, 'ymax': 598}}] ``` ### Image segmentation @@ -104,7 +127,14 @@ Segmentation tasks are helpful in self-driving vehicles to create a pixel-level >>> from transformers import pipeline >>> segmenter = pipeline(task="image-segmentation") ->>> segmenter("path/to/image/file.jpg") +>>> preds = segmenter( +... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.9856, 'label': 'LABEL_184'}, + {'score': 0.9976, 'label': 'snow'}, + {'score': 0.9962, 'label': 'cat'}] ``` ### Depth estimation @@ -120,7 +150,9 @@ There are two approaches to depth estimation: >>> from transformers import pipeline >>> depth_estimator = pipeline(task="depth-estimation") ->>> depth_estimator("path/to/image/file.jpg") +>>> preds = depth_estimator( +... 
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +... ) ``` ## Natural language processing @@ -138,7 +170,10 @@ Like classification tasks in any modality, text classification labels a sequence >>> from transformers import pipeline >>> classifier = pipeline(task="sentiment-analysis") ->>> classifier("Hugging Face is the best thing since sliced bread!") +>>> preds = classifier("Hugging Face is the best thing since sliced bread!") +>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +>>> preds +[{'score': 0.9991, 'label': 'POSITIVE'}] ``` ### Token classification @@ -154,7 +189,26 @@ Two common types of token classification are: >>> from transformers import pipeline >>> classifier = pipeline(task="ner") ->>> classifier("Hugging Face is a French company based in New York City.") +>>> preds = classifier("Hugging Face is a French company based in New York City.") +>>> preds = [ +... { +... "entity": pred["entity"], +... "score": round(pred["score"], 4), +... "index": pred["index"], +... "word": pred["word"], +... "start": pred["start"], +... "end": pred["end"], +... } +... for pred in preds +... 
] +>>> print(*preds, sep="\n") +{'entity': 'I-ORG', 'score': 0.9968, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2} +{'entity': 'I-ORG', 'score': 0.9293, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7} +{'entity': 'I-ORG', 'score': 0.9763, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12} +{'entity': 'I-MISC', 'score': 0.9983, 'index': 6, 'word': 'French', 'start': 18, 'end': 24} +{'entity': 'I-LOC', 'score': 0.999, 'index': 10, 'word': 'New', 'start': 42, 'end': 45} +{'entity': 'I-LOC', 'score': 0.9987, 'index': 11, 'word': 'York', 'start': 46, 'end': 50} +{'entity': 'I-LOC', 'score': 0.9992, 'index': 12, 'word': 'City', 'start': 51, 'end': 55} ``` ### Question answering @@ -171,10 +225,14 @@ There are two common types of question answering: >>> from transformers import pipeline >>> question_answerer = pipeline(task="question-answering") ->>> question_answerer( +>>> preds = question_answerer( ... question="What is the name of the repository?", ... context="The name of the repository is huggingface/transformers", ... ) +>>> print( +... f"score: {round(preds['score'], 4)}, start: {preds['start']}, end: {preds['end']}, answer: {preds['answer']}" +... ) +score: 0.9327, start: 30, end: 54, answer: huggingface/transformers ``` ### Summarization @@ -191,8 +249,9 @@ Like question answering, there are two types of summarization: >>> summarizer = pipeline(task="summarization") >>> summarizer( -... "Hugging Face is a French company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge which is visible from the window." +... "In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention. For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. 
On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. In the former task our best model outperforms even all previously reported ensembles." ... ) +[{'summary_text': ' The Transformer is the first sequence transduction model based entirely on attention . It replaces the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention . For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers .'}] ``` ### Translation @@ -205,8 +264,9 @@ In the early days, translation models were mostly monolingual, but recently, the >>> from transformers import pipeline >>> text = "translate English to French: Hugging Face is a community-based open-source platform for machine learning." ->>> translator = pipeline(task="translation") +>>> translator = pipeline(task="translation", model="t5-small") >>> translator(text) +[{'translation_text': "Hugging Face est une tribune communautaire de l'apprentissage des machines."}] ``` ### Language modeling @@ -220,17 +280,31 @@ There are two types of language modeling: ```py >>> from transformers import pipeline - >>> prompt = "Hugging Face is a" - >>> text_generator = pipeline(task="text-generation") - >>> text_generator(prompt) + >>> prompt = "Hugging Face is a community-based open-source platform for machine learning." + >>> generator = pipeline(task="text-generation") + >>> generator(prompt) # doctest: +SKIP ``` * masked: the model's objective is to predict a masked token in a sequence with full access to the tokens in the sequence ```py - >>> text = "Hugging Face is a company based in New York City." + >>> text = "Hugging Face is a community-based open-source <mask> for machine learning." >>> fill_mask = pipeline(task="fill-mask") - >>> fill_mask(text, top_k=3) + >>> preds = fill_mask(text, top_k=1) + >>> preds = [ + ... { + ... "score": round(pred["score"], 4), + ... 
"token": pred["token"], + ... "token_str": pred["token_str"], + ... "sequence": pred["sequence"], + ... } + ... for pred in preds + ... ] + >>> preds + [{'score': 0.2236, + 'token': 1761, + 'token_str': ' platform', + 'sequence': 'Hugging Face is a community-based open-source platform for machine learning.'}] ``` Hopefully, this page has given you some more background information about all the types of tasks in each modality and the practical importance of each one. In the next [section](tasks_explained), you'll learn **how** 🤗 Transformers work to solve these tasks. \ No newline at end of file