Use tqdm.auto in Pipeline docs (#14920)

It's better for e.g. notebook.
This commit is contained in:
Santiago Castro
2022-01-10 16:28:34 +01:00
committed by GitHub
parent f012c00ada
commit f21bc4215a

View File

@@ -79,14 +79,14 @@ GPU. If it doesn't don't hesitate to create an issue.
import datasets
from transformers import pipeline
from transformers.pipelines.base import KeyDataset
import tqdm
from tqdm.auto import tqdm
pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
dataset = datasets.load_dataset("superb", name="asr", split="test")
# KeyDataset (only *pt*) will simply return the item in the dict returned by the dataset item
# as we're not interested in the *target* part of the dataset.
for out in tqdm.tqdm(pipe(KeyDataset(dataset, "file"))):
for out in tqdm(pipe(KeyDataset(dataset, "file"))):
print(out)
# {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
# {"text": ....}
@@ -130,7 +130,6 @@ whenever the pipeline uses its streaming ability (so when passing lists or `Data
from transformers import pipeline
from transformers.pipelines.base import KeyDataset
import datasets
import tqdm
dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised")
pipe = pipeline("text-classification", device=0)
@@ -153,8 +152,7 @@ Example where it's mostly a speedup:
```python
from transformers import pipeline
from torch.utils.data import Dataset
import tqdm
from tqdm.auto import tqdm
pipe = pipeline("text-classification", device=0)
@@ -172,7 +170,7 @@ dataset = MyDataset()
for batch_size in [1, 8, 64, 256]:
print("-" * 30)
print(f"Streaming batch_size={batch_size}")
for out in tqdm.tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)):
for out in tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)):
pass
```
@@ -228,7 +226,7 @@ Streaming batch_size=256
0%| | 0/1000 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/home/nicolas/src/transformers/test.py", line 42, in <module>
for out in tqdm.tqdm(pipe(dataset, batch_size=256), total=len(dataset)):
for out in tqdm(pipe(dataset, batch_size=256), total=len(dataset)):
....
q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head)
RuntimeError: CUDA out of memory. Tried to allocate 376.00 MiB (GPU 0; 3.95 GiB total capacity; 1.72 GiB already allocated; 354.88 MiB free; 2.46 GiB reserved in total by PyTorch)