Fix Perceiver docs (#14879)
This commit is contained in:
@@ -72,7 +72,7 @@ size of 262 byte IDs).
|
|||||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/perceiver_architecture.jpg"
|
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/perceiver_architecture.jpg"
|
||||||
alt="drawing" width="600"/>
|
alt="drawing" width="600"/>
|
||||||
|
|
||||||
<small> Perceiver IO architecture. Taken from the [original paper](https://arxiv.org/abs/2105.15203) </small>
|
<small> Perceiver IO architecture. Taken from the <a href="https://arxiv.org/abs/2107.14795">original paper</a> </small>
|
||||||
|
|
||||||
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found
|
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found
|
||||||
[here](https://github.com/deepmind/deepmind-research/tree/master/perceiver).
|
[here](https://github.com/deepmind/deepmind-research/tree/master/perceiver).
|
||||||
|
|||||||
@@ -1881,14 +1881,29 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
|
|||||||
```python
|
```python
|
||||||
>>> from transformers import PerceiverForMultimodalAutoencoding
|
>>> from transformers import PerceiverForMultimodalAutoencoding
|
||||||
>>> import torch
|
>>> import torch
|
||||||
|
>>> import numpy as np
|
||||||
|
|
||||||
|
>>> # create multimodal inputs
|
||||||
>>> images = torch.randn((1, 16, 3, 224, 224))
|
>>> images = torch.randn((1, 16, 3, 224, 224))
|
||||||
>>> audio = torch.randn((1, 30720, 1))
|
>>> audio = torch.randn((1, 30720, 1))
|
||||||
>>> inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700)))
|
>>> inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700)))
|
||||||
|
|
||||||
>>> model = PerceiverForMultimodalAutoencoding.from_pretrained('deepmind/multimodal-perceiver')
|
>>> model = PerceiverForMultimodalAutoencoding.from_pretrained('deepmind/multimodal-perceiver')
|
||||||
|
|
||||||
>>> outputs = model(inputs=inputs)
|
>>> # in the Perceiver IO paper, videos are auto-encoded in chunks
|
||||||
|
>>> # each chunk subsamples different index dimensions of the image and audio modality decoder queries
|
||||||
|
>>> nchunks = 128
|
||||||
|
>>> image_chunk_size = np.prod((16, 224, 224)) // nchunks
|
||||||
|
>>> audio_chunk_size = audio.shape[1] // model.config.samples_per_patch // nchunks
|
||||||
|
>>> # process the first chunk
|
||||||
|
>>> chunk_idx = 0
|
||||||
|
>>> subsampling = {
|
||||||
|
... "image": torch.arange(image_chunk_size * chunk_idx, image_chunk_size * (chunk_idx + 1)),
|
||||||
|
... "audio": torch.arange(audio_chunk_size * chunk_idx, audio_chunk_size * (chunk_idx + 1)),
|
||||||
|
... "label": None,
|
||||||
|
... }
|
||||||
|
|
||||||
|
>>> outputs = model(inputs=inputs, subsampled_output_points=subsampling)
|
||||||
>>> logits = outputs.logits
|
>>> logits = outputs.logits
|
||||||
```"""
|
```"""
|
||||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||||
|
|||||||
Reference in New Issue
Block a user