From 7df4b90c760804295cd4c23a0840055b772898ee Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Wed, 22 Dec 2021 14:18:03 +0100 Subject: [PATCH] Fix Perceiver docs (#14879) --- docs/source/model_doc/perceiver.mdx | 2 +- .../models/perceiver/modeling_perceiver.py | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/docs/source/model_doc/perceiver.mdx b/docs/source/model_doc/perceiver.mdx index b474074e8b..0dbfd3e004 100644 --- a/docs/source/model_doc/perceiver.mdx +++ b/docs/source/model_doc/perceiver.mdx @@ -72,7 +72,7 @@ size of 262 byte IDs). drawing - Perceiver IO architecture. Taken from the [original paper](https://arxiv.org/abs/2105.15203) + Perceiver IO architecture. Taken from the original paper This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/deepmind/deepmind-research/tree/master/perceiver). diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py index 5cfef2effb..c412f7f360 100755 --- a/src/transformers/models/perceiver/modeling_perceiver.py +++ b/src/transformers/models/perceiver/modeling_perceiver.py @@ -1881,14 +1881,29 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel): ```python >>> from transformers import PerceiverForMultimodalAutoencoding >>> import torch + >>> import numpy as np + >>> # create multimodal inputs >>> images = torch.randn((1, 16, 3, 224, 224)) >>> audio = torch.randn((1, 30720, 1)) >>> inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700))) >>> model = PerceiverForMultimodalAutoencoding.from_pretrained('deepmind/multimodal-perceiver') - >>> outputs = model(inputs=inputs) + >>> # in the Perceiver IO paper, videos are auto-encoded in chunks + >>> # each chunk subsamples different index dimensions of the image and audio modality decoder queries + >>> nchunks = 128 + >>> image_chunk_size = np.prod((16, 224, 224)) // nchunks + >>> audio_chunk_size = audio.shape[1] // model.config.samples_per_patch // nchunks + >>> # process the first chunk + >>> chunk_idx = 0 + >>> subsampling = { + ... "image": torch.arange(image_chunk_size * chunk_idx, image_chunk_size * (chunk_idx + 1)), + ... "audio": torch.arange(audio_chunk_size * chunk_idx, audio_chunk_size * (chunk_idx + 1)), + ... "label": None, + ... } + + >>> outputs = model(inputs=inputs, subsampled_output_points=subsampling) >>> logits = outputs.logits ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict