@@ -156,7 +156,7 @@ Die [`pipeline`] kann jedes Modell aus dem [Model Hub](https://huggingface.co/mo
|
|||||||
|
|
||||||
<frameworkcontent>
|
<frameworkcontent>
|
||||||
<pt>
|
<pt>
|
||||||
Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `AutoClass` below):
|
Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on an `AutoClass` below):
|
||||||
|
|
||||||
```py
|
```py
|
||||||
>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
||||||
@@ -166,7 +166,7 @@ Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the
|
|||||||
```
|
```
|
||||||
</pt>
|
</pt>
|
||||||
<tf>
|
<tf>
|
||||||
Use the [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `TFAutoClass` below):
|
Use the [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on a `TFAutoClass` below):
|
||||||
|
|
||||||
```py
|
```py
|
||||||
>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
|
>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
|
||||||
@@ -222,7 +222,7 @@ Anschließend wandelt der Tokenizer die Token in Zahlen um, um einen Tensor als
|
|||||||
Der Tokenizer gibt ein Wörterbuch zurück, das Folgendes enthält:
|
Der Tokenizer gibt ein Wörterbuch zurück, das Folgendes enthält:
|
||||||
|
|
||||||
* [input_ids](./glossary#input-ids): numerische Repräsentationen Ihrer Token.
|
* [input_ids](./glossary#input-ids): numerische Repräsentationen Ihrer Token.
|
||||||
* [atttention_mask](.glossary#attention-mask): gibt an, welche Token beachtet werden sollen.
|
* [attention_mask](./glossary#attention-mask): gibt an, welche Token beachtet werden sollen.
|
||||||
|
|
||||||
Genau wie die [`pipeline`] akzeptiert der Tokenizer eine Liste von Eingaben. Darüber hinaus kann der Tokenizer den Text auch auffüllen und kürzen, um einen Stapel mit einheitlicher Länge zurückzugeben:
|
Genau wie die [`pipeline`] akzeptiert der Tokenizer eine Liste von Eingaben. Darüber hinaus kann der Tokenizer den Text auch auffüllen und kürzen, um einen Stapel mit einheitlicher Länge zurückzugeben:
|
||||||
|
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed
|
|||||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||||
specific language governing permissions and limitations under the License.
|
specific language governing permissions and limitations under the License.
|
||||||
|
|
||||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||||
rendered properly in your Markdown viewer.
|
rendered properly in your Markdown viewer.
|
||||||
|
|
||||||
-->
|
-->
|
||||||
@@ -62,7 +62,7 @@ for _ in range(max_new_tokens):
|
|||||||
# Greedily sample one next token
|
# Greedily sample one next token
|
||||||
next_token_ids = outputs.logits[:, -1:].argmax(-1)
|
next_token_ids = outputs.logits[:, -1:].argmax(-1)
|
||||||
generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1)
|
generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1)
|
||||||
# Prepare inputs for the next generation step by leaaving unprocessed tokens, in our case we have only one new token
|
# Prepare inputs for the next generation step by leaving unprocessed tokens, in our case we have only one new token
|
||||||
# and expanding attn mask for the new token, as explained above
|
# and expanding attn mask for the new token, as explained above
|
||||||
attention_mask = inputs["attention_mask"]
|
attention_mask = inputs["attention_mask"]
|
||||||
attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
|
attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
|
||||||
@@ -88,7 +88,7 @@ model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", to
|
|||||||
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
|
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
|
||||||
|
|
||||||
# `return_dict_in_generate=True` is required to return the cache and `return_legacy_cache` forces the returned cache
|
# `return_dict_in_generate=True` is required to return the cache and `return_legacy_cache` forces the returned cache
|
||||||
# in the the legacy format
|
# in the legacy format
|
||||||
generation_outputs = model.generate(**inputs, return_dict_in_generate=True, return_legacy_cache=True, max_new_tokens=5)
|
generation_outputs = model.generate(**inputs, return_dict_in_generate=True, return_legacy_cache=True, max_new_tokens=5)
|
||||||
|
|
||||||
cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values)
|
cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values)
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed
|
|||||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||||
specific language governing permissions and limitations under the License.
|
specific language governing permissions and limitations under the License.
|
||||||
|
|
||||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||||
rendered properly in your Markdown viewer.
|
rendered properly in your Markdown viewer.
|
||||||
|
|
||||||
-->
|
-->
|
||||||
@@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
|
|||||||
|
|
||||||
Multimodal model chat templates expect a similar [template](./chat_templating) as text-only models. It needs `messages` that includes a dictionary of the `role` and `content`.
|
Multimodal model chat templates expect a similar [template](./chat_templating) as text-only models. It needs `messages` that includes a dictionary of the `role` and `content`.
|
||||||
|
|
||||||
Multimodal templates are included in the [Processor](./processors) class and requires an additional `type` key for specifying whether the included content is an image, video, or text.
|
Multimodal templates are included in the [Processor](./processors) class and require an additional `type` key for specifying whether the included content is an image, video, or text.
|
||||||
|
|
||||||
This guide will show you how to format chat templates for multimodal models as well as some best practices for configuring the template
|
This guide will show you how to format chat templates for multimodal models as well as some best practices for configuring the template
|
||||||
|
|
||||||
@@ -109,7 +109,7 @@ These inputs are now ready to be used in [`~GenerationMixin.generate`].
|
|||||||
|
|
||||||
Some vision models also support video inputs. The message format is very similar to the format for [image inputs](#image-inputs).
|
Some vision models also support video inputs. The message format is very similar to the format for [image inputs](#image-inputs).
|
||||||
|
|
||||||
- The content `"type"` should be `"video"` to indicate the the content is a video.
|
- The content `"type"` should be `"video"` to indicate the content is a video.
|
||||||
- For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord).
|
- For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord).
|
||||||
|
|
||||||
> [!WARNING]
|
> [!WARNING]
|
||||||
@@ -141,7 +141,7 @@ Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input
|
|||||||
|
|
||||||
The `video_load_backend` parameter refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html).
|
The `video_load_backend` parameter refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html).
|
||||||
|
|
||||||
The examples below uses Decord as the backend because it is a bit faster than PyAV.
|
The examples below use Decord as the backend because it is a bit faster than PyAV.
|
||||||
|
|
||||||
<hfoptions id="sampling">
|
<hfoptions id="sampling">
|
||||||
<hfoption id="fixed number of frames">
|
<hfoption id="fixed number of frames">
|
||||||
|
|||||||
@@ -131,7 +131,7 @@ class ResnetModel(PreTrainedModel):
|
|||||||
</hfoption>
|
</hfoption>
|
||||||
<hfoption id="ResnetModelForImageClassification">
|
<hfoption id="ResnetModelForImageClassification">
|
||||||
|
|
||||||
The `forward` method needs to be rewrittten to calculate the loss for each logit if labels are available. Otherwise, the ResNet model class is the same.
|
The `forward` method needs to be rewritten to calculate the loss for each logit if labels are available. Otherwise, the ResNet model class is the same.
|
||||||
|
|
||||||
> [!TIP]
|
> [!TIP]
|
||||||
> Add `config_class` to the model class to enable [AutoClass](#autoclass-support) support.
|
> Add `config_class` to the model class to enable [AutoClass](#autoclass-support) support.
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed
|
|||||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||||
specific language governing permissions and limitations under the License.
|
specific language governing permissions and limitations under the License.
|
||||||
|
|
||||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||||
rendered properly in your Markdown viewer.
|
rendered properly in your Markdown viewer.
|
||||||
|
|
||||||
-->
|
-->
|
||||||
@@ -56,7 +56,7 @@ deepspeed --num_gpus 2 trainer-program.py ...
|
|||||||
|
|
||||||
### Order of GPUs
|
### Order of GPUs
|
||||||
|
|
||||||
To select specific GPUs to use and their order, configure the the `CUDA_VISIBLE_DEVICES` environment variable. It is easiest to set the environment variable in `~/bashrc` or another startup config file. `CUDA_VISIBLE_DEVICES` is used to map which GPUs are used. For example, if there are 4 GPUs (0, 1, 2, 3) and you only want to run GPUs 0 and 2:
|
To select specific GPUs to use and their order, configure the `CUDA_VISIBLE_DEVICES` environment variable. It is easiest to set the environment variable in `~/.bashrc` or another startup config file. `CUDA_VISIBLE_DEVICES` is used to map which GPUs are used. For example, if there are 4 GPUs (0, 1, 2, 3) and you only want to run GPUs 0 and 2:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
|
CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
|
||||||
|
|||||||
@@ -220,7 +220,7 @@ Pasa tu texto al tokenizador:
|
|||||||
El tokenizador devolverá un diccionario conteniendo:
|
El tokenizador devolverá un diccionario conteniendo:
|
||||||
|
|
||||||
* [input_ids](./glossary#input-ids): representaciones numéricas de los tokens.
|
* [input_ids](./glossary#input-ids): representaciones numéricas de los tokens.
|
||||||
* [atttention_mask](.glossary#attention-mask): indica cuáles tokens deben ser atendidos.
|
* [attention_mask](./glossary#attention-mask): indica cuáles tokens deben ser atendidos.
|
||||||
|
|
||||||
Como con el [`pipeline`], el tokenizador aceptará una lista de inputs. Además, el tokenizador también puede rellenar (pad, en inglés) y truncar el texto para devolver un lote (batch, en inglés) de longitud uniforme:
|
Como con el [`pipeline`], el tokenizador aceptará una lista de inputs. Además, el tokenizador también puede rellenar (pad, en inglés) y truncar el texto para devolver un lote (batch, en inglés) de longitud uniforme:
|
||||||
|
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ Abbiamo integrato di recente `BetterTransformer` per fare inferenza più rapidam
|
|||||||
|
|
||||||
## PyTorch JIT-mode (TorchScript)
|
## PyTorch JIT-mode (TorchScript)
|
||||||
|
|
||||||
TorchScript è un modo di creare modelli serializzabili e ottimizzabili da codice PyTorch. Ogni programmma TorchScript può esere salvato da un processo Python e caricato in un processo dove non ci sono dipendenze Python.
|
TorchScript è un modo di creare modelli serializzabili e ottimizzabili da codice PyTorch. Ogni programma TorchScript può essere salvato da un processo Python e caricato in un processo dove non ci sono dipendenze Python.
|
||||||
Comparandolo con l'eager mode di default, jit mode in PyTorch normalmente fornisce prestazioni migliori per l'inferenza del modello da parte di metodologie di ottimizzazione come la operator fusion.
|
Comparandolo con l'eager mode di default, jit mode in PyTorch normalmente fornisce prestazioni migliori per l'inferenza del modello da parte di metodologie di ottimizzazione come la operator fusion.
|
||||||
|
|
||||||
Per una prima introduzione a TorchScript, vedi la Introduction to [PyTorch TorchScript tutorial](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html#tracing-modules).
|
Per una prima introduzione a TorchScript, vedi la Introduction to [PyTorch TorchScript tutorial](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html#tracing-modules).
|
||||||
|
|||||||
@@ -222,7 +222,7 @@ Passe o texto para o tokenizer:
|
|||||||
O tokenizer retornará um dicionário contendo:
|
O tokenizer retornará um dicionário contendo:
|
||||||
|
|
||||||
* [input_ids](./glossary#input-ids): representações numéricas de seus tokens.
|
* [input_ids](./glossary#input-ids): representações numéricas de seus tokens.
|
||||||
* [atttention_mask](.glossary#attention-mask): indica quais tokens devem ser atendidos.
|
* [attention_mask](./glossary#attention-mask): indica quais tokens devem ser atendidos.
|
||||||
|
|
||||||
Assim como o [`pipeline`], o tokenizer aceitará uma lista de entradas. Além disso, o tokenizer também pode preencher e truncar o texto para retornar um lote com comprimento uniforme:
|
Assim como o [`pipeline`], o tokenizer aceitará uma lista de entradas. Além disso, o tokenizer também pode preencher e truncar o texto para retornar um lote com comprimento uniforme:
|
||||||
|
|
||||||
|
|||||||
@@ -918,7 +918,7 @@ def add_model_to_main_init(
|
|||||||
new_model_patterns (`ModelPatterns`): The patterns for the new model.
|
new_model_patterns (`ModelPatterns`): The patterns for the new model.
|
||||||
frameworks (`List[str]`, *optional*):
|
frameworks (`List[str]`, *optional*):
|
||||||
If specified, only the models implemented in those frameworks will be added.
|
If specified, only the models implemented in those frameworks will be added.
|
||||||
with_processsing (`bool`, *optional*, defaults to `True`):
|
with_processing (`bool`, *optional*, defaults to `True`):
|
||||||
Whether the tokenizer/feature extractor/processor of the model should also be added to the init or not.
|
Whether the tokenizer/feature extractor/processor of the model should also be added to the init or not.
|
||||||
"""
|
"""
|
||||||
with open(TRANSFORMERS_PATH / "__init__.py", "r", encoding="utf-8") as f:
|
with open(TRANSFORMERS_PATH / "__init__.py", "r", encoding="utf-8") as f:
|
||||||
|
|||||||
@@ -94,7 +94,7 @@ VideoInput = Union[
|
|||||||
list["np.ndarray"],
|
list["np.ndarray"],
|
||||||
list["torch.Tensor"],
|
list["torch.Tensor"],
|
||||||
list[list["PIL.Image.Image"]],
|
list[list["PIL.Image.Image"]],
|
||||||
list[list["np.ndarrray"]],
|
list[list["np.ndarray"]],
|
||||||
list[list["torch.Tensor"]],
|
list[list["torch.Tensor"]],
|
||||||
] # noqa
|
] # noqa
|
||||||
|
|
||||||
|
|||||||
@@ -83,7 +83,7 @@ class AlignProcessor(ProcessorMixin):
|
|||||||
arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
|
arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` arguments to
|
the text. To prepare the image(s), this method forwards the `images` arguments to
|
||||||
EfficientNetImageProcessor's [`~EfficientNetImageProcessor.__call__`] if `images` is not `None`. Please refer
|
EfficientNetImageProcessor's [`~EfficientNetImageProcessor.__call__`] if `images` is not `None`. Please refer
|
||||||
to the doctsring of the above two methods for more information.
|
to the docstring of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
|
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ class AltCLIPProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not
|
and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not
|
||||||
`None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
`None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -123,7 +123,7 @@ FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
|
|||||||
|
|
||||||
FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||||
[
|
[
|
||||||
# Model for Image-classsification
|
# Model for Image-classification
|
||||||
("beit", "FlaxBeitForImageClassification"),
|
("beit", "FlaxBeitForImageClassification"),
|
||||||
("dinov2", "FlaxDinov2ForImageClassification"),
|
("dinov2", "FlaxDinov2ForImageClassification"),
|
||||||
("regnet", "FlaxRegNetForImageClassification"),
|
("regnet", "FlaxRegNetForImageClassification"),
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ class BambaConfig(PretrainedConfig):
|
|||||||
`input_ids` passed when calling [`BambaModel`]
|
`input_ids` passed when calling [`BambaModel`]
|
||||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||||
Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
|
Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
|
||||||
model has a output word embedding layer.
|
model has an output word embedding layer.
|
||||||
hidden_size (`int`, *optional*, defaults to 4096):
|
hidden_size (`int`, *optional*, defaults to 4096):
|
||||||
Dimension of the hidden representations.
|
Dimension of the hidden representations.
|
||||||
intermediate_size (`int`, *optional*, defaults to 14336):
|
intermediate_size (`int`, *optional*, defaults to 14336):
|
||||||
@@ -85,7 +85,7 @@ class BambaConfig(PretrainedConfig):
|
|||||||
mamba_n_heads (`int`, *optional*, defaults to 128):
|
mamba_n_heads (`int`, *optional*, defaults to 128):
|
||||||
The number of mamba heads used in the v2 implementation.
|
The number of mamba heads used in the v2 implementation.
|
||||||
mamba_d_head (`int`, *optional*, defaults to `"auto"`):
|
mamba_d_head (`int`, *optional*, defaults to `"auto"`):
|
||||||
Head embeddding dimension size
|
Head embedding dimension size
|
||||||
mamba_n_groups (`int`, *optional*, defaults to 1):
|
mamba_n_groups (`int`, *optional*, defaults to 1):
|
||||||
The number of the mamba groups used in the v2 implementation.
|
The number of the mamba groups used in the v2 implementation.
|
||||||
mamba_d_state (`int`, *optional*, defaults to 256):
|
mamba_d_state (`int`, *optional*, defaults to 256):
|
||||||
|
|||||||
@@ -190,12 +190,12 @@ def load_model(pytorch_dump_folder_path, use_small=False, model_type="text"):
|
|||||||
output_new_model = output_new_model_total.logits[:, [-1], :]
|
output_new_model = output_new_model_total.logits[:, [-1], :]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
prediction_codeboook_channel = 3
|
prediction_codebook_channel = 3
|
||||||
n_codes_total = 8
|
n_codes_total = 8
|
||||||
vec = torch.randint(256, (batch_size, sequence_length, n_codes_total), dtype=torch.int)
|
vec = torch.randint(256, (batch_size, sequence_length, n_codes_total), dtype=torch.int)
|
||||||
|
|
||||||
output_new_model_total = model(prediction_codeboook_channel, vec)
|
output_new_model_total = model(prediction_codebook_channel, vec)
|
||||||
output_old_model = bark_model(prediction_codeboook_channel, vec)
|
output_old_model = bark_model(prediction_codebook_channel, vec)
|
||||||
|
|
||||||
output_new_model = output_new_model_total.logits
|
output_new_model = output_new_model_total.logits
|
||||||
|
|
||||||
|
|||||||
@@ -87,7 +87,7 @@ class ChameleonProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ class ChineseCLIPProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ class ClapProcessor(ProcessorMixin):
|
|||||||
and `kwargs` arguments to RobertaTokenizerFast's [`~RobertaTokenizerFast.__call__`] if `text` is not `None` to
|
and `kwargs` arguments to RobertaTokenizerFast's [`~RobertaTokenizerFast.__call__`] if `text` is not `None` to
|
||||||
encode the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
|
encode the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
|
||||||
ClapFeatureExtractor's [`~ClapFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the
|
ClapFeatureExtractor's [`~ClapFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the
|
||||||
doctsring of the above two methods for more information.
|
docstring of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text (`str`, `List[str]`, `List[List[str]]`):
|
text (`str`, `List[str]`, `List[List[str]]`):
|
||||||
|
|||||||
@@ -63,7 +63,7 @@ class CLIPProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -63,7 +63,7 @@ class CLIPSegProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
ViTImageProcessor's [`~ViTImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring of
|
ViTImageProcessor's [`~ViTImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of
|
||||||
the above two methods for more information.
|
the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ class ClvpProcessor(ProcessorMixin):
|
|||||||
def __call__(self, *args, **kwargs):
|
def __call__(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
Forwards the `audio` and `sampling_rate` arguments to [`~ClvpFeatureExtractor.__call__`] and the `text`
|
Forwards the `audio` and `sampling_rate` arguments to [`~ClvpFeatureExtractor.__call__`] and the `text`
|
||||||
argument to [`~ClvpTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more
|
argument to [`~ClvpTokenizer.__call__`]. Please refer to the docstring of the above two methods for more
|
||||||
information.
|
information.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|||||||
@@ -100,11 +100,11 @@ class ColPaliProcessor(PaliGemmaProcessor):
|
|||||||
wrapper around the PaliGemmaProcessor's [`~PaliGemmaProcessor.__call__`] method adapted for the ColPali model. It cannot process
|
wrapper around the PaliGemmaProcessor's [`~PaliGemmaProcessor.__call__`] method adapted for the ColPali model. It cannot process
|
||||||
both text and images at the same time.
|
both text and images at the same time.
|
||||||
|
|
||||||
When preparing the the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's
|
When preparing the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's
|
||||||
[`~LlamaTokenizerFast.__call__`].
|
[`~LlamaTokenizerFast.__call__`].
|
||||||
When preparing the the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's
|
When preparing the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's
|
||||||
[`~SiglipImageProcessor.__call__`].
|
[`~SiglipImageProcessor.__call__`].
|
||||||
Please refer to the doctsring of the above two methods for more information.
|
Please refer to the docstring of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
|
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
|
||||||
|
|||||||
@@ -140,11 +140,11 @@ class ColPaliProcessor(ProcessorMixin):
|
|||||||
wrapper around the PaliGemmaProcessor's [`~PaliGemmaProcessor.__call__`] method adapted for the ColPali model. It cannot process
|
wrapper around the PaliGemmaProcessor's [`~PaliGemmaProcessor.__call__`] method adapted for the ColPali model. It cannot process
|
||||||
both text and images at the same time.
|
both text and images at the same time.
|
||||||
|
|
||||||
When preparing the the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's
|
When preparing the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's
|
||||||
[`~LlamaTokenizerFast.__call__`].
|
[`~LlamaTokenizerFast.__call__`].
|
||||||
When preparing the the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's
|
When preparing the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's
|
||||||
[`~SiglipImageProcessor.__call__`].
|
[`~SiglipImageProcessor.__call__`].
|
||||||
Please refer to the doctsring of the above two methods for more information.
|
Please refer to the docstring of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
|
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
|
||||||
|
|||||||
@@ -1303,7 +1303,7 @@ class JukeboxConditionalAutoregressive(nn.Module):
|
|||||||
n_ctx (`int`, *optional*):
|
n_ctx (`int`, *optional*):
|
||||||
Number of tokens or lyrics tokens provided in a single pass.
|
Number of tokens or lyrics tokens provided in a single pass.
|
||||||
embed_dim (`int`, *optional*):
|
embed_dim (`int`, *optional*):
|
||||||
Either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codeboook dimension,
|
Either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codebook dimension,
|
||||||
if the model combines lyrics and music tokens, or simply n_vocab if the model is a separate encoder
|
if the model combines lyrics and music tokens, or simply n_vocab if the model is a separate encoder
|
||||||
audio_conditioning (`bool`, *optional*, defaults to `False`):
|
audio_conditioning (`bool`, *optional*, defaults to `False`):
|
||||||
Whether or not the prior supports conditioning on audio.
|
Whether or not the prior supports conditioning on audio.
|
||||||
@@ -1921,7 +1921,7 @@ class JukeboxPrior(PreTrainedModel):
|
|||||||
|
|
||||||
def set_metadata_lyric_tokens(self, labels):
|
def set_metadata_lyric_tokens(self, labels):
|
||||||
"""
|
"""
|
||||||
Processes the full labels to only retreive the relevant lyric tokens and keep the metadata conditioning tokens.
|
Processes the full labels to only retrieve the relevant lyric tokens and keep the metadata conditioning tokens.
|
||||||
"""
|
"""
|
||||||
if self.nb_relevant_lyric_tokens > 0:
|
if self.nb_relevant_lyric_tokens > 0:
|
||||||
tokens_list = torch.zeros(
|
tokens_list = torch.zeros(
|
||||||
@@ -2147,7 +2147,7 @@ class JukeboxPrior(PreTrainedModel):
|
|||||||
|
|
||||||
def get_encoder_states(self, lyric_tokens, sample=False):
|
def get_encoder_states(self, lyric_tokens, sample=False):
|
||||||
"""
|
"""
|
||||||
Retreive the last hidden_states of the lyric encoder that will be attended to by the decoder. Forwards through
|
Retrieve the last hidden_states of the lyric encoder that will be attended to by the decoder. Forwards through
|
||||||
the lyric encoder.
|
the lyric encoder.
|
||||||
"""
|
"""
|
||||||
if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
|
if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ class MCTCTProcessor(ProcessorMixin):
|
|||||||
When used in normal mode, this method forwards all its arguments to MCTCTFeatureExtractor's
|
When used in normal mode, this method forwards all its arguments to MCTCTFeatureExtractor's
|
||||||
[`~MCTCTFeatureExtractor.__call__`] and returns its output. If used in the context
|
[`~MCTCTFeatureExtractor.__call__`] and returns its output. If used in the context
|
||||||
[`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to AutoTokenizer's
|
[`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to AutoTokenizer's
|
||||||
[`~AutoTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information.
|
[`~AutoTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
|
||||||
"""
|
"""
|
||||||
# For backward compatibility
|
# For backward compatibility
|
||||||
if self._in_target_context_manager:
|
if self._in_target_context_manager:
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ class Speech2Text2Processor(ProcessorMixin):
|
|||||||
When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
|
When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
|
||||||
[`~AutoFeatureExtractor.__call__`] and returns its output. If used in the context
|
[`~AutoFeatureExtractor.__call__`] and returns its output. If used in the context
|
||||||
[`~Speech2Text2Processor.as_target_processor`] this method forwards all its arguments to
|
[`~Speech2Text2Processor.as_target_processor`] this method forwards all its arguments to
|
||||||
Speech2Text2Tokenizer's [`~Speech2Text2Tokenizer.__call__`]. Please refer to the doctsring of the above two
|
Speech2Text2Tokenizer's [`~Speech2Text2Tokenizer.__call__`]. Please refer to the docstring of the above two
|
||||||
methods for more information.
|
methods for more information.
|
||||||
"""
|
"""
|
||||||
# For backward compatibility
|
# For backward compatibility
|
||||||
|
|||||||
@@ -86,7 +86,7 @@ class DonutProcessor(ProcessorMixin):
|
|||||||
When used in normal mode, this method forwards all its arguments to AutoImageProcessor's
|
When used in normal mode, this method forwards all its arguments to AutoImageProcessor's
|
||||||
[`~AutoImageProcessor.__call__`] and returns its output. If used in the context
|
[`~AutoImageProcessor.__call__`] and returns its output. If used in the context
|
||||||
[`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's
|
[`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's
|
||||||
[`~DonutTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information.
|
[`~DonutTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
|
||||||
"""
|
"""
|
||||||
# For backward compatibility
|
# For backward compatibility
|
||||||
legacy = kwargs.pop("legacy", True)
|
legacy = kwargs.pop("legacy", True)
|
||||||
|
|||||||
@@ -95,7 +95,7 @@ class Emu3Processor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to Emu3TokenizerFast's [`~Emu3TokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to Emu3TokenizerFast's [`~Emu3TokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -481,7 +481,7 @@ class FuyuProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to
|
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to
|
||||||
encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
FuyuImageProcessor's [`~FuyuImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
FuyuImageProcessor's [`~FuyuImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ class GitProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ class LlavaProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -106,7 +106,7 @@ class LlavaNextProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -123,7 +123,7 @@ class LlavaNextVideoProcessor(ProcessorMixin):
|
|||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. To prepare the video(s),
|
LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. To prepare the video(s),
|
||||||
this method forwards the `videos` and `kwargs` arguments to LlavaNextVideoImageProcessor's
|
this method forwards the `videos` and `kwargs` arguments to LlavaNextVideoImageProcessor's
|
||||||
[`~LlavaNextVideoImageProcessor.__call__`] if `videos` is not `None`. Please refer to the doctsring
|
[`~LlavaNextVideoImageProcessor.__call__`] if `videos` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -81,7 +81,7 @@ class MgpstrProcessor(ProcessorMixin):
|
|||||||
When used in normal mode, this method forwards all its arguments to ViTImageProcessor's
|
When used in normal mode, this method forwards all its arguments to ViTImageProcessor's
|
||||||
[`~ViTImageProcessor.__call__`] and returns its output. This method also forwards the `text` and `kwargs`
|
[`~ViTImageProcessor.__call__`] and returns its output. This method also forwards the `text` and `kwargs`
|
||||||
arguments to MgpstrTokenizer's [`~MgpstrTokenizer.__call__`] if `text` is not `None` to encode the text. Please
|
arguments to MgpstrTokenizer's [`~MgpstrTokenizer.__call__`] if `text` is not `None` to encode the text. Please
|
||||||
refer to the doctsring of the above methods for more information.
|
refer to the docstring of the above methods for more information.
|
||||||
"""
|
"""
|
||||||
if images is None and text is None:
|
if images is None and text is None:
|
||||||
raise ValueError("You need to specify either an `images` or `text` input to process.")
|
raise ValueError("You need to specify either an `images` or `text` input to process.")
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ class MusicgenProcessor(ProcessorMixin):
|
|||||||
def __call__(self, *args, **kwargs):
|
def __call__(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
Forwards the `audio` argument to EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`] and the `text`
|
Forwards the `audio` argument to EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`] and the `text`
|
||||||
argument to [`~T5Tokenizer.__call__`]. Please refer to the doctsring of the above two methods for more
|
argument to [`~T5Tokenizer.__call__`]. Please refer to the docstring of the above two methods for more
|
||||||
information.
|
information.
|
||||||
"""
|
"""
|
||||||
# For backward compatibility
|
# For backward compatibility
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ class MusicgenMelodyProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `audio`
|
Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `audio`
|
||||||
and `kwargs` arguments to MusicgenMelodyFeatureExtractor's [`~MusicgenMelodyFeatureExtractor.__call__`] if `audio` is not
|
and `kwargs` arguments to MusicgenMelodyFeatureExtractor's [`~MusicgenMelodyFeatureExtractor.__call__`] if `audio` is not
|
||||||
`None` to pre-process the audio. It also forwards the `text` and `kwargs` arguments to
|
`None` to pre-process the audio. It also forwards the `text` and `kwargs` arguments to
|
||||||
PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`. Please refer to the doctsring of the above two methods for more information.
|
PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`. Please refer to the docstring of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
|
audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
|
||||||
|
|||||||
@@ -82,7 +82,7 @@ class OneFormerProcessor(ProcessorMixin):
|
|||||||
`task_inputs` and `kwargs` arguments to CLIPTokenizer's [`~CLIPTokenizer.__call__`] if `task_inputs` is not
|
`task_inputs` and `kwargs` arguments to CLIPTokenizer's [`~CLIPTokenizer.__call__`] if `task_inputs` is not
|
||||||
`None` to encode. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
`None` to encode. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
OneFormerImageProcessor's [`~OneFormerImageProcessor.__call__`] if `images` is not `None`. Please refer to the
|
OneFormerImageProcessor's [`~OneFormerImageProcessor.__call__`] if `images` is not `None`. Please refer to the
|
||||||
doctsring of the above two methods for more information.
|
docstring of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
task_inputs (`str`, `List[str]`):
|
task_inputs (`str`, `List[str]`):
|
||||||
|
|||||||
@@ -96,7 +96,7 @@ class Owlv2Processor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
|
Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
|
||||||
`kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
|
`kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -110,7 +110,7 @@ class OwlViTProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
|
Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
|
||||||
`kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
|
`kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -162,7 +162,7 @@ class PaliGemmaProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to GemmaTokenizerFast's [`~GemmaTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to GemmaTokenizerFast's [`~GemmaTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
The usage for PaliGemma fine-tuning preparation is slightly different than usual. suffix passed are suffixes to
|
The usage for PaliGemma fine-tuning preparation is slightly different than usual. suffix passed are suffixes to
|
||||||
|
|||||||
@@ -119,7 +119,7 @@ class PixtralProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ class Qwen2AudioProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
|
the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
|
||||||
WhisperFeatureExtractor's [`~WhisperFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the doctsring
|
WhisperFeatureExtractor's [`~WhisperFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ class RegNetConvLayer(nn.Module):
|
|||||||
|
|
||||||
class RegNetEmbeddings(nn.Module):
|
class RegNetEmbeddings(nn.Module):
|
||||||
"""
|
"""
|
||||||
RegNet Embedddings (stem) composed of a single aggressive convolution.
|
RegNet Embeddings (stem) composed of a single aggressive convolution.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config: RegNetConfig):
|
def __init__(self, config: RegNetConfig):
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ class SeamlessM4TProcessor(ProcessorMixin):
|
|||||||
and `kwargs` arguments to SeamlessM4TTokenizerFast's [`~SeamlessM4TTokenizerFast.__call__`] if `text` is not
|
and `kwargs` arguments to SeamlessM4TTokenizerFast's [`~SeamlessM4TTokenizerFast.__call__`] if `text` is not
|
||||||
`None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
|
`None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
|
||||||
SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.__call__`] if `audios` is not `None`. Please refer
|
SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.__call__`] if `audios` is not `None`. Please refer
|
||||||
to the doctsring of the above two methods for more information.
|
to the docstring of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text (`str`, `List[str]`, `List[List[str]]`):
|
text (`str`, `List[str]`, `List[List[str]]`):
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ class SiglipProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to SiglipTokenizer's [`~SiglipTokenizer.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to SiglipTokenizer's [`~SiglipTokenizer.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` argument to
|
the text. To prepare the image(s), this method forwards the `images` argument to
|
||||||
SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ class Siglip2Processor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to GemmaTokenizerFast's [`~GemmaTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to GemmaTokenizerFast's [`~GemmaTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` argument to
|
the text. To prepare the image(s), this method forwards the `images` argument to
|
||||||
Siglip2ImageProcessor's [`~Siglip2ImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
Siglip2ImageProcessor's [`~Siglip2ImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ class Speech2TextProcessor(ProcessorMixin):
|
|||||||
When used in normal mode, this method forwards all its arguments to Speech2TextFeatureExtractor's
|
When used in normal mode, this method forwards all its arguments to Speech2TextFeatureExtractor's
|
||||||
[`~Speech2TextFeatureExtractor.__call__`] and returns its output. If used in the context
|
[`~Speech2TextFeatureExtractor.__call__`] and returns its output. If used in the context
|
||||||
[`~Speech2TextProcessor.as_target_processor`] this method forwards all its arguments to Speech2TextTokenizer's
|
[`~Speech2TextProcessor.as_target_processor`] this method forwards all its arguments to Speech2TextTokenizer's
|
||||||
[`~Speech2TextTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more
|
[`~Speech2TextTokenizer.__call__`]. Please refer to the docstring of the above two methods for more
|
||||||
information.
|
information.
|
||||||
"""
|
"""
|
||||||
# For backward compatibility
|
# For backward compatibility
|
||||||
|
|||||||
@@ -81,7 +81,7 @@ class TrOCRProcessor(ProcessorMixin):
|
|||||||
When used in normal mode, this method forwards all its arguments to AutoImageProcessor's
|
When used in normal mode, this method forwards all its arguments to AutoImageProcessor's
|
||||||
[`~AutoImageProcessor.__call__`] and returns its output. If used in the context
|
[`~AutoImageProcessor.__call__`] and returns its output. If used in the context
|
||||||
[`~TrOCRProcessor.as_target_processor`] this method forwards all its arguments to TrOCRTokenizer's
|
[`~TrOCRProcessor.as_target_processor`] this method forwards all its arguments to TrOCRTokenizer's
|
||||||
[`~TrOCRTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information.
|
[`~TrOCRTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
|
||||||
"""
|
"""
|
||||||
# For backward compatibility
|
# For backward compatibility
|
||||||
if self._in_target_context_manager:
|
if self._in_target_context_manager:
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ class TvpProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the video(s), this method forwards the `videos` and `kwargs` arguments to
|
the text. To prepare the video(s), this method forwards the `videos` and `kwargs` arguments to
|
||||||
TvpImageProcessor's [`~TvpImageProcessor.__call__`] if `videos` is not `None`. Please refer to the doctsring of
|
TvpImageProcessor's [`~TvpImageProcessor.__call__`] if `videos` is not `None`. Please refer to the docstring of
|
||||||
the above two methods for more information.
|
the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -59,7 +59,7 @@ class TvpProcessor(ProcessorMixin):
|
|||||||
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
||||||
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
||||||
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
||||||
videos (`List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, `List[List[PIL.Image.Image]]`, `List[List[np.ndarrray]]`,:
|
videos (`List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, `List[List[PIL.Image.Image]]`, `List[List[np.ndarray]]`,:
|
||||||
`List[List[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list
|
`List[List[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list
|
||||||
of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors,
|
of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors,
|
||||||
each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of
|
each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ class VideoLlavaProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
VideoLlavaImageProcessor's [`~VideoLlavaImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
VideoLlavaImageProcessor's [`~VideoLlavaImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ class VisionTextDualEncoderProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
||||||
and `kwargs` arguments to VisionTextDualEncoderTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not
|
and `kwargs` arguments to VisionTextDualEncoderTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not
|
||||||
`None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
`None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
|
||||||
AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
|
||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -81,7 +81,7 @@ class Wav2Vec2BertProcessor(ProcessorMixin):
|
|||||||
Main method to prepare for the model one or several sequence(s) and audio(s). This method forwards the `audio`
|
Main method to prepare for the model one or several sequence(s) and audio(s). This method forwards the `audio`
|
||||||
and `kwargs` arguments to SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.__call__`] if `audio` is not
|
and `kwargs` arguments to SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.__call__`] if `audio` is not
|
||||||
`None` to pre-process the audio. To prepare the target sequence(s), this method forwards the `text` and `kwargs` arguments to
|
`None` to pre-process the audio. To prepare the target sequence(s), this method forwards the `text` and `kwargs` arguments to
|
||||||
PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`. Please refer to the doctsring of the above two methods for more information.
|
PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`. Please refer to the docstring of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
|
audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
|
||||||
@@ -127,7 +127,7 @@ class Wav2Vec2BertProcessor(ProcessorMixin):
|
|||||||
"""
|
"""
|
||||||
If `input_features` is not `None`, this method forwards the `input_features` and `kwargs` arguments to SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.pad`] to pad the input features.
|
If `input_features` is not `None`, this method forwards the `input_features` and `kwargs` arguments to SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.pad`] to pad the input features.
|
||||||
If `labels` is not `None`, this method forwards the `labels` and `kwargs` arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.pad`] to pad the label(s).
|
If `labels` is not `None`, this method forwards the `labels` and `kwargs` arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.pad`] to pad the label(s).
|
||||||
Please refer to the doctsring of the above two methods for more information.
|
Please refer to the docstring of the above two methods for more information.
|
||||||
"""
|
"""
|
||||||
if input_features is None and labels is None:
|
if input_features is None and labels is None:
|
||||||
raise ValueError("You need to specify either an `input_features` or `labels` input to pad.")
|
raise ValueError("You need to specify either an `input_features` or `labels` input to pad.")
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ class WhisperProcessor(ProcessorMixin):
|
|||||||
def __call__(self, *args, **kwargs):
|
def __call__(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
Forwards the `audio` argument to WhisperFeatureExtractor's [`~WhisperFeatureExtractor.__call__`] and the `text`
|
Forwards the `audio` argument to WhisperFeatureExtractor's [`~WhisperFeatureExtractor.__call__`] and the `text`
|
||||||
argument to [`~WhisperTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more
|
argument to [`~WhisperTokenizer.__call__`]. Please refer to the docstring of the above two methods for more
|
||||||
information.
|
information.
|
||||||
"""
|
"""
|
||||||
# For backward compatibility
|
# For backward compatibility
|
||||||
|
|||||||
@@ -65,14 +65,14 @@ class XCLIPProcessor(ProcessorMixin):
|
|||||||
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
|
and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
|
||||||
the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
|
the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
|
||||||
VideoMAEImageProcessor's [`~VideoMAEImageProcessor.__call__`] if `videos` is not `None`. Please refer to the
|
VideoMAEImageProcessor's [`~VideoMAEImageProcessor.__call__`] if `videos` is not `None`. Please refer to the
|
||||||
doctsring of the above two methods for more information.
|
docstring of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text (`str`, `List[str]`, `List[List[str]]`):
|
text (`str`, `List[str]`, `List[List[str]]`):
|
||||||
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
||||||
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
||||||
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
||||||
videos (`List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, `List[List[PIL.Image.Image]]`, `List[List[np.ndarrray]]`,:
|
videos (`List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, `List[List[PIL.Image.Image]]`, `List[List[np.ndarray]]`,:
|
||||||
`List[List[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list
|
`List[List[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list
|
||||||
of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors,
|
of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors,
|
||||||
each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of
|
each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of
|
||||||
|
|||||||
@@ -337,7 +337,7 @@ class OnnxConfig(ABC):
|
|||||||
" `preprocessor` instead.",
|
" `preprocessor` instead.",
|
||||||
FutureWarning,
|
FutureWarning,
|
||||||
)
|
)
|
||||||
logger.warning("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.")
|
logger.warning("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
|
||||||
preprocessor = tokenizer
|
preprocessor = tokenizer
|
||||||
if isinstance(preprocessor, PreTrainedTokenizerBase):
|
if isinstance(preprocessor, PreTrainedTokenizerBase):
|
||||||
# If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
|
# If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
|
||||||
|
|||||||
@@ -118,7 +118,7 @@ def export_pytorch(
|
|||||||
" `preprocessor` instead.",
|
" `preprocessor` instead.",
|
||||||
FutureWarning,
|
FutureWarning,
|
||||||
)
|
)
|
||||||
logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.")
|
logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
|
||||||
preprocessor = tokenizer
|
preprocessor = tokenizer
|
||||||
|
|
||||||
if issubclass(type(model), PreTrainedModel):
|
if issubclass(type(model), PreTrainedModel):
|
||||||
@@ -221,7 +221,7 @@ def export_tensorflow(
|
|||||||
" `preprocessor` instead.",
|
" `preprocessor` instead.",
|
||||||
FutureWarning,
|
FutureWarning,
|
||||||
)
|
)
|
||||||
logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.")
|
logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
|
||||||
preprocessor = tokenizer
|
preprocessor = tokenizer
|
||||||
|
|
||||||
model.config.return_dict = True
|
model.config.return_dict = True
|
||||||
@@ -296,7 +296,7 @@ def export(
|
|||||||
" `preprocessor` instead.",
|
" `preprocessor` instead.",
|
||||||
FutureWarning,
|
FutureWarning,
|
||||||
)
|
)
|
||||||
logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.")
|
logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
|
||||||
preprocessor = tokenizer
|
preprocessor = tokenizer
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
@@ -335,7 +335,7 @@ def validate_model_outputs(
|
|||||||
" `preprocessor` instead.",
|
" `preprocessor` instead.",
|
||||||
FutureWarning,
|
FutureWarning,
|
||||||
)
|
)
|
||||||
logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.")
|
logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
|
||||||
preprocessor = tokenizer
|
preprocessor = tokenizer
|
||||||
|
|
||||||
# generate inputs with a different batch_size and seq_len that was used for conversion to properly test
|
# generate inputs with a different batch_size and seq_len that was used for conversion to properly test
|
||||||
|
|||||||
@@ -193,7 +193,7 @@ class HqqHfQuantizer(HfQuantizer):
|
|||||||
unexpected_keys: List[str],
|
unexpected_keys: List[str],
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Each nn.Linear layer is processsed here.
|
Each nn.Linear layer is processed here.
|
||||||
We first check if the corresponding module state_dict contains already HQQ quantized parameters.
|
We first check if the corresponding module state_dict contains already HQQ quantized parameters.
|
||||||
If not, we create a temp linear layer with the module state_dict params and use it for quantization
|
If not, we create a temp linear layer with the module state_dict params and use it for quantization
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -355,7 +355,7 @@ class ModelOutput(OrderedDict):
|
|||||||
|
|
||||||
if is_modeloutput_subclass and not is_dataclass(self):
|
if is_modeloutput_subclass and not is_dataclass(self):
|
||||||
raise TypeError(
|
raise TypeError(
|
||||||
f"{self.__module__}.{self.__class__.__name__} is not a dataclasss."
|
f"{self.__module__}.{self.__class__.__name__} is not a dataclass."
|
||||||
" This is a subclass of ModelOutput and so must use the @dataclass decorator."
|
" This is a subclass of ModelOutput and so must use the @dataclass decorator."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -241,19 +241,19 @@ class AriaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMi
|
|||||||
torch.testing.assert_close(out_embeds, out_ids)
|
torch.testing.assert_close(out_embeds, out_ids)
|
||||||
|
|
||||||
@unittest.skip(
|
@unittest.skip(
|
||||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||||
)
|
)
|
||||||
def test_training_gradient_checkpointing(self):
|
def test_training_gradient_checkpointing(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@unittest.skip(
|
@unittest.skip(
|
||||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||||
)
|
)
|
||||||
def test_training_gradient_checkpointing_use_reentrant(self):
|
def test_training_gradient_checkpointing_use_reentrant(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@unittest.skip(
|
@unittest.skip(
|
||||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||||
)
|
)
|
||||||
def test_training_gradient_checkpointing_use_reentrant_false(self):
|
def test_training_gradient_checkpointing_use_reentrant_false(self):
|
||||||
pass
|
pass
|
||||||
@@ -311,7 +311,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
@slow
|
@slow
|
||||||
@require_bitsandbytes
|
@require_bitsandbytes
|
||||||
def test_small_model_integration_test(self):
|
def test_small_model_integration_test(self):
|
||||||
# Let' s make sure we test the preprocessing to replace what is used
|
# Let's make sure we test the preprocessing to replace what is used
|
||||||
model = AriaForConditionalGeneration.from_pretrained("rhymes-ai/Aria", load_in_4bit=True)
|
model = AriaForConditionalGeneration.from_pretrained("rhymes-ai/Aria", load_in_4bit=True)
|
||||||
|
|
||||||
prompt = "<image>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
|
prompt = "<image>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
|
||||||
@@ -333,7 +333,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
@slow
|
@slow
|
||||||
@require_bitsandbytes
|
@require_bitsandbytes
|
||||||
def test_small_model_integration_test_llama_single(self):
|
def test_small_model_integration_test_llama_single(self):
|
||||||
# Let' s make sure we test the preprocessing to replace what is used
|
# Let's make sure we test the preprocessing to replace what is used
|
||||||
model_id = "rhymes-ai/Aria"
|
model_id = "rhymes-ai/Aria"
|
||||||
|
|
||||||
model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
||||||
@@ -355,7 +355,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
@slow
|
@slow
|
||||||
@require_bitsandbytes
|
@require_bitsandbytes
|
||||||
def test_small_model_integration_test_llama_batched(self):
|
def test_small_model_integration_test_llama_batched(self):
|
||||||
# Let' s make sure we test the preprocessing to replace what is used
|
# Let's make sure we test the preprocessing to replace what is used
|
||||||
model_id = "rhymes-ai/Aria"
|
model_id = "rhymes-ai/Aria"
|
||||||
|
|
||||||
model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
||||||
@@ -382,7 +382,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
@slow
|
@slow
|
||||||
@require_bitsandbytes
|
@require_bitsandbytes
|
||||||
def test_small_model_integration_test_batch(self):
|
def test_small_model_integration_test_batch(self):
|
||||||
# Let' s make sure we test the preprocessing to replace what is used
|
# Let's make sure we test the preprocessing to replace what is used
|
||||||
model = AriaForConditionalGeneration.from_pretrained("rhymes-ai/Aria", load_in_4bit=True)
|
model = AriaForConditionalGeneration.from_pretrained("rhymes-ai/Aria", load_in_4bit=True)
|
||||||
# The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!
|
# The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!
|
||||||
prompts = [
|
prompts = [
|
||||||
@@ -408,7 +408,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
@slow
|
@slow
|
||||||
@require_bitsandbytes
|
@require_bitsandbytes
|
||||||
def test_small_model_integration_test_llama_batched_regression(self):
|
def test_small_model_integration_test_llama_batched_regression(self):
|
||||||
# Let' s make sure we test the preprocessing to replace what is used
|
# Let's make sure we test the preprocessing to replace what is used
|
||||||
model_id = "rhymes-ai/Aria"
|
model_id = "rhymes-ai/Aria"
|
||||||
|
|
||||||
# Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before)
|
# Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before)
|
||||||
@@ -442,7 +442,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
|
|
||||||
processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")
|
processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")
|
||||||
|
|
||||||
prompt1 = "<image>\n<image>\nUSER: What's the the difference of two images?\nASSISTANT:"
|
prompt1 = "<image>\n<image>\nUSER: What's the difference of two images?\nASSISTANT:"
|
||||||
prompt2 = "<image>\nUSER: Describe the image.\nASSISTANT:"
|
prompt2 = "<image>\nUSER: Describe the image.\nASSISTANT:"
|
||||||
prompt3 = "<image>\nUSER: Describe the image.\nASSISTANT:"
|
prompt3 = "<image>\nUSER: Describe the image.\nASSISTANT:"
|
||||||
url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
|
url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
|
||||||
@@ -460,7 +460,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
model = model.eval()
|
model = model.eval()
|
||||||
|
|
||||||
EXPECTED_OUTPUT = [
|
EXPECTED_OUTPUT = [
|
||||||
"\n \nUSER: What's the the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while",
|
"\n \nUSER: What's the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while",
|
||||||
"\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small",
|
"\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small",
|
||||||
"\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. The llama is the",
|
"\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. The llama is the",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -253,7 +253,7 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
|
|||||||
def test_mismatching_num_image_tokens(self):
|
def test_mismatching_num_image_tokens(self):
|
||||||
"""
|
"""
|
||||||
Tests that VLMs throw an error with explicit message saying what is wrong
|
Tests that VLMs throw an error with explicit message saying what is wrong
|
||||||
when number of images don't match number of image tokens in the text.
|
when number of images doesn't match number of image tokens in the text.
|
||||||
Also we need to test multi-image cases when one prompt has multiple image tokens.
|
Also we need to test multi-image cases when one prompt has multiple image tokens.
|
||||||
"""
|
"""
|
||||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
@@ -306,19 +306,19 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
|
|||||||
model(**input_dict)
|
model(**input_dict)
|
||||||
|
|
||||||
@unittest.skip(
|
@unittest.skip(
|
||||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||||
)
|
)
|
||||||
def test_training_gradient_checkpointing(self):
|
def test_training_gradient_checkpointing(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@unittest.skip(
|
@unittest.skip(
|
||||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||||
)
|
)
|
||||||
def test_training_gradient_checkpointing_use_reentrant(self):
|
def test_training_gradient_checkpointing_use_reentrant(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@unittest.skip(
|
@unittest.skip(
|
||||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||||
)
|
)
|
||||||
def test_training_gradient_checkpointing_use_reentrant_false(self):
|
def test_training_gradient_checkpointing_use_reentrant_false(self):
|
||||||
pass
|
pass
|
||||||
@@ -345,7 +345,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
@slow
|
@slow
|
||||||
@require_bitsandbytes
|
@require_bitsandbytes
|
||||||
def test_small_model_integration_test(self):
|
def test_small_model_integration_test(self):
|
||||||
# Let' s make sure we test the preprocessing to replace what is used
|
# Let's make sure we test the preprocessing to replace what is used
|
||||||
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/bakLlava-v1-hf", load_in_4bit=True)
|
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/bakLlava-v1-hf", load_in_4bit=True)
|
||||||
|
|
||||||
prompt = "<image>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
|
prompt = "<image>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
|
||||||
@@ -364,7 +364,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
@slow
|
@slow
|
||||||
@require_bitsandbytes
|
@require_bitsandbytes
|
||||||
def test_small_model_integration_test_llama_single(self):
|
def test_small_model_integration_test_llama_single(self):
|
||||||
# Let' s make sure we test the preprocessing to replace what is used
|
# Let's make sure we test the preprocessing to replace what is used
|
||||||
model_id = "llava-hf/llava-1.5-7b-hf"
|
model_id = "llava-hf/llava-1.5-7b-hf"
|
||||||
|
|
||||||
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True)
|
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True)
|
||||||
@@ -386,7 +386,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
@slow
|
@slow
|
||||||
@require_bitsandbytes
|
@require_bitsandbytes
|
||||||
def test_small_model_integration_test_llama_batched(self):
|
def test_small_model_integration_test_llama_batched(self):
|
||||||
# Let' s make sure we test the preprocessing to replace what is used
|
# Let's make sure we test the preprocessing to replace what is used
|
||||||
model_id = "llava-hf/llava-1.5-7b-hf"
|
model_id = "llava-hf/llava-1.5-7b-hf"
|
||||||
|
|
||||||
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True)
|
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True)
|
||||||
@@ -413,7 +413,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
@slow
|
@slow
|
||||||
@require_bitsandbytes
|
@require_bitsandbytes
|
||||||
def test_small_model_integration_test_batch(self):
|
def test_small_model_integration_test_batch(self):
|
||||||
# Let' s make sure we test the preprocessing to replace what is used
|
# Let's make sure we test the preprocessing to replace what is used
|
||||||
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/bakLlava-v1-hf", load_in_4bit=True)
|
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/bakLlava-v1-hf", load_in_4bit=True)
|
||||||
# The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!
|
# The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!
|
||||||
prompts = [
|
prompts = [
|
||||||
@@ -441,7 +441,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
@slow
|
@slow
|
||||||
@require_bitsandbytes
|
@require_bitsandbytes
|
||||||
def test_small_model_integration_test_llama_batched_regression(self):
|
def test_small_model_integration_test_llama_batched_regression(self):
|
||||||
# Let' s make sure we test the preprocessing to replace what is used
|
# Let's make sure we test the preprocessing to replace what is used
|
||||||
model_id = "llava-hf/llava-1.5-7b-hf"
|
model_id = "llava-hf/llava-1.5-7b-hf"
|
||||||
|
|
||||||
# Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before)
|
# Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before)
|
||||||
@@ -478,7 +478,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
|
|
||||||
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
|
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
|
||||||
|
|
||||||
prompt1 = "<image>\n<image>\nUSER: What's the the difference of two images?\nASSISTANT:"
|
prompt1 = "<image>\n<image>\nUSER: What's the difference of two images?\nASSISTANT:"
|
||||||
prompt2 = "<image>\nUSER: Describe the image.\nASSISTANT:"
|
prompt2 = "<image>\nUSER: Describe the image.\nASSISTANT:"
|
||||||
prompt3 = "<image>\nUSER: Describe the image.\nASSISTANT:"
|
prompt3 = "<image>\nUSER: Describe the image.\nASSISTANT:"
|
||||||
url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
|
url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
|
||||||
@@ -496,7 +496,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
model = model.eval()
|
model = model.eval()
|
||||||
|
|
||||||
EXPECTED_OUTPUT = [
|
EXPECTED_OUTPUT = [
|
||||||
"\n \nUSER: What's the the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while",
|
"\n \nUSER: What's the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while",
|
||||||
"\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small",
|
"\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small",
|
||||||
"\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. The llama is the",
|
"\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. The llama is the",
|
||||||
]
|
]
|
||||||
@@ -617,7 +617,7 @@ These descriptions provide a detailed overview of the content and atmosphere of
|
|||||||
generate_ids = model.generate(**inputs, max_new_tokens=50)
|
generate_ids = model.generate(**inputs, max_new_tokens=50)
|
||||||
output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
||||||
|
|
||||||
EXPECTED_GENERATION = "Describe the images.The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which spans the entire image. The dog appears to be a black Labrador" # fmt: skip
|
EXPECTED_GENERATION = "Describe the images. The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which spans the entire image. The dog appears to be a black Labrador" # fmt: skip
|
||||||
self.assertEqual(output, EXPECTED_GENERATION)
|
self.assertEqual(output, EXPECTED_GENERATION)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
|
|||||||
@@ -237,7 +237,7 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
|
|||||||
def test_mismatching_num_image_tokens(self):
|
def test_mismatching_num_image_tokens(self):
|
||||||
"""
|
"""
|
||||||
Tests that VLMs throw an error with explicit message saying what is wrong
|
Tests that VLMs throw an error with explicit message saying what is wrong
|
||||||
when number of images don't match number of image tokens in the text.
|
when number of images doesn't match number of image tokens in the text.
|
||||||
Also we need to test multi-image cases when one prompt has multiple image tokens.
|
Also we need to test multi-image cases when one prompt has multiple image tokens.
|
||||||
"""
|
"""
|
||||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|||||||
@@ -234,7 +234,7 @@ class PaliGemma2ForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
|
|||||||
def test_mismatching_num_image_tokens(self):
|
def test_mismatching_num_image_tokens(self):
|
||||||
"""
|
"""
|
||||||
Tests that VLMs through an error with explicit message saying what is wrong
|
Tests that VLMs through an error with explicit message saying what is wrong
|
||||||
when number of images don't match number of image tokens in the text.
|
when number of images doesn't match number of image tokens in the text.
|
||||||
Also we need to test multi-image cases when one prompr has multiple image tokens.
|
Also we need to test multi-image cases when one prompr has multiple image tokens.
|
||||||
"""
|
"""
|
||||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|||||||
@@ -231,7 +231,7 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
|
|||||||
def test_mismatching_num_image_tokens(self):
|
def test_mismatching_num_image_tokens(self):
|
||||||
"""
|
"""
|
||||||
Tests that VLMs through an error with explicit message saying what is wrong
|
Tests that VLMs through an error with explicit message saying what is wrong
|
||||||
when number of images don't match number of image tokens in the text.
|
when number of images doesn't match number of image tokens in the text.
|
||||||
Also we need to test multi-image cases when one prompr has multiple image tokens.
|
Also we need to test multi-image cases when one prompr has multiple image tokens.
|
||||||
"""
|
"""
|
||||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|||||||
@@ -458,7 +458,7 @@ class GPTQTestExllamaV2(unittest.TestCase):
|
|||||||
|
|
||||||
def test_generate_quality(self):
|
def test_generate_quality(self):
|
||||||
"""
|
"""
|
||||||
Simple test to check the quality of the model by comparing the the generated tokens with the expected tokens
|
Simple test to check the quality of the model by comparing the generated tokens with the expected tokens
|
||||||
"""
|
"""
|
||||||
self.check_inference_correctness(self.quantized_model)
|
self.check_inference_correctness(self.quantized_model)
|
||||||
|
|
||||||
|
|||||||
@@ -1090,7 +1090,7 @@ class ProcessorTesterMixin:
|
|||||||
]
|
]
|
||||||
]
|
]
|
||||||
|
|
||||||
def dummmy_sample_indices_fn(metadata, **fn_kwargs):
|
def dummy_sample_indices_fn(metadata, **fn_kwargs):
|
||||||
# sample only the first two frame always
|
# sample only the first two frame always
|
||||||
return [0, 1]
|
return [0, 1]
|
||||||
|
|
||||||
@@ -1099,7 +1099,7 @@ class ProcessorTesterMixin:
|
|||||||
add_generation_prompt=True,
|
add_generation_prompt=True,
|
||||||
tokenize=True,
|
tokenize=True,
|
||||||
return_dict=True,
|
return_dict=True,
|
||||||
sample_indices_fn=dummmy_sample_indices_fn,
|
sample_indices_fn=dummy_sample_indices_fn,
|
||||||
)
|
)
|
||||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
||||||
|
|||||||
@@ -429,7 +429,7 @@ class ImageFeatureExtractionTester(unittest.TestCase):
|
|||||||
self.assertEqual(len(videos_list), 1)
|
self.assertEqual(len(videos_list), 1)
|
||||||
self.assertTrue(np.array_equal(videos_list[0][0], images))
|
self.assertTrue(np.array_equal(videos_list[0][0], images))
|
||||||
|
|
||||||
# Test a 4d array of images is converted to a a list of 1 video
|
# Test a 4d array of images is converted to a list of 1 video
|
||||||
images = np.random.randint(0, 256, (4, 16, 32, 3))
|
images = np.random.randint(0, 256, (4, 16, 32, 3))
|
||||||
videos_list = make_batched_videos(images)
|
videos_list = make_batched_videos(images)
|
||||||
self.assertIsInstance(videos_list[0], list)
|
self.assertIsInstance(videos_list[0], list)
|
||||||
|
|||||||
Reference in New Issue
Block a user