Move DataCollatorForMultipleChoice from the docs to the package (#34763)
* Add implementation for DataCollatorForMultipleChoice based on docs. * Add DataCollatorForMultipleChoice to import structure. * Remove custom DataCollatorForMultipleChoice implementations from example scripts. * Remove custom implementations of DataCollatorForMultipleChoice from docs in English, Spanish, Japanese and Korean. * Refactor torch version of DataCollatorForMultipleChoice to be more easily understandable. * Apply suggested changes and run make fixup. * fix copies, style and fixup * add missing documentation * nits * fix docstring * style * nits * isort --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com>
This commit is contained in:
@@ -71,3 +71,6 @@ Examples of use can be found in the [example scripts](../examples) or [example n
|
||||
|
||||
[[autodoc]] data.data_collator.DataCollatorWithFlattening
|
||||
|
||||
# DataCollatorForMultipleChoice
|
||||
|
||||
[[autodoc]] data.data_collator.DataCollatorForMultipleChoice
|
||||
|
||||
@@ -109,99 +109,14 @@ The preprocessing function you want to create needs to:
|
||||
To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:
|
||||
|
||||
```py
|
||||
tokenized_swag = swag.map(preprocess_function, batched=True)
|
||||
>>> tokenized_swag = swag.map(preprocess_function, batched=True)
|
||||
```
|
||||
|
||||
🤗 Transformers doesn't have a data collator for multiple choice, so you'll need to adapt the [`DataCollatorWithPadding`] to create a batch of examples. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.
|
||||
|
||||
`DataCollatorForMultipleChoice` flattens all the model inputs, applies padding, and then unflattens the results:
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
To create a batch of examples, it's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. [`DataCollatorForMultipleChoice`] flattens all the model inputs, applies padding, and then unflattens the results.
|
||||
```py
|
||||
>>> from dataclasses import dataclass
|
||||
>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
|
||||
>>> from typing import Optional, Union
|
||||
>>> import torch
|
||||
|
||||
|
||||
>>> @dataclass
|
||||
... class DataCollatorForMultipleChoice:
|
||||
... """
|
||||
... Data collator that will dynamically pad the inputs for multiple choice received.
|
||||
... """
|
||||
|
||||
... tokenizer: PreTrainedTokenizerBase
|
||||
... padding: Union[bool, str, PaddingStrategy] = True
|
||||
... max_length: Optional[int] = None
|
||||
... pad_to_multiple_of: Optional[int] = None
|
||||
|
||||
... def __call__(self, features):
|
||||
... label_name = "label" if "label" in features[0].keys() else "labels"
|
||||
... labels = [feature.pop(label_name) for feature in features]
|
||||
... batch_size = len(features)
|
||||
... num_choices = len(features[0]["input_ids"])
|
||||
... flattened_features = [
|
||||
... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
|
||||
... ]
|
||||
... flattened_features = sum(flattened_features, [])
|
||||
|
||||
... batch = self.tokenizer.pad(
|
||||
... flattened_features,
|
||||
... padding=self.padding,
|
||||
... max_length=self.max_length,
|
||||
... pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
... return_tensors="pt",
|
||||
... )
|
||||
|
||||
... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
|
||||
... batch["labels"] = torch.tensor(labels, dtype=torch.int64)
|
||||
... return batch
|
||||
>>> from transformers import DataCollatorForMultipleChoice
|
||||
>>> collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
```py
|
||||
>>> from dataclasses import dataclass
|
||||
>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
|
||||
>>> from typing import Optional, Union
|
||||
>>> import tensorflow as tf
|
||||
|
||||
|
||||
>>> @dataclass
|
||||
... class DataCollatorForMultipleChoice:
|
||||
... """
|
||||
... Data collator that will dynamically pad the inputs for multiple choice received.
|
||||
... """
|
||||
|
||||
... tokenizer: PreTrainedTokenizerBase
|
||||
... padding: Union[bool, str, PaddingStrategy] = True
|
||||
... max_length: Optional[int] = None
|
||||
... pad_to_multiple_of: Optional[int] = None
|
||||
|
||||
... def __call__(self, features):
|
||||
... label_name = "label" if "label" in features[0].keys() else "labels"
|
||||
... labels = [feature.pop(label_name) for feature in features]
|
||||
... batch_size = len(features)
|
||||
... num_choices = len(features[0]["input_ids"])
|
||||
... flattened_features = [
|
||||
... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
|
||||
... ]
|
||||
... flattened_features = sum(flattened_features, [])
|
||||
|
||||
... batch = self.tokenizer.pad(
|
||||
... flattened_features,
|
||||
... padding=self.padding,
|
||||
... max_length=self.max_length,
|
||||
... pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
... return_tensors="tf",
|
||||
... )
|
||||
|
||||
... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()}
|
||||
... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
|
||||
... return batch
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
## Evaluate
|
||||
|
||||
@@ -271,7 +186,7 @@ At this point, only three steps remain:
|
||||
... train_dataset=tokenized_swag["train"],
|
||||
... eval_dataset=tokenized_swag["validation"],
|
||||
... processing_class=tokenizer,
|
||||
... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
|
||||
... data_collator=collator,
|
||||
... compute_metrics=compute_metrics,
|
||||
... )
|
||||
|
||||
|
||||
@@ -91,99 +91,14 @@ Usa la función [`~datasets.Dataset.map`] de 🤗 Datasets para aplicarle la fun
|
||||
tokenized_swag = swag.map(preprocess_function, batched=True)
|
||||
```
|
||||
|
||||
🤗 Transformers no tiene un collator de datos para la tarea de selección múltiple, así que tendrías que crear uno. Puedes adaptar el [`DataCollatorWithPadding`] para crear un lote de ejemplos para selección múltiple. Este también
|
||||
le *añadirá relleno de manera dinámica* a tu texto y a las etiquetas para que tengan la longitud del elemento más largo en su lote, de forma que tengan una longitud uniforme. Aunque es posible rellenar el texto en la función `tokenizer` haciendo
|
||||
Para crear un lote de ejemplos para selección múltiple, este también le *añadirá relleno de manera dinámica* a tu texto y a las etiquetas para que tengan la longitud del elemento más largo en su lote, de forma que tengan una longitud uniforme. Aunque es posible rellenar el texto en la función `tokenizer` haciendo
|
||||
`padding=True`, el rellenado dinámico es más eficiente.
|
||||
|
||||
El `DataCollatorForMultipleChoice` aplanará todas las entradas del modelo, les aplicará relleno y luego des-aplanará los resultados:
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
El [`DataCollatorForMultipleChoice`] aplanará todas las entradas del modelo, les aplicará relleno y luego des-aplanará los resultados.
|
||||
```py
|
||||
>>> from dataclasses import dataclass
|
||||
>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
|
||||
>>> from typing import Optional, Union
|
||||
>>> import torch
|
||||
|
||||
|
||||
>>> @dataclass
|
||||
... class DataCollatorForMultipleChoice:
|
||||
... """
|
||||
... Collator de datos que le añadirá relleno de forma automática a las entradas recibidas para
|
||||
... una tarea de selección múltiple.
|
||||
... """
|
||||
|
||||
... tokenizer: PreTrainedTokenizerBase
|
||||
... padding: Union[bool, str, PaddingStrategy] = True
|
||||
... max_length: Optional[int] = None
|
||||
... pad_to_multiple_of: Optional[int] = None
|
||||
|
||||
... def __call__(self, features):
|
||||
... label_name = "label" if "label" in features[0].keys() else "labels"
|
||||
... labels = [feature.pop(label_name) for feature in features]
|
||||
... batch_size = len(features)
|
||||
... num_choices = len(features[0]["input_ids"])
|
||||
... flattened_features = [
|
||||
... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
|
||||
... ]
|
||||
... flattened_features = sum(flattened_features, [])
|
||||
|
||||
... batch = self.tokenizer.pad(
|
||||
... flattened_features,
|
||||
... padding=self.padding,
|
||||
... max_length=self.max_length,
|
||||
... pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
... return_tensors="pt",
|
||||
... )
|
||||
|
||||
... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
|
||||
... batch["labels"] = torch.tensor(labels, dtype=torch.int64)
|
||||
... return batch
|
||||
>>> from transformers import DataCollatorForMultipleChoice
|
||||
>>> collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
```py
|
||||
>>> from dataclasses import dataclass
|
||||
>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
|
||||
>>> from typing import Optional, Union
|
||||
>>> import tensorflow as tf
|
||||
|
||||
|
||||
>>> @dataclass
|
||||
... class DataCollatorForMultipleChoice:
|
||||
... """
|
||||
... Data collator that will dynamically pad the inputs for multiple choice received.
|
||||
... """
|
||||
|
||||
... tokenizer: PreTrainedTokenizerBase
|
||||
... padding: Union[bool, str, PaddingStrategy] = True
|
||||
... max_length: Optional[int] = None
|
||||
... pad_to_multiple_of: Optional[int] = None
|
||||
|
||||
... def __call__(self, features):
|
||||
... label_name = "label" if "label" in features[0].keys() else "labels"
|
||||
... labels = [feature.pop(label_name) for feature in features]
|
||||
... batch_size = len(features)
|
||||
... num_choices = len(features[0]["input_ids"])
|
||||
... flattened_features = [
|
||||
... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
|
||||
... ]
|
||||
... flattened_features = sum(flattened_features, [])
|
||||
|
||||
... batch = self.tokenizer.pad(
|
||||
... flattened_features,
|
||||
... padding=self.padding,
|
||||
... max_length=self.max_length,
|
||||
... pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
... return_tensors="tf",
|
||||
... )
|
||||
|
||||
... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()}
|
||||
... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
|
||||
... return batch
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
## Entrenamiento
|
||||
|
||||
@@ -226,7 +141,7 @@ En este punto, solo quedan tres pasos:
|
||||
... train_dataset=tokenized_swag["train"],
|
||||
... eval_dataset=tokenized_swag["validation"],
|
||||
... processing_class=tokenizer,
|
||||
... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
|
||||
... data_collator=collator,
|
||||
... )
|
||||
|
||||
>>> trainer.train()
|
||||
|
||||
@@ -113,96 +113,11 @@ pip install transformers datasets evaluate
|
||||
tokenized_swag = swag.map(preprocess_function, batched=True)
|
||||
```
|
||||
|
||||
🤗 Transformers には多肢選択用のデータ照合器がないため、[`DataCollatorWithPadding`] を調整してサンプルのバッチを作成する必要があります。データセット全体を最大長までパディングするのではなく、照合中にバッチ内の最長の長さまで文を *動的にパディング* する方が効率的です。
|
||||
|
||||
`DataCollatorForMultipleChoice` は、すべてのモデル入力を平坦化し、パディングを適用して、結果を非平坦化します。
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
[`DataCollatorForMultipleChoice`] は、すべてのモデル入力を平坦化し、パディングを適用して、結果を非平坦化します。
|
||||
```py
|
||||
>>> from dataclasses import dataclass
|
||||
>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
|
||||
>>> from typing import Optional, Union
|
||||
>>> import torch
|
||||
|
||||
|
||||
>>> @dataclass
|
||||
... class DataCollatorForMultipleChoice:
|
||||
... """
|
||||
... Data collator that will dynamically pad the inputs for multiple choice received.
|
||||
... """
|
||||
|
||||
... tokenizer: PreTrainedTokenizerBase
|
||||
... padding: Union[bool, str, PaddingStrategy] = True
|
||||
... max_length: Optional[int] = None
|
||||
... pad_to_multiple_of: Optional[int] = None
|
||||
|
||||
... def __call__(self, features):
|
||||
... label_name = "label" if "label" in features[0].keys() else "labels"
|
||||
... labels = [feature.pop(label_name) for feature in features]
|
||||
... batch_size = len(features)
|
||||
... num_choices = len(features[0]["input_ids"])
|
||||
... flattened_features = [
|
||||
... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
|
||||
... ]
|
||||
... flattened_features = sum(flattened_features, [])
|
||||
|
||||
... batch = self.tokenizer.pad(
|
||||
... flattened_features,
|
||||
... padding=self.padding,
|
||||
... max_length=self.max_length,
|
||||
... pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
... return_tensors="pt",
|
||||
... )
|
||||
|
||||
... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
|
||||
... batch["labels"] = torch.tensor(labels, dtype=torch.int64)
|
||||
... return batch
|
||||
>>> from transformers import DataCollatorForMultipleChoice
|
||||
>>> collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
```py
|
||||
>>> from dataclasses import dataclass
|
||||
>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
|
||||
>>> from typing import Optional, Union
|
||||
>>> import tensorflow as tf
|
||||
|
||||
|
||||
>>> @dataclass
|
||||
... class DataCollatorForMultipleChoice:
|
||||
... """
|
||||
... Data collator that will dynamically pad the inputs for multiple choice received.
|
||||
... """
|
||||
|
||||
... tokenizer: PreTrainedTokenizerBase
|
||||
... padding: Union[bool, str, PaddingStrategy] = True
|
||||
... max_length: Optional[int] = None
|
||||
... pad_to_multiple_of: Optional[int] = None
|
||||
|
||||
... def __call__(self, features):
|
||||
... label_name = "label" if "label" in features[0].keys() else "labels"
|
||||
... labels = [feature.pop(label_name) for feature in features]
|
||||
... batch_size = len(features)
|
||||
... num_choices = len(features[0]["input_ids"])
|
||||
... flattened_features = [
|
||||
... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
|
||||
... ]
|
||||
... flattened_features = sum(flattened_features, [])
|
||||
|
||||
... batch = self.tokenizer.pad(
|
||||
... flattened_features,
|
||||
... padding=self.padding,
|
||||
... max_length=self.max_length,
|
||||
... pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
... return_tensors="tf",
|
||||
... )
|
||||
|
||||
... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()}
|
||||
... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
|
||||
... return batch
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
## Evaluate
|
||||
|
||||
@@ -272,7 +187,7 @@ tokenized_swag = swag.map(preprocess_function, batched=True)
|
||||
... train_dataset=tokenized_swag["train"],
|
||||
... eval_dataset=tokenized_swag["validation"],
|
||||
... processing_class=tokenizer,
|
||||
... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
|
||||
... data_collator=collator,
|
||||
... compute_metrics=compute_metrics,
|
||||
... )
|
||||
|
||||
|
||||
@@ -112,96 +112,11 @@ pip install transformers datasets evaluate
|
||||
tokenized_swag = swag.map(preprocess_function, batched=True)
|
||||
```
|
||||
|
||||
🤗 Transformers에는 객관식용 데이터 콜레이터가 없으므로 예제 배치를 만들려면 [`DataCollatorWithPadding`]을 조정해야 합니다. 데이터 정렬 중에 전체 데이터 집합을 최대 길이로 패딩하는 대신 배치 중 가장 긴 길이로 문장을 *동적 패딩*하는 것이 더 효율적입니다.
|
||||
|
||||
`DataCollatorForMultipleChoice`는 모든 모델 입력을 평탄화하고 패딩을 적용하며 그 결과를 결과를 다차원화합니다:
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
[`DataCollatorForMultipleChoice`]는 모든 모델 입력을 평탄화하고 패딩을 적용하며 그 결과를 결과를 다차원화합니다:
|
||||
```py
|
||||
>>> from dataclasses import dataclass
|
||||
>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
|
||||
>>> from typing import Optional, Union
|
||||
>>> import torch
|
||||
|
||||
|
||||
>>> @dataclass
|
||||
... class DataCollatorForMultipleChoice:
|
||||
... """
|
||||
... Data collator that will dynamically pad the inputs for multiple choice received.
|
||||
... """
|
||||
|
||||
... tokenizer: PreTrainedTokenizerBase
|
||||
... padding: Union[bool, str, PaddingStrategy] = True
|
||||
... max_length: Optional[int] = None
|
||||
... pad_to_multiple_of: Optional[int] = None
|
||||
|
||||
... def __call__(self, features):
|
||||
... label_name = "label" if "label" in features[0].keys() else "labels"
|
||||
... labels = [feature.pop(label_name) for feature in features]
|
||||
... batch_size = len(features)
|
||||
... num_choices = len(features[0]["input_ids"])
|
||||
... flattened_features = [
|
||||
... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
|
||||
... ]
|
||||
... flattened_features = sum(flattened_features, [])
|
||||
|
||||
... batch = self.tokenizer.pad(
|
||||
... flattened_features,
|
||||
... padding=self.padding,
|
||||
... max_length=self.max_length,
|
||||
... pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
... return_tensors="pt",
|
||||
... )
|
||||
|
||||
... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
|
||||
... batch["labels"] = torch.tensor(labels, dtype=torch.int64)
|
||||
... return batch
|
||||
>>> from transformers import DataCollatorForMultipleChoice
|
||||
>>> collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
```py
|
||||
>>> from dataclasses import dataclass
|
||||
>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
|
||||
>>> from typing import Optional, Union
|
||||
>>> import tensorflow as tf
|
||||
|
||||
|
||||
>>> @dataclass
|
||||
... class DataCollatorForMultipleChoice:
|
||||
... """
|
||||
... Data collator that will dynamically pad the inputs for multiple choice received.
|
||||
... """
|
||||
|
||||
... tokenizer: PreTrainedTokenizerBase
|
||||
... padding: Union[bool, str, PaddingStrategy] = True
|
||||
... max_length: Optional[int] = None
|
||||
... pad_to_multiple_of: Optional[int] = None
|
||||
|
||||
... def __call__(self, features):
|
||||
... label_name = "label" if "label" in features[0].keys() else "labels"
|
||||
... labels = [feature.pop(label_name) for feature in features]
|
||||
... batch_size = len(features)
|
||||
... num_choices = len(features[0]["input_ids"])
|
||||
... flattened_features = [
|
||||
... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
|
||||
... ]
|
||||
... flattened_features = sum(flattened_features, [])
|
||||
|
||||
... batch = self.tokenizer.pad(
|
||||
... flattened_features,
|
||||
... padding=self.padding,
|
||||
... max_length=self.max_length,
|
||||
... pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
... return_tensors="tf",
|
||||
... )
|
||||
|
||||
... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()}
|
||||
... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
|
||||
... return batch
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
## 평가 하기[[evaluate]]
|
||||
|
||||
@@ -271,7 +186,7 @@ tokenized_swag = swag.map(preprocess_function, batched=True)
|
||||
... train_dataset=tokenized_swag["train"],
|
||||
... eval_dataset=tokenized_swag["validation"],
|
||||
... processing_class=tokenizer,
|
||||
... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
|
||||
... data_collator=collator,
|
||||
... compute_metrics=compute_metrics,
|
||||
... )
|
||||
|
||||
|
||||
Reference in New Issue
Block a user