Move DataCollatorForMultipleChoice from the docs to the package (#34763)
* Add implementation for DataCollatorForMultipleChoice based on docs. * Add DataCollatorForMultipleChoice to import structure. * Remove custom DataCollatorForMultipleChoice implementations from example scripts. * Remove custom implementations of DataCollatorForMultipleChoice from docs in English, Spanish, Japanese and Korean. * Refactor torch version of DataCollatorForMultipleChoice to be more easily understandable. * Apply suggested changes and run make fixup. * fix copies, style and fixup * add missing documentation * nits * fix docstring * style * nits * isort --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com>
This commit is contained in:
@@ -23,11 +23,10 @@ import os
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from itertools import chain
|
||||
from typing import Optional, Union
|
||||
from typing import Optional
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
|
||||
import transformers
|
||||
@@ -35,15 +34,15 @@ from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForMultipleChoice,
|
||||
AutoTokenizer,
|
||||
DataCollatorForMultipleChoice,
|
||||
HfArgumentParser,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
default_data_collator,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
|
||||
from transformers.trainer_utils import get_last_checkpoint
|
||||
from transformers.utils import PaddingStrategy, check_min_version, send_example_telemetry
|
||||
from transformers.utils import check_min_version, send_example_telemetry
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
|
||||
@@ -165,63 +164,6 @@ class DataTrainingArguments:
|
||||
assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
|
||||
|
||||
|
||||
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            The tokenizer used for encoding the data.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:

            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence
              is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
              lengths).
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """
        Pad and batch a list of multiple-choice examples.

        Args:
            features: List of dicts. Each dict holds the label under `"label"` or `"labels"`; every other value is
                indexed once per choice, the number of choices being `len(features[0]["input_ids"])`.

        Returns:
            A dict whose padded tensors are viewed as `(batch_size, num_choices, -1)`, plus a `"labels"` entry
            holding the labels as an int64 tensor.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without popping them: the caller's dicts stay intact, so the same
        # examples can be collated again (e.g. on a second pass over an in-memory dataset).
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # Flatten to one entry per (example, choice) pair so the tokenizer pads them all together.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch
|
||||
|
||||
|
||||
def main():
|
||||
# See all possible arguments in src/transformers/training_args.py
|
||||
# or by passing the --help flag to this script.
|
||||
@@ -425,7 +367,9 @@ def main():
|
||||
data_collator = (
|
||||
default_data_collator
|
||||
if data_args.pad_to_max_length
|
||||
else DataCollatorForMultipleChoice(tokenizer=tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)
|
||||
else DataCollatorForMultipleChoice(
|
||||
tokenizer=tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None, return_tensors="pt"
|
||||
)
|
||||
)
|
||||
|
||||
# Metric
|
||||
|
||||
@@ -24,10 +24,8 @@ import logging
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
from dataclasses import dataclass
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
|
||||
import datasets
|
||||
import evaluate
|
||||
@@ -47,12 +45,12 @@ from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForMultipleChoice,
|
||||
AutoTokenizer,
|
||||
PreTrainedTokenizerBase,
|
||||
DataCollatorForMultipleChoice,
|
||||
SchedulerType,
|
||||
default_data_collator,
|
||||
get_scheduler,
|
||||
)
|
||||
from transformers.utils import PaddingStrategy, check_min_version, send_example_telemetry
|
||||
from transformers.utils import check_min_version, send_example_telemetry
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
|
||||
@@ -226,63 +224,6 @@ def parse_args():
|
||||
return args
|
||||
|
||||
|
||||
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            The tokenizer used for encoding the data.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:

            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence
              is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
              lengths).
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """
        Pad and batch a list of multiple-choice examples.

        Args:
            features: List of dicts. Each dict holds the label under `"label"` or `"labels"`; every other value is
                indexed once per choice, the number of choices being `len(features[0]["input_ids"])`.

        Returns:
            A dict whose padded tensors are viewed as `(batch_size, num_choices, -1)`, plus a `"labels"` entry
            holding the labels as an int64 tensor.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without popping them: the caller's dicts stay intact, so the same
        # examples can be collated again (e.g. on a second pass over an in-memory dataset).
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # Flatten to one entry per (example, choice) pair so the tokenizer pads them all together.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
@@ -480,7 +421,9 @@ def main():
|
||||
pad_to_multiple_of = 8
|
||||
else:
|
||||
pad_to_multiple_of = None
|
||||
data_collator = DataCollatorForMultipleChoice(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
|
||||
data_collator = DataCollatorForMultipleChoice(
|
||||
tokenizer, pad_to_multiple_of=pad_to_multiple_of, return_tensors="pt"
|
||||
)
|
||||
|
||||
train_dataloader = DataLoader(
|
||||
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
|
||||
|
||||
@@ -23,21 +23,18 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from IPython.display import clear_output, Image, display\n",
|
||||
"import PIL.Image\n",
|
||||
"import io\n",
|
||||
"import json\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import PIL.Image\n",
|
||||
"from IPython.display import Image, display\n",
|
||||
"from modeling_frcnn import GeneralizedRCNN\n",
|
||||
"from processing_image import Preprocess\n",
|
||||
"from visualizing_image import SingleImageViz\n",
|
||||
"from modeling_frcnn import GeneralizedRCNN\n",
|
||||
"from utils import Config\n",
|
||||
"\n",
|
||||
"import utils\n",
|
||||
"from transformers import LxmertForQuestionAnswering, LxmertTokenizer\n",
|
||||
"import wget\n",
|
||||
"import pickle\n",
|
||||
"import os\n",
|
||||
"from utils import Config\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# URL = \"https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/images/input.jpg\",\n",
|
||||
|
||||
@@ -31,19 +31,19 @@
|
||||
"source": [
|
||||
"# Includes\n",
|
||||
"\n",
|
||||
"import h5py\n",
|
||||
"import os\n",
|
||||
"import json\n",
|
||||
"import os\n",
|
||||
"from collections import OrderedDict\n",
|
||||
"\n",
|
||||
"from scipy import sparse\n",
|
||||
"import h5py\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"from scipy import sparse\n",
|
||||
"from torch import nn\n",
|
||||
"\n",
|
||||
"from transformers import *\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"os.chdir(\"../../\")"
|
||||
]
|
||||
},
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -25,7 +25,7 @@ import sys
|
||||
from dataclasses import dataclass, field
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
from typing import Optional
|
||||
|
||||
import datasets
|
||||
import tensorflow as tf
|
||||
@@ -37,6 +37,7 @@ from transformers import (
|
||||
TF2_WEIGHTS_NAME,
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
DataCollatorForMultipleChoice,
|
||||
DefaultDataCollator,
|
||||
HfArgumentParser,
|
||||
PushToHubCallback,
|
||||
@@ -45,8 +46,7 @@ from transformers import (
|
||||
create_optimizer,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
|
||||
from transformers.utils import PaddingStrategy, check_min_version, send_example_telemetry
|
||||
from transformers.utils import check_min_version, send_example_telemetry
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
|
||||
@@ -55,69 +55,6 @@ check_min_version("4.49.0.dev0")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# region Helper classes and functions
|
||||
|
||||
|
||||
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            The tokenizer used for encoding the data.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:

            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence
              is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
              lengths).
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """
        Pad and batch a list of multiple-choice examples.

        Args:
            features: List of dicts. Each dict holds the label under `"label"` or `"labels"`; every other value is
                indexed once per choice, the number of choices being `len(features[0]["input_ids"])`.

        Returns:
            A dict whose padded arrays are reshaped to `(batch_size, num_choices, -1)` TensorFlow tensors, plus a
            `"labels"` entry holding the labels as an int64 tensor.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without popping them: the caller's dicts stay intact, so the same
        # examples can be collated again (e.g. on a second pass over an in-memory dataset).
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # Flatten to one entry per (example, choice) pair so the tokenizer pads them all together.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="np",
        )

        # Un-flatten
        batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
        return batch
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
# region Arguments
|
||||
@dataclass
|
||||
class ModelArguments:
|
||||
@@ -424,8 +361,7 @@ def main():
|
||||
if data_args.pad_to_max_length:
|
||||
data_collator = DefaultDataCollator(return_tensors="np")
|
||||
else:
|
||||
# custom class defined above, as HF has no data collator for multiple choice
|
||||
data_collator = DataCollatorForMultipleChoice(tokenizer)
|
||||
data_collator = DataCollatorForMultipleChoice(tokenizer, return_tensors="tf")
|
||||
# endregion
|
||||
|
||||
with training_args.strategy.scope():
|
||||
|
||||
113
examples/training/distributed_training.py
Normal file
113
examples/training/distributed_training.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
|
||||
def _env_int(*names):
    """Return, as an int, the first environment variable in ``names`` that is set.

    Raises:
        KeyError: if none of the given variables is present in the environment.
    """
    for name in names:
        value = os.environ.get(name)
        if value is not None:
            return int(value)
    raise KeyError(" / ".join(names))


# Rank and world-size information comes from the launcher's environment:
# the OMPI_COMM_WORLD_* variables when started through mpirun, or
# LOCAL_RANK / WORLD_SIZE / RANK when started through torch.distributed.launch.
# Reading both sets unconditionally fails with KeyError unless both are exported,
# so take whichever is available. The Open MPI values win when both exist,
# which matches the values the constants previously ended up with.
LOCAL_RANK = _env_int("OMPI_COMM_WORLD_LOCAL_RANK", "LOCAL_RANK")
WORLD_SIZE = _env_int("OMPI_COMM_WORLD_SIZE", "WORLD_SIZE")
WORLD_RANK = _env_int("OMPI_COMM_WORLD_RANK", "RANK")
|
||||
|
||||
|
||||
def run(backend):
    """Send a one-element zero tensor from rank 0 to every other rank.

    Args:
        backend: Name of the process-group backend. For ``"nccl"`` the tensor is
            first moved to the CUDA device indexed by ``LOCAL_RANK``.
    """
    payload = torch.zeros(1)
    if backend == "nccl":
        # Need to put tensor on a GPU device for nccl backend
        payload = payload.to(torch.device("cuda:{}".format(LOCAL_RANK)))

    # Every rank other than 0 only receives, then is done.
    if WORLD_RANK != 0:
        dist.recv(tensor=payload, src=0)
        print("worker_{} has received data from rank {}\n".format(WORLD_RANK, 0))
        return

    # Rank 0 sends the tensor to each of the other ranks in turn.
    for rank_recv in range(1, WORLD_SIZE):
        dist.send(tensor=payload, dst=rank_recv)
        print("worker_{} sent data to Rank {}\n".format(0, rank_recv))
|
||||
|
||||
|
||||
def init_processes(backend):
    """Join the process group with the given backend, then run the send/receive demo.

    Args:
        backend: Backend name passed to ``dist.init_process_group`` and to ``run``.
    """
    dist.init_process_group(backend=backend, world_size=WORLD_SIZE, rank=WORLD_RANK)
    run(backend)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Newer torch launchers pass the local rank as ``--local-rank`` while older ones use
    # ``--local_rank``; accept both spellings so argparse does not reject the launcher's
    # argument. Either form is stored as ``args.local_rank``.
    parser.add_argument(
        "--local-rank",
        "--local_rank",
        dest="local_rank",
        type=int,
        help="Local rank. Necessary for using the torch.distributed.launch utility.",
    )
    parser.add_argument("--backend", type=str, default="nccl", choices=["nccl", "gloo"])
    args = parser.parse_args()

    init_processes(backend=args.backend)
|
||||
|
||||
""""
|
||||
python -m torch.distributed.launch \
|
||||
--nproc_per_node=2 --nnodes=2 --node_rank=0 \
|
||||
test_compile.py
|
||||
|
||||
python3 -m torch.distributed.launch \
|
||||
--nproc_per_node=2 --nnodes=2 --node_rank=1 \
|
||||
--master_addr=104.171.200.62 --master_port=1234 \
|
||||
main.py \
|
||||
--backend=nccl --use_syn --batch_size=8192 --arch=resnet152
|
||||
|
||||
|
||||
|
||||
mpirun -np 4 \
|
||||
-H 104.171.200.62:2,104.171.200.182:2 \
|
||||
-x MASTER_ADDR=104.171.200.62 \
|
||||
-x MASTER_PORT=1234 \
|
||||
-x PATH \
|
||||
-bind-to none -map-by slot \
|
||||
-mca pml ob1 -mca btl ^openib \
|
||||
python3 main.py
|
||||
"""
|
||||
|
||||
|
||||
""""
|
||||
You need a hostfile that lists the names of the hosts to run on.
|
||||
For example, I have arthur@ip-26-0-162-46 and arthur@ip-26-0-162-239:
|
||||
|
||||
________
|
||||
hostfile
|
||||
ip-26-0-162-46 slots=8
|
||||
ip-26-0-162-239 slots=8
|
||||
________
|
||||
|
||||
mpirun --hostfile hostfile -np 16 \
|
||||
--bind-to none --map-by slot \
|
||||
-x MASTER_ADDR=<master-node-ip> \
|
||||
-x MASTER_PORT=29500 \
|
||||
-x NCCL_DEBUG=INFO \
|
||||
-x NCCL_SOCKET_IFNAME=^lo,docker0 \
|
||||
-x CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
python your_script.py --backend nccl
|
||||
|
||||
|
||||
To get the master IP, run:
|
||||
hostname -I | awk '{print $1}'
|
||||
|
||||
|
||||
Use `ping ip-26-0-162-46` to check if connected
|
||||
|
||||
26.0.162.46
|
||||
|
||||
mpirun --hostfile hostfile -np 16 \
|
||||
--bind-to none --map-by slot \
|
||||
-x MASTER_ADDR=26.0.162.46 \
|
||||
-x MASTER_PORT=29500 \
|
||||
-x NCCL_DEBUG=INFO \
|
||||
-x NCCL_SOCKET_IFNAME=^lo,docker0 \
|
||||
-x CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
python your_script.py --backend nccl
|
||||
|
||||
|
||||
mpirun --hostfile hostfile -np 2 -x NCCL_DEBUG=INFO python -c "import os;print(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])" -b 8 -e 128M -f 2 -g 1
|
||||
to test your setup
|
||||
"""
|
||||
Reference in New Issue
Block a user