From 854260ca44080a13bbf1937c3c6ce3a2d17aba07 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 31 Aug 2021 13:06:48 +0100 Subject: [PATCH] TF/Numpy variants for all DataCollator classes (#13105) * Adding a TF variant of the DataCollatorForTokenClassification to get feedback * Added a Numpy variant and a post_init check to fail early if a missing import is found * Fixed call to Numpy variant * Added a couple more of the collators * Update src/transformers/data/data_collator.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Fixes, style pass, finished DataCollatorForSeqToSeq * Added all the LanguageModeling DataCollators, except SOP and PermutationLanguageModeling * Adding DataCollatorForPermutationLanguageModeling * Style pass * Add missing `__call__` for PLM * Remove `post_init` checks for frameworks because the imports inside them were making us fail code quality checks * Remove unused imports * First attempt at some TF tests * A second attempt to make any of those tests actually work * TF tests, round three * TF tests, round four * TF tests, round five * TF tests, all enabled! * Style pass * Merging tests into `test_data_collator.py` * Merging tests into `test_data_collator.py` * Fixing up test imports * Fixing up test imports * Trying shuffling the conditionals around * Commenting out non-functional old tests * Completed all tests for all three frameworks * Style pass * Fixed test typo * Style pass * Move standard `__call__` method to mixin * Rearranged imports for `test_data_collator` * Fix data collator typo "torch" -> "pt" * Fixed the most embarrassingly obvious bug * Update src/transformers/data/data_collator.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Renaming mixin * Updating docs Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Dalton Walker Co-authored-by: Andrew Romans --- docs/source/main_classes/data_collator.rst | 6 +- src/transformers/__init__.py | 44 +- src/transformers/data/data_collator.py | 848 +++++++++++++++++++-- src/transformers/utils/dummy_pt_objects.py | 56 -- tests/test_data_collator.py | 557 +++++++++++++- 5 files changed, 1371 insertions(+), 140 deletions(-) diff --git a/docs/source/main_classes/data_collator.rst b/docs/source/main_classes/data_collator.rst index 5e406f312d..4232d05abc 100644 --- a/docs/source/main_classes/data_collator.rst +++ b/docs/source/main_classes/data_collator.rst @@ -54,18 +54,18 @@ DataCollatorForLanguageModeling ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.data.data_collator.DataCollatorForLanguageModeling - :members: mask_tokens + :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens DataCollatorForWholeWordMask ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.data.data_collator.DataCollatorForWholeWordMask - :members: mask_tokens + :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens DataCollatorForPermutationLanguageModeling ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.data.data_collator.DataCollatorForPermutationLanguageModeling - :members: mask_tokens + :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6ea088ee49..1d0d81700f 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -81,6 +81,17 @@ _import_structure = { "xnli_processors", "xnli_tasks_num_labels", ], + "data.data_collator": [ + "DataCollator", + "DataCollatorForLanguageModeling", + "DataCollatorForPermutationLanguageModeling", + "DataCollatorForSeq2Seq", + "DataCollatorForSOP", + "DataCollatorForTokenClassification", + "DataCollatorForWholeWordMask", + "DataCollatorWithPadding", + "default_data_collator", + ], "feature_extraction_sequence_utils": ["BatchFeature", "SequenceFeatureExtractor"], "file_utils": [ "CONFIG_NAME", @@ -460,17 +471,6 @@ else: if is_torch_available(): _import_structure["benchmark.benchmark"] = ["PyTorchBenchmark"] _import_structure["benchmark.benchmark_args"] = ["PyTorchBenchmarkArguments"] - _import_structure["data.data_collator"] = [ - "DataCollator", - "DataCollatorForLanguageModeling", - "DataCollatorForPermutationLanguageModeling", - "DataCollatorForSeq2Seq", - "DataCollatorForSOP", - "DataCollatorForTokenClassification", - "DataCollatorForWholeWordMask", - "DataCollatorWithPadding", - "default_data_collator", - ] _import_structure["data.datasets"] = [ "GlueDataset", "GlueDataTrainingArguments", @@ -1830,6 +1830,17 @@ if TYPE_CHECKING: xnli_processors, xnli_tasks_num_labels, ) + from .data.data_collator import ( + DataCollator, + DataCollatorForLanguageModeling, + DataCollatorForPermutationLanguageModeling, + DataCollatorForSeq2Seq, + DataCollatorForSOP, + DataCollatorForTokenClassification, + DataCollatorForWholeWordMask, + DataCollatorWithPadding, + default_data_collator, + ) # Feature Extractor from .feature_extraction_utils import BatchFeature, SequenceFeatureExtractor @@ -2174,17 +2185,6 @@ if TYPE_CHECKING: # Benchmarks from .benchmark.benchmark import PyTorchBenchmark from .benchmark.benchmark_args import PyTorchBenchmarkArguments - from .data.data_collator import ( - DataCollator, - DataCollatorForLanguageModeling, - DataCollatorForPermutationLanguageModeling, - DataCollatorForSeq2Seq, - DataCollatorForSOP, - DataCollatorForTokenClassification, - DataCollatorForWholeWordMask, - DataCollatorWithPadding, - default_data_collator, - ) from .data.datasets import ( GlueDataset, GlueDataTrainingArguments, diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index ac8f8f620a..6f7214837e 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -17,11 +17,7 @@ import warnings from dataclasses import dataclass from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union -import torch -from torch.nn.utils.rnn import pad_sequence - from ..file_utils import PaddingStrategy -from ..modeling_utils import PreTrainedModel from ..models.bert import BertTokenizer, BertTokenizerFast from ..tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase @@ -30,12 +26,26 @@ InputDataClass = NewType("InputDataClass", Any) """ A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary -of Tensors. +of PyTorch/TensorFlow tensors or NumPy arrays. """ -DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, torch.Tensor]]) +DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, Any]]) -def default_data_collator(features: List[InputDataClass]) -> Dict[str, torch.Tensor]: +class DataCollatorMixin: + def __call__(self, features, return_tensors=None): + if return_tensors is None: + return_tensors = self.return_tensors + if return_tensors == "tf": + return self.tf_call(features) + elif return_tensors == "pt": + return self.torch_call(features) + elif return_tensors == "np": + return self.numpy_call(features) + else: + raise ValueError(f"Framework '{return_tensors}' not recognized!") + + +def default_data_collator(features: List[InputDataClass], return_tensors="pt") -> Dict[str, Any]: """ Very simple data collator that simply collates batches of dict-like objects and performs special handling for potential keys named: @@ -51,9 +61,25 @@ def default_data_collator(features: List[InputDataClass]) -> Dict[str, torch.Ten # have the same attributes. # So we will look at the first element as a proxy for what attributes exist # on the whole batch. + + if return_tensors == "pt": + return torch_default_data_collator(features) + elif return_tensors == "tf": + return tf_default_data_collator(features) + elif return_tensors == "np": + return numpy_default_data_collator(features) + + +@dataclass +class DefaultDataCollator(DataCollatorMixin): + return_tensors: str = "pt" + + +def torch_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]: + import torch + if not isinstance(features[0], (dict, BatchEncoding)): features = [vars(f) for f in features] - first = features[0] batch = {} @@ -83,6 +109,81 @@ def default_data_collator(features: List[InputDataClass]) -> Dict[str, torch.Ten return batch +def tf_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]: + import numpy as np + import tensorflow as tf + + if not isinstance(features[0], (dict, BatchEncoding)): + features = [vars(f) for f in features] + first = features[0] + batch = {} + + # Special handling for labels. + # Ensure that tensor is created with the correct type + # (it should be automatically the case, but let's make sure of it.) + if "label" in first and first["label"] is not None: + if isinstance(first["label"], tf.Tensor): + dtype = tf.int64 if first["label"].dtype.is_integer() else tf.float32 + elif isinstance(first["label"], np.ndarray): + dtype = tf.int64 if np.issubdtype(first["label"].dtype, np.integer) else tf.float32 + elif isinstance(first["label"], (tuple, list)): + dtype = tf.int64 if isinstance(first["label"][0], int) else tf.float32 + else: + dtype = tf.int64 if isinstance(first["label"], int) else tf.float32 + batch["labels"] = tf.convert_to_tensor([f["label"] for f in features], dtype=dtype) + elif "label_ids" in first and first["label_ids"] is not None: + if isinstance(first["label_ids"], tf.Tensor): + batch["labels"] = tf.stack([f["label_ids"] for f in features]) + else: + dtype = tf.int64 if type(first["label_ids"][0]) is int else tf.float32 + batch["labels"] = tf.convert_to_tensor([f["label_ids"] for f in features], dtype=dtype) + + # Handling of all other possible keys. + # Again, we will use the first element to figure out which key/values are not None for this model. + for k, v in first.items(): + if k not in ("label", "label_ids") and v is not None and not isinstance(v, str): + if isinstance(v, (tf.Tensor, np.ndarray)): + batch[k] = tf.stack([f[k] for f in features]) + else: + batch[k] = tf.convert_to_tensor([f[k] for f in features]) + + return batch + + +def numpy_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]: + import numpy as np + + if not isinstance(features[0], (dict, BatchEncoding)): + features = [vars(f) for f in features] + first = features[0] + batch = {} + + # Special handling for labels. + # Ensure that tensor is created with the correct type + # (it should be automatically the case, but let's make sure of it.) + if "label" in first and first["label"] is not None: + label = first["label"].item() if isinstance(first["label"], np.ndarray) else first["label"] + dtype = np.int64 if isinstance(label, int) else np.float32 + batch["labels"] = np.array([f["label"] for f in features], dtype=dtype) + elif "label_ids" in first and first["label_ids"] is not None: + if isinstance(first["label_ids"], np.ndarray): + batch["labels"] = np.stack([f["label_ids"] for f in features]) + else: + dtype = np.int64 if type(first["label_ids"][0]) is int else np.float32 + batch["labels"] = np.array([f["label_ids"] for f in features], dtype=dtype) + + # Handling of all other possible keys. + # Again, we will use the first element to figure out which key/values are not None for this model. + for k, v in first.items(): + if k not in ("label", "label_ids") and v is not None and not isinstance(v, str): + if isinstance(v, np.ndarray): + batch[k] = np.stack([f[k] for f in features]) + else: + batch[k] = np.array([f[k] for f in features]) + + return batch + + @dataclass class DataCollatorWithPadding: """ @@ -114,14 +215,15 @@ class DataCollatorWithPadding: padding: Union[bool, str, PaddingStrategy] = True max_length: Optional[int] = None pad_to_multiple_of: Optional[int] = None + return_tensors: str = "pt" - def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: + def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: batch = self.tokenizer.pad( features, padding=self.padding, max_length=self.max_length, pad_to_multiple_of=self.pad_to_multiple_of, - return_tensors="pt", + return_tensors=self.return_tensors, ) if "label" in batch: batch["labels"] = batch["label"] @@ -133,7 +235,7 @@ class DataCollatorWithPadding: @dataclass -class DataCollatorForTokenClassification: +class DataCollatorForTokenClassification(DataCollatorMixin): """ Data collator that will dynamically pad the inputs received, as well as the labels. @@ -166,8 +268,11 @@ class DataCollatorForTokenClassification: max_length: Optional[int] = None pad_to_multiple_of: Optional[int] = None label_pad_token_id: int = -100 + return_tensors: str = "pt" + + def torch_call(self, features): + import torch - def __call__(self, features): label_name = "label" if "label" in features[0].keys() else "labels" labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None batch = self.tokenizer.pad( @@ -196,15 +301,74 @@ class DataCollatorForTokenClassification: batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()} return batch + def tf_call(self, features): + import tensorflow as tf -def _collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None): + label_name = "label" if "label" in features[0].keys() else "labels" + labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None + batch = self.tokenizer.pad( + features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + # Conversion to tensors will fail if we have labels as they are not of the same length yet. + return_tensors="tf" if labels is None else None, + ) + + if labels is None: + return batch + + sequence_length = tf.convert_to_tensor(batch["input_ids"]).shape[1] + padding_side = self.tokenizer.padding_side + if padding_side == "right": + batch["labels"] = [label + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels] + else: + batch["labels"] = [[self.label_pad_token_id] * (sequence_length - len(label)) + label for label in labels] + + batch = {k: tf.convert_to_tensor(v, dtype=tf.int64) for k, v in batch.items()} + return batch + + def numpy_call(self, features): + import numpy as np + + label_name = "label" if "label" in features[0].keys() else "labels" + labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None + batch = self.tokenizer.pad( + features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + # Conversion to tensors will fail if we have labels as they are not of the same length yet. + return_tensors="np" if labels is None else None, + ) + + if labels is None: + return batch + + sequence_length = np.array(batch["input_ids"]).shape[1] + padding_side = self.tokenizer.padding_side + if padding_side == "right": + batch["labels"] = [label + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels] + else: + batch["labels"] = [[self.label_pad_token_id] * (sequence_length - len(label)) + label for label in labels] + + batch = {k: np.array(v, dtype=np.int64) for k, v in batch.items()} + return batch + + +def _torch_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None): """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary.""" + import numpy as np + import torch + # Tensorize if necessary. - if isinstance(examples[0], (list, tuple)): + if isinstance(examples[0], (list, tuple, np.ndarray)): examples = [torch.tensor(e, dtype=torch.long) for e in examples] - # Check if padding is necessary. length_of_first = examples[0].size(0) + + # Check if padding is necessary. + are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0): return torch.stack(examples, dim=0) @@ -229,8 +393,85 @@ def _collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None return result -def tolist(x: Union[List[Any], torch.Tensor]): - return x.tolist() if isinstance(x, torch.Tensor) else x +def _tf_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None): + import numpy as np + import tensorflow as tf + + """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary.""" + # Tensorize if necessary. + if isinstance(examples[0], (list, tuple)): + examples = [tf.convert_to_tensor(e, dtype=tf.int64) for e in examples] + + # Check if padding is necessary. + length_of_first = len(examples[0]) + are_tensors_same_length = all(len(x) == length_of_first for x in examples) + if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0): + return tf.stack(examples, axis=0) + + # If yes, check if we have a `pad_token`. + if tokenizer._pad_token is None: + raise ValueError( + "You are attempting to pad samples but the tokenizer you are using" + f" ({tokenizer.__class__.__name__}) does not have a pad token." + ) + + # Creating the full tensor and filling it with our data. + max_length = max(len(x) for x in examples) + if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + # result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id) + result = [] + rank = tf.rank(examples[0]) + paddings = np.zeros((rank, 2), dtype=np.int32) + for example in examples: + if tokenizer.padding_side == "right": + paddings[0, 1] = max_length - len(example) + else: + paddings[0, 0] = max_length - len(example) + result.append(tf.pad(example, paddings, constant_values=tokenizer.pad_token_id)) + return tf.stack(result, axis=0) + + +def _numpy_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None): + import numpy as np + + """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary.""" + # Tensorize if necessary. + if isinstance(examples[0], (list, tuple)): + examples = [np.array(e, dtype=np.int64) for e in examples] + + # Check if padding is necessary. + length_of_first = len(examples[0]) + are_tensors_same_length = all(len(x) == length_of_first for x in examples) + if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0): + return np.stack(examples, axis=0) + + # If yes, check if we have a `pad_token`. + if tokenizer._pad_token is None: + raise ValueError( + "You are attempting to pad samples but the tokenizer you are using" + f" ({tokenizer.__class__.__name__}) does not have a pad token." + ) + + # Creating the full tensor and filling it with our data. + max_length = max(len(x) for x in examples) + if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + result = np.full(shape=(len(examples), max_length), fill_value=tokenizer.pad_token_id, dtype=examples[0].dtype) + for i, example in enumerate(examples): + if tokenizer.padding_side == "right": + result[i, : example.shape[0]] = example + else: + result[i, -example.shape[0] :] = example + return result + + +def tolist(x): + if isinstance(x, list): + return x + elif hasattr(x, "numpy"): # Checks for TF tensors without needing the import + x = x.numpy() + return x.tolist() @dataclass @@ -268,13 +509,16 @@ class DataCollatorForSeq2Seq: """ tokenizer: PreTrainedTokenizerBase - model: Optional[PreTrainedModel] = None + model: Optional[Any] = None padding: Union[bool, str, PaddingStrategy] = True max_length: Optional[int] = None pad_to_multiple_of: Optional[int] = None label_pad_token_id: int = -100 + return_tensors: str = "pt" - def __call__(self, features): + def __call__(self, features, return_tensors=None): + if return_tensors is None: + return_tensors = self.return_tensors labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the # same length to return tensors. @@ -292,7 +536,7 @@ class DataCollatorForSeq2Seq: padding=self.padding, max_length=self.max_length, pad_to_multiple_of=self.pad_to_multiple_of, - return_tensors="pt", + return_tensors=return_tensors, ) # prepare decoder_input_ids @@ -304,7 +548,7 @@ class DataCollatorForSeq2Seq: @dataclass -class DataCollatorForLanguageModeling: +class DataCollatorForLanguageModeling(DataCollatorMixin): """ Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they are not all of the same length. @@ -333,6 +577,8 @@ class DataCollatorForLanguageModeling: mlm: bool = True mlm_probability: float = 0.15 pad_to_multiple_of: Optional[int] = None + tf_experimental_compile: bool = False + return_tensors: str = "pt" def __post_init__(self): if self.mlm and self.tokenizer.mask_token is None: @@ -340,20 +586,98 @@ class DataCollatorForLanguageModeling: "This tokenizer does not have a mask token which is necessary for masked language modeling. " "You should pass `mlm=False` to train on causal language modeling instead." ) + if self.tf_experimental_compile: + import tensorflow as tf + + self.tf_mask_tokens = tf.function(self.tf_mask_tokens, jit_compile=True) + + @staticmethod + def tf_bernoulli(shape, probability): + import tensorflow as tf + + prob_matrix = tf.fill(shape, probability) + return tf.cast(prob_matrix - tf.random.uniform(shape, 0, 1) >= 0, tf.bool) + + def tf_mask_tokens( + self, inputs: Any, vocab_size, mask_token_id, special_tokens_mask: Optional[Any] = None + ) -> Tuple[Any, Any]: + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. + """ + import tensorflow as tf + + input_shape = tf.shape(inputs) + # 1 for a special token, 0 for a normal token in the special tokens mask + # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`) + masked_indices = self.tf_bernoulli(input_shape, self.mlm_probability) & ~special_tokens_mask + # Replace unmasked indices with -100 in the labels since we only compute loss on masked tokens + labels = tf.where(masked_indices, inputs, -100) + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = self.tf_bernoulli(input_shape, 0.8) & masked_indices + + inputs = tf.where(indices_replaced, mask_token_id, inputs) + + # 10% of the time, we replace masked input tokens with random word + indices_random = self.tf_bernoulli(input_shape, 0.1) & masked_indices & ~indices_replaced + random_words = tf.random.uniform(input_shape, maxval=vocab_size, dtype=tf.int64) + inputs = tf.where(indices_random, random_words, inputs) + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels + + def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: + import tensorflow as tf - def __call__( - self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]] - ) -> Dict[str, torch.Tensor]: # Handle dict or lists with proper padding and conversion to tensor. if isinstance(examples[0], (dict, BatchEncoding)): - batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of) + batch = self.tokenizer.pad(examples, return_tensors="tf", pad_to_multiple_of=self.pad_to_multiple_of) else: - batch = {"input_ids": _collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)} + batch = { + "input_ids": _tf_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) + } # If special token mask has been preprocessed, pop it from the dict. special_tokens_mask = batch.pop("special_tokens_mask", None) if self.mlm: - batch["input_ids"], batch["labels"] = self.mask_tokens( + if special_tokens_mask is None: + special_tokens_mask = [ + self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) + for val in batch["input_ids"].numpy().tolist() + ] + # Cannot directly create as bool + special_tokens_mask = tf.cast(tf.convert_to_tensor(special_tokens_mask, dtype=tf.int64), tf.bool) + else: + special_tokens_mask = tf.cast(special_tokens_mask, tf.bool) + batch["input_ids"], batch["labels"] = self.tf_mask_tokens( + tf.cast(batch["input_ids"], tf.int64), + special_tokens_mask=special_tokens_mask, + mask_token_id=self.tokenizer.mask_token_id, + vocab_size=len(self.tokenizer), + ) + else: + labels = batch["input_ids"] + if self.tokenizer.pad_token_id is not None: + # Replace self.tokenizer.pad_token_id with -100 + labels = tf.where(labels == self.tokenizer.pad_token_id, -100, labels) + else: + labels = tf.identity(labels) # Makes a copy, just in case + batch["labels"] = labels + return batch + + def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: + # Handle dict or lists with proper padding and conversion to tensor. + if isinstance(examples[0], (dict, BatchEncoding)): + batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of) + else: + batch = { + "input_ids": _torch_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) + } + + # If special token mask has been preprocessed, pop it from the dict. + special_tokens_mask = batch.pop("special_tokens_mask", None) + if self.mlm: + batch["input_ids"], batch["labels"] = self.torch_mask_tokens( batch["input_ids"], special_tokens_mask=special_tokens_mask ) else: @@ -363,12 +687,12 @@ class DataCollatorForLanguageModeling: batch["labels"] = labels return batch - def mask_tokens( - self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor]: + def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ + import torch + labels = inputs.clone() # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`) probability_matrix = torch.full(labels.shape, self.mlm_probability) @@ -396,6 +720,69 @@ class DataCollatorForLanguageModeling: # The rest of the time (10% of the time) we keep the masked input tokens unchanged return inputs, labels + def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: + import numpy as np + + # Handle dict or lists with proper padding and conversion to tensor. + if isinstance(examples[0], (dict, BatchEncoding)): + batch = self.tokenizer.pad(examples, return_tensors="np", pad_to_multiple_of=self.pad_to_multiple_of) + else: + batch = { + "input_ids": _numpy_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) + } + + # If special token mask has been preprocessed, pop it from the dict. + special_tokens_mask = batch.pop("special_tokens_mask", None) + if self.mlm: + batch["input_ids"], batch["labels"] = self.numpy_mask_tokens( + batch["input_ids"], special_tokens_mask=special_tokens_mask + ) + else: + labels = np.copy(batch["input_ids"]) + if self.tokenizer.pad_token_id is not None: + labels[labels == self.tokenizer.pad_token_id] = -100 + batch["labels"] = labels + return batch + + def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]: + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. + """ + import numpy as np + + labels = np.copy(inputs) + # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`) + probability_matrix = np.full(labels.shape, self.mlm_probability) + if special_tokens_mask is None: + special_tokens_mask = [ + self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + ] + special_tokens_mask = np.array(special_tokens_mask, dtype=np.bool) + else: + special_tokens_mask = special_tokens_mask.astype(np.bool) + + probability_matrix[special_tokens_mask] = 0 + # Numpy doesn't have bernoulli, so we use a binomial with 1 trial + masked_indices = np.random.binomial(1, probability_matrix, size=probability_matrix.shape).astype(np.bool) + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = np.random.binomial(1, 0.8, size=labels.shape).astype(np.bool) & masked_indices + inputs[indices_replaced] = self.tokenizer.mask_token_id + + # 10% of the time, we replace masked input tokens with random word + # indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced + indices_random = ( + np.random.binomial(1, 0.5, size=labels.shape).astype(np.bool) & masked_indices & ~indices_replaced + ) + random_words = np.random.randint( + low=0, high=len(self.tokenizer), size=np.count_nonzero(indices_random), dtype=np.int64 + ) + inputs[indices_random] = random_words + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels + @dataclass class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling): @@ -413,16 +800,14 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling): :class:`.DataCollatorForLanguageModeling`. """ - def __call__( - self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]] - ) -> Dict[str, torch.Tensor]: + def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: if isinstance(examples[0], (dict, BatchEncoding)): input_ids = [e["input_ids"] for e in examples] else: input_ids = examples examples = [{"input_ids": e} for e in examples] - batch_input = _collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) + batch_input = _torch_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) mask_labels = [] for e in examples: @@ -439,8 +824,64 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling): if i in ref_pos: ref_tokens[i] = "##" + ref_tokens[i] mask_labels.append(self._whole_word_mask(ref_tokens)) - batch_mask = _collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) - inputs, labels = self.mask_tokens(batch_input, batch_mask) + batch_mask = _torch_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) + inputs, labels = self.torch_mask_tokens(batch_input, batch_mask) + return {"input_ids": inputs, "labels": labels} + + def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: + if isinstance(examples[0], (dict, BatchEncoding)): + input_ids = [e["input_ids"] for e in examples] + else: + input_ids = examples + examples = [{"input_ids": e} for e in examples] + + batch_input = _tf_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) + + mask_labels = [] + for e in examples: + ref_tokens = [] + for id in tolist(e["input_ids"]): + token = self.tokenizer._convert_id_to_token(id) + ref_tokens.append(token) + + # For Chinese tokens, we need extra inf to mark sub-word, e.g [喜,欢]-> [喜,##欢] + if "chinese_ref" in e: + ref_pos = tolist(e["chinese_ref"]) + len_seq = len(e["input_ids"]) + for i in range(len_seq): + if i in ref_pos: + ref_tokens[i] = "##" + ref_tokens[i] + mask_labels.append(self._whole_word_mask(ref_tokens)) + batch_mask = _tf_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) + inputs, labels = self.tf_mask_tokens(batch_input, batch_mask) + return {"input_ids": inputs, "labels": labels} + + def np_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: + if isinstance(examples[0], (dict, BatchEncoding)): + input_ids = [e["input_ids"] for e in examples] + else: + input_ids = examples + examples = [{"input_ids": e} for e in examples] + + batch_input = _tf_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) + + mask_labels = [] + for e in examples: + ref_tokens = [] + for id in tolist(e["input_ids"]): + token = self.tokenizer._convert_id_to_token(id) + ref_tokens.append(token) + + # For Chinese tokens, we need extra inf to mark sub-word, e.g [喜,欢]-> [喜,##欢] + if "chinese_ref" in e: + ref_pos = tolist(e["chinese_ref"]) + len_seq = len(e["input_ids"]) + for i in range(len_seq): + if i in ref_pos: + ref_tokens[i] = "##" + ref_tokens[i] + mask_labels.append(self._whole_word_mask(ref_tokens)) + batch_mask = _numpy_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) + inputs, labels = self.numpy_mask_tokens(batch_input, batch_mask) return {"input_ids": inputs, "labels": labels} def _whole_word_mask(self, input_tokens: List[str], max_predictions=512): @@ -489,11 +930,12 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling): mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))] return mask_labels - def mask_tokens(self, inputs: torch.Tensor, mask_labels: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + def torch_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref. """ + import torch if self.tokenizer.mask_token is None: raise ValueError( @@ -527,6 +969,90 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling): # The rest of the time (10% of the time) we keep the masked input tokens unchanged return inputs, labels + def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set + 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref. + """ + import tensorflow as tf + + input_shape = tf.shape(inputs) + if self.tokenizer.mask_token is None: + raise ValueError( + "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." + ) + labels = inputs.clone() + # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) + + masked_indices = tf.cast(mask_labels, tf.bool) + + special_tokens_mask = [ + self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + ] + masked_indices = masked_indices & ~tf.convert_to_tensor(special_tokens_mask, dtype=tf.bool) + if self.tokenizer._pad_token is not None: + padding_mask = inputs == self.tokenizer.pad_token_id + masked_indices = masked_indices & ~padding_mask + + # Replace unmasked indices with -100 in the labels since we only compute loss on masked tokens + labels = tf.where(masked_indices, inputs, -100) + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = self.tf_bernoulli(input_shape, 0.8) & masked_indices + + inputs = tf.where(indices_replaced, self.tokenizer.mask_token_id, inputs) + + # 10% of the time, we replace masked input tokens with random word + indices_random = self.tf_bernoulli(input_shape, 0.1) & masked_indices & ~indices_replaced + random_words = tf.random.uniform(input_shape, maxval=len(self.tokenizer), dtype=tf.int64) + inputs = tf.where(indices_random, random_words, inputs) + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels + + def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set + 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref. + """ + import numpy as np + + if self.tokenizer.mask_token is None: + raise ValueError( + "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." + ) + labels = np.copy(inputs) + # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) + + masked_indices = mask_labels.astype(np.bool) + + special_tokens_mask = [ + self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + ] + masked_indices[np.array(special_tokens_mask, dtype=np.bool)] = 0 + if self.tokenizer._pad_token is not None: + padding_mask = labels == self.tokenizer.pad_token_id + masked_indices[padding_mask] = 0 + + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = np.random.binomial(1, 0.8, size=labels.shape).astype(np.bool) & masked_indices + inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) + + # 10% of the time, we replace masked input tokens with random word + # indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced + indices_random = ( + np.random.binomial(1, 0.5, size=labels.shape).astype(np.bool) & masked_indices & ~indices_replaced + ) + random_words = np.random.randint( + low=0, high=len(self.tokenizer), size=np.count_nonzero(indices_random), dtype=np.int64 + ) + inputs[indices_random] = random_words[indices_random] + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels + @dataclass class DataCollatorForSOP(DataCollatorForLanguageModeling): @@ -544,9 +1070,12 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling): FutureWarning, ) - def __call__(self, examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]: + def __call__(self, examples: List[Dict[str, Any]]) -> Dict[str, Any]: + import torch + from torch.nn.utils.rnn import pad_sequence + input_ids = [example["input_ids"] for example in examples] - input_ids = _collate_batch(input_ids, self.tokenizer) + input_ids = _torch_collate_batch(input_ids, self.tokenizer) input_ids, labels, attention_mask = self.mask_tokens(input_ids) token_type_ids = [example["token_type_ids"] for example in examples] @@ -564,11 +1093,13 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling): "sentence_order_label": sentence_order_label, } - def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any]: """ Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10% original. N-gram not applied yet. """ + import torch + if self.tokenizer.mask_token is None: raise ValueError( "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." @@ -606,7 +1137,7 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling): @dataclass -class DataCollatorForPermutationLanguageModeling: +class DataCollatorForPermutationLanguageModeling(DataCollatorMixin): """ Data collator used for permutation language modeling. @@ -617,17 +1148,30 @@ class DataCollatorForPermutationLanguageModeling: tokenizer: PreTrainedTokenizerBase plm_probability: float = 1 / 6 max_span_length: int = 5 # maximum length of a span of masked tokens + return_tensors: str = "pt" - def __call__( - self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]] - ) -> Dict[str, torch.Tensor]: + def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: if isinstance(examples[0], (dict, BatchEncoding)): examples = [e["input_ids"] for e in examples] - batch = _collate_batch(examples, self.tokenizer) - inputs, perm_mask, target_mapping, labels = self.mask_tokens(batch) + batch = _torch_collate_batch(examples, self.tokenizer) + inputs, perm_mask, target_mapping, labels = self.torch_mask_tokens(batch) return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels} - def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: + if isinstance(examples[0], (dict, BatchEncoding)): + examples = [e["input_ids"] for e in examples] + batch = _tf_collate_batch(examples, self.tokenizer) + inputs, perm_mask, target_mapping, labels = self.tf_mask_tokens(batch) + return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels} + + def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: + if isinstance(examples[0], (dict, BatchEncoding)): + examples = [e["input_ids"] for e in examples] + batch = _numpy_collate_batch(examples, self.tokenizer) + inputs, perm_mask, target_mapping, labels = self.numpy_mask_tokens(batch) + return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels} + + def torch_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: """ The masked tokens to be predicted for a particular sequence are determined by the following algorithm: @@ -641,6 +1185,7 @@ class DataCollatorForPermutationLanguageModeling: 4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in the sequence to be processed), repeat from Step 1. """ + import torch if self.tokenizer.mask_token is None: raise ValueError( @@ -723,3 +1268,212 @@ class DataCollatorForPermutationLanguageModeling: ) & masked_indices[i] return inputs.long(), perm_mask, target_mapping, labels.long() + + def tf_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: + """ + The masked tokens to be predicted for a particular sequence are determined by the following algorithm: + + 0. Start from the beginning of the sequence by setting ``cur_len = 0`` (number of tokens processed so far). + 1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be + masked) + 2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be + masked + 3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length - + span_length]`` and mask tokens ``start_index:start_index + span_length`` + 4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in + the sequence to be processed), repeat from Step 1. + """ + from random import randint + + import numpy as np + import tensorflow as tf + + if self.tokenizer.mask_token is None: + raise ValueError( + "This tokenizer does not have a mask token which is necessary for permutation language modeling. Please add a mask token if you want to use this tokenizer." + ) + + if tf.shape(inputs)[1] % 2 != 0: + raise ValueError( + "This collator requires that sequence lengths be even to create a leakage-free perm_mask. Please see relevant comments in source code for details." + ) + + labels = tf.identity(inputs) + # Creating the mask and target_mapping tensors + masked_indices = np.full(labels.shape.as_list(), 0, dtype=np.bool) + labels_shape = tf.shape(labels) + target_mapping = np.zeros((labels_shape[0], labels_shape[1], labels_shape[1]), dtype=np.float32) + + for i in range(len(labels)): + # Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far). + cur_len = 0 + max_len = tf.shape(labels)[1] + + while cur_len < max_len: + # Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked) + span_length = randint(1, self.max_span_length + 1) + # Reserve a context of length `context_length = span_length / plm_probability` to surround the span to be masked + context_length = int(span_length / self.plm_probability) + # Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length` + start_index = cur_len + randint(0, context_length - span_length + 1) + masked_indices[i, start_index : start_index + span_length] = 1 + # Set `cur_len = cur_len + context_length` + cur_len += context_length + + # Since we're replacing non-masked tokens with -100 in the labels tensor instead of skipping them altogether, + # the i-th predict corresponds to the i-th token. + target_mapping[i] = np.eye(labels_shape[1]) + masked_indices = tf.cast(tf.convert_to_tensor(masked_indices), dtype=tf.bool) + target_mapping = tf.convert_to_tensor(target_mapping) + special_tokens_mask = tf.convert_to_tensor( + [ + self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) + for val in labels.numpy().tolist() + ], + ) + special_tokens_mask = tf.cast(special_tokens_mask, dtype=tf.bool) + masked_indices = masked_indices & ~special_tokens_mask + if self.tokenizer._pad_token is not None: + padding_mask = labels == self.tokenizer.pad_token_id + masked_indices = masked_indices & ~padding_mask + + # Mask indicating non-functional tokens, where functional tokens are [SEP], [CLS], padding, etc. + non_func_mask = ~(padding_mask | special_tokens_mask) + + inputs = tf.where(masked_indices, self.tokenizer.mask_token_id, inputs) + labels = tf.where(masked_indices, labels, -100) # We only compute loss on masked tokens + + perm_mask = [] + + for i in range(len(labels)): + # Generate permutation indices i.e. sample a random factorisation order for the sequence. This will + # determine which tokens a given token can attend to (encoded in `perm_mask`). + # Note: Length of token sequence being permuted has to be less than or equal to reused sequence length + # (see documentation for `mems`), otherwise information may leak through due to reuse. In this implementation, + # we assume that reused length is half of sequence length and permutation length is equal to reused length. + # This requires that the sequence length be even. + + # Create a linear factorisation order + # tf.range is the equivalent of torch.arange + perm_index = tf.range(labels_shape[1]) + # Split this into two halves, assuming that half the sequence is reused each time + perm_index = tf.transpose(tf.reshape(perm_index, (-1, labels_shape[1] // 2))) + # Permute the two halves such that they do not cross over + perm_index = tf.random.shuffle(perm_index) # Shuffles along the first dimension + # Flatten this out into the desired permuted factorisation order + perm_index = tf.reshape(tf.transpose(perm_index), (-1,)) + # Set the permutation indices of non-masked (non-functional) tokens to the + # smallest index (-1) so that: + # (1) They can be seen by all other positions + # (2) They cannot see masked positions, so there won't be information leak + perm_index = tf.where(~masked_indices[i] & non_func_mask[i], -1, perm_index) + # The logic for whether the i-th token can attend on the j-th token based on the factorisation order: + # 0 (can attend): If perm_index[i] > perm_index[j] or j is neither masked nor a functional token + # 1 (cannot attend): If perm_index[i] <= perm_index[j] and j is either masked or a functional token + perm_mask.append( + (tf.reshape(perm_index, (labels_shape[1], 1)) <= tf.reshape(perm_index, (1, labels_shape[1]))) + & masked_indices[i] + ) + perm_mask = tf.stack(perm_mask, axis=0) + + return tf.cast(inputs, tf.int64), tf.cast(perm_mask, tf.float32), target_mapping, tf.cast(labels, tf.int64) + + def numpy_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: + """ + The masked tokens to be predicted for a particular sequence are determined by the following algorithm: + + 0. Start from the beginning of the sequence by setting ``cur_len = 0`` (number of tokens processed so far). + 1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be + masked) + 2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be + masked + 3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length - + span_length]`` and mask tokens ``start_index:start_index + span_length`` + 4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in + the sequence to be processed), repeat from Step 1. + """ + from random import randint + + import numpy as np + + if self.tokenizer.mask_token is None: + raise ValueError( + "This tokenizer does not have a mask token which is necessary for permutation language modeling. Please add a mask token if you want to use this tokenizer." + ) + + if inputs.shape[1] % 2 != 0: + raise ValueError( + "This collator requires that sequence lengths be even to create a leakage-free perm_mask. Please see relevant comments in source code for details." + ) + + labels = np.copy(inputs) + # Creating the mask and target_mapping tensors + masked_indices = np.full(labels.shape, 0, dtype=np.bool) + target_mapping = np.zeros((labels.shape[0], labels.shape[1], labels.shape[1]), dtype=np.float32) + + for i in range(labels.shape[0]): + # Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far). + cur_len = 0 + max_len = labels.shape[1] + + while cur_len < max_len: + # Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked) + span_length = randint(1, self.max_span_length + 1) + # Reserve a context of length `context_length = span_length / plm_probability` to surround the span to be masked + context_length = int(span_length / self.plm_probability) + # Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length` + start_index = cur_len + randint(0, context_length - span_length + 1) + masked_indices[i, start_index : start_index + span_length] = 1 + # Set `cur_len = cur_len + context_length` + cur_len += context_length + + # Since we're replacing non-masked tokens with -100 in the labels tensor instead of skipping them altogether, + # the i-th predict corresponds to the i-th token. + target_mapping[i] = np.eye(labels.shape[1]) + + special_tokens_mask = np.array( + [self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()], + dtype=np.bool, + ) + masked_indices[special_tokens_mask] = 0 + if self.tokenizer._pad_token is not None: + padding_mask = labels == self.tokenizer.pad_token_id + masked_indices[padding_mask] = 0.0 + + # Mask indicating non-functional tokens, where functional tokens are [SEP], [CLS], padding, etc. + non_func_mask = ~(padding_mask | special_tokens_mask) + + inputs[masked_indices] = self.tokenizer.mask_token_id + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + perm_mask = np.zeros((labels.shape[0], labels.shape[1], labels.shape[1]), dtype=np.float32) + + for i in range(labels.shape[0]): + # Generate permutation indices i.e. sample a random factorisation order for the sequence. This will + # determine which tokens a given token can attend to (encoded in `perm_mask`). + # Note: Length of token sequence being permuted has to be less than or equal to reused sequence length + # (see documentation for `mems`), otherwise information may leak through due to reuse. In this implementation, + # we assume that reused length is half of sequence length and permutation length is equal to reused length. + # This requires that the sequence length be even. + + # Create a linear factorisation order + perm_index = np.arange(labels.shape[1]) + # Split this into two halves, assuming that half the sequence is reused each time + perm_index = perm_index.reshape((-1, labels.shape[1] // 2)).T + # Permute the two halves such that they do not cross over + np.random.shuffle(perm_index) + # Flatten this out into the desired permuted factorisation order + perm_index = perm_index.T.flatten() + # Set the permutation indices of non-masked (non-functional) tokens to the + # smallest index (-1) so that: + # (1) They can be seen by all other positions + # (2) They cannot see masked positions, so there won't be information leak + perm_index[~masked_indices[i] & non_func_mask[i]] = -1 + # The logic for whether the i-th token can attend on the j-th token based on the factorisation order: + # 0 (can attend): If perm_index[i] > perm_index[j] or j is neither masked nor a functional token + # 1 (cannot attend): If perm_index[i] <= perm_index[j] and j is either masked or a functional token + perm_mask[i] = ( + perm_index.reshape((labels.shape[1], 1)) <= perm_index.reshape((1, labels.shape[1])) + ) & masked_indices[i] + + return inputs.astype(np.int64), perm_mask, target_mapping, labels.astype(np.int64) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index ad62c51006..7241da720d 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -12,62 +12,6 @@ class PyTorchBenchmarkArguments: requires_backends(self, ["torch"]) -class DataCollator: - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class DataCollatorForLanguageModeling: - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) - - -class DataCollatorForPermutationLanguageModeling: - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) - - -class DataCollatorForSeq2Seq: - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class DataCollatorForSOP: - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class DataCollatorForTokenClassification: - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) - - -class DataCollatorForWholeWordMask: - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class DataCollatorWithPadding: - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -def default_data_collator(*args, **kwargs): - requires_backends(default_data_collator, ["torch"]) - - class GlueDataset: def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) diff --git a/tests/test_data_collator.py b/tests/test_data_collator.py index e9d363229f..632d1f6f7d 100644 --- a/tests/test_data_collator.py +++ b/tests/test_data_collator.py @@ -17,20 +17,27 @@ import shutil import tempfile import unittest -from transformers import BertTokenizer, is_torch_available, set_seed -from transformers.testing_utils import require_torch +import numpy as np + +from transformers import ( + BertTokenizer, + DataCollatorForLanguageModeling, + DataCollatorForPermutationLanguageModeling, + DataCollatorForTokenClassification, + DataCollatorWithPadding, + default_data_collator, + is_tf_available, + is_torch_available, + set_seed, +) +from transformers.testing_utils import require_tf, require_torch if is_torch_available(): import torch - from transformers import ( - DataCollatorForLanguageModeling, - DataCollatorForPermutationLanguageModeling, - DataCollatorForTokenClassification, - DataCollatorWithPadding, - default_data_collator, - ) +if is_tf_available(): + import tensorflow as tf @require_torch @@ -61,14 +68,14 @@ class DataCollatorIntegrationTest(unittest.TestCase): self.assertEqual(batch["inputs"].shape, torch.Size([8, 6])) # Features can already be tensors - features = [{"label": i, "inputs": torch.randint(10, [10])} for i in range(8)] + features = [{"label": i, "inputs": np.random.randint(0, 10, [10])} for i in range(8)] batch = default_data_collator(features) self.assertTrue(batch["labels"].equal(torch.tensor(list(range(8))))) self.assertEqual(batch["labels"].dtype, torch.long) self.assertEqual(batch["inputs"].shape, torch.Size([8, 10])) # Labels can already be tensors - features = [{"label": torch.tensor(i), "inputs": torch.randint(10, [10])} for i in range(8)] + features = [{"label": torch.tensor(i), "inputs": np.random.randint(0, 10, [10])} for i in range(8)] batch = default_data_collator(features) self.assertEqual(batch["labels"].dtype, torch.long) self.assertTrue(batch["labels"].equal(torch.tensor(list(range(8))))) @@ -238,7 +245,7 @@ class DataCollatorIntegrationTest(unittest.TestCase): self.assertEqual(batch["target_mapping"].shape, torch.Size((2, 10, 10))) self.assertEqual(batch["labels"].shape, torch.Size((2, 10))) - example = [torch.randint(5, [5])] + example = [np.random.randint(0, 5, [5])] with self.assertRaises(ValueError): # Expect error due to odd sequence length data_collator(example) @@ -290,3 +297,529 @@ class DataCollatorIntegrationTest(unittest.TestCase): self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8))) self.assertEqual(batch["labels"].shape, torch.Size((2, 8))) self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,))) + + +@require_tf +class TFDataCollatorIntegrationTest(unittest.TestCase): + def setUp(self): + super().setUp() + self.tmpdirname = tempfile.mkdtemp() + + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"] + self.vocab_file = os.path.join(self.tmpdirname, "vocab.txt") + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def test_default_with_dict(self): + features = [{"label": i, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)] + batch = default_data_collator(features, return_tensors="tf") + self.assertEqual(batch["labels"].numpy().tolist(), list(range(8))) + self.assertEqual(batch["labels"].dtype, tf.int64) + self.assertEqual(batch["inputs"].shape.as_list(), [8, 6]) + + # With label_ids + features = [{"label_ids": [0, 1, 2], "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)] + batch = default_data_collator(features, return_tensors="tf") + self.assertEqual(batch["labels"].numpy().tolist(), ([[0, 1, 2]] * 8)) + self.assertEqual(batch["labels"].dtype, tf.int64) + self.assertEqual(batch["inputs"].shape.as_list(), [8, 6]) + + # Features can already be tensors + features = [{"label": i, "inputs": np.random.randint(0, 10, [10])} for i in range(8)] + batch = default_data_collator(features, return_tensors="tf") + self.assertEqual(batch["labels"].numpy().tolist(), (list(range(8)))) + self.assertEqual(batch["labels"].dtype, tf.int64) + self.assertEqual(batch["inputs"].shape.as_list(), [8, 10]) + + # Labels can already be tensors + features = [{"label": np.array(i), "inputs": np.random.randint(0, 10, [10])} for i in range(8)] + batch = default_data_collator(features, return_tensors="tf") + self.assertEqual(batch["labels"].dtype, tf.int64) + self.assertEqual(batch["labels"].numpy().tolist(), list(range(8))) + self.assertEqual(batch["labels"].dtype, tf.int64) + self.assertEqual(batch["inputs"].shape.as_list(), [8, 10]) + + def test_default_classification_and_regression(self): + data_collator = default_data_collator + + features = [{"input_ids": [0, 1, 2, 3, 4], "label": i} for i in range(4)] + batch = data_collator(features, return_tensors="tf") + self.assertEqual(batch["labels"].dtype, tf.int64) + + features = [{"input_ids": [0, 1, 2, 3, 4], "label": float(i)} for i in range(4)] + batch = data_collator(features, return_tensors="tf") + self.assertEqual(batch["labels"].dtype, tf.float32) + + def test_default_with_no_labels(self): + features = [{"label": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)] + batch = default_data_collator(features, return_tensors="tf") + self.assertTrue("labels" not in batch) + self.assertEqual(batch["inputs"].shape.as_list(), [8, 6]) + + # With label_ids + features = [{"label_ids": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)] + batch = default_data_collator(features, return_tensors="tf") + self.assertTrue("labels" not in batch) + self.assertEqual(batch["inputs"].shape.as_list(), [8, 6]) + + def test_data_collator_with_padding(self): + tokenizer = BertTokenizer(self.vocab_file) + features = [{"input_ids": [0, 1, 2]}, {"input_ids": [0, 1, 2, 3, 4, 5]}] + + data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6]) + self.assertEqual(batch["input_ids"][0].numpy().tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3) + + data_collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=10, return_tensors="tf") + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10]) + + data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="tf") + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, [2, 8]) + + def test_data_collator_for_token_classification(self): + tokenizer = BertTokenizer(self.vocab_file) + features = [ + {"input_ids": [0, 1, 2], "labels": [0, 1, 2]}, + {"input_ids": [0, 1, 2, 3, 4, 5], "labels": [0, 1, 2, 3, 4, 5]}, + ] + + data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="tf") + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6]) + self.assertEqual(batch["input_ids"][0].numpy().tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3) + self.assertEqual(batch["labels"].shape.as_list(), [2, 6]) + self.assertEqual(batch["labels"][0].numpy().tolist(), [0, 1, 2] + [-100] * 3) + + data_collator = DataCollatorForTokenClassification( + tokenizer, padding="max_length", max_length=10, return_tensors="tf" + ) + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 10]) + + data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8, return_tensors="tf") + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 8]) + + data_collator = DataCollatorForTokenClassification(tokenizer, label_pad_token_id=-1, return_tensors="tf") + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6]) + self.assertEqual(batch["input_ids"][0].numpy().tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3) + self.assertEqual(batch["labels"].shape.as_list(), [2, 6]) + self.assertEqual(batch["labels"][0].numpy().tolist(), [0, 1, 2] + [-1] * 3) + + def _test_no_pad_and_pad(self, no_pad_features, pad_features): + tokenizer = BertTokenizer(self.vocab_file) + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf") + batch = data_collator(no_pad_features) + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 10]) + + batch = data_collator(pad_features) + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 10]) + + data_collator = DataCollatorForLanguageModeling( + tokenizer, mlm=False, pad_to_multiple_of=8, return_tensors="tf" + ) + batch = data_collator(no_pad_features) + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 16]) + + batch = data_collator(pad_features) + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 16]) + + tokenizer._pad_token = None + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf") + with self.assertRaises(ValueError): + # Expect error due to padding token missing + data_collator(pad_features) + + set_seed(42) # For reproducibility + tokenizer = BertTokenizer(self.vocab_file) + data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf") + batch = data_collator(no_pad_features) + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 10]) + + masked_tokens = batch["input_ids"] == tokenizer.mask_token_id + self.assertTrue(tf.reduce_any(masked_tokens)) + # self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist())) + + batch = data_collator(pad_features, return_tensors="tf") + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 10]) + + masked_tokens = batch["input_ids"] == tokenizer.mask_token_id + self.assertTrue(tf.reduce_any(masked_tokens)) + # self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist())) + + data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf") + batch = data_collator(no_pad_features) + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 16]) + + masked_tokens = batch["input_ids"] == tokenizer.mask_token_id + self.assertTrue(tf.reduce_any(masked_tokens)) + # self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist())) + + batch = data_collator(pad_features, return_tensors="tf") + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 16]) + + masked_tokens = batch["input_ids"] == tokenizer.mask_token_id + self.assertTrue(tf.reduce_any(masked_tokens)) + # self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist())) + + def test_data_collator_for_language_modeling(self): + no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}] + pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}] + self._test_no_pad_and_pad(no_pad_features, pad_features) + + no_pad_features = [list(range(10)), list(range(10))] + pad_features = [list(range(5)), list(range(10))] + self._test_no_pad_and_pad(no_pad_features, pad_features) + + def test_plm(self): + tokenizer = BertTokenizer(self.vocab_file) + no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}] + pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}] + + data_collator = DataCollatorForPermutationLanguageModeling(tokenizer, return_tensors="tf") + + batch = data_collator(pad_features) + self.assertIsInstance(batch, dict) + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10]) + self.assertEqual(batch["perm_mask"].shape.as_list(), [2, 10, 10]) + self.assertEqual(batch["target_mapping"].shape.as_list(), [2, 10, 10]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 10]) + + batch = data_collator(no_pad_features) + self.assertIsInstance(batch, dict) + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10]) + self.assertEqual(batch["perm_mask"].shape.as_list(), [2, 10, 10]) + self.assertEqual(batch["target_mapping"].shape.as_list(), [2, 10, 10]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 10]) + + example = [np.random.randint(0, 5, [5])] + with self.assertRaises(ValueError): + # Expect error due to odd sequence length + data_collator(example) + + def test_nsp(self): + tokenizer = BertTokenizer(self.vocab_file) + features = [ + {"input_ids": [0, 1, 2, 3, 4], "token_type_ids": [0, 1, 2, 3, 4], "next_sentence_label": i} + for i in range(2) + ] + data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf") + batch = data_collator(features) + + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 5]) + self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 5]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 5]) + self.assertEqual(batch["next_sentence_label"].shape.as_list(), [2]) + + data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf") + batch = data_collator(features) + + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8]) + self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 8]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 8]) + self.assertEqual(batch["next_sentence_label"].shape.as_list(), [2]) + + def test_sop(self): + tokenizer = BertTokenizer(self.vocab_file) + features = [ + { + "input_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]), + "token_type_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]), + "sentence_order_label": i, + } + for i in range(2) + ] + data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf") + batch = data_collator(features) + + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 5]) + self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 5]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 5]) + self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2]) + + data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf") + batch = data_collator(features) + + self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8]) + self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 8]) + self.assertEqual(batch["labels"].shape.as_list(), [2, 8]) + self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2]) + + +class NumpyDataCollatorIntegrationTest(unittest.TestCase): + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"] + self.vocab_file = os.path.join(self.tmpdirname, "vocab.txt") + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def test_default_with_dict(self): + features = [{"label": i, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)] + batch = default_data_collator(features, return_tensors="np") + self.assertEqual(batch["labels"].tolist(), list(range(8))) + self.assertEqual(batch["labels"].dtype, np.int64) + self.assertEqual(batch["inputs"].shape, (8, 6)) + + # With label_ids + features = [{"label_ids": [0, 1, 2], "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)] + batch = default_data_collator(features, return_tensors="np") + self.assertEqual(batch["labels"].tolist(), [[0, 1, 2]] * 8) + self.assertEqual(batch["labels"].dtype, np.int64) + self.assertEqual(batch["inputs"].shape, (8, 6)) + + # Features can already be tensors + features = [{"label": i, "inputs": np.random.randint(0, 10, [10])} for i in range(8)] + batch = default_data_collator(features, return_tensors="np") + self.assertEqual(batch["labels"].tolist(), list(range(8))) + self.assertEqual(batch["labels"].dtype, np.int64) + self.assertEqual(batch["inputs"].shape, (8, 10)) + + # Labels can already be tensors + features = [{"label": np.array(i), "inputs": np.random.randint(0, 10, [10])} for i in range(8)] + batch = default_data_collator(features, return_tensors="np") + self.assertEqual(batch["labels"].dtype, np.int64) + self.assertEqual(batch["labels"].tolist(), (list(range(8)))) + self.assertEqual(batch["labels"].dtype, np.int64) + self.assertEqual(batch["inputs"].shape, (8, 10)) + + def test_default_classification_and_regression(self): + data_collator = default_data_collator + + features = [{"input_ids": [0, 1, 2, 3, 4], "label": i} for i in range(4)] + batch = data_collator(features, return_tensors="np") + self.assertEqual(batch["labels"].dtype, np.int64) + + features = [{"input_ids": [0, 1, 2, 3, 4], "label": float(i)} for i in range(4)] + batch = data_collator(features, return_tensors="np") + self.assertEqual(batch["labels"].dtype, np.float32) + + def test_default_with_no_labels(self): + features = [{"label": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)] + batch = default_data_collator(features, return_tensors="np") + self.assertTrue("labels" not in batch) + self.assertEqual(batch["inputs"].shape, (8, 6)) + + # With label_ids + features = [{"label_ids": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)] + batch = default_data_collator(features, return_tensors="np") + self.assertTrue("labels" not in batch) + self.assertEqual(batch["inputs"].shape, (8, 6)) + + def test_data_collator_with_padding(self): + tokenizer = BertTokenizer(self.vocab_file) + features = [{"input_ids": [0, 1, 2]}, {"input_ids": [0, 1, 2, 3, 4, 5]}] + + data_collator = DataCollatorWithPadding(tokenizer, return_tensors="np") + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, (2, 6)) + self.assertEqual(batch["input_ids"][0].tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3) + + data_collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=10, return_tensors="np") + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, (2, 10)) + + data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="np") + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, (2, 8)) + + def test_data_collator_for_token_classification(self): + tokenizer = BertTokenizer(self.vocab_file) + features = [ + {"input_ids": [0, 1, 2], "labels": [0, 1, 2]}, + {"input_ids": [0, 1, 2, 3, 4, 5], "labels": [0, 1, 2, 3, 4, 5]}, + ] + + data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="np") + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, (2, 6)) + self.assertEqual(batch["input_ids"][0].tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3) + self.assertEqual(batch["labels"].shape, (2, 6)) + self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-100] * 3) + + data_collator = DataCollatorForTokenClassification( + tokenizer, padding="max_length", max_length=10, return_tensors="np" + ) + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, (2, 10)) + self.assertEqual(batch["labels"].shape, (2, 10)) + + data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8, return_tensors="np") + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, (2, 8)) + self.assertEqual(batch["labels"].shape, (2, 8)) + + data_collator = DataCollatorForTokenClassification(tokenizer, label_pad_token_id=-1, return_tensors="np") + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, (2, 6)) + self.assertEqual(batch["input_ids"][0].tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3) + self.assertEqual(batch["labels"].shape, (2, 6)) + self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-1] * 3) + + def _test_no_pad_and_pad(self, no_pad_features, pad_features): + tokenizer = BertTokenizer(self.vocab_file) + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="np") + batch = data_collator(no_pad_features) + self.assertEqual(batch["input_ids"].shape, (2, 10)) + self.assertEqual(batch["labels"].shape, (2, 10)) + + batch = data_collator(pad_features, return_tensors="np") + self.assertEqual(batch["input_ids"].shape, (2, 10)) + self.assertEqual(batch["labels"].shape, (2, 10)) + + data_collator = DataCollatorForLanguageModeling( + tokenizer, mlm=False, pad_to_multiple_of=8, return_tensors="np" + ) + batch = data_collator(no_pad_features) + self.assertEqual(batch["input_ids"].shape, (2, 16)) + self.assertEqual(batch["labels"].shape, (2, 16)) + + batch = data_collator(pad_features, return_tensors="np") + self.assertEqual(batch["input_ids"].shape, (2, 16)) + self.assertEqual(batch["labels"].shape, (2, 16)) + + tokenizer._pad_token = None + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="np") + with self.assertRaises(ValueError): + # Expect error due to padding token missing + data_collator(pad_features) + + set_seed(42) # For reproducibility + tokenizer = BertTokenizer(self.vocab_file) + data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="np") + batch = data_collator(no_pad_features) + self.assertEqual(batch["input_ids"].shape, (2, 10)) + self.assertEqual(batch["labels"].shape, (2, 10)) + + masked_tokens = batch["input_ids"] == tokenizer.mask_token_id + self.assertTrue(np.any(masked_tokens)) + # self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist())) + + batch = data_collator(pad_features) + self.assertEqual(batch["input_ids"].shape, (2, 10)) + self.assertEqual(batch["labels"].shape, (2, 10)) + + masked_tokens = batch["input_ids"] == tokenizer.mask_token_id + self.assertTrue(np.any(masked_tokens)) + # self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist())) + + data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="np") + batch = data_collator(no_pad_features) + self.assertEqual(batch["input_ids"].shape, (2, 16)) + self.assertEqual(batch["labels"].shape, (2, 16)) + + masked_tokens = batch["input_ids"] == tokenizer.mask_token_id + self.assertTrue(np.any(masked_tokens)) + # self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist())) + + batch = data_collator(pad_features) + self.assertEqual(batch["input_ids"].shape, (2, 16)) + self.assertEqual(batch["labels"].shape, (2, 16)) + + masked_tokens = batch["input_ids"] == tokenizer.mask_token_id + self.assertTrue(np.any(masked_tokens)) + # self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist())) + + def test_data_collator_for_language_modeling(self): + no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}] + pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}] + self._test_no_pad_and_pad(no_pad_features, pad_features) + + no_pad_features = [list(range(10)), list(range(10))] + pad_features = [list(range(5)), list(range(10))] + self._test_no_pad_and_pad(no_pad_features, pad_features) + + def test_plm(self): + tokenizer = BertTokenizer(self.vocab_file) + no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}] + pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}] + + data_collator = DataCollatorForPermutationLanguageModeling(tokenizer, return_tensors="np") + + batch = data_collator(pad_features) + self.assertIsInstance(batch, dict) + self.assertEqual(batch["input_ids"].shape, (2, 10)) + self.assertEqual(batch["perm_mask"].shape, (2, 10, 10)) + self.assertEqual(batch["target_mapping"].shape, (2, 10, 10)) + self.assertEqual(batch["labels"].shape, (2, 10)) + + batch = data_collator(no_pad_features) + self.assertIsInstance(batch, dict) + self.assertEqual(batch["input_ids"].shape, (2, 10)) + self.assertEqual(batch["perm_mask"].shape, (2, 10, 10)) + self.assertEqual(batch["target_mapping"].shape, (2, 10, 10)) + self.assertEqual(batch["labels"].shape, (2, 10)) + + example = [np.random.randint(0, 5, [5])] + with self.assertRaises(ValueError): + # Expect error due to odd sequence length + data_collator(example) + + def test_nsp(self): + tokenizer = BertTokenizer(self.vocab_file) + features = [ + {"input_ids": [0, 1, 2, 3, 4], "token_type_ids": [0, 1, 2, 3, 4], "next_sentence_label": i} + for i in range(2) + ] + data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="np") + batch = data_collator(features) + + self.assertEqual(batch["input_ids"].shape, (2, 5)) + self.assertEqual(batch["token_type_ids"].shape, (2, 5)) + self.assertEqual(batch["labels"].shape, (2, 5)) + self.assertEqual(batch["next_sentence_label"].shape, (2,)) + + data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="np") + batch = data_collator(features) + + self.assertEqual(batch["input_ids"].shape, (2, 8)) + self.assertEqual(batch["token_type_ids"].shape, (2, 8)) + self.assertEqual(batch["labels"].shape, (2, 8)) + self.assertEqual(batch["next_sentence_label"].shape, (2,)) + + def test_sop(self): + tokenizer = BertTokenizer(self.vocab_file) + features = [ + { + "input_ids": np.array([0, 1, 2, 3, 4]), + "token_type_ids": np.array([0, 1, 2, 3, 4]), + "sentence_order_label": i, + } + for i in range(2) + ] + data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="np") + batch = data_collator(features) + + self.assertEqual(batch["input_ids"].shape, (2, 5)) + self.assertEqual(batch["token_type_ids"].shape, (2, 5)) + self.assertEqual(batch["labels"].shape, (2, 5)) + self.assertEqual(batch["sentence_order_label"].shape, (2,)) + + data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="np") + batch = data_collator(features) + + self.assertEqual(batch["input_ids"].shape, (2, 8)) + self.assertEqual(batch["token_type_ids"].shape, (2, 8)) + self.assertEqual(batch["labels"].shape, (2, 8)) + self.assertEqual(batch["sentence_order_label"].shape, (2,))