Make Trainer evaluation handle dynamic seq_length (#8336)

* Make Trainer evaluation handle dynamic seq_length * Document behavior. * Fix test * Better fix * Fixes for realsies this time * Address review comments * Without forgetting to save...
2020-11-05 15:13:51 -05:00
parent 27b402cab0
commit 04e442d575
3 changed files with 135 additions and 13 deletions
--- a/src/transformers/trainer_pt_utils.py
+++ b/src/transformers/trainer_pt_utils.py
@@ -42,17 +42,50 @@ else:
 logger = logging.get_logger(__name__)


-def nested_concat(tensors, new_tensors, dim=0):
-    "Concat the `new_tensors` to `tensors` on `dim`. Works for tensors or nested list/tuples of tensors."
+def torch_pad_and_concatenate(tensor1, tensor2, padding_index=-100):
+    """Concatenates `tensor1` and `tensor2` on first axis, applying padding on the second if necessary."""
+    if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]:
+        return torch.cat((tensor1, tensor2), dim=0)
+
+    # Let's figure out the new shape
+    new_shape = (tensor1.shape[0] + tensor2.shape[0], max(tensor1.shape[1], tensor2.shape[1])) + tensor1.shape[2:]
+
+    # Now let's fill the result tensor
+    result = tensor1.new_full(new_shape, padding_index)
+    result[: tensor1.shape[0], : tensor1.shape[1]] = tensor1
+    result[tensor1.shape[0] :, : tensor2.shape[1]] = tensor2
+    return result
+
+
+def numpy_pad_and_concatenate(array1, array2, padding_index=-100):
+    """Concatenates `array1` and `array2` on first axis, applying padding on the second if necessary."""
+    if len(array1.shape) == 1 or array1.shape[1] == array2.shape[1]:
+        return np.concatenate((array1, array2), dim=0)
+
+    # Let's figure out the new shape
+    new_shape = (array1.shape[0] + array2.shape[0], max(array1.shape[1], array2.shape[1])) + array1.shape[2:]
+
+    # Now let's fill the result tensor
+    result = np.full_like(array1, padding_index, shape=new_shape)
+    result[: array1.shape[0], : array1.shape[1]] = array1
+    result[array1.shape[0] :, : array2.shape[1]] = array2
+    return result
+
+
+def nested_concat(tensors, new_tensors, padding_index=-100):
+    """
+    Concat the `new_tensors` to `tensors` on the first dim and pad them on the second if needed. Works for tensors or
+    nested list/tuples of tensors.
+    """
    assert type(tensors) == type(
        new_tensors
    ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
    if isinstance(tensors, (list, tuple)):
-        return type(tensors)(nested_concat(t, n, dim) for t, n in zip(tensors, new_tensors))
+        return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
    elif isinstance(tensors, torch.Tensor):
-        return torch.cat((tensors, new_tensors), dim=dim)
+        return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
    elif isinstance(tensors, np.ndarray):
-        return np.concatenate((tensors, new_tensors), axis=dim)
+        return numpy_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
    else:
        raise TypeError(f"Unsupported type for concatenation: got {type(tensors)}")

@@ -190,11 +223,21 @@ def get_tpu_sampler(dataset: torch.utils.data.dataset.Dataset):
    return DistributedSampler(dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal())


-def nested_new_like(arrays, num_samples):
+def nested_new_like(arrays, num_samples, padding_index=-100):
    """ Create the same nested structure as `arrays` with a first dimension always at `num_samples`."""
    if isinstance(arrays, (list, tuple)):
        return type(arrays)(nested_new_like(x, num_samples) for x in arrays)
-    return np.zeros((num_samples, *arrays.shape[1:]), dtype=arrays.dtype)
+    return np.full_like(arrays, padding_index, shape=(num_samples, *arrays.shape[1:]))
+
+
+def nested_expand_like(arrays, new_seq_length, padding_index=-100):
+    """ Expand the `arrays` so that the second dimension grows to `new_seq_length`. Uses `padding_index` for padding."""
+    if isinstance(arrays, (list, tuple)):
+        return type(arrays)(nested_expand_like(x, new_seq_length, padding_index=padding_index) for x in arrays)
+
+    result = np.full_like(arrays, padding_index, shape=(arrays.shape[0], new_seq_length) + arrays.shape[2:])
+    result[:, : arrays.shape[1]] = arrays
+    return result


 def nested_truncate(tensors, limit):
@@ -204,6 +247,13 @@ def nested_truncate(tensors, limit):
    return tensors[:limit]


+def _get_first_shape(arrays):
+    """Return the shape of the first array found in the nested struct `arrays`."""
+    if isinstance(arrays, (list, tuple)):
+        return _get_first_shape(arrays[0])
+    return arrays.shape
+
+
 class DistributedTensorGatherer:
    """
    A class responsible for properly gathering tensors (or nested list/tuple of tensors) on the CPU by chunks.
@@ -247,9 +297,11 @@ class DistributedTensorGatherer:
        make_multiple_of (:obj:`int`, `optional`):
            If passed, the class assumes the datasets passed to each process are made to be a multiple of this argument
            (by adding samples).
+        padding_index (:obj:`int`, `optional`, defaults to -100):
+            The padding index to use if the arrays don't all have the same sequence length.
    """

-    def __init__(self, world_size, num_samples, make_multiple_of=None):
+    def __init__(self, world_size, num_samples, make_multiple_of=None, padding_index=-100):
        self.world_size = world_size
        self.num_samples = num_samples
        total_size = world_size if make_multiple_of is None else world_size * make_multiple_of
@@ -257,6 +309,7 @@ class DistributedTensorGatherer:
        self.process_length = self.total_samples // world_size
        self._storage = None
        self._offsets = None
+        self.padding_index = padding_index

    def add_arrays(self, arrays):
        """
@@ -266,8 +319,14 @@ class DistributedTensorGatherer:
        if arrays is None:
            return
        if self._storage is None:
-            self._storage = nested_new_like(arrays, self.total_samples)
+            self._storage = nested_new_like(arrays, self.total_samples, padding_index=self.padding_index)
            self._offsets = list(range(0, self.total_samples, self.process_length))
+        else:
+            storage_shape = _get_first_shape(self._storage)
+            arrays_shape = _get_first_shape(arrays)
+            if len(storage_shape) > 1 and storage_shape[1] < arrays_shape[1]:
+                # If we get new arrays that are too big too fit, we expand the shape fo the storage
+                self._storage = nested_expand_like(self._storage, arrays_shape[1], padding_index=self.padding_index)
        slice_len = self._nested_set_tensors(self._storage, arrays)
        for i in range(self.world_size):
            self._offsets[i] += slice_len
@@ -283,7 +342,12 @@ class DistributedTensorGatherer:

        slice_len = arrays.shape[0] // self.world_size
        for i in range(self.world_size):
-            storage[self._offsets[i] : self._offsets[i] + slice_len] = arrays[i * slice_len : (i + 1) * slice_len]
+            if len(arrays.shape) == 1:
+                storage[self._offsets[i] : self._offsets[i] + slice_len] = arrays[i * slice_len : (i + 1) * slice_len]
+            else:
+                storage[self._offsets[i] : self._offsets[i] + slice_len, : arrays.shape[1]] = arrays[
+                    i * slice_len : (i + 1) * slice_len
+                ]
        return slice_len

    def finalize(self):