Feed forward chunking others (#6365)

* Feed forward chunking for Distilbert & Albert * Added ff chunking for many other models * Change model signature * Added chunking for XLM * Cleaned up by removing some variables. * remove test_chunking flag Co-authored-by: patrickvonplaten <patrick.v.platen@gmail.com>
2020-08-19 05:31:10 -07:00
parent fe0b85e77a
commit 2a7402cbd3
13 changed files with 78 additions and 31 deletions
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1519,7 +1519,7 @@ def prune_layer(


 def apply_chunking_to_forward(
-    chunk_size: int, chunk_dim: int, forward_fn: Callable[..., torch.Tensor], *input_tensors
+    forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors
 ) -> torch.Tensor:
    """
    This function chunks the :obj:`input_tensors` into smaller input tensor parts of size :obj:`chunk_size` over the
@@ -1529,12 +1529,12 @@ def apply_chunking_to_forward(
    directly applying :obj:`forward_fn` to :obj:`input_tensors`.

    Args:
+        forward_fn (:obj:`Callable[..., torch.Tensor]`):
+            The forward function of the model.
        chunk_size (:obj:`int`):
            The chunk size of a chunked tensor: :obj:`num_chunks = len(input_tensors[0]) / chunk_size`.
        chunk_dim (:obj:`int`):
            The dimension over which the :obj:`input_tensors` should be chunked.
-        forward_fn (:obj:`Callable[..., torch.Tensor]`):
-            The forward function of the model.
        input_tensors (:obj:`Tuple[torch.Tensor]`):
            The input tensors of ``forward_fn`` which will be chunked.
    Returns:
@@ -1550,7 +1550,7 @@ def apply_chunking_to_forward(

        # implement a chunked forward function
        def forward(self, hidden_states):
-            return apply_chunking_to_forward(self.chunk_size_lm_head, self.seq_len_dim, self.forward_chunk, hidden_states)
+            return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states)
    """

    assert len(input_tensors) > 0, "{} has to be a tuple/list of tensors".format(input_tensors)