diff --git a/setup.py b/setup.py index 70197c209f..bddf49e464 100644 --- a/setup.py +++ b/setup.py @@ -129,7 +129,7 @@ _deps = [ "keras-nlp>=0.3.1", "librosa", "nltk", - "natten>=0.14.5", + "natten>=0.14.6", "numpy>=1.17", "onnxconverter-common", "onnxruntime-tools>=1.4.2", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index aa23c1ad06..c0cacd7596 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -35,7 +35,7 @@ deps = { "keras-nlp": "keras-nlp>=0.3.1", "librosa": "librosa", "nltk": "nltk", - "natten": "natten>=0.14.5", + "natten": "natten>=0.14.6", "numpy": "numpy>=1.17", "onnxconverter-common": "onnxconverter-common", "onnxruntime-tools": "onnxruntime-tools>=1.4.2", diff --git a/src/transformers/models/dinat/modeling_dinat.py b/src/transformers/models/dinat/modeling_dinat.py index efeb68846f..2cacab8ac9 100644 --- a/src/transformers/models/dinat/modeling_dinat.py +++ b/src/transformers/models/dinat/modeling_dinat.py @@ -356,7 +356,7 @@ class NeighborhoodAttention(nn.Module): # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs) - context_layer = natten2dav(attention_probs, value_layer, self.dilation) + context_layer = natten2dav(attention_probs, value_layer, self.kernel_size, self.dilation) context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(new_context_layer_shape) diff --git a/src/transformers/models/nat/modeling_nat.py b/src/transformers/models/nat/modeling_nat.py index 3a93b81e4b..dfe801d199 100644 --- a/src/transformers/models/nat/modeling_nat.py +++ b/src/transformers/models/nat/modeling_nat.py @@ -348,7 +348,7 @@ class NeighborhoodAttention(nn.Module): # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs) - context_layer = natten2dav(attention_probs, value_layer, 1) + context_layer = natten2dav(attention_probs, value_layer, self.kernel_size, 1) context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(new_context_layer_shape)