From 3028b20a71d566f67d4ef414e4aa32cb3eb53d64 Mon Sep 17 00:00:00 2001
From: Ali Hassani <68103095+alihassanijr@users.noreply.github.com>
Date: Fri, 17 Mar 2023 11:07:55 -0400
Subject: [PATCH] Fix natten (#22229)

* Add kernel size to NATTEN's QK arguments.

The new NATTEN 0.14.5 supports PyTorch 2.0, but also adds an additional
argument to the QK operation to allow optional RPBs.

This ends up failing NATTEN tests.

This commit adds NATTEN back to circleci and adds the arguments to get
it working again.

* Force NATTEN >= 0.14.5
---
 .circleci/create_circleci_config.py             | 3 +--
 setup.py                                        | 2 +-
 src/transformers/dependency_versions_table.py   | 2 +-
 src/transformers/models/dinat/modeling_dinat.py | 2 +-
 src/transformers/models/nat/modeling_nat.py     | 2 +-
 5 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py
index 6a45f94a65..0b26762b08 100644
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@@ -374,8 +374,7 @@ exotic_models_job = CircleCIJob(
         "pip install 'git+https://github.com/facebookresearch/detectron2.git'",
         "sudo apt install tesseract-ocr",
         "pip install pytesseract",
-        # wait until natten is ready for torch 2.0.0
-        # "pip install natten",
+        "pip install natten",
     ],
     tests_to_run=[
         "tests/models/*layoutlmv*",
diff --git a/setup.py b/setup.py
index 943bb196b5..c28387a3d4 100644
--- a/setup.py
+++ b/setup.py
@@ -129,7 +129,7 @@ _deps = [
     "keras-nlp>=0.3.1",
     "librosa",
     "nltk",
-    "natten>=0.14.4",
+    "natten>=0.14.5",
     "numpy>=1.17",
     "onnxconverter-common",
     "onnxruntime-tools>=1.4.2",
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index 79f9118ae8..aa638a6a9f 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -35,7 +35,7 @@ deps = {
     "keras-nlp": "keras-nlp>=0.3.1",
     "librosa": "librosa",
     "nltk": "nltk",
-    "natten": "natten>=0.14.4",
+    "natten": "natten>=0.14.5",
     "numpy": "numpy>=1.17",
     "onnxconverter-common": "onnxconverter-common",
     "onnxruntime-tools": "onnxruntime-tools>=1.4.2",
diff --git a/src/transformers/models/dinat/modeling_dinat.py b/src/transformers/models/dinat/modeling_dinat.py
index 95191d52b5..efeb68846f 100644
--- a/src/transformers/models/dinat/modeling_dinat.py
+++ b/src/transformers/models/dinat/modeling_dinat.py
@@ -347,7 +347,7 @@ class NeighborhoodAttention(nn.Module):
         query_layer = query_layer / math.sqrt(self.attention_head_size)
 
         # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
-        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.dilation)
+        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, self.dilation)
 
         # Normalize the attention scores to probabilities.
         attention_probs = nn.functional.softmax(attention_scores, dim=-1)
diff --git a/src/transformers/models/nat/modeling_nat.py b/src/transformers/models/nat/modeling_nat.py
index 4b34fe730c..3a93b81e4b 100644
--- a/src/transformers/models/nat/modeling_nat.py
+++ b/src/transformers/models/nat/modeling_nat.py
@@ -339,7 +339,7 @@ class NeighborhoodAttention(nn.Module):
         query_layer = query_layer / math.sqrt(self.attention_head_size)
 
         # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
-        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, 1)
+        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, 1)
 
         # Normalize the attention scores to probabilities.
         attention_probs = nn.functional.softmax(attention_scores, dim=-1)