From 120176ea292d7e3c4df52021f92e9fb62222a18f Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Tue, 8 Sep 2020 18:51:28 +0200
Subject: [PATCH] [Longformer] Fix longformer documentation (#7016)

* fix longformer docstring example (pass global attention via global_attention_mask)

* allow position ids to not be initialized
---
 src/transformers/modeling_longformer.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py
index a7d98443f9..f1979225e2 100755
--- a/src/transformers/modeling_longformer.py
+++ b/src/transformers/modeling_longformer.py
@@ -795,6 +795,7 @@ class LongformerPreTrainedModel(PreTrainedModel):
 
     config_class = LongformerConfig
     base_model_prefix = "longformer"
+    authorized_missing_keys = [r"position_ids"]
 
     def _init_weights(self, module):
         """ Initialize the weights """
@@ -1019,11 +1020,13 @@ class LongformerModel(LongformerPreTrainedModel):
 
             >>> # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
             >>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
-            >>> attention_mask[:, [1, 4, 21,]] = 2  # Set global attention based on the task. For example,
+            >>> global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize global attention as deactivated (0) for all tokens
+            >>> global_attention_mask[:, [1, 4, 21,]] = 1  # Set global attention to random tokens for the sake of this example
+            ...                                     # Usually, set global attention based on the task. For example,
             ...                                     # classification: the <s> token
             ...                                     # QA: question tokens
             ...                                     # LM: potentially on the beginning of sentences and paragraphs
-            >>> outputs = model(input_ids, attention_mask=attention_mask)
+            >>> outputs = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
             >>> sequence_output = outputs.last_hidden_state
             >>> pooled_output = outputs.pooler_output
         """