[Longformer] Docs and clean API (#4464)

* add longformer docs
* improve docs
2020-05-19 21:52:36 +02:00
parent aa925a52fa
commit 48c3a70b4e
5 changed files with 85 additions and 28 deletions
--- a/src/transformers/configuration_longformer.py
+++ b/src/transformers/configuration_longformer.py
@@ -38,6 +38,12 @@ class LongformerConfig(RobertaConfig):
        The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`.
        It reuses the same defaults. Please check the parent class for more information.

+        Args:
+            attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
+                Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
+                To specify a different window size for each layer, use a :obj:`List[int]` where
+                ``len(attention_window) == num_hidden_layers``.
+
        Example::

            from transformers import LongformerConfig, LongformerModel
@@ -58,18 +64,6 @@ class LongformerConfig(RobertaConfig):
    pretrained_config_archive_map = LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP
    model_type = "longformer"

-    def __init__(self, attention_window: Union[List[int], int] = 512, attention_mode: str = "longformer", **kwargs):
-        """
-        Args:
-            attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
-                Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
-                To specify a different window size for each layer, use a :obj:`List[int]` where
-                `len(attention_window) == num_hidden_layers`.
-            attention_mode (:obj:`str`, optional, possible values ['longformer', 'bert'], defaults to 'longformer'):
-                Type of selfattention. Use 'longformer' for :obj:`LongformerSelfAttention` or 'bert' for
-                standard BERT full n^2 self attention using :obj:`modeling_bert.BertSelfAttention`. Note that full n^2
-                selfattention is supported just for comparison, but it will OOM for long sequences.
-        """
+    def __init__(self, attention_window: Union[List[int], int] = 512, **kwargs):
        super().__init__(**kwargs)
        self.attention_window = attention_window
-        self.attention_mode = attention_mode
--- a/src/transformers/modeling_longformer.py
+++ b/src/transformers/modeling_longformer.py
@@ -476,25 +476,18 @@ class LongformerModel(RobertaModel):
        super().__init__(config)

        if isinstance(config.attention_window, int):
-            assert config.attention_window % 2 == 0, "`attention_window` has to be an even value"
-            assert config.attention_window > 0, "`attention_window` has to be positive"
+            assert config.attention_window % 2 == 0, "`config.attention_window` has to be an even value"
+            assert config.attention_window > 0, "`config.attention_window` has to be positive"
            config.attention_window = [config.attention_window] * config.num_hidden_layers  # one value per layer
        else:
            assert len(config.attention_window) == config.num_hidden_layers, (
-                "`len(attention_window)` should equal `num_hidden_layers`. "
+                "`len(config.attention_window)` should equal `config.num_hidden_layers`. "
                f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}"
            )

-        if config.attention_mode == "bert":
-            pass  # do nothing, use the default `modeling_bert.BertSelfAttention` (will OOM for long sequences)
-        elif config.attention_mode == "longformer":
-            for i, layer in enumerate(self.encoder.layer):
-                # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
-                layer.attention.self = LongformerSelfAttention(config, layer_id=i)
-        else:
-            raise ValueError(
-                f'Expected values of `attention_mode` are "longformer" or "bert", given {config.attention_mode}'
-            )
+        for i, layer in enumerate(self.encoder.layer):
+            # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
+            layer.attention.self = LongformerSelfAttention(config, layer_id=i)

        self.init_weights()