[Longformer] Docs and clean API (#4464)

* add longformer docs

* improve docs
This commit is contained in:
Patrick von Platen
2020-05-19 21:52:36 +02:00
committed by GitHub
parent aa925a52fa
commit 48c3a70b4e
5 changed files with 85 additions and 28 deletions

View File

@@ -38,6 +38,12 @@ class LongformerConfig(RobertaConfig):
The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`.
It reuses the same defaults. Please check the parent class for more information.
Args:
attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
To specify a different window size for each layer, use a :obj:`List[int]` where
``len(attention_window) == num_hidden_layers``.
Example::
from transformers import LongformerConfig, LongformerModel
@@ -58,18 +64,6 @@ class LongformerConfig(RobertaConfig):
pretrained_config_archive_map = LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "longformer"
def __init__(self, attention_window: Union[List[int], int] = 512, attention_mode: str = "longformer", **kwargs):
"""
Args:
attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
To specify a different window size for each layer, use a :obj:`List[int]` where
`len(attention_window) == num_hidden_layers`.
attention_mode (:obj:`str`, optional, possible values ['longformer', 'bert'], defaults to 'longformer'):
Type of self-attention. Use 'longformer' for :obj:`LongformerSelfAttention` or 'bert' for
standard BERT full n^2 self-attention using :obj:`modeling_bert.BertSelfAttention`. Note that full n^2
self-attention is supported just for comparison, but it will OOM for long sequences.
"""
def __init__(self, attention_window: Union[List[int], int] = 512, **kwargs):
super().__init__(**kwargs)
self.attention_window = attention_window
self.attention_mode = attention_mode

View File

@@ -476,25 +476,18 @@ class LongformerModel(RobertaModel):
super().__init__(config)
if isinstance(config.attention_window, int):
assert config.attention_window % 2 == 0, "`attention_window` has to be an even value"
assert config.attention_window > 0, "`attention_window` has to be positive"
assert config.attention_window % 2 == 0, "`config.attention_window` has to be an even value"
assert config.attention_window > 0, "`config.attention_window` has to be positive"
config.attention_window = [config.attention_window] * config.num_hidden_layers # one value per layer
else:
assert len(config.attention_window) == config.num_hidden_layers, (
"`len(attention_window)` should equal `num_hidden_layers`. "
"`len(config.attention_window)` should equal `config.num_hidden_layers`. "
f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}"
)
if config.attention_mode == "bert":
pass # do nothing, use the default `modeling_bert.BertSelfAttention` (will OOM for long sequences)
elif config.attention_mode == "longformer":
for i, layer in enumerate(self.encoder.layer):
# replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
layer.attention.self = LongformerSelfAttention(config, layer_id=i)
else:
raise ValueError(
f'Expected values of `attention_mode` are "longformer" or "bert", given {config.attention_mode}'
)
for i, layer in enumerate(self.encoder.layer):
# replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
layer.attention.self = LongformerSelfAttention(config, layer_id=i)
self.init_weights()