[Longformer] Docs and clean API (#4464)
* add longformer docs
* improve docs
This commit is contained in:
committed by
GitHub
parent
aa925a52fa
commit
48c3a70b4e
@@ -38,6 +38,12 @@ class LongformerConfig(RobertaConfig):
|
||||
The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`.
|
||||
It reuses the same defaults. Please check the parent class for more information.
|
||||
|
||||
Args:
|
||||
attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
|
||||
Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
|
||||
To specify a different window size for each layer, use a :obj:`List[int]` where
|
||||
``len(attention_window) == num_hidden_layers``.
|
||||
|
||||
Example::
|
||||
|
||||
from transformers import LongformerConfig, LongformerModel
|
||||
@@ -58,18 +64,6 @@ class LongformerConfig(RobertaConfig):
|
||||
pretrained_config_archive_map = LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
model_type = "longformer"
|
||||
|
||||
def __init__(self, attention_window: Union[List[int], int] = 512, attention_mode: str = "longformer", **kwargs):
|
||||
"""
|
||||
Args:
|
||||
attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
|
||||
Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
|
||||
To specify a different window size for each layer, use a :obj:`List[int]` where
|
||||
``len(attention_window) == num_hidden_layers``.
|
||||
attention_mode (:obj:`str`, optional, possible values ['longformer', 'bert'], defaults to 'longformer'):
|
||||
Type of self-attention. Use 'longformer' for :obj:`LongformerSelfAttention` or 'bert' for
|
||||
standard BERT full n^2 self-attention using :obj:`modeling_bert.BertSelfAttention`. Note that full n^2
|
||||
self-attention is supported just for comparison, but it will OOM for long sequences.
|
||||
"""
|
||||
def __init__(self, attention_window: Union[List[int], int] = 512, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.attention_window = attention_window
|
||||
self.attention_mode = attention_mode
|
||||
|
||||
@@ -476,25 +476,18 @@ class LongformerModel(RobertaModel):
|
||||
super().__init__(config)
|
||||
|
||||
if isinstance(config.attention_window, int):
|
||||
assert config.attention_window % 2 == 0, "`attention_window` has to be an even value"
|
||||
assert config.attention_window > 0, "`attention_window` has to be positive"
|
||||
assert config.attention_window % 2 == 0, "`config.attention_window` has to be an even value"
|
||||
assert config.attention_window > 0, "`config.attention_window` has to be positive"
|
||||
config.attention_window = [config.attention_window] * config.num_hidden_layers # one value per layer
|
||||
else:
|
||||
assert len(config.attention_window) == config.num_hidden_layers, (
|
||||
"`len(attention_window)` should equal `num_hidden_layers`. "
|
||||
"`len(config.attention_window)` should equal `config.num_hidden_layers`. "
|
||||
f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}"
|
||||
)
|
||||
|
||||
if config.attention_mode == "bert":
|
||||
pass # do nothing, use the default `modeling_bert.BertSelfAttention` (will OOM for long sequences)
|
||||
elif config.attention_mode == "longformer":
|
||||
for i, layer in enumerate(self.encoder.layer):
|
||||
# replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
|
||||
layer.attention.self = LongformerSelfAttention(config, layer_id=i)
|
||||
else:
|
||||
raise ValueError(
|
||||
f'Expected values of `attention_mode` are "longformer" or "bert", given {config.attention_mode}'
|
||||
)
|
||||
for i, layer in enumerate(self.encoder.layer):
|
||||
# replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
|
||||
layer.attention.self = LongformerSelfAttention(config, layer_id=i)
|
||||
|
||||
self.init_weights()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user