New BartModel (#2745)

* Results same as fairseq
* Wrote a ton of tests
* Struggled with API signatures
* Added some docs
This commit is contained in:
Sam Shleifer
2020-02-20 18:11:13 -05:00
committed by GitHub
parent 564fd75d65
commit 53ce3854a1
20 changed files with 1766 additions and 59 deletions

View File

@@ -303,7 +303,7 @@ class TransformerDecoderLayer(nn.Module):
self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
self.drop = nn.Dropout(dropout)
mask = self._get_attn_subsequent_mask(MAX_SIZE)
- # Register self.mask as a buffer in TransformerDecoderLayer, so
+ # Register self.mask as a saved_state in TransformerDecoderLayer, so
# it gets TransformerDecoderLayer's cuda behavior automatically.
self.register_buffer("mask", mask)