Reformer (#3351)
* first copy & past commit from Bert and morgans LSH code * add easy way to compare to trax original code * translate most of function * make trax lsh self attention deterministic with numpy seed + copy paste code * add same config * add same config * make layer init work * implemented hash_vectors function for lsh attention * continue reformer translation * hf LSHSelfAttentionLayer gives same output as trax layer * refactor code * refactor code * refactor code * refactor * refactor + add reformer config * delete bogus file * split reformer attention layer into two layers * save intermediate step * save intermediate step * make test work * add complete reformer block layer * finish reformer layer * implement causal and self mask * clean reformer test and refactor code * fix merge conflicts * fix merge conflicts * update init * fix device for GPU * fix chunk length init for tests * include morgans optimization * improve memory a bit * improve comment * factorize num_buckets * better testing parameters * make whole model work * make lm model work * add t5 copy paste tokenizer * add chunking feed forward * clean config * add improved assert statements * make tokenizer work * improve test * correct typo * extend config * add complexer test * add new axial position embeddings * add local block attention layer * clean tests * refactor * better testing * save intermediate progress * clean test file * make shorter input length work for model * allow variable input length * refactor * make forward pass for pretrained model work * add generation possibility * finish dropout and init * make style * refactor * add first version of RevNet Layers * make forward pass work and add convert file * make uploaded model forward pass work * make uploaded model forward pass work * refactor code * add namedtuples and cache buckets * correct head masks * refactor * made reformer more flexible * make style * remove set max length * add attention masks * fix up tests * fix lsh attention mask * make random seed optional for the moment * improve memory in reformer * add tests * make style * make sure masks work correctly * detach gradients * save intermediate * correct backprob through gather * make style * change back num hashes * rename to labels * fix rotation shape * fix detach * update * fix trainer * fix backward dropout * make reformer more flexible * fix conflict * fix * fix * add tests for fixed seed in reformer layer * fix trainer typo * fix typo in activations * add fp16 tests * add fp16 training * support fp16 * correct gradient bug in reformer * add fast gelu * re-add dropout for embedding dropout * better naming * better naming * renaming * finalize test branch * finalize tests * add more tests * finish tests * fix * fix type trainer * fix fp16 tests * fix tests * fix tests * fix tests * fix issue with dropout * fix dropout seeds * correct random seed on gpu * finalize random seed for dropout * finalize random seed for dropout * remove duplicate line * correct half precision bug * make style * refactor * refactor * docstring * remove sinusoidal position encodings for reformer * move chunking to modeling_utils * make style * clean config * make style * fix tests * fix auto tests * pretrained models * fix docstring * update conversion file * Update pretrained_models.rst * fix rst * fix rst * update copyright * fix test path * fix test path * fix small issue in test * include reformer in generation tests * add docs for axial position encoding * finish docs * Update convert_reformer_trax_checkpoint_to_pytorch.py * remove isort * include sams comments * remove wrong comment in utils * correct typos * fix typo * Update reformer.rst * applied morgans optimization * make style * make gpu compatible * remove bogus file * big test refactor * add example for chunking * fix typo * add to README
This commit is contained in:
committed by
GitHub
parent
877fc56410
commit
dca34695d0
@@ -22,6 +22,8 @@ class TestActivations(unittest.TestCase):
|
||||
get_activation("swish")
|
||||
get_activation("relu")
|
||||
get_activation("tanh")
|
||||
get_activation("gelu_new")
|
||||
get_activation("gelu_fast")
|
||||
with self.assertRaises(KeyError):
|
||||
get_activation("bogus")
|
||||
with self.assertRaises(KeyError):
|
||||
|
||||
@@ -125,6 +125,9 @@ class ModelTesterMixin:
|
||||
encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
|
||||
decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length)
|
||||
encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
|
||||
chunk_length = getattr(self.model_tester, "chunk_length", None)
|
||||
if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
|
||||
encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
config.output_attentions = True
|
||||
@@ -138,10 +141,17 @@ class ModelTesterMixin:
|
||||
self.assertEqual(model.config.output_attentions, True)
|
||||
self.assertEqual(model.config.output_hidden_states, False)
|
||||
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
|
||||
self.assertListEqual(
|
||||
list(attentions[0].shape[-3:]),
|
||||
[self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
|
||||
)
|
||||
|
||||
if chunk_length is not None:
|
||||
self.assertListEqual(
|
||||
list(attentions[0].shape[-4:]),
|
||||
[self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
|
||||
)
|
||||
else:
|
||||
self.assertListEqual(
|
||||
list(attentions[0].shape[-3:]),
|
||||
[self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
|
||||
)
|
||||
out_len = len(outputs)
|
||||
|
||||
if self.is_encoder_decoder:
|
||||
@@ -175,10 +185,16 @@ class ModelTesterMixin:
|
||||
|
||||
self_attentions = outputs[-1]
|
||||
self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
|
||||
self.assertListEqual(
|
||||
list(self_attentions[0].shape[-3:]),
|
||||
[self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
|
||||
)
|
||||
if chunk_length is not None:
|
||||
self.assertListEqual(
|
||||
list(self_attentions[0].shape[-4:]),
|
||||
[self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
|
||||
)
|
||||
else:
|
||||
self.assertListEqual(
|
||||
list(self_attentions[0].shape[-3:]),
|
||||
[self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
|
||||
)
|
||||
|
||||
def test_torchscript(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
@@ -465,14 +481,16 @@ class ModelTesterMixin:
|
||||
self.assertEqual(model.config.output_attentions, False)
|
||||
self.assertEqual(model.config.output_hidden_states, True)
|
||||
self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
|
||||
|
||||
if hasattr(self.model_tester, "encoder_seq_length"):
|
||||
seq_length = self.model_tester.encoder_seq_length
|
||||
if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
|
||||
seq_length = seq_length * self.model_tester.chunk_length
|
||||
else:
|
||||
seq_length = self.model_tester.seq_length
|
||||
|
||||
self.assertListEqual(
|
||||
list(hidden_states[0].shape[-2:]),
|
||||
[
|
||||
self.model_tester.encoder_seq_length
|
||||
if hasattr(self.model_tester, "encoder_seq_length")
|
||||
else self.model_tester.seq_length,
|
||||
self.model_tester.hidden_size,
|
||||
],
|
||||
list(hidden_states[0].shape[-2:]), [seq_length, self.model_tester.hidden_size],
|
||||
)
|
||||
|
||||
def test_resize_tokens_embeddings(self):
|
||||
@@ -485,6 +503,9 @@ class ModelTesterMixin:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
|
||||
if self.model_tester.is_training is False:
|
||||
model.eval()
|
||||
|
||||
model_vocab_size = config.vocab_size
|
||||
# Retrieve the embeddings and clone theme
|
||||
model_embed = model.resize_token_embeddings(model_vocab_size)
|
||||
@@ -628,9 +649,13 @@ class ModelTesterMixin:
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
input_ids = inputs_dict["input_ids"] if "input_ids" in inputs_dict else inputs_dict["inputs"]
|
||||
|
||||
# make sure that input_ids is at most of size 15
|
||||
input_ids = input_ids[..., :15]
|
||||
|
||||
# iterate over all generative models
|
||||
for model_class in self.all_generative_model_classes:
|
||||
model = model_class(config).to(torch_device)
|
||||
model.eval()
|
||||
|
||||
if config.bos_token_id is None:
|
||||
# if bos token id is not defined, model needs input_ids
|
||||
@@ -669,8 +694,12 @@ class ModelTesterMixin:
|
||||
torch_device
|
||||
)
|
||||
|
||||
# make sure that input_ids is at most of size 15
|
||||
input_ids = input_ids[..., :15]
|
||||
|
||||
for model_class in self.all_generative_model_classes:
|
||||
model = model_class(config).to(torch_device)
|
||||
model.eval()
|
||||
|
||||
if config.bos_token_id is None:
|
||||
# if bos token id is not defined mobel needs input_ids, num_return_sequences = 1
|
||||
@@ -750,7 +779,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
|
||||
|
||||
|
||||
def floats_tensor(shape, scale=1.0, rng=None, name=None):
|
||||
"""Creates a random float32 tensor of the shape within the vocab size."""
|
||||
"""Creates a random float32 tensor"""
|
||||
if rng is None:
|
||||
rng = global_rng
|
||||
|
||||
|
||||
955
tests/test_modeling_reformer.py
Normal file
955
tests/test_modeling_reformer.py
Normal file
@@ -0,0 +1,955 @@
|
||||
# coding=utf-8 # Copyright 2020 Huggingface
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
from .test_configuration_common import ConfigTester
|
||||
from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
from transformers import (
|
||||
ReformerConfig,
|
||||
ReformerModel,
|
||||
ReformerModelWithLMHead,
|
||||
ReformerTokenizer,
|
||||
ReformerLayer,
|
||||
REFORMER_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
import torch
|
||||
|
||||
|
||||
class ReformerModelTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=None,
|
||||
seq_length=None,
|
||||
is_training=None,
|
||||
is_decoder=None,
|
||||
use_input_mask=None,
|
||||
vocab_size=None,
|
||||
attention_head_size=None,
|
||||
hidden_size=None,
|
||||
num_attention_heads=None,
|
||||
local_attn_chunk_length=None,
|
||||
local_num_chunks_before=None,
|
||||
local_num_chunks_after=None,
|
||||
num_buckets=None,
|
||||
num_hashes=1,
|
||||
lsh_attn_chunk_length=None,
|
||||
lsh_num_chunks_before=None,
|
||||
lsh_num_chunks_after=None,
|
||||
chunk_size_lm_head=None,
|
||||
chunk_size_feed_forward=None,
|
||||
feed_forward_size=None,
|
||||
hidden_act=None,
|
||||
hidden_dropout_prob=None,
|
||||
local_attention_probs_dropout_prob=None,
|
||||
lsh_attention_probs_dropout_prob=None,
|
||||
max_position_embeddings=None,
|
||||
initializer_range=None,
|
||||
axial_norm_std=None,
|
||||
layer_norm_eps=None,
|
||||
axial_pos_embds=None,
|
||||
axial_pos_shape=None,
|
||||
axial_pos_embds_dim=None,
|
||||
attn_layers=None,
|
||||
pad_token_id=None,
|
||||
eos_token_id=None,
|
||||
scope=None,
|
||||
hash_seed=None,
|
||||
):
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.seq_length = seq_length
|
||||
self.is_training = is_training
|
||||
self.is_decoder = is_decoder
|
||||
self.use_input_mask = use_input_mask
|
||||
self.vocab_size = vocab_size
|
||||
self.attention_head_size = attention_head_size
|
||||
self.hidden_size = hidden_size
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_hidden_layers = len(attn_layers)
|
||||
self.local_attn_chunk_length = local_attn_chunk_length
|
||||
self.local_num_chunks_after = local_num_chunks_after
|
||||
self.local_num_chunks_before = local_num_chunks_before
|
||||
self.num_hashes = num_hashes
|
||||
self.num_buckets = tuple(num_buckets) if isinstance(num_buckets, list) else num_buckets
|
||||
self.lsh_attn_chunk_length = lsh_attn_chunk_length
|
||||
self.lsh_num_chunks_after = lsh_num_chunks_after
|
||||
self.lsh_num_chunks_before = lsh_num_chunks_before
|
||||
self.hidden_act = hidden_act
|
||||
self.feed_forward_size = feed_forward_size
|
||||
self.hidden_dropout_prob = hidden_dropout_prob
|
||||
self.local_attention_probs_dropout_prob = local_attention_probs_dropout_prob
|
||||
self.lsh_attention_probs_dropout_prob = lsh_attention_probs_dropout_prob
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.axial_pos_embds = axial_pos_embds
|
||||
self.axial_pos_shape = tuple(axial_pos_shape)
|
||||
self.axial_pos_embds_dim = tuple(axial_pos_embds_dim)
|
||||
self.axial_norm_std = axial_norm_std
|
||||
self.chunk_size_lm_head = chunk_size_lm_head
|
||||
self.chunk_size_feed_forward = chunk_size_feed_forward
|
||||
self.scope = scope
|
||||
self.attn_layers = attn_layers
|
||||
self.pad_token_id = pad_token_id
|
||||
self.hash_seed = hash_seed
|
||||
|
||||
attn_chunk_length = local_attn_chunk_length if local_attn_chunk_length is not None else lsh_attn_chunk_length
|
||||
num_chunks_after = local_num_chunks_after if local_num_chunks_after is not None else lsh_num_chunks_after
|
||||
num_chunks_before = local_num_chunks_before if local_num_chunks_before is not None else lsh_num_chunks_before
|
||||
|
||||
self.encoder_seq_length = seq_length // attn_chunk_length + (self.seq_length % attn_chunk_length != 0)
|
||||
self.key_length = (num_chunks_before + num_chunks_after + 1) * attn_chunk_length
|
||||
self.chunk_length = attn_chunk_length
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
|
||||
input_mask = None
|
||||
if self.use_input_mask:
|
||||
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
|
||||
|
||||
config = ReformerConfig(
|
||||
vocab_size=self.vocab_size,
|
||||
hidden_size=self.hidden_size,
|
||||
num_hidden_layers=self.num_hidden_layers,
|
||||
num_attention_heads=self.num_attention_heads,
|
||||
feed_forward_size=self.feed_forward_size,
|
||||
hidden_act=self.hidden_act,
|
||||
hidden_dropout_prob=self.hidden_dropout_prob,
|
||||
local_attention_probs_dropout_prob=self.local_attention_probs_dropout_prob,
|
||||
lsh_attention_probs_dropout_prob=self.lsh_attention_probs_dropout_prob,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
is_decoder=self.is_decoder,
|
||||
axial_pos_embds=self.axial_pos_embds,
|
||||
axial_pos_shape=self.axial_pos_shape,
|
||||
axial_pos_embds_dim=self.axial_pos_embds_dim,
|
||||
local_attn_chunk_length=self.local_attn_chunk_length,
|
||||
local_num_chunks_after=self.local_num_chunks_after,
|
||||
local_num_chunks_before=self.local_num_chunks_before,
|
||||
num_hashes=self.num_hashes,
|
||||
num_buckets=self.num_buckets,
|
||||
lsh_attn_chunk_length=self.lsh_attn_chunk_length,
|
||||
lsh_num_chunks_after=self.lsh_num_chunks_after,
|
||||
lsh_num_chunks_before=self.lsh_num_chunks_before,
|
||||
attn_layers=self.attn_layers,
|
||||
pad_token_id=self.pad_token_id,
|
||||
hash_seed=self.hash_seed,
|
||||
)
|
||||
|
||||
return (
|
||||
config,
|
||||
input_ids,
|
||||
input_mask,
|
||||
)
|
||||
|
||||
def check_loss_output(self, result):
|
||||
self.parent.assertListEqual(list(result["loss"].size()), [])
|
||||
|
||||
def create_and_check_reformer_model(
|
||||
self, config, input_ids, input_mask,
|
||||
):
|
||||
model = ReformerModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
(sequence_output,) = model(input_ids, attention_mask=input_mask)
|
||||
(sequence_output,) = model(input_ids)
|
||||
|
||||
result = {
|
||||
"sequence_output": sequence_output,
|
||||
}
|
||||
# 2 * hidden_size because we use reversible resnet layers
|
||||
self.parent.assertListEqual(
|
||||
list(result["sequence_output"].size()), [self.batch_size, self.seq_length, 2 * self.hidden_size],
|
||||
)
|
||||
|
||||
def create_and_check_reformer_model_with_lm_backward(
|
||||
self, config, input_ids, input_mask,
|
||||
):
|
||||
model = ReformerModelWithLMHead(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss = model(input_ids, attention_mask=input_mask, labels=input_ids)[0]
|
||||
loss.backward()
|
||||
|
||||
def create_and_check_reformer_with_lm(
|
||||
self, config, input_ids, input_mask,
|
||||
):
|
||||
model = ReformerModelWithLMHead(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, labels=input_ids)
|
||||
result = {
|
||||
"loss": loss,
|
||||
"prediction_scores": prediction_scores,
|
||||
}
|
||||
self.parent.assertListEqual(
|
||||
list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size],
|
||||
)
|
||||
self.check_loss_output(result)
|
||||
|
||||
def create_and_check_reformer_model_with_attn_mask(self, config, input_ids, input_mask, is_decoder):
|
||||
# no special position embeddings
|
||||
config.axial_pos_embds = False
|
||||
config.is_decoder = is_decoder
|
||||
|
||||
if self.lsh_attn_chunk_length is not None:
|
||||
# need to set chunk length equal sequence length to be certain that chunking works
|
||||
config.lsh_attn_chunk_length = self.seq_length
|
||||
|
||||
model = ReformerModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
# set all position encodings to zero so that postions don't matter
|
||||
with torch.no_grad():
|
||||
embedding = model.embeddings.position_embeddings.embedding
|
||||
embedding.weight = torch.nn.Parameter(torch.zeros(embedding.weight.shape).to(torch_device))
|
||||
embedding.weight.requires_grad = False
|
||||
|
||||
half_seq_len = self.seq_length // 2
|
||||
roll = self.chunk_length
|
||||
|
||||
half_input_ids = input_ids[:, :half_seq_len]
|
||||
|
||||
# normal padded
|
||||
attn_mask = torch.cat([torch.ones_like(half_input_ids), torch.zeros_like(half_input_ids)], dim=-1,)
|
||||
input_ids_padded = torch.cat(
|
||||
[half_input_ids, ids_tensor((self.batch_size, half_seq_len), self.vocab_size)], dim=-1,
|
||||
)
|
||||
|
||||
# shifted padded
|
||||
input_ids_roll = torch.cat(
|
||||
[half_input_ids, ids_tensor((self.batch_size, half_seq_len), self.vocab_size)], dim=-1,
|
||||
)
|
||||
input_ids_roll = torch.roll(input_ids_roll, roll, dims=-1)
|
||||
attn_mask_roll = torch.roll(attn_mask, roll, dims=-1)
|
||||
|
||||
output_padded = model(input_ids_padded, attention_mask=attn_mask)[0][:, :half_seq_len]
|
||||
output_padded_rolled = model(input_ids_roll, attention_mask=attn_mask_roll)[0][:, roll : half_seq_len + roll]
|
||||
|
||||
self.parent.assertTrue(torch.allclose(output_padded, output_padded_rolled, atol=1e-3))
|
||||
|
||||
def create_and_check_reformer_layer_dropout_seed(self, config, input_ids, input_mask, is_decoder):
|
||||
config.is_decoder = is_decoder
|
||||
layer = ReformerLayer(config).to(torch_device)
|
||||
layer.train()
|
||||
shape = (
|
||||
self.batch_size,
|
||||
self.seq_length,
|
||||
config.hidden_size,
|
||||
) # Batch x SeqLen x hiddenSize
|
||||
|
||||
# get random tensors
|
||||
hidden_states = floats_tensor(shape)
|
||||
prev_attn_output = floats_tensor(shape)
|
||||
|
||||
# now the random seeds for attention and feed forward is initialized
|
||||
# forward tensors with dropout
|
||||
layer_outputs = layer(prev_attn_output, hidden_states, attention_mask=input_mask)
|
||||
|
||||
next_attn_output = layer_outputs.attn_output
|
||||
next_hidden_states = layer_outputs.hidden_states
|
||||
|
||||
torch.manual_seed(layer.attention_seed)
|
||||
attn_outputs = layer.attention(hidden_states, attention_mask=input_mask)
|
||||
self.parent.assertTrue(
|
||||
torch.allclose(prev_attn_output + attn_outputs.hidden_states, next_attn_output, atol=1e-3,)
|
||||
)
|
||||
|
||||
torch.manual_seed(layer.feed_forward_seed)
|
||||
feed_forward_hidden_states = layer.feed_forward(next_attn_output)
|
||||
self.parent.assertTrue(
|
||||
torch.allclose(next_hidden_states, hidden_states + feed_forward_hidden_states, atol=1e-3,)
|
||||
)
|
||||
|
||||
def create_and_check_reformer_feed_forward_chunking(self, config, input_ids, input_mask):
|
||||
torch.manual_seed(0)
|
||||
model = ReformerModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
hidden_states_no_chunk = model(input_ids, attention_mask=input_mask)[0]
|
||||
|
||||
config.chunk_size_lm_head = 1
|
||||
config.chunk_size_feed_forward = 1
|
||||
|
||||
torch.manual_seed(0)
|
||||
model = ReformerModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
hidden_states_with_chunk = model(input_ids, attention_mask=input_mask)[0]
|
||||
self.parent.assertTrue(torch.allclose(hidden_states_no_chunk, hidden_states_with_chunk, atol=1e-3))
|
||||
|
||||
def create_and_check_reformer_feed_backward_chunking(self, config, input_ids, input_mask):
|
||||
if not self.is_training:
|
||||
return
|
||||
|
||||
# disable dropout
|
||||
config.hidden_dropout_prob = 0
|
||||
config.local_attention_probs_dropout_prob = 0
|
||||
config.lsh_attention_probs_dropout_prob = 0
|
||||
|
||||
torch.manual_seed(0)
|
||||
model = ReformerModelWithLMHead(config=config)
|
||||
model.to(torch_device)
|
||||
model.train()
|
||||
model.zero_grad()
|
||||
loss_no_chunk, output_no_chunk = model(input_ids, labels=input_ids, attention_mask=input_mask)[:2]
|
||||
loss_no_chunk.backward()
|
||||
grad_slice_word_no_chunk = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
|
||||
grad_slice_position_factor_1_no_chunk = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:]
|
||||
grad_slice_position_factor_2_no_chunk = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5]
|
||||
|
||||
config.chunk_size_lm_head = 1
|
||||
config.chunk_size_feed_forward = 1
|
||||
|
||||
torch.manual_seed(0)
|
||||
model = ReformerModelWithLMHead(config=config)
|
||||
model.to(torch_device)
|
||||
model.train()
|
||||
model.zero_grad()
|
||||
loss_chunk, output_chunk = model(input_ids, labels=input_ids, attention_mask=input_mask)[:2]
|
||||
loss_chunk.backward()
|
||||
grad_slice_word_chunk = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
|
||||
grad_slice_position_factor_1_chunk = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:]
|
||||
grad_slice_position_factor_2_chunk = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5]
|
||||
self.parent.assertTrue(torch.allclose(loss_chunk, loss_no_chunk, atol=1e-3))
|
||||
self.parent.assertTrue(torch.allclose(grad_slice_word_no_chunk, grad_slice_word_chunk, atol=1e-3))
|
||||
self.parent.assertTrue(
|
||||
torch.allclose(grad_slice_position_factor_1_chunk, grad_slice_position_factor_1_no_chunk, atol=1e-3)
|
||||
)
|
||||
self.parent.assertTrue(
|
||||
torch.allclose(grad_slice_position_factor_2_chunk, grad_slice_position_factor_2_no_chunk, atol=1e-3)
|
||||
)
|
||||
|
||||
def create_and_check_reformer_random_seed(self, config, input_ids, input_mask):
|
||||
layer = ReformerLayer(config).to(torch_device)
|
||||
layer.train()
|
||||
|
||||
shape = (
|
||||
self.batch_size,
|
||||
self.seq_length,
|
||||
config.hidden_size,
|
||||
) # Batch x SeqLen x hiddenSize
|
||||
|
||||
hidden_states = floats_tensor(shape)
|
||||
attn_output = floats_tensor(shape)
|
||||
|
||||
seeds = []
|
||||
for _ in range(100):
|
||||
layer_outputs = layer(attn_output, hidden_states, attention_mask=input_mask)
|
||||
attn_output = layer_outputs.attn_output
|
||||
hidden_states = layer_outputs.hidden_states
|
||||
torch.manual_seed(layer.attention_seed)
|
||||
seeds.append(layer.attention_seed)
|
||||
self.parent.assertGreater(len(set(seeds)), 70)
|
||||
|
||||
seeds = []
|
||||
for _ in range(100):
|
||||
layer_outputs = layer(attn_output, hidden_states, attention_mask=input_mask)
|
||||
attn_output = layer_outputs.attn_output
|
||||
hidden_states = layer_outputs.hidden_states
|
||||
torch.manual_seed(layer.feed_forward_seed)
|
||||
seeds.append(layer.feed_forward_seed)
|
||||
self.parent.assertGreater(len(set(seeds)), 70)
|
||||
|
||||
def create_and_check_reformer_model_fp16_forward(self, config, input_ids, input_mask):
|
||||
model = ReformerModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.half()
|
||||
model.eval()
|
||||
output = model(input_ids, attention_mask=input_mask)[0]
|
||||
self.parent.assertFalse(torch.isnan(output).any().item())
|
||||
|
||||
def create_and_check_reformer_model_fp16_generate(self, config, input_ids, input_mask):
|
||||
model = ReformerModelWithLMHead(config=config)
|
||||
model.to(torch_device)
|
||||
model.half()
|
||||
model.eval()
|
||||
output = model.generate(input_ids, attention_mask=input_mask, do_sample=False)
|
||||
self.parent.assertFalse(torch.isnan(output).any().item())
|
||||
|
||||
def prepare_config_and_inputs_for_common(self):
|
||||
config_and_inputs = self.prepare_config_and_inputs()
|
||||
(config, input_ids, input_mask,) = config_and_inputs
|
||||
inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
|
||||
return config, inputs_dict
|
||||
|
||||
|
||||
class ReformerTesterMixin:
|
||||
"""
|
||||
Reformer Local and Reformer LSH run essentially the same tests
|
||||
"""
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_reformer_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_reformer_model(*config_and_inputs)
|
||||
|
||||
def test_reformer_lm_model_backward(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_reformer_model_with_lm_backward(*config_and_inputs)
|
||||
|
||||
def test_reformer_model_attn_masking(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_reformer_model_with_attn_mask(*config_and_inputs, True)
|
||||
self.model_tester.create_and_check_reformer_model_with_attn_mask(*config_and_inputs, False)
|
||||
|
||||
def test_reformer_with_lm(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_reformer_with_lm(*config_and_inputs)
|
||||
|
||||
def test_reformer_layer_training_dropout(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_reformer_layer_dropout_seed(*config_and_inputs, True)
|
||||
self.model_tester.create_and_check_reformer_layer_dropout_seed(*config_and_inputs, False)
|
||||
|
||||
def test_reformer_chunking_forward_equality(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_reformer_feed_forward_chunking(*config_and_inputs)
|
||||
|
||||
def test_reformer_chunking_backward_equality(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_reformer_feed_backward_chunking(*config_and_inputs)
|
||||
|
||||
@slow
|
||||
def test_dropout_random_seed_is_changing(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_reformer_random_seed(*config_and_inputs)
|
||||
|
||||
@unittest.skipIf(torch_device == "cpu", "Cant do half precision")
|
||||
def test_reformer_model_fp16_forward(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_reformer_model_fp16_forward(*config_and_inputs)
|
||||
|
||||
@unittest.skipIf(torch_device == "cpu", "Cant do half precision")
|
||||
def test_reformer_model_fp16_generate(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_reformer_model_fp16_generate(*config_and_inputs)
|
||||
|
||||
|
||||
@require_torch
|
||||
class ReformerLocalAttnModelTest(ModelTesterMixin, ReformerTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (ReformerModel, ReformerModelWithLMHead) if is_torch_available() else ()
|
||||
all_generative_model_classes = (ReformerModelWithLMHead,) if is_torch_available() else ()
|
||||
test_pruning = False
|
||||
test_headmasking = False
|
||||
test_torchscript = False
|
||||
|
||||
def prepare_kwargs(self):
|
||||
return {
|
||||
"batch_size": 13,
|
||||
"seq_length": 32,
|
||||
"is_training": True,
|
||||
"is_decoder": False,
|
||||
"use_input_mask": True,
|
||||
"vocab_size": 32,
|
||||
"attention_head_size": 16,
|
||||
"hidden_size": 32,
|
||||
"num_attention_heads": 2,
|
||||
"local_attn_chunk_length": 4,
|
||||
"local_num_chunks_before": 1,
|
||||
"local_num_chunks_after": 0,
|
||||
"chunk_size_lm_head": 0,
|
||||
"chunk_size_feed_forward": 0,
|
||||
"feed_forward_size": 32,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"local_attention_probs_dropout_prob": 0.1,
|
||||
"max_position_embeddings": 512,
|
||||
"initializer_range": 0.02,
|
||||
"axial_norm_std": 1.0,
|
||||
"layer_norm_eps": 1e-12,
|
||||
"axial_pos_embds": True,
|
||||
"axial_pos_shape": [4, 8],
|
||||
"axial_pos_embds_dim": [16, 16],
|
||||
"attn_layers": ["local", "local", "local", "local"],
|
||||
"pad_token_id": 0,
|
||||
"eos_token_id": 2,
|
||||
"scope": None,
|
||||
"hash_seed": 0,
|
||||
}
|
||||
|
||||
def setUp(self):
|
||||
tester_kwargs = self.prepare_kwargs()
|
||||
self.model_tester = ReformerModelTester(self, **tester_kwargs)
|
||||
self.config_tester = ConfigTester(self, config_class=ReformerConfig, hidden_size=37)
|
||||
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
for model_name in list(REFORMER_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
model = ReformerModelWithLMHead.from_pretrained(model_name)
|
||||
self.assertIsNotNone(model)
|
||||
|
||||
|
||||
@require_torch
|
||||
class ReformerLSHAttnModelTest(ModelTesterMixin, unittest.TestCase, ReformerTesterMixin):
|
||||
all_model_classes = (ReformerModel, ReformerModelWithLMHead) if is_torch_available() else ()
|
||||
all_generative_model_classes = (ReformerModelWithLMHead,) if is_torch_available() else ()
|
||||
test_pruning = False
|
||||
test_headmasking = False
|
||||
test_torchscript = False
|
||||
|
||||
def prepare_kwargs(self):
|
||||
return {
|
||||
"batch_size": 13,
|
||||
"seq_length": 13,
|
||||
"use_input_mask": True,
|
||||
"is_training": False,
|
||||
"is_decoder": False,
|
||||
"vocab_size": 32,
|
||||
"attention_head_size": 16,
|
||||
"hidden_size": 64,
|
||||
"num_attention_heads": 2,
|
||||
"num_buckets": 2,
|
||||
"num_hashes": 4,
|
||||
"lsh_attn_chunk_length": 4,
|
||||
"lsh_num_chunks_before": 2,
|
||||
"lsh_num_chunks_after": 3,
|
||||
"chunk_size_lm_head": 5,
|
||||
"chunk_size_feed_forward": 6,
|
||||
"feed_forward_size": 32,
|
||||
"hidden_act": "relu",
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"lsh_attention_probs_dropout_prob": 0.1,
|
||||
"max_position_embeddings": 512,
|
||||
"initializer_range": 0.02,
|
||||
"axial_norm_std": 1.0,
|
||||
"layer_norm_eps": 1e-12,
|
||||
"axial_pos_embds": True,
|
||||
"axial_pos_shape": [4, 8],
|
||||
"axial_pos_embds_dim": [16, 48],
|
||||
"attn_layers": ["lsh", "lsh", "lsh", "lsh"],
|
||||
"pad_token_id": 0,
|
||||
"eos_token_id": 2,
|
||||
"scope": None,
|
||||
"hash_seed": 0,
|
||||
}
|
||||
|
||||
def setUp(self):
|
||||
tester_kwargs = self.prepare_kwargs()
|
||||
self.model_tester = ReformerModelTester(self, **tester_kwargs)
|
||||
self.config_tester = ConfigTester(self, config_class=ReformerConfig, hidden_size=37)
|
||||
|
||||
|
||||
@require_torch
|
||||
class ReformerIntegrationTests(unittest.TestCase):
|
||||
"""
|
||||
These integration tests test the current layer activations and gradients againts the output of the Hugging Face Reformer model at time of integration: 29/04/2020. During integration, the model was tested against the output of the official Trax ReformerLM model for various cases ("lsh" only, "local" only, masked / non-masked, different chunk length, ....). In order to recover the original trax integration tests, one should use patrickvonplaten's fork of trax and the code that lives on the branch `branch_to_save_trax_integration_tests`.
|
||||
"""
|
||||
|
||||
def _get_basic_config_and_input(self):
|
||||
config = {
|
||||
"vocab_size": 320,
|
||||
"attention_head_size": 8,
|
||||
"hidden_size": 16,
|
||||
"num_attention_heads": 2,
|
||||
"num_buckets": 2,
|
||||
"num_hashes": 4,
|
||||
"lsh_attn_chunk_length": 4,
|
||||
"local_attn_chunk_length": 4,
|
||||
"lsh_num_chunks_before": 1,
|
||||
"lsh_num_chunks_after": 0,
|
||||
"local_num_chunks_before": 1,
|
||||
"local_num_chunks_after": 0,
|
||||
"chunk_size_lm_head": 0,
|
||||
"chunk_size_feed_forward": 0,
|
||||
"feed_forward_size": 32,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": 0.0,
|
||||
"lsh_attention_probs_dropout_prob": 0.0,
|
||||
"local_attention_probs_dropout_prob": 0.0,
|
||||
"max_position_embeddings": 32,
|
||||
"initializer_range": 0.02,
|
||||
"axial_norm_std": 1.0,
|
||||
"layer_norm_eps": 1e-12,
|
||||
"sinusoidal_pos_embds": False,
|
||||
"axial_pos_embds": True,
|
||||
"axial_pos_shape": [4, 8],
|
||||
"axial_pos_embds_dim": [8, 8],
|
||||
"hash_seed": 0,
|
||||
"is_decoder": True,
|
||||
}
|
||||
return config
|
||||
|
||||
def _get_hidden_states(self):
|
||||
return torch.tensor(
|
||||
[
|
||||
[
|
||||
[
|
||||
1.90826353e00,
|
||||
-1.45999730e00,
|
||||
-6.20405462e-01,
|
||||
1.52503433e00,
|
||||
-3.64464232e-01,
|
||||
-8.27359235e-01,
|
||||
8.39670803e-01,
|
||||
2.44492178e-01,
|
||||
4.98332758e-01,
|
||||
2.69175139e00,
|
||||
-7.08081422e-03,
|
||||
1.04915401e00,
|
||||
-1.83476661e00,
|
||||
7.67220476e-01,
|
||||
2.98580543e-01,
|
||||
2.84803992e-02,
|
||||
],
|
||||
[
|
||||
-2.66374286e-02,
|
||||
4.33497576e-01,
|
||||
3.10386309e-01,
|
||||
5.46039944e-01,
|
||||
-2.47292666e-04,
|
||||
-7.52305019e-01,
|
||||
2.39162103e-01,
|
||||
7.25216186e-01,
|
||||
-7.58357372e-01,
|
||||
4.20635998e-01,
|
||||
-4.04739919e-02,
|
||||
1.59924145e-01,
|
||||
2.05135748e00,
|
||||
-1.15997978e00,
|
||||
5.37166397e-01,
|
||||
2.62873606e-01,
|
||||
],
|
||||
[
|
||||
1.85247482e-01,
|
||||
7.07046037e-01,
|
||||
-6.77089715e-01,
|
||||
-2.24209655e00,
|
||||
-3.75307980e-02,
|
||||
-8.59380874e-01,
|
||||
-2.81027884e00,
|
||||
1.01276376e00,
|
||||
-1.69438001e00,
|
||||
4.17574660e-01,
|
||||
-1.49196962e00,
|
||||
-1.76483717e00,
|
||||
-1.94566312e-01,
|
||||
-1.71183858e00,
|
||||
7.72903565e-01,
|
||||
-1.11557056e00,
|
||||
],
|
||||
[
|
||||
9.46069193e-01,
|
||||
1.53417623e-01,
|
||||
-9.58686996e-01,
|
||||
1.18126669e-01,
|
||||
1.75967724e00,
|
||||
1.62194590e00,
|
||||
-5.74108159e-01,
|
||||
6.79920443e-01,
|
||||
5.44028163e-01,
|
||||
2.05466114e-01,
|
||||
-3.63045868e-01,
|
||||
2.41865062e-01,
|
||||
3.20348382e-01,
|
||||
-9.05611176e-01,
|
||||
-1.92690727e-01,
|
||||
-1.19917547e00,
|
||||
],
|
||||
]
|
||||
],
|
||||
dtype=torch.float32,
|
||||
device=torch_device,
|
||||
)
|
||||
|
||||
def _get_attn_mask(self):
|
||||
return torch.tensor([[0, 1, 0, 0]], dtype=torch.long, device=torch_device)
|
||||
|
||||
def _get_input_ids_and_mask(self):
|
||||
mask = torch.tensor(
|
||||
[
|
||||
[1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1],
|
||||
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0],
|
||||
],
|
||||
dtype=torch.long,
|
||||
device=torch_device,
|
||||
)
|
||||
|
||||
input_ids = torch.tensor(
|
||||
[
|
||||
[
|
||||
89,
|
||||
279,
|
||||
286,
|
||||
84,
|
||||
194,
|
||||
316,
|
||||
182,
|
||||
28,
|
||||
283,
|
||||
37,
|
||||
169,
|
||||
7,
|
||||
253,
|
||||
267,
|
||||
107,
|
||||
250,
|
||||
44,
|
||||
7,
|
||||
102,
|
||||
62,
|
||||
3,
|
||||
243,
|
||||
171,
|
||||
265,
|
||||
302,
|
||||
48,
|
||||
164,
|
||||
264,
|
||||
148,
|
||||
229,
|
||||
280,
|
||||
150,
|
||||
],
|
||||
[
|
||||
9,
|
||||
192,
|
||||
66,
|
||||
112,
|
||||
163,
|
||||
83,
|
||||
135,
|
||||
70,
|
||||
224,
|
||||
96,
|
||||
31,
|
||||
80,
|
||||
196,
|
||||
80,
|
||||
63,
|
||||
22,
|
||||
85,
|
||||
100,
|
||||
47,
|
||||
283,
|
||||
0,
|
||||
163,
|
||||
126,
|
||||
143,
|
||||
195,
|
||||
82,
|
||||
53,
|
||||
82,
|
||||
18,
|
||||
27,
|
||||
182,
|
||||
52,
|
||||
],
|
||||
],
|
||||
dtype=torch.long,
|
||||
device=torch_device,
|
||||
)
|
||||
|
||||
return input_ids, mask
|
||||
|
||||
def test_lsh_layer_forward(self):
|
||||
config = self._get_basic_config_and_input()
|
||||
config["attn_layers"] = ["lsh"]
|
||||
config["is_decoder"] = False
|
||||
hidden_states = self._get_hidden_states()
|
||||
torch.manual_seed(0)
|
||||
layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
|
||||
layer.eval()
|
||||
reformer_output = layer(prev_attn_output=hidden_states.clone(), hidden_states=hidden_states)
|
||||
output_slice = reformer_output.hidden_states[0, 0, :5]
|
||||
expected_output_slice = torch.tensor(
|
||||
[1.6879, -1.3083, -0.4708, 1.3555, -0.6292], dtype=torch.float, device=torch_device,
|
||||
)
|
||||
self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
|
||||
|
||||
def test_lsh_layer_forward_complex(self):
|
||||
config = self._get_basic_config_and_input()
|
||||
config["attn_layers"] = ["lsh"]
|
||||
config["num_buckets"] = [2, 4]
|
||||
attn_mask = self._get_attn_mask()
|
||||
hidden_states = self._get_hidden_states()
|
||||
torch.manual_seed(0)
|
||||
layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
|
||||
layer.eval()
|
||||
reformer_output = layer(
|
||||
prev_attn_output=hidden_states.clone(), hidden_states=hidden_states, attention_mask=attn_mask,
|
||||
)
|
||||
output_slice = reformer_output.hidden_states[0, 0, :5]
|
||||
expected_output_slice = torch.tensor(
|
||||
[1.6439, -1.2306, -0.5108, 1.3006, -0.6537], dtype=torch.float, device=torch_device,
|
||||
)
|
||||
self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
|
||||
|
||||
def test_local_layer_forward(self):
|
||||
config = self._get_basic_config_and_input()
|
||||
config["attn_layers"] = ["local"]
|
||||
config["is_decoder"] = False
|
||||
hidden_states = self._get_hidden_states()
|
||||
torch.manual_seed(0)
|
||||
layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
|
||||
layer.eval()
|
||||
reformer_output = layer(prev_attn_output=hidden_states, hidden_states=hidden_states)
|
||||
output_slice = reformer_output.hidden_states[0, 0, :5]
|
||||
expected_output_slice = torch.tensor(
|
||||
[1.4212, -2.0576, -0.9688, 1.4599, -0.1344], dtype=torch.float, device=torch_device,
|
||||
)
|
||||
self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
|
||||
|
||||
def test_local_layer_forward_complex(self):
|
||||
config = self._get_basic_config_and_input()
|
||||
config["attn_layers"] = ["local"]
|
||||
attn_mask = self._get_attn_mask()
|
||||
hidden_states = self._get_hidden_states()
|
||||
torch.manual_seed(0)
|
||||
layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
|
||||
layer.eval()
|
||||
reformer_output = layer(prev_attn_output=hidden_states, hidden_states=hidden_states, attention_mask=attn_mask,)
|
||||
output_slice = reformer_output.hidden_states[0, 0, :5]
|
||||
expected_output_slice = torch.tensor(
|
||||
[1.5476, -1.9020, -0.9902, 1.5013, -0.1950], dtype=torch.float, device=torch_device,
|
||||
)
|
||||
self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
|
||||
|
||||
def test_lsh_model_forward(self):
|
||||
config = self._get_basic_config_and_input()
|
||||
config["attn_layers"] = ["lsh", "lsh", "lsh", "lsh"]
|
||||
config["num_buckets"] = [2, 4]
|
||||
torch.manual_seed(0)
|
||||
model = ReformerModel(ReformerConfig(**config)).to(torch_device)
|
||||
model.eval()
|
||||
input_ids, attn_mask = self._get_input_ids_and_mask()
|
||||
hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0]
|
||||
output_slice = hidden_states[0, 0, :5]
|
||||
expected_output_slice = torch.tensor(
|
||||
[-0.9896, -0.9396, -1.0831, -0.0597, 0.2456], dtype=torch.float, device=torch_device,
|
||||
)
|
||||
self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
|
||||
|
||||
def test_local_model_forward(self):
|
||||
config = self._get_basic_config_and_input()
|
||||
config["attn_layers"] = ["local", "local", "local", "local"]
|
||||
torch.manual_seed(0)
|
||||
model = ReformerModel(ReformerConfig(**config)).to(torch_device)
|
||||
model.eval()
|
||||
input_ids, attn_mask = self._get_input_ids_and_mask()
|
||||
hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0]
|
||||
output_slice = hidden_states[0, 0, :5]
|
||||
expected_output_slice = torch.tensor(
|
||||
[-1.6791, 0.7171, 0.1594, 0.4063, 1.2584], dtype=torch.float, device=torch_device,
|
||||
)
|
||||
self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
|
||||
|
||||
def test_lm_model_forward(self):
|
||||
config = self._get_basic_config_and_input()
|
||||
config["attn_layers"] = ["local", "lsh", "local", "lsh", "local", "lsh"]
|
||||
config["num_buckets"] = [2, 4]
|
||||
config["is_decoder"] = False
|
||||
torch.manual_seed(0)
|
||||
model = ReformerModelWithLMHead(ReformerConfig(**config)).to(torch_device)
|
||||
model.eval()
|
||||
input_ids, attn_mask = self._get_input_ids_and_mask()
|
||||
hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0]
|
||||
output_slice = hidden_states[1, -1, :5]
|
||||
expected_output_slice = torch.tensor(
|
||||
[0.0324, -0.0121, 0.0615, 0.0031, -0.0297], dtype=torch.float, device=torch_device,
|
||||
)
|
||||
self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
|
||||
|
||||
def test_local_lm_model_grad(self):
|
||||
config = self._get_basic_config_and_input()
|
||||
config["attn_layers"] = ["local", "local", "local", "local"]
|
||||
config["hidden_dropout_prob"] = 0.0
|
||||
config["local_attention_probs_dropout_prob"] = 0.0
|
||||
torch.manual_seed(0)
|
||||
model = ReformerModelWithLMHead(ReformerConfig(**config)).to(torch_device)
|
||||
model.train()
|
||||
model.zero_grad()
|
||||
input_ids, _ = self._get_input_ids_and_mask()
|
||||
loss = model(input_ids=input_ids, labels=input_ids)[0]
|
||||
|
||||
self.assertTrue(torch.allclose(loss, torch.tensor(5.7786, dtype=torch.float, device=torch_device), atol=1e-3))
|
||||
loss.backward()
|
||||
|
||||
# check last grads to cover all proable errors
|
||||
grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
|
||||
expected_grad_slice_word = torch.tensor(
|
||||
[-0.0005, 0.0001, 0.0002, 0.0003, 0.0006], dtype=torch.float, device=torch_device,
|
||||
)
|
||||
grad_slice_position_factor_1 = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:]
|
||||
expected_grad_slice_pos_fac_1 = torch.tensor(
|
||||
[0.0037, -1.3793, -1.0231, -1.5230, -2.5306], dtype=torch.float, device=torch_device,
|
||||
)
|
||||
grad_slice_position_factor_2 = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5]
|
||||
expected_grad_slice_pos_fac_2 = torch.tensor(
|
||||
[-1.3165, 0.5168, 0.7785, 1.0811, -0.9830], dtype=torch.float, device=torch_device,
|
||||
)
|
||||
self.assertTrue(torch.allclose(grad_slice_word, expected_grad_slice_word, atol=1e-3))
|
||||
self.assertTrue(torch.allclose(grad_slice_position_factor_1, expected_grad_slice_pos_fac_1, atol=1e-3))
|
||||
self.assertTrue(torch.allclose(grad_slice_position_factor_2, expected_grad_slice_pos_fac_2, atol=1e-3))
|
||||
|
||||
def test_lsh_lm_model_grad(self):
|
||||
config = self._get_basic_config_and_input()
|
||||
config["attn_layers"] = ["lsh", "lsh", "lsh", "lsh"]
|
||||
config["hidden_dropout_prob"] = 0.0
|
||||
config["lsh_attention_probs_dropout_prob"] = 0.0
|
||||
config["num_buckets"] = [2, 4]
|
||||
config["num_hashes"] = 6
|
||||
torch.manual_seed(0)
|
||||
model = ReformerModelWithLMHead(ReformerConfig(**config)).to(torch_device)
|
||||
model.train()
|
||||
model.zero_grad()
|
||||
input_ids, _ = self._get_input_ids_and_mask()
|
||||
loss = model(input_ids=input_ids, labels=input_ids)[0]
|
||||
|
||||
self.assertTrue(torch.allclose(loss, torch.tensor(5.7819, dtype=torch.float, device=torch_device), atol=1e-3))
|
||||
loss.backward()
|
||||
# check last grads to cover all proable errors
|
||||
grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
|
||||
expected_grad_slice_word = torch.tensor(
|
||||
[2.6357e-05, 4.3358e-04, -8.4985e-04, 1.0094e-04, 3.8954e-04], dtype=torch.float, device=torch_device,
|
||||
)
|
||||
grad_slice_position_factor_1 = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:]
|
||||
expected_grad_slice_pos_fac_1 = torch.tensor(
|
||||
[-0.0984, 0.6283, 0.4282, 1.2960, 0.6897], dtype=torch.float, device=torch_device,
|
||||
)
|
||||
grad_slice_position_factor_2 = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5]
|
||||
expected_grad_slice_pos_fac_2 = torch.tensor(
|
||||
[0.4626, -0.0231, -0.0172, 0.1081, 0.3805], dtype=torch.float, device=torch_device,
|
||||
)
|
||||
self.assertTrue(torch.allclose(grad_slice_word, expected_grad_slice_word, atol=1e-3))
|
||||
self.assertTrue(torch.allclose(grad_slice_position_factor_1, expected_grad_slice_pos_fac_1, atol=1e-3))
|
||||
self.assertTrue(torch.allclose(grad_slice_position_factor_2, expected_grad_slice_pos_fac_2, atol=1e-3))
|
||||
|
||||
@slow
|
||||
def test_pretrained_generate_crime_and_punish(self):
|
||||
model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device)
|
||||
tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
|
||||
model.eval()
|
||||
|
||||
input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device)
|
||||
output_ids = model.generate(
|
||||
input_ids, max_length=50, num_beams=4, early_stopping=True, do_sample=False, num_hashes=8
|
||||
)
|
||||
output_text = tokenizer.decode(output_ids[0])
|
||||
self.assertEqual(
|
||||
output_text,
|
||||
"A few months later state expression in his ideas, at the first entrance. He was positively for an inst",
|
||||
)
|
||||
Reference in New Issue
Block a user