Add BigBirdPegasus (#10991)
* init bigbird pegasus * add debugging nb ; update config * init conversion * update conversion script * complete conversion script * init forward() * complete forward() * add tokenizer * add some slow tests * commit current * fix copies * add docs * add conversion script for bigbird-roberta-summarization * remove TODO * small fixups * correct tokenizer * add bigbird core for now * fix config * fix more * revert pegasus-tokenizer back * make style * everything working for pubmed; yayygit status * complete tests finally * remove bigbird pegasus tok * correct tokenizer * correct tests * add tokenizer files * finish make style * fix test * update * make style * fix tok utils base file * make fix-copies * clean a bit * small update * fix some suggestions * add to readme * fix a bit, clean tests * fix more tests * Update src/transformers/__init__.py * Update src/transformers/__init__.py * make fix-copies * complete attn switching, auto-padding left * make style * fix auto-padding test * make style * fix batched attention tests * put tolerance at 1e-1 for stand-alone decoder test * fix docs * fix tests * correct slow tokenizer conversion * Apply suggestions from code review Co-authored-by: Suraj Patil <surajp815@gmail.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * complete remaining suggestions * fix test Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Suraj Patil <surajp815@gmail.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -55,7 +55,6 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
raw_input_str = "Let's see which <unk> is the better <unk_token_11> one <mask_1> It seems like this <mask_2> was important </s> <pad> <pad> <pad>"
|
||||
rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
|
||||
py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
|
||||
# TODO: (Thom, Patrick) - this fails because the rust tokenizer does not know about the <mask_1>, <mask_2>, and those <unk_token_x> yet
|
||||
self.assertListEqual(py_ids, rust_ids)
|
||||
|
||||
def test_large_mask_tokens(self):
|
||||
@@ -96,3 +95,81 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
assert batch.attention_mask.shape == (2, 1024)
|
||||
assert targets["input_ids"].shape == (2, 5)
|
||||
assert len(batch) == 2 # input_ids, attention_mask.
|
||||
|
||||
|
||||
@require_sentencepiece
|
||||
@require_tokenizers
|
||||
class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = PegasusTokenizer
|
||||
rust_tokenizer_class = PegasusTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
# We have a SentencePiece fixture for testing
|
||||
tokenizer = PegasusTokenizer(SAMPLE_VOCAB, offset=0, mask_token_sent=None, mask_token="[MASK]")
|
||||
tokenizer.save_pretrained(self.tmpdirname)
|
||||
|
||||
@cached_property
|
||||
def _large_tokenizer(self):
|
||||
return PegasusTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
|
||||
|
||||
def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
|
||||
return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_input_output_texts(self, tokenizer):
|
||||
return ("This is a test", "This is a test")
|
||||
|
||||
def test_mask_tokens_rust_pegasus(self):
|
||||
rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
|
||||
py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
|
||||
raw_input_str = "Let's see which <unk> is the better <unk_token> one [MASK] It seems like this [MASK] was important </s> <pad> <pad> <pad>"
|
||||
rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
|
||||
py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
|
||||
self.assertListEqual(py_ids, rust_ids)
|
||||
|
||||
@require_torch
|
||||
def test_large_seq2seq_truncation(self):
|
||||
src_texts = ["This is going to be way too long." * 1000, "short example"]
|
||||
tgt_texts = ["not super long but more than 5 tokens", "tiny"]
|
||||
batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt")
|
||||
with self._large_tokenizer.as_target_tokenizer():
|
||||
targets = self._large_tokenizer(
|
||||
tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt"
|
||||
)
|
||||
|
||||
assert batch.input_ids.shape == (2, 4096)
|
||||
assert batch.attention_mask.shape == (2, 4096)
|
||||
assert targets["input_ids"].shape == (2, 5)
|
||||
assert len(batch) == 2 # input_ids, attention_mask.
|
||||
|
||||
def test_equivalence_to_orig_tokenizer(self):
|
||||
"""
|
||||
To run with original TF tokenizer:
|
||||
|
||||
!wget https://github.com/google-research/bigbird/raw/master/bigbird/vocab/pegasus.model
|
||||
!pip install tensorflow-text
|
||||
|
||||
import tensorflow.compat.v2 as tf
|
||||
import tensorflow_text as tft
|
||||
|
||||
VOCAB_FILE = "./pegasus.model"
|
||||
|
||||
tf.enable_v2_behavior()
|
||||
|
||||
test_str = "This is an example string that is used to test the original TF implementation against the HF implementation"
|
||||
tokenizer = tft.SentencepieceTokenizer(model=tf.io.gfile.GFile(VOCAB_FILE, "rb").read())
|
||||
|
||||
tokenizer.tokenize(test_str)
|
||||
"""
|
||||
|
||||
test_str = "This is an example string that is used to test the original TF implementation against the HF implementation"
|
||||
|
||||
token_ids = self._large_tokenizer(test_str).input_ids
|
||||
|
||||
self.assertListEqual(
|
||||
token_ids,
|
||||
[182, 117, 142, 587, 4211, 120, 117, 263, 112, 804, 109, 856, 25016, 3137, 464, 109, 26955, 3137, 1],
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user