From 18c263c4b6b82726a3f2699e2dfee89383804391 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 23 Jun 2022 15:57:12 +0200 Subject: [PATCH] BLOOM minor changes on tokenizer (#17823) * few fixes: - hardcode tokenizer padding side - remove unused args * few fixes: - added new attribute on TokenizerTesterMixin - added new slow test - remove unused arg on tokenizer class * make style * Update src/transformers/models/bloom/tokenization_bloom_fast.py Co-authored-by: SaulLu <55560583+SaulLu@users.noreply.github.com> * make quality * apply changes - remove new attribute - redefine test on the class * add comments Co-authored-by: SaulLu <55560583+SaulLu@users.noreply.github.com> --- .../models/bloom/tokenization_bloom_fast.py | 13 +-------- tests/models/bloom/test_modeling_bloom.py | 27 +++++++++++++++++++ tests/models/bloom/test_tokenization_bloom.py | 7 +++++ 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py index c9785d641b..7c5f9b2407 100644 --- a/src/transformers/models/bloom/tokenization_bloom_fast.py +++ b/src/transformers/models/bloom/tokenization_bloom_fast.py @@ -45,16 +45,6 @@ PRETRAINED_VOCAB_FILES_MAP = { }, } -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "bigscience/tokenizer": 1024, - "bigscience/bloom-350m": 1024, - "bigscience/bloom-760m": 1024, - "bigscience/bloom-1b3": 1024, - "bigscience/bloom-2b5": 1024, - "bigscience/bloom-6b3": 1024, - "bigscience/bloom": 1024, -} - class BloomTokenizerFast(PreTrainedTokenizerFast): """ @@ -109,9 +99,9 @@ class BloomTokenizerFast(PreTrainedTokenizerFast): vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["input_ids", "attention_mask"] slow_tokenizer_class = None + # No `max_model_input_sizes` as BLOOM uses 
ALiBi positional embeddings def __init__( self, @@ -136,7 +126,6 @@ class BloomTokenizerFast(PreTrainedTokenizerFast): add_prefix_space=add_prefix_space, **kwargs, ) - pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space: pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type")) diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py index f71618eae8..0b2501982d 100644 --- a/tests/models/bloom/test_modeling_bloom.py +++ b/tests/models/bloom/test_modeling_bloom.py @@ -441,6 +441,33 @@ class BloomModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase) tokenizer.decode(greedy_output_without_pad[0, :-3], skip_special_tokens=True), ) + @slow + def test_right_left_batched_input(self): + path_1b3 = "bigscience/bloom-1b3" + model = BloomForCausalLM.from_pretrained(path_1b3, use_cache=True) + model = model.eval() + + tokenizer = BloomTokenizerFast.from_pretrained(path_1b3) + tokenizer.padding_side = "right" + + inputs = ["Hello there", "Joe Biden is the president of the"] + inputs_right = tokenizer(inputs, return_tensors="pt", padding=True) + + tokenizer.padding_side = "left" + inputs_left = tokenizer(inputs, return_tensors="pt", padding=True) + + # test token values are different + self.assertNotEqual(inputs_right["input_ids"].tolist(), inputs_left["input_ids"].tolist()) + + # test reconstructions are the same + outputs_right = model.generate(**inputs_right, max_length=10, do_sample=False) + outputs_left = model.generate(**inputs_left, max_length=10, do_sample=False) + + self.assertEqual( + tokenizer.decode(outputs_right[0], skip_special_tokens=True), + tokenizer.decode(outputs_left[0], skip_special_tokens=True), + ) + @require_torch class BloomEmbeddingTest(unittest.TestCase): diff --git a/tests/models/bloom/test_tokenization_bloom.py b/tests/models/bloom/test_tokenization_bloom.py index 
c213437a37..117240dbda 100644 --- a/tests/models/bloom/test_tokenization_bloom.py +++ b/tests/models/bloom/test_tokenization_bloom.py @@ -127,3 +127,10 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase): output_tokens = list(map(tokenizer.encode, input_text)) predicted_text = list(map(lambda x: tokenizer.decode(x, clean_up_tokenization_spaces=False), output_tokens)) self.assertListEqual(predicted_text, input_text) + + def test_pretrained_model_lists(self): + # The test has to be overridden because BLOOM uses ALiBi positional embeddings that do not have + # any sequence length constraints. This test of the parent class will fail since it relies on the + # maximum sequence length of the positional embeddings. + self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1) + self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)