BLOOM minor changes on tokenizer (#17823)
* few fixes:
  - hardcode tokenizer padding side
  - remove unused args
* few fixes:
  - added new attribute on TokenizerTesterMixin
  - added new slow test
  - remove unused arg on tokenizer class
* make style
* Update src/transformers/models/bloom/tokenization_bloom_fast.py
  Co-authored-by: SaulLu <55560583+SaulLu@users.noreply.github.com>
* make quality
* apply changes
  - remove new attribute
  - redefine test on the class
* add comments
Co-authored-by: SaulLu <55560583+SaulLu@users.noreply.github.com>
This commit is contained in:
@@ -45,16 +45,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
|
||||||
"bigscience/tokenizer": 1024,
|
|
||||||
"bigscience/bloom-350m": 1024,
|
|
||||||
"bigscience/bloom-760m": 1024,
|
|
||||||
"bigscience/bloom-1b3": 1024,
|
|
||||||
"bigscience/bloom-2b5": 1024,
|
|
||||||
"bigscience/bloom-6b3": 1024,
|
|
||||||
"bigscience/bloom": 1024,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class BloomTokenizerFast(PreTrainedTokenizerFast):
|
class BloomTokenizerFast(PreTrainedTokenizerFast):
|
||||||
"""
|
"""
|
||||||
@@ -109,9 +99,9 @@ class BloomTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
|
||||||
model_input_names = ["input_ids", "attention_mask"]
|
model_input_names = ["input_ids", "attention_mask"]
|
||||||
slow_tokenizer_class = None
|
slow_tokenizer_class = None
|
||||||
|
# No `max_model_input_sizes` as BLOOM uses ALiBi positional embeddings
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -136,7 +126,6 @@ class BloomTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
add_prefix_space=add_prefix_space,
|
add_prefix_space=add_prefix_space,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
|
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
|
||||||
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
|
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
|
||||||
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
|
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
|
||||||
|
|||||||
@@ -441,6 +441,33 @@ class BloomModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase)
|
|||||||
tokenizer.decode(greedy_output_without_pad[0, :-3], skip_special_tokens=True),
|
tokenizer.decode(greedy_output_without_pad[0, :-3], skip_special_tokens=True),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@slow
|
||||||
|
def test_right_left_batched_input(self):
|
||||||
|
path_1b3 = "bigscience/bloom-1b3"
|
||||||
|
model = BloomForCausalLM.from_pretrained(path_1b3, use_cache=True)
|
||||||
|
model = model.eval()
|
||||||
|
|
||||||
|
tokenizer = BloomTokenizerFast.from_pretrained(path_1b3)
|
||||||
|
tokenizer.padding_side = "right"
|
||||||
|
|
||||||
|
inputs = ["Hello there", "Joe Biden is the president of the"]
|
||||||
|
inputs_right = tokenizer(inputs, return_tensors="pt", padding=True)
|
||||||
|
|
||||||
|
tokenizer.padding_side = "left"
|
||||||
|
inputs_left = tokenizer(inputs, return_tensors="pt", padding=True)
|
||||||
|
|
||||||
|
# test token values are different
|
||||||
|
self.assertNotEqual(inputs_right["input_ids"].tolist(), inputs_left["input_ids"].tolist())
|
||||||
|
|
||||||
|
# test reconstructions are the same
|
||||||
|
outputs_right = model.generate(**inputs_right, max_length=10, do_sample=False)
|
||||||
|
outputs_left = model.generate(**inputs_left, max_length=10, do_sample=False)
|
||||||
|
|
||||||
|
self.assertEqual(
|
||||||
|
tokenizer.decode(outputs_right[0], skip_special_tokens=True),
|
||||||
|
tokenizer.decode(outputs_left[0], skip_special_tokens=True),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
class BloomEmbeddingTest(unittest.TestCase):
|
class BloomEmbeddingTest(unittest.TestCase):
|
||||||
|
|||||||
@@ -127,3 +127,10 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
output_tokens = list(map(tokenizer.encode, input_text))
|
output_tokens = list(map(tokenizer.encode, input_text))
|
||||||
predicted_text = list(map(lambda x: tokenizer.decode(x, clean_up_tokenization_spaces=False), output_tokens))
|
predicted_text = list(map(lambda x: tokenizer.decode(x, clean_up_tokenization_spaces=False), output_tokens))
|
||||||
self.assertListEqual(predicted_text, input_text)
|
self.assertListEqual(predicted_text, input_text)
|
||||||
|
|
||||||
|
def test_pretrained_model_lists(self):
|
||||||
|
# The test has to be overridden because BLOOM uses ALiBi positional embeddings that do not have
|
||||||
|
# any sequence length constraints. This test of the parent class will fail since it relies on the
|
||||||
|
# maximum sequence length of the positional embeddings.
|
||||||
|
self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
|
||||||
|
self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)
|
||||||
|
|||||||
Reference in New Issue
Block a user