BLOOM minor changes on tokenizer (#17823)

* few fixes:

- hardcode tokenizer padding side
- remove unused args

* few fixes:

- added new attribute on TokenizerTesterMixin
- added new slow test
- remove unused arg on tokenizer class

* make style

* Update src/transformers/models/bloom/tokenization_bloom_fast.py

Co-authored-by: SaulLu <55560583+SaulLu@users.noreply.github.com>

* make quality

* apply changes

- remove new attribute
- redefine test on the class

* add comments

Co-authored-by: SaulLu <55560583+SaulLu@users.noreply.github.com>
This commit is contained in:
Younes Belkada
2022-06-23 15:57:12 +02:00
committed by GitHub
parent 6f29029b05
commit 18c263c4b6
3 changed files with 35 additions and 12 deletions

View File

@@ -441,6 +441,33 @@ class BloomModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase)
tokenizer.decode(greedy_output_without_pad[0, :-3], skip_special_tokens=True),
)
@slow
def test_right_left_batched_input(self):
    """Compare generation from the same batch padded on the right and on the left.

    The batch holds two prompts of different lengths, so padding is applied. The
    test first checks that the two padding sides yield different `input_ids`, then
    checks that the text decoded from the first generated sequence is the same for
    both padding sides.
    """
    checkpoint = "bigscience/bloom-1b3"
    model = BloomForCausalLM.from_pretrained(checkpoint, use_cache=True).eval()
    tokenizer = BloomTokenizerFast.from_pretrained(checkpoint)
    prompts = ["Hello there", "Joe Biden is the president of the"]

    # Encode the identical batch once per padding side (right first, then left);
    # the tokenizer is left configured with left padding afterwards.
    encoded = {}
    for side in ("right", "left"):
        tokenizer.padding_side = side
        encoded[side] = tokenizer(prompts, return_tensors="pt", padding=True)

    # The padded token id matrices must not be identical.
    ids_right = encoded["right"]["input_ids"].tolist()
    ids_left = encoded["left"]["input_ids"].tolist()
    self.assertNotEqual(ids_right, ids_left)

    # With sampling disabled, the decoded text of the first sequence must match
    # once special tokens are skipped.
    generated_right = model.generate(**encoded["right"], max_length=10, do_sample=False)
    generated_left = model.generate(**encoded["left"], max_length=10, do_sample=False)
    text_right = tokenizer.decode(generated_right[0], skip_special_tokens=True)
    text_left = tokenizer.decode(generated_left[0], skip_special_tokens=True)
    self.assertEqual(text_right, text_left)
@require_torch
class BloomEmbeddingTest(unittest.TestCase):