Fix MarianTokenizer to remove metaspace character in decode (#26091)

* add: check to remove metaspace from marian tokenizer

* fix: metaspace character being removed from everywhere

* fix: remove redundant check at top

* add: test for marian tokenizer decode fix

* fix: simplified the test
This commit is contained in:
Tanay Mehta
2023-09-13 01:23:31 +05:30
committed by GitHub
parent 03e309d58e
commit 12f043eaea
2 changed files with 10 additions and 0 deletions

View File

@@ -149,3 +149,10 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
decoded = tokenizer.decode(target_ids, skip_special_tokens=True)
self.assertEqual(decoded, target_text)
def test_tokenizer_decode(self):
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-es")
source_text = "Hello World"
ids = tokenizer(source_text)["input_ids"]
output_text = tokenizer.decode(ids, skip_special_tokens=True)
self.assertEqual(source_text, output_text)