Remove deprecated properties in tokenization_nllb.py and tokenization_nllb_fast.py (#29834)

* Fix typo in tokenization_nllb.py

Change `adder_tokens_decoder` into `added_tokens_decoder` and improve the warning's readability.

* Fix typo in tokenization_nllb_fast.py

Change `adder_tokens_decoder` into `added_tokens_decoder` and improve the warning's readability.

* Remove deprecated attributes in tokenization_nllb.py

Remove deprecated attributes: `lang_code_to_id`, `fairseq_tokens_to_ids`, `id_to_lang_code`, and `fairseq_ids_to_tokens`

* Remove deprecated attribute in tokenization_nllb_fast.py

Remove deprecated attribute `lang_code_to_id`

* Remove deprecated properties in tokenization_nllb.py

Remove deprecated properties - fix format

* Remove deprecated properties in tokenization_nllb_fast.py

Remove deprecated properties - fix format

* Update test_tokenization_nllb.py

* Update test_tokenization_nllb.py

* Update tokenization_nllb.py

* Update test_tokenization_seamless_m4t.py

* Update test_tokenization_seamless_m4t.py
This commit is contained in:
Yasmin Moslem
2024-05-23 17:53:26 +01:00
committed by GitHub
parent 965e98dc54
commit 6d3d5b1039
4 changed files with 0 additions and 77 deletions

View File

@@ -367,11 +367,6 @@ class NllbDistilledIntegrationTest(unittest.TestCase):
cls.pad_token_id = 1
return cls
def test_language_codes(self):
self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ace_Arab"], 256001)
self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ace_Latn"], 256002)
self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["fra_Latn"], 256057)
def test_enro_tokenizer_batch_encode_plus(self):
ids = self.tokenizer.batch_encode_plus(self.src_text).input_ids[0]
self.assertListEqual(self.expected_src_tokens, ids)
@@ -397,13 +392,6 @@ class NllbDistilledIntegrationTest(unittest.TestCase):
def test_mask_token(self):
self.assertListEqual(self.tokenizer.convert_tokens_to_ids(["<mask>", "ar_AR"]), [256203, 3])
def test_special_tokens_unaffacted_by_save_load(self):
tmpdirname = tempfile.mkdtemp()
original_special_tokens = self.tokenizer.fairseq_tokens_to_ids
self.tokenizer.save_pretrained(tmpdirname)
new_tok = NllbTokenizer.from_pretrained(tmpdirname)
self.assertDictEqual(new_tok.fairseq_tokens_to_ids, original_special_tokens)
@require_torch
def test_enro_tokenizer_prepare_batch(self):
batch = self.tokenizer(

View File

@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import unittest
from transformers import (
@@ -499,14 +498,6 @@ class SeamlessM4TDistilledIntegrationTest(unittest.TestCase):
self.assertEqual(ids[0], EN_CODE)
self.assertEqual(len(ids), desired_max_length)
# Copied from tests.models.nllb.test_tokenization_nllb.NllbDistilledIntegrationTest.test_special_tokens_unaffacted_by_save_load with fairseq_tokens_to_ids->additional_special_tokens, Nllb->SeamlessM4T, Dict->List
def test_special_tokens_unaffacted_by_save_load(self):
tmpdirname = tempfile.mkdtemp()
original_special_tokens = self.tokenizer.additional_special_tokens
self.tokenizer.save_pretrained(tmpdirname)
new_tok = SeamlessM4TTokenizer.from_pretrained(tmpdirname)
self.assertListEqual(new_tok.additional_special_tokens, original_special_tokens)
@require_torch
def test_enro_tokenizer_prepare_batch(self):
batch = self.tokenizer(