Remove deprecated properties in tokenization_nllb.py and tokenization_nllb_fast.py (#29834)
* Fix typo in tokenization_nllb.py
  Change `adder_tokens_decoder` into `added_tokens_decoder` and improve the warning's readability.
* Fix typo in tokenization_nllb_fast.py
  Change `adder_tokens_decoder` into `added_tokens_decoder` and improve the warning's readability.
* Remove deprecated attributes in tokenization_nllb.py
  Remove deprecated attributes: `lang_code_to_id`, `fairseq_tokens_to_ids`, `id_to_lang_code`, and `fairseq_ids_to_tokens`
* Remove deprecated attribute in tokenization_nllb_fast.py
  Remove deprecated attribute `lang_code_to_id`
* Remove deprecated properties in tokenization_nllb.py
  Remove deprecated properties - fix format
* Remove deprecated properties in tokenization_nllb_fast.py
  Remove deprecated properties - fix format
* Update test_tokenization_nllb.py
* update test_tokenization_nllb.py
* Update tokenization_nllb.py
* Update test_tokenization_seamless_m4t.py
* Update test_tokenization_seamless_m4t.py
This commit is contained in:
@@ -159,18 +159,6 @@ class NllbTokenizer(PreTrainedTokenizer):
|
|||||||
self.fairseq_offset = 1
|
self.fairseq_offset = 1
|
||||||
self.sp_model_size = len(self.sp_model)
|
self.sp_model_size = len(self.sp_model)
|
||||||
|
|
||||||
# Everything that follows is kept for BC and will be removed in v4.38
|
|
||||||
self._fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
|
|
||||||
language_codes = FAIRSEQ_LANGUAGE_CODES if additional_special_tokens is None else additional_special_tokens
|
|
||||||
self._lang_code_to_id = {
|
|
||||||
code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(language_codes)
|
|
||||||
}
|
|
||||||
self._id_to_lang_code = {v: k for k, v in self._lang_code_to_id.items()}
|
|
||||||
self._fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
|
|
||||||
|
|
||||||
self._fairseq_tokens_to_ids.update(self.lang_code_to_id)
|
|
||||||
self._fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
|
|
||||||
|
|
||||||
super().__init__(
|
super().__init__(
|
||||||
bos_token=bos_token,
|
bos_token=bos_token,
|
||||||
eos_token=eos_token,
|
eos_token=eos_token,
|
||||||
@@ -217,38 +205,6 @@ class NllbTokenizer(PreTrainedTokenizer):
|
|||||||
def src_lang(self) -> str:
|
def src_lang(self) -> str:
|
||||||
return self._src_lang
|
return self._src_lang
|
||||||
|
|
||||||
@property
|
|
||||||
def lang_code_to_id(self):
|
|
||||||
logger.warning_once(
|
|
||||||
"the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder`"
|
|
||||||
" this attribute will be removed in `transformers` v4.38"
|
|
||||||
)
|
|
||||||
return self._lang_code_to_id
|
|
||||||
|
|
||||||
@property
|
|
||||||
def fairseq_tokens_to_ids(self):
|
|
||||||
logger.warning_once(
|
|
||||||
"the `fairseq_tokens_to_ids` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder`"
|
|
||||||
" this attribute will be removed in `transformers` v4.38"
|
|
||||||
)
|
|
||||||
return self._fairseq_tokens_to_ids
|
|
||||||
|
|
||||||
@property
|
|
||||||
def id_to_lang_code(self):
|
|
||||||
logger.warning_once(
|
|
||||||
"the `id_to_lang_code` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder`"
|
|
||||||
" this attribute will be removed in `transformers` v4.38"
|
|
||||||
)
|
|
||||||
return self._id_to_lang_code
|
|
||||||
|
|
||||||
@property
|
|
||||||
def fairseq_ids_to_tokens(self):
|
|
||||||
logger.warning_once(
|
|
||||||
"the `_fairseq_ids_to_tokens` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder`"
|
|
||||||
" this attribute will be removed in `transformers` v4.38"
|
|
||||||
)
|
|
||||||
return self._fairseq_ids_to_tokens
|
|
||||||
|
|
||||||
@src_lang.setter
|
@src_lang.setter
|
||||||
def src_lang(self, new_src_lang: str) -> None:
|
def src_lang(self, new_src_lang: str) -> None:
|
||||||
self._src_lang = new_src_lang
|
self._src_lang = new_src_lang
|
||||||
|
|||||||
@@ -161,23 +161,11 @@ class NllbTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
self._lang_code_to_id = {
|
|
||||||
lang_code: self.convert_tokens_to_ids(str(lang_code)) for lang_code in additional_special_tokens
|
|
||||||
}
|
|
||||||
|
|
||||||
self._src_lang = src_lang if src_lang is not None else "eng_Latn"
|
self._src_lang = src_lang if src_lang is not None else "eng_Latn"
|
||||||
self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
|
self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
|
||||||
self.tgt_lang = tgt_lang
|
self.tgt_lang = tgt_lang
|
||||||
self.set_src_lang_special_tokens(self._src_lang)
|
self.set_src_lang_special_tokens(self._src_lang)
|
||||||
|
|
||||||
@property
|
|
||||||
def lang_code_to_id(self):
|
|
||||||
logger.warning_once(
|
|
||||||
"the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder`"
|
|
||||||
" this attribute will be removed in `transformers` v4.38"
|
|
||||||
)
|
|
||||||
return self._lang_code_to_id
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def can_save_slow_tokenizer(self) -> bool:
|
def can_save_slow_tokenizer(self) -> bool:
|
||||||
return os.path.isfile(self.vocab_file) if self.vocab_file else False
|
return os.path.isfile(self.vocab_file) if self.vocab_file else False
|
||||||
|
|||||||
@@ -367,11 +367,6 @@ class NllbDistilledIntegrationTest(unittest.TestCase):
|
|||||||
cls.pad_token_id = 1
|
cls.pad_token_id = 1
|
||||||
return cls
|
return cls
|
||||||
|
|
||||||
def test_language_codes(self):
|
|
||||||
self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ace_Arab"], 256001)
|
|
||||||
self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ace_Latn"], 256002)
|
|
||||||
self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["fra_Latn"], 256057)
|
|
||||||
|
|
||||||
def test_enro_tokenizer_batch_encode_plus(self):
|
def test_enro_tokenizer_batch_encode_plus(self):
|
||||||
ids = self.tokenizer.batch_encode_plus(self.src_text).input_ids[0]
|
ids = self.tokenizer.batch_encode_plus(self.src_text).input_ids[0]
|
||||||
self.assertListEqual(self.expected_src_tokens, ids)
|
self.assertListEqual(self.expected_src_tokens, ids)
|
||||||
@@ -397,13 +392,6 @@ class NllbDistilledIntegrationTest(unittest.TestCase):
|
|||||||
def test_mask_token(self):
|
def test_mask_token(self):
|
||||||
self.assertListEqual(self.tokenizer.convert_tokens_to_ids(["<mask>", "ar_AR"]), [256203, 3])
|
self.assertListEqual(self.tokenizer.convert_tokens_to_ids(["<mask>", "ar_AR"]), [256203, 3])
|
||||||
|
|
||||||
def test_special_tokens_unaffacted_by_save_load(self):
|
|
||||||
tmpdirname = tempfile.mkdtemp()
|
|
||||||
original_special_tokens = self.tokenizer.fairseq_tokens_to_ids
|
|
||||||
self.tokenizer.save_pretrained(tmpdirname)
|
|
||||||
new_tok = NllbTokenizer.from_pretrained(tmpdirname)
|
|
||||||
self.assertDictEqual(new_tok.fairseq_tokens_to_ids, original_special_tokens)
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
def test_enro_tokenizer_prepare_batch(self):
|
def test_enro_tokenizer_prepare_batch(self):
|
||||||
batch = self.tokenizer(
|
batch = self.tokenizer(
|
||||||
|
|||||||
@@ -12,7 +12,6 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import tempfile
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from transformers import (
|
from transformers import (
|
||||||
@@ -499,14 +498,6 @@ class SeamlessM4TDistilledIntegrationTest(unittest.TestCase):
|
|||||||
self.assertEqual(ids[0], EN_CODE)
|
self.assertEqual(ids[0], EN_CODE)
|
||||||
self.assertEqual(len(ids), desired_max_length)
|
self.assertEqual(len(ids), desired_max_length)
|
||||||
|
|
||||||
# Copied from tests.models.nllb.test_tokenization_nllb.NllbDistilledIntegrationTest.test_special_tokens_unaffacted_by_save_load with fairseq_tokens_to_ids->additional_special_tokens, Nllb->SeamlessM4T, Dict->List
|
|
||||||
def test_special_tokens_unaffacted_by_save_load(self):
|
|
||||||
tmpdirname = tempfile.mkdtemp()
|
|
||||||
original_special_tokens = self.tokenizer.additional_special_tokens
|
|
||||||
self.tokenizer.save_pretrained(tmpdirname)
|
|
||||||
new_tok = SeamlessM4TTokenizer.from_pretrained(tmpdirname)
|
|
||||||
self.assertListEqual(new_tok.additional_special_tokens, original_special_tokens)
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
def test_enro_tokenizer_prepare_batch(self):
|
def test_enro_tokenizer_prepare_batch(self):
|
||||||
batch = self.tokenizer(
|
batch = self.tokenizer(
|
||||||
|
|||||||
Reference in New Issue
Block a user