From 132852203a02e320049457316a63cffb64968aa1 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Tue, 5 Mar 2024 09:42:52 +0100 Subject: [PATCH] [`UdopTokenizer`] Fix post merge imports (#29451) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update * ... * nits * arf * 🧼 * beat the last guy * style everyone --- .../models/udop/tokenization_udop.py | 7 ------- .../models/udop/tokenization_udop_fast.py | 17 +++++++++++------ tests/models/udop/test_tokenization_udop.py | 6 +++++- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/udop/tokenization_udop.py b/src/transformers/models/udop/tokenization_udop.py index 10e92db48c..c3b270bc55 100644 --- a/src/transformers/models/udop/tokenization_udop.py +++ b/src/transformers/models/udop/tokenization_udop.py @@ -157,12 +157,6 @@ PRETRAINED_VOCAB_FILES_MAP = { } -# TODO(PVP) - this should be removed in Transformers v5 -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "microsoft/udop-large": 512, -} - - class UdopTokenizer(PreTrainedTokenizer): """ Adapted from [`LayoutXLMTokenizer`] and [`T5Tokenizer`]. Based on @@ -256,7 +250,6 @@ class UdopTokenizer(PreTrainedTokenizer): vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["input_ids", "attention_mask"] def __init__( diff --git a/src/transformers/models/udop/tokenization_udop_fast.py b/src/transformers/models/udop/tokenization_udop_fast.py index ee06975955..cce527a805 100644 --- a/src/transformers/models/udop/tokenization_udop_fast.py +++ b/src/transformers/models/udop/tokenization_udop_fast.py @@ -29,11 +29,6 @@ from ...tokenization_utils_base import ( ) from ...tokenization_utils_fast import PreTrainedTokenizerFast from ...utils import PaddingStrategy, TensorType, add_end_docstrings, is_sentencepiece_available, logging -from ..udop.tokenization_udop import ( - PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES, - PRETRAINED_VOCAB_FILES_MAP, - VOCAB_FILES_NAMES, -) if is_sentencepiece_available(): @@ -42,6 +37,17 @@ else: UdopTokenizer = None +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "microsoft/udop-large": "https://huggingface.co/microsoft/udop-large/resolve/main/spiece.model", + }, + "tokenizer_file": { + "microsoft/udop-large": "https://huggingface.co/microsoft/udop-large/resolve/main/tokenizer.json", + }, +} + logger = logging.get_logger(__name__) UDOP_ENCODE_KWARGS_DOCSTRING = r""" @@ -197,7 +203,6 @@ class UdopTokenizerFast(PreTrainedTokenizerFast): vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["input_ids", "attention_mask"] slow_tokenizer_class = UdopTokenizer diff --git a/tests/models/udop/test_tokenization_udop.py b/tests/models/udop/test_tokenization_udop.py index e9d41c5b77..cc9a2f2852 100644 --- a/tests/models/udop/test_tokenization_udop.py +++ b/tests/models/udop/test_tokenization_udop.py @@ -22,12 +22,12 @@ from typing import List from transformers import ( AddedToken, SpecialTokensMixin, + UdopTokenizer, UdopTokenizerFast, is_tf_available, is_torch_available, logging, ) -from transformers.models.udop.tokenization_udop import UdopTokenizer from transformers.testing_utils import ( get_tests_dir, is_pt_tf_cross_test, @@ -1717,6 +1717,10 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_alignement_methods(self): pass + @unittest.skip("#TODO will be removed in main") + def test_pretrained_model_lists(self): + pass + @unittest.skip("UDOP tokenizer requires boxes besides sequences.") def test_maximum_encoding_length_pair_input(self): pass