From 99b9affa02a6ef02c765c541a5c6f3dcff592bae Mon Sep 17 00:00:00 2001 From: Ethan Chau Date: Fri, 29 Jan 2021 02:11:53 -0800 Subject: [PATCH] Clarify use of unk_token in tokenizer docstrings (#9875) --- src/transformers/tokenization_utils.py | 3 --- src/transformers/tokenization_utils_base.py | 9 +-------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 5f10f4b6f4..0f2880e2ed 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -230,9 +230,6 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): """ Converts a string in a sequence of tokens, using the tokenizer. - Note that, unlike Fast tokenizers (instances of PreTrainedTokenizerFast), this method won't replace the unknown - tokens with the `unk_token` yet (this is done in the `encode()` method) - Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). Takes care of added tokens. diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 8544547d82..411bae4c78 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2043,14 +2043,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]: """ - Converts a string in a sequence of tokens, using the backend Rust tokenizer. - - Note that this method behave differently between fast and slow tokenizers: - - - in fast tokenizers (instances of :class:`~transformers.PreTrainedTokenizerFast`), this method will - replace the unknown tokens with the :obj:`unk_token`, - - in slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method keep unknown - tokens unchanged. 
+ Converts a string into a sequence of tokens, replacing unknown tokens with the :obj:`unk_token`. Args: text (:obj:`str`):