Clarify use of unk_token in tokenizer docstrings (#9875)
This commit is contained in:
@@ -230,9 +230,6 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
|
|||||||
"""
|
"""
|
||||||
Converts a string into a sequence of tokens, using the tokenizer.
|
Converts a string into a sequence of tokens, using the tokenizer.
|
||||||
|
|
||||||
Note that, unlike Fast tokenizers (instances of PreTrainedTokenizerFast), this method won't replace the unknown
|
|
||||||
tokens with the `unk_token` yet (this is done in the `encode()` method).
|
|
||||||
|
|
||||||
Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
|
Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
|
||||||
(BPE/SentencePieces/WordPieces). Takes care of added tokens.
|
(BPE/SentencePieces/WordPieces). Takes care of added tokens.
|
||||||
|
|
||||||
|
|||||||
@@ -2043,14 +2043,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
|||||||
|
|
||||||
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
|
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Converts a string into a sequence of tokens, using the backend Rust tokenizer.
|
Converts a string into a sequence of tokens, replacing unknown tokens with the :obj:`unk_token`.
|
||||||
|
|
||||||
Note that this method behaves differently for fast and slow tokenizers:
|
|
||||||
|
|
||||||
- in fast tokenizers (instances of :class:`~transformers.PreTrainedTokenizerFast`), this method will
|
|
||||||
replace the unknown tokens with the :obj:`unk_token`,
|
|
||||||
- in slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method keeps unknown
|
|
||||||
tokens unchanged.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text (:obj:`str`):
|
text (:obj:`str`):
|
||||||
|
|||||||
Reference in New Issue
Block a user