From 3c39c07f1160beb6190f635e2dab6a27e6d67596 Mon Sep 17 00:00:00 2001 From: SaulLu <55560583+SaulLu@users.noreply.github.com> Date: Fri, 25 Nov 2022 20:28:00 +0100 Subject: [PATCH] fix `word_to_tokens` docstring format (#20450) * fix docstring * fix 2 * add details --- src/transformers/tokenization_utils_base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index b58ffc69c2..e3721f8131 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -475,8 +475,10 @@ class BatchEncoding(UserDict): or 1) the provided word index belongs to. Returns: - Optional [`~tokenization_utils_base.TokenSpan`] Span of tokens in the encoded sequence. Returns `None` if - no tokens correspond to the word. + ([`~tokenization_utils_base.TokenSpan`], *optional*): Span of tokens in the encoded sequence. Returns + `None` if no tokens correspond to the word. This can happen especially when the token is a special token + that has been used to format the tokenization. For example when we add a class token at the very beginning + of the tokenization. """ if not self._encodings: