diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py index 5dc9357810..22ee1a0db6 100644 --- a/src/transformers/models/bart/tokenization_bart.py +++ b/src/transformers/models/bart/tokenization_bart.py @@ -105,12 +105,14 @@ class BartTokenizer(PreTrainedTokenizer): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import BartTokenizer + >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-base") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [0, 31414, 232, 2] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [0, 20920, 232, 2] ``` diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py index 6d6e29986b..f05ed1b7a8 100644 --- a/src/transformers/models/bart/tokenization_bart_fast.py +++ b/src/transformers/models/bart/tokenization_bart_fast.py @@ -75,12 +75,14 @@ class BartTokenizerFast(PreTrainedTokenizerFast): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import BartTokenizerFast + >>> tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [0, 31414, 232, 2] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [0, 20920, 232, 2] ``` diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index 208ced46bc..cb4a33a3c2 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -96,13 +96,15 @@ class BlenderbotTokenizer(PreTrainedTokenizer): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import BlenderbotTokenizer + >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B") >>> tokenizer.add_prefix_space = False - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [47, 921, 86, 1085, 2] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [6950, 1085, 2] ``` diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py index 7c4e060e5d..4737e92617 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py @@ -55,12 +55,14 @@ class BlenderbotTokenizerFast(PreTrainedTokenizerFast): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import BlenderbotTokenizerFast + >>> tokenizer = BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [6950, 1085, 2] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [6950, 1085, 2] ``` diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py index 1c8efb10cb..800c73f025 100644 --- a/src/transformers/models/bloom/tokenization_bloom_fast.py +++ b/src/transformers/models/bloom/tokenization_bloom_fast.py @@ -54,13 +54,15 @@ class BloomTokenizerFast(PreTrainedTokenizerFast): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import BloomTokenizerFast + >>> tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom") - >>> tokenizer("Hello world")['input_ids'] - [15496, 995] - >>> tokenizer(" Hello world")['input_ids'] - [18435, 995] + >>> tokenizer("Hello world")["input_ids"] + [59414, 8876] + + >>> tokenizer(" Hello world")["input_ids"] + [86153, 8876] ``` You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since diff --git a/src/transformers/models/codegen/tokenization_codegen.py b/src/transformers/models/codegen/tokenization_codegen.py index c09a816bfb..0c01e63471 100644 --- a/src/transformers/models/codegen/tokenization_codegen.py +++ b/src/transformers/models/codegen/tokenization_codegen.py @@ -102,12 +102,14 @@ class CodeGenTokenizer(PreTrainedTokenizer): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import CodeGenTokenizer + >>> tokenizer = CodeGenTokenizer.from_pretrained("Salesforce/codegen-350M-mono") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [15496, 995] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [18435, 995] ``` diff --git a/src/transformers/models/codegen/tokenization_codegen_fast.py b/src/transformers/models/codegen/tokenization_codegen_fast.py index 332f0ed934..83e5e30d0b 100644 --- a/src/transformers/models/codegen/tokenization_codegen_fast.py +++ b/src/transformers/models/codegen/tokenization_codegen_fast.py @@ -68,12 +68,14 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import CodeGenTokenizerFast + >>> tokenizer = CodeGenTokenizerFast.from_pretrained("Salesforce/codegen-350M-mono") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [15496, 995] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [18435, 995] ``` diff --git a/src/transformers/models/deberta/tokenization_deberta.py b/src/transformers/models/deberta/tokenization_deberta.py index bcaaaa4421..8a778a947c 100644 --- a/src/transformers/models/deberta/tokenization_deberta.py +++ b/src/transformers/models/deberta/tokenization_deberta.py @@ -116,13 +116,15 @@ class DebertaTokenizer(PreTrainedTokenizer): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import DebertaTokenizer + >>> tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base") - >>> tokenizer("Hello world")['input_ids'] - [15496, 995] - >>> tokenizer(" Hello world")['input_ids'] - [18435, 995] + >>> tokenizer("Hello world")["input_ids"] + [1, 31414, 232, 2] + + >>> tokenizer(" Hello world")["input_ids"] + [1, 20920, 232, 2] ``` You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you diff --git a/src/transformers/models/deberta/tokenization_deberta_fast.py b/src/transformers/models/deberta/tokenization_deberta_fast.py index 959bcae470..c05cf25761 100644 --- a/src/transformers/models/deberta/tokenization_deberta_fast.py +++ b/src/transformers/models/deberta/tokenization_deberta_fast.py @@ -79,13 +79,15 @@ class DebertaTokenizerFast(PreTrainedTokenizerFast): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import DebertaTokenizerFast + >>> tokenizer = DebertaTokenizerFast.from_pretrained("microsoft/deberta-base") - >>> tokenizer("Hello world")['input_ids'] - [15496, 995] - >>> tokenizer(" Hello world")['input_ids'] - [18435, 995] + >>> tokenizer("Hello world")["input_ids"] + [1, 31414, 232, 2] + + >>> tokenizer(" Hello world")["input_ids"] + [1, 20920, 232, 2] ``` You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py index c462e45d01..9a8ce3a4fa 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2.py +++ b/src/transformers/models/gpt2/tokenization_gpt2.py @@ -108,12 +108,14 @@ class GPT2Tokenizer(PreTrainedTokenizer): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import GPT2Tokenizer + >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [15496, 995] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [18435, 995] ``` diff --git a/src/transformers/models/gpt2/tokenization_gpt2_fast.py b/src/transformers/models/gpt2/tokenization_gpt2_fast.py index 7d7500ee9c..cf2b8b2cb2 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py +++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py @@ -75,12 +75,14 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import GPT2TokenizerFast + >>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [15496, 995] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [18435, 995] ``` diff --git a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py index 1d4c1cec3a..570b2abaa4 100644 --- a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py +++ b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py @@ -49,12 +49,14 @@ class GPTNeoXTokenizerFast(PreTrainedTokenizerFast): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import GPTNeoXTokenizerFast + >>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("gpt2") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [15496, 995] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [18435, 995] ``` diff --git a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py index f982c5b6b1..422cb07220 100644 --- a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py +++ b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py @@ -47,10 +47,11 @@ class GPTSw3Tokenizer(PreTrainedTokenizer): this superclass for more information regarding those methods. Example usage: - ``` + ```python >>> from transformers import GPTSw3Tokenizer + >>> tokenizer = GPTSw3Tokenizer.from_pretrained("AI-Sweden/gpt-sw3-126m") - >>> tokenizer("Svenska är kul!")['input_ids'] + >>> tokenizer("Svenska är kul!")["input_ids"] [1814, 377, 3617, 63504] ``` diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py index 63399adf16..9a45f6bd65 100644 --- a/src/transformers/models/jukebox/tokenization_jukebox.py +++ b/src/transformers/models/jukebox/tokenization_jukebox.py @@ -68,13 +68,13 @@ class JukeboxTokenizer(PreTrainedTokenizer): as the conditioning of the model can be done on the three different queries. If None is provided, defaults values will be used.: Depending on the number of genres on which the model should be conditioned (`n_genres`). - ``` + ```python >>> from transformers import JukeboxTokenizer + >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics") - >>> tokenizer("Alan Jackson", "Country Rock", "old town road")['input_ids'] + >>> tokenizer("Alan Jackson", "Country Rock", "old town road")["input_ids"] [tensor([[ 0, 0, 0, 6785, 546, 41, 38, 30, 76, 46, 41, 49, 40, 76, 44, 41, 27, 30]]), tensor([[ 0, 0, 0, 145, 0]]), tensor([[ 0, 0, 0, 145, 0]])] - ``` You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you diff --git a/src/transformers/models/led/tokenization_led.py b/src/transformers/models/led/tokenization_led.py index 5b22701a22..1cdb524301 100644 --- a/src/transformers/models/led/tokenization_led.py +++ b/src/transformers/models/led/tokenization_led.py @@ -97,12 +97,14 @@ class LEDTokenizer(PreTrainedTokenizer): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import LEDTokenizer + >>> tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [0, 31414, 232, 2] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [0, 20920, 232, 2] ``` diff --git a/src/transformers/models/led/tokenization_led_fast.py b/src/transformers/models/led/tokenization_led_fast.py index 153b32c296..51b8ab4aaa 100644 --- a/src/transformers/models/led/tokenization_led_fast.py +++ b/src/transformers/models/led/tokenization_led_fast.py @@ -55,12 +55,14 @@ class LEDTokenizerFast(PreTrainedTokenizerFast): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import LEDTokenizerFast + >>> tokenizer = LEDTokenizerFast.from_pretrained("allenai/led-base-16384") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [0, 31414, 232, 2] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [0, 20920, 232, 2] ``` diff --git a/src/transformers/models/longformer/tokenization_longformer.py b/src/transformers/models/longformer/tokenization_longformer.py index 69bc505953..fea949658a 100644 --- a/src/transformers/models/longformer/tokenization_longformer.py +++ b/src/transformers/models/longformer/tokenization_longformer.py @@ -120,12 +120,14 @@ class LongformerTokenizer(PreTrainedTokenizer): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import LongformerTokenizer + >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [0, 31414, 232, 2] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [0, 20920, 232, 2] ``` diff --git a/src/transformers/models/longformer/tokenization_longformer_fast.py b/src/transformers/models/longformer/tokenization_longformer_fast.py index dfe1b08e14..1460f2f2cc 100644 --- a/src/transformers/models/longformer/tokenization_longformer_fast.py +++ b/src/transformers/models/longformer/tokenization_longformer_fast.py @@ -96,12 +96,14 @@ class LongformerTokenizerFast(PreTrainedTokenizerFast): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import LongformerTokenizerFast + >>> tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [0, 31414, 232, 2] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [0, 20920, 232, 2] ``` diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py index 89fb9b63e8..8b47ced1d3 100644 --- a/src/transformers/models/luke/tokenization_luke.py +++ b/src/transformers/models/luke/tokenization_luke.py @@ -197,12 +197,14 @@ class LukeTokenizer(PreTrainedTokenizer): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import LukeTokenizer + >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [0, 31414, 232, 2] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [0, 20920, 232, 2] ``` diff --git a/src/transformers/models/mvp/tokenization_mvp.py b/src/transformers/models/mvp/tokenization_mvp.py index 98d373188e..2d497c23d1 100644 --- a/src/transformers/models/mvp/tokenization_mvp.py +++ b/src/transformers/models/mvp/tokenization_mvp.py @@ -93,12 +93,14 @@ class MvpTokenizer(PreTrainedTokenizer): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import MvpTokenizer + >>> tokenizer = MvpTokenizer.from_pretrained("RUCAIBox/mvp") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [0, 31414, 232, 2] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [0, 20920, 232, 2] ``` diff --git a/src/transformers/models/mvp/tokenization_mvp_fast.py b/src/transformers/models/mvp/tokenization_mvp_fast.py index 28dd1ea942..fd6abd1700 100644 --- a/src/transformers/models/mvp/tokenization_mvp_fast.py +++ b/src/transformers/models/mvp/tokenization_mvp_fast.py @@ -58,12 +58,14 @@ class MvpTokenizerFast(PreTrainedTokenizerFast): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import MvpTokenizerFast + >>> tokenizer = MvpTokenizerFast.from_pretrained("RUCAIBox/mvp") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [0, 31414, 232, 2] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [0, 20920, 232, 2] ``` diff --git a/src/transformers/models/roberta/tokenization_roberta.py b/src/transformers/models/roberta/tokenization_roberta.py index e8d4a751bc..24b9748c3d 100644 --- a/src/transformers/models/roberta/tokenization_roberta.py +++ b/src/transformers/models/roberta/tokenization_roberta.py @@ -111,12 +111,14 @@ class RobertaTokenizer(PreTrainedTokenizer): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import RobertaTokenizer + >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [0, 31414, 232, 2] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [0, 20920, 232, 2] ``` diff --git a/src/transformers/models/roberta/tokenization_roberta_fast.py b/src/transformers/models/roberta/tokenization_roberta_fast.py index 32cc66d750..c2c479da09 100644 --- a/src/transformers/models/roberta/tokenization_roberta_fast.py +++ b/src/transformers/models/roberta/tokenization_roberta_fast.py @@ -81,12 +81,14 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast): This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: - ``` + ```python >>> from transformers import RobertaTokenizerFast + >>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base") - >>> tokenizer("Hello world")['input_ids'] + >>> tokenizer("Hello world")["input_ids"] [0, 31414, 232, 2] - >>> tokenizer(" Hello world")['input_ids'] + + >>> tokenizer(" Hello world")["input_ids"] [0, 20920, 232, 2] ``` diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 7f3535eb55..bfd87a5c0b 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -444,3 +444,26 @@ src/transformers/models/wav2vec2/processing_wav2vec2.py src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py src/transformers/models/whisper/processing_whisper.py src/transformers/models/x_clip/processing_x_clip.py +src/transformers/models/bart/tokenization_bart.py +src/transformers/models/bart/tokenization_bart_fast.py +src/transformers/models/blenderbot/tokenization_blenderbot.py +src/transformers/models/blenderbot/tokenization_blenderbot_fast.py +src/transformers/models/bloom/tokenization_bloom_fast.py +src/transformers/models/codegen/tokenization_codegen.py +src/transformers/models/codegen/tokenization_codegen_fast.py +src/transformers/models/deberta/tokenization_deberta.py +src/transformers/models/deberta/tokenization_deberta_fast.py +src/transformers/models/gpt2/tokenization_gpt2.py +src/transformers/models/gpt2/tokenization_gpt2_fast.py +src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py +src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py +src/transformers/models/jukebox/tokenization_jukebox.py +src/transformers/models/led/tokenization_led.py +src/transformers/models/led/tokenization_led_fast.py +src/transformers/models/longformer/tokenization_longformer.py +src/transformers/models/longformer/tokenization_longformer_fast.py +src/transformers/models/luke/tokenization_luke.py +src/transformers/models/mvp/tokenization_mvp.py +src/transformers/models/mvp/tokenization_mvp_fast.py +src/transformers/models/roberta/tokenization_roberta.py +src/transformers/models/roberta/tokenization_roberta_fast.py