fix more doctests (#22292)
* fix more doctests * fix style --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -105,12 +105,14 @@ class BartTokenizer(PreTrainedTokenizer):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import BartTokenizer
|
>>> from transformers import BartTokenizer
|
||||||
|
|
||||||
>>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
|
>>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[0, 31414, 232, 2]
|
[0, 31414, 232, 2]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[0, 20920, 232, 2]
|
[0, 20920, 232, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -75,12 +75,14 @@ class BartTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import BartTokenizerFast
|
>>> from transformers import BartTokenizerFast
|
||||||
|
|
||||||
>>> tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")
|
>>> tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[0, 31414, 232, 2]
|
[0, 31414, 232, 2]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[0, 20920, 232, 2]
|
[0, 20920, 232, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -96,13 +96,15 @@ class BlenderbotTokenizer(PreTrainedTokenizer):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import BlenderbotTokenizer
|
>>> from transformers import BlenderbotTokenizer
|
||||||
|
|
||||||
>>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B")
|
>>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B")
|
||||||
>>> tokenizer.add_prefix_space = False
|
>>> tokenizer.add_prefix_space = False
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[47, 921, 86, 1085, 2]
|
[47, 921, 86, 1085, 2]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[6950, 1085, 2]
|
[6950, 1085, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -55,12 +55,14 @@ class BlenderbotTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import BlenderbotTokenizerFast
|
>>> from transformers import BlenderbotTokenizerFast
|
||||||
|
|
||||||
>>> tokenizer = BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B")
|
>>> tokenizer = BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[6950, 1085, 2]
|
[6950, 1085, 2]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[6950, 1085, 2]
|
[6950, 1085, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -54,13 +54,15 @@ class BloomTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import BloomTokenizerFast
|
>>> from transformers import BloomTokenizerFast
|
||||||
|
|
||||||
>>> tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom")
|
>>> tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[15496, 995]
|
[59414, 8876]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
[18435, 995]
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
|
[86153, 8876]
|
||||||
```
|
```
|
||||||
|
|
||||||
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
|
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
|
||||||
|
|||||||
@@ -102,12 +102,14 @@ class CodeGenTokenizer(PreTrainedTokenizer):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import CodeGenTokenizer
|
>>> from transformers import CodeGenTokenizer
|
||||||
|
|
||||||
>>> tokenizer = CodeGenTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
|
>>> tokenizer = CodeGenTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[15496, 995]
|
[15496, 995]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[18435, 995]
|
[18435, 995]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -68,12 +68,14 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import CodeGenTokenizerFast
|
>>> from transformers import CodeGenTokenizerFast
|
||||||
|
|
||||||
>>> tokenizer = CodeGenTokenizerFast.from_pretrained("Salesforce/codegen-350M-mono")
|
>>> tokenizer = CodeGenTokenizerFast.from_pretrained("Salesforce/codegen-350M-mono")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[15496, 995]
|
[15496, 995]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[18435, 995]
|
[18435, 995]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -116,13 +116,15 @@ class DebertaTokenizer(PreTrainedTokenizer):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import DebertaTokenizer
|
>>> from transformers import DebertaTokenizer
|
||||||
|
|
||||||
>>> tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
|
>>> tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[15496, 995]
|
[1, 31414, 232, 2]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
[18435, 995]
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
|
[1, 20920, 232, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
|
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
|
||||||
|
|||||||
@@ -79,13 +79,15 @@ class DebertaTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import DebertaTokenizerFast
|
>>> from transformers import DebertaTokenizerFast
|
||||||
|
|
||||||
>>> tokenizer = DebertaTokenizerFast.from_pretrained("microsoft/deberta-base")
|
>>> tokenizer = DebertaTokenizerFast.from_pretrained("microsoft/deberta-base")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[15496, 995]
|
[1, 31414, 232, 2]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
[18435, 995]
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
|
[1, 20920, 232, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
|
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
|
||||||
|
|||||||
@@ -108,12 +108,14 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import GPT2Tokenizer
|
>>> from transformers import GPT2Tokenizer
|
||||||
|
|
||||||
>>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
|
>>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[15496, 995]
|
[15496, 995]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[18435, 995]
|
[18435, 995]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -75,12 +75,14 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import GPT2TokenizerFast
|
>>> from transformers import GPT2TokenizerFast
|
||||||
|
|
||||||
>>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
|
>>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[15496, 995]
|
[15496, 995]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[18435, 995]
|
[18435, 995]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -49,12 +49,14 @@ class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import GPTNeoXTokenizerFast
|
>>> from transformers import GPTNeoXTokenizerFast
|
||||||
|
|
||||||
>>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("gpt2")
|
>>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("gpt2")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[15496, 995]
|
[15496, 995]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[18435, 995]
|
[18435, 995]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -47,10 +47,11 @@ class GPTSw3Tokenizer(PreTrainedTokenizer):
|
|||||||
this superclass for more information regarding those methods.
|
this superclass for more information regarding those methods.
|
||||||
|
|
||||||
Example usage:
|
Example usage:
|
||||||
```
|
```python
|
||||||
>>> from transformers import GPTSw3Tokenizer
|
>>> from transformers import GPTSw3Tokenizer
|
||||||
|
|
||||||
>>> tokenizer = GPTSw3Tokenizer.from_pretrained("AI-Sweden/gpt-sw3-126m")
|
>>> tokenizer = GPTSw3Tokenizer.from_pretrained("AI-Sweden/gpt-sw3-126m")
|
||||||
>>> tokenizer("Svenska är kul!")['input_ids']
|
>>> tokenizer("Svenska är kul!")["input_ids"]
|
||||||
[1814, 377, 3617, 63504]
|
[1814, 377, 3617, 63504]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -68,13 +68,13 @@ class JukeboxTokenizer(PreTrainedTokenizer):
|
|||||||
as the conditioning of the model can be done on the three different queries. If None is provided, default values will be used.
|
as the conditioning of the model can be done on the three different queries. If None is provided, default values will be used.
|
||||||
|
|
||||||
Depending on the number of genres on which the model should be conditioned (`n_genres`).
|
Depending on the number of genres on which the model should be conditioned (`n_genres`).
|
||||||
```
|
```python
|
||||||
>>> from transformers import JukeboxTokenizer
|
>>> from transformers import JukeboxTokenizer
|
||||||
|
|
||||||
>>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
|
>>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
|
||||||
>>> tokenizer("Alan Jackson", "Country Rock", "old town road")['input_ids']
|
>>> tokenizer("Alan Jackson", "Country Rock", "old town road")["input_ids"]
|
||||||
[tensor([[ 0, 0, 0, 6785, 546, 41, 38, 30, 76, 46, 41, 49,
|
[tensor([[ 0, 0, 0, 6785, 546, 41, 38, 30, 76, 46, 41, 49,
|
||||||
40, 76, 44, 41, 27, 30]]), tensor([[ 0, 0, 0, 145, 0]]), tensor([[ 0, 0, 0, 145, 0]])]
|
40, 76, 44, 41, 27, 30]]), tensor([[ 0, 0, 0, 145, 0]]), tensor([[ 0, 0, 0, 145, 0]])]
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
|
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
|
||||||
|
|||||||
@@ -97,12 +97,14 @@ class LEDTokenizer(PreTrainedTokenizer):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import LEDTokenizer
|
>>> from transformers import LEDTokenizer
|
||||||
|
|
||||||
>>> tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
|
>>> tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[0, 31414, 232, 2]
|
[0, 31414, 232, 2]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[0, 20920, 232, 2]
|
[0, 20920, 232, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -55,12 +55,14 @@ class LEDTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import LEDTokenizerFast
|
>>> from transformers import LEDTokenizerFast
|
||||||
|
|
||||||
>>> tokenizer = LEDTokenizerFast.from_pretrained("allenai/led-base-16384")
|
>>> tokenizer = LEDTokenizerFast.from_pretrained("allenai/led-base-16384")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[0, 31414, 232, 2]
|
[0, 31414, 232, 2]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[0, 20920, 232, 2]
|
[0, 20920, 232, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -120,12 +120,14 @@ class LongformerTokenizer(PreTrainedTokenizer):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import LongformerTokenizer
|
>>> from transformers import LongformerTokenizer
|
||||||
|
|
||||||
>>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
|
>>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[0, 31414, 232, 2]
|
[0, 31414, 232, 2]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[0, 20920, 232, 2]
|
[0, 20920, 232, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -96,12 +96,14 @@ class LongformerTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import LongformerTokenizerFast
|
>>> from transformers import LongformerTokenizerFast
|
||||||
|
|
||||||
>>> tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096")
|
>>> tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[0, 31414, 232, 2]
|
[0, 31414, 232, 2]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[0, 20920, 232, 2]
|
[0, 20920, 232, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -197,12 +197,14 @@ class LukeTokenizer(PreTrainedTokenizer):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import LukeTokenizer
|
>>> from transformers import LukeTokenizer
|
||||||
|
|
||||||
>>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
|
>>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[0, 31414, 232, 2]
|
[0, 31414, 232, 2]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[0, 20920, 232, 2]
|
[0, 20920, 232, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -93,12 +93,14 @@ class MvpTokenizer(PreTrainedTokenizer):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import MvpTokenizer
|
>>> from transformers import MvpTokenizer
|
||||||
|
|
||||||
>>> tokenizer = MvpTokenizer.from_pretrained("RUCAIBox/mvp")
|
>>> tokenizer = MvpTokenizer.from_pretrained("RUCAIBox/mvp")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[0, 31414, 232, 2]
|
[0, 31414, 232, 2]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[0, 20920, 232, 2]
|
[0, 20920, 232, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -58,12 +58,14 @@ class MvpTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import MvpTokenizerFast
|
>>> from transformers import MvpTokenizerFast
|
||||||
|
|
||||||
>>> tokenizer = MvpTokenizerFast.from_pretrained("RUCAIBox/mvp")
|
>>> tokenizer = MvpTokenizerFast.from_pretrained("RUCAIBox/mvp")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[0, 31414, 232, 2]
|
[0, 31414, 232, 2]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[0, 20920, 232, 2]
|
[0, 20920, 232, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -111,12 +111,14 @@ class RobertaTokenizer(PreTrainedTokenizer):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import RobertaTokenizer
|
>>> from transformers import RobertaTokenizer
|
||||||
|
|
||||||
>>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
|
>>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[0, 31414, 232, 2]
|
[0, 31414, 232, 2]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[0, 20920, 232, 2]
|
[0, 20920, 232, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -81,12 +81,14 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||||
|
|
||||||
```
|
```python
|
||||||
>>> from transformers import RobertaTokenizerFast
|
>>> from transformers import RobertaTokenizerFast
|
||||||
|
|
||||||
>>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
|
>>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
>>> tokenizer("Hello world")["input_ids"]
|
||||||
[0, 31414, 232, 2]
|
[0, 31414, 232, 2]
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
|
>>> tokenizer(" Hello world")["input_ids"]
|
||||||
[0, 20920, 232, 2]
|
[0, 20920, 232, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -444,3 +444,26 @@ src/transformers/models/wav2vec2/processing_wav2vec2.py
|
|||||||
src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
|
src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
|
||||||
src/transformers/models/whisper/processing_whisper.py
|
src/transformers/models/whisper/processing_whisper.py
|
||||||
src/transformers/models/x_clip/processing_x_clip.py
|
src/transformers/models/x_clip/processing_x_clip.py
|
||||||
|
src/transformers/models/bart/tokenization_bart.py
|
||||||
|
src/transformers/models/bart/tokenization_bart_fast.py
|
||||||
|
src/transformers/models/blenderbot/tokenization_blenderbot.py
|
||||||
|
src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
|
||||||
|
src/transformers/models/bloom/tokenization_bloom_fast.py
|
||||||
|
src/transformers/models/codegen/tokenization_codegen.py
|
||||||
|
src/transformers/models/codegen/tokenization_codegen_fast.py
|
||||||
|
src/transformers/models/deberta/tokenization_deberta.py
|
||||||
|
src/transformers/models/deberta/tokenization_deberta_fast.py
|
||||||
|
src/transformers/models/gpt2/tokenization_gpt2.py
|
||||||
|
src/transformers/models/gpt2/tokenization_gpt2_fast.py
|
||||||
|
src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
|
||||||
|
src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
|
||||||
|
src/transformers/models/jukebox/tokenization_jukebox.py
|
||||||
|
src/transformers/models/led/tokenization_led.py
|
||||||
|
src/transformers/models/led/tokenization_led_fast.py
|
||||||
|
src/transformers/models/longformer/tokenization_longformer.py
|
||||||
|
src/transformers/models/longformer/tokenization_longformer_fast.py
|
||||||
|
src/transformers/models/luke/tokenization_luke.py
|
||||||
|
src/transformers/models/mvp/tokenization_mvp.py
|
||||||
|
src/transformers/models/mvp/tokenization_mvp_fast.py
|
||||||
|
src/transformers/models/roberta/tokenization_roberta.py
|
||||||
|
src/transformers/models/roberta/tokenization_roberta_fast.py
|
||||||
|
|||||||
Reference in New Issue
Block a user