Final update of doctest (#22299)
* update
* update
---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -303,7 +303,7 @@ class AutoFeatureExtractor:
|
|||||||
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
|
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
|
||||||
|
|
||||||
>>> # If feature extractor files are in a directory (e.g. feature extractor was saved using *save_pretrained('./test/saved_model/')*)
|
>>> # If feature extractor files are in a directory (e.g. feature extractor was saved using *save_pretrained('./test/saved_model/')*)
|
||||||
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("./test/saved_model/")
|
>>> # feature_extractor = AutoFeatureExtractor.from_pretrained("./test/saved_model/")
|
||||||
```"""
|
```"""
|
||||||
config = kwargs.pop("config", None)
|
config = kwargs.pop("config", None)
|
||||||
trust_remote_code = kwargs.pop("trust_remote_code", False)
|
trust_remote_code = kwargs.pop("trust_remote_code", False)
|
||||||
|
|||||||
@@ -306,7 +306,7 @@ class AutoImageProcessor:
|
|||||||
>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
|
>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
|
||||||
|
|
||||||
>>> # If image processor files are in a directory (e.g. image processor was saved using *save_pretrained('./test/saved_model/')*)
|
>>> # If image processor files are in a directory (e.g. image processor was saved using *save_pretrained('./test/saved_model/')*)
|
||||||
>>> image_processor = AutoImageProcessor.from_pretrained("./test/saved_model/")
|
>>> # image_processor = AutoImageProcessor.from_pretrained("./test/saved_model/")
|
||||||
```"""
|
```"""
|
||||||
config = kwargs.pop("config", None)
|
config = kwargs.pop("config", None)
|
||||||
trust_remote_code = kwargs.pop("trust_remote_code", False)
|
trust_remote_code = kwargs.pop("trust_remote_code", False)
|
||||||
|
|||||||
@@ -188,7 +188,7 @@ class AutoProcessor:
|
|||||||
>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
|
>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
|
||||||
|
|
||||||
>>> # If processor files are in a directory (e.g. processor was saved using *save_pretrained('./test/saved_model/')*)
|
>>> # If processor files are in a directory (e.g. processor was saved using *save_pretrained('./test/saved_model/')*)
|
||||||
>>> processor = AutoProcessor.from_pretrained("./test/saved_model/")
|
>>> # processor = AutoProcessor.from_pretrained("./test/saved_model/")
|
||||||
```"""
|
```"""
|
||||||
config = kwargs.pop("config", None)
|
config = kwargs.pop("config", None)
|
||||||
trust_remote_code = kwargs.pop("trust_remote_code", False)
|
trust_remote_code = kwargs.pop("trust_remote_code", False)
|
||||||
|
|||||||
@@ -575,7 +575,7 @@ class AutoTokenizer:
|
|||||||
>>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
|
>>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
|
||||||
|
|
||||||
>>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
|
>>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
|
||||||
>>> tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")
|
>>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")
|
||||||
|
|
||||||
>>> # Download vocabulary from huggingface.co and define model-specific arguments
|
>>> # Download vocabulary from huggingface.co and define model-specific arguments
|
||||||
>>> tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
|
>>> tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
|
||||||
|
|||||||
@@ -640,9 +640,17 @@ def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8")
|
|||||||
|
|
||||||
See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
|
See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
|
||||||
|
|
||||||
>>> from nltk.tokenize.casual import _replace_html_entities >>> _replace_html_entities(b'Price: £100')
|
Examples:
|
||||||
'Price: \\xa3100' >>> print(_replace_html_entities(b'Price: £100')) Price: £100 >>>
|
|
||||||
"""
|
```python
|
||||||
|
>>> from nltk.tokenize.casual import _replace_html_entities
|
||||||
|
|
||||||
|
>>> _replace_html_entities(b"Price: £100")
|
||||||
|
'Price: \\xa3100'
|
||||||
|
|
||||||
|
>>> print(_replace_html_entities(b"Price: £100"))
|
||||||
|
Price: £100
|
||||||
|
```"""
|
||||||
|
|
||||||
def _convert_entity(match):
|
def _convert_entity(match):
|
||||||
entity_body = match.group(3)
|
entity_body = match.group(3)
|
||||||
|
|||||||
@@ -316,6 +316,7 @@ class CustomDPRReaderTokenizerMixin:
|
|||||||
>>> outputs = model(**encoded_inputs)
|
>>> outputs = model(**encoded_inputs)
|
||||||
>>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
|
>>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
|
||||||
>>> print(predicted_spans[0].text) # best span
|
>>> print(predicted_spans[0].text) # best span
|
||||||
|
a song
|
||||||
```"""
|
```"""
|
||||||
input_ids = reader_input["input_ids"]
|
input_ids = reader_input["input_ids"]
|
||||||
start_logits, end_logits, relevance_logits = reader_output[:3]
|
start_logits, end_logits, relevance_logits = reader_output[:3]
|
||||||
|
|||||||
@@ -316,6 +316,7 @@ class CustomDPRReaderTokenizerMixin:
|
|||||||
>>> outputs = model(**encoded_inputs)
|
>>> outputs = model(**encoded_inputs)
|
||||||
>>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
|
>>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
|
||||||
>>> print(predicted_spans[0].text) # best span
|
>>> print(predicted_spans[0].text) # best span
|
||||||
|
a song
|
||||||
```"""
|
```"""
|
||||||
input_ids = reader_input["input_ids"]
|
input_ids = reader_input["input_ids"]
|
||||||
start_logits, end_logits, relevance_logits = reader_output[:3]
|
start_logits, end_logits, relevance_logits = reader_output[:3]
|
||||||
|
|||||||
@@ -96,7 +96,7 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
|
|||||||
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
|
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
|
||||||
>>> # You can confirm both 慶応 and 慶應 are encoded to 17750
|
>>> # You can confirm both 慶応 and 慶應 are encoded to 17750
|
||||||
>>> tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"]
|
>>> tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"]
|
||||||
[34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]
|
[35993, 35998, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]
|
||||||
|
|
||||||
>>> # Both 慶応 and 慶應 are decoded to 慶応
|
>>> # Both 慶応 and 慶應 are decoded to 慶応
|
||||||
>>> tokenizer.decode(tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"])
|
>>> tokenizer.decode(tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"])
|
||||||
@@ -311,6 +311,9 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
Example:
|
Example:
|
||||||
```python
|
```python
|
||||||
|
>>> from transformers import GPTSanJapaneseTokenizer
|
||||||
|
|
||||||
|
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
|
||||||
>>> x_token = tokenizer("アイウエ")
|
>>> x_token = tokenizer("アイウエ")
|
||||||
>>> # input_ids: | SOT | SEG | ア | イ | ウ | エ |
|
>>> # input_ids: | SOT | SEG | ア | イ | ウ | エ |
|
||||||
>>> # token_type_ids: | 1 | 0 | 0 | 0 | 0 | 0 |
|
>>> # token_type_ids: | 1 | 0 | 0 | 0 | 0 | 0 |
|
||||||
|
|||||||
@@ -110,13 +110,14 @@ class M2M100Tokenizer(PreTrainedTokenizer):
|
|||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
>>> from transformers import M2M100Tokenizer
|
>>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
|
||||||
|
|
||||||
|
>>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
|
||||||
>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro")
|
>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro")
|
||||||
>>> src_text = " UN Chief Says There Is No Military Solution in Syria"
|
>>> src_text = " UN Chief Says There Is No Military Solution in Syria"
|
||||||
>>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
|
>>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
|
||||||
>>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
|
>>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
|
||||||
>>> model(**model_inputs) # should work
|
>>> outputs = model(**model_inputs) # should work
|
||||||
```"""
|
```"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|||||||
@@ -106,13 +106,13 @@ class MarianTokenizer(PreTrainedTokenizer):
|
|||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
>>> from transformers import MarianTokenizer
|
>>> from transformers import MarianForCausalLM, MarianTokenizer
|
||||||
|
|
||||||
|
>>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")
|
||||||
>>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
|
>>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
|
||||||
>>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
|
>>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
|
||||||
>>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional
|
>>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional
|
||||||
>>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)
|
>>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)
|
||||||
# keys [input_ids, attention_mask, labels].
|
|
||||||
|
|
||||||
>>> outputs = model(**inputs) # should work
|
>>> outputs = model(**inputs) # should work
|
||||||
```"""
|
```"""
|
||||||
|
|||||||
@@ -344,7 +344,7 @@ class RoFormerTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
>>> tokenizer = RoFormerTokenizer.from_pretrained("junnyu/roformer_chinese_base")
|
>>> tokenizer = RoFormerTokenizer.from_pretrained("junnyu/roformer_chinese_base")
|
||||||
>>> tokenizer.tokenize("今天天气非常好。")
|
>>> tokenizer.tokenize("今天天气非常好。")
|
||||||
# ['今', '天', '天', '气', '非常', '好', '。']
|
['今', '天', '天', '气', '非常', '好', '。']
|
||||||
```"""
|
```"""
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||||
|
|||||||
@@ -85,7 +85,7 @@ class RoFormerTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
|
|
||||||
>>> tokenizer = RoFormerTokenizerFast.from_pretrained("junnyu/roformer_chinese_base")
|
>>> tokenizer = RoFormerTokenizerFast.from_pretrained("junnyu/roformer_chinese_base")
|
||||||
>>> tokenizer.tokenize("今天天气非常好。")
|
>>> tokenizer.tokenize("今天天气非常好。")
|
||||||
# ['今', '天', '天', '气', '非常', '好', '。']
|
['今', '天', '天', '气', '非常', '好', '。']
|
||||||
```"""
|
```"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|||||||
@@ -88,7 +88,7 @@ def tokenize_numbers(text_array: List[str]) -> List[str]:
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
>>> tokenize_numbers(["$", "5,000", "1.73", "m"])
|
>>> tokenize_numbers(["$", "5,000", "1.73", "m"])
|
||||||
["$", "5", "@,@", "000", "1", "@.@", "73", "m"]
|
['$', '5', '@,@', '000', '1', '@.@', '73', 'm']
|
||||||
```"""
|
```"""
|
||||||
tokenized = []
|
tokenized = []
|
||||||
for i in range(len(text_array)):
|
for i in range(len(text_array)):
|
||||||
@@ -113,7 +113,7 @@ def detokenize_numbers(text: str) -> str:
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
>>> detokenize_numbers("$ 5 @,@ 000 1 @.@ 73 m")
|
>>> detokenize_numbers("$ 5 @,@ 000 1 @.@ 73 m")
|
||||||
"$ 5,000 1.73 m"
|
'$ 5,000 1.73 m'
|
||||||
```"""
|
```"""
|
||||||
for reg, sub in DETOKENIZE_NUMBERS:
|
for reg, sub in DETOKENIZE_NUMBERS:
|
||||||
text = re.sub(reg, sub, text)
|
text = re.sub(reg, sub, text)
|
||||||
|
|||||||
@@ -467,3 +467,16 @@ src/transformers/models/mvp/tokenization_mvp.py
|
|||||||
src/transformers/models/mvp/tokenization_mvp_fast.py
|
src/transformers/models/mvp/tokenization_mvp_fast.py
|
||||||
src/transformers/models/roberta/tokenization_roberta.py
|
src/transformers/models/roberta/tokenization_roberta.py
|
||||||
src/transformers/models/roberta/tokenization_roberta_fast.py
|
src/transformers/models/roberta/tokenization_roberta_fast.py
|
||||||
|
src/transformers/models/auto/feature_extraction_auto.py
|
||||||
|
src/transformers/models/auto/image_processing_auto.py
|
||||||
|
src/transformers/models/auto/processing_auto.py
|
||||||
|
src/transformers/models/auto/tokenization_auto.py
|
||||||
|
src/transformers/models/bertweet/tokenization_bertweet.py
|
||||||
|
src/transformers/models/dpr/tokenization_dpr.py
|
||||||
|
src/transformers/models/dpr/tokenization_dpr_fast.py
|
||||||
|
src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py
|
||||||
|
src/transformers/models/m2m_100/tokenization_m2m_100.py
|
||||||
|
src/transformers/models/marian/tokenization_marian.py
|
||||||
|
src/transformers/models/roformer/tokenization_roformer.py
|
||||||
|
src/transformers/models/roformer/tokenization_roformer_fast.py
|
||||||
|
src/transformers/models/transfo_xl/tokenization_transfo_xl.py
|
||||||
Reference in New Issue
Block a user