From 0dcb46e7a4a9e587ba84ff35778ab4233a184c11 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Wed, 22 Mar 2023 01:00:33 +0100 Subject: [PATCH] Final update of doctest (#22299) * update * update --------- Co-authored-by: ydshieh --- .../models/auto/feature_extraction_auto.py | 2 +- .../models/auto/image_processing_auto.py | 2 +- src/transformers/models/auto/processing_auto.py | 2 +- src/transformers/models/auto/tokenization_auto.py | 2 +- .../models/bertweet/tokenization_bertweet.py | 14 +++++++++++--- src/transformers/models/dpr/tokenization_dpr.py | 1 + .../models/dpr/tokenization_dpr_fast.py | 1 + .../tokenization_gptsan_japanese.py | 5 ++++- .../models/m2m_100/tokenization_m2m_100.py | 5 +++-- .../models/marian/tokenization_marian.py | 4 ++-- .../models/roformer/tokenization_roformer.py | 2 +- .../models/roformer/tokenization_roformer_fast.py | 2 +- .../models/transfo_xl/tokenization_transfo_xl.py | 4 ++-- utils/documentation_tests.txt | 13 +++++++++++++ 14 files changed, 43 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index adeadf17e3..90218d137f 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -303,7 +303,7 @@ class AutoFeatureExtractor: >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") >>> # If feature extractor files are in a directory (e.g. 
feature extractor was saved using *save_pretrained('./test/saved_model/')*) - >>> feature_extractor = AutoFeatureExtractor.from_pretrained("./test/saved_model/") + >>> # feature_extractor = AutoFeatureExtractor.from_pretrained("./test/saved_model/") ```""" config = kwargs.pop("config", None) trust_remote_code = kwargs.pop("trust_remote_code", False) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 7a042c56c5..95ecd57172 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -306,7 +306,7 @@ class AutoImageProcessor: >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") >>> # If image processor files are in a directory (e.g. image processor was saved using *save_pretrained('./test/saved_model/')*) - >>> image_processor = AutoImageProcessor.from_pretrained("./test/saved_model/") + >>> # image_processor = AutoImageProcessor.from_pretrained("./test/saved_model/") ```""" config = kwargs.pop("config", None) trust_remote_code = kwargs.pop("trust_remote_code", False) diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 29726fde4f..fb02003400 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -188,7 +188,7 @@ class AutoProcessor: >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") >>> # If processor files are in a directory (e.g. 
processor was saved using *save_pretrained('./test/saved_model/')*) - >>> processor = AutoProcessor.from_pretrained("./test/saved_model/") + >>> # processor = AutoProcessor.from_pretrained("./test/saved_model/") ```""" config = kwargs.pop("config", None) trust_remote_code = kwargs.pop("trust_remote_code", False) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 2e6ddc0b13..a029bfb52c 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -575,7 +575,7 @@ class AutoTokenizer: >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased") >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*) - >>> tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/") + >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/") >>> # Download vocabulary from huggingface.co and define model-specific arguments >>> tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True) diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py index 129806ebd3..9a5390c0b7 100644 --- a/src/transformers/models/bertweet/tokenization_bertweet.py +++ b/src/transformers/models/bertweet/tokenization_bertweet.py @@ -640,9 +640,17 @@ def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8") See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py - >>> from nltk.tokenize.casual import _replace_html_entities >>> _replace_html_entities(b'Price: &pound;100') - 'Price: \\xa3100' >>> print(_replace_html_entities(b'Price: &pound;100')) Price: £100 >>> - """ + Examples: + + ```python + >>> from nltk.tokenize.casual import _replace_html_entities + + >>> _replace_html_entities(b"Price: &pound;100") + 'Price: \\xa3100' + + >>> print(_replace_html_entities(b"Price: 
&pound;100")) + Price: £100 + ```""" def _convert_entity(match): entity_body = match.group(3) diff --git a/src/transformers/models/dpr/tokenization_dpr.py b/src/transformers/models/dpr/tokenization_dpr.py index a14133459b..a2024dda5d 100644 --- a/src/transformers/models/dpr/tokenization_dpr.py +++ b/src/transformers/models/dpr/tokenization_dpr.py @@ -316,6 +316,7 @@ class CustomDPRReaderTokenizerMixin: >>> outputs = model(**encoded_inputs) >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs) >>> print(predicted_spans[0].text) # best span + a song ```""" input_ids = reader_input["input_ids"] start_logits, end_logits, relevance_logits = reader_output[:3] diff --git a/src/transformers/models/dpr/tokenization_dpr_fast.py b/src/transformers/models/dpr/tokenization_dpr_fast.py index 507cd2bc40..de32332bf2 100644 --- a/src/transformers/models/dpr/tokenization_dpr_fast.py +++ b/src/transformers/models/dpr/tokenization_dpr_fast.py @@ -316,6 +316,7 @@ class CustomDPRReaderTokenizerMixin: >>> outputs = model(**encoded_inputs) >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs) >>> print(predicted_spans[0].text) # best span + a song ```""" input_ids = reader_input["input_ids"] start_logits, end_logits, relevance_logits = reader_output[:3] diff --git a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py b/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py index a9ebc21f38..0c89a60b78 100644 --- a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py +++ b/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py @@ -96,7 +96,7 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer): >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") >>> # You can confirm both 慶応 and 慶應 are encoded to 17750 >>> tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"] - [34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 32417, 30647, 17750, 35589, 17750, 35590, 
321, 1281] + [35993, 35998, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281] >>> # Both 慶応 and 慶應 are decoded to 慶応 >>> tokenizer.decode(tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"]) @@ -311,6 +311,9 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer): Example: ```python + >>> from transformers import GPTSanJapaneseTokenizer + + >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") >>> x_token = tokenizer("アイウエ") >>> # input_ids: | SOT | SEG | ア | イ | ウ | エ | >>> # token_type_ids: | 1 | 0 | 0 | 0 | 0 | 0 | diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py index dcfa51555f..82f5e3a47b 100644 --- a/src/transformers/models/m2m_100/tokenization_m2m_100.py +++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py @@ -110,13 +110,14 @@ class M2M100Tokenizer(PreTrainedTokenizer): Examples: ```python - >>> from transformers import M2M100Tokenizer + >>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer + >>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro") >>> src_text = " UN Chief Says There Is No Military Solution in Syria" >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria" >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt") - >>> model(**model_inputs) # should work + >>> outputs = model(**model_inputs) # should work ```""" vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py index 7d2af76fc3..aa63b1ff88 100644 --- a/src/transformers/models/marian/tokenization_marian.py +++ b/src/transformers/models/marian/tokenization_marian.py @@ -106,13 +106,13 @@ class 
MarianTokenizer(PreTrainedTokenizer): Examples: ```python - >>> from transformers import MarianTokenizer + >>> from transformers import MarianForCausalLM, MarianTokenizer + >>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-en-de") >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de") >>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."] >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional >>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True) - # keys [input_ids, attention_mask, labels]. >>> outputs = model(**inputs) # should work ```""" diff --git a/src/transformers/models/roformer/tokenization_roformer.py b/src/transformers/models/roformer/tokenization_roformer.py index 6c0b6cd4f3..1e4907f50c 100644 --- a/src/transformers/models/roformer/tokenization_roformer.py +++ b/src/transformers/models/roformer/tokenization_roformer.py @@ -344,7 +344,7 @@ class RoFormerTokenizer(PreTrainedTokenizer): >>> tokenizer = RoFormerTokenizer.from_pretrained("junnyu/roformer_chinese_base") >>> tokenizer.tokenize("今天天气非常好。") - # ['今', '天', '天', '气', '非常', '好', '。'] + ['今', '天', '天', '气', '非常', '好', '。'] ```""" vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP diff --git a/src/transformers/models/roformer/tokenization_roformer_fast.py b/src/transformers/models/roformer/tokenization_roformer_fast.py index 88ccf183d1..d73e3cdb93 100644 --- a/src/transformers/models/roformer/tokenization_roformer_fast.py +++ b/src/transformers/models/roformer/tokenization_roformer_fast.py @@ -85,7 +85,7 @@ class RoFormerTokenizerFast(PreTrainedTokenizerFast): >>> tokenizer = RoFormerTokenizerFast.from_pretrained("junnyu/roformer_chinese_base") >>> tokenizer.tokenize("今天天气非常好。") - # ['今', '天', '天', '气', '非常', '好', '。'] + ['今', '天', '天', '气', '非常', '好', '。'] ```""" vocab_files_names = VOCAB_FILES_NAMES diff --git 
a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py index 13977d4382..0097b2a6f2 100644 --- a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py +++ b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py @@ -88,7 +88,7 @@ def tokenize_numbers(text_array: List[str]) -> List[str]: ```python >>> tokenize_numbers(["$", "5,000", "1.73", "m"]) - ["$", "5", "@,@", "000", "1", "@.@", "73", "m"] + ['$', '5', '@,@', '000', '1', '@.@', '73', 'm'] ```""" tokenized = [] for i in range(len(text_array)): @@ -113,7 +113,7 @@ def detokenize_numbers(text: str) -> str: ```python >>> detokenize_numbers("$ 5 @,@ 000 1 @.@ 73 m") - "$ 5,000 1.73 m" + '$ 5,000 1.73 m' ```""" for reg, sub in DETOKENIZE_NUMBERS: text = re.sub(reg, sub, text) diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index bfd87a5c0b..3357c1d569 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -467,3 +467,16 @@ src/transformers/models/mvp/tokenization_mvp.py src/transformers/models/mvp/tokenization_mvp_fast.py src/transformers/models/roberta/tokenization_roberta.py src/transformers/models/roberta/tokenization_roberta_fast.py +src/transformers/models/auto/feature_extraction_auto.py +src/transformers/models/auto/image_processing_auto.py +src/transformers/models/auto/processing_auto.py +src/transformers/models/auto/tokenization_auto.py +src/transformers/models/bertweet/tokenization_bertweet.py +src/transformers/models/dpr/tokenization_dpr.py +src/transformers/models/dpr/tokenization_dpr_fast.py +src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py +src/transformers/models/m2m_100/tokenization_m2m_100.py +src/transformers/models/marian/tokenization_marian.py +src/transformers/models/roformer/tokenization_roformer.py +src/transformers/models/roformer/tokenization_roformer_fast.py +src/transformers/models/transfo_xl/tokenization_transfo_xl.py \ No 
newline at end of file