Doc styler examples (#14953)

* Fix bad examples * Add black formatting to style_doc * Use first nonempty line * Put it at the right place * Don't add spaces to empty lines * Better templates * Deal with triple quotes in docstrings * Result of style_doc * Enable mdx treatment and fix code examples in MDXs * Result of doc styler on doc source files * Last fixes * Break copy from
2021-12-27 19:07:46 -05:00
parent e13f72fbff
commit b5e2b183af
211 changed files with 2738 additions and 1711 deletions
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -862,17 +862,17 @@ class SpecialTokensMixin:

        ```python
        # Let's see how to add a new classification token to GPT-2
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2Model.from_pretrained('gpt2')
+        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        model = GPT2Model.from_pretrained("gpt2")

-        special_tokens_dict = {'cls_token': '<CLS>'}
+        special_tokens_dict = {"cls_token": "<CLS>"}

        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
-        print('We have added', num_added_toks, 'tokens')
+        print("We have added", num_added_toks, "tokens")
        # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))

-        assert tokenizer.cls_token == '<CLS>'
+        assert tokenizer.cls_token == "<CLS>"
        ```"""
        if not special_tokens_dict:
            return 0
@@ -929,11 +929,11 @@ class SpecialTokensMixin:

        ```python
        # Let's see how to increase the vocabulary of Bert model and tokenizer
-        tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
-        model = BertModel.from_pretrained('bert-base-uncased')
+        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+        model = BertModel.from_pretrained("bert-base-uncased")

-        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
-        print('We have added', num_added_toks, 'tokens')
+        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
+        print("We have added", num_added_toks, "tokens")
        # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))
        ```"""
@@ -1585,22 +1585,22 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
        ```python
        # We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer
        # Download vocabulary from huggingface.co and cache.
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

        # Download vocabulary from huggingface.co (user-uploaded) and cache.
-        tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
+        tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

        # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
-        tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
+        tokenizer = BertTokenizer.from_pretrained("./test/saved_model/")

        # If the tokenizer uses a single vocabulary file, you can point directly to this file
-        tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')
+        tokenizer = BertTokenizer.from_pretrained("./test/saved_model/my_vocab.txt")

        # You can link tokens to special vocabulary when instantiating
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
+        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", unk_token="<unk>")
        # You should be sure '<unk>' is in the vocabulary when doing that.
        # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead)
-        assert tokenizer.unk_token == '<unk>'
+        assert tokenizer.unk_token == "<unk>"
        ```"""
        cache_dir = kwargs.pop("cache_dir", None)
        force_download = kwargs.pop("force_download", False)