Refactor Code samples; Test code samples (#5036)
* Refactor code samples * Test docstrings * Style * Tokenization examples * Run rest of tests * First step to testing source docs * Style and BART comment * Test the remainder of the code samples * Style * let to const * Formatting fixes * Ready for merge * Fix fixture + Style * Fix last tests * Update docs/source/quicktour.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Addressing @sgugger's comments + Fix MobileBERT in TF Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -186,6 +186,263 @@ def add_end_docstrings(*docstr):
|
||||
return docstring_decorator
|
||||
|
||||
|
||||
# Docstring template showing a PyTorch token-classification forward pass.
# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and
# {checkpoint} through `str.format`. The doctest lines are indented under
# `Example::` so reStructuredText renders them as one literal block.
PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
    Example::

        >>> from transformers import {tokenizer_class}, {model_class}
        >>> import torch

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> model = {model_class}.from_pretrained('{checkpoint}')

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1

        >>> outputs = model(**inputs, labels=labels)
        >>> loss, scores = outputs[:2]
"""
|
||||
|
||||
# Docstring template showing a PyTorch question-answering forward pass with
# start/end position labels. `add_code_sample_docstrings` fills
# {tokenizer_class}, {model_class} and {checkpoint} through `str.format`.
# The doctest lines are indented under `Example::` so reStructuredText
# renders them as one literal block.
PT_QUESTION_ANSWERING_SAMPLE = r"""
    Example::

        >>> from transformers import {tokenizer_class}, {model_class}
        >>> import torch

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> model = {model_class}.from_pretrained('{checkpoint}')

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> start_positions = torch.tensor([1])
        >>> end_positions = torch.tensor([3])

        >>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
        >>> loss, start_scores, end_scores = outputs[:3]
"""
|
||||
|
||||
# Docstring template showing a PyTorch sequence-classification forward pass
# with a single label. `add_code_sample_docstrings` fills {tokenizer_class},
# {model_class} and {checkpoint} through `str.format`. The doctest lines are
# indented under `Example::` so reStructuredText renders them as one literal
# block.
PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
    Example::

        >>> from transformers import {tokenizer_class}, {model_class}
        >>> import torch

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> model = {model_class}.from_pretrained('{checkpoint}')

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
        >>> outputs = model(**inputs, labels=labels)
        >>> loss, logits = outputs[:2]
"""
|
||||
|
||||
# Docstring template showing a PyTorch masked-LM forward pass that reuses the
# input ids as labels. `add_code_sample_docstrings` fills {tokenizer_class},
# {model_class} and {checkpoint} through `str.format`. The doctest lines are
# indented under `Example::` so reStructuredText renders them as one literal
# block.
PT_MASKED_LM_SAMPLE = r"""
    Example::

        >>> from transformers import {tokenizer_class}, {model_class}
        >>> import torch

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> model = {model_class}.from_pretrained('{checkpoint}')

        >>> input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]

        >>> outputs = model(input_ids, labels=input_ids)
        >>> loss, prediction_scores = outputs[:2]
"""
|
||||
|
||||
# Docstring template showing a plain PyTorch forward pass and how to read the
# first output element. `add_code_sample_docstrings` fills {tokenizer_class},
# {model_class} and {checkpoint} through `str.format`. The doctest lines are
# indented under `Example::` so reStructuredText renders them as one literal
# block.
PT_BASE_MODEL_SAMPLE = r"""
    Example::

        >>> from transformers import {tokenizer_class}, {model_class}
        >>> import torch

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> model = {model_class}.from_pretrained('{checkpoint}')

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
|
||||
|
||||
# Docstring template showing a PyTorch multiple-choice forward pass over two
# candidate endings. `add_code_sample_docstrings` fills {tokenizer_class},
# {model_class} and {checkpoint} through `str.format`; the doubled braces
# around the dict comprehension are `str.format` escapes for literal braces.
# The doctest lines are indented under `Example::` so reStructuredText
# renders them as one literal block.
# The sample uses `padding=True` rather than the older `pad_to_max_length`
# flag, matching the call-style tokenizer API used throughout these samples.
PT_MULTIPLE_CHOICE_SAMPLE = r"""
    Example::

        >>> from transformers import {tokenizer_class}, {model_class}
        >>> import torch

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> model = {model_class}.from_pretrained('{checkpoint}')

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> choice0 = "It is eaten with a fork and a knife."
        >>> choice1 = "It is eaten while held in the hand."
        >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1

        >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
        >>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1

        >>> # the linear classifier still needs to be trained
        >>> loss, logits = outputs[:2]
"""
|
||||
|
||||
# Docstring template showing a PyTorch causal-LM forward pass that reuses the
# input ids as labels. `add_code_sample_docstrings` fills {tokenizer_class},
# {model_class} and {checkpoint} through `str.format`. The doctest lines are
# indented under `Example::` so reStructuredText renders them as one literal
# block.
PT_CAUSAL_LM_SAMPLE = r"""
    Example::

        >>> import torch
        >>> from transformers import {tokenizer_class}, {model_class}

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> model = {model_class}.from_pretrained('{checkpoint}')

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs, labels=inputs["input_ids"])
        >>> loss, logits = outputs[:2]
"""
|
||||
|
||||
# Docstring template showing a TensorFlow token-classification call where the
# labels travel inside the inputs mapping. `add_code_sample_docstrings` fills
# {tokenizer_class}, {model_class} and {checkpoint} through `str.format`.
# The doctest lines are indented under `Example::` so reStructuredText
# renders them as one literal block.
TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
    Example::

        >>> from transformers import {tokenizer_class}, {model_class}
        >>> import tensorflow as tf

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> model = {model_class}.from_pretrained('{checkpoint}')

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
        >>> input_ids = inputs["input_ids"]
        >>> inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1

        >>> outputs = model(inputs)
        >>> loss, scores = outputs[:2]
"""
|
||||
|
||||
# Docstring template showing a TensorFlow question-answering call and how to
# turn the start/end scores back into an answer string.
# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and
# {checkpoint} through `str.format`. The doctest lines are indented under
# `Example::` so reStructuredText renders them as one literal block.
TF_QUESTION_ANSWERING_SAMPLE = r"""
    Example::

        >>> from transformers import {tokenizer_class}, {model_class}
        >>> import tensorflow as tf

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> model = {model_class}.from_pretrained('{checkpoint}')

        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
        >>> input_dict = tokenizer(question, text, return_tensors='tf')
        >>> start_scores, end_scores = model(input_dict)

        >>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
        >>> answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
|
||||
|
||||
# Docstring template showing a TensorFlow sequence-classification call where
# the label travels inside the inputs mapping. `add_code_sample_docstrings`
# fills {tokenizer_class}, {model_class} and {checkpoint} through
# `str.format`. The doctest lines are indented under `Example::` so
# reStructuredText renders them as one literal block.
TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
    Example::

        >>> from transformers import {tokenizer_class}, {model_class}
        >>> import tensorflow as tf

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> model = {model_class}.from_pretrained('{checkpoint}')

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
        >>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1

        >>> outputs = model(inputs)
        >>> loss, logits = outputs[:2]
"""
|
||||
|
||||
# Docstring template showing a TensorFlow masked-LM call on encoded input ids.
# `add_code_sample_docstrings` fills {tokenizer_class}, {model_class} and
# {checkpoint} through `str.format`. The blank line after `Example::` is
# required: reStructuredText only starts a literal block when the `::`
# paragraph is followed by a blank line and then an indented block.
TF_MASKED_LM_SAMPLE = r"""
    Example::

        >>> from transformers import {tokenizer_class}, {model_class}
        >>> import tensorflow as tf

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> model = {model_class}.from_pretrained('{checkpoint}')

        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1

        >>> outputs = model(input_ids)
        >>> prediction_scores = outputs[0]
"""
|
||||
|
||||
# Docstring template showing a plain TensorFlow model call and how to read
# the first output element. `add_code_sample_docstrings` fills
# {tokenizer_class}, {model_class} and {checkpoint} through `str.format`.
# The doctest lines are indented under `Example::` so reStructuredText
# renders them as one literal block.
TF_BASE_MODEL_SAMPLE = r"""
    Example::

        >>> from transformers import {tokenizer_class}, {model_class}
        >>> import tensorflow as tf

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> model = {model_class}.from_pretrained('{checkpoint}')

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
        >>> outputs = model(inputs)

        >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
|
||||
|
||||
# Docstring template showing a TensorFlow multiple-choice call over two
# candidate endings. `add_code_sample_docstrings` fills {tokenizer_class},
# {model_class} and {checkpoint} through `str.format`; the doubled braces
# around the dict comprehension are `str.format` escapes for literal braces.
# The doctest lines are indented under `Example::` so reStructuredText
# renders them as one literal block.
# The sample uses `padding=True` rather than the older `pad_to_max_length`
# flag, matching the call-style tokenizer API used throughout these samples.
TF_MULTIPLE_CHOICE_SAMPLE = r"""
    Example::

        >>> from transformers import {tokenizer_class}, {model_class}
        >>> import tensorflow as tf

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> model = {model_class}.from_pretrained('{checkpoint}')

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> choice0 = "It is eaten with a fork and a knife."
        >>> choice1 = "It is eaten while held in the hand."

        >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', padding=True)
        >>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
        >>> outputs = model(inputs) # batch size is 1

        >>> # the linear classifier still needs to be trained
        >>> logits = outputs[0]
"""
|
||||
|
||||
# Docstring template showing a TensorFlow causal-LM call and how to read the
# logits from the first output element. `add_code_sample_docstrings` fills
# {tokenizer_class}, {model_class} and {checkpoint} through `str.format`.
# The doctest lines are indented under `Example::` so reStructuredText
# renders them as one literal block.
TF_CAUSAL_LM_SAMPLE = r"""
    Example::

        >>> from transformers import {tokenizer_class}, {model_class}
        >>> import tensorflow as tf

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> model = {model_class}.from_pretrained('{checkpoint}')

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
        >>> outputs = model(inputs)
        >>> logits = outputs[0]
"""
|
||||
|
||||
|
||||
def add_code_sample_docstrings(*docstr, tokenizer_class=None, checkpoint=None):
    """Build a decorator that appends a usage example to a function's docstring.

    The example template is chosen from the name of the class that owns the
    decorated function (the first component of ``fn.__qualname__``): the task
    is recognised by a substring of that name, and a ``TF`` prefix selects
    the TensorFlow variant of the template instead of the PyTorch one.

    Args:
        *docstr: Extra text fragments, concatenated and inserted between the
            existing docstring and the generated example.
        tokenizer_class: Value substituted for ``{tokenizer_class}`` in the
            template.
        checkpoint: Value substituted for ``{checkpoint}`` in the template.

    Returns:
        A decorator that updates ``fn.__doc__`` in place and returns ``fn``.

    Raises:
        ValueError: Raised by the decorator when the owning class name
            contains none of the known task markers.
    """

    def docstring_decorator(fn):
        model_class = fn.__qualname__.split(".")[0]

        # (marker, PyTorch template, TensorFlow template). Order matters: the
        # first marker contained in the class name wins, so the generic
        # "Model" entry has to stay last.
        samples_by_marker = (
            ("SequenceClassification", PT_SEQUENCE_CLASSIFICATION_SAMPLE, TF_SEQUENCE_CLASSIFICATION_SAMPLE),
            ("QuestionAnswering", PT_QUESTION_ANSWERING_SAMPLE, TF_QUESTION_ANSWERING_SAMPLE),
            ("TokenClassification", PT_TOKEN_CLASSIFICATION_SAMPLE, TF_TOKEN_CLASSIFICATION_SAMPLE),
            ("MultipleChoice", PT_MULTIPLE_CHOICE_SAMPLE, TF_MULTIPLE_CHOICE_SAMPLE),
            ("MaskedLM", PT_MASKED_LM_SAMPLE, TF_MASKED_LM_SAMPLE),
            ("LMHead", PT_CAUSAL_LM_SAMPLE, TF_CAUSAL_LM_SAMPLE),
            ("Model", PT_BASE_MODEL_SAMPLE, TF_BASE_MODEL_SAMPLE),
        )

        for marker, pt_sample, tf_sample in samples_by_marker:
            if marker in model_class:
                code_sample = tf_sample if model_class.startswith("TF") else pt_sample
                break
        else:
            raise ValueError(f"Docstring can't be built for model {model_class}")

        built_doc = code_sample.format(
            model_class=model_class, tokenizer_class=tokenizer_class, checkpoint=checkpoint
        )
        existing_doc = fn.__doc__ or ""
        fn.__doc__ = existing_doc + "".join(docstr) + built_doc
        return fn

    return docstring_decorator
|
||||
|
||||
|
||||
def is_remote_url(url_or_filename):
    """Return True when the argument parses as a URL with an http or https scheme."""
    scheme = urlparse(url_or_filename).scheme
    return scheme in ("http", "https")
|
||||
|
||||
Reference in New Issue
Block a user