From 9badcecf694f174da929bcfe668ca57851960ad8 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 24 Mar 2022 10:26:27 +0100 Subject: [PATCH] [Doctests] Make TFRoberta-like meaningfull (#16370) * update doc examples for TFRoberta * fix style * fix style * use TF ckpt * apply suggestion * add the code file to test here * fix style Co-authored-by: ydshieh --- .../models/roberta/modeling_tf_roberta.py | 15 ++- src/transformers/utils/doc.py | 93 ++++++++++++++----- utils/documentation_tests.txt | 1 + 3 files changed, 84 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index 4ae381451c..bbdf7ebf33 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -1076,6 +1076,9 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos checkpoint=_CHECKPOINT_FOR_DOC, output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC, + mask="<mask>", + expected_output="' Paris'", + expected_loss=0.1, ) def call( self, @@ -1331,9 +1334,11 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="cardiffnlp/twitter-roberta-base-emotion", output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, + expected_output="'optimism'", + expected_loss=0.08, ) def call( self, @@ -1543,9 +1548,11 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + 
checkpoint="ydshieh/roberta-large-ner-english", output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC, + expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']", + expected_loss=0.01, ) def call( self, @@ -1628,9 +1635,11 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="ydshieh/roberta-base-squad2", output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, + expected_output="' puppet'", + expected_loss=0.86, ) def call( self, diff --git a/src/transformers/utils/doc.py b/src/transformers/utils/doc.py index 17f8adeb26..394d2aaa2f 100644 --- a/src/transformers/utils/doc.py +++ b/src/transformers/utils/doc.py @@ -618,15 +618,26 @@ TF_TOKEN_CLASSIFICATION_SAMPLE = r""" >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}") >>> model = {model_class}.from_pretrained("{checkpoint}") - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf") - >>> input_ids = inputs["input_ids"] - >>> inputs["labels"] = tf.reshape( - ... tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids)) - >>> ) # Batch size 1 + >>> inputs = tokenizer( + ... "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="tf" + ... ) - >>> outputs = model(inputs) - >>> loss = outputs.loss - >>> logits = outputs.logits + >>> logits = model(**inputs).logits + >>> predicted_token_class_ids = tf.math.argmax(logits, axis=-1) + + >>> # Note that tokens are classified rather than input words which means that + >>> # there might be more predicted token classes than words. 
+ >>> # Multiple token classes might account for the same word + >>> predicted_tokens_classes = [model.config.id2label[t] for t in predicted_token_class_ids[0].numpy().tolist()] + >>> predicted_tokens_classes + {expected_output} + ``` + + ```python + >>> labels = predicted_token_class_ids + >>> loss = tf.math.reduce_mean(model(**inputs, labels=labels).loss) + >>> round(float(loss), 2) + {expected_loss} ``` """ @@ -641,13 +652,26 @@ TF_QUESTION_ANSWERING_SAMPLE = r""" >>> model = {model_class}.from_pretrained("{checkpoint}") >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" - >>> input_dict = tokenizer(question, text, return_tensors="tf") - >>> outputs = model(input_dict) - >>> start_logits = outputs.start_logits - >>> end_logits = outputs.end_logits - >>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0]) - >>> answer = " ".join(all_tokens[tf.math.argmax(start_logits, 1)[0] : tf.math.argmax(end_logits, 1)[0] + 1]) + >>> inputs = tokenizer(question, text, return_tensors="tf") + >>> outputs = model(**inputs) + + >>> answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0]) + >>> answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0]) + + >>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] + >>> tokenizer.decode(predict_answer_tokens) + {expected_output} + ``` + + ```python + >>> # target is "nice puppet" + >>> target_start_index, target_end_index = tf.constant([14]), tf.constant([15]) + + >>> outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index) + >>> loss = tf.math.reduce_mean(outputs.loss) + >>> round(float(loss), 2) + {expected_loss} ``` """ @@ -662,11 +686,23 @@ TF_SEQUENCE_CLASSIFICATION_SAMPLE = r""" >>> model = {model_class}.from_pretrained("{checkpoint}") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf") - >>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 
1 - >>> outputs = model(inputs) - >>> loss = outputs.loss - >>> logits = outputs.logits + >>> logits = model(**inputs).logits + + >>> predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0]) + >>> model.config.id2label[predicted_class_id] + {expected_output} + ``` + + ```python + >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)` + >>> num_labels = len(model.config.id2label) + >>> model = {model_class}.from_pretrained("{checkpoint}", num_labels=num_labels) + + >>> labels = tf.constant(1) + >>> loss = model(**inputs, labels=labels).loss + >>> round(float(loss), 2) + {expected_loss} ``` """ @@ -681,11 +717,24 @@ TF_MASKED_LM_SAMPLE = r""" >>> model = {model_class}.from_pretrained("{checkpoint}") >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="tf") - >>> inputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"] + >>> logits = model(**inputs).logits - >>> outputs = model(inputs) - >>> loss = outputs.loss - >>> logits = outputs.logits + >>> # retrieve index of {mask} + >>> mask_token_index = tf.where(inputs.input_ids == tokenizer.mask_token_id)[0][1] + + >>> predicted_token_id = tf.math.argmax(logits[0, mask_token_index], axis=-1) + >>> tokenizer.decode(predicted_token_id) + {expected_output} + ``` + + ```python + >>> labels = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"] + >>> # mask labels of non-{mask} tokens + >>> labels = tf.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100) + + >>> outputs = model(**inputs, labels=labels) + >>> round(float(outputs.loss), 2) + {expected_loss} ``` """ diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 7d31045184..b5d9f8570c 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -30,6 +30,7 @@ src/transformers/models/poolformer/modeling_poolformer.py src/transformers/models/resnet/modeling_resnet.py 
src/transformers/models/resnet/modeling_resnet.py src/transformers/models/roberta/modeling_roberta.py +src/transformers/models/roberta/modeling_tf_roberta.py src/transformers/models/segformer/modeling_segformer.py src/transformers/models/sew/modeling_sew.py src/transformers/models/sew_d/modeling_sew_d.py