From 1310e1a758edc8e89ec363db76863c771fbeb1de Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Thu, 10 Dec 2020 11:57:12 -0500 Subject: [PATCH] Enforce all objects in the main init are documented (#9014) --- docs/source/internal/generation_utils.rst | 10 + docs/source/internal/trainer_utils.rst | 9 + .../main_classes/optimizer_schedules.rst | 4 + docs/source/main_classes/pipelines.rst | 12 +- docs/source/model_doc/albert.rst | 7 + docs/source/model_doc/auto.rst | 7 + docs/source/model_doc/bart.rst | 6 + docs/source/model_doc/barthez.rst | 6 + docs/source/model_doc/bert.rst | 7 + docs/source/model_doc/camembert.rst | 7 + docs/source/model_doc/layoutlm.rst | 7 + docs/source/model_doc/mbart.rst | 7 + docs/source/model_doc/mt5.rst | 16 ++ docs/source/model_doc/pegasus.rst | 7 + docs/source/model_doc/reformer.rst | 7 + docs/source/model_doc/t5.rst | 7 + docs/source/model_doc/transformerxl.rst | 8 + docs/source/model_doc/xlmroberta.rst | 7 + docs/source/model_doc/xlnet.rst | 7 + utils/check_repo.py | 236 +++++++++++------- 20 files changed, 290 insertions(+), 94 deletions(-) diff --git a/docs/source/internal/generation_utils.rst b/docs/source/internal/generation_utils.rst index c608a75f5e..f645472ffa 100644 --- a/docs/source/internal/generation_utils.rst +++ b/docs/source/internal/generation_utils.rst @@ -32,6 +32,9 @@ generation. .. autoclass:: transformers.LogitsProcessorList :members: __call__ +.. autoclass:: transformers.LogitsWarper + :members: __call__ + .. autoclass:: transformers.MinLengthLogitsProcessor :members: __call__ @@ -67,3 +70,10 @@ BeamSearch .. autoclass:: transformers.BeamSearchScorer :members: process, finalize + +Utilities +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: transformers.top_k_top_p_filtering + +.. 
autofunction:: transformers.tf_top_k_top_p_filtering diff --git a/docs/source/internal/trainer_utils.rst b/docs/source/internal/trainer_utils.rst index 8769ad82bd..5d787620f2 100644 --- a/docs/source/internal/trainer_utils.rst +++ b/docs/source/internal/trainer_utils.rst @@ -22,6 +22,8 @@ Utilities .. autoclass:: transformers.EvalPrediction +.. autoclass:: transformers.EvaluationStrategy + .. autofunction:: transformers.set_seed .. autofunction:: transformers.torch_distributed_zero_first @@ -32,8 +34,15 @@ Callbacks internals .. autoclass:: transformers.trainer_callback.CallbackHandler + Distributed Evaluation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.trainer_pt_utils.DistributedTensorGatherer :members: + + +Trainer Argument Parser +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.HfArgumentParser diff --git a/docs/source/main_classes/optimizer_schedules.rst b/docs/source/main_classes/optimizer_schedules.rst index 7f27badbf4..b53d682375 100644 --- a/docs/source/main_classes/optimizer_schedules.rst +++ b/docs/source/main_classes/optimizer_schedules.rst @@ -74,6 +74,10 @@ Learning Rate Schedules (Pytorch) :target: /imgs/warmup_linear_schedule.png :alt: + +.. 
autofunction:: transformers.get_polynomial_decay_schedule_with_warmup + + Warmup (TensorFlow) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/main_classes/pipelines.rst b/docs/source/main_classes/pipelines.rst index f997e95065..1e4bb4a43c 100644 --- a/docs/source/main_classes/pipelines.rst +++ b/docs/source/main_classes/pipelines.rst @@ -73,8 +73,9 @@ FillMaskPipeline NerPipeline ======================================================================================================================= -This class is an alias of the :class:`~transformers.TokenClassificationPipeline` defined below. Please refer to that -pipeline for documentation and usage examples. +.. autoclass:: transformers.NerPipeline + +See :class:`~transformers.TokenClassificationPipeline` for all details. QuestionAnsweringPipeline ======================================================================================================================= @@ -118,6 +119,13 @@ TokenClassificationPipeline :special-members: __call__ :members: +TranslationPipeline +======================================================================================================================= + +.. autoclass:: transformers.TranslationPipeline + :special-members: __call__ + :members: + ZeroShotClassificationPipeline ======================================================================================================================= diff --git a/docs/source/model_doc/albert.rst b/docs/source/model_doc/albert.rst index 6c5f85ccb5..256695df9b 100644 --- a/docs/source/model_doc/albert.rst +++ b/docs/source/model_doc/albert.rst @@ -60,6 +60,13 @@ AlbertTokenizer create_token_type_ids_from_sequences, save_vocabulary +AlbertTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: transformers.AlbertTokenizerFast + :members: + + Albert specific outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/auto.rst b/docs/source/model_doc/auto.rst index dd325d5be9..b4776a737c 100644 --- a/docs/source/model_doc/auto.rst +++ b/docs/source/model_doc/auto.rst @@ -175,3 +175,10 @@ TFAutoModelForQuestionAnswering .. autoclass:: transformers.TFAutoModelForQuestionAnswering :members: + + +FlaxAutoModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.FlaxAutoModel + :members: diff --git a/docs/source/model_doc/bart.rst b/docs/source/model_doc/bart.rst index 678d3236b0..189f3c5e35 100644 --- a/docs/source/model_doc/bart.rst +++ b/docs/source/model_doc/bart.rst @@ -98,6 +98,12 @@ BartTokenizer :members: +BartTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BartTokenizerFast + :members: + BartModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/barthez.rst b/docs/source/model_doc/barthez.rst index e58213e4d9..f0f1d4be42 100644 --- a/docs/source/model_doc/barthez.rst +++ b/docs/source/model_doc/barthez.rst @@ -51,3 +51,9 @@ BarthezTokenizer .. autoclass:: transformers.BarthezTokenizer :members: + +BarthezTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BarthezTokenizerFast + :members: diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst index 77493d2ac5..0ed892783c 100644 --- a/docs/source/model_doc/bert.rst +++ b/docs/source/model_doc/bert.rst @@ -207,3 +207,10 @@ FlaxBertModel .. 
autoclass:: transformers.FlaxBertModel :members: __call__ + + +FlaxBertForMaskedLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.FlaxBertForMaskedLM + :members: __call__ diff --git a/docs/source/model_doc/camembert.rst b/docs/source/model_doc/camembert.rst index 44696b3789..c8f7d7998b 100644 --- a/docs/source/model_doc/camembert.rst +++ b/docs/source/model_doc/camembert.rst @@ -54,6 +54,13 @@ CamembertTokenizer create_token_type_ids_from_sequences, save_vocabulary +CamembertTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.CamembertTokenizerFast + :members: + + CamembertModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/layoutlm.rst b/docs/source/model_doc/layoutlm.rst index 8158372539..1dc9ee971f 100644 --- a/docs/source/model_doc/layoutlm.rst +++ b/docs/source/model_doc/layoutlm.rst @@ -57,6 +57,13 @@ LayoutLMTokenizer :members: +LayoutLMTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.LayoutLMTokenizerFast + :members: + + LayoutLMModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/mbart.rst b/docs/source/model_doc/mbart.rst index ae11425b7e..eb9b979802 100644 --- a/docs/source/model_doc/mbart.rst +++ b/docs/source/model_doc/mbart.rst @@ -90,6 +90,13 @@ MBartTokenizer :members: build_inputs_with_special_tokens, prepare_seq2seq_batch +MBartTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: transformers.MBartTokenizerFast + :members: + + MBartForConditionalGeneration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/mt5.rst b/docs/source/model_doc/mt5.rst index 5ae01bfcd4..f6c7af74c8 100644 --- a/docs/source/model_doc/mt5.rst +++ b/docs/source/model_doc/mt5.rst @@ -37,6 +37,22 @@ MT5Config :members: +MT5Tokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MT5Tokenizer + +See :class:`~transformers.T5Tokenizer` for all details. + + +MT5TokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MT5TokenizerFast + +See :class:`~transformers.T5TokenizerFast` for all details. + + MT5Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/pegasus.rst b/docs/source/model_doc/pegasus.rst index df9a3369fa..42b3e5ea57 100644 --- a/docs/source/model_doc/pegasus.rst +++ b/docs/source/model_doc/pegasus.rst @@ -112,6 +112,13 @@ warning: ``add_tokens`` does not work at the moment. :members: __call__, prepare_seq2seq_batch +PegasusTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: transformers.PegasusTokenizerFast + :members: + + PegasusForConditionalGeneration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/reformer.rst b/docs/source/model_doc/reformer.rst index 730f90a903..c46bd2bb74 100644 --- a/docs/source/model_doc/reformer.rst +++ b/docs/source/model_doc/reformer.rst @@ -163,6 +163,13 @@ ReformerTokenizer :members: save_vocabulary +ReformerTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ReformerTokenizerFast + :members: + + ReformerModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/t5.rst b/docs/source/model_doc/t5.rst index 29eba6a82b..60a7b58492 100644 --- a/docs/source/model_doc/t5.rst +++ b/docs/source/model_doc/t5.rst @@ -107,6 +107,13 @@ T5Tokenizer create_token_type_ids_from_sequences, prepare_seq2seq_batch, save_vocabulary +T5TokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.T5TokenizerFast + :members: + + T5Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/transformerxl.rst b/docs/source/model_doc/transformerxl.rst index fab2de06b7..fe9a6e5b46 100644 --- a/docs/source/model_doc/transformerxl.rst +++ b/docs/source/model_doc/transformerxl.rst @@ -105,3 +105,11 @@ TFTransfoXLLMHeadModel .. autoclass:: transformers.TFTransfoXLLMHeadModel :members: call + + +Internal Layers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.AdaptiveEmbedding + +.. 
autoclass:: transformers.TFAdaptiveEmbedding diff --git a/docs/source/model_doc/xlmroberta.rst b/docs/source/model_doc/xlmroberta.rst index 45747022d3..c95954a201 100644 --- a/docs/source/model_doc/xlmroberta.rst +++ b/docs/source/model_doc/xlmroberta.rst @@ -62,6 +62,13 @@ XLMRobertaTokenizer create_token_type_ids_from_sequences, save_vocabulary +XLMRobertaTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.XLMRobertaTokenizerFast + :members: + + XLMRobertaModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst index 89cd83b834..bdf8dbeb81 100644 --- a/docs/source/model_doc/xlnet.rst +++ b/docs/source/model_doc/xlnet.rst @@ -62,6 +62,13 @@ XLNetTokenizer create_token_type_ids_from_sequences, save_vocabulary +XLNetTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.XLNetTokenizerFast + :members: + + XLNet specific outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/utils/check_repo.py b/utils/check_repo.py index 11ca4bd976..a367134fd9 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -17,6 +17,7 @@ import importlib import inspect import os import re +from pathlib import Path # All paths are set with the intent you should run this script from the root of the repo with the command @@ -57,28 +58,6 @@ TEST_FILES_WITH_NO_COMMON_TESTS = [ "test_modeling_xlm_roberta.py", ] -# Update this list for models that are not documented with a comment explaining the reason it should not be. -# Being in this list is an exception and should **not** be the rule. 
-IGNORE_NON_DOCUMENTED = [ - "BartDecoder", # Building part of bigger (documented) model. - "BartEncoder", # Building part of bigger (documented) model. - "DPREncoder", # Building part of bigger (documented) model. - "DPRSpanPredictor", # Building part of bigger (documented) model. - "T5Stack", # Building part of bigger (documented) model. - "TFDPREncoder", # Building part of bigger (documented) model. - "TFDPRSpanPredictor", # Building part of bigger (documented) model. -] - -# Update this dict with any special correspondance model name (used in modeling_xxx.py) to doc file. -MODEL_NAME_TO_DOC_FILE = { - "openai": "gpt.rst", - "transfo_xl": "transformerxl.rst", - "xlm_prophetnet": "xlmprophetnet.rst", - "xlm_roberta": "xlmroberta.rst", - "bert_generation": "bertgeneration.rst", - "marian": "marian.rst", -} - # Update this list for models that are not in any of the auto MODEL_XXX_MAPPING. Being in this list is an exception and # should **not** be the rule. IGNORE_NON_AUTO_CONFIGURED = [ @@ -192,22 +171,6 @@ def get_model_test_files(): return test_files -# If some doc source files should be ignored when checking models are all documented, they should be added in the -# nested list _ignore_modules of this function. -def get_model_doc_files(): - """ Get the model doc files.""" - _ignore_modules = [ - "auto", - "dialogpt", - "retribert", - ] - doc_files = [] - for filename in os.listdir(PATH_TO_DOC): - if os.path.isfile(f"{PATH_TO_DOC}/{filename}") and not os.path.splitext(filename)[0] in _ignore_modules: - doc_files.append(filename) - return doc_files - - # This is a bit hacky but I didn't find a way to import the test_file as a module and read inside the tester class # for the all_model_classes variable. 
def find_tested_models(test_file): @@ -269,58 +232,6 @@ def check_all_models_are_tested(): raise Exception(f"There were {len(failures)} failures:\n" + "\n".join(failures)) -def find_documented_classes(doc_file): - """ Parse the content of doc_file to detect which classes it documents""" - with open(os.path.join(PATH_TO_DOC, doc_file), "r", encoding="utf-8", newline="\n") as f: - content = f.read() - return re.findall(r"autoclass:: transformers.(\S+)\s+", content) - - -def check_models_are_documented(module, doc_file): - """ Check models defined in module are documented in doc_file.""" - defined_models = get_models(module) - documented_classes = find_documented_classes(doc_file) - failures = [] - for model_name, _ in defined_models: - if model_name not in documented_classes and model_name not in IGNORE_NON_DOCUMENTED: - failures.append( - f"{model_name} is defined in {module.__name__} but is not documented in " - + f"{os.path.join(PATH_TO_DOC, doc_file)}. Add it to that file." - + "If this model should not be documented, add its name to `IGNORE_NON_DOCUMENTED`" - + "in the file `utils/check_repo.py`." - ) - return failures - - -def _get_model_name(module): - """ Get the model name for the module defining it.""" - module_name = module.__name__.split(".")[-1] - splits = module_name.split("_") - splits = splits[(2 if splits[1] in ["flax", "tf"] else 1) :] - return "_".join(splits) - - -def check_all_models_are_documented(): - """ Check all models are properly documented.""" - modules = get_model_modules() - doc_files = get_model_doc_files() - failures = [] - for module in modules: - model_name = _get_model_name(module) - doc_file = MODEL_NAME_TO_DOC_FILE.get(model_name, f"{model_name}.rst") - if doc_file not in doc_files: - failures.append( - f"{module.__name__} does not have its corresponding doc file {doc_file}. " - + f"If the doc file exists but isn't named {doc_file}, update `MODEL_NAME_TO_DOC_FILE` " - + "in the file `utils/check_repo.py`." 
-            )
-        new_failures = check_models_are_documented(module, doc_file)
-        if new_failures is not None:
-            failures += new_failures
-    if len(failures) > 0:
-        raise Exception(f"There were {len(failures)} failures:\n" + "\n".join(failures))
-
-
 def get_all_auto_configured_models():
     """ Return the list of all models in at least one auto class."""
     result = set()  # To avoid duplicates we concatenate all model classes in a set.
@@ -396,13 +307,154 @@ def check_all_decorator_order():
         )


+def find_all_documented_objects():
+    """ Parse the content of all doc files to detect which classes and functions they document."""
+    documented_obj = []
+    for doc_file in Path(PATH_TO_DOC).glob("**/*.rst"):
+        with open(doc_file, "r", encoding="utf-8", newline="\n") as f:
+            content = f.read()
+        raw_doc_objs = re.findall(r"(?:autoclass|autofunction):: transformers\.(\S+)\s+", content)
+        documented_obj += [obj.split(".")[-1] for obj in raw_doc_objs]
+    return documented_obj
+
+
+# One good reason for not being documented is to be deprecated. Put in this list deprecated objects.
+DEPRECATED_OBJECTS = [
+    "AutoModelWithLMHead",
+    "GlueDataset",
+    "GlueDataTrainingArguments",
+    "LineByLineTextDataset",
+    "LineByLineWithRefDataset",
+    "LineByLineWithSOPTextDataset",
+    "PretrainedBartModel",
+    "PretrainedFSMTModel",
+    "SingleSentenceClassificationProcessor",
+    "SquadDataTrainingArguments",
+    "SquadDataset",
+    "SquadExample",
+    "SquadFeatures",
+    "SquadV1Processor",
+    "SquadV2Processor",
+    "TFAutoModelWithLMHead",
+    "TextDataset",
+    "TextDatasetForNextSentencePrediction",
+    "glue_compute_metrics",
+    "glue_convert_examples_to_features",
+    "glue_output_modes",
+    "glue_processors",
+    "glue_tasks_num_labels",
+    "squad_convert_examples_to_features",
+    "xnli_compute_metrics",
+    "xnli_output_modes",
+    "xnli_processors",
+    "xnli_tasks_num_labels",
+]
+
+# Exceptionally, some objects should not be documented after all rules passed.
+# ONLY PUT SOMETHING IN THIS LIST AS A LAST RESORT!
+UNDOCUMENTED_OBJECTS = [
+    "AddedToken",  # This is a tokenizers class.
+    "BasicTokenizer",  # Internal, should never have been in the main init.
+    "DPRPretrainedReader",  # Like an Encoder.
+    "ModelCard",  # Internal type.
+    "SqueezeBertModule",  # Internal building block (should have been called SqueezeBertLayer)
+    "TFDPRPretrainedReader",  # Like an Encoder.
+    "TransfoXLCorpus",  # Internal type.
+    "WordpieceTokenizer",  # Internal, should never have been in the main init.
+    "absl",  # External module
+    "add_end_docstrings",  # Internal, should never have been in the main init.
+    "add_start_docstrings",  # Internal, should never have been in the main init.
+    "cached_path",  # Internal used for downloading models.
+    "convert_tf_weight_name_to_pt_weight_name",  # Internal used to convert model weights
+    "logger",  # Internal logger
+    "logging",  # External module
+]
+
+# This list should be empty. Objects in it should get their own doc page.
+SHOULD_HAVE_THEIR_OWN_PAGE = [
+    # bert-japanese
+    "BertJapaneseTokenizer",
+    "CharacterTokenizer",
+    "MecabTokenizer",
+    # Bertweet
+    "BertweetTokenizer",
+    # Herbert
+    "HerbertTokenizer",
+    "HerbertTokenizerFast",
+    # Phobert
+    "PhobertTokenizer",
+    # Benchmarks
+    "PyTorchBenchmark",
+    "PyTorchBenchmarkArguments",
+    "TensorFlowBenchmark",
+    "TensorFlowBenchmarkArguments",
+]
+
+
+def ignore_undocumented(name):
+    """Rules to determine if `name` should be undocumented."""
+    # NOT DOCUMENTED ON PURPOSE.
+    # Magic attributes are not documented.
+    if name.startswith("__"):
+        return True
+    # Constants uppercase are not documented.
+    if name.isupper():
+        return True
+    # PreTrainedModels / Encoders / Decoders / Layers / Embeddings / Attention are not documented.
+    if (
+        name.endswith("PreTrainedModel")
+        or name.endswith("Decoder")
+        or name.endswith("Encoder")
+        or name.endswith("Layer")
+        or name.endswith("Embeddings")
+        or name.endswith("Attention")
+    ):
+        return True
+    # Submodules are not documented.
+    if os.path.isdir(os.path.join(PATH_TO_TRANSFORMERS, name)) or os.path.isfile(
+        os.path.join(PATH_TO_TRANSFORMERS, f"{name}.py")
+    ):
+        return True
+    # All load functions are not documented.
+    if name.startswith("load_tf") or name.startswith("load_pytorch"):
+        return True
+    # is_xxx_available functions are not documented.
+    if name.startswith("is_") and name.endswith("_available"):
+        return True
+    # Deprecated objects and explicitly undocumented objects are not documented.
+    if name in DEPRECATED_OBJECTS or name in UNDOCUMENTED_OBJECTS:
+        return True
+    # MMBT model does not really work.
+    if name.startswith("MMBT"):
+        return True
+
+    # NOT DOCUMENTED BUT NOT ON PURPOSE, SHOULD BE FIXED!
+    # All data collators should be documented
+    if name.startswith("DataCollator") or name.endswith("data_collator"):
+        return True
+    if name in SHOULD_HAVE_THEIR_OWN_PAGE:
+        return True
+    return False
+
+
+def check_all_objects_are_documented():
+    """ Check all objects in the main init are properly documented."""
+    documented_objs = find_all_documented_objects()
+    undocumented_objs = [c for c in dir(transformers) if c not in documented_objs and not ignore_undocumented(c)]
+    if len(undocumented_objs) > 0:
+        raise Exception(
+            "The following objects are in the public init so should be documented:\n - "
+            + "\n - ".join(undocumented_objs)
+        )
+
+
 def check_repo_quality():
     """ Check all models are properly tested and documented."""
     print("Checking all models are properly tested.")
     check_all_decorator_order()
     check_all_models_are_tested()
-    print("Checking all models are properly documented.")
-    check_all_models_are_documented()
+    print("Checking all objects are properly documented.")
+    check_all_objects_are_documented()
     print("Checking all models are in at least one auto class.")
     check_all_models_are_auto_configured()