From 3312e96bfbebbad67ff29539d2df9211923ddd70 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Tue, 13 Apr 2021 12:14:25 -0400 Subject: [PATCH] Doc check: a bit of clean up (#11224) --- docs/source/main_classes/data_collator.rst | 48 ++++++++-------------- utils/check_repo.py | 13 ++---- 2 files changed, 22 insertions(+), 39 deletions(-) diff --git a/docs/source/main_classes/data_collator.rst b/docs/source/main_classes/data_collator.rst index 8162f2d65f..1ab8b6eb2b 100644 --- a/docs/source/main_classes/data_collator.rst +++ b/docs/source/main_classes/data_collator.rst @@ -1,30 +1,26 @@ - - -DataCollator +Data Collator ----------------------------------------------------------------------------------------------------------------------- -DataCollators are objects that will form a batch by using a list of elements as input. These lists of elements are of +Data collators are objects that will form a batch by using a list of dataset elements as input. These elements are of the same type as the elements of :obj:`train_dataset` or :obj:`eval_dataset`. -A data collator will default to :func:`transformers.data.data_collator.default_data_collator` if no `tokenizer` has -been provided. This is a function that takes a list of samples from a Dataset as input and collates them into a batch -of a dict-like object. The default collator performs special handling of potential keys: +To be able to build batches, data collators may apply some processing (like padding). Some of them (like +:class:`~transformers.DataCollatorForLanguageModeling`) also apply some random data augmentation (like random masking) +on the formed batch. - - ``label``: handles a single value (int or float) per object - - ``label_ids``: handles a list of values per object - -This function does not perform any preprocessing. An example of use can be found in glue and ner. 
+Examples of use can be found in the :doc:`example scripts <../examples>` or :doc:`example notebooks <../notebooks>`. Default data collator @@ -37,47 +33,39 @@ DataCollatorWithPadding ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.data.data_collator.DataCollatorWithPadding - :special-members: __call__ :members: + DataCollatorForTokenClassification ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.data.data_collator.DataCollatorForTokenClassification - :special-members: __call__ :members: + DataCollatorForSeq2Seq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.data.data_collator.DataCollatorForSeq2Seq - :special-members: __call__ :members: + DataCollatorForLanguageModeling ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.data.data_collator.DataCollatorForLanguageModeling - :special-members: __call__ :members: mask_tokens + DataCollatorForWholeWordMask ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.data.data_collator.DataCollatorForWholeWordMask - :special-members: __call__ :members: mask_tokens -DataCollatorForSOP -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.data.data_collator.DataCollatorForSOP - :special-members: __call__ - :members: mask_tokens DataCollatorForPermutationLanguageModeling ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
autoclass:: transformers.data.data_collator.DataCollatorForPermutationLanguageModeling - :special-members: __call__ :members: mask_tokens diff --git a/utils/check_repo.py b/utils/check_repo.py index 4fa45d7c66..6f5fd8faf3 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -348,6 +348,8 @@ def find_all_documented_objects(): DEPRECATED_OBJECTS = [ "AutoModelWithLMHead", "BartPretrainedModel", + "DataCollator", + "DataCollatorForSOP", "GlueDataset", "GlueDataTrainingArguments", "LineByLineTextDataset", @@ -385,7 +387,9 @@ DEPRECATED_OBJECTS = [ UNDOCUMENTED_OBJECTS = [ "AddedToken", # This is a tokenizers class. "BasicTokenizer", # Internal, should never have been in the main init. + "CharacterTokenizer", # Internal, should never have been in the main init. "DPRPretrainedReader", # Like an Encoder. + "MecabTokenizer", # Internal, should never have been in the main init. "ModelCard", # Internal type. "SqueezeBertModule", # Internal building block (should have been called SqueezeBertLayer) "TFDPRPretrainedReader", # Like an Encoder. @@ -403,10 +407,6 @@ UNDOCUMENTED_OBJECTS = [ # This list should be empty. Objects in it should get their own doc page. SHOULD_HAVE_THEIR_OWN_PAGE = [ - # bert-japanese - "BertJapaneseTokenizer", - "CharacterTokenizer", - "MecabTokenizer", # Benchmarks "PyTorchBenchmark", "PyTorchBenchmarkArguments", @@ -448,11 +448,6 @@ def ignore_undocumented(name): # MMBT model does not really work. if name.startswith("MMBT"): return True - - # NOT DOCUMENTED BUT NOT ON PURPOSE, SHOULD BE FIXED! - # All data collators should be documented - if name.startswith("DataCollator") or name.endswith("data_collator"): - return True if name in SHOULD_HAVE_THEIR_OWN_PAGE: return True return False