[AutoDocstring] Based on inspect parsing of the signature (#33771)

* delete common docstring

* nit

* updates

* push

* fixup

* move stuff around fixup

* no need for dataclass

* damn nice modular

* add auto class docstring

* style

* modular update

* import autodocstring

* fixup

* maybe add original doc!

* more cleanup

* remove class doc as well

* update

* nits

* more cleanup

* fix

* wups

* small check

* updates

* some fixes

* fix doc

* update

* nits

* try?

* nit

* some updates

* a little bit better

* wherever we did not have help we are not really adding it!

* revert llama config

* small fixes and small tests

* test

* fixup

* more fix-copies

* updates

* updates

* fix doc building

* style

* small fixes

* nits

* fix-copies

* fix merge issues faster

* fix merge conf

* nits jamba

* ?

* working autodoc for model class and forward except returns and example

* support return section and unpack kwargs description

* nits and cleanup

* fix-copies

* fix-copies

* nits

* Add support for llava-like models

* fixup

* add class args subset support

* add examples inferred from automodel/pipelines

* update ruff

* autodocstring for Aria, Albert + fixups

* Fix empty return blocks

* fix copies

* fix copies

* add autodoc for all fast image processors + align, altclip

* fix copies

* add auto_doc for audio_spectrogram, auto_former, bark, bamba

* Drastically improve speed + add bart beit bert

* add autodoc to all bert-like models

* Fix broken doc

* fix copies

* fix auto_docstring after merge

* add autodoc to models

* add models

* add models

* add models and improve support for optional, and custom shape in args docstring

* update fast image processors

* refactor auto_method_docstring in args_doc

* add models and fix docstring parsing

* add models

* add models

* remove debugging

* add models

* add fix_auto_docstrings and improve args_docs

* add support for additional_info in args docstring

* refactor (almost) all models

* fix check docstring

* fix-copies

* fill in all missing docstrings

* fix copies

* fix qwen3 moe docstring

* add documentation

* add back labels

* update docs and fix can_return_tuple in modular files

* fix LongformerForMaskedLM docstring

* add auto_docstring to _toctree

* remove auto_docstring tests temporarily

* fix copyrights new files

* fix can_return_tuple granite hybrid

* fix fast beit

* Fix empty config doc

* add support for COMMON_CUSTOM_ARGS in check_docstrings and add missing models

* fix unclosed code block in flava

* fix can_return_tuple sam hq

* Fix Flaubert dataclass

---------

Co-authored-by: yonigozlan <yoni.gozlan@huggingface.co>
Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
This commit is contained in:
Arthur
2025-05-08 23:46:07 +02:00
committed by GitHub
parent d231f5a7d4
commit 5f5ccfdc54
405 changed files with 18189 additions and 46715 deletions

View File

@@ -258,7 +258,7 @@ def get_docstring_indent(docstring):
return 0
def is_full_docstring(new_docstring: str) -> bool:
def is_full_docstring(original_docstring: str, new_docstring: str, original_level: int) -> bool:
"""Check if `new_docstring` is a full docstring, or if it is only part of a docstring that should then
be merged with the existing old one.
"""
@@ -267,6 +267,17 @@ def is_full_docstring(new_docstring: str) -> bool:
# The docstring contains Args definition, so it is self-contained
if re.search(r"\n\s*Args:\n", new_docstring):
return True
elif re.search(r"\n\s*Args:\n", original_docstring):
return False
# Check if the docstring contains args docstring (meaning it is self contained):
param_pattern = re.compile(
# |--- Group 1 ---|| Group 2 ||- Group 3 -||---------- Group 4 ----------|
rf"^\s{{0,{original_level}}}(\w+)\s*\(\s*([^, \)]*)(\s*.*?)\s*\)\s*:\s*((?:(?!\n^\s{{0,{original_level}}}\w+\s*\().)*)",
re.DOTALL | re.MULTILINE,
)
match_object = param_pattern.search(new_docstring)
if match_object is not None:
return True
# If it contains Returns, but starts with text indented with an additional 4 spaces before, it is self-contained
# (this is the scenario when using `@add_start_docstrings_to_model_forward`, but adding more args to docstring)
match_object = re.search(r"\n([^\S\n]*)Returns:\n", new_docstring)
@@ -280,7 +291,7 @@ def is_full_docstring(new_docstring: str) -> bool:
def merge_docstrings(original_docstring, updated_docstring):
original_level = get_docstring_indent(original_docstring)
if not is_full_docstring(updated_docstring):
if not is_full_docstring(original_docstring, updated_docstring, original_level):
# Split the docstring at the example section, assuming `"""` is used to define the docstring
parts = original_docstring.split("```")
if "```" in updated_docstring and len(parts) > 1:
@@ -291,13 +302,22 @@ def merge_docstrings(original_docstring, updated_docstring):
parts[1] = new_parts[1]
updated_docstring = "".join(
[
parts[0].rstrip(" \n") + new_parts[0],
f"\n{original_level * ' '}```",
parts[1],
"```",
parts[2],
]
)
docstring_opening, original_start_docstring = parts[0].rstrip(" \n").split('"""')[:2]
new_start_docstring = new_parts[0].rstrip(" \n")
docstring_opening += '"""'
if new_start_docstring.startswith(original_start_docstring):
updated_docstring = new_start_docstring + "\n" + updated_docstring
elif original_start_docstring.endswith(new_start_docstring):
updated_docstring = original_start_docstring + "\n" + updated_docstring
else:
updated_docstring = original_start_docstring + "\n" + new_start_docstring + "\n" + updated_docstring
updated_docstring = docstring_opening + updated_docstring
elif updated_docstring not in original_docstring:
# add tabulation if we are at the lowest level.
if re.search(r"\n\s*.*\(.*\)\:\n\s*\w", updated_docstring):