Trainer push to hub (#11328)

* Initial support for upload to hub

* push -> upload

* Fixes + examples

* Fix torchhub test

* Torchhub test I hate you

* push_model_to_hub -> push_to_hub

* Apply mixin to other pretrained models

* Remove ABC inheritance

* Add tests

* Typo

* Run tests

* Install git-lfs

* Change approach

* Add push_to_hub to all

* Staging test suite

* Typo

* Maybe like this?

* More deps

* Cache

* Adapt name

* Quality

* More tests

* Put it in testing_utils

* Docs + torchhub last hope

* Styling

* Wrong method

* Typos

* Update src/transformers/file_utils.py

Co-authored-by: Julien Chaumond <julien@huggingface.co>

* Address review comments

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

Co-authored-by: Julien Chaumond <julien@huggingface.co>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
This commit is contained in:
Sylvain Gugger
2021-04-23 09:17:37 -04:00
committed by GitHub
parent 7bc86bea68
commit bf2e0cf70b
31 changed files with 766 additions and 31 deletions

View File

@@ -34,6 +34,7 @@ import requests
from .file_utils import (
ExplicitEnum,
PaddingStrategy,
PushToHubMixin,
TensorType,
_is_jax,
_is_numpy,
@@ -1415,7 +1416,7 @@ INIT_TOKENIZER_DOCSTRING = r"""
@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerBase(SpecialTokensMixin):
class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
"""
Base class for :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`.
@@ -1850,6 +1851,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
save_directory: Union[str, os.PathLike],
legacy_format: Optional[bool] = None,
filename_prefix: Optional[str] = None,
push_to_hub: bool = False,
**kwargs,
) -> Tuple[str]:
"""
Save the full tokenizer state.
@@ -1925,13 +1928,21 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
file_names = (tokenizer_config_file, special_tokens_map_file)
return self._save_pretrained(
save_files = self._save_pretrained(
save_directory=save_directory,
file_names=file_names,
legacy_format=legacy_format,
filename_prefix=filename_prefix,
)
if push_to_hub:
# The paths returned by `_save_pretrained` can include files that were never written to disk, so keep only those that exist before pushing.
existing_files = [f for f in save_files if os.path.isfile(f)]
url = self._push_to_hub(save_files=existing_files, **kwargs)
logger.info(f"Tokenizer pushed to the hub in this commit: {url}")
return save_files
def _save_pretrained(
self,
save_directory: Union[str, os.PathLike],