Trainer push to hub (#11328)
* Initial support for upload to hub
* push -> upload
* Fixes + examples
* Fix torchhub test
* Torchhub test I hate you
* push_model_to_hub -> push_to_hub
* Apply mixin to other pretrained models
* Remove ABC inheritance
* Add tests
* Typo
* Run tests
* Install git-lfs
* Change approach
* Add push_to_hub to all
* Staging test suite
* Typo
* Maybe like this?
* More deps
* Cache
* Adapt name
* Quality
* MOAR tests
* Put it in testing_utils
* Docs + torchhub last hope
* Styling
* Wrong method
* Typos
* Update src/transformers/file_utils.py
Co-authored-by: Julien Chaumond <julien@huggingface.co>
* Address review comments
* Apply suggestions from code review
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Julien Chaumond <julien@huggingface.co>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
This commit is contained in:
@@ -34,6 +34,7 @@ import requests
|
||||
from .file_utils import (
|
||||
ExplicitEnum,
|
||||
PaddingStrategy,
|
||||
PushToHubMixin,
|
||||
TensorType,
|
||||
_is_jax,
|
||||
_is_numpy,
|
||||
@@ -1415,7 +1416,7 @@ INIT_TOKENIZER_DOCSTRING = r"""
|
||||
|
||||
|
||||
@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
|
||||
class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
"""
|
||||
Base class for :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`.
|
||||
|
||||
@@ -1850,6 +1851,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
save_directory: Union[str, os.PathLike],
|
||||
legacy_format: Optional[bool] = None,
|
||||
filename_prefix: Optional[str] = None,
|
||||
push_to_hub: bool = False,
|
||||
**kwargs,
|
||||
) -> Tuple[str]:
|
||||
"""
|
||||
Save the full tokenizer state.
|
||||
@@ -1925,13 +1928,21 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
file_names = (tokenizer_config_file, special_tokens_map_file)
|
||||
|
||||
return self._save_pretrained(
|
||||
save_files = self._save_pretrained(
|
||||
save_directory=save_directory,
|
||||
file_names=file_names,
|
||||
legacy_format=legacy_format,
|
||||
filename_prefix=filename_prefix,
|
||||
)
|
||||
|
||||
if push_to_hub:
|
||||
# Annoyingly, the return contains files that don't exist.
|
||||
existing_files = [f for f in save_files if os.path.isfile(f)]
|
||||
url = self._push_to_hub(save_files=existing_files, **kwargs)
|
||||
logger.info(f"Tokenizer pushed to the hub in this commit: {url}")
|
||||
|
||||
return save_files
|
||||
|
||||
def _save_pretrained(
|
||||
self,
|
||||
save_directory: Union[str, os.PathLike],
|
||||
|
||||
Reference in New Issue
Block a user