diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index 4bf2e0fa6d..a28d4cf063 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -3,11 +3,11 @@ Pretrained models Here is the full list of the currently provided pretrained models together with a short presentation of each model. -For a list that includes community-uploaded models, refer to `https://huggingface.co/models +For a list that includes all community-uploaded models, refer to `https://huggingface.co/models `__. +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| Architecture | Shortcut name | Details of the model | +| Architecture | Model id | Details of the model | +====================+============================================================+=======================================================================================================================================+ | BERT | ``bert-base-uncased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | | | | | Trained on lower-cased English text. 
| diff --git a/examples/adversarial/run_hans.py b/examples/adversarial/run_hans.py index 6bde583282..9cc6a0a86e 100644 --- a/examples/adversarial/run_hans.py +++ b/examples/adversarial/run_hans.py @@ -57,7 +57,8 @@ class ModelArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} ) cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) diff --git a/examples/bert-loses-patience/run_glue_with_pabee.py b/examples/bert-loses-patience/run_glue_with_pabee.py index 37eb76a511..1ac84f28d3 100755 --- a/examples/bert-loses-patience/run_glue_with_pabee.py +++ b/examples/bert-loses-patience/run_glue_with_pabee.py @@ -476,7 +476,7 @@ def main(): "--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3", + help="Where do you want to store the pre-trained models downloaded from huggingface.co", ) parser.add_argument( "--max_seq_length", diff --git a/examples/bertology/run_bertology.py b/examples/bertology/run_bertology.py index 340b2f2cc2..d0eef30430 100644 --- a/examples/bertology/run_bertology.py +++ b/examples/bertology/run_bertology.py @@ -298,7 +298,7 @@ def main(): "--cache_dir", default=None, type=str, - help="Where do you want to store the pre-trained models downloaded from s3", + help="Where do you want to store the pre-trained models downloaded from huggingface.co", ) parser.add_argument( "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances." 
diff --git a/examples/contrib/legacy/run_language_modeling.py b/examples/contrib/legacy/run_language_modeling.py index 3b52a52229..4b9f272a0a 100644 --- a/examples/contrib/legacy/run_language_modeling.py +++ b/examples/contrib/legacy/run_language_modeling.py @@ -81,7 +81,8 @@ class ModelArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} ) cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) diff --git a/examples/contrib/mm-imdb/run_mmimdb.py b/examples/contrib/mm-imdb/run_mmimdb.py index cd2d47f6fe..d948a5a62d 100644 --- a/examples/contrib/mm-imdb/run_mmimdb.py +++ b/examples/contrib/mm-imdb/run_mmimdb.py @@ -350,7 +350,7 @@ def main(): "--cache_dir", default=None, type=str, - help="Where do you want to store the pre-trained models downloaded from s3", + help="Where do you want to store the pre-trained models downloaded from huggingface.co", ) parser.add_argument( "--max_seq_length", diff --git a/examples/deebert/run_glue_deebert.py b/examples/deebert/run_glue_deebert.py index 8361553984..7e415d0939 100644 --- a/examples/deebert/run_glue_deebert.py +++ b/examples/deebert/run_glue_deebert.py @@ -452,7 +452,7 @@ def main(): "--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3", + help="Where do you want to store the pre-trained models downloaded from huggingface.co", ) parser.add_argument( "--max_seq_length", diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py index 3a8af71267..3429bf1cbe 100644 --- a/examples/distillation/run_squad_w_distillation.py +++ b/examples/distillation/run_squad_w_distillation.py @@ -578,7 +578,7 @@ def main(): "--cache_dir", default="", type=str, - 
help="Where do you want to store the pre-trained models downloaded from s3", + help="Where do you want to store the pre-trained models downloaded from huggingface.co", ) parser.add_argument( diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py index be75a4ee38..396631b9ff 100644 --- a/examples/language-modeling/run_clm.py +++ b/examples/language-modeling/run_clm.py @@ -76,7 +76,8 @@ class ModelArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} ) cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) use_fast_tokenizer: bool = field( default=True, diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py index 35e8a03df2..dfc2614a72 100644 --- a/examples/language-modeling/run_mlm.py +++ b/examples/language-modeling/run_mlm.py @@ -74,7 +74,8 @@ class ModelArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} ) cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) use_fast_tokenizer: bool = field( default=True, diff --git a/examples/language-modeling/run_mlm_wwm.py b/examples/language-modeling/run_mlm_wwm.py index 557282a742..b2ffcc34ac 100644 --- a/examples/language-modeling/run_mlm_wwm.py +++ b/examples/language-modeling/run_mlm_wwm.py @@ -76,7 +76,8 @@ class ModelArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} ) cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the 
pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) use_fast_tokenizer: bool = field( default=True, diff --git a/examples/language-modeling/run_plm.py b/examples/language-modeling/run_plm.py index 5969fd98b5..65700a415c 100644 --- a/examples/language-modeling/run_plm.py +++ b/examples/language-modeling/run_plm.py @@ -64,7 +64,8 @@ class ModelArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} ) cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) use_fast_tokenizer: bool = field( default=True, diff --git a/examples/lightning_base.py b/examples/lightning_base.py index 8ceee24979..0c4913e15f 100644 --- a/examples/lightning_base.py +++ b/examples/lightning_base.py @@ -236,7 +236,7 @@ class BaseTransformer(pl.LightningModule): "--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3", + help="Where do you want to store the pre-trained models downloaded from huggingface.co", ) parser.add_argument( "--encoder_layerdrop", diff --git a/examples/movement-pruning/masked_run_glue.py b/examples/movement-pruning/masked_run_glue.py index b07fe03d29..0657aa24ce 100644 --- a/examples/movement-pruning/masked_run_glue.py +++ b/examples/movement-pruning/masked_run_glue.py @@ -620,7 +620,7 @@ def main(): "--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3", + help="Where do you want to store the pre-trained models downloaded from huggingface.co", ) parser.add_argument( "--max_seq_length", diff --git a/examples/movement-pruning/masked_run_squad.py b/examples/movement-pruning/masked_run_squad.py index 
56d13b6f97..979649a6be 100644 --- a/examples/movement-pruning/masked_run_squad.py +++ b/examples/movement-pruning/masked_run_squad.py @@ -725,7 +725,7 @@ def main(): "--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3", + help="Where do you want to store the pre-trained models downloaded from huggingface.co", ) parser.add_argument( diff --git a/examples/multiple-choice/run_multiple_choice.py b/examples/multiple-choice/run_multiple_choice.py index b4b77f347e..efa9a6d389 100644 --- a/examples/multiple-choice/run_multiple_choice.py +++ b/examples/multiple-choice/run_multiple_choice.py @@ -61,7 +61,8 @@ class ModelArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} ) cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) diff --git a/examples/multiple-choice/run_tf_multiple_choice.py b/examples/multiple-choice/run_tf_multiple_choice.py index 9bdc4c6d03..85d9f2127f 100644 --- a/examples/multiple-choice/run_tf_multiple_choice.py +++ b/examples/multiple-choice/run_tf_multiple_choice.py @@ -65,7 +65,8 @@ class ModelArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} ) cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) diff --git a/examples/question-answering/run_squad.py b/examples/question-answering/run_squad.py index d717c17e01..ff693ad24d 100644 --- a/examples/question-answering/run_squad.py +++ b/examples/question-answering/run_squad.py @@ -532,7 +532,7 @@ def main(): "--cache_dir", 
default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3", + help="Where do you want to store the pre-trained models downloaded from huggingface.co", ) parser.add_argument( diff --git a/examples/question-answering/run_squad_trainer.py b/examples/question-answering/run_squad_trainer.py index f71c7c09de..e49e2458a8 100644 --- a/examples/question-answering/run_squad_trainer.py +++ b/examples/question-answering/run_squad_trainer.py @@ -51,7 +51,8 @@ class ModelArguments: # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script, # or just modify its tokenizer_config.json. cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) diff --git a/examples/question-answering/run_tf_squad.py b/examples/question-answering/run_tf_squad.py index fb712964c6..1632d2d1d9 100644 --- a/examples/question-answering/run_tf_squad.py +++ b/examples/question-answering/run_tf_squad.py @@ -63,7 +63,8 @@ class ModelArguments: # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script, # or just modify its tokenizer_config.json. 
cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) diff --git a/examples/rag/finetune.sh b/examples/rag/finetune.sh index 08a6d0a846..ce82070aaa 100755 --- a/examples/rag/finetune.sh +++ b/examples/rag/finetune.sh @@ -7,7 +7,7 @@ export PYTHONPATH="../":"${PYTHONPATH}" python examples/rag/finetune.py \ --data_dir $DATA_DIR \ --output_dir $OUTPUT_DIR \ - --model_name_or_path $MODLE_NAME_OR_PATH \ + --model_name_or_path $MODEL_NAME_OR_PATH \ --model_type rag_sequence \ --fp16 \ --gpus 8 \ diff --git a/examples/seq2seq/finetune_trainer.py b/examples/seq2seq/finetune_trainer.py index a37344958c..2243ebd9e4 100644 --- a/examples/seq2seq/finetune_trainer.py +++ b/examples/seq2seq/finetune_trainer.py @@ -43,7 +43,8 @@ class ModelArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} ) cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) freeze_encoder: bool = field(default=False, metadata={"help": "Whether tp freeze the encoder."}) freeze_embeds: bool = field(default=False, metadata={"help": "Whether to freeze the embeddings."}) diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py index d15bc5a9f1..941b3c84d0 100644 --- a/examples/text-classification/run_glue.py +++ b/examples/text-classification/run_glue.py @@ -124,7 +124,8 @@ class ModelArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} ) cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models 
downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) use_fast_tokenizer: bool = field( default=True, diff --git a/examples/text-classification/run_tf_glue.py b/examples/text-classification/run_tf_glue.py index 538134ed6d..3434393436 100644 --- a/examples/text-classification/run_tf_glue.py +++ b/examples/text-classification/run_tf_glue.py @@ -117,7 +117,8 @@ class ModelArguments: # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script, # or just modify its tokenizer_config.json. cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) diff --git a/examples/text-classification/run_tf_text_classification.py b/examples/text-classification/run_tf_text_classification.py index 4dfea9f894..880f0f2aac 100644 --- a/examples/text-classification/run_tf_text_classification.py +++ b/examples/text-classification/run_tf_text_classification.py @@ -182,7 +182,8 @@ class ModelArguments: # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script, # or just modify its tokenizer_config.json. 
cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) diff --git a/examples/text-classification/run_xnli.py b/examples/text-classification/run_xnli.py index c7482814e0..19d3d040ed 100644 --- a/examples/text-classification/run_xnli.py +++ b/examples/text-classification/run_xnli.py @@ -406,7 +406,7 @@ def main(): "--cache_dir", default=None, type=str, - help="Where do you want to store the pre-trained models downloaded from s3", + help="Where do you want to store the pre-trained models downloaded from huggingface.co", ) parser.add_argument( "--max_seq_length", diff --git a/examples/token-classification/run_ner.py b/examples/token-classification/run_ner.py index 62cd47a0ba..718927f3eb 100644 --- a/examples/token-classification/run_ner.py +++ b/examples/token-classification/run_ner.py @@ -60,7 +60,8 @@ class ModelArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} ) cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) diff --git a/examples/token-classification/run_ner_old.py b/examples/token-classification/run_ner_old.py index bcb92c5dee..e97048ecef 100644 --- a/examples/token-classification/run_ner_old.py +++ b/examples/token-classification/run_ner_old.py @@ -65,7 +65,8 @@ class ModelArguments: # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script, # or just modify its tokenizer_config.json. 
cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) diff --git a/examples/token-classification/run_tf_ner.py b/examples/token-classification/run_tf_ner.py index adb9094b9c..7b5e0d1926 100644 --- a/examples/token-classification/run_tf_ner.py +++ b/examples/token-classification/run_tf_ner.py @@ -67,7 +67,8 @@ class ModelArguments: # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script, # or just modify its tokenizer_config.json. cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) diff --git a/hubconf.py b/hubconf.py index 86abe39ebd..578b8866ac 100644 --- a/hubconf.py +++ b/hubconf.py @@ -25,7 +25,7 @@ def config(*args, **kwargs): # Using torch.hub ! import torch - config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased') # Download configuration from S3 and cache. + config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased') # Download configuration from huggingface.co and cache. config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/my_configuration.json') config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attentions=True, foo=False) @@ -45,7 +45,7 @@ def tokenizer(*args, **kwargs): # Using torch.hub ! 
import torch - tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from S3 and cache. + tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from huggingface.co and cache. tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', './test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')` """ @@ -59,7 +59,7 @@ def model(*args, **kwargs): # Using torch.hub ! import torch - model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased') # Download model and configuration from huggingface.co and cache. model = torch.hub.load('huggingface/transformers', 'model', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased', output_attentions=True) # Update configuration during loading assert model.config.output_attentions == True @@ -78,7 +78,7 @@ def modelWithLMHead(*args, **kwargs): # Using torch.hub ! import torch - model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from huggingface.co and cache. model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased', output_attentions=True) # Update configuration during loading assert model.config.output_attentions == True @@ -96,7 +96,7 @@ def modelForSequenceClassification(*args, **kwargs): # Using torch.hub ! import torch - model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from huggingface.co and cache. model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attentions=True) # Update configuration during loading assert model.config.output_attentions == True @@ -115,7 +115,7 @@ def modelForQuestionAnswering(*args, **kwargs): # Using torch.hub ! import torch - model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from huggingface.co and cache. model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attentions=True) # Update configuration during loading assert model.config.output_attentions == True diff --git a/src/transformers/commands/user.py b/src/transformers/commands/user.py index 9124f80a85..442e88b5c5 100644 --- a/src/transformers/commands/user.py +++ b/src/transformers/commands/user.py @@ -31,7 +31,7 @@ class UserCommands(BaseTransformersCLICommand): ls_parser.add_argument("--organization", type=str, help="Optional: organization namespace.") ls_parser.set_defaults(func=lambda args: ListObjsCommand(args)) rm_parser = s3_subparsers.add_parser("rm") - rm_parser.add_argument("filename", type=str, help="individual object filename to delete from S3.") + rm_parser.add_argument("filename", type=str, help="individual object filename to delete from huggingface.co.") rm_parser.add_argument("--organization", type=str, help="Optional: organization namespace.") rm_parser.set_defaults(func=lambda args: DeleteObjCommand(args)) upload_parser = s3_subparsers.add_parser("upload", help="Upload a file to S3.") diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 94e85b9629..4e55c4db65 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -291,10 +291,9 @@ class PretrainedConfig(object): pretrained_model_name_or_path (:obj:`str`): This can be either: - - the `shortcut name` of a pretrained model configuration to load from cache or download, e.g., - ``bert-base-uncased``. - - the `identifier name` of a pretrained model configuration that was uploaded to our S3 by any user, - e.g., ``dbmdz/bert-base-german-cased``. + - a string, the `model id` of a pretrained model configuration hosted inside a model repo on + huggingface.co. 
Valid model ids can be located at the root-level, like ``bert-base-uncased``, or + namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g., ``./my_model_directory/``. - a path or url to a saved configuration JSON `file`, e.g., @@ -333,7 +332,7 @@ class PretrainedConfig(object): # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a # derived class: BertConfig - config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from huggingface.co and cache. config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False) diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 8315a781f9..dc9998d633 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -855,7 +855,9 @@ def is_remote_url(url_or_filename): return parsed.scheme in ("http", "https") -def hf_bucket_url(model_id: str, filename: str, revision: Optional[str] = None, mirror=None) -> str: +def hf_bucket_url( + model_id: str, filename: str, subfolder: Optional[str] = None, revision: Optional[str] = None, mirror=None +) -> str: """ Resolve a model identifier, a file name, and an optional revision id, to a huggingface.co-hosted url, redirecting to Cloudfront (a Content Delivery Network, or CDN) for large files. @@ -872,6 +874,9 @@ def hf_bucket_url(model_id: str, filename: str, revision: Optional[str] = None, its sha1 if stored in git, or its sha256 if stored in git-lfs. 
Files cached locally from transformers before v3.5.0 are not shared with those new files, because the cached file's name contains a hash of the url (which changed). """ + if subfolder is not None: + filename = f"{subfolder}/{filename}" + if mirror: endpoint = PRESET_MIRROR_DICT.get(mirror, mirror) legacy_format = "/" not in model_id diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index d61ee8f673..2e2c555e83 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -148,12 +148,12 @@ class TFGenerationMixin: Examples:: tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. + model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from huggingface.co and cache. outputs = model.generate(max_length=40) # do greedy decoding print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. + model = TFAutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from huggingface.co and cache. 
input_context = 'The dog' input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' @@ -161,7 +161,7 @@ class TFGenerationMixin: print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. + model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from huggingface.co and cache. input_context = 'The dog' input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True) # generate 3 candidates using sampling @@ -169,14 +169,14 @@ class TFGenerationMixin: print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. + model = TFAutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from huggingface.co and cache. 
input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. + model = TFAutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from huggingface.co and cache. input_context = 'My cute dog' bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py index 2cd3673055..2daab84649 100644 --- a/src/transformers/modelcard.py +++ b/src/transformers/modelcard.py @@ -87,10 +87,9 @@ class ModelCard: Parameters: pretrained_model_name_or_path: either: - - a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: - ``bert-base-uncased``. - - a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, - e.g.: ``dbmdz/bert-base-german-cased``. + - a string, the `model id` of a pretrained model card hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a + user or organization name, like ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing a model card file saved using the :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/modelcard.json``. 
@@ -124,7 +123,7 @@ class ModelCard: Examples:: - modelcard = ModelCard.from_pretrained('bert-base-uncased') # Download model card from S3 and cache. + modelcard = ModelCard.from_pretrained('bert-base-uncased') # Download model card from huggingface.co and cache. modelcard = ModelCard.from_pretrained('./test/saved_model/') # E.g. model card was saved using `save_pretrained('./test/saved_model/')` modelcard = ModelCard.from_pretrained('./test/saved_model/modelcard.json') modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attentions=True, foo=False) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 2de2b1f0ee..63d1917e1e 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -544,10 +544,9 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin): pretrained_model_name_or_path (:obj:`str`, `optional`): Can be either: - - A string with the `shortcut name` of a pretrained model to load from cache or download, e.g., - ``bert-base-uncased``. - - A string with the `identifier name` of a pretrained model that was user-uploaded to our S3, e.g., - ``dbmdz/bert-base-german-cased``. + - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under + a user or organization name, like ``dbmdz/bert-base-german-cased``. - A path to a `directory` containing model weights saved using :func:`~transformersTF.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``. - A path or url to a `PyTorch state_dict save file` (e.g, ``./pt_model/pytorch_model.bin``). In @@ -568,8 +567,8 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin): Configuration for the model to use instead of an automatically loaded configuation. 
Configuration can be automatically loaded when: - - The model is a model provided by the library (loaded with the `shortcut name` string of a - pretrained model). + - The model is a model provided by the library (loaded with the `model id` string of a pretrained + model). - The model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by supplying the save directory. - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a @@ -618,7 +617,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin): Examples:: >>> from transformers import BertConfig, TFBertModel - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = TFBertModel.from_pretrained('bert-base-uncased') >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable). >>> model = TFBertModel.from_pretrained('./test/saved_model/') diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 793f429f6e..ead4f192ee 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -758,10 +758,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): pretrained_model_name_or_path (:obj:`str`, `optional`): Can be either: - - A string with the `shortcut name` of a pretrained model to load from cache or download, e.g., - ``bert-base-uncased``. - - A string with the `identifier name` of a pretrained model that was user-uploaded to our S3, e.g., - ``dbmdz/bert-base-german-cased``. + - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under + a user or organization name, like ``dbmdz/bert-base-german-cased``. 
- A path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``. - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In @@ -781,8 +780,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - - The model is a model provided by the library (loaded with the `shortcut name` string of a - pretrained model). + - The model is a model provided by the library (loaded with the `model id` string of a pretrained + model). - The model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory. - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a @@ -838,7 +837,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): Examples:: >>> from transformers import BertConfig, BertModel - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = BertModel.from_pretrained('bert-base-uncased') >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable). >>> model = BertModel.from_pretrained('./test/saved_model/') diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 8bc1f0ebd8..b998ff8d2a 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -277,10 +277,9 @@ class AutoConfig: pretrained_model_name_or_path (:obj:`str`): Can be either: - - A string with the `shortcut name` of a pretrained model configuration to load from cache or - download, e.g., ``bert-base-uncased``. 
- - A string with the `identifier name` of a pretrained model configuration that was user-uploaded to - our S3, e.g., ``dbmdz/bert-base-german-cased``. + - A string, the `model id` of a pretrained model configuration hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or + namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. - A path to a `directory` containing a configuration file saved using the :meth:`~transformers.PretrainedConfig.save_pretrained` method, or the :meth:`~transformers.PreTrainedModel.save_pretrained` method, e.g., ``./my_model_directory/``. @@ -317,10 +316,10 @@ class AutoConfig: >>> from transformers import AutoConfig - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('bert-base-uncased') - >>> # Download configuration from S3 (user-uploaded) and cache. + >>> # Download configuration from huggingface.co (user-uploaded) and cache. >>> config = AutoConfig.from_pretrained('dbmdz/bert-base-german-cased') >>> # If configuration file is in a directory (e.g., was saved using `save_pretrained('./test/saved_model/')`). diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index fe8c7bb4da..b056dc2790 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -505,10 +505,9 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r""" pretrained_model_name_or_path: Can be either: - - A string with the `shortcut name` of a pretrained model to load from cache or download, e.g., - ``bert-base-uncased``. - - A string with the `identifier name` of a pretrained model that was user-uploaded to our S3, e.g., - ``dbmdz/bert-base-german-cased``. + - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co. 
+ Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under + a user or organization name, like ``dbmdz/bert-base-german-cased``. - A path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``. - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In @@ -521,8 +520,8 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r""" Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when: - - The model is a model provided by the library (loaded with the `shortcut name` string of a - pretrained model). + - The model is a model provided by the library (loaded with the `model id` string of a pretrained + model). - The model was saved using :meth:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory. - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a @@ -608,7 +607,7 @@ class AutoModel: Examples:: >>> from transformers import AutoConfig, AutoModel - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = AutoModel.from_config(config) """ @@ -634,7 +633,7 @@ class AutoModel: >>> from transformers import AutoConfig, AutoModel - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = AutoModel.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -702,7 +701,7 @@ class AutoModelForPreTraining: Examples:: >>> from transformers import AutoConfig, AutoModelForPreTraining - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. 
>>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = AutoModelForPreTraining.from_config(config) """ @@ -728,7 +727,7 @@ class AutoModelForPreTraining: >>> from transformers import AutoConfig, AutoModelForPreTraining - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = AutoModelForPreTraining.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -802,7 +801,7 @@ class AutoModelWithLMHead: Examples:: >>> from transformers import AutoConfig, AutoModelWithLMHead - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = AutoModelWithLMHead.from_config(config) """ @@ -834,7 +833,7 @@ class AutoModelWithLMHead: >>> from transformers import AutoConfig, AutoModelWithLMHead - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = AutoModelWithLMHead.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -908,7 +907,7 @@ class AutoModelForCausalLM: Examples:: >>> from transformers import AutoConfig, AutoModelForCausalLM - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('gpt2') >>> model = AutoModelForCausalLM.from_config(config) """ @@ -934,7 +933,7 @@ class AutoModelForCausalLM: >>> from transformers import AutoConfig, AutoModelForCausalLM - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. 
>>> model = AutoModelForCausalLM.from_pretrained('gpt2') >>> # Update configuration during loading @@ -1002,7 +1001,7 @@ class AutoModelForMaskedLM: Examples:: >>> from transformers import AutoConfig, AutoModelForMaskedLM - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = AutoModelForMaskedLM.from_config(config) """ @@ -1028,7 +1027,7 @@ class AutoModelForMaskedLM: >>> from transformers import AutoConfig, AutoModelForMaskedLM - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -1096,7 +1095,7 @@ class AutoModelForSeq2SeqLM: Examples:: >>> from transformers import AutoConfig, AutoModelForSeq2SeqLM - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('t5') >>> model = AutoModelForSeq2SeqLM.from_config(config) """ @@ -1124,7 +1123,7 @@ class AutoModelForSeq2SeqLM: >>> from transformers import AutoConfig, AutoModelForSeq2SeqLM - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = AutoModelForSeq2SeqLM.from_pretrained('t5-base') >>> # Update configuration during loading @@ -1194,7 +1193,7 @@ class AutoModelForSequenceClassification: Examples:: >>> from transformers import AutoConfig, AutoModelForSequenceClassification - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. 
>>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = AutoModelForSequenceClassification.from_config(config) """ @@ -1222,7 +1221,7 @@ class AutoModelForSequenceClassification: >>> from transformers import AutoConfig, AutoModelForSequenceClassification - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -1291,7 +1290,7 @@ class AutoModelForQuestionAnswering: Examples:: >>> from transformers import AutoConfig, AutoModelForQuestionAnswering - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = AutoModelForQuestionAnswering.from_config(config) """ @@ -1320,7 +1319,7 @@ class AutoModelForQuestionAnswering: >>> from transformers import AutoConfig, AutoModelForQuestionAnswering - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -1390,7 +1389,7 @@ class AutoModelForTokenClassification: Examples:: >>> from transformers import AutoConfig, AutoModelForTokenClassification - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = AutoModelForTokenClassification.from_config(config) """ @@ -1419,7 +1418,7 @@ class AutoModelForTokenClassification: >>> from transformers import AutoConfig, AutoModelForTokenClassification - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. 
>>> model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -1490,7 +1489,7 @@ class AutoModelForMultipleChoice: Examples:: >>> from transformers import AutoConfig, AutoModelForMultipleChoice - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = AutoModelForMultipleChoice.from_config(config) """ @@ -1519,7 +1518,7 @@ class AutoModelForMultipleChoice: >>> from transformers import AutoConfig, AutoModelForMultipleChoice - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = AutoModelForMultipleChoice.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -1590,7 +1589,7 @@ class AutoModelForNextSentencePrediction: Examples:: >>> from transformers import AutoConfig, AutoModelForNextSentencePrediction - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = AutoModelForNextSentencePrediction.from_config(config) """ @@ -1619,7 +1618,7 @@ class AutoModelForNextSentencePrediction: >>> from transformers import AutoConfig, AutoModelForNextSentencePrediction - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. 
>>> model = AutoModelForNextSentencePrediction.from_pretrained('bert-base-uncased') >>> # Update configuration during loading diff --git a/src/transformers/models/auto/modeling_flax_auto.py b/src/transformers/models/auto/modeling_flax_auto.py index 642815a7fc..bc44f88112 100644 --- a/src/transformers/models/auto/modeling_flax_auto.py +++ b/src/transformers/models/auto/modeling_flax_auto.py @@ -75,7 +75,7 @@ class FlaxAutoModel(object): Examples:: config = BertConfig.from_pretrained('bert-base-uncased') - # Download configuration from S3 and cache. + # Download configuration from huggingface.co and cache. model = FlaxAutoModel.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ @@ -109,10 +109,9 @@ class FlaxAutoModel(object): Args: pretrained_model_name_or_path: either: - - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: - ``bert-base-uncased``. - - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: - ``dbmdz/bert-base-german-cased``. + - a string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co. Valid + model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a user or + organization name, like ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers.FlaxPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this @@ -165,7 +164,7 @@ class FlaxAutoModel(object): Examples:: - model = FlaxAutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = FlaxAutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from huggingface.co and cache. model = FlaxAutoModel.from_pretrained('./test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` assert model.config.output_attention == True diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index a1c34a137b..291b324307 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -403,10 +403,9 @@ TF_AUTO_MODEL_PRETRAINED_DOCSTRING = r""" pretrained_model_name_or_path: Can be either: - - A string with the `shortcut name` of a pretrained model to load from cache or download, e.g., - ``bert-base-uncased``. - - A string with the `identifier name` of a pretrained model that was user-uploaded to our S3, e.g., - ``dbmdz/bert-base-german-cased``. + - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under + a user or organization name, like ``dbmdz/bert-base-german-cased``. - A path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``. - A path or url to a `PyTorch state_dict save file` (e.g, ``./pt_model/pytorch_model.bin``). In @@ -420,8 +419,8 @@ TF_AUTO_MODEL_PRETRAINED_DOCSTRING = r""" Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when: - - The model is a model provided by the library (loaded with the `shortcut name` string of a - pretrained model). + - The model is a model provided by the library (loaded with the `model id` string of a pretrained + model). - The model was saved using :meth:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppyling the save directory. 
- The model is loaded by suppyling a local directory as ``pretrained_model_name_or_path`` and a @@ -507,7 +506,7 @@ class TFAutoModel(object): Examples:: >>> from transformers import AutoConfig, TFAutoModel - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = TFAutoConfig.from_pretrained('bert-base-uncased') >>> model = TFAutoModel.from_config(config) """ @@ -533,7 +532,7 @@ class TFAutoModel(object): >>> from transformers import AutoConfig, AutoModel - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = TFAutoModel.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -601,7 +600,7 @@ class TFAutoModelForPreTraining(object): Examples:: >>> from transformers import AutoConfig, TFAutoModelForPreTraining - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = TFAutoModelForPreTraining.from_config(config) """ @@ -627,7 +626,7 @@ class TFAutoModelForPreTraining(object): >>> from transformers import AutoConfig, TFAutoModelForPreTraining - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = TFAutoModelForPreTraining.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -701,7 +700,7 @@ class TFAutoModelWithLMHead(object): Examples:: >>> from transformers import AutoConfig, TFAutoModelWithLMHead - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. 
>>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = TFAutoModelWithLMHead.from_config(config) """ @@ -733,7 +732,7 @@ class TFAutoModelWithLMHead(object): >>> from transformers import AutoConfig, TFAutoModelWithLMHead - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -808,7 +807,7 @@ class TFAutoModelForCausalLM: Examples:: >>> from transformers import AutoConfig, TFAutoModelForCausalLM - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('gpt2') >>> model = TFAutoModelForCausalLM.from_config(config) """ @@ -834,7 +833,7 @@ class TFAutoModelForCausalLM: >>> from transformers import AutoConfig, TFAutoModelForCausalLM - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = TFAutoModelForCausalLM.from_pretrained('gpt2') >>> # Update configuration during loading @@ -902,7 +901,7 @@ class TFAutoModelForMaskedLM: Examples:: >>> from transformers import AutoConfig, TFAutoModelForMaskedLM - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = TFAutoModelForMaskedLM.from_config(config) """ @@ -928,7 +927,7 @@ class TFAutoModelForMaskedLM: >>> from transformers import AutoConfig, TFAutoModelForMaskedLM - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. 
>>> model = TFAutoModelForMaskedLM.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -996,7 +995,7 @@ class TFAutoModelForSeq2SeqLM: Examples:: >>> from transformers import AutoConfig, TFAutoModelForSeq2SeqLM - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('t5') >>> model = TFAutoModelForSeq2SeqLM.from_config(config) """ @@ -1024,7 +1023,7 @@ class TFAutoModelForSeq2SeqLM: >>> from transformers import AutoConfig, TFAutoModelForSeq2SeqLM - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = TFAutoModelForSeq2SeqLM.from_pretrained('t5-base') >>> # Update configuration during loading @@ -1094,7 +1093,7 @@ class TFAutoModelForSequenceClassification(object): Examples:: >>> from transformers import AutoConfig, TFAutoModelForSequenceClassification - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = TFAutoModelForSequenceClassification.from_config(config) """ @@ -1122,7 +1121,7 @@ class TFAutoModelForSequenceClassification(object): >>> from transformers import AutoConfig, TFAutoModelForSequenceClassification - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -1191,7 +1190,7 @@ class TFAutoModelForQuestionAnswering(object): Examples:: >>> from transformers import AutoConfig, TFAutoModelForQuestionAnswering - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. 
>>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = TFAutoModelForQuestionAnswering.from_config(config) """ @@ -1219,7 +1218,7 @@ class TFAutoModelForQuestionAnswering(object): >>> from transformers import AutoConfig, TFAutoModelForQuestionAnswering - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -1288,7 +1287,7 @@ class TFAutoModelForTokenClassification: Examples:: >>> from transformers import AutoConfig, TFAutoModelForTokenClassification - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = TFAutoModelForTokenClassification.from_config(config) """ @@ -1316,7 +1315,7 @@ class TFAutoModelForTokenClassification: >>> from transformers import AutoConfig, TFAutoModelForTokenClassification - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = TFAutoModelForTokenClassification.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -1386,7 +1385,7 @@ class TFAutoModelForMultipleChoice: Examples:: >>> from transformers import AutoConfig, TFAutoModelForMultipleChoice - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = TFAutoModelForMultipleChoice.from_config(config) """ @@ -1414,7 +1413,7 @@ class TFAutoModelForMultipleChoice: >>> from transformers import AutoConfig, TFAutoModelForMultipleChoice - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. 
>>> model = TFAutoModelForMultipleChoice.from_pretrained('bert-base-uncased') >>> # Update configuration during loading @@ -1484,7 +1483,7 @@ class TFAutoModelForNextSentencePrediction: Examples:: >>> from transformers import AutoConfig, TFAutoModelForNextSentencePrediction - >>> # Download configuration from S3 and cache. + >>> # Download configuration from huggingface.co and cache. >>> config = AutoConfig.from_pretrained('bert-base-uncased') >>> model = TFAutoModelForNextSentencePrediction.from_config(config) """ @@ -1512,7 +1511,7 @@ class TFAutoModelForNextSentencePrediction: >>> from transformers import AutoConfig, TFAutoModelForNextSentencePrediction - >>> # Download model and configuration from S3 and cache. + >>> # Download model and configuration from huggingface.co and cache. >>> model = TFAutoModelForNextSentencePrediction.from_pretrained('bert-base-uncased') >>> # Update configuration during loading diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index ea5113bd1c..5619b77333 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -250,10 +250,9 @@ class AutoTokenizer: pretrained_model_name_or_path (:obj:`str`): Can be either: - - A string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g., - ``bert-base-uncased``. - - A string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, - e.g., ``dbmdz/bert-base-german-cased``. + - A string, the `model id` of a predefined tokenizer hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under + a user or organization name, like ``dbmdz/bert-base-german-cased``. 
- A path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g., ``./my_model_directory/``. @@ -280,6 +279,9 @@ class AutoTokenizer: The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any identifier allowed by git. + subfolder (:obj:`str`, `optional`): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for + facebook/rag-token-base), specify it here. use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to try to load the fast version of the tokenizer. kwargs (additional keyword arguments, `optional`): @@ -291,10 +293,10 @@ class AutoTokenizer: >>> from transformers import AutoTokenizer - >>> # Download vocabulary from S3 and cache. + >>> # Download vocabulary from huggingface.co and cache. >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') - >>> # Download vocabulary from S3 (user-uploaded) and cache. + >>> # Download vocabulary from huggingface.co (user-uploaded) and cache. >>> tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased') >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index c16514ac6c..956ddfb0f8 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -214,10 +214,9 @@ class EncoderDecoderModel(PreTrainedModel): encoder_pretrained_model_name_or_path (:obj: `str`, `optional`): Information necessary to initiate the encoder. 
Can be either: - - A string with the `shortcut name` of a pretrained model to load from cache or download, e.g., - ``bert-base-uncased``. - - A string with the `identifier name` of a pretrained model that was user-uploaded to our S3, e.g., - ``dbmdz/bert-base-german-cased``. + - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under + a user or organization name, like ``dbmdz/bert-base-german-cased``. - A path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``. - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In @@ -228,10 +227,9 @@ class EncoderDecoderModel(PreTrainedModel): decoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`): Information necessary to initiate the decoder. Can be either: - - A string with the `shortcut name` of a pretrained model to load from cache or download, e.g., - ``bert-base-uncased``. - - A string with the `identifier name` of a pretrained model that was user-uploaded to our S3, e.g., - ``dbmdz/bert-base-german-cased``. + - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under + a user or organization name, like ``dbmdz/bert-base-german-cased``. - A path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``. - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). 
In diff --git a/src/transformers/models/lxmert/tokenization_lxmert.py b/src/transformers/models/lxmert/tokenization_lxmert.py index fe12a95a34..159e3c1b72 100644 --- a/src/transformers/models/lxmert/tokenization_lxmert.py +++ b/src/transformers/models/lxmert/tokenization_lxmert.py @@ -24,7 +24,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` -# to pretrained vocabulary URL for all the model shortcut names. +# to pretrained vocabulary URL for all the model ids. #################################################### PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { @@ -33,13 +33,13 @@ PRETRAINED_VOCAB_FILES_MAP = { } #################################################### -# Mapping from model shortcut names to max length of inputs +# Mapping from model ids to max length of inputs #################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "unc-nlp/lxmert-base-uncased": 512, } #################################################### -# Mapping from model shortcut names to a dictionary of additional +# Mapping from model ids to a dictionary of additional # keyword arguments for Tokenizer `__init__`. # To be used for checkpoint specific configurations. #################################################### diff --git a/src/transformers/models/lxmert/tokenization_lxmert_fast.py b/src/transformers/models/lxmert/tokenization_lxmert_fast.py index bace6236b1..d2bb378544 100644 --- a/src/transformers/models/lxmert/tokenization_lxmert_fast.py +++ b/src/transformers/models/lxmert/tokenization_lxmert_fast.py @@ -25,7 +25,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.jso #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` -# to pretrained vocabulary URL for all the model shortcut names. 
+# to pretrained vocabulary URL for all the model ids. #################################################### PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { @@ -37,13 +37,13 @@ PRETRAINED_VOCAB_FILES_MAP = { } #################################################### -# Mapping from model shortcut names to max length of inputs +# Mapping from model ids to max length of inputs #################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "unc-nlp/lxmert-base-uncased": 512, } #################################################### -# Mapping from model shortcut names to a dictionary of additional +# Mapping from model ids to a dictionary of additional # keyword arguments for Tokenizer `__init__`. # To be used for checkpoint specific configurations. #################################################### diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index 236aa4cfb9..f32b3f036a 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -238,10 +238,9 @@ class RagPreTrainedModel(PreTrainedModel): question_encoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`): Information necessary to initiate the question encoder. Can be either: - - A string with the `shortcut name` of a pretrained model to load from cache or download, e.g., - ``bert-base-uncased``. - - A string with the `identifier name` of a pretrained model that was user-uploaded to our S3, e.g., - ``dbmdz/bert-base-german-cased``. + - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under + a user or organization name, like ``dbmdz/bert-base-german-cased``. - A path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``. 
- A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In @@ -252,10 +251,9 @@ class RagPreTrainedModel(PreTrainedModel): generator_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`): Information necessary to initiate the generator. Can be either: - - A string with the `shortcut name` of a pretrained model to load from cache or download, e.g., - ``bert-base-uncased``. - - A string with the `identifier name` of a pretrained model that was user-uploaded to our S3, e.g., - ``dbmdz/bert-base-german-cased``. + - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under + a user or organization name, like ``dbmdz/bert-base-german-cased``. - A path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``. - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). 
In diff --git a/src/transformers/models/rag/tokenization_rag.py b/src/transformers/models/rag/tokenization_rag.py index 4acfc018bd..c3deffc986 100644 --- a/src/transformers/models/rag/tokenization_rag.py +++ b/src/transformers/models/rag/tokenization_rag.py @@ -49,10 +49,12 @@ class RagTokenizer: if config is None: config = RagConfig.from_pretrained(pretrained_model_name_or_path) - question_encoder_path = os.path.join(pretrained_model_name_or_path, "question_encoder_tokenizer") - generator_path = os.path.join(pretrained_model_name_or_path, "generator_tokenizer") - question_encoder = AutoTokenizer.from_pretrained(question_encoder_path, config=config.question_encoder) - generator = AutoTokenizer.from_pretrained(generator_path, config=config.generator) + question_encoder = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, config=config.question_encoder, subfolder="question_encoder_tokenizer" + ) + generator = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, config=config.generator, subfolder="generator_tokenizer" + ) return cls(question_encoder=question_encoder, generator=generator) def __call__(self, *args, **kwargs): diff --git a/src/transformers/models/reformer/tokenization_reformer.py b/src/transformers/models/reformer/tokenization_reformer.py index 37efa5f6e7..66767b07e2 100644 --- a/src/transformers/models/reformer/tokenization_reformer.py +++ b/src/transformers/models/reformer/tokenization_reformer.py @@ -38,7 +38,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` -# to pretrained vocabulary URL for all the model shortcut names. +# to pretrained vocabulary URL for all the model ids. 
#################################################### PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { @@ -47,7 +47,7 @@ PRETRAINED_VOCAB_FILES_MAP = { } #################################################### -# Mapping from model shortcut names to max length of inputs +# Mapping from model ids to max length of inputs #################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "google/reformer-crime-and-punishment": 524288, diff --git a/src/transformers/models/reformer/tokenization_reformer_fast.py b/src/transformers/models/reformer/tokenization_reformer_fast.py index ff73ea6cd2..1a3d58f84d 100644 --- a/src/transformers/models/reformer/tokenization_reformer_fast.py +++ b/src/transformers/models/reformer/tokenization_reformer_fast.py @@ -43,7 +43,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer. #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` -# to pretrained vocabulary URL for all the model shortcut names. +# to pretrained vocabulary URL for all the model ids. 
#################################################### PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { @@ -55,7 +55,7 @@ PRETRAINED_VOCAB_FILES_MAP = { } #################################################### -# Mapping from model shortcut names to max length of inputs +# Mapping from model ids to max length of inputs #################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "google/reformer-crime-and-punishment": 524288, diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 39bff0f46a..bd0ac1c75d 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -50,7 +50,7 @@ _CONFIG_FOR_DOC = "T5Config" _TOKENIZER_FOR_DOC = "T5Tokenizer" #################################################### -# This dict contains shortcut names and associated url +# This dict contains model ids and associated urls # for the pretrained weights provided with the models #################################################### T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py index 34ecd55580..95359e3ac7 100644 --- a/src/transformers/models/t5/tokenization_t5.py +++ b/src/transformers/models/t5/tokenization_t5.py @@ -39,7 +39,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` -# to pretrained vocabulary URL for all the model shortcut names. +# to pretrained vocabulary URL for all the model ids.
#################################################### PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { @@ -52,7 +52,7 @@ PRETRAINED_VOCAB_FILES_MAP = { } #################################################### -# Mapping from model shortcut names to max length of inputs +# Mapping from model ids to max length of inputs #################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "t5-small": 512, diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py index 53608e6460..7ae47bd3ad 100644 --- a/src/transformers/models/t5/tokenization_t5_fast.py +++ b/src/transformers/models/t5/tokenization_t5_fast.py @@ -42,7 +42,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer. #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` -# to pretrained vocabulary URL for all the model shortcut names. +# to pretrained vocabulary URL for all the model ids. #################################################### PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { @@ -62,7 +62,7 @@ PRETRAINED_VOCAB_FILES_MAP = { } #################################################### -# Mapping from model shortcut names to max length of inputs +# Mapping from model ids to max length of inputs #################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "t5-small": 512, diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index a7581b70f8..1f492b06e9 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1615,10 +1615,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): pretrained_model_name_or_path (:obj:`str`): Can be either: - - A string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g., - ``bert-base-uncased``. 
- - A string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g., - ``dbmdz/bert-base-german-cased``. + - A string, the `model id` of a predefined tokenizer hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a + user or organization name, like ``dbmdz/bert-base-german-cased``. - A path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained` method, e.g., ``./my_model_directory/``. @@ -1641,6 +1640,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any identifier allowed by git. + subfolder (:obj:`str`, `optional`): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for + facebook/rag-token-base), specify it here. inputs (additional positional arguments, `optional`): Will be passed along to the Tokenizer ``__init__`` method. kwargs (additional keyword arguments, `optional`): @@ -1651,10 +1653,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): Examples:: # We can't instantiate directly the base class `PreTrainedTokenizerBase` so let's show our examples on a derived class: BertTokenizer - # Download vocabulary from S3 and cache. + # Download vocabulary from huggingface.co and cache. tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - # Download vocabulary from S3 (user-uploaded) and cache. + # Download vocabulary from huggingface.co (user-uploaded) and cache. tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased') # If vocabulary files are in a directory (e.g. 
tokenizer was saved using `save_pretrained('./test/saved_model/')`) @@ -1676,6 +1678,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) s3_models = list(cls.max_model_input_sizes.keys()) vocab_files = {} @@ -1722,13 +1725,20 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): # Look for the tokenizer files for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items(): if os.path.isdir(pretrained_model_name_or_path): - full_file_name = os.path.join(pretrained_model_name_or_path, file_name) + if subfolder is not None: + full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name) + else: + full_file_name = os.path.join(pretrained_model_name_or_path, file_name) if not os.path.exists(full_file_name): logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) full_file_name = None else: full_file_name = hf_bucket_url( - pretrained_model_name_or_path, filename=file_name, revision=revision, mirror=None + pretrained_model_name_or_path, + filename=file_name, + subfolder=subfolder, + revision=revision, + mirror=None, ) vocab_files[file_id] = full_file_name diff --git a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py index 778ee04afa..a659ed95bb 100644 --- a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py +++ b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py @@ -75,7 +75,7 @@ class ModelArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} ) cache_dir: Optional[str] = field( - 
default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"} ) use_fast_tokenizer: bool = field( default=True, @@ -98,7 +98,7 @@ class ModelArguments: default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} ) cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"} ) use_fast_tokenizer: bool = field( default=True,