Update all references to canonical models (#29001)

* Script & Manual edition * Update
2024-02-16 08:16:58 +01:00
parent 1e402b957d
commit f497f564bb
561 changed files with 2682 additions and 2687 deletions
--- a/examples/research_projects/bert-loses-patience/README.md
+++ b/examples/research_projects/bert-loses-patience/README.md
@@ -15,7 +15,7 @@ export TASK_NAME=MRPC

 python ./run_glue_with_pabee.py \
  --model_type albert \
-  --model_name_or_path bert-base-uncased/albert-base-v2 \
+  --model_name_or_path google-bert/bert-base-uncased/albert/albert-base-v2 \
  --task_name $TASK_NAME \
  --do_train \
  --do_eval \
--- a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py
+++ b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py
@@ -276,8 +276,8 @@ class AlbertForSequenceClassificationWithPabee(AlbertPreTrainedModel):
                from torch import nn
                import torch

-                tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-                model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert-base-v2')
+                tokenizer = AlbertTokenizer.from_pretrained('albert/albert-base-v2')
+                model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert/albert-base-v2')
                input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
                labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
                outputs = model(input_ids, labels=labels)
--- a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py
+++ b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py
@@ -300,8 +300,8 @@ class BertForSequenceClassificationWithPabee(BertPreTrainedModel):
            from torch import nn
            import torch

-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            model = BertForSequenceClassificationWithPabee.from_pretrained('bert-base-uncased')
+            tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-uncased')
+            model = BertForSequenceClassificationWithPabee.from_pretrained('google-bert/bert-base-uncased')

            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
--- a/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py
+++ b/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py
@@ -29,7 +29,7 @@ class PabeeTests(TestCasePlus):
        testargs = f"""
            run_glue_with_pabee.py
            --model_type albert
-            --model_name_or_path albert-base-v2
+            --model_name_or_path albert/albert-base-v2
            --data_dir ./tests/fixtures/tests_samples/MRPC/
            --output_dir {tmp_dir}
            --overwrite_output_dir
--- a/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py
+++ b/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py
@@ -107,7 +107,7 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
    # ----------------------------------

    logging.info("Make sure that the models' outputs are identical")
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

    # prepare the model inputs
    encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
--- a/examples/research_projects/bertabs/modeling_bertabs.py
+++ b/examples/research_projects/bertabs/modeling_bertabs.py
@@ -128,7 +128,7 @@ class Bert(nn.Module):

    def __init__(self):
        super().__init__()
-        config = BertConfig.from_pretrained("bert-base-uncased")
+        config = BertConfig.from_pretrained("google-bert/bert-base-uncased")
        self.model = BertModel(config)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, **kwargs):
--- a/examples/research_projects/bertabs/run_summarization.py
+++ b/examples/research_projects/bertabs/run_summarization.py
@@ -29,7 +29,7 @@ Batch = namedtuple("Batch", ["document_names", "batch_size", "src", "segs", "mas


 def evaluate(args):
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
+    tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased", do_lower_case=True)
    model = BertAbs.from_pretrained("remi/bertabs-finetuned-extractive-abstractive-summarization")
    model.to(args.device)
    model.eval()
--- a/examples/research_projects/codeparrot/README.md
+++ b/examples/research_projects/codeparrot/README.md
@@ -79,7 +79,7 @@ python scripts/pretokenizing.py \
 Before training a new model for code we create a new tokenizer that is efficient at code tokenization. To train the tokenizer you can run the following command: 
 ```bash
 python scripts/bpe_training.py \
-    --base_tokenizer gpt2 \
+    --base_tokenizer openai-community/gpt2 \
    --dataset_name codeparrot/codeparrot-clean-train
 ```

@@ -90,12 +90,12 @@ The models are randomly initialized and trained from scratch. To initialize a ne

 ```bash
 python scripts/initialize_model.py \
--config_name gpt2-large \
+--config_name openai-community/gpt2-large \
 --tokenizer_name codeparrot/codeparrot \
 --model_name codeparrot \
 --push_to_hub True
 ```
-This will initialize a new model with the architecture and configuration of `gpt2-large` and use the tokenizer to appropriately size the input embeddings. Finally, the initilaized model is pushed the hub.
+This will initialize a new model with the architecture and configuration of `openai-community/gpt2-large` and use the tokenizer to appropriately size the input embeddings. Finally, the initilaized model is pushed the hub.

 We can either pass the name of a text dataset or a pretokenized dataset which speeds up training a bit.
 Now that the tokenizer and model are also ready we can start training the model. The main training script is built with `accelerate` to scale across a wide range of platforms and infrastructure scales. We train two models with [110M](https://huggingface.co/codeparrot/codeparrot-small/) and [1.5B](https://huggingface.co/codeparrot/codeparrot/) parameters for 25-30B tokens on a 16xA100 (40GB) machine which takes 1 day and 1 week, respectively.
--- a/examples/research_projects/codeparrot/scripts/arguments.py
+++ b/examples/research_projects/codeparrot/scripts/arguments.py
@@ -172,7 +172,7 @@ class TokenizerTrainingArguments:
    """

    base_tokenizer: Optional[str] = field(
-        default="gpt2", metadata={"help": "Base tokenizer to build new tokenizer from."}
+        default="openai-community/gpt2", metadata={"help": "Base tokenizer to build new tokenizer from."}
    )
    dataset_name: Optional[str] = field(
        default="transformersbook/codeparrot-train", metadata={"help": "Dataset to train tokenizer on."}
@@ -211,7 +211,7 @@ class InitializationArguments:
    """

    config_name: Optional[str] = field(
-        default="gpt2-large", metadata={"help": "Configuration to use for model initialization."}
+        default="openai-community/gpt2-large", metadata={"help": "Configuration to use for model initialization."}
    )
    tokenizer_name: Optional[str] = field(
        default="codeparrot/codeparrot", metadata={"help": "Tokenizer attached to model."}
--- a/examples/research_projects/deebert/test_glue_deebert.py
+++ b/examples/research_projects/deebert/test_glue_deebert.py
@@ -48,7 +48,7 @@ class DeeBertTests(TestCasePlus):
    def test_glue_deebert_train(self):
        train_args = """
            --model_type roberta
-            --model_name_or_path roberta-base
+            --model_name_or_path FacebookAI/roberta-base
            --task_name MRPC
            --do_train
            --do_eval
@@ -61,7 +61,7 @@ class DeeBertTests(TestCasePlus):
            --num_train_epochs 3
            --overwrite_output_dir
            --seed 42
-            --output_dir ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
+            --output_dir ./examples/deebert/saved_models/FacebookAI/roberta-base/MRPC/two_stage
            --plot_data_dir ./examples/deebert/results/
            --save_steps 0
            --overwrite_cache
@@ -71,12 +71,12 @@ class DeeBertTests(TestCasePlus):

        eval_args = """
            --model_type roberta
-            --model_name_or_path ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
+            --model_name_or_path ./examples/deebert/saved_models/FacebookAI/roberta-base/MRPC/two_stage
            --task_name MRPC
            --do_eval
            --do_lower_case
            --data_dir ./tests/fixtures/tests_samples/MRPC/
-            --output_dir ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
+            --output_dir ./examples/deebert/saved_models/FacebookAI/roberta-base/MRPC/two_stage
            --plot_data_dir ./examples/deebert/results/
            --max_seq_length 128
            --eval_each_highway
@@ -88,12 +88,12 @@ class DeeBertTests(TestCasePlus):

        entropy_eval_args = """
            --model_type roberta
-            --model_name_or_path ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
+            --model_name_or_path ./examples/deebert/saved_models/FacebookAI/roberta-base/MRPC/two_stage
            --task_name MRPC
            --do_eval
            --do_lower_case
            --data_dir ./tests/fixtures/tests_samples/MRPC/
-            --output_dir ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
+            --output_dir ./examples/deebert/saved_models/FacebookAI/roberta-base/MRPC/two_stage
            --plot_data_dir ./examples/deebert/results/
            --max_seq_length 128
            --early_exit_entropy 0.1
--- a/examples/research_projects/information-gain-filtration/README.md
+++ b/examples/research_projects/information-gain-filtration/README.md
@@ -64,7 +64,7 @@ To fine-tune a transformer model with IGF on a language modeling task, use the f
  
 ```python
 python run_clm_igf.py\
--model_name_or_path "gpt2" \
+--model_name_or_path "openai-community/gpt2" \
 --data_file="data/tokenized_stories_train_wikitext103" \
 --igf_data_file="data/IGF_values" \
 --context_len 32 \
--- a/examples/research_projects/information-gain-filtration/igf/igf.py
+++ b/examples/research_projects/information-gain-filtration/igf/igf.py
@@ -69,9 +69,9 @@ def compute_perplexity(model, test_data, context_len):
    return perplexity


-def load_gpt2(model_name="gpt2"):
+def load_gpt2(model_name="openai-community/gpt2"):
    """
-    load original gpt2 and save off for quicker loading
+    load original openai-community/gpt2 and save off for quicker loading

    Args:
        model_name: GPT-2
--- a/examples/research_projects/information-gain-filtration/run_clm_igf.py
+++ b/examples/research_projects/information-gain-filtration/run_clm_igf.py
@@ -84,7 +84,7 @@ def generate_n_pairs(
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # load pretrained model
-    model = load_gpt2("gpt2").to(device)
+    model = load_gpt2("openai-community/gpt2").to(device)
    print("computing perplexity on objective set")
    orig_perp = compute_perplexity(model, objective_set, context_len).item()
    print("perplexity on objective set:", orig_perp)
@@ -121,7 +121,7 @@ def training_secondary_learner(
    set_seed(42)

    # Load pre-trained model
-    model = GPT2LMHeadModel.from_pretrained("gpt2")
+    model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")

    # Initialize secondary learner to use embedding weights of model
    secondary_learner = SecondaryLearner(model)
@@ -153,7 +153,7 @@ def finetune(
    recopy_model=recopy_gpt2,
    secondary_learner=None,
    eval_interval=10,
-    finetuned_model_name="gpt2_finetuned.pt",
+    finetuned_model_name="openai-community/gpt2_finetuned.pt",
 ):
    """
    fine-tune with IGF if secondary_learner is not None, else standard fine-tuning
@@ -346,7 +346,10 @@ def main():
    )

    parser.add_argument(
-        "--batch_size", default=16, type=int, help="batch size of training data of language model(gpt2) "
+        "--batch_size",
+        default=16,
+        type=int,
+        help="batch size of training data of language model(openai-community/gpt2) ",
    )

    parser.add_argument(
@@ -383,7 +386,9 @@ def main():
        ),
    )

-    parser.add_argument("--finetuned_model_name", default="gpt2_finetuned.pt", type=str, help="finetuned_model_name")
+    parser.add_argument(
+        "--finetuned_model_name", default="openai-community/gpt2_finetuned.pt", type=str, help="finetuned_model_name"
+    )

    parser.add_argument(
        "--recopy_model",
@@ -416,16 +421,16 @@ def main():
        igf_model_path="igf_model.pt",
    )

-    # load pretrained gpt2 model
-    model = GPT2LMHeadModel.from_pretrained("gpt2")
+    # load pretrained openai-community/gpt2 model
+    model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
    set_seed(42)

-    # Generate train and test data to train and evaluate gpt2 model
+    # Generate train and test data to train and evaluate openai-community/gpt2 model
    train_dataset, test_dataset = generate_datasets(
        context_len=32, file="data/tokenized_stories_train_wikitext103.jbl", number=100, min_len=1026, trim=True
    )

-    # fine-tuning of the gpt2 model using igf (Information Gain Filtration)
+    # fine-tuning of the openai-community/gpt2 model using igf (Information Gain Filtration)
    finetune(
        model,
        train_dataset,
@@ -437,7 +442,7 @@ def main():
        recopy_model=recopy_gpt2,
        secondary_learner=secondary_learner,
        eval_interval=10,
-        finetuned_model_name="gpt2_finetuned.pt",
+        finetuned_model_name="openai-community/gpt2_finetuned.pt",
    )


--- a/examples/research_projects/jax-projects/README.md
+++ b/examples/research_projects/jax-projects/README.md
@@ -159,13 +159,13 @@ to be used, but that everybody in team is on the same page on what type of model
 To give an example, a well-defined project would be the following:

 - task: summarization
- model: [t5-small](https://huggingface.co/t5-small)
+- model: [google-t5/t5-small](https://huggingface.co/google-t5/t5-small)
 - dataset: [CNN/Daily mail](https://huggingface.co/datasets/cnn_dailymail)
 - training script: [run_summarization_flax.py](https://github.com/huggingface/transformers/blob/main/examples/flax/summarization/run_summarization_flax.py)
 - outcome: t5 model that can summarize news
- work flow: adapt `run_summarization_flax.py` to work with `t5-small`.
+- work flow: adapt `run_summarization_flax.py` to work with `google-t5/t5-small`.

-This example is a very easy and not the most interesting project since a `t5-small`
+This example is a very easy and not the most interesting project since a `google-t5/t5-small`
 summarization model exists already for CNN/Daily mail and pretty much no code has to be 
 written. 
 A well-defined project does not need to have the dataset be part of 
@@ -335,7 +335,7 @@ dataset = load_dataset('oscar', "unshuffled_deduplicated_en", split='train', str

 dummy_input = next(iter(dataset))["text"]

-tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
 input_ids = tokenizer(dummy_input, return_tensors="np").input_ids[:, :10]

 model = FlaxRobertaModel.from_pretrained("julien-c/dummy-unknown")
@@ -492,7 +492,7 @@ dataset = load_dataset('oscar', "unshuffled_deduplicated_en", split='train', str

 dummy_input = next(iter(dataset))["text"]

-tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
 input_ids = tokenizer(dummy_input, return_tensors="np").input_ids[:, :10]

 model = FlaxRobertaModel.from_pretrained("julien-c/dummy-unknown")
@@ -518,7 +518,7 @@ be available in a couple of days.
 - [BigBird](https://github.com/huggingface/transformers/blob/main/src/transformers/models/big_bird/modeling_flax_big_bird.py)
 - [CLIP](https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_flax_clip.py)
 - [ELECTRA](https://github.com/huggingface/transformers/blob/main/src/transformers/models/electra/modeling_flax_electra.py)
- [GPT2](https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_flax_gpt2.py)
+- [GPT2](https://github.com/huggingface/transformers/blob/main/src/transformers/models/openai-community/gpt2/modeling_flax_gpt2.py)
 - [(TODO) MBART](https://github.com/huggingface/transformers/blob/main/src/transformers/models/mbart/modeling_flax_mbart.py)
 - [RoBERTa](https://github.com/huggingface/transformers/blob/main/src/transformers/models/roberta/modeling_flax_roberta.py)
 - [T5](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_flax_t5.py)
@@ -729,7 +729,7 @@ Let's use the base `FlaxRobertaModel` without any heads as an example.
 from transformers import FlaxRobertaModel, RobertaTokenizerFast
 import jax

-tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
 inputs = tokenizer("JAX/Flax is amazing ", padding="max_length", max_length=128, return_tensors="np")

 model = FlaxRobertaModel.from_pretrained("julien-c/dummy-unknown")
@@ -1011,7 +1011,7 @@ and run the following commands in a Python shell to save a config.
 ```python
 from transformers import RobertaConfig

-config = RobertaConfig.from_pretrained("roberta-base")
+config = RobertaConfig.from_pretrained("FacebookAI/roberta-base")
 config.save_pretrained("./")
 ```

@@ -1193,12 +1193,12 @@ All the widgets are open sourced in the `huggingface_hub` [repo](https://github.
 **NLP**
 * **Conversational:** To have the best conversations!. [Example](https://huggingface.co/microsoft/DialoGPT-large?).
 * **Feature Extraction:** Retrieve the input embeddings. [Example](https://huggingface.co/sentence-transformers/distilbert-base-nli-mean-tokens?text=test).
-* **Fill Mask:** Predict potential words for a mask token. [Example](https://huggingface.co/bert-base-uncased?).
-* **Question Answering:** Given a context and a question, predict the answer. [Example](https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad).
+* **Fill Mask:** Predict potential words for a mask token. [Example](https://huggingface.co/google-bert/bert-base-uncased?).
+* **Question Answering:** Given a context and a question, predict the answer. [Example](https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad).
 * **Sentence Simmilarity:** Predict how similar a set of sentences are. Useful for Sentence Transformers.
 * **Summarization:** Given a text, output a summary of it. [Example](https://huggingface.co/sshleifer/distilbart-cnn-12-6).
 * **Table Question Answering:** Given a table and a question, predict the answer. [Example](https://huggingface.co/google/tapas-base-finetuned-wtq).
-* **Text Generation:** Generate text based on a prompt. [Example](https://huggingface.co/gpt2)
+* **Text Generation:** Generate text based on a prompt. [Example](https://huggingface.co/openai-community/gpt2)
 * **Token Classification:** Useful for tasks such as Named Entity Recognition and Part of Speech. [Example](https://huggingface.co/dslim/bert-base-NER).
 * **Zero-Shot Classification:** Too cool to explain with words. Here is an [example](https://huggingface.co/typeform/distilbert-base-uncased-mnli)
 * ([WIP](https://github.com/huggingface/huggingface_hub/issues/99)) **Table to Text Generation**.
--- a/examples/research_projects/jax-projects/dataset-streaming/README.md
+++ b/examples/research_projects/jax-projects/dataset-streaming/README.md
@@ -31,7 +31,7 @@ without ever having to download the full dataset.
 In the following, we demonstrate how to train a bi-directional transformer model 
 using masked language modeling objective as introduced in [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805).
 More specifically, we demonstrate how JAX/Flax and dataset streaming can be leveraged 
-to pre-train [**`roberta-base`**](https://huggingface.co/roberta-base)
+to pre-train [**`FacebookAI/roberta-base`**](https://huggingface.co/FacebookAI/roberta-base)
 in English on a single TPUv3-8 pod for 10000 update steps.

 The example script uses the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.
@@ -80,8 +80,8 @@ from transformers import RobertaTokenizerFast, RobertaConfig

 model_dir = "./english-roberta-base-dummy"

-tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
-config = RobertaConfig.from_pretrained("roberta-base")
+tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
+config = RobertaConfig.from_pretrained("FacebookAI/roberta-base")

 tokenizer.save_pretrained(model_dir)
 config.save_pretrained(model_dir)
--- a/examples/research_projects/jax-projects/hybrid_clip/README.md
+++ b/examples/research_projects/jax-projects/hybrid_clip/README.md
@@ -32,7 +32,7 @@ Models written in JAX/Flax are **immutable** and updated in a purely functional
 way which enables simple and efficient model parallelism.

 In this example we will use the vision model from [CLIP](https://huggingface.co/models?filter=clip)
-as the image encoder and [`roberta-base`](https://huggingface.co/roberta-base) as the text encoder.
+as the image encoder and [`FacebookAI/roberta-base`](https://huggingface.co/FacebookAI/roberta-base) as the text encoder.
 Note that one can also use the [ViT](https://huggingface.co/models?filter=vit) model as image encoder and any other BERT or ROBERTa model as text encoder.
 To train the model on languages other than English one should choose a text encoder trained on the desired
 language and a image-text dataset in that language. One such dataset is [WIT](https://github.com/google-research-datasets/wit).	
@@ -76,7 +76,7 @@ Here is an example of how to load the model using pre-trained text and vision mo
 ```python
 from modeling_hybrid_clip import FlaxHybridCLIP

-model = FlaxHybridCLIP.from_text_vision_pretrained("bert-base-uncased", "openai/clip-vit-base-patch32")
+model = FlaxHybridCLIP.from_text_vision_pretrained("google-bert/bert-base-uncased", "openai/clip-vit-base-patch32")

 # save the model
 model.save_pretrained("bert-clip")
@@ -89,7 +89,7 @@ If the checkpoints are in PyTorch then one could pass `text_from_pt=True` and `v
 PyTorch checkpoints convert them to flax and load the model.

 ```python
-model = FlaxHybridCLIP.from_text_vision_pretrained("bert-base-uncased", "openai/clip-vit-base-patch32", text_from_pt=True, vision_from_pt=True)
+model = FlaxHybridCLIP.from_text_vision_pretrained("google-bert/bert-base-uncased", "openai/clip-vit-base-patch32", text_from_pt=True, vision_from_pt=True)
 ```

 This loads both the text and vision encoders using pre-trained weights, the projection layers are randomly
@@ -154,9 +154,9 @@ Next we can run the example script to train the model:
 ```bash
 python run_hybrid_clip.py \
    --output_dir ${MODEL_DIR} \
-    --text_model_name_or_path="roberta-base" \
+    --text_model_name_or_path="FacebookAI/roberta-base" \
    --vision_model_name_or_path="openai/clip-vit-base-patch32" \
-    --tokenizer_name="roberta-base" \
+    --tokenizer_name="FacebookAI/roberta-base" \
    --train_file="coco_dataset/train_dataset.json" \
    --validation_file="coco_dataset/validation_dataset.json" \
    --do_train --do_eval \
--- a/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py
+++ b/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py
@@ -314,8 +314,6 @@ class FlaxHybridCLIP(FlaxPreTrainedModel):
                Information necessary to initiate the text model. Can be either:

                    - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
-                      a user or organization name, like ``dbmdz/bert-base-german-cased``.
                    - A path to a `directory` containing model weights saved using
                      :func:`~transformers.FlaxPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
                    - A path or url to a `PyTorch checkpoint folder` (e.g, ``./pt_model``). In
@@ -327,8 +325,6 @@ class FlaxHybridCLIP(FlaxPreTrainedModel):
                Information necessary to initiate the vision model. Can be either:

                    - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
-                      a user or organization name, like ``dbmdz/bert-base-german-cased``.
                    - A path to a `directory` containing model weights saved using
                      :func:`~transformers.FlaxPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
                    - A path or url to a `PyTorch checkpoint folder` (e.g, ``./pt_model``). In
@@ -354,7 +350,7 @@ class FlaxHybridCLIP(FlaxPreTrainedModel):
            >>> from transformers import FlaxHybridCLIP
            >>> # initialize a model from pretrained BERT and CLIP models. Note that the projection layers will be randomly initialized.
            >>> # If using CLIP's vision model the vision projection layer will be initialized using pre-trained weights
-            >>> model = FlaxHybridCLIP.from_text_vision_pretrained('bert-base-uncased', 'openai/clip-vit-base-patch32')
+            >>> model = FlaxHybridCLIP.from_text_vision_pretrained('google-bert/bert-base-uncased', 'openai/clip-vit-base-patch32')
            >>> # saving model after fine-tuning
            >>> model.save_pretrained("./bert-clip")
            >>> # load fine-tuned model
--- a/examples/research_projects/jax-projects/model_parallel/README.md
+++ b/examples/research_projects/jax-projects/model_parallel/README.md
@@ -54,7 +54,7 @@ model.save_pretrained("gpt-neo-1.3B")
 ```bash
 python run_clm_mp.py \
    --model_name_or_path gpt-neo-1.3B  \
-    --tokenizer_name gpt2 \
+    --tokenizer_name openai-community/gpt2 \
    --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
    --do_train  --do_eval \
    --block_size 1024 \
--- a/examples/research_projects/longform-qa/eli5_app.py
+++ b/examples/research_projects/longform-qa/eli5_app.py
@@ -36,7 +36,7 @@ def load_models():
        _ = s2s_model.eval()
    else:
        s2s_tokenizer, s2s_model = make_qa_s2s_model(
-            model_name="t5-small", from_file="seq2seq_models/eli5_t5_model_1024_4.pth", device="cuda:0"
+            model_name="google-t5/t5-small", from_file="seq2seq_models/eli5_t5_model_1024_4.pth", device="cuda:0"
        )
    return (qar_tokenizer, qar_model, s2s_tokenizer, s2s_model)

--- a/examples/research_projects/mlm_wwm/README.md
+++ b/examples/research_projects/mlm_wwm/README.md
@@ -32,7 +32,7 @@ to that word). This technique has been refined for Chinese in [this paper](https
 To fine-tune a model using whole word masking, use the following script:
 ```bash
 python run_mlm_wwm.py \
-    --model_name_or_path roberta-base \
+    --model_name_or_path FacebookAI/roberta-base \
    --dataset_name wikitext \
    --dataset_config_name wikitext-2-raw-v1 \
    --do_train \
@@ -83,7 +83,7 @@ export VALIDATION_REF_FILE=/path/to/validation/chinese_ref/file
 export OUTPUT_DIR=/tmp/test-mlm-wwm

 python run_mlm_wwm.py \
-    --model_name_or_path roberta-base \
+    --model_name_or_path FacebookAI/roberta-base \
    --train_file $TRAIN_FILE \
    --validation_file $VALIDATION_FILE \
    --train_ref_file $TRAIN_REF_FILE \
--- a/examples/research_projects/mm-imdb/README.md
+++ b/examples/research_projects/mm-imdb/README.md
@@ -10,7 +10,7 @@ Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformer
 python run_mmimdb.py \
    --data_dir /path/to/mmimdb/dataset/ \
    --model_type bert \
-    --model_name_or_path bert-base-uncased \
+    --model_name_or_path google-bert/bert-base-uncased \
    --output_dir /path/to/save/dir/ \
    --do_train \
    --do_eval \
--- a/examples/research_projects/movement-pruning/README.md
+++ b/examples/research_projects/movement-pruning/README.md
@@ -61,7 +61,7 @@ python examples/movement-pruning/masked_run_squad.py \
    --predict_file dev-v1.1.json \
    --do_train --do_eval --do_lower_case \
    --model_type masked_bert \
-    --model_name_or_path bert-base-uncased \
+    --model_name_or_path google-bert/bert-base-uncased \
    --per_gpu_train_batch_size 16 \
    --warmup_steps 5400 \
    --num_train_epochs 10 \
@@ -84,7 +84,7 @@ python examples/movement-pruning/masked_run_squad.py \
    --predict_file dev-v1.1.json \
    --do_train --do_eval --do_lower_case \
    --model_type masked_bert \
-    --model_name_or_path bert-base-uncased \
+    --model_name_or_path google-bert/bert-base-uncased \
    --per_gpu_train_batch_size 16 \
    --warmup_steps 5400 \
    --num_train_epochs 10 \
@@ -104,7 +104,7 @@ python examples/movement-pruning/masked_run_squad.py \
    --predict_file dev-v1.1.json \
    --do_train --do_eval --do_lower_case \
    --model_type masked_bert \
-    --model_name_or_path bert-base-uncased \
+    --model_name_or_path google-bert/bert-base-uncased \
    --per_gpu_train_batch_size 16 \
    --warmup_steps 5400 \
    --num_train_epochs 10 \
@@ -124,7 +124,7 @@ python examples/movement-pruning/masked_run_squad.py \
    --predict_file dev-v1.1.json \
    --do_train --do_eval --do_lower_case \
    --model_type masked_bert \
-    --model_name_or_path bert-base-uncased \
+    --model_name_or_path google-bert/bert-base-uncased \
    --per_gpu_train_batch_size 16 \
    --warmup_steps 5400 \
    --num_train_epochs 10 \
--- a/examples/research_projects/performer/README.md
+++ b/examples/research_projects/performer/README.md
@@ -10,8 +10,8 @@ Paper authors: Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyo

 ## Examples

-`sanity_script.sh` will launch performer fine-tuning from the bert-base-cased checkpoint on the Simple Wikipedia dataset (a small, easy-language English Wikipedia) from `datasets`.
-`full_script.sh` will launch performer fine-tuning from the bert-large-cased checkpoint on the English Wikipedia dataset from `datasets`.
+`sanity_script.sh` will launch performer fine-tuning from the google-bert/bert-base-cased checkpoint on the Simple Wikipedia dataset (a small, easy-language English Wikipedia) from `datasets`.
+`full_script.sh` will launch performer fine-tuning from the google-bert/bert-large-cased checkpoint on the English Wikipedia dataset from `datasets`.

 Here are a few key arguments:
 - Remove the `--performer` argument to use a standard Bert model.
--- a/examples/research_projects/pplm/run_pplm.py
+++ b/examples/research_projects/pplm/run_pplm.py
@@ -61,7 +61,7 @@ DISCRIMINATOR_MODELS_PARAMS = {
        "embed_size": 1024,
        "class_vocab": {"non_clickbait": 0, "clickbait": 1},
        "default_class": 1,
-        "pretrained_model": "gpt2-medium",
+        "pretrained_model": "openai-community/gpt2-medium",
    },
    "sentiment": {
        "url": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/discriminators/SST_classifier_head.pt",
@@ -69,7 +69,7 @@ DISCRIMINATOR_MODELS_PARAMS = {
        "embed_size": 1024,
        "class_vocab": {"very_positive": 2, "very_negative": 3},
        "default_class": 3,
-        "pretrained_model": "gpt2-medium",
+        "pretrained_model": "openai-community/gpt2-medium",
    },
 }

@@ -585,7 +585,7 @@ def set_generic_model_params(discrim_weights, discrim_meta):


 def run_pplm_example(
-    pretrained_model="gpt2-medium",
+    pretrained_model="openai-community/gpt2-medium",
    cond_text="",
    uncond=False,
    num_samples=1,
@@ -738,7 +738,7 @@ if __name__ == "__main__":
        "--pretrained_model",
        "-M",
        type=str,
-        default="gpt2-medium",
+        default="openai-community/gpt2-medium",
        help="pretrained model name or path to local checkpoint",
    )
    parser.add_argument("--cond_text", type=str, default="The lake", help="Prefix texts to condition on")
--- a/examples/research_projects/pplm/run_pplm_discrim_train.py
+++ b/examples/research_projects/pplm/run_pplm_discrim_train.py
@@ -45,7 +45,7 @@ max_length_seq = 100
 class Discriminator(nn.Module):
    """Transformer encoder followed by a Classification Head"""

-    def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"):
+    def __init__(self, class_size, pretrained_model="openai-community/gpt2-medium", cached_mode=False, device="cpu"):
        super().__init__()
        self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
        self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
@@ -218,7 +218,7 @@ def get_cached_data_loader(dataset, batch_size, discriminator, shuffle=False, de
 def train_discriminator(
    dataset,
    dataset_fp=None,
-    pretrained_model="gpt2-medium",
+    pretrained_model="openai-community/gpt2-medium",
    epochs=10,
    batch_size=64,
    log_interval=10,
@@ -502,7 +502,10 @@ if __name__ == "__main__":
        help="File path of the dataset to use. Needed only in case of generic datadset",
    )
    parser.add_argument(
-        "--pretrained_model", type=str, default="gpt2-medium", help="Pretrained model to use as encoder"
+        "--pretrained_model",
+        type=str,
+        default="openai-community/gpt2-medium",
+        help="Pretrained model to use as encoder",
    )
    parser.add_argument("--epochs", type=int, default=10, metavar="N", help="Number of training epochs")
    parser.add_argument(
--- a/examples/research_projects/quantization-qdqbert/README.md
+++ b/examples/research_projects/quantization-qdqbert/README.md
@@ -50,11 +50,11 @@ Calibrate the pretrained model and finetune with quantization awared:

 ```bash
 python3 run_quant_qa.py \
-  --model_name_or_path bert-base-uncased \
+  --model_name_or_path google-bert/bert-base-uncased \
  --dataset_name squad \
  --max_seq_length 128 \
  --doc_stride 32 \
-  --output_dir calib/bert-base-uncased \
+  --output_dir calib/google-bert/bert-base-uncased \
  --do_calib \
  --calibrator percentile \
  --percentile 99.99
@@ -62,7 +62,7 @@ python3 run_quant_qa.py \

 ```bash
 python3 run_quant_qa.py \
-  --model_name_or_path calib/bert-base-uncased \
+  --model_name_or_path calib/google-bert/bert-base-uncased \
  --dataset_name squad \
  --do_train \
  --do_eval \
@@ -71,8 +71,8 @@ python3 run_quant_qa.py \
  --num_train_epochs 2 \
  --max_seq_length 128 \
  --doc_stride 32 \
-  --output_dir finetuned_int8/bert-base-uncased \
-  --tokenizer_name bert-base-uncased \
+  --output_dir finetuned_int8/google-bert/bert-base-uncased \
+  --tokenizer_name google-bert/bert-base-uncased \
  --save_steps 0
 ```

@@ -82,14 +82,14 @@ To export the QAT model finetuned above:

 ```bash
 python3 run_quant_qa.py \
-  --model_name_or_path finetuned_int8/bert-base-uncased \
+  --model_name_or_path finetuned_int8/google-bert/bert-base-uncased \
  --output_dir ./ \
  --save_onnx \
  --per_device_eval_batch_size 1 \
  --max_seq_length 128 \
  --doc_stride 32 \
  --dataset_name squad \
-  --tokenizer_name bert-base-uncased
+  --tokenizer_name google-bert/bert-base-uncased
 ```

 Use `--recalibrate-weights` to calibrate the weight ranges according to the quantizer axis. Use `--quant-per-tensor` for per tensor quantization (default is per channel).
@@ -117,7 +117,7 @@ python3 evaluate-hf-trt-qa.py \
  --max_seq_length 128 \
  --doc_stride 32 \
  --dataset_name squad \
-  --tokenizer_name bert-base-uncased \
+  --tokenizer_name google-bert/bert-base-uncased \
  --int8 \
  --seed 42
 ```
@@ -128,14 +128,14 @@ Finetune a fp32 precision model with [transformers/examples/pytorch/question-ans

 ```bash
 python3 ../../pytorch/question-answering/run_qa.py \
-  --model_name_or_path bert-base-uncased \
+  --model_name_or_path google-bert/bert-base-uncased \
  --dataset_name squad \
  --per_device_train_batch_size 12 \
  --learning_rate 3e-5 \
  --num_train_epochs 2 \
  --max_seq_length 128 \
  --doc_stride 32 \
-  --output_dir ./finetuned_fp32/bert-base-uncased \
+  --output_dir ./finetuned_fp32/google-bert/bert-base-uncased \
  --save_steps 0 \
  --do_train \
  --do_eval
@@ -147,13 +147,13 @@ python3 ../../pytorch/question-answering/run_qa.py \

 ```bash
 python3 run_quant_qa.py \
-  --model_name_or_path ./finetuned_fp32/bert-base-uncased \
+  --model_name_or_path ./finetuned_fp32/google-bert/bert-base-uncased \
  --dataset_name squad \
  --calibrator percentile \
  --percentile 99.99 \
  --max_seq_length 128 \
  --doc_stride 32 \
-  --output_dir ./calib/bert-base-uncased \
+  --output_dir ./calib/google-bert/bert-base-uncased \
  --save_steps 0 \
  --do_calib \
  --do_eval
@@ -163,14 +163,14 @@ python3 run_quant_qa.py \

 ```bash
 python3 run_quant_qa.py \
-  --model_name_or_path ./calib/bert-base-uncased \
+  --model_name_or_path ./calib/google-bert/bert-base-uncased \
  --output_dir ./ \
  --save_onnx \
  --per_device_eval_batch_size 1 \
  --max_seq_length 128 \
  --doc_stride 32 \
  --dataset_name squad \
-  --tokenizer_name bert-base-uncased
+  --tokenizer_name google-bert/bert-base-uncased
 ```

 ### Evaluate the INT8 PTQ ONNX model inference with TensorRT
@@ -183,7 +183,7 @@ python3 evaluate-hf-trt-qa.py \
  --max_seq_length 128 \
  --doc_stride 32 \
  --dataset_name squad \
-  --tokenizer_name bert-base-uncased \
+  --tokenizer_name google-bert/bert-base-uncased \
  --int8 \
  --seed 42
 ```