Update all references to canonical models (#29001)

* Script & Manual edition

* Update
This commit is contained in:
Lysandre Debut
2024-02-16 08:16:58 +01:00
committed by GitHub
parent 1e402b957d
commit f497f564bb
561 changed files with 2682 additions and 2687 deletions

View File

@@ -25,7 +25,7 @@ Instantiating one of [`AutoConfig`], [`AutoModel`], and
```python
model = AutoModel.from_pretrained("bert-base-cased")
model = AutoModel.from_pretrained("google-bert/bert-base-cased")
```
will create a model that is an instance of [`BertModel`].

View File

@@ -44,15 +44,15 @@ subsequent fine-tuning:
```python
>>> # leverage checkpoints for Bert2Bert model...
>>> # use BERT's cls token as BOS token and sep token as EOS token
>>> encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
>>> encoder = BertGenerationEncoder.from_pretrained("google-bert/bert-large-uncased", bos_token_id=101, eos_token_id=102)
>>> # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
>>> decoder = BertGenerationDecoder.from_pretrained(
... "bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102
... "google-bert/bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102
... )
>>> bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
>>> # create tokenizer...
>>> tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
>>> input_ids = tokenizer(
... "This is a long article to summarize", add_special_tokens=False, return_tensors="pt"

View File

@@ -34,7 +34,7 @@ The DistilBERT model was proposed in the blog post [Smaller, faster, cheaper, li
distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5), and the paper [DistilBERT, a
distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108). DistilBERT is a
small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than
*bert-base-uncased*, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language
*google-bert/bert-base-uncased*, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language
understanding benchmark.
The abstract from the paper is the following:
@@ -152,8 +152,8 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
>>> device = "cuda" # the device to load the model onto
>>> tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
>>> model = AutoModel.from_pretrained("distilbert-base-uncased", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
>>> tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
>>> model = AutoModel.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
>>> text = "Replace me by any text you'd like."

View File

@@ -55,8 +55,8 @@ To do so, the `EncoderDecoderModel` class provides a [`EncoderDecoderModel.from_
```python
>>> from transformers import EncoderDecoderModel, BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased")
```
## Loading an existing `EncoderDecoderModel` checkpoint and perform inference.
@@ -119,8 +119,8 @@ target sequence).
```python
>>> from transformers import BertTokenizer, EncoderDecoderModel
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased")
>>> model.config.decoder_start_token_id = tokenizer.cls_token_id
>>> model.config.pad_token_id = tokenizer.pad_token_id

View File

@@ -38,7 +38,7 @@ The main differences compared to GPT2.
- Use jit to fuse the attention fp32 casting, masking, softmax, and scaling.
- Combine the attention and causal masks into a single one, pre-computed for the whole model instead of every layer.
- Merge the key and value caches into one (this changes the format of layer_past/ present, does it risk creating problems?)
- Use the memory layout (self.num_heads, 3, self.head_dim) instead of `(3, self.num_heads, self.head_dim)` for the QKV tensor with MHA. (prevents an overhead with the merged key and values, but makes the checkpoints incompatible with the original gpt2 model).
- Use the memory layout (self.num_heads, 3, self.head_dim) instead of `(3, self.num_heads, self.head_dim)` for the QKV tensor with MHA. (prevents an overhead with the merged key and values, but makes the checkpoints incompatible with the original openai-community/gpt2 model).
You can read more about the optimizations in the [original pull request](https://github.com/huggingface/transformers/pull/22575)

View File

@@ -39,7 +39,7 @@ This model was contributed by [shangz](https://huggingface.co/shangz).
- QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to (i) linear layer
inputs and weights, (ii) matmul inputs, (iii) residual add inputs, in BERT model.
- QDQBERT requires the dependency of [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). To install `pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com`
- QDQBERT model can be loaded from any checkpoint of HuggingFace BERT model (for example *bert-base-uncased*), and
- QDQBERT model can be loaded from any checkpoint of HuggingFace BERT model (for example *google-bert/bert-base-uncased*), and
perform Quantization Aware Training/Post Training Quantization.
- A complete example of using QDQBERT model to perform Quatization Aware Training and Post Training Quantization for
SQUAD task can be found at [transformers/examples/research_projects/quantization-qdqbert/](examples/research_projects/quantization-qdqbert/).

View File

@@ -52,7 +52,7 @@ To do so, the `SpeechEncoderDecoderModel` class provides a [`SpeechEncoderDecode
>>> from transformers import SpeechEncoderDecoderModel
>>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
... "facebook/hubert-large-ll60k", "bert-base-uncased"
... "facebook/hubert-large-ll60k", "google-bert/bert-base-uncased"
... )
```
@@ -93,7 +93,7 @@ speech inputs) and `labels` (which are the `input_ids` of the encoded target seq
>>> from datasets import load_dataset
>>> encoder_id = "facebook/wav2vec2-base-960h" # acoustic model encoder
>>> decoder_id = "bert-base-uncased" # text decoder
>>> decoder_id = "google-bert/bert-base-uncased" # text decoder
>>> feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_id)
>>> tokenizer = AutoTokenizer.from_pretrained(decoder_id)

View File

@@ -64,15 +64,15 @@ for summarization: *summarize: ...*.
T5 comes in different sizes:
- [t5-small](https://huggingface.co/t5-small)
- [google-t5/t5-small](https://huggingface.co/google-t5/t5-small)
- [t5-base](https://huggingface.co/t5-base)
- [google-t5/t5-base](https://huggingface.co/google-t5/t5-base)
- [t5-large](https://huggingface.co/t5-large)
- [google-t5/t5-large](https://huggingface.co/google-t5/t5-large)
- [t5-3b](https://huggingface.co/t5-3b)
- [google-t5/t5-3b](https://huggingface.co/google-t5/t5-3b)
- [t5-11b](https://huggingface.co/t5-11b).
- [google-t5/t5-11b](https://huggingface.co/google-t5/t5-11b).
Based on the original T5 model, Google has released some follow-up works:
@@ -121,8 +121,8 @@ processed as follows:
```python
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
@@ -146,8 +146,8 @@ the model as follows:
```python
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
>>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
>>> labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids
@@ -183,8 +183,8 @@ ignored. The code example below illustrates all of this.
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
>>> import torch
>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
>>> # the following 2 hyperparameters are task-specific
>>> max_source_length = 512
@@ -258,8 +258,8 @@ generation works in general in encoder-decoder models.
```python
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
>>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
>>> outputs = model.generate(input_ids)
@@ -275,8 +275,8 @@ The example above only shows a single example. You can also do batched inference
```python
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
>>> task_prefix = "translate English to German: "
>>> # use different length sentences to test batching
@@ -301,8 +301,8 @@ The predicted tokens will then be placed between the sentinel tokens.
```python
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids

View File

@@ -22,7 +22,7 @@ This model is in maintenance mode only, so we won't accept any new PRs changing
We recommend switching to more recent models for improved security.
In case you would still like to use `TransfoXL` in your experiments, we recommend using the [Hub checkpoint](https://huggingface.co/transfo-xl-wt103) with a specific revision to ensure you are downloading safe files from the Hub.
In case you would still like to use `TransfoXL` in your experiments, we recommend using the [Hub checkpoint](https://huggingface.co/transfo-xl/transfo-xl-wt103) with a specific revision to ensure you are downloading safe files from the Hub.
You will need to set the environment variable `TRUST_REMOTE_CODE` to `True` in order to allow the
usage of `pickle.load()`:
@@ -33,7 +33,7 @@ from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
os.environ["TRUST_REMOTE_CODE"] = "True"
checkpoint = 'transfo-xl-wt103'
checkpoint = 'transfo-xl/transfo-xl-wt103'
revision = '40a186da79458c9f9de846edfaea79c412137f97'
tokenizer = TransfoXLTokenizer.from_pretrained(checkpoint, revision=revision)

View File

@@ -58,7 +58,7 @@ To do so, the `VisionEncoderDecoderModel` class provides a [`VisionEncoderDecode
>>> from transformers import VisionEncoderDecoderModel
>>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
... "microsoft/swin-base-patch4-window7-224-in22k", "bert-base-uncased"
... "microsoft/swin-base-patch4-window7-224-in22k", "google-bert/bert-base-uncased"
... )
```
@@ -123,9 +123,9 @@ images) and `labels` (which are the `input_ids` of the encoded target sequence).
>>> from datasets import load_dataset
>>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
>>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
... "google/vit-base-patch16-224-in21k", "bert-base-uncased"
... "google/vit-base-patch16-224-in21k", "google-bert/bert-base-uncased"
... )
>>> model.config.decoder_start_token_id = tokenizer.cls_token_id

View File

@@ -73,7 +73,7 @@ The following example shows how to get the last hidden state using [`VisualBertM
>>> from transformers import BertTokenizer, VisualBertModel
>>> model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
>>> inputs = tokenizer("What is the man eating?", return_tensors="pt")
>>> # this is a custom function that returns the visual embeddings given the image path