Update all references to canonical models (#29001)
* Script & Manual edition * Update
This commit is contained in:
@@ -25,7 +25,7 @@ Instantiating one of [`AutoConfig`], [`AutoModel`], and
|
||||
|
||||
|
||||
```python
|
||||
model = AutoModel.from_pretrained("bert-base-cased")
|
||||
model = AutoModel.from_pretrained("google-bert/bert-base-cased")
|
||||
```
|
||||
|
||||
will create a model that is an instance of [`BertModel`].
|
||||
|
||||
@@ -44,15 +44,15 @@ subsequent fine-tuning:
|
||||
```python
|
||||
>>> # leverage checkpoints for Bert2Bert model...
|
||||
>>> # use BERT's cls token as BOS token and sep token as EOS token
|
||||
>>> encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
|
||||
>>> encoder = BertGenerationEncoder.from_pretrained("google-bert/bert-large-uncased", bos_token_id=101, eos_token_id=102)
|
||||
>>> # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
|
||||
>>> decoder = BertGenerationDecoder.from_pretrained(
|
||||
... "bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102
|
||||
... "google-bert/bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102
|
||||
... )
|
||||
>>> bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
|
||||
|
||||
>>> # create tokenizer...
|
||||
>>> tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
|
||||
>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
|
||||
|
||||
>>> input_ids = tokenizer(
|
||||
... "This is a long article to summarize", add_special_tokens=False, return_tensors="pt"
|
||||
|
||||
@@ -34,7 +34,7 @@ The DistilBERT model was proposed in the blog post [Smaller, faster, cheaper, li
|
||||
distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5), and the paper [DistilBERT, a
|
||||
distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108). DistilBERT is a
|
||||
small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% fewer parameters than
|
||||
*bert-base-uncased*, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language
|
||||
*google-bert/bert-base-uncased*, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language
|
||||
understanding benchmark.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
@@ -152,8 +152,8 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
|
||||
|
||||
>>> device = "cuda" # the device to load the model onto
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
|
||||
>>> model = AutoModel.from_pretrained("distilbert-base-uncased", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
|
||||
>>> model = AutoModel.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
|
||||
|
||||
>>> text = "Replace me by any text you'd like."
|
||||
|
||||
|
||||
@@ -55,8 +55,8 @@ To do so, the `EncoderDecoderModel` class provides a [`EncoderDecoderModel.from_
|
||||
```python
|
||||
>>> from transformers import EncoderDecoderModel, BertTokenizer
|
||||
|
||||
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
|
||||
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
|
||||
>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased")
|
||||
```
|
||||
|
||||
## Loading an existing `EncoderDecoderModel` checkpoint and performing inference.
|
||||
@@ -119,8 +119,8 @@ target sequence).
|
||||
```python
|
||||
>>> from transformers import BertTokenizer, EncoderDecoderModel
|
||||
|
||||
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
|
||||
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
|
||||
>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased")
|
||||
|
||||
>>> model.config.decoder_start_token_id = tokenizer.cls_token_id
|
||||
>>> model.config.pad_token_id = tokenizer.pad_token_id
|
||||
|
||||
@@ -38,7 +38,7 @@ The main differences compared to GPT2.
|
||||
- Use jit to fuse the attention fp32 casting, masking, softmax, and scaling.
|
||||
- Combine the attention and causal masks into a single one, pre-computed for the whole model instead of every layer.
|
||||
- Merge the key and value caches into one (this changes the format of layer_past/ present, does it risk creating problems?)
|
||||
- Use the memory layout (self.num_heads, 3, self.head_dim) instead of `(3, self.num_heads, self.head_dim)` for the QKV tensor with MHA. (prevents an overhead with the merged key and values, but makes the checkpoints incompatible with the original gpt2 model).
|
||||
- Use the memory layout `(self.num_heads, 3, self.head_dim)` instead of `(3, self.num_heads, self.head_dim)` for the QKV tensor with MHA. (prevents an overhead with the merged key and values, but makes the checkpoints incompatible with the original `openai-community/gpt2` model).
|
||||
|
||||
You can read more about the optimizations in the [original pull request](https://github.com/huggingface/transformers/pull/22575)
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ This model was contributed by [shangz](https://huggingface.co/shangz).
|
||||
- QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to (i) linear layer
|
||||
inputs and weights, (ii) matmul inputs, (iii) residual add inputs, in BERT model.
|
||||
- QDQBERT requires the [PyTorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization) as a dependency. To install it, run `pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com`
|
||||
- QDQBERT model can be loaded from any checkpoint of HuggingFace BERT model (for example *bert-base-uncased*), and
|
||||
- QDQBERT model can be loaded from any checkpoint of HuggingFace BERT model (for example *google-bert/bert-base-uncased*), and
|
||||
perform Quantization Aware Training/Post Training Quantization.
|
||||
- A complete example of using QDQBERT model to perform Quantization Aware Training and Post Training Quantization for
|
||||
SQUAD task can be found at [transformers/examples/research_projects/quantization-qdqbert/](examples/research_projects/quantization-qdqbert/).
|
||||
|
||||
@@ -52,7 +52,7 @@ To do so, the `SpeechEncoderDecoderModel` class provides a [`SpeechEncoderDecode
|
||||
>>> from transformers import SpeechEncoderDecoderModel
|
||||
|
||||
>>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
|
||||
... "facebook/hubert-large-ll60k", "bert-base-uncased"
|
||||
... "facebook/hubert-large-ll60k", "google-bert/bert-base-uncased"
|
||||
... )
|
||||
```
|
||||
|
||||
@@ -93,7 +93,7 @@ speech inputs) and `labels` (which are the `input_ids` of the encoded target seq
|
||||
>>> from datasets import load_dataset
|
||||
|
||||
>>> encoder_id = "facebook/wav2vec2-base-960h" # acoustic model encoder
|
||||
>>> decoder_id = "bert-base-uncased" # text decoder
|
||||
>>> decoder_id = "google-bert/bert-base-uncased" # text decoder
|
||||
|
||||
>>> feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_id)
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(decoder_id)
|
||||
|
||||
@@ -64,15 +64,15 @@ for summarization: *summarize: ...*.
|
||||
|
||||
T5 comes in different sizes:
|
||||
|
||||
- [t5-small](https://huggingface.co/t5-small)
|
||||
- [google-t5/t5-small](https://huggingface.co/google-t5/t5-small)
|
||||
|
||||
- [t5-base](https://huggingface.co/t5-base)
|
||||
- [google-t5/t5-base](https://huggingface.co/google-t5/t5-base)
|
||||
|
||||
- [t5-large](https://huggingface.co/t5-large)
|
||||
- [google-t5/t5-large](https://huggingface.co/google-t5/t5-large)
|
||||
|
||||
- [t5-3b](https://huggingface.co/t5-3b)
|
||||
- [google-t5/t5-3b](https://huggingface.co/google-t5/t5-3b)
|
||||
|
||||
- [t5-11b](https://huggingface.co/t5-11b).
|
||||
- [google-t5/t5-11b](https://huggingface.co/google-t5/t5-11b).
|
||||
|
||||
Based on the original T5 model, Google has released some follow-up works:
|
||||
|
||||
@@ -121,8 +121,8 @@ processed as follows:
|
||||
```python
|
||||
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
|
||||
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
|
||||
|
||||
>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
|
||||
>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
|
||||
@@ -146,8 +146,8 @@ the model as follows:
|
||||
```python
|
||||
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
|
||||
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
|
||||
|
||||
>>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
|
||||
>>> labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids
|
||||
@@ -183,8 +183,8 @@ ignored. The code example below illustrates all of this.
|
||||
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
|
||||
|
||||
>>> # the following 2 hyperparameters are task-specific
|
||||
>>> max_source_length = 512
|
||||
@@ -258,8 +258,8 @@ generation works in general in encoder-decoder models.
|
||||
```python
|
||||
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
|
||||
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
|
||||
|
||||
>>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
|
||||
>>> outputs = model.generate(input_ids)
|
||||
@@ -275,8 +275,8 @@ The example above only shows a single example. You can also do batched inference
|
||||
```python
|
||||
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
|
||||
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
|
||||
|
||||
>>> task_prefix = "translate English to German: "
|
||||
>>> # use different length sentences to test batching
|
||||
@@ -301,8 +301,8 @@ The predicted tokens will then be placed between the sentinel tokens.
|
||||
```python
|
||||
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
|
||||
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
|
||||
|
||||
>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ This model is in maintenance mode only, so we won't accept any new PRs changing
|
||||
|
||||
We recommend switching to more recent models for improved security.
|
||||
|
||||
In case you would still like to use `TransfoXL` in your experiments, we recommend using the [Hub checkpoint](https://huggingface.co/transfo-xl-wt103) with a specific revision to ensure you are downloading safe files from the Hub.
|
||||
In case you would still like to use `TransfoXL` in your experiments, we recommend using the [Hub checkpoint](https://huggingface.co/transfo-xl/transfo-xl-wt103) with a specific revision to ensure you are downloading safe files from the Hub.
|
||||
|
||||
You will need to set the environment variable `TRUST_REMOTE_CODE` to `True` in order to allow the
|
||||
usage of `pickle.load()`:
|
||||
@@ -33,7 +33,7 @@ from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
|
||||
|
||||
os.environ["TRUST_REMOTE_CODE"] = "True"
|
||||
|
||||
checkpoint = 'transfo-xl-wt103'
|
||||
checkpoint = 'transfo-xl/transfo-xl-wt103'
|
||||
revision = '40a186da79458c9f9de846edfaea79c412137f97'
|
||||
|
||||
tokenizer = TransfoXLTokenizer.from_pretrained(checkpoint, revision=revision)
|
||||
|
||||
@@ -58,7 +58,7 @@ To do so, the `VisionEncoderDecoderModel` class provides a [`VisionEncoderDecode
|
||||
>>> from transformers import VisionEncoderDecoderModel
|
||||
|
||||
>>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
|
||||
... "microsoft/swin-base-patch4-window7-224-in22k", "bert-base-uncased"
|
||||
... "microsoft/swin-base-patch4-window7-224-in22k", "google-bert/bert-base-uncased"
|
||||
... )
|
||||
```
|
||||
|
||||
@@ -123,9 +123,9 @@ images) and `labels` (which are the `input_ids` of the encoded target sequence).
|
||||
>>> from datasets import load_dataset
|
||||
|
||||
>>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
|
||||
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
|
||||
>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
>>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
|
||||
... "google/vit-base-patch16-224-in21k", "bert-base-uncased"
|
||||
... "google/vit-base-patch16-224-in21k", "google-bert/bert-base-uncased"
|
||||
... )
|
||||
|
||||
>>> model.config.decoder_start_token_id = tokenizer.cls_token_id
|
||||
|
||||
@@ -73,7 +73,7 @@ The following example shows how to get the last hidden state using [`VisualBertM
|
||||
>>> from transformers import BertTokenizer, VisualBertModel
|
||||
|
||||
>>> model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
|
||||
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
|
||||
>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
|
||||
>>> inputs = tokenizer("What is the man eating?", return_tensors="pt")
|
||||
>>> # this is a custom function that returns the visual embeddings given the image path
|
||||
|
||||
Reference in New Issue
Block a user