Update all references to canonical models (#29001)
* Script & Manual edition * Update
This commit is contained in:
@@ -15,7 +15,7 @@ export TASK_NAME=MRPC
|
||||
|
||||
python ./run_glue_with_pabee.py \
|
||||
--model_type albert \
|
||||
--model_name_or_path bert-base-uncased/albert-base-v2 \
|
||||
--model_name_or_path google-bert/bert-base-uncased/albert/albert-base-v2 \
|
||||
--task_name $TASK_NAME \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
|
||||
@@ -276,8 +276,8 @@ class AlbertForSequenceClassificationWithPabee(AlbertPreTrainedModel):
|
||||
from torch import nn
|
||||
import torch
|
||||
|
||||
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
||||
model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert-base-v2')
|
||||
tokenizer = AlbertTokenizer.from_pretrained('albert/albert-base-v2')
|
||||
model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert/albert-base-v2')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=labels)
|
||||
|
||||
@@ -300,8 +300,8 @@ class BertForSequenceClassificationWithPabee(BertPreTrainedModel):
|
||||
from torch import nn
|
||||
import torch
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForSequenceClassificationWithPabee.from_pretrained('bert-base-uncased')
|
||||
tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-uncased')
|
||||
model = BertForSequenceClassificationWithPabee.from_pretrained('google-bert/bert-base-uncased')
|
||||
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||
|
||||
@@ -29,7 +29,7 @@ class PabeeTests(TestCasePlus):
|
||||
testargs = f"""
|
||||
run_glue_with_pabee.py
|
||||
--model_type albert
|
||||
--model_name_or_path albert-base-v2
|
||||
--model_name_or_path albert/albert-base-v2
|
||||
--data_dir ./tests/fixtures/tests_samples/MRPC/
|
||||
--output_dir {tmp_dir}
|
||||
--overwrite_output_dir
|
||||
|
||||
@@ -107,7 +107,7 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
|
||||
# ----------------------------------
|
||||
|
||||
logging.info("Make sure that the models' outputs are identical")
|
||||
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
|
||||
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
|
||||
# prepare the model inputs
|
||||
encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
|
||||
|
||||
@@ -128,7 +128,7 @@ class Bert(nn.Module):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
config = BertConfig.from_pretrained("bert-base-uncased")
|
||||
config = BertConfig.from_pretrained("google-bert/bert-base-uncased")
|
||||
self.model = BertModel(config)
|
||||
|
||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, **kwargs):
|
||||
|
||||
@@ -29,7 +29,7 @@ Batch = namedtuple("Batch", ["document_names", "batch_size", "src", "segs", "mas
|
||||
|
||||
|
||||
def evaluate(args):
|
||||
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
|
||||
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased", do_lower_case=True)
|
||||
model = BertAbs.from_pretrained("remi/bertabs-finetuned-extractive-abstractive-summarization")
|
||||
model.to(args.device)
|
||||
model.eval()
|
||||
|
||||
@@ -79,7 +79,7 @@ python scripts/pretokenizing.py \
|
||||
Before training a new model for code we create a new tokenizer that is efficient at code tokenization. To train the tokenizer you can run the following command:
|
||||
```bash
|
||||
python scripts/bpe_training.py \
|
||||
--base_tokenizer gpt2 \
|
||||
--base_tokenizer openai-community/gpt2 \
|
||||
--dataset_name codeparrot/codeparrot-clean-train
|
||||
```
|
||||
|
||||
@@ -90,12 +90,12 @@ The models are randomly initialized and trained from scratch. To initialize a ne
|
||||
|
||||
```bash
|
||||
python scripts/initialize_model.py \
|
||||
--config_name gpt2-large \
|
||||
--config_name openai-community/gpt2-large \
|
||||
--tokenizer_name codeparrot/codeparrot \
|
||||
--model_name codeparrot \
|
||||
--push_to_hub True
|
||||
```
|
||||
This will initialize a new model with the architecture and configuration of `gpt2-large` and use the tokenizer to appropriately size the input embeddings. Finally, the initilaized model is pushed the hub.
|
||||
This will initialize a new model with the architecture and configuration of `openai-community/gpt2-large` and use the tokenizer to appropriately size the input embeddings. Finally, the initialized model is pushed to the hub.
|
||||
|
||||
We can either pass the name of a text dataset or a pretokenized dataset which speeds up training a bit.
|
||||
Now that the tokenizer and model are also ready we can start training the model. The main training script is built with `accelerate` to scale across a wide range of platforms and infrastructure scales. We train two models with [110M](https://huggingface.co/codeparrot/codeparrot-small/) and [1.5B](https://huggingface.co/codeparrot/codeparrot/) parameters for 25-30B tokens on a 16xA100 (40GB) machine which takes 1 day and 1 week, respectively.
|
||||
|
||||
@@ -172,7 +172,7 @@ class TokenizerTrainingArguments:
|
||||
"""
|
||||
|
||||
base_tokenizer: Optional[str] = field(
|
||||
default="gpt2", metadata={"help": "Base tokenizer to build new tokenizer from."}
|
||||
default="openai-community/gpt2", metadata={"help": "Base tokenizer to build new tokenizer from."}
|
||||
)
|
||||
dataset_name: Optional[str] = field(
|
||||
default="transformersbook/codeparrot-train", metadata={"help": "Dataset to train tokenizer on."}
|
||||
@@ -211,7 +211,7 @@ class InitializationArguments:
|
||||
"""
|
||||
|
||||
config_name: Optional[str] = field(
|
||||
default="gpt2-large", metadata={"help": "Configuration to use for model initialization."}
|
||||
default="openai-community/gpt2-large", metadata={"help": "Configuration to use for model initialization."}
|
||||
)
|
||||
tokenizer_name: Optional[str] = field(
|
||||
default="codeparrot/codeparrot", metadata={"help": "Tokenizer attached to model."}
|
||||
|
||||
@@ -48,7 +48,7 @@ class DeeBertTests(TestCasePlus):
|
||||
def test_glue_deebert_train(self):
|
||||
train_args = """
|
||||
--model_type roberta
|
||||
--model_name_or_path roberta-base
|
||||
--model_name_or_path FacebookAI/roberta-base
|
||||
--task_name MRPC
|
||||
--do_train
|
||||
--do_eval
|
||||
@@ -61,7 +61,7 @@ class DeeBertTests(TestCasePlus):
|
||||
--num_train_epochs 3
|
||||
--overwrite_output_dir
|
||||
--seed 42
|
||||
--output_dir ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
|
||||
--output_dir ./examples/deebert/saved_models/FacebookAI/roberta-base/MRPC/two_stage
|
||||
--plot_data_dir ./examples/deebert/results/
|
||||
--save_steps 0
|
||||
--overwrite_cache
|
||||
@@ -71,12 +71,12 @@ class DeeBertTests(TestCasePlus):
|
||||
|
||||
eval_args = """
|
||||
--model_type roberta
|
||||
--model_name_or_path ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
|
||||
--model_name_or_path ./examples/deebert/saved_models/FacebookAI/roberta-base/MRPC/two_stage
|
||||
--task_name MRPC
|
||||
--do_eval
|
||||
--do_lower_case
|
||||
--data_dir ./tests/fixtures/tests_samples/MRPC/
|
||||
--output_dir ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
|
||||
--output_dir ./examples/deebert/saved_models/FacebookAI/roberta-base/MRPC/two_stage
|
||||
--plot_data_dir ./examples/deebert/results/
|
||||
--max_seq_length 128
|
||||
--eval_each_highway
|
||||
@@ -88,12 +88,12 @@ class DeeBertTests(TestCasePlus):
|
||||
|
||||
entropy_eval_args = """
|
||||
--model_type roberta
|
||||
--model_name_or_path ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
|
||||
--model_name_or_path ./examples/deebert/saved_models/FacebookAI/roberta-base/MRPC/two_stage
|
||||
--task_name MRPC
|
||||
--do_eval
|
||||
--do_lower_case
|
||||
--data_dir ./tests/fixtures/tests_samples/MRPC/
|
||||
--output_dir ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
|
||||
--output_dir ./examples/deebert/saved_models/FacebookAI/roberta-base/MRPC/two_stage
|
||||
--plot_data_dir ./examples/deebert/results/
|
||||
--max_seq_length 128
|
||||
--early_exit_entropy 0.1
|
||||
|
||||
@@ -64,7 +64,7 @@ To fine-tune a transformer model with IGF on a language modeling task, use the f
|
||||
|
||||
```python
|
||||
python run_clm_igf.py\
|
||||
--model_name_or_path "gpt2" \
|
||||
--model_name_or_path "openai-community/gpt2" \
|
||||
--data_file="data/tokenized_stories_train_wikitext103" \
|
||||
--igf_data_file="data/IGF_values" \
|
||||
--context_len 32 \
|
||||
|
||||
@@ -69,9 +69,9 @@ def compute_perplexity(model, test_data, context_len):
|
||||
return perplexity
|
||||
|
||||
|
||||
def load_gpt2(model_name="gpt2"):
|
||||
def load_gpt2(model_name="openai-community/gpt2"):
|
||||
"""
|
||||
load original gpt2 and save off for quicker loading
|
||||
load original openai-community/gpt2 and save off for quicker loading
|
||||
|
||||
Args:
|
||||
model_name: GPT-2
|
||||
|
||||
@@ -84,7 +84,7 @@ def generate_n_pairs(
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
# load pretrained model
|
||||
model = load_gpt2("gpt2").to(device)
|
||||
model = load_gpt2("openai-community/gpt2").to(device)
|
||||
print("computing perplexity on objective set")
|
||||
orig_perp = compute_perplexity(model, objective_set, context_len).item()
|
||||
print("perplexity on objective set:", orig_perp)
|
||||
@@ -121,7 +121,7 @@ def training_secondary_learner(
|
||||
set_seed(42)
|
||||
|
||||
# Load pre-trained model
|
||||
model = GPT2LMHeadModel.from_pretrained("gpt2")
|
||||
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
|
||||
|
||||
# Initialize secondary learner to use embedding weights of model
|
||||
secondary_learner = SecondaryLearner(model)
|
||||
@@ -153,7 +153,7 @@ def finetune(
|
||||
recopy_model=recopy_gpt2,
|
||||
secondary_learner=None,
|
||||
eval_interval=10,
|
||||
finetuned_model_name="gpt2_finetuned.pt",
|
||||
finetuned_model_name="openai-community/gpt2_finetuned.pt",
|
||||
):
|
||||
"""
|
||||
fine-tune with IGF if secondary_learner is not None, else standard fine-tuning
|
||||
@@ -346,7 +346,10 @@ def main():
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--batch_size", default=16, type=int, help="batch size of training data of language model(gpt2) "
|
||||
"--batch_size",
|
||||
default=16,
|
||||
type=int,
|
||||
help="batch size of training data of language model(openai-community/gpt2) ",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
@@ -383,7 +386,9 @@ def main():
|
||||
),
|
||||
)
|
||||
|
||||
parser.add_argument("--finetuned_model_name", default="gpt2_finetuned.pt", type=str, help="finetuned_model_name")
|
||||
parser.add_argument(
|
||||
"--finetuned_model_name", default="openai-community/gpt2_finetuned.pt", type=str, help="finetuned_model_name"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--recopy_model",
|
||||
@@ -416,16 +421,16 @@ def main():
|
||||
igf_model_path="igf_model.pt",
|
||||
)
|
||||
|
||||
# load pretrained gpt2 model
|
||||
model = GPT2LMHeadModel.from_pretrained("gpt2")
|
||||
# load pretrained openai-community/gpt2 model
|
||||
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
|
||||
set_seed(42)
|
||||
|
||||
# Generate train and test data to train and evaluate gpt2 model
|
||||
# Generate train and test data to train and evaluate openai-community/gpt2 model
|
||||
train_dataset, test_dataset = generate_datasets(
|
||||
context_len=32, file="data/tokenized_stories_train_wikitext103.jbl", number=100, min_len=1026, trim=True
|
||||
)
|
||||
|
||||
# fine-tuning of the gpt2 model using igf (Information Gain Filtration)
|
||||
# fine-tuning of the openai-community/gpt2 model using igf (Information Gain Filtration)
|
||||
finetune(
|
||||
model,
|
||||
train_dataset,
|
||||
@@ -437,7 +442,7 @@ def main():
|
||||
recopy_model=recopy_gpt2,
|
||||
secondary_learner=secondary_learner,
|
||||
eval_interval=10,
|
||||
finetuned_model_name="gpt2_finetuned.pt",
|
||||
finetuned_model_name="openai-community/gpt2_finetuned.pt",
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -159,13 +159,13 @@ to be used, but that everybody in team is on the same page on what type of model
|
||||
To give an example, a well-defined project would be the following:
|
||||
|
||||
- task: summarization
|
||||
- model: [t5-small](https://huggingface.co/t5-small)
|
||||
- model: [google-t5/t5-small](https://huggingface.co/google-t5/t5-small)
|
||||
- dataset: [CNN/Daily mail](https://huggingface.co/datasets/cnn_dailymail)
|
||||
- training script: [run_summarization_flax.py](https://github.com/huggingface/transformers/blob/main/examples/flax/summarization/run_summarization_flax.py)
|
||||
- outcome: t5 model that can summarize news
|
||||
- work flow: adapt `run_summarization_flax.py` to work with `t5-small`.
|
||||
- work flow: adapt `run_summarization_flax.py` to work with `google-t5/t5-small`.
|
||||
|
||||
This example is a very easy and not the most interesting project since a `t5-small`
|
||||
This example is a very easy and not the most interesting project since a `google-t5/t5-small`
|
||||
summarization model exists already for CNN/Daily mail and pretty much no code has to be
|
||||
written.
|
||||
A well-defined project does not need to have the dataset be part of
|
||||
@@ -335,7 +335,7 @@ dataset = load_dataset('oscar', "unshuffled_deduplicated_en", split='train', str
|
||||
|
||||
dummy_input = next(iter(dataset))["text"]
|
||||
|
||||
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
|
||||
tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
|
||||
input_ids = tokenizer(dummy_input, return_tensors="np").input_ids[:, :10]
|
||||
|
||||
model = FlaxRobertaModel.from_pretrained("julien-c/dummy-unknown")
|
||||
@@ -492,7 +492,7 @@ dataset = load_dataset('oscar', "unshuffled_deduplicated_en", split='train', str
|
||||
|
||||
dummy_input = next(iter(dataset))["text"]
|
||||
|
||||
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
|
||||
tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
|
||||
input_ids = tokenizer(dummy_input, return_tensors="np").input_ids[:, :10]
|
||||
|
||||
model = FlaxRobertaModel.from_pretrained("julien-c/dummy-unknown")
|
||||
@@ -518,7 +518,7 @@ be available in a couple of days.
|
||||
- [BigBird](https://github.com/huggingface/transformers/blob/main/src/transformers/models/big_bird/modeling_flax_big_bird.py)
|
||||
- [CLIP](https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_flax_clip.py)
|
||||
- [ELECTRA](https://github.com/huggingface/transformers/blob/main/src/transformers/models/electra/modeling_flax_electra.py)
|
||||
- [GPT2](https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_flax_gpt2.py)
|
||||
- [GPT2](https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_flax_gpt2.py)
|
||||
- [(TODO) MBART](https://github.com/huggingface/transformers/blob/main/src/transformers/models/mbart/modeling_flax_mbart.py)
|
||||
- [RoBERTa](https://github.com/huggingface/transformers/blob/main/src/transformers/models/roberta/modeling_flax_roberta.py)
|
||||
- [T5](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_flax_t5.py)
|
||||
@@ -729,7 +729,7 @@ Let's use the base `FlaxRobertaModel` without any heads as an example.
|
||||
from transformers import FlaxRobertaModel, RobertaTokenizerFast
|
||||
import jax
|
||||
|
||||
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
|
||||
tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
|
||||
inputs = tokenizer("JAX/Flax is amazing ", padding="max_length", max_length=128, return_tensors="np")
|
||||
|
||||
model = FlaxRobertaModel.from_pretrained("julien-c/dummy-unknown")
|
||||
@@ -1011,7 +1011,7 @@ and run the following commands in a Python shell to save a config.
|
||||
```python
|
||||
from transformers import RobertaConfig
|
||||
|
||||
config = RobertaConfig.from_pretrained("roberta-base")
|
||||
config = RobertaConfig.from_pretrained("FacebookAI/roberta-base")
|
||||
config.save_pretrained("./")
|
||||
```
|
||||
|
||||
@@ -1193,12 +1193,12 @@ All the widgets are open sourced in the `huggingface_hub` [repo](https://github.
|
||||
**NLP**
|
||||
* **Conversational:** To have the best conversations! [Example](https://huggingface.co/microsoft/DialoGPT-large?).
|
||||
* **Feature Extraction:** Retrieve the input embeddings. [Example](https://huggingface.co/sentence-transformers/distilbert-base-nli-mean-tokens?text=test).
|
||||
* **Fill Mask:** Predict potential words for a mask token. [Example](https://huggingface.co/bert-base-uncased?).
|
||||
* **Question Answering:** Given a context and a question, predict the answer. [Example](https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad).
|
||||
* **Fill Mask:** Predict potential words for a mask token. [Example](https://huggingface.co/google-bert/bert-base-uncased?).
|
||||
* **Question Answering:** Given a context and a question, predict the answer. [Example](https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad).
|
||||
* **Sentence Similarity:** Predict how similar a set of sentences are. Useful for Sentence Transformers.
|
||||
* **Summarization:** Given a text, output a summary of it. [Example](https://huggingface.co/sshleifer/distilbart-cnn-12-6).
|
||||
* **Table Question Answering:** Given a table and a question, predict the answer. [Example](https://huggingface.co/google/tapas-base-finetuned-wtq).
|
||||
* **Text Generation:** Generate text based on a prompt. [Example](https://huggingface.co/gpt2)
|
||||
* **Text Generation:** Generate text based on a prompt. [Example](https://huggingface.co/openai-community/gpt2)
|
||||
* **Token Classification:** Useful for tasks such as Named Entity Recognition and Part of Speech. [Example](https://huggingface.co/dslim/bert-base-NER).
|
||||
* **Zero-Shot Classification:** Too cool to explain with words. Here is an [example](https://huggingface.co/typeform/distilbert-base-uncased-mnli)
|
||||
* ([WIP](https://github.com/huggingface/huggingface_hub/issues/99)) **Table to Text Generation**.
|
||||
|
||||
@@ -31,7 +31,7 @@ without ever having to download the full dataset.
|
||||
In the following, we demonstrate how to train a bi-directional transformer model
|
||||
using masked language modeling objective as introduced in [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805).
|
||||
More specifically, we demonstrate how JAX/Flax and dataset streaming can be leveraged
|
||||
to pre-train [**`roberta-base`**](https://huggingface.co/roberta-base)
|
||||
to pre-train [**`FacebookAI/roberta-base`**](https://huggingface.co/FacebookAI/roberta-base)
|
||||
in English on a single TPUv3-8 pod for 10000 update steps.
|
||||
|
||||
The example script uses the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.
|
||||
@@ -80,8 +80,8 @@ from transformers import RobertaTokenizerFast, RobertaConfig
|
||||
|
||||
model_dir = "./english-roberta-base-dummy"
|
||||
|
||||
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
|
||||
config = RobertaConfig.from_pretrained("roberta-base")
|
||||
tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
|
||||
config = RobertaConfig.from_pretrained("FacebookAI/roberta-base")
|
||||
|
||||
tokenizer.save_pretrained(model_dir)
|
||||
config.save_pretrained(model_dir)
|
||||
|
||||
@@ -32,7 +32,7 @@ Models written in JAX/Flax are **immutable** and updated in a purely functional
|
||||
way which enables simple and efficient model parallelism.
|
||||
|
||||
In this example we will use the vision model from [CLIP](https://huggingface.co/models?filter=clip)
|
||||
as the image encoder and [`roberta-base`](https://huggingface.co/roberta-base) as the text encoder.
|
||||
as the image encoder and [`FacebookAI/roberta-base`](https://huggingface.co/FacebookAI/roberta-base) as the text encoder.
|
||||
Note that one can also use the [ViT](https://huggingface.co/models?filter=vit) model as image encoder and any other BERT or RoBERTa model as text encoder.
|
||||
To train the model on languages other than English one should choose a text encoder trained on the desired
|
||||
language and an image-text dataset in that language. One such dataset is [WIT](https://github.com/google-research-datasets/wit).
|
||||
@@ -76,7 +76,7 @@ Here is an example of how to load the model using pre-trained text and vision mo
|
||||
```python
|
||||
from modeling_hybrid_clip import FlaxHybridCLIP
|
||||
|
||||
model = FlaxHybridCLIP.from_text_vision_pretrained("bert-base-uncased", "openai/clip-vit-base-patch32")
|
||||
model = FlaxHybridCLIP.from_text_vision_pretrained("google-bert/bert-base-uncased", "openai/clip-vit-base-patch32")
|
||||
|
||||
# save the model
|
||||
model.save_pretrained("bert-clip")
|
||||
@@ -89,7 +89,7 @@ If the checkpoints are in PyTorch then one could pass `text_from_pt=True` and `v
|
||||
PyTorch checkpoints convert them to flax and load the model.
|
||||
|
||||
```python
|
||||
model = FlaxHybridCLIP.from_text_vision_pretrained("bert-base-uncased", "openai/clip-vit-base-patch32", text_from_pt=True, vision_from_pt=True)
|
||||
model = FlaxHybridCLIP.from_text_vision_pretrained("google-bert/bert-base-uncased", "openai/clip-vit-base-patch32", text_from_pt=True, vision_from_pt=True)
|
||||
```
|
||||
|
||||
This loads both the text and vision encoders using pre-trained weights, the projection layers are randomly
|
||||
@@ -154,9 +154,9 @@ Next we can run the example script to train the model:
|
||||
```bash
|
||||
python run_hybrid_clip.py \
|
||||
--output_dir ${MODEL_DIR} \
|
||||
--text_model_name_or_path="roberta-base" \
|
||||
--text_model_name_or_path="FacebookAI/roberta-base" \
|
||||
--vision_model_name_or_path="openai/clip-vit-base-patch32" \
|
||||
--tokenizer_name="roberta-base" \
|
||||
--tokenizer_name="FacebookAI/roberta-base" \
|
||||
--train_file="coco_dataset/train_dataset.json" \
|
||||
--validation_file="coco_dataset/validation_dataset.json" \
|
||||
--do_train --do_eval \
|
||||
|
||||
@@ -314,8 +314,6 @@ class FlaxHybridCLIP(FlaxPreTrainedModel):
|
||||
Information necessary to initiate the text model. Can be either:
|
||||
|
||||
- A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
|
||||
Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
|
||||
a user or organization name, like ``dbmdz/bert-base-german-cased``.
|
||||
- A path to a `directory` containing model weights saved using
|
||||
:func:`~transformers.FlaxPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
|
||||
- A path or url to a `PyTorch checkpoint folder` (e.g, ``./pt_model``). In
|
||||
@@ -327,8 +325,6 @@ class FlaxHybridCLIP(FlaxPreTrainedModel):
|
||||
Information necessary to initiate the vision model. Can be either:
|
||||
|
||||
- A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
|
||||
Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
|
||||
a user or organization name, like ``dbmdz/bert-base-german-cased``.
|
||||
- A path to a `directory` containing model weights saved using
|
||||
:func:`~transformers.FlaxPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
|
||||
- A path or url to a `PyTorch checkpoint folder` (e.g, ``./pt_model``). In
|
||||
@@ -354,7 +350,7 @@ class FlaxHybridCLIP(FlaxPreTrainedModel):
|
||||
>>> from transformers import FlaxHybridCLIP
|
||||
>>> # initialize a model from pretrained BERT and CLIP models. Note that the projection layers will be randomly initialized.
|
||||
>>> # If using CLIP's vision model the vision projection layer will be initialized using pre-trained weights
|
||||
>>> model = FlaxHybridCLIP.from_text_vision_pretrained('bert-base-uncased', 'openai/clip-vit-base-patch32')
|
||||
>>> model = FlaxHybridCLIP.from_text_vision_pretrained('google-bert/bert-base-uncased', 'openai/clip-vit-base-patch32')
|
||||
>>> # saving model after fine-tuning
|
||||
>>> model.save_pretrained("./bert-clip")
|
||||
>>> # load fine-tuned model
|
||||
|
||||
@@ -54,7 +54,7 @@ model.save_pretrained("gpt-neo-1.3B")
|
||||
```bash
|
||||
python run_clm_mp.py \
|
||||
--model_name_or_path gpt-neo-1.3B \
|
||||
--tokenizer_name gpt2 \
|
||||
--tokenizer_name openai-community/gpt2 \
|
||||
--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
|
||||
--do_train --do_eval \
|
||||
--block_size 1024 \
|
||||
|
||||
@@ -36,7 +36,7 @@ def load_models():
|
||||
_ = s2s_model.eval()
|
||||
else:
|
||||
s2s_tokenizer, s2s_model = make_qa_s2s_model(
|
||||
model_name="t5-small", from_file="seq2seq_models/eli5_t5_model_1024_4.pth", device="cuda:0"
|
||||
model_name="google-t5/t5-small", from_file="seq2seq_models/eli5_t5_model_1024_4.pth", device="cuda:0"
|
||||
)
|
||||
return (qar_tokenizer, qar_model, s2s_tokenizer, s2s_model)
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ to that word). This technique has been refined for Chinese in [this paper](https
|
||||
To fine-tune a model using whole word masking, use the following script:
|
||||
```bash
|
||||
python run_mlm_wwm.py \
|
||||
--model_name_or_path roberta-base \
|
||||
--model_name_or_path FacebookAI/roberta-base \
|
||||
--dataset_name wikitext \
|
||||
--dataset_config_name wikitext-2-raw-v1 \
|
||||
--do_train \
|
||||
@@ -83,7 +83,7 @@ export VALIDATION_REF_FILE=/path/to/validation/chinese_ref/file
|
||||
export OUTPUT_DIR=/tmp/test-mlm-wwm
|
||||
|
||||
python run_mlm_wwm.py \
|
||||
--model_name_or_path roberta-base \
|
||||
--model_name_or_path FacebookAI/roberta-base \
|
||||
--train_file $TRAIN_FILE \
|
||||
--validation_file $VALIDATION_FILE \
|
||||
--train_ref_file $TRAIN_REF_FILE \
|
||||
|
||||
@@ -10,7 +10,7 @@ Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformer
|
||||
python run_mmimdb.py \
|
||||
--data_dir /path/to/mmimdb/dataset/ \
|
||||
--model_type bert \
|
||||
--model_name_or_path bert-base-uncased \
|
||||
--model_name_or_path google-bert/bert-base-uncased \
|
||||
--output_dir /path/to/save/dir/ \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
|
||||
@@ -61,7 +61,7 @@ python examples/movement-pruning/masked_run_squad.py \
|
||||
--predict_file dev-v1.1.json \
|
||||
--do_train --do_eval --do_lower_case \
|
||||
--model_type masked_bert \
|
||||
--model_name_or_path bert-base-uncased \
|
||||
--model_name_or_path google-bert/bert-base-uncased \
|
||||
--per_gpu_train_batch_size 16 \
|
||||
--warmup_steps 5400 \
|
||||
--num_train_epochs 10 \
|
||||
@@ -84,7 +84,7 @@ python examples/movement-pruning/masked_run_squad.py \
|
||||
--predict_file dev-v1.1.json \
|
||||
--do_train --do_eval --do_lower_case \
|
||||
--model_type masked_bert \
|
||||
--model_name_or_path bert-base-uncased \
|
||||
--model_name_or_path google-bert/bert-base-uncased \
|
||||
--per_gpu_train_batch_size 16 \
|
||||
--warmup_steps 5400 \
|
||||
--num_train_epochs 10 \
|
||||
@@ -104,7 +104,7 @@ python examples/movement-pruning/masked_run_squad.py \
|
||||
--predict_file dev-v1.1.json \
|
||||
--do_train --do_eval --do_lower_case \
|
||||
--model_type masked_bert \
|
||||
--model_name_or_path bert-base-uncased \
|
||||
--model_name_or_path google-bert/bert-base-uncased \
|
||||
--per_gpu_train_batch_size 16 \
|
||||
--warmup_steps 5400 \
|
||||
--num_train_epochs 10 \
|
||||
@@ -124,7 +124,7 @@ python examples/movement-pruning/masked_run_squad.py \
|
||||
--predict_file dev-v1.1.json \
|
||||
--do_train --do_eval --do_lower_case \
|
||||
--model_type masked_bert \
|
||||
--model_name_or_path bert-base-uncased \
|
||||
--model_name_or_path google-bert/bert-base-uncased \
|
||||
--per_gpu_train_batch_size 16 \
|
||||
--warmup_steps 5400 \
|
||||
--num_train_epochs 10 \
|
||||
|
||||
@@ -10,8 +10,8 @@ Paper authors: Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyo
|
||||
|
||||
## Examples
|
||||
|
||||
`sanity_script.sh` will launch performer fine-tuning from the bert-base-cased checkpoint on the Simple Wikipedia dataset (a small, easy-language English Wikipedia) from `datasets`.
|
||||
`full_script.sh` will launch performer fine-tuning from the bert-large-cased checkpoint on the English Wikipedia dataset from `datasets`.
|
||||
`sanity_script.sh` will launch performer fine-tuning from the google-bert/bert-base-cased checkpoint on the Simple Wikipedia dataset (a small, easy-language English Wikipedia) from `datasets`.
|
||||
`full_script.sh` will launch performer fine-tuning from the google-bert/bert-large-cased checkpoint on the English Wikipedia dataset from `datasets`.
|
||||
|
||||
Here are a few key arguments:
|
||||
- Remove the `--performer` argument to use a standard Bert model.
|
||||
|
||||
@@ -61,7 +61,7 @@ DISCRIMINATOR_MODELS_PARAMS = {
|
||||
"embed_size": 1024,
|
||||
"class_vocab": {"non_clickbait": 0, "clickbait": 1},
|
||||
"default_class": 1,
|
||||
"pretrained_model": "gpt2-medium",
|
||||
"pretrained_model": "openai-community/gpt2-medium",
|
||||
},
|
||||
"sentiment": {
|
||||
"url": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/discriminators/SST_classifier_head.pt",
|
||||
@@ -69,7 +69,7 @@ DISCRIMINATOR_MODELS_PARAMS = {
|
||||
"embed_size": 1024,
|
||||
"class_vocab": {"very_positive": 2, "very_negative": 3},
|
||||
"default_class": 3,
|
||||
"pretrained_model": "gpt2-medium",
|
||||
"pretrained_model": "openai-community/gpt2-medium",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -585,7 +585,7 @@ def set_generic_model_params(discrim_weights, discrim_meta):
|
||||
|
||||
|
||||
def run_pplm_example(
|
||||
pretrained_model="gpt2-medium",
|
||||
pretrained_model="openai-community/gpt2-medium",
|
||||
cond_text="",
|
||||
uncond=False,
|
||||
num_samples=1,
|
||||
@@ -738,7 +738,7 @@ if __name__ == "__main__":
|
||||
"--pretrained_model",
|
||||
"-M",
|
||||
type=str,
|
||||
default="gpt2-medium",
|
||||
default="openai-community/gpt2-medium",
|
||||
help="pretrained model name or path to local checkpoint",
|
||||
)
|
||||
parser.add_argument("--cond_text", type=str, default="The lake", help="Prefix texts to condition on")
|
||||
|
||||
@@ -45,7 +45,7 @@ max_length_seq = 100
|
||||
class Discriminator(nn.Module):
|
||||
"""Transformer encoder followed by a Classification Head"""
|
||||
|
||||
def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"):
|
||||
def __init__(self, class_size, pretrained_model="openai-community/gpt2-medium", cached_mode=False, device="cpu"):
|
||||
super().__init__()
|
||||
self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
|
||||
self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
|
||||
@@ -218,7 +218,7 @@ def get_cached_data_loader(dataset, batch_size, discriminator, shuffle=False, de
|
||||
def train_discriminator(
|
||||
dataset,
|
||||
dataset_fp=None,
|
||||
pretrained_model="gpt2-medium",
|
||||
pretrained_model="openai-community/gpt2-medium",
|
||||
epochs=10,
|
||||
batch_size=64,
|
||||
log_interval=10,
|
||||
@@ -502,7 +502,10 @@ if __name__ == "__main__":
|
||||
help="File path of the dataset to use. Needed only in case of generic datadset",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pretrained_model", type=str, default="gpt2-medium", help="Pretrained model to use as encoder"
|
||||
"--pretrained_model",
|
||||
type=str,
|
||||
default="openai-community/gpt2-medium",
|
||||
help="Pretrained model to use as encoder",
|
||||
)
|
||||
parser.add_argument("--epochs", type=int, default=10, metavar="N", help="Number of training epochs")
|
||||
parser.add_argument(
|
||||
|
||||
@@ -50,11 +50,11 @@ Calibrate the pretrained model and finetune with quantization awared:
|
||||
|
||||
```bash
|
||||
python3 run_quant_qa.py \
|
||||
--model_name_or_path bert-base-uncased \
|
||||
--model_name_or_path google-bert/bert-base-uncased \
|
||||
--dataset_name squad \
|
||||
--max_seq_length 128 \
|
||||
--doc_stride 32 \
|
||||
--output_dir calib/bert-base-uncased \
|
||||
--output_dir calib/google-bert/bert-base-uncased \
|
||||
--do_calib \
|
||||
--calibrator percentile \
|
||||
--percentile 99.99
|
||||
@@ -62,7 +62,7 @@ python3 run_quant_qa.py \
|
||||
|
||||
```bash
|
||||
python3 run_quant_qa.py \
|
||||
--model_name_or_path calib/bert-base-uncased \
|
||||
--model_name_or_path calib/google-bert/bert-base-uncased \
|
||||
--dataset_name squad \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
@@ -71,8 +71,8 @@ python3 run_quant_qa.py \
|
||||
--num_train_epochs 2 \
|
||||
--max_seq_length 128 \
|
||||
--doc_stride 32 \
|
||||
--output_dir finetuned_int8/bert-base-uncased \
|
||||
--tokenizer_name bert-base-uncased \
|
||||
--output_dir finetuned_int8/google-bert/bert-base-uncased \
|
||||
--tokenizer_name google-bert/bert-base-uncased \
|
||||
--save_steps 0
|
||||
```
|
||||
|
||||
@@ -82,14 +82,14 @@ To export the QAT model finetuned above:
|
||||
|
||||
```bash
|
||||
python3 run_quant_qa.py \
|
||||
--model_name_or_path finetuned_int8/bert-base-uncased \
|
||||
--model_name_or_path finetuned_int8/google-bert/bert-base-uncased \
|
||||
--output_dir ./ \
|
||||
--save_onnx \
|
||||
--per_device_eval_batch_size 1 \
|
||||
--max_seq_length 128 \
|
||||
--doc_stride 32 \
|
||||
--dataset_name squad \
|
||||
--tokenizer_name bert-base-uncased
|
||||
--tokenizer_name google-bert/bert-base-uncased
|
||||
```
|
||||
|
||||
Use `--recalibrate-weights` to calibrate the weight ranges according to the quantizer axis. Use `--quant-per-tensor` for per tensor quantization (default is per channel).
|
||||
@@ -117,7 +117,7 @@ python3 evaluate-hf-trt-qa.py \
|
||||
--max_seq_length 128 \
|
||||
--doc_stride 32 \
|
||||
--dataset_name squad \
|
||||
--tokenizer_name bert-base-uncased \
|
||||
--tokenizer_name google-bert/bert-base-uncased \
|
||||
--int8 \
|
||||
--seed 42
|
||||
```
|
||||
@@ -128,14 +128,14 @@ Finetune a fp32 precision model with [transformers/examples/pytorch/question-ans
|
||||
|
||||
```bash
|
||||
python3 ../../pytorch/question-answering/run_qa.py \
|
||||
--model_name_or_path bert-base-uncased \
|
||||
--model_name_or_path google-bert/bert-base-uncased \
|
||||
--dataset_name squad \
|
||||
--per_device_train_batch_size 12 \
|
||||
--learning_rate 3e-5 \
|
||||
--num_train_epochs 2 \
|
||||
--max_seq_length 128 \
|
||||
--doc_stride 32 \
|
||||
--output_dir ./finetuned_fp32/bert-base-uncased \
|
||||
--output_dir ./finetuned_fp32/google-bert/bert-base-uncased \
|
||||
--save_steps 0 \
|
||||
--do_train \
|
||||
--do_eval
|
||||
@@ -147,13 +147,13 @@ python3 ../../pytorch/question-answering/run_qa.py \
|
||||
|
||||
```bash
|
||||
python3 run_quant_qa.py \
|
||||
--model_name_or_path ./finetuned_fp32/bert-base-uncased \
|
||||
--model_name_or_path ./finetuned_fp32/google-bert/bert-base-uncased \
|
||||
--dataset_name squad \
|
||||
--calibrator percentile \
|
||||
--percentile 99.99 \
|
||||
--max_seq_length 128 \
|
||||
--doc_stride 32 \
|
||||
--output_dir ./calib/bert-base-uncased \
|
||||
--output_dir ./calib/google-bert/bert-base-uncased \
|
||||
--save_steps 0 \
|
||||
--do_calib \
|
||||
--do_eval
|
||||
@@ -163,14 +163,14 @@ python3 run_quant_qa.py \
|
||||
|
||||
```bash
|
||||
python3 run_quant_qa.py \
|
||||
--model_name_or_path ./calib/bert-base-uncased \
|
||||
--model_name_or_path ./calib/google-bert/bert-base-uncased \
|
||||
--output_dir ./ \
|
||||
--save_onnx \
|
||||
--per_device_eval_batch_size 1 \
|
||||
--max_seq_length 128 \
|
||||
--doc_stride 32 \
|
||||
--dataset_name squad \
|
||||
--tokenizer_name bert-base-uncased
|
||||
--tokenizer_name google-bert/bert-base-uncased
|
||||
```
|
||||
|
||||
### Evaluate the INT8 PTQ ONNX model inference with TensorRT
|
||||
@@ -183,7 +183,7 @@ python3 evaluate-hf-trt-qa.py \
|
||||
--max_seq_length 128 \
|
||||
--doc_stride 32 \
|
||||
--dataset_name squad \
|
||||
--tokenizer_name bert-base-uncased \
|
||||
--tokenizer_name google-bert/bert-base-uncased \
|
||||
--int8 \
|
||||
--seed 42
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user