From 2d70c912067ee28cbf35d333cda496120dae9fee Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 28 Jun 2021 19:23:35 +0100 Subject: [PATCH] [Flax] Adapt flax examples to include `push_to_hub` (#12391) * fix_torch_device_generate_test * remove @ * finish * correct summary writer * correct push to hub * fix indent * finish * finish * finish * finish * finish Co-authored-by: Patrick von Platen --- examples/flax/language-modeling/README.md | 79 +++++++++++++++---- .../flax/language-modeling/run_clm_flax.py | 15 ++-- .../flax/language-modeling/run_mlm_flax.py | 19 +++-- .../summarization/run_summarization_flax.py | 15 ++-- examples/flax/text-classification/README.md | 55 ++++++++++--- .../flax/text-classification/run_flax_glue.py | 18 ++++- 6 files changed, 153 insertions(+), 48 deletions(-) mode change 100644 => 100755 examples/flax/language-modeling/run_clm_flax.py diff --git a/examples/flax/language-modeling/README.md b/examples/flax/language-modeling/README.md index cd0c499ffe..e7a44c9d85 100644 --- a/examples/flax/language-modeling/README.md +++ b/examples/flax/language-modeling/README.md @@ -33,11 +33,37 @@ in Norwegian on a single TPUv3-8 pod. The example script uses the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets. -Let's start by creating a folder to save the trained model and a symbolic link to the `run_mlm_flax.py` script. +Let's start by creating a model repository to save the trained model and logs. +Here we call the model `"norwegian-roberta-base"`, but you can change the model name as you like. + +You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that +you are logged in) or via the command line: + +``` +huggingface-cli repo create norwegian-roberta-base +``` + +Next we clone the model repository to add the tokenizer and model files. + +``` +git clone https://huggingface.co//norwegian-roberta-base +``` + +To ensure that all tensorboard traces will be uploaded correctly, we need to +track them. You can run the following command inside your model repo to do so. + +``` +cd norwegian-roberta-base +git lfs track "*tfevents*" +``` + +Great, we have set up our model repository. During training, we will automatically +push the training logs and model weights to the repo. + +Next, let's add a symbolic link to the `run_mlm_flax.py`. ```bash export MODEL_DIR="./norwegian-roberta-base" -mkdir -p ${MODEL_DIR} ln -s ~/transformers/examples/flax/language-modeling/run_mlm_flax.py run_mlm_flax.py ``` @@ -98,7 +124,7 @@ Next we can run the example script to pretrain the model: ```bash ./run_mlm_flax.py \ - --output_dir="./runs" \ + --output_dir="${MODEL_DIR}" \ --model_type="roberta" \ --config_name="${MODEL_DIR}" \ --tokenizer_name="${MODEL_DIR}" \ @@ -114,7 +140,8 @@ Next we can run the example script to pretrain the model: --pad_to_max_length \ --num_train_epochs="18" \ --adam_beta1="0.9" \ - --adam_beta2="0.98" + --adam_beta2="0.98" \ + --push_to_hub ``` Training should converge at a loss and accuracy @@ -135,11 +162,37 @@ in Norwegian on a single TPUv3-8 pod. The example script uses the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets. -Let's start by creating a folder to save the trained model and a symbolic link to the `run_clm_flax.py` script. +Let's start by creating a model repository to save the trained model and logs. +Here we call the model `"norwegian-gpt2"`, but you can change the model name as you like. + +You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that +you are logged in) or via the command line: + +``` +huggingface-cli repo create norwegian-gpt2 +``` + +Next we clone the model repository to add the tokenizer and model files. + +``` +git clone https://huggingface.co//norwegian-gpt2 +``` + +To ensure that all tensorboard traces will be uploaded correctly, we need to +track them. You can run the following command inside your model repo to do so. + +``` +cd norwegian-gpt2 +git lfs track "*tfevents*" +``` + +Great, we have set up our model repository. During training, we will automatically +push the training logs and model weights to the repo. + +Next, let's add a symbolic link to the `run_clm_flax.py`. ```bash export MODEL_DIR="./norwegian-gpt2" -mkdir -p ${MODEL_DIR} ln -s ~/transformers/examples/flax/language-modeling/run_clm_flax.py run_clm_flax.py ``` @@ -166,7 +219,7 @@ Next we can run the example script to pretrain the model: ```bash ./run_clm_flax.py \ - --output_dir="./runs" \ + --output_dir="${MODEL_DIR}" \ --model_type="gpt2" \ --config_name="${MODEL_DIR}" \ --tokenizer_name="${MODEL_DIR}" \ @@ -180,6 +233,7 @@ Next we can run the example script to pretrain the model: --adam_beta1="0.9" --adam_beta2="0.98" --weight_decay="0.01" \ --overwrite_output_dir \ --num_train_epochs="20" \ + --push_to_hub ``` Training should converge at a loss and perplexity @@ -197,14 +251,9 @@ For reproducibility, we state the training commands used for PyTorch/XLA and PyT | Task | [TPU v3-8 (Flax)](https://tensorboard.dev/experiment/GdYmdak2TWeVz0DDRYOrrg/) | [TPU v3-8 (Pytorch/XLA)](https://tensorboard.dev/experiment/7Jq1kcQQRAmy12KOdXek7A/)| [8 GPU (PyTorch)](https://tensorboard.dev/experiment/PJneV8FQRxa2unPw1QnVHA) | |-------|-----------|------------|------------| | MLM | 15h32m | 23h46m | 44h14m | -| **COST*** | $124.24 | $187.84 | $877.92 | -*All experiments are ran on Google Cloud Platform. Prices are on-demand prices -(not preemptible), obtained on May 12, 2021 for zone Iowa (us-central1) using -the following tables: -[TPU pricing table](https://cloud.google.com/tpu/pricing) ($8.00/h for v3-8), -[GPU pricing table](https://cloud.google.com/compute/gpus-pricing) ($2.48/h per -V100 GPU). GPU experiments are ran without further optimizations besides JAX +*All experiments are ran on Google Cloud Platform. +GPU experiments are ran without further optimizations besides JAX transformations. GPU experiments are ran with full precision (fp32). "TPU v3-8" are 8 TPU cores on 4 chips (each chips has 2 cores), while "8 GPU" are 8 GPU chips. @@ -281,7 +330,7 @@ mkdir -p ${MODEL_DIR} ```bash python3 -m torch.distributed.launch --nproc_per_node ${NUM_GPUS} run_mlm.py \ - --output_dir="./runs" \ + --output_dir="${MODEL_DIR}" \ --model_type="roberta" \ --config_name="${MODEL_DIR}" \ --tokenizer_name="${MODEL_DIR}" \ diff --git a/examples/flax/language-modeling/run_clm_flax.py b/examples/flax/language-modeling/run_clm_flax.py old mode 100644 new mode 100755 index ace918ec48..c313ad0b3a --- a/examples/flax/language-modeling/run_clm_flax.py +++ b/examples/flax/language-modeling/run_clm_flax.py @@ -451,7 +451,7 @@ def main(): # Enable tensorboard only on the master node if has_tensorboard and jax.process_index() == 0: - summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir).joinpath("logs").as_posix()) + summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir)) # Initialize our training rng = jax.random.PRNGKey(training_args.seed) @@ -604,10 +604,15 @@ def main(): cur_step = epoch * (len(train_dataset) // train_batch_size) write_metric(summary_writer, train_metrics, eval_metrics, train_time, cur_step) - # save last checkpoint - if jax.process_index() == 0: - params = jax.device_get(unreplicate(state.params)) - model.save_pretrained(training_args.output_dir, params=params) + # save checkpoint after each epoch and push checkpoint to the hub + if jax.process_index() == 0: + params = jax.device_get(unreplicate(state.params)) + model.save_pretrained( + training_args.output_dir, + params=params, + push_to_hub=training_args.push_to_hub, + commit_message=f"Saving weights and logs of epoch {epoch+1}", + ) if __name__ == "__main__": diff --git a/examples/flax/language-modeling/run_mlm_flax.py b/examples/flax/language-modeling/run_mlm_flax.py index 8810468fb6..32a9d09ca4 100755 --- a/examples/flax/language-modeling/run_mlm_flax.py +++ b/examples/flax/language-modeling/run_mlm_flax.py @@ -269,7 +269,7 @@ def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndar return batch_idx -def write_metric(train_metrics, eval_metrics, train_time, step): +def write_metric(summary_writer, train_metrics, eval_metrics, train_time, step): summary_writer.scalar("train_time", train_time, step) train_metrics = get_metrics(train_metrics) @@ -472,7 +472,7 @@ if __name__ == "__main__": # Enable tensorboard only on the master node if has_tensorboard and jax.process_index() == 0: - summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir).joinpath("logs").as_posix()) + summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir)) # Data collator # This one will take care of randomly masking the tokens. @@ -642,9 +642,14 @@ if __name__ == "__main__": # Save metrics if has_tensorboard and jax.process_index() == 0: cur_step = epoch * (len(tokenized_datasets["train"]) // train_batch_size) - write_metric(train_metrics, eval_metrics, train_time, cur_step) + write_metric(summary_writer, train_metrics, eval_metrics, train_time, cur_step) - # save last checkpoint - if jax.process_index() == 0: - params = jax.device_get(jax.tree_map(lambda x: x[0], state.params)) - model.save_pretrained(training_args.output_dir, params=params) + # save checkpoint after each epoch and push checkpoint to the hub + if jax.process_index() == 0: + params = jax.device_get(jax.tree_map(lambda x: x[0], state.params)) + model.save_pretrained( + training_args.output_dir, + params=params, + push_to_hub=training_args.push_to_hub, + commit_message=f"Saving weights and logs of epoch {epoch+1}", + ) diff --git a/examples/flax/summarization/run_summarization_flax.py b/examples/flax/summarization/run_summarization_flax.py index e8c683c5ff..3abefc1d1e 100644 --- a/examples/flax/summarization/run_summarization_flax.py +++ b/examples/flax/summarization/run_summarization_flax.py @@ -542,7 +542,7 @@ def main(): try: from flax.metrics.tensorboard import SummaryWriter - summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir).joinpath("logs").as_posix()) + summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir)) except ImportError as ie: has_tensorboard = False logger.warning( @@ -787,10 +787,15 @@ def main(): desc = f"Predict Loss: {pred_metrics['loss']} | {rouge_desc})" logger.info(desc) - # save last checkpoint - if jax.process_index() == 0: - params = jax.device_get(unreplicate(state.params)) - model.save_pretrained(training_args.output_dir, params=params) + # save checkpoint after each epoch and push checkpoint to the hub + if jax.process_index() == 0: + params = jax.device_get(jax.tree_map(lambda x: x[0], state.params)) + model.save_pretrained( + training_args.output_dir, + params=params, + push_to_hub=training_args.push_to_hub, + commit_message=f"Saving weights and logs of epoch {epoch+1}", + ) if __name__ == "__main__": diff --git a/examples/flax/text-classification/README.md b/examples/flax/text-classification/README.md index 45f17f5518..cb2c27d141 100644 --- a/examples/flax/text-classification/README.md +++ b/examples/flax/text-classification/README.md @@ -23,31 +23,68 @@ Based on the script [`run_flax_glue.py`](https://github.com/huggingface/transfor Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding Evaluation](https://gluebenchmark.com/). This script can fine-tune any of the models on the [hub](https://huggingface.co/models). -GLUE is made up of a total of 9 different tasks. Here is how to run the script on one of them: +To begin with it is recommended to create a model repository to save the trained model and logs. +Here we call the model `"bert-glue-mrpc-test"`, but you can change the model name as you like. + +You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that +you are logged in) or via the command line: + +``` +huggingface-cli repo create bert-glue-mrpc-test +``` + +Next we clone the model repository to add the tokenizer and model files. + +``` +git clone https://huggingface.co//bert-glue-mrpc-test +``` + +To ensure that all tensorboard traces will be uploaded correctly, we need to +track them. You can run the following command inside your model repo to do so. + +``` +cd bert-glue-mrpc-test +git lfs track "*tfevents*" +``` + +Great, we have set up our model repository. During training, we will automatically +push the training logs and model weights to the repo. + +Next, let's add a symbolic link to the `run_flax_glue.py`. ```bash export TASK_NAME=mrpc +export MODEL_DIR="./bert-glue-mrpc-test" +ln -s ~/transformers/examples/flax/text-classification/run_flax_glue.py run_flax_glue.py +``` + +GLUE is made up of a total of 9 different tasks. Here is how to run the script on one of them: + +```bash python run_flax_glue.py \ --model_name_or_path bert-base-cased \ - --task_name $TASK_NAME \ + --task_name ${TASK_NAME} \ --max_length 128 \ --learning_rate 2e-5 \ --num_train_epochs 3 \ --per_device_train_batch_size 4 \ - --output_dir /tmp/$TASK_NAME/ + --output_dir ${MODEL_DIR} \ + --push_to_hub ``` where task name can be one of cola, mnli, mnli-mm, mrpc, qnli, qqp, rte, sst2, stsb, wnli. Using the command above, the script will train for 3 epochs and run eval after each epoch. -Metrics and hyperparameters are stored in Tensorflow event files in `---output_dir`. +Metrics and hyperparameters are stored in Tensorflow event files in `--output_dir`. You can see the results by running `tensorboard` in that directory: ```bash $ tensorboard --logdir . ``` +or directly on the hub under *Training metrics*. + ### Accuracy Evaluation We train five replicas and report mean accuracy and stdev on the dev set below. @@ -95,14 +132,8 @@ overall training time below. For comparison we ran Pytorch's [run_glue.py](https | WNLI | 1m 11s | 48s | 39s | 36s | |-------| | **TOTAL** | 1h 03m | 1h 28m | 5h 16m | 6h 37m | -| **COST*** | $8.56 | $29.10 | $13.06 | $16.41 | - -*All experiments are ran on Google Cloud Platform. Prices are on-demand prices -(not preemptible), obtained on May 12, 2021 for zone Iowa (us-central1) using -the following tables: -[TPU pricing table](https://cloud.google.com/tpu/pricing) ($8.00/h for v3-8), -[GPU pricing table](https://cloud.google.com/compute/gpus-pricing) ($2.48/h per -V100 GPU). GPU experiments are ran without further optimizations besides JAX +*All experiments are ran on Google Cloud Platform. +GPU experiments are ran without further optimizations besides JAX transformations. GPU experiments are ran with full precision (fp32). "TPU v3-8" are 8 TPU cores on 4 chips (each chips has 2 cores), while "8 GPU" are 8 GPU chips. diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index edb13a6a40..6a12a855be 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -123,6 +123,11 @@ def parse_args(): ) parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") parser.add_argument("--seed", type=int, default=3, help="A seed for reproducible training.") + parser.add_argument( + "--push_to_hub", + action="store_true", + help="If passed, model checkpoints and tensorboard logs will be pushed to the hub", + ) args = parser.parse_args() # Sanity checks @@ -491,10 +496,15 @@ def main(): cur_step = epoch * (len(train_dataset) // train_batch_size) write_metric(train_metrics, eval_metric, train_time, cur_step) - # save last checkpoint - if jax.process_index() == 0: - params = jax.device_get(jax.tree_map(lambda x: x[0], state.params)) - model.save_pretrained(args.output_dir, params=params) + # save checkpoint after each epoch and push checkpoint to the hub + if jax.process_index() == 0: + params = jax.device_get(jax.tree_map(lambda x: x[0], state.params)) + model.save_pretrained( + args.output_dir, + params=params, + push_to_hub=args.push_to_hub, + commit_message=f"Saving weights and logs of epoch {epoch}", + ) if __name__ == "__main__":