diff --git a/examples/lightning_base.py b/examples/lightning_base.py index c66884d787..fe37967680 100644 --- a/examples/lightning_base.py +++ b/examples/lightning_base.py @@ -119,7 +119,7 @@ class BaseTransformer(pl.LightningModule): def get_lr_scheduler(self): get_schedule_func = arg_to_scheduler[self.hparams.lr_scheduler] scheduler = get_schedule_func( - self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps + self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps() ) scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1} return scheduler @@ -159,19 +159,20 @@ class BaseTransformer(pl.LightningModule): def test_epoch_end(self, outputs): return self.validation_end(outputs) - @property def total_steps(self) -> int: """The number of total training steps that will be run. Used for lr scheduler purposes.""" num_devices = max(1, self.hparams.gpus) # TODO: consider num_tpu_cores effective_batch_size = self.hparams.train_batch_size * self.hparams.accumulate_grad_batches * num_devices - dataset_size = len(self.train_loader.dataset) - return (dataset_size / effective_batch_size) * self.hparams.max_epochs + return (self.dataset_size / effective_batch_size) * self.hparams.max_epochs def setup(self, mode): - if mode == "fit": + if mode == "test": + self.dataset_size = len(self.test_dataloader().dataset) + else: self.train_loader = self.get_dataloader("train", self.hparams.train_batch_size, shuffle=True) + self.dataset_size = len(self.train_loader.dataset) - def get_dataloader(self, type_path, batch_size, shuffle=False): + def get_dataloader(self, type_path: str, batch_size: int, shuffle: bool = False): raise NotImplementedError("You must implement this for your task") def train_dataloader(self): diff --git a/examples/requirements.txt b/examples/requirements.txt index f080459723..41bb6c852a 100644 --- a/examples/requirements.txt +++ b/examples/requirements.txt @@ -5,7 +5,7 @@ psutil 
sacrebleu rouge-score tensorflow_datasets -pytorch-lightning==0.8.5 +pytorch-lightning==0.9.0 matplotlib git-python==1.0.3 faiss-cpu diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md index cc411da7a2..f9081a0601 100644 --- a/examples/seq2seq/README.md +++ b/examples/seq2seq/README.md @@ -12,7 +12,6 @@ For `bertabs` instructions, see [`bertabs/README.md`](bertabs/README.md). - `MBartForConditionalGeneration` - `FSMTForConditionalGeneration` - `T5ForConditionalGeneration` - ## Datasets @@ -100,7 +99,7 @@ All finetuning bash scripts call finetune.py (or distillation.py) with reasonabl To see all the possible command line options, run: ```bash - ./finetune.py --help +./finetune.py --help ``` ### Finetuning Training Params @@ -192,7 +191,7 @@ model = AutoModelForSeq2SeqLM.from_pretrained(f'{output_dir}/best_tfmr') ### Fine-tuning using Seq2SeqTrainer To use `Seq2SeqTrainer` for fine-tuning you should use the `finetune_trainer.py` script. It subclasses `Trainer` to extend it for seq2seq training. Except the `Trainer` releated `TrainingArguments`, it shares the same argument names as that of `finetune.py` file. One notable difference is that, calculating generative metrics (BLEU, ROUGE) is optional and is controlled using the `--predict_with_generate` argument, set this argument to calculate BLEU and ROUGE metrics. -With PyTorch 1.6+ it'll automatically use `native AMP` when `--fp16` is set. +With PyTorch 1.6+ it'll automatically use `native AMP` when `--fp16` is set. To see all the possible command line options, run: @@ -265,6 +264,7 @@ export DATA_DIR=cnn_dm --fp16 \ --bs 32 ``` + ### Multi-GPU Evaluation here is a command to run xsum evaluation on 8 GPUS. It is more than linearly faster than run_eval.py in some cases because it uses SortishSampler to minimize padding. You can also use it on 1 GPU. `data_dir` must have @@ -391,6 +391,17 @@ runtime: 13H on V-100 16GB GPU. 
pytest examples/seq2seq/ ``` +### Converting pytorch-lightning checkpoints +pytorch lightning ``-do_predict`` often fails; after you are done training, the best way to evaluate your model is to convert it. + +This should be done for you, with a file called `{save_dir}/best_tfmr`. + +If that file doesn't exist but you have a lightning `.ckpt` file, you can run +```bash +python convert_pl_checkpoint_to_hf.py PATH_TO_CKPT randomly_initialized_hf_model_path save_dir/best_tfmr +``` +Then run either `run_eval` or `run_distributed_eval` with `save_dir/best_tfmr` (see previous sections). + ## Experimental Features These features are harder to use and not always useful. @@ -419,4 +430,3 @@ uses 12,723 batches of length 48 and takes slightly more time 9.5 minutes. The feature is still experimental, because: + we can make it much more robust if we have memory mapped/preprocessed datasets. + The speedup over sortish sampler is not that large at the moment. - diff --git a/examples/seq2seq/distillation.py b/examples/seq2seq/distillation.py index dc44a7322d..8ccaabca3b 100755 --- a/examples/seq2seq/distillation.py +++ b/examples/seq2seq/distillation.py @@ -17,7 +17,7 @@ from finetune import main as ft_main from make_student import create_student_by_copying_alternating_layers, get_layers_to_supervise from transformers import AutoModelForSeq2SeqLM, MBartTokenizer, T5ForConditionalGeneration from transformers.modeling_bart import shift_tokens_right -from utils import calculate_bleu, freeze_params, label_smoothed_nll_loss, pickle_load, use_task_specific_params +from utils import calculate_bleu, freeze_params, label_smoothed_nll_loss, use_task_specific_params # need the parent dir module @@ -264,30 +264,6 @@ def create_module(args): return model -def evaluate_checkpoint(ckpt_path: Path, dest_dir=None): - # TODO(SS): DELETE? 
Better to convert_pl_ckpt_to_hf and run_eval.py - exp_dir = ckpt_path.parent - if dest_dir is None: - dest_dir = exp_dir - clash = list(dest_dir.glob("test_generations*")) - if clash: - print(f"SKIPPING to avoid overwriting {clash}") - ckpt = torch.load(ckpt_path, map_location="cpu") - if "hparams" in ckpt: - args = argparse.Namespace(**ckpt["hparams"]) - else: - args = argparse.Namespace(**pickle_load(exp_dir / "hparams.pkl")) - args.resume_from_checkpoint = str(ckpt_path) - args.do_train = False - args.output_dir = str(dest_dir) - args.n_gpu = 1 - args.eval_batch_size = 16 - Path(args.output_dir).mkdir(exist_ok=True) - model = create_module(args) - trainer: pl.Trainer = generic_train(model, args, early_stopping_callback=False) - trainer.test(model) - - def distill_main(args): Path(args.output_dir).mkdir(exist_ok=True) if len(os.listdir(args.output_dir)) > 3 and args.do_train: diff --git a/examples/seq2seq/finetune.py b/examples/seq2seq/finetune.py index b401add5cf..d50a45740a 100755 --- a/examples/seq2seq/finetune.py +++ b/examples/seq2seq/finetune.py @@ -181,6 +181,7 @@ class SummarizationModule(BaseTransformer): return self._generative_step(batch) def validation_epoch_end(self, outputs, prefix="val") -> Dict: + self.step_count += 1 losses = {k: torch.stack([x[k] for x in outputs]).mean() for k in self.loss_names} loss = losses["loss"] diff --git a/examples/seq2seq/test_seq2seq_examples.py b/examples/seq2seq/test_seq2seq_examples.py index e28acc3131..c95b06f604 100644 --- a/examples/seq2seq/test_seq2seq_examples.py +++ b/examples/seq2seq/test_seq2seq_examples.py @@ -13,7 +13,7 @@ import torch import lightning_base from convert_pl_checkpoint_to_hf import convert_pl_to_hf -from distillation import distill_main, evaluate_checkpoint +from distillation import distill_main from finetune import SummarizationModule, main from run_eval import generate_summaries_or_translations, run_generate from run_eval_search import run_search @@ -178,7 +178,6 @@ class 
TestSummarizationDistiller(unittest.TestCase): generate_summaries_or_translations(examples, out_path, str(model.output_dir / "best_tfmr")) self.assertTrue(Path(out_path).exists()) - evaluate_checkpoint(ckpts[0], dest_dir=Path(tempfile.mkdtemp())) out_path_new = tempfile.mkdtemp() convert_pl_to_hf(ckpts[0], transformer_ckpts[0].parent, out_path_new) assert os.path.exists(os.path.join(out_path_new, "pytorch_model.bin")) @@ -227,8 +226,6 @@ class TestSummarizationDistiller(unittest.TestCase): assert len(all_files) > 2 self.assertEqual(len(transformer_ckpts), 2) - evaluate_checkpoint(ckpts[0], dest_dir=Path(tempfile.mkdtemp())) - def test_distill_t5(self): updates = dict( student_encoder_layers=1, diff --git a/examples/test_examples.py b/examples/test_examples.py index ae32fc9caa..c47f5277a0 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -116,8 +116,8 @@ class ExamplesTests(TestCasePlus): testargs.append("--fp16") with patch.object(sys, "argv", testargs): - result = run_pl_glue.main() - # for now just testing that the script can run to a completion + result = run_pl_glue.main()[0] + # for now just testing that the script can run to completion self.assertGreater(result["acc"], 0.25) # # TODO: this fails on CI - doesn't get acc/f1>=0.75: