[examples] bump pl=0.9.0 (#7053)
This commit is contained in:
@@ -119,7 +119,7 @@ class BaseTransformer(pl.LightningModule):
|
|||||||
def get_lr_scheduler(self):
|
def get_lr_scheduler(self):
|
||||||
get_schedule_func = arg_to_scheduler[self.hparams.lr_scheduler]
|
get_schedule_func = arg_to_scheduler[self.hparams.lr_scheduler]
|
||||||
scheduler = get_schedule_func(
|
scheduler = get_schedule_func(
|
||||||
self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps
|
self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps()
|
||||||
)
|
)
|
||||||
scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
|
scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
|
||||||
return scheduler
|
return scheduler
|
||||||
@@ -159,19 +159,20 @@ class BaseTransformer(pl.LightningModule):
|
|||||||
def test_epoch_end(self, outputs):
|
def test_epoch_end(self, outputs):
|
||||||
return self.validation_end(outputs)
|
return self.validation_end(outputs)
|
||||||
|
|
||||||
@property
|
|
||||||
def total_steps(self) -> int:
|
def total_steps(self) -> int:
|
||||||
"""The number of total training steps that will be run. Used for lr scheduler purposes."""
|
"""The number of total training steps that will be run. Used for lr scheduler purposes."""
|
||||||
num_devices = max(1, self.hparams.gpus) # TODO: consider num_tpu_cores
|
num_devices = max(1, self.hparams.gpus) # TODO: consider num_tpu_cores
|
||||||
effective_batch_size = self.hparams.train_batch_size * self.hparams.accumulate_grad_batches * num_devices
|
effective_batch_size = self.hparams.train_batch_size * self.hparams.accumulate_grad_batches * num_devices
|
||||||
dataset_size = len(self.train_loader.dataset)
|
return (self.dataset_size / effective_batch_size) * self.hparams.max_epochs
|
||||||
return (dataset_size / effective_batch_size) * self.hparams.max_epochs
|
|
||||||
|
|
||||||
def setup(self, mode):
|
def setup(self, mode):
|
||||||
if mode == "fit":
|
if mode == "test":
|
||||||
|
self.dataset_size = len(self.test_dataloader().dataset)
|
||||||
|
else:
|
||||||
self.train_loader = self.get_dataloader("train", self.hparams.train_batch_size, shuffle=True)
|
self.train_loader = self.get_dataloader("train", self.hparams.train_batch_size, shuffle=True)
|
||||||
|
self.dataset_size = len(self.train_loader.dataset)
|
||||||
|
|
||||||
def get_dataloader(self, type_path, batch_size, shuffle=False):
|
def get_dataloader(self, type_path: str, batch_size: int, shuffle: bool = False):
|
||||||
raise NotImplementedError("You must implement this for your task")
|
raise NotImplementedError("You must implement this for your task")
|
||||||
|
|
||||||
def train_dataloader(self):
|
def train_dataloader(self):
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ psutil
|
|||||||
sacrebleu
|
sacrebleu
|
||||||
rouge-score
|
rouge-score
|
||||||
tensorflow_datasets
|
tensorflow_datasets
|
||||||
pytorch-lightning==0.8.5
|
pytorch-lightning==0.9.0
|
||||||
matplotlib
|
matplotlib
|
||||||
git-python==1.0.3
|
git-python==1.0.3
|
||||||
faiss-cpu
|
faiss-cpu
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ For `bertabs` instructions, see [`bertabs/README.md`](bertabs/README.md).
|
|||||||
- `MBartForConditionalGeneration`
|
- `MBartForConditionalGeneration`
|
||||||
- `FSMTForConditionalGeneration`
|
- `FSMTForConditionalGeneration`
|
||||||
- `T5ForConditionalGeneration`
|
- `T5ForConditionalGeneration`
|
||||||
|
|
||||||
|
|
||||||
## Datasets
|
## Datasets
|
||||||
|
|
||||||
@@ -100,7 +99,7 @@ All finetuning bash scripts call finetune.py (or distillation.py) with reasonabl
|
|||||||
To see all the possible command line options, run:
|
To see all the possible command line options, run:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./finetune.py --help
|
./finetune.py --help
|
||||||
```
|
```
|
||||||
|
|
||||||
### Finetuning Training Params
|
### Finetuning Training Params
|
||||||
@@ -192,7 +191,7 @@ model = AutoModelForSeq2SeqLM.from_pretrained(f'{output_dir}/best_tfmr')
|
|||||||
### Fine-tuning using Seq2SeqTrainer
|
### Fine-tuning using Seq2SeqTrainer
|
||||||
To use `Seq2SeqTrainer` for fine-tuning you should use the `finetune_trainer.py` script. It subclasses `Trainer` to extend it for seq2seq training. Except for the `Trainer`-related `TrainingArguments`, it shares the same argument names as the `finetune.py` file. One notable difference is that calculating generative metrics (BLEU, ROUGE) is optional and is controlled by the `--predict_with_generate` argument; set this argument to calculate BLEU and ROUGE metrics.
|
To use `Seq2SeqTrainer` for fine-tuning you should use the `finetune_trainer.py` script. It subclasses `Trainer` to extend it for seq2seq training. Except for the `Trainer`-related `TrainingArguments`, it shares the same argument names as the `finetune.py` file. One notable difference is that calculating generative metrics (BLEU, ROUGE) is optional and is controlled by the `--predict_with_generate` argument; set this argument to calculate BLEU and ROUGE metrics.
|
||||||
|
|
||||||
With PyTorch 1.6+ it'll automatically use `native AMP` when `--fp16` is set.
|
With PyTorch 1.6+ it'll automatically use `native AMP` when `--fp16` is set.
|
||||||
|
|
||||||
To see all the possible command line options, run:
|
To see all the possible command line options, run:
|
||||||
|
|
||||||
@@ -265,6 +264,7 @@ export DATA_DIR=cnn_dm
|
|||||||
--fp16 \
|
--fp16 \
|
||||||
--bs 32
|
--bs 32
|
||||||
```
|
```
|
||||||
|
|
||||||
### Multi-GPU Evaluation
|
### Multi-GPU Evaluation
|
||||||
Here is a command to run xsum evaluation on 8 GPUs. It is more than linearly faster than run_eval.py in some cases
|
Here is a command to run xsum evaluation on 8 GPUs. It is more than linearly faster than run_eval.py in some cases
|
||||||
because it uses SortishSampler to minimize padding. You can also use it on 1 GPU. `data_dir` must have
|
because it uses SortishSampler to minimize padding. You can also use it on 1 GPU. `data_dir` must have
|
||||||
@@ -391,6 +391,17 @@ runtime: 13H on V-100 16GB GPU.
|
|||||||
pytest examples/seq2seq/
|
pytest examples/seq2seq/
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Converting pytorch-lightning checkpoints
|
||||||
|
pytorch-lightning's `--do_predict` often fails. After you are done training, the best way to evaluate your model is to convert it.
|
||||||
|
|
||||||
|
This should be done for you, with a file called `{save_dir}/best_tfmr`.
|
||||||
|
|
||||||
|
If that file doesn't exist but you have a lightning `.ckpt` file, you can run
|
||||||
|
```bash
|
||||||
|
python convert_pl_checkpoint_to_hf.py PATH_TO_CKPT randomly_initialized_hf_model_path save_dir/best_tfmr
|
||||||
|
```
|
||||||
|
Then run either `run_eval` or `run_distributed_eval` with `save_dir/best_tfmr` (see previous sections).
|
||||||
|
|
||||||
|
|
||||||
## Experimental Features
|
## Experimental Features
|
||||||
These features are harder to use and not always useful.
|
These features are harder to use and not always useful.
|
||||||
@@ -419,4 +430,3 @@ uses 12,723 batches of length 48 and takes slightly more time 9.5 minutes.
|
|||||||
The feature is still experimental, because:
|
The feature is still experimental, because:
|
||||||
+ we can make it much more robust if we have memory mapped/preprocessed datasets.
|
+ we can make it much more robust if we have memory mapped/preprocessed datasets.
|
||||||
+ The speedup over sortish sampler is not that large at the moment.
|
+ The speedup over sortish sampler is not that large at the moment.
|
||||||
|
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from finetune import main as ft_main
|
|||||||
from make_student import create_student_by_copying_alternating_layers, get_layers_to_supervise
|
from make_student import create_student_by_copying_alternating_layers, get_layers_to_supervise
|
||||||
from transformers import AutoModelForSeq2SeqLM, MBartTokenizer, T5ForConditionalGeneration
|
from transformers import AutoModelForSeq2SeqLM, MBartTokenizer, T5ForConditionalGeneration
|
||||||
from transformers.modeling_bart import shift_tokens_right
|
from transformers.modeling_bart import shift_tokens_right
|
||||||
from utils import calculate_bleu, freeze_params, label_smoothed_nll_loss, pickle_load, use_task_specific_params
|
from utils import calculate_bleu, freeze_params, label_smoothed_nll_loss, use_task_specific_params
|
||||||
|
|
||||||
|
|
||||||
# need the parent dir module
|
# need the parent dir module
|
||||||
@@ -264,30 +264,6 @@ def create_module(args):
|
|||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def evaluate_checkpoint(ckpt_path: Path, dest_dir=None):
|
|
||||||
# TODO(SS): DELETE? Better to convert_pl_ckpt_to_hf and run_eval.py
|
|
||||||
exp_dir = ckpt_path.parent
|
|
||||||
if dest_dir is None:
|
|
||||||
dest_dir = exp_dir
|
|
||||||
clash = list(dest_dir.glob("test_generations*"))
|
|
||||||
if clash:
|
|
||||||
print(f"SKIPPING to avoid overwriting {clash}")
|
|
||||||
ckpt = torch.load(ckpt_path, map_location="cpu")
|
|
||||||
if "hparams" in ckpt:
|
|
||||||
args = argparse.Namespace(**ckpt["hparams"])
|
|
||||||
else:
|
|
||||||
args = argparse.Namespace(**pickle_load(exp_dir / "hparams.pkl"))
|
|
||||||
args.resume_from_checkpoint = str(ckpt_path)
|
|
||||||
args.do_train = False
|
|
||||||
args.output_dir = str(dest_dir)
|
|
||||||
args.n_gpu = 1
|
|
||||||
args.eval_batch_size = 16
|
|
||||||
Path(args.output_dir).mkdir(exist_ok=True)
|
|
||||||
model = create_module(args)
|
|
||||||
trainer: pl.Trainer = generic_train(model, args, early_stopping_callback=False)
|
|
||||||
trainer.test(model)
|
|
||||||
|
|
||||||
|
|
||||||
def distill_main(args):
|
def distill_main(args):
|
||||||
Path(args.output_dir).mkdir(exist_ok=True)
|
Path(args.output_dir).mkdir(exist_ok=True)
|
||||||
if len(os.listdir(args.output_dir)) > 3 and args.do_train:
|
if len(os.listdir(args.output_dir)) > 3 and args.do_train:
|
||||||
|
|||||||
@@ -181,6 +181,7 @@ class SummarizationModule(BaseTransformer):
|
|||||||
return self._generative_step(batch)
|
return self._generative_step(batch)
|
||||||
|
|
||||||
def validation_epoch_end(self, outputs, prefix="val") -> Dict:
|
def validation_epoch_end(self, outputs, prefix="val") -> Dict:
|
||||||
|
|
||||||
self.step_count += 1
|
self.step_count += 1
|
||||||
losses = {k: torch.stack([x[k] for x in outputs]).mean() for k in self.loss_names}
|
losses = {k: torch.stack([x[k] for x in outputs]).mean() for k in self.loss_names}
|
||||||
loss = losses["loss"]
|
loss = losses["loss"]
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ import torch
|
|||||||
|
|
||||||
import lightning_base
|
import lightning_base
|
||||||
from convert_pl_checkpoint_to_hf import convert_pl_to_hf
|
from convert_pl_checkpoint_to_hf import convert_pl_to_hf
|
||||||
from distillation import distill_main, evaluate_checkpoint
|
from distillation import distill_main
|
||||||
from finetune import SummarizationModule, main
|
from finetune import SummarizationModule, main
|
||||||
from run_eval import generate_summaries_or_translations, run_generate
|
from run_eval import generate_summaries_or_translations, run_generate
|
||||||
from run_eval_search import run_search
|
from run_eval_search import run_search
|
||||||
@@ -178,7 +178,6 @@ class TestSummarizationDistiller(unittest.TestCase):
|
|||||||
generate_summaries_or_translations(examples, out_path, str(model.output_dir / "best_tfmr"))
|
generate_summaries_or_translations(examples, out_path, str(model.output_dir / "best_tfmr"))
|
||||||
self.assertTrue(Path(out_path).exists())
|
self.assertTrue(Path(out_path).exists())
|
||||||
|
|
||||||
evaluate_checkpoint(ckpts[0], dest_dir=Path(tempfile.mkdtemp()))
|
|
||||||
out_path_new = tempfile.mkdtemp()
|
out_path_new = tempfile.mkdtemp()
|
||||||
convert_pl_to_hf(ckpts[0], transformer_ckpts[0].parent, out_path_new)
|
convert_pl_to_hf(ckpts[0], transformer_ckpts[0].parent, out_path_new)
|
||||||
assert os.path.exists(os.path.join(out_path_new, "pytorch_model.bin"))
|
assert os.path.exists(os.path.join(out_path_new, "pytorch_model.bin"))
|
||||||
@@ -227,8 +226,6 @@ class TestSummarizationDistiller(unittest.TestCase):
|
|||||||
assert len(all_files) > 2
|
assert len(all_files) > 2
|
||||||
self.assertEqual(len(transformer_ckpts), 2)
|
self.assertEqual(len(transformer_ckpts), 2)
|
||||||
|
|
||||||
evaluate_checkpoint(ckpts[0], dest_dir=Path(tempfile.mkdtemp()))
|
|
||||||
|
|
||||||
def test_distill_t5(self):
|
def test_distill_t5(self):
|
||||||
updates = dict(
|
updates = dict(
|
||||||
student_encoder_layers=1,
|
student_encoder_layers=1,
|
||||||
|
|||||||
@@ -116,8 +116,8 @@ class ExamplesTests(TestCasePlus):
|
|||||||
testargs.append("--fp16")
|
testargs.append("--fp16")
|
||||||
|
|
||||||
with patch.object(sys, "argv", testargs):
|
with patch.object(sys, "argv", testargs):
|
||||||
result = run_pl_glue.main()
|
result = run_pl_glue.main()[0]
|
||||||
# for now just testing that the script can run to a completion
|
# for now just testing that the script can run to completion
|
||||||
self.assertGreater(result["acc"], 0.25)
|
self.assertGreater(result["acc"], 0.25)
|
||||||
#
|
#
|
||||||
# TODO: this fails on CI - doesn't get acc/f1>=0.75:
|
# TODO: this fails on CI - doesn't get acc/f1>=0.75:
|
||||||
|
|||||||
Reference in New Issue
Block a user