[deepspeed] saving checkpoint fallback when fp16 weights aren't saved (#14948)
* [deepspeed] saving checkpoint fallback when fp16 weights aren't saved * Bump required deepspeed version to match usage when saving checkpoints * update version Co-authored-by: Mihai Balint <balint.mihai@gmail.com>
This commit is contained in:
2
setup.py
2
setup.py
@@ -98,7 +98,7 @@ _deps = [
|
|||||||
"cookiecutter==1.7.2",
|
"cookiecutter==1.7.2",
|
||||||
"dataclasses",
|
"dataclasses",
|
||||||
"datasets",
|
"datasets",
|
||||||
"deepspeed>=0.5.7",
|
"deepspeed>=0.5.9",
|
||||||
"fairscale>0.3",
|
"fairscale>0.3",
|
||||||
"faiss-cpu",
|
"faiss-cpu",
|
||||||
"fastapi",
|
"fastapi",
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ deps = {
|
|||||||
"cookiecutter": "cookiecutter==1.7.2",
|
"cookiecutter": "cookiecutter==1.7.2",
|
||||||
"dataclasses": "dataclasses",
|
"dataclasses": "dataclasses",
|
||||||
"datasets": "datasets",
|
"datasets": "datasets",
|
||||||
"deepspeed": "deepspeed>=0.5.7",
|
"deepspeed": "deepspeed>=0.5.9",
|
||||||
"fairscale": "fairscale>0.3",
|
"fairscale": "fairscale>0.3",
|
||||||
"faiss-cpu": "faiss-cpu",
|
"faiss-cpu": "faiss-cpu",
|
||||||
"fastapi": "fastapi",
|
"fastapi": "fastapi",
|
||||||
|
|||||||
@@ -2054,7 +2054,12 @@ class Trainer:
|
|||||||
# now save the real model if stage3_gather_fp16_weights_on_model_save=True
|
# now save the real model if stage3_gather_fp16_weights_on_model_save=True
|
||||||
# if false it will not be saved.
|
# if false it will not be saved.
|
||||||
# This must be called on all ranks
|
# This must be called on all ranks
|
||||||
self.deepspeed.save_fp16_model(output_dir, WEIGHTS_NAME)
|
if not self.deepspeed.save_fp16_model(output_dir, WEIGHTS_NAME):
|
||||||
|
logger.warning(
|
||||||
|
"deepspeed.save_fp16_model didn't save the model, since stage3_gather_fp16_weights_on_model_save=false. "
|
||||||
|
"Saving the full checkpoint instead, use zero_to_fp32.py to recover weights"
|
||||||
|
)
|
||||||
|
self.deepspeed.save_checkpoint(output_dir)
|
||||||
|
|
||||||
elif self.args.should_save:
|
elif self.args.should_save:
|
||||||
self._save(output_dir)
|
self._save(output_dir)
|
||||||
|
|||||||
Reference in New Issue
Block a user