Fix bug with rotating checkpoints (#28009)

* Fix bug

* Write test

* Keep back old modification for grad accum steps

* Whitespace...

* Whitespace again

* Race condition

* Wait for everyone
This commit is contained in:
Zach Mueller
2023-12-13 12:17:30 -05:00
committed by GitHub
parent ec43d6870a
commit 93766251cb
2 changed files with 21 additions and 1 deletions

View File

@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from typing import Dict
import numpy as np
@@ -236,6 +237,20 @@ if __name__ == "__main__":
trainer.args.eval_accumulation_steps = None
# Check that saving does indeed work with temp dir rotation
# If this fails, will see a FileNotFoundError
model = RegressionModel()
training_args.max_steps = 1
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
sched = torch.optim.lr_scheduler.LambdaLR(opt, lambda x: 1)
trainer = Trainer(
model, training_args, optimizers=(opt, sched), data_collator=DummyDataCollator(), eval_dataset=dataset
)
trainer._save_checkpoint(model=None, trial=None)
# Check that the temp folder does not exist
assert not (Path(training_args.output_dir) / "tmp-checkpoint-0").exists()
assert (Path(training_args.output_dir) / "checkpoint-0").exists()
# Check that `dispatch_batches=False` will work on a finite iterable dataset
train_dataset = FiniteIterableDataset(label_names=["labels", "extra"], length=1)