🚨 Fully revert atomic checkpointing 🚨 (#29370)
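Fully revert atomic checkpointing, including the tests that covered it: `test_save_checkpoints_is_atomic` (with its `get_last_checkpoint` import) and the temporary-checkpoint-directory rotation check (with its `pathlib.Path` import).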
This commit is contained in:
@@ -84,8 +84,7 @@ from transformers.testing_utils import (
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
|
||||
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, HPSearchBackend, get_last_checkpoint
|
||||
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, HPSearchBackend
|
||||
from transformers.training_args import OptimizerNames
|
||||
from transformers.utils import (
|
||||
SAFE_WEIGHTS_INDEX_NAME,
|
||||
@@ -1406,19 +1405,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
trainer.train()
|
||||
self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False)
|
||||
|
||||
def test_save_checkpoints_is_atomic(self):
|
||||
class UnsaveableTokenizer(PreTrainedTokenizerBase):
|
||||
def save_pretrained(self, *args, **kwargs):
|
||||
raise OSError("simulated file write error")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5)
|
||||
# Attach unsaveable tokenizer to partially fail checkpointing
|
||||
trainer.tokenizer = UnsaveableTokenizer()
|
||||
with self.assertRaises(OSError) as _context:
|
||||
trainer.train()
|
||||
assert get_last_checkpoint(tmpdir) is None
|
||||
|
||||
@require_safetensors
|
||||
def test_safe_checkpoints(self):
|
||||
for save_safetensors in [True, False]:
|
||||
|
||||
@@ -12,7 +12,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
import numpy as np
|
||||
@@ -237,20 +236,6 @@ if __name__ == "__main__":
|
||||
|
||||
trainer.args.eval_accumulation_steps = None
|
||||
|
||||
# Check that saving does indeed work with temp dir rotation
|
||||
# If this fails, will see a FileNotFoundError
|
||||
model = RegressionModel()
|
||||
training_args.max_steps = 1
|
||||
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
|
||||
sched = torch.optim.lr_scheduler.LambdaLR(opt, lambda x: 1)
|
||||
trainer = Trainer(
|
||||
model, training_args, optimizers=(opt, sched), data_collator=DummyDataCollator(), eval_dataset=dataset
|
||||
)
|
||||
trainer._save_checkpoint(model=None, trial=None)
|
||||
# Check that the temp folder does not exist
|
||||
assert not (Path(training_args.output_dir) / "tmp-checkpoint-0").exists()
|
||||
assert (Path(training_args.output_dir) / "checkpoint-0").exists()
|
||||
|
||||
# Check that `dispatch_batches=False` will work on a finite iterable dataset
|
||||
|
||||
train_dataset = FiniteIterableDataset(label_names=["labels", "extra"], length=1)
|
||||
|
||||
Reference in New Issue
Block a user