CI: update to ROCm 6.0.2 and test MI300 (#30266)

* update to ROCm 6.0.2 and test MI300

* add callers for mi300

* update dockerfile

* fix trainer tests

* remove apex

* style

* Update tests/trainer/test_trainer_seq2seq.py

* Update tests/trainer/test_trainer_seq2seq.py

* Update tests/trainer/test_trainer_seq2seq.py

* Update tests/trainer/test_trainer_seq2seq.py

* update to torch 2.3

* add workflow dispatch target

* we may need branches: mi300-ci after all

* nit

* fix docker build

* nit

* add check runner

* remove docker-gpu

* fix issues

* fix

---------

Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
fxmarty
2024-05-13 18:14:36 +02:00
committed by GitHub
parent 539ed75d50
commit 37bba2a32d
14 changed files with 170 additions and 81 deletions

View File

@@ -119,6 +119,7 @@ class Seq2seqTrainerTester(TestCasePlus):
warmup_steps=0,
eval_steps=2,
logging_steps=2,
report_to="none",
)
# instantiate trainer
@@ -152,7 +153,7 @@ class Seq2seqTrainerTester(TestCasePlus):
"google-t5/t5-small", max_length=None, min_length=None, max_new_tokens=256, min_new_tokens=1, num_beams=5
)
training_args = Seq2SeqTrainingArguments(".", predict_with_generate=True)
training_args = Seq2SeqTrainingArguments(".", predict_with_generate=True, report_to="none")
trainer = Seq2SeqTrainer(
model=model,
@@ -160,6 +161,7 @@ class Seq2seqTrainerTester(TestCasePlus):
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=lambda x: {"samples": x[0].shape[0]},
report_to="none",
)
def prepare_data(examples):
@@ -191,7 +193,9 @@ class Seq2seqTrainerTester(TestCasePlus):
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt", padding="longest")
gen_config = GenerationConfig(do_sample=False, top_p=0.9) # bad: top_p is not compatible with do_sample=False
training_args = Seq2SeqTrainingArguments(".", predict_with_generate=True, generation_config=gen_config)
training_args = Seq2SeqTrainingArguments(
".", predict_with_generate=True, generation_config=gen_config, report_to="none"
)
with self.assertRaises(ValueError) as exc:
_ = Seq2SeqTrainer(
model=model,