TP initialization module-by-module (#35996)

* module-by-module loading!

* Update modeling_utils.py

* dtyle and comments

* Update modeling_utils.py

* Update modeling_utils.py

* Update test

* Update modeling_utils.py

* Update modeling_utils.py

* Update test_tp.py

* Update test_tp.py

* Update modeling_utils.py

* re-trigger CIs

* re-trigger CIs
This commit is contained in:
Cyril Vallez
2025-02-19 14:04:57 +01:00
committed by GitHub
parent 0863eef248
commit 60226c6ff3
2 changed files with 78 additions and 29 deletions

View File

@@ -81,17 +81,13 @@ class TestTensorParallel(TestCasePlus):
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, tp_plan="auto")
torch.distributed.barrier()
# The expected full model memory footprint
expected_model_memory = 16
# The expected model memory footprint. We add 1 as not all the modules are split (e.g. the embeddings)
expected_model_memory_per_device = (16 / world_size) + 1
overhead_factor = 1.2
# Assert we did not use more than the full model expected memory (with some overhead)
if not torch.cuda.max_memory_allocated(device) / 1024**3 < expected_model_memory * overhead_factor:
raise ValueError("Loading the model used more than the full model size")
# Assert we correctly handled the sharding between devices
if not torch.cuda.memory_allocated(device) / 1024**3 < (expected_model_memory / world_size) * overhead_factor:
raise ValueError("Each model shard is larger than what is expected.")
# Check that we do not use more than the expected sharded size during initialization
if torch.cuda.max_memory_allocated(device) / 1024**3 > expected_model_memory_per_device * overhead_factor:
raise ValueError("Loading the model used more than the expected fraction of model size per device")
torch.distributed.barrier()
torch.distributed.destroy_process_group()