From bff964c429a5bfc8ca85789f20f37d6bfb60b294 Mon Sep 17 00:00:00 2001
From: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Date: Thu, 3 Jul 2025 11:07:11 +0200
Subject: [PATCH] Decouple device_map='auto' and tp_plan='auto' (#38942)

* dissociate

* better place

* fix
---
 src/transformers/modeling_utils.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index e6b7031ab3..f7f34c75ea 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -4431,10 +4431,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
                 "`tp_plan` and `device_map` are mutually exclusive. Choose either one for parallelization."
             )
 
-        # If torchrun was used, make sure to TP by default. This way people don't need to change tp or device map
-        if device_map == "auto" and tp_plan is None and int(os.environ.get("WORLD_SIZE", 0)):
-            tp_plan = "auto"  # device_map = "auto" in torchrun equivalent to TP plan = AUTO!
-            device_map = None
+        if device_map == "auto" and int(os.environ.get("WORLD_SIZE", 0)):
+            logger.info(
+                "You've set device_map=`auto` while triggering a distributed run with torchrun. This might lead to unexpected behavior. "
+                "If your plan is to load the model on each device, you should set device_map={"
+                ": PartialState().process_index} where PartialState comes from accelerate library"
+            )
 
         # We need to correctly dispatch the model on the current process device. The easiest way for this is to use a simple
         # `device_map` pointing to the correct device