Remove all traces of low_cpu_mem_usage (#38792)

* remove it from all py files
* remove it from the doc
* remove it from examples
* style
* remove traces of _fast_init
* Update test_peft_integration.py
* CIs
2025-06-12 16:39:33 +02:00
parent 3542e0b844
commit 4b8ec667e9
76 changed files with 100 additions and 598 deletions
--- a/docs/source/ko/llm_tutorial_optimization.md
+++ b/docs/source/ko/llm_tutorial_optimization.md
@@ -227,7 +227,7 @@ flush()
 이제 4비트 양자화가 제공하는 최대 GPU 메모리 사용량을 확인해 봅시다. 4비트로 모델을 양자화하려면 이전과 동일한 API를 사용하되 이번에는 `load_in_8bit=True` 대신 `load_in_4bit=True`를 전달하면 됩니다.

 ```python
-model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, low_cpu_mem_usage=True, pad_token_id=0)
+model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, pad_token_id=0)

 pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

--- a/docs/source/ko/model_doc/chameleon.md
+++ b/docs/source/ko/model_doc/chameleon.md
@@ -148,7 +148,6 @@ model_id = "facebook/chameleon-7b"
 model = ChameleonForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
    attn_implementation="flash_attention_2"
 ).to(0)
 ```
--- a/docs/source/ko/trainer.md
+++ b/docs/source/ko/trainer.md
@@ -421,7 +421,7 @@ args = TrainingArguments(
 model_id = "google/gemma-2b"

 tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
+model = AutoModelForCausalLM.from_pretrained(model_id).to(0)

 trainer = trl.SFTTrainer(
    model=model,