From 6b550462139655d488d4c663086a63e98713c6b9 Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Sat, 8 Feb 2025 04:42:22 +0800 Subject: [PATCH] [docs] fix not-working example code in `perf_infer_gpu_one.md` (#36087) * bug fix * update memory limit --- docs/source/en/perf_infer_gpu_one.md | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 5ffdefec9e..6ffd3213c2 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -357,7 +357,7 @@ tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16).to("cuda") input_text = "Hello my dog is cute and" -inputs = tokenizer(input_text, return_tensors="pt").to("cuda") +inputs = tokenizer(input_text, return_tensors="pt").to(model.device) + with sdpa_kernel(SDPBackend.FLASH_ATTENTION): outputs = model.generate(**inputs) @@ -431,14 +431,14 @@ To load a model in 4-bit for inference, use the `load_in_4bit` parameter. The `d ```py from transformers import AutoModelForCausalLM -model_name = "bigscience/bloom-2b5" +model_name = "bigscience/bloom-1b7" model_4bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True) ``` -To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 600MB of memory to the first GPU and 1GB of memory to the second GPU: +To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 2GB of memory to the first GPU and 5GB of memory to the second GPU: ```py -max_memory_mapping = {0: "600MB", 1: "1GB"} +max_memory_mapping = {0: "2GB", 1: "5GB"} model_name = "bigscience/bloom-3b" model_4bit = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping @@ -458,7 +458,7 @@ To load a model in 8-bit for inference, use the `load_in_8bit` parameter. The `d ```py from transformers import AutoModelForCausalLM, BitsAndBytesConfig -model_name = "bigscience/bloom-2b5" +model_name = "bigscience/bloom-1b7" model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True)) ``` @@ -467,20 +467,20 @@ If you're loading a model in 8-bit for text generation, you should use the [`~tr ```py from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig -model_name = "bigscience/bloom-2b5" +model_name = "bigscience/bloom-1b7" tokenizer = AutoTokenizer.from_pretrained(model_name) model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True)) prompt = "Hello, my llama is cute" -inputs = tokenizer(prompt, return_tensors="pt").to("cuda") -generated_ids = model.generate(**inputs) +inputs = tokenizer(prompt, return_tensors="pt").to(model_8bit.device) +generated_ids = model_8bit.generate(**inputs) outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) ``` -To load a model in 8-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 1GB of memory to the first GPU and 2GB of memory to the second GPU: +To load a model in 8-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 2GB of memory to the first GPU and 5GB of memory to the second GPU: ```py -max_memory_mapping = {0: "1GB", 1: "2GB"} +max_memory_mapping = {0: "2GB", 1: "5GB"} model_name = "bigscience/bloom-3b" model_8bit = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping @@ -545,11 +545,8 @@ quantization_config = BitsAndBytesConfig( tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype="auto", quantization_config=quantization_config) -# enable BetterTransformer -model = model.to_bettertransformer() - input_text = "Hello my dog is cute and" -inputs = tokenizer(input_text, return_tensors="pt").to("cuda") +inputs = tokenizer(input_text, return_tensors="pt").to(model.device) # enable FlashAttention with sdpa_kernel(SDPBackend.FLASH_ATTENTION):