From 715fdd6459b37336ba79f90ffdd6f9adb8ee22a5 Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 14 Jan 2025 18:33:48 +0800 Subject: [PATCH] Update torchao.md: use auto-compilation (#35490) * Update torchao.md: use auto-compilation * Update torchao.md: indicate updating transformers to the latest --------- Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- docs/source/en/quantization/torchao.md | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md index 38f7c074c9..46fb0f8cbb 100644 --- a/docs/source/en/quantization/torchao.md +++ b/docs/source/en/quantization/torchao.md @@ -16,7 +16,8 @@ rendered properly in your Markdown viewer. Before you begin, make sure the following libraries are installed with their latest version: ```bash -pip install --upgrade torch torchao +# Updating 🤗 Transformers to the latest version, as the example script below uses the new auto compilation +pip install --upgrade torch torchao transformers ``` By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. @@ -35,12 +36,8 @@ tokenizer = AutoTokenizer.from_pretrained(model_name) input_text = "What are we having for dinner?" input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") -# compile the quantized model to get speedup -import torchao -torchao.quantization.utils.recommended_inductor_config_setter() -quantized_model = torch.compile(quantized_model, mode="max-autotune") - -output = quantized_model.generate(**input_ids, max_new_tokens=10) +# auto-compile the quantized model with `cache_implementation="static"` to get speedup +output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) # benchmark the performance @@ -59,11 +56,11 @@ def benchmark_fn(f, *args, **kwargs): return f"{(t0.blocked_autorange().mean):.3f}" MAX_NEW_TOKENS = 1000 -print("int4wo-128 model:", benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS)) +print("int4wo-128 model:", benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static")) bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16) -bf16_model = torch.compile(bf16_model, mode="max-autotune") -print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS)) +output = bf16_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") # auto-compile +print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static")) ```