From 886f690e76cdf647bb38851abca7b59add27dd95 Mon Sep 17 00:00:00 2001 From: HMJ0628 <2383422508@qq.com> Date: Tue, 17 Dec 2024 01:22:35 +0800 Subject: [PATCH] Translating "translate perf_infer_gpu_multi.md" to Chinese (#35271) add "translate perf_infer_gpu_multi" --- docs/source/zh/_toctree.yml | 2 + docs/source/zh/perf_infer_gpu_multi.md | 68 ++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 docs/source/zh/perf_infer_gpu_multi.md diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml index 572f4b8572..2cce86b659 100644 --- a/docs/source/zh/_toctree.yml +++ b/docs/source/zh/_toctree.yml @@ -69,6 +69,8 @@ title: 完全分片数据并行 - local: perf_train_special title: 在 Apple silicon 芯片上进行 PyTorch 训练 + - local: perf_infer_gpu_multi + title: 多GPU推理 - local: perf_train_cpu title: 在CPU上进行高效训练 - local: perf_hardware diff --git a/docs/source/zh/perf_infer_gpu_multi.md b/docs/source/zh/perf_infer_gpu_multi.md new file mode 100644 index 0000000000..ee523bc604 --- /dev/null +++ b/docs/source/zh/perf_infer_gpu_multi.md @@ -0,0 +1,68 @@ + + +# 多GPU推理 + +某些模型现已支持内置的**张量并行**(Tensor Parallelism, TP),并通过 PyTorch 实现。张量并行技术将模型切分到多个 GPU 上,从而支持更大的模型尺寸,并对诸如矩阵乘法等计算任务进行并行化。 + +要启用张量并行,只需在调用 [`~AutoModelForCausalLM.from_pretrained`] 时传递参数 `tp_plan="auto"`: + +```python +import os +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" + +# 初始化分布式环境 +rank = int(os.environ["RANK"]) +device = torch.device(f"cuda:{rank}") +torch.distributed.init_process_group("nccl", device_id=device) + +# 获取支持张量并行的模型 +model = AutoModelForCausalLM.from_pretrained( + model_id, + tp_plan="auto", +) + +# 准备输入tokens +tokenizer = AutoTokenizer.from_pretrained(model_id) +prompt = "Can I help" +inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device) + +# 分布式运行 +outputs = model(inputs) +``` + +您可以使用 `torchrun` 命令启动上述脚本,多进程模式会自动将每个进程映射到一张 GPU: + +``` +torchrun --nproc-per-node 4 demo.py +``` + 
+目前,PyTorch 张量并行支持以下模型: +* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) + +如果您希望对其他模型添加张量并行支持,可以通过提交 GitHub Issue 或 Pull Request 来提出请求。 + +### 预期性能提升 + +对于推理场景(尤其是处理大批量或长序列的输入),张量并行可以显著提升计算速度。 + +以下是 [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) 模型在序列长度为 512、不同批量大小的情况下,单次前向推理的预期加速效果: + +
+