diff --git a/docs/source/en/perf_train_cpu_many.md b/docs/source/en/perf_train_cpu_many.md index 53f7f7f929..c93d3eafe7 100644 --- a/docs/source/en/perf_train_cpu_many.md +++ b/docs/source/en/perf_train_cpu_many.md @@ -155,13 +155,20 @@ This example assumes that you have: The snippet below is an example of a Dockerfile that uses a base image that supports distributed CPU training and then extracts a Transformers release to the `/workspace` directory, so that the example scripts are included in the image: ```dockerfile -FROM intel/ai-workflows:torch-2.0.1-huggingface-multinode-py3.9 +FROM intel/intel-optimized-pytorch:2.3.0-pip-multinode + +RUN apt-get update -y && \ + apt-get install -y --no-install-recommends --fix-missing \ + google-perftools \ + libomp-dev WORKDIR /workspace # Download and extract the transformers code -ARG HF_TRANSFORMERS_VER="4.35.2" -RUN mkdir transformers && \ +ARG HF_TRANSFORMERS_VER="4.44.0" +RUN pip install --no-cache-dir \ + transformers==${HF_TRANSFORMERS_VER} && \ + mkdir transformers && \ curl -sSL --retry 5 https://github.com/huggingface/transformers/archive/refs/tags/v${HF_TRANSFORMERS_VER}.tar.gz | tar -C transformers --strip-components=1 -xzf - ``` The image needs to be built and copied to the cluster's nodes or pushed to a container registry prior to deploying the @@ -189,7 +196,6 @@ apiVersion: "kubeflow.org/v1" kind: PyTorchJob metadata: name: transformers-pytorchjob - namespace: kubeflow spec: elasticPolicy: rdzvBackend: c10d @@ -206,32 +212,27 @@ spec: - name: pytorch image: : # Specify the docker image to use for the worker pods imagePullPolicy: IfNotPresent - command: - - torchrun - - /workspace/transformers/examples/pytorch/question-answering/run_qa.py - - --model_name_or_path - - "google-bert/bert-large-uncased" - - --dataset_name - - "squad" - - --do_train - - --do_eval - - --per_device_train_batch_size - - "12" - - --learning_rate - - "3e-5" - - --num_train_epochs - - "2" - - --max_seq_length - - "384" - - --doc_stride 
- - "128" - - --output_dir - - "/tmp/pvc-mount/output" - - --no_cuda - - --ddp_backend - - "ccl" - - --use_ipex - - --bf16 # Specify --bf16 if your hardware supports bfloat16 + command: ["/bin/bash", "-c"] + args: + - >- + cd /workspace/transformers; + pip install -r /workspace/transformers/examples/pytorch/question-answering/requirements.txt; + source /usr/local/lib/python3.10/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh; + torchrun /workspace/transformers/examples/pytorch/question-answering/run_qa.py \ + --model_name_or_path distilbert/distilbert-base-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/pvc-mount/output_$(date +%Y%m%d_%H%M%S) \ + --no_cuda \ + --ddp_backend ccl \ + --bf16 \ + --use_ipex; env: - name: LD_PRELOAD value: "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4.5.9:/usr/local/lib/libiomp5.so" @@ -244,13 +245,13 @@ spec: - name: CCL_WORKER_COUNT value: "1" - name: OMP_NUM_THREADS # Can be tuned for optimal performance -- value: "56" + value: "240" resources: limits: - cpu: 200 # Update the CPU and memory limit values based on your nodes + cpu: 240 # Update the CPU and memory limit values based on your nodes memory: 128Gi requests: - cpu: 200 # Update the CPU and memory request values based on your nodes + cpu: 240 # Update the CPU and memory request values based on your nodes memory: 128Gi volumeMounts: - name: pvc-volume @@ -258,8 +259,8 @@ spec: - mountPath: /dev/shm name: dshm restartPolicy: Never - nodeSelector: # Optionally use the node selector to specify what types of nodes to use for the workers - node-type: spr + nodeSelector: # Optionally use nodeSelector to match a certain node label for the worker pods + node-type: gnr volumes: - name: pvc-volume persistentVolumeClaim: @@ -287,10 +288,12 @@ set the same CPU and memory amounts for both the resource limits and 
requests. After the PyTorchJob spec has been updated with values appropriate for your cluster and training job, it can be deployed to the cluster using: ```bash -kubectl create -f pytorchjob.yaml +export NAMESPACE=<specify your namespace> + +kubectl create -f pytorchjob.yaml -n ${NAMESPACE} ``` -The `kubectl get pods -n kubeflow` command can then be used to list the pods in the `kubeflow` namespace. You should see +The `kubectl get pods -n ${NAMESPACE}` command can then be used to list the pods in your namespace. You should see the worker pods for the PyTorchJob that was just deployed. At first, they will probably have a status of "Pending" as the containers get pulled and created, then the status should change to "Running". ``` @@ -303,13 +306,13 @@ transformers-pytorchjob-worker-3 1/1 Running ... ``` -The logs for worker can be viewed using `kubectl logs -n kubeflow <pod name>`. Add `-f` to stream the logs, for example: +The logs for a worker can be viewed using `kubectl logs <pod name> -n ${NAMESPACE}`. Add `-f` to stream the logs, for example: ```bash -kubectl logs -n kubeflow transformers-pytorchjob-worker-0 -f +kubectl logs transformers-pytorchjob-worker-0 -n ${NAMESPACE} -f ``` After the training job completes, the trained model can be copied from the PVC or storage location. When you are done -with the job, the PyTorchJob resource can be deleted from the cluster using `kubectl delete -f pytorchjob.yaml`. +with the job, the PyTorchJob resource can be deleted from the cluster using `kubectl delete -f pytorchjob.yaml -n ${NAMESPACE}`. ## Summary