diff --git a/docs/source/en/perf_train_cpu_many.md b/docs/source/en/perf_train_cpu_many.md
index 53f7f7f929..c93d3eafe7 100644
--- a/docs/source/en/perf_train_cpu_many.md
+++ b/docs/source/en/perf_train_cpu_many.md
@@ -155,13 +155,20 @@ This example assumes that you have:
 The snippet below is an example of a Dockerfile that uses a base image that supports distributed CPU training and then
 extracts a Transformers release to the `/workspace` directory, so that the example scripts are included in the image:
 ```dockerfile
-FROM intel/ai-workflows:torch-2.0.1-huggingface-multinode-py3.9
+FROM intel/intel-optimized-pytorch:2.3.0-pip-multinode
+
+RUN apt-get update -y && \
+    apt-get install -y --no-install-recommends --fix-missing \
+    google-perftools \
+    libomp-dev
 
 WORKDIR /workspace
 
 # Download and extract the transformers code
-ARG HF_TRANSFORMERS_VER="4.35.2"
-RUN mkdir transformers && \
+ARG HF_TRANSFORMERS_VER="4.44.0"
+RUN pip install --no-cache-dir \
+    transformers==${HF_TRANSFORMERS_VER} && \
+    mkdir transformers && \
     curl -sSL --retry 5 https://github.com/huggingface/transformers/archive/refs/tags/v${HF_TRANSFORMERS_VER}.tar.gz | tar -C transformers --strip-components=1 -xzf -
 ```
 The image needs to be built and copied to the cluster's nodes or pushed to a container registry prior to deploying the
@@ -189,7 +196,6 @@ apiVersion: "kubeflow.org/v1"
 kind: PyTorchJob
 metadata:
   name: transformers-pytorchjob
-  namespace: kubeflow
 spec:
   elasticPolicy:
     rdzvBackend: c10d
@@ -206,32 +212,27 @@ spec:
             - name: pytorch
               image: <image name>:<tag>  # Specify the docker image to use for the worker pods
               imagePullPolicy: IfNotPresent
-              command:
-                - torchrun
-                - /workspace/transformers/examples/pytorch/question-answering/run_qa.py
-                - --model_name_or_path
-                - "google-bert/bert-large-uncased"
-                - --dataset_name
-                - "squad"
-                - --do_train
-                - --do_eval
-                - --per_device_train_batch_size
-                - "12"
-                - --learning_rate
-                - "3e-5"
-                - --num_train_epochs
-                - "2"
-                - --max_seq_length
-                - "384"
-                - --doc_stride
-                - "128"
-                - --output_dir
-                - "/tmp/pvc-mount/output"
-                - --no_cuda
-                - --ddp_backend
-                - "ccl"
-                - --use_ipex
-                - --bf16  # Specify --bf16 if your hardware supports bfloat16
+              command: ["/bin/bash", "-c"]
+              args:
+                - >-
+                  cd /workspace/transformers;
+                  pip install -r /workspace/transformers/examples/pytorch/question-answering/requirements.txt;
+                  source /usr/local/lib/python3.10/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh;
+                  torchrun /workspace/transformers/examples/pytorch/question-answering/run_qa.py \
+                    --model_name_or_path distilbert/distilbert-base-uncased \
+                    --dataset_name squad \
+                    --do_train \
+                    --do_eval \
+                    --per_device_train_batch_size 12 \
+                    --learning_rate 3e-5 \
+                    --num_train_epochs 2 \
+                    --max_seq_length 384 \
+                    --doc_stride 128 \
+                    --output_dir /tmp/pvc-mount/output_$(date +%Y%m%d_%H%M%S) \
+                    --no_cuda \
+                    --ddp_backend ccl \
+                    --bf16 \
+                    --use_ipex;
               env:
               - name: LD_PRELOAD
                 value: "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4.5.9:/usr/local/lib/libiomp5.so"
@@ -244,13 +245,13 @@ spec:
               - name: CCL_WORKER_COUNT
                 value: "1"
               - name: OMP_NUM_THREADS  # Can be tuned for optimal performance
--                value: "56"
+                value: "240"
               resources:
                 limits:
-                  cpu: 200  # Update the CPU and memory limit values based on your nodes
+                  cpu: 240  # Update the CPU and memory limit values based on your nodes
                   memory: 128Gi
                 requests:
-                  cpu: 200  # Update the CPU and memory request values based on your nodes
+                  cpu: 240  # Update the CPU and memory request values based on your nodes
                   memory: 128Gi
               volumeMounts:
               - name: pvc-volume
@@ -258,8 +259,8 @@ spec:
               - mountPath: /dev/shm
                 name: dshm
           restartPolicy: Never
-          nodeSelector:  #  Optionally use the node selector to specify what types of nodes to use for the workers
-            node-type: spr
+          nodeSelector:  # Optionally use nodeSelector to match a certain node label for the worker pods
+            node-type: gnr
           volumes:
           - name: pvc-volume
             persistentVolumeClaim:
@@ -287,10 +288,12 @@ set the same CPU and memory amounts for both the resource limits and requests.
 After the PyTorchJob spec has been updated with values appropriate for your cluster and training job, it can be deployed
 to the cluster using:
 ```bash
-kubectl create -f pytorchjob.yaml
+export NAMESPACE=<specify your namespace>
+
+kubectl create -f pytorchjob.yaml -n ${NAMESPACE}
 ```
 
-The `kubectl get pods -n kubeflow` command can then be used to list the pods in the `kubeflow` namespace. You should see
+The `kubectl get pods -n ${NAMESPACE}` command can then be used to list the pods in your namespace. You should see
 the worker pods for the PyTorchJob that was just deployed. At first, they will probably have a status of "Pending" as
 the containers get pulled and created, then the status should change to "Running".
 ```
@@ -303,13 +306,13 @@ transformers-pytorchjob-worker-3                         1/1     Running
 ...
 ```
 
-The logs for worker can be viewed using `kubectl logs -n kubeflow <pod name>`. Add `-f` to stream the logs, for example:
+The logs for worker can be viewed using `kubectl logs <pod name> -n ${NAMESPACE}`. Add `-f` to stream the logs, for example:
 ```bash
-kubectl logs -n kubeflow transformers-pytorchjob-worker-0 -f
+kubectl logs transformers-pytorchjob-worker-0 -n ${NAMESPACE} -f
 ```
 
 After the training job completes, the trained model can be copied from the PVC or storage location. When you are done
-with the job, the PyTorchJob resource can be deleted from the cluster using `kubectl delete -f pytorchjob.yaml`.
+with the job, the PyTorchJob resource can be deleted from the cluster using `kubectl delete -f pytorchjob.yaml -n ${NAMESPACE}`.
 
 ## Summary