diff --git a/.circleci/config.yml b/.circleci/config.yml
index 5fec084523..dd777c8c3a 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,4 +1,72 @@
-version: 2
+version: 2.1
+orbs:
+    gcp-gke: circleci/gcp-gke@1.0.4
+    go: circleci/go@1.3.0
+
+# TPU REFERENCES
+references:
+    # Clone ml-testing-accelerators and check out a fixed commit as branch "stable".
+    checkout_ml_testing: &checkout_ml_testing
+        run:
+            name: Checkout ml-testing-accelerators
+            command: |
+                git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
+                cd ml-testing-accelerators
+                git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
+                git checkout stable
+    # Build the TPU test image and push it to GCR; PR builds pass the PR head as GITHUB_REF.
+    build_push_docker: &build_push_docker
+        run:
+            name: Configure Docker
+            command: |
+                gcloud --quiet auth configure-docker
+                cd docker/transformers-pytorch-tpu
+                if [ -z "$CIRCLE_PR_NUMBER" ]; then docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" . ; else docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" --build-arg "GITHUB_REF=pull/$CIRCLE_PR_NUMBER/head" . ; fi
+                docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID"
+    # Create the Kubernetes job, poll it until it finishes, print its logs, delete the image.
+    deploy_cluster: &deploy_cluster
+        run:
+            name: Deploy the job on the kubernetes cluster
+            command: |
+                go get github.com/google/go-jsonnet/cmd/jsonnet && \
+                export PATH=$PATH:$HOME/go/bin && \
+                kubectl create -f docker/transformers-pytorch-tpu/dataset.yaml || true && \
+                job_name=$(jsonnet -J ml-testing-accelerators/ docker/transformers-pytorch-tpu/bert-base-cased.jsonnet --ext-str image=$GCR_IMAGE_PATH --ext-str image-tag=$CIRCLE_WORKFLOW_JOB_ID | kubectl create -f -) && \
+                job_name=${job_name#job.batch/} && \
+                job_name=${job_name% created} && \
+                echo "Waiting on kubernetes job: $job_name" && \
+                i=0 && \
+                # 30 checks spaced 30s apart = 900s total.
+                max_checks=30 && \
+                status_code=2 && \
+                # Check on the job periodically. Set the status code depending on what
+                # happened to the job in Kubernetes. If we try max_checks times and
+                # still the job hasn't finished, give up and return the starting
+                # non-zero status code.
+                while [ $i -lt $max_checks ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \
+                echo "Done waiting. Job status code: $status_code" && \
+                # Allow time for logs to flush.
+                sleep 60 && \
+                echo "JOB_NAME: $job_name" && \
+                gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID && \
+                echo "Done with log retrieval attempt." && \
+                gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags && \
+                exit $status_code
+    # Delete Kubernetes jobs whose AGE column is reported in hours or days.
+    delete_gke_jobs: &delete_gke_jobs
+        run:
+            name: Delete GKE Jobs
+            command: |
+                # Match jobs whose age matches patterns like '1h' or '1d', i.e. any job
+                # that has been around longer than 1hr. First print all columns for
+                # matches, then execute the delete.
+                # xargs -r skips the delete when nothing matched, so the step does not fail.
+                kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $0}'
+                kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $1}' | xargs -r kubectl delete job
+
+
+
+
 jobs:
     run_tests_torch_and_tf:
         working_directory: ~/transformers
@@ -50,7 +118,6 @@ jobs:
             - store_artifacts:
                 path: ~/transformers/output.txt
                 destination: test_output.txt
-
     run_tests_tf:
         working_directory: ~/transformers
         docker:
@@ -193,6 +260,35 @@ jobs:
             - checkout
             - run: pip install requests
             - run: python ./utils/link_tester.py
+
+# TPU JOBS
+    run_examples_tpu:
+        docker:
+            - image: circleci/python:3.6
+        environment:
+            OMP_NUM_THREADS: 1
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - go/install
+            - *checkout_ml_testing
+            - gcp-gke/install
+            - gcp-gke/update-kubeconfig-with-credentials:
+                cluster: $GKE_CLUSTER
+                perform-login: true
+            - setup_remote_docker
+            - *build_push_docker
+            - *deploy_cluster
+    cleanup-gke-jobs:
+        docker:
+            - image: circleci/python:3.6
+        steps:
+            - gcp-gke/install
+            - gcp-gke/update-kubeconfig-with-credentials:
+                cluster: $GKE_CLUSTER
+                perform-login: true
+            - *delete_gke_jobs
 workflow_filters: &workflow_filters
     filters:
         branches:
@@ -211,3 +307,15 @@ workflows:
             - run_tests_tf
             - build_doc
             - deploy_doc: *workflow_filters
+    tpu_testing_jobs:
+        triggers:
+            - schedule:
+                # Runs once a day at 08:00 UTC (minute 0, hour 8).
+                cron: "0 8 * * *"
+                filters:
+                    branches:
+                        only:
+                            - master
+        jobs:
+            - cleanup-gke-jobs
+            - run_examples_tpu
diff --git a/docker/transformers-pytorch-tpu/Dockerfile b/docker/transformers-pytorch-tpu/Dockerfile
new file mode 100644
index 0000000000..97702affce
--- /dev/null
+++ b/docker/transformers-pytorch-tpu/Dockerfile
@@ -0,0 +1,65 @@
+FROM google/cloud-sdk:slim
+
+# Build args.
+ARG GITHUB_REF=refs/heads/master
+
+# TODO: This Dockerfile installs pytorch/xla 3.6 wheels. There are also 3.7
+# wheels available; see below.
+ENV PYTHON_VERSION=3.6
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    cmake \
+    git \
+    curl \
+    ca-certificates
+
+# Install conda and python.
+# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
+RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \
+    chmod +x ~/miniconda.sh && \
+    ~/miniconda.sh -b && \
+    rm ~/miniconda.sh
+
+ENV PATH=/root/miniconda3/bin:$PATH
+
+RUN conda create -y --name container python=$PYTHON_VERSION
+
+# Run the rest of commands within the new conda env.
+# Use absolute path to appease Codefactor.
+SHELL ["/root/miniconda3/bin/conda", "run", "-n", "container", "/bin/bash", "-c"]
+RUN conda install -y python=$PYTHON_VERSION mkl
+
+RUN pip uninstall -y torch && \
+    # Python 3.7 wheels are available. Replace cp36-cp36m with cp37-cp37m
+    gsutil cp 'gs://tpu-pytorch/wheels/torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
+    gsutil cp 'gs://tpu-pytorch/wheels/torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
+    gsutil cp 'gs://tpu-pytorch/wheels/torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
+    pip install 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    pip install 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    pip install 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    rm 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    rm 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    rm 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    apt-get install -y libomp5
+
+ENV LD_LIBRARY_PATH=/root/miniconda3/envs/container/lib
+
+
+# Install huggingface/transformers at the current PR, plus dependencies.
+RUN git clone https://github.com/huggingface/transformers.git && \
+    cd transformers && \
+    git fetch origin $GITHUB_REF:CI && \
+    git checkout CI && \
+    cd .. && \
+    pip install ./transformers && \
+    pip install -r ./transformers/examples/requirements.txt && \
+    pip install pytest
+
+RUN python -c "import torch_xla; print(torch_xla.__version__)"
+RUN python -c "import transformers as trf; print(trf.__version__)"
+RUN conda init bash
+COPY docker-entrypoint.sh /usr/local/bin/
+RUN chmod +x /usr/local/bin/docker-entrypoint.sh
+ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
+CMD ["bash"]
diff --git a/docker/transformers-pytorch-tpu/bert-base-cased.jsonnet b/docker/transformers-pytorch-tpu/bert-base-cased.jsonnet
new file mode 100644
index 0000000000..ca0c86638f
--- /dev/null
+++ b/docker/transformers-pytorch-tpu/bert-base-cased.jsonnet
@@ -0,0 +1,38 @@
+local base = import 'templates/base.libsonnet';
+local tpus = import 'templates/tpus.libsonnet';
+local utils = import "templates/utils.libsonnet";
+local volumes = import "templates/volumes.libsonnet";
+
+local bertBaseCased = base.BaseTest {
+  frameworkPrefix: "hf",
+  modelName: "bert-base-cased",
+  mode: "example",
+  configMaps: [],
+
+  timeout: 3600, # 1 hour, in seconds
+
+  image: std.extVar('image'),
+  imageTag: std.extVar('image-tag'),
+
+  tpuSettings+: {
+    softwareVersion: "pytorch-nightly",
+  },
+  accelerator: tpus.v3_8,
+
+  volumeMap+: {
+    datasets: volumes.PersistentVolumeSpec {
+      name: "huggingface-cluster-disk",
+      mountPath: "/datasets",
+    },
+  },
+  command: utils.scriptCommand(
+    |||
+      python -m pytest -s transformers/examples/test_xla_examples.py -v
+      test_exit_code=$?
+      echo "\nFinished running commands.\n"
+      test $test_exit_code -eq 0
+    |||
+  ),
+};
+
+bertBaseCased.oneshotJob
diff --git a/docker/transformers-pytorch-tpu/dataset.yaml b/docker/transformers-pytorch-tpu/dataset.yaml
new file mode 100644
index 0000000000..ce022ea6c1
--- /dev/null
+++ b/docker/transformers-pytorch-tpu/dataset.yaml
@@ -0,0 +1,32 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: huggingface-cluster-disk
+spec:
+  storageClassName: ""
+  capacity:
+    storage: 500Gi
+  accessModes:
+    - ReadOnlyMany
+  claimRef:
+    namespace: default
+    name: huggingface-cluster-disk-claim
+  gcePersistentDisk:
+    pdName: huggingface-cluster-disk
+    fsType: ext4
+    readOnly: true
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: huggingface-cluster-disk-claim
+spec:
+  # Specify "" as the storageClassName so it matches the PersistentVolume's StorageClass.
+  # A nil storageClassName value uses the default StorageClass. For details, see
+  # https://kubernetes.io/docs/concepts/storage/persistent-volumes/#class-1
+  storageClassName: ""
+  accessModes:
+    - ReadOnlyMany
+  resources:
+    requests:
+      storage: 1Ki
diff --git a/docker/transformers-pytorch-tpu/docker-entrypoint.sh b/docker/transformers-pytorch-tpu/docker-entrypoint.sh
new file mode 100644
index 0000000000..fbe59566fd
--- /dev/null
+++ b/docker/transformers-pytorch-tpu/docker-entrypoint.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+source ~/.bashrc
+echo "running docker-entrypoint.sh"
+conda activate container
+echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
+echo "printed TPU info"
+export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
+exec "$@"
diff --git a/examples/test_xla_examples.py b/examples/test_xla_examples.py
index c192a87e89..8e3aad7b98 100644
--- a/examples/test_xla_examples.py
+++ b/examples/test_xla_examples.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 
-import argparse
 import logging
 import sys
 import unittest
@@ -29,13 +28,6 @@ logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger()
 
 
-def get_setup_file():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-f")
-    args = parser.parse_args()
-    return args.f
-
-
 @require_torch_tpu
 class TorchXLAExamplesTests(unittest.TestCase):
     def test_run_glue(self):
@@ -47,13 +39,13 @@ class TorchXLAExamplesTests(unittest.TestCase):
         output_directory = "run_glue_output"
 
         testargs = f"""
-            text-classification/run_glue.py
+            transformers/examples/text-classification/run_glue.py
             --num_cores=8
-            text-classification/run_glue.py
+            transformers/examples/text-classification/run_glue.py
             --do_train
             --do_eval
             --task_name=MRPC
-            --data_dir=../glue_data/MRPC
+            --data_dir=/datasets/glue_data/MRPC
             --cache_dir=./cache_dir
             --num_train_epochs=1
             --max_seq_length=128
@@ -87,5 +79,5 @@ class TorchXLAExamplesTests(unittest.TestCase):
         # Assert that the model trains
         self.assertGreaterEqual(value, 0.70)
 
-        # Assert that the script takes less than 100 seconds to make sure it doesn't hang.
-        self.assertLess(end - start, 100)
+        # Assert that the script takes less than 300 seconds to make sure it doesn't hang.
+        self.assertLess(end - start, 300)