[GitHub Runner] Fix flax runner (#13357)
* correct
* also comment out multi-gpu test push
This commit is contained in:
committed by
GitHub
parent
c76de1053e
commit
642e1936e3
110
.github/workflows/self-push.yml
vendored
110
.github/workflows/self-push.yml
vendored
@@ -106,9 +106,9 @@ jobs:
|
|||||||
python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
|
python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
|
||||||
python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
|
python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
|
||||||
|
|
||||||
# - name: Fetch the tests to run
|
- name: Fetch the tests to run
|
||||||
# run: |
|
run: |
|
||||||
# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
|
python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
|
||||||
|
|
||||||
- name: Report fetched tests
|
- name: Report fetched tests
|
||||||
uses: actions/upload-artifact@v2
|
uses: actions/upload-artifact@v2
|
||||||
@@ -118,10 +118,9 @@ jobs:
|
|||||||
|
|
||||||
- name: Run all non-slow tests on GPU
|
- name: Run all non-slow tests on GPU
|
||||||
run: |
|
run: |
|
||||||
python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu
|
if [ -f test_list.txt ]; then
|
||||||
# if [ -f test_list.txt ]; then
|
python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu $(cat test_list.txt)
|
||||||
# python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu $(cat test_list.txt)
|
fi
|
||||||
# fi
|
|
||||||
|
|
||||||
- name: Failure short reports
|
- name: Failure short reports
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
@@ -251,61 +250,60 @@ jobs:
|
|||||||
name: run_all_tests_torch_multi_gpu_test_reports
|
name: run_all_tests_torch_multi_gpu_test_reports
|
||||||
path: reports
|
path: reports
|
||||||
|
|
||||||
run_tests_flax_multi_gpu:
|
# run_tests_flax_multi_gpu:
|
||||||
runs-on: [self-hosted, docker-gpu, multi-gpu]
|
# runs-on: [self-hosted, docker-gpu, multi-gpu]
|
||||||
container:
|
# container:
|
||||||
image: tensorflow/tensorflow:2.4.1-gpu
|
# image: tensorflow/tensorflow:2.4.1-gpu
|
||||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
# options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
steps:
|
# steps:
|
||||||
- name: Install dependencies
|
# - name: Install dependencies
|
||||||
run: |
|
# run: |
|
||||||
apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
|
# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
|
||||||
pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
|
# pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
|
||||||
pip install --upgrade pip
|
# pip install --upgrade pip
|
||||||
pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
|
# pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
|
||||||
|
#
|
||||||
- name: Launcher docker
|
# - name: Launcher docker
|
||||||
uses: actions/checkout@v2
|
# uses: actions/checkout@v2
|
||||||
with:
|
# with:
|
||||||
fetch-depth: 2
|
# fetch-depth: 2
|
||||||
|
#
|
||||||
- name: NVIDIA-SMI
|
# - name: NVIDIA-SMI
|
||||||
continue-on-error: true
|
# continue-on-error: true
|
||||||
run: |
|
# run: |
|
||||||
nvidia-smi
|
# nvidia-smi
|
||||||
|
#
|
||||||
- name: Are GPUs recognized by our DL frameworks
|
# - name: Are GPUs recognized by our DL frameworks
|
||||||
run: |
|
# run: |
|
||||||
python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
|
# python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
|
||||||
python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
|
# python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
|
||||||
|
#
|
||||||
# - name: Fetch the tests to run
|
# - name: Fetch the tests to run
|
||||||
# run: |
|
# run: |
|
||||||
# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
|
# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
|
||||||
|
#
|
||||||
- name: Report fetched tests
|
# - name: Report fetched tests
|
||||||
uses: actions/upload-artifact@v2
|
# uses: actions/upload-artifact@v2
|
||||||
with:
|
# with:
|
||||||
name: test_fetched
|
# name: test_fetched
|
||||||
path: test_preparation.txt
|
# path: test_preparation.txt
|
||||||
|
#
|
||||||
- name: Run all non-slow tests on GPU
|
# - name: Run all non-slow tests on GPU
|
||||||
run: |
|
# run: |
|
||||||
python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_multi_gpu
|
|
||||||
# if [ -f test_list.txt ]; then
|
# if [ -f test_list.txt ]; then
|
||||||
# python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_multi_gpu $(cat test_list.txt)
|
# python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_multi_gpu $(cat test_list.txt)
|
||||||
# fi
|
# fi
|
||||||
|
#
|
||||||
- name: Failure short reports
|
# - name: Failure short reports
|
||||||
if: ${{ failure() }}
|
# if: ${{ failure() }}
|
||||||
run: cat reports/tests_flax_multi_gpu_failures_short.txt
|
# run: cat reports/tests_flax_multi_gpu_failures_short.txt
|
||||||
|
#
|
||||||
- name: Test suite reports artifacts
|
# - name: Test suite reports artifacts
|
||||||
if: ${{ always() }}
|
# if: ${{ always() }}
|
||||||
uses: actions/upload-artifact@v2
|
# uses: actions/upload-artifact@v2
|
||||||
with:
|
# with:
|
||||||
name: run_all_tests_flax_multi_gpu_test_reports
|
# name: run_all_tests_flax_multi_gpu_test_reports
|
||||||
path: reports
|
# path: reports
|
||||||
|
|
||||||
# run_tests_tf_multi_gpu:
|
# run_tests_tf_multi_gpu:
|
||||||
# runs-on: [self-hosted, docker-gpu, multi-gpu]
|
# runs-on: [self-hosted, docker-gpu, multi-gpu]
|
||||||
|
|||||||
79
.github/workflows/self-scheduled.yml
vendored
79
.github/workflows/self-scheduled.yml
vendored
@@ -86,7 +86,7 @@ jobs:
|
|||||||
path: reports
|
path: reports
|
||||||
|
|
||||||
run_all_tests_flax_gpu:
|
run_all_tests_flax_gpu:
|
||||||
runs-on: [self-hosted, docker-gpu, single-gpu]
|
runs-on: [self-hosted, docker-gpu-test, single-gpu]
|
||||||
container:
|
container:
|
||||||
image: tensorflow/tensorflow:2.4.1-gpu
|
image: tensorflow/tensorflow:2.4.1-gpu
|
||||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
@@ -291,45 +291,44 @@ jobs:
|
|||||||
name: run_all_tests_tf_multi_gpu_test_reports
|
name: run_all_tests_tf_multi_gpu_test_reports
|
||||||
path: reports
|
path: reports
|
||||||
|
|
||||||
run_all_tests_flax_multi_gpu:
|
# run_all_tests_flax_multi_gpu:
|
||||||
runs-on: [self-hosted, docker-gpu, multi-gpu]
|
# runs-on: [self-hosted, docker-gpu, multi-gpu]
|
||||||
container:
|
# container:
|
||||||
image: tensorflow/tensorflow:2.4.1-gpu
|
# image: tensorflow/tensorflow:2.4.1-gpu
|
||||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
# options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
steps:
|
# steps:
|
||||||
- name: Launcher docker
|
# - name: Launcher docker
|
||||||
uses: actions/checkout@v2
|
# uses: actions/checkout@v2
|
||||||
|
#
|
||||||
- name: NVIDIA-SMI
|
# - name: NVIDIA-SMI
|
||||||
continue-on-error: true
|
# run: |
|
||||||
run: |
|
# nvidia-smi
|
||||||
nvidia-smi
|
#
|
||||||
|
# - name: Install dependencies
|
||||||
- name: Install dependencies
|
# run: |
|
||||||
run: |
|
# pip install --upgrade pip
|
||||||
pip install --upgrade pip
|
# pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
|
||||||
pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
|
# pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
|
||||||
pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
|
#
|
||||||
|
# - name: Are GPUs recognized by our DL frameworks
|
||||||
- name: Are GPUs recognized by our DL frameworks
|
# run: |
|
||||||
run: |
|
# python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
|
||||||
python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
|
# python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
|
||||||
python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
|
#
|
||||||
|
# - name: Run all tests on GPU
|
||||||
- name: Run all tests on GPU
|
# run: |
|
||||||
run: |
|
# python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
|
||||||
python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
|
#
|
||||||
|
# - name: Failure short reports
|
||||||
- name: Failure short reports
|
# if: ${{ always() }}
|
||||||
if: ${{ always() }}
|
# run: cat reports/tests_flax_gpu_failures_short.txt
|
||||||
run: cat reports/tests_flax_gpu_failures_short.txt
|
#
|
||||||
|
# - name: Test suite reports artifacts
|
||||||
- name: Test suite reports artifacts
|
# if: ${{ always() }}
|
||||||
if: ${{ always() }}
|
# uses: actions/upload-artifact@v2
|
||||||
uses: actions/upload-artifact@v2
|
# with:
|
||||||
with:
|
# name: run_all_tests_flax_gpu_test_reports
|
||||||
name: run_all_tests_flax_gpu_test_reports
|
# path: reports
|
||||||
path: reports
|
|
||||||
|
|
||||||
run_all_tests_torch_cuda_extensions_gpu:
|
run_all_tests_torch_cuda_extensions_gpu:
|
||||||
runs-on: [self-hosted, docker-gpu, single-gpu]
|
runs-on: [self-hosted, docker-gpu, single-gpu]
|
||||||
|
|||||||
Reference in New Issue
Block a user