Fix quantization tests (#29914)
* revert back to torch 2.1.1 * run test * switch to torch 2.2.1 * update dockerfile * fix awq tests * fix test * run quanto tests * update tests * split quantization tests * fix * fix again * final fix * fix report artifact * build docker again * Revert "build docker again" This reverts commit 399a5f9d9308da071d79034f238c719de0f3532e. * debug * revert * style * new notification system * testing notification * rebuild docker * fix_prev_ci_results * typo * remove warning * fix typo * fix artifact name * debug * issue fixed * debug again * fix * fix time * test notif with failing test * typo * issues again * final fix ? * run all quantization tests again * remove name to clear space * revert modification done on workflow * fix * build docker * build only quant docker * fix quantization ci * fix * fix report * better quantization_matrix * add print * revert to the basic one
This commit is contained in:
34
.github/workflows/self-scheduled.yml
vendored
34
.github/workflows/self-scheduled.yml
vendored
@@ -33,7 +33,6 @@ env:
|
|||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
setup:
|
setup:
|
||||||
if: ${{ inputs.job == 'run_tests_gpu' }}
|
|
||||||
name: Setup
|
name: Setup
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
@@ -45,6 +44,7 @@ jobs:
|
|||||||
outputs:
|
outputs:
|
||||||
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
|
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
|
||||||
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
|
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
|
||||||
|
quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
|
||||||
steps:
|
steps:
|
||||||
- name: Update clone
|
- name: Update clone
|
||||||
working-directory: /transformers
|
working-directory: /transformers
|
||||||
@@ -63,12 +63,20 @@ jobs:
|
|||||||
run: pip freeze
|
run: pip freeze
|
||||||
|
|
||||||
- id: set-matrix
|
- id: set-matrix
|
||||||
|
if: ${{ inputs.job == 'run_tests_gpu' }}
|
||||||
name: Identify models to test
|
name: Identify models to test
|
||||||
working-directory: /transformers/tests
|
working-directory: /transformers/tests
|
||||||
run: |
|
run: |
|
||||||
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
||||||
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
|
- id: set-matrix-quantization
|
||||||
|
if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }}
|
||||||
|
name: Identify quantization method to test
|
||||||
|
working-directory: /transformers/tests
|
||||||
|
run: |
|
||||||
|
echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
- name: NVIDIA-SMI
|
- name: NVIDIA-SMI
|
||||||
run: |
|
run: |
|
||||||
nvidia-smi
|
nvidia-smi
|
||||||
@@ -303,16 +311,26 @@ jobs:
|
|||||||
|
|
||||||
run_tests_quantization_torch_gpu:
|
run_tests_quantization_torch_gpu:
|
||||||
if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }}
|
if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }}
|
||||||
name: Quantization tests
|
name: " "
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
|
folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
|
||||||
machine_type: [single-gpu, multi-gpu]
|
machine_type: [single-gpu, multi-gpu]
|
||||||
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
|
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
|
||||||
container:
|
container:
|
||||||
image: huggingface/transformers-quantization-latest-gpu
|
image: huggingface/transformers-quantization-latest-gpu
|
||||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
steps:
|
steps:
|
||||||
|
- name: Echo folder ${{ matrix.folders }}
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
echo "${{ matrix.folders }}"
|
||||||
|
matrix_folders=${{ matrix.folders }}
|
||||||
|
matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
|
||||||
|
echo "$matrix_folders"
|
||||||
|
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||||
|
|
||||||
- name: Update clone
|
- name: Update clone
|
||||||
working-directory: /transformers
|
working-directory: /transformers
|
||||||
run: git fetch && git checkout ${{ github.sha }}
|
run: git fetch && git checkout ${{ github.sha }}
|
||||||
@@ -337,19 +355,19 @@ jobs:
|
|||||||
- name: Run quantization tests on GPU
|
- name: Run quantization tests on GPU
|
||||||
working-directory: /transformers
|
working-directory: /transformers
|
||||||
run: |
|
run: |
|
||||||
python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_quantization_torch_gpu tests/quantization
|
python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
|
||||||
|
|
||||||
- name: Failure short reports
|
- name: Failure short reports
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
continue-on-error: true
|
continue-on-error: true
|
||||||
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu/failures_short.txt
|
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }}/failures_short.txt
|
||||||
|
|
||||||
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu"
|
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu_${{ env.matrix_folders }}"
|
||||||
if: ${{ always() }}
|
if: ${{ always() }}
|
||||||
uses: actions/upload-artifact@v3
|
uses: actions/upload-artifact@v3
|
||||||
with:
|
with:
|
||||||
name: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu
|
name: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu_${{ env.matrix_folders }}
|
||||||
path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu
|
path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }}
|
||||||
|
|
||||||
run_extract_warnings:
|
run_extract_warnings:
|
||||||
# Let's only do this for the job `run_tests_gpu` to simplify the (already complex) logic.
|
# Let's only do this for the job `run_tests_gpu` to simplify the (already complex) logic.
|
||||||
@@ -413,4 +431,6 @@ jobs:
|
|||||||
slack_report_channel: ${{ inputs.slack_report_channel }}
|
slack_report_channel: ${{ inputs.slack_report_channel }}
|
||||||
# This would be an empty string if `setup` is skipped.
|
# This would be an empty string if `setup` is skipped.
|
||||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||||
|
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
|
||||||
|
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
23
.github/workflows/slack-report.yml
vendored
23
.github/workflows/slack-report.yml
vendored
@@ -15,6 +15,9 @@ on:
|
|||||||
folder_slices:
|
folder_slices:
|
||||||
required: true
|
required: true
|
||||||
type: string
|
type: string
|
||||||
|
quantization_matrix:
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
@@ -32,6 +35,7 @@ jobs:
|
|||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
- uses: actions/download-artifact@v3
|
- uses: actions/download-artifact@v3
|
||||||
- name: Send message to Slack
|
- name: Send message to Slack
|
||||||
|
if: ${{ inputs.job != 'run_tests_quantization_torch_gpu' }}
|
||||||
env:
|
env:
|
||||||
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
|
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
|
||||||
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
|
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
|
||||||
@@ -54,6 +58,25 @@ jobs:
|
|||||||
pip show slack_sdk
|
pip show slack_sdk
|
||||||
python utils/notification_service.py "${{ inputs.folder_slices }}"
|
python utils/notification_service.py "${{ inputs.folder_slices }}"
|
||||||
|
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
- uses: actions/download-artifact@v3
|
||||||
|
- name: Send message to Slack for quantization workflow
|
||||||
|
if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }}
|
||||||
|
env:
|
||||||
|
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
|
||||||
|
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||||
|
SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
|
||||||
|
CI_EVENT: scheduled
|
||||||
|
CI_SHA: ${{ github.sha }}
|
||||||
|
SETUP_STATUS: ${{ inputs.setup_status }}
|
||||||
|
# We pass `needs.setup.outputs.quantization_matrix` as the argument. `notification_service_quantization.py` must then convert
|
||||||
|
# entries such as `quantization/bnb` to `quantization_bnb`, because the artifact names use `_` instead of `/`.
|
||||||
|
run: |
|
||||||
|
sudo apt-get install -y curl
|
||||||
|
pip install slack_sdk
|
||||||
|
pip show slack_sdk
|
||||||
|
python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}"
|
||||||
|
|
||||||
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
|
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
|
||||||
- name: Failure table artifacts
|
- name: Failure table artifacts
|
||||||
# Only the model testing job is concerned for this step
|
# Only the model testing job is concerned for this step
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
|
|||||||
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
|
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
|
||||||
# to be used as arguments for docker build (so far).
|
# to be used as arguments for docker build (so far).
|
||||||
|
|
||||||
ARG PYTORCH='2.2.0'
|
ARG PYTORCH='2.2.1'
|
||||||
# Example: `cu102`, `cu113`, etc.
|
# Example: `cu102`, `cu113`, etc.
|
||||||
ARG CUDA='cu118'
|
ARG CUDA='cu118'
|
||||||
|
|
||||||
@@ -30,6 +30,9 @@ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]
|
|||||||
|
|
||||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||||
|
|
||||||
|
# needed in bnb and awq
|
||||||
|
RUN python3 -m pip install --no-cache-dir einops
|
||||||
|
|
||||||
# Add bitsandbytes for mixed int8 testing
|
# Add bitsandbytes for mixed int8 testing
|
||||||
RUN python3 -m pip install --no-cache-dir bitsandbytes
|
RUN python3 -m pip install --no-cache-dir bitsandbytes
|
||||||
|
|
||||||
@@ -43,7 +46,8 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/opt
|
|||||||
RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2
|
RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2
|
||||||
|
|
||||||
# Add autoawq for quantization testing
|
# Add autoawq for quantization testing
|
||||||
RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.0/autoawq-0.2.0+cu118-cp38-cp38-linux_x86_64.whl
|
# >=v0.2.3 needed for compatibility with torch 2.2.1
|
||||||
|
RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp38-cp38-linux_x86_64.whl
|
||||||
|
|
||||||
# Add quanto for quantization testing
|
# Add quanto for quantization testing
|
||||||
RUN python3 -m pip install --no-cache-dir quanto
|
RUN python3 -m pip install --no-cache-dir quanto
|
||||||
|
|||||||
@@ -789,7 +789,7 @@ class AwqConfig(QuantizationConfigMixin):
|
|||||||
|
|
||||||
def get_loading_attributes(self):
|
def get_loading_attributes(self):
|
||||||
attibutes_dict = copy.deepcopy(self.__dict__)
|
attibutes_dict = copy.deepcopy(self.__dict__)
|
||||||
loading_attibutes = ["version", "do_fuse", "modules_to_fuse", "fuse_max_seq_len"]
|
loading_attibutes = ["version", "do_fuse", "modules_to_fuse", "fuse_max_seq_len", "exllama_config"]
|
||||||
loading_attibutes_dict = {i: j for i, j in attibutes_dict.items() if i in loading_attibutes}
|
loading_attibutes_dict = {i: j for i, j in attibutes_dict.items() if i in loading_attibutes}
|
||||||
return loading_attibutes_dict
|
return loading_attibutes_dict
|
||||||
|
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ class AwqTest(unittest.TestCase):
|
|||||||
|
|
||||||
EXPECTED_OUTPUT = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish"
|
EXPECTED_OUTPUT = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish"
|
||||||
EXPECTED_OUTPUT_BF16 = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Exercise and Sport Science with a"
|
EXPECTED_OUTPUT_BF16 = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Exercise and Sport Science with a"
|
||||||
|
EXPECTED_OUTPUT_EXLLAMA = "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very out"
|
||||||
device_map = "cuda"
|
device_map = "cuda"
|
||||||
|
|
||||||
# called only once for all test in this class
|
# called only once for all test in this class
|
||||||
@@ -200,11 +200,11 @@ class AwqTest(unittest.TestCase):
|
|||||||
|
|
||||||
quantization_config = AwqConfig(version="exllama")
|
quantization_config = AwqConfig(version="exllama")
|
||||||
quantized_model = AutoModelForCausalLM.from_pretrained(
|
quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||||
self.model_name, quantization_config=quantization_config
|
self.model_name, quantization_config=quantization_config, device_map=torch_device
|
||||||
).to(torch_device)
|
)
|
||||||
|
|
||||||
output = quantized_model.generate(**input_ids, max_new_tokens=40)
|
output = quantized_model.generate(**input_ids, max_new_tokens=40)
|
||||||
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
|
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_EXLLAMA)
|
||||||
|
|
||||||
def test_quantized_model_no_device_map(self):
|
def test_quantized_model_no_device_map(self):
|
||||||
"""
|
"""
|
||||||
@@ -239,7 +239,7 @@ class AwqTest(unittest.TestCase):
|
|||||||
|
|
||||||
quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")
|
quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")
|
||||||
|
|
||||||
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1, 2, 3})
|
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
|
||||||
|
|
||||||
output = quantized_model.generate(**input_ids, max_new_tokens=40)
|
output = quantized_model.generate(**input_ids, max_new_tokens=40)
|
||||||
|
|
||||||
@@ -272,8 +272,8 @@ class AwqFusedTest(unittest.TestCase):
|
|||||||
model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ"
|
model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ"
|
||||||
model_revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510"
|
model_revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510"
|
||||||
|
|
||||||
custom_mapping_model_id = "TheBloke/Yi-34B-AWQ"
|
custom_mapping_model_id = "TheBloke/Mistral-7B-v0.1-AWQ"
|
||||||
custom_model_revision = "f1b2cd1b7459ceecfdc1fac5bb8725f13707c589"
|
custom_model_revision = "f186bcfa9edbe2a4334262ec1e67f23e53ed1ae7"
|
||||||
|
|
||||||
mixtral_model_name = "casperhansen/mixtral-instruct-awq"
|
mixtral_model_name = "casperhansen/mixtral-instruct-awq"
|
||||||
mixtral_model_revision = "87dd4ec502dde74fb3a624835c776b000d190c3b"
|
mixtral_model_revision = "87dd4ec502dde74fb3a624835c776b000d190c3b"
|
||||||
@@ -287,8 +287,8 @@ class AwqFusedTest(unittest.TestCase):
|
|||||||
"You end up exactly where you started. Where are you?"
|
"You end up exactly where you started. Where are you?"
|
||||||
)
|
)
|
||||||
|
|
||||||
EXPECTED_GENERATION = prompt + "\n\nThis is a classic puzzle that has been around for"
|
EXPECTED_GENERATION = prompt + "\n\nYou are at the starting point.\n\nIf"
|
||||||
EXPECTED_GENERATION_CUSTOM_MODEL = "HelloWorld.java:11)\r\n\tat org"
|
EXPECTED_GENERATION_CUSTOM_MODEL = "Hello,\n\nI have a problem with my 20"
|
||||||
EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe"
|
EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe"
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
@@ -423,28 +423,25 @@ class AwqFusedTest(unittest.TestCase):
|
|||||||
fuse_max_seq_len=512,
|
fuse_max_seq_len=512,
|
||||||
modules_to_fuse={
|
modules_to_fuse={
|
||||||
"attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
|
"attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
|
||||||
"layernorm": ["ln1", "ln2", "norm"],
|
|
||||||
"mlp": ["gate_proj", "up_proj", "down_proj"],
|
"mlp": ["gate_proj", "up_proj", "down_proj"],
|
||||||
|
"layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
|
||||||
"use_alibi": False,
|
"use_alibi": False,
|
||||||
"num_attention_heads": 56,
|
"hidden_size": 4096,
|
||||||
|
"num_attention_heads": 32,
|
||||||
"num_key_value_heads": 8,
|
"num_key_value_heads": 8,
|
||||||
"hidden_size": 7168,
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
model = AutoModelForCausalLM.from_pretrained(
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
self.custom_mapping_model_id,
|
self.custom_mapping_model_id,
|
||||||
quantization_config=quantization_config,
|
quantization_config=quantization_config,
|
||||||
trust_remote_code=True,
|
|
||||||
device_map="balanced",
|
device_map="balanced",
|
||||||
revision=self.custom_model_revision,
|
revision=self.custom_model_revision,
|
||||||
)
|
)
|
||||||
|
|
||||||
self._check_fused_modules(model)
|
self._check_fused_modules(model)
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(
|
tokenizer = AutoTokenizer.from_pretrained(self.custom_mapping_model_id, revision=self.custom_model_revision)
|
||||||
self.custom_mapping_model_id, revision=self.custom_model_revision, trust_remote_code=True
|
|
||||||
)
|
|
||||||
|
|
||||||
prompt = "Hello"
|
prompt = "Hello"
|
||||||
inputs = tokenizer(prompt, return_tensors="pt").to(torch_device)
|
inputs = tokenizer(prompt, return_tensors="pt").to(torch_device)
|
||||||
@@ -452,6 +449,7 @@ class AwqFusedTest(unittest.TestCase):
|
|||||||
outputs = model.generate(**inputs, max_new_tokens=12)
|
outputs = model.generate(**inputs, max_new_tokens=12)
|
||||||
self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL)
|
self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL)
|
||||||
|
|
||||||
|
@unittest.skip("Not enough GPU memory on CI runners")
|
||||||
@require_torch_multi_gpu
|
@require_torch_multi_gpu
|
||||||
def test_generation_mixtral_fused(self):
|
def test_generation_mixtral_fused(self):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -1056,7 +1056,6 @@ if __name__ == "__main__":
|
|||||||
"TensorFlow pipelines": "run_tests_tf_pipeline_gpu",
|
"TensorFlow pipelines": "run_tests_tf_pipeline_gpu",
|
||||||
"Examples directory": "run_examples_gpu",
|
"Examples directory": "run_examples_gpu",
|
||||||
"Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports",
|
"Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports",
|
||||||
"Quantization tests": "run_tests_quantization_torch_gpu",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"):
|
if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"):
|
||||||
@@ -1077,7 +1076,6 @@ if __name__ == "__main__":
|
|||||||
"run_pipelines_tf_gpu": "TensorFlow pipelines",
|
"run_pipelines_tf_gpu": "TensorFlow pipelines",
|
||||||
"run_examples_gpu": "Examples directory",
|
"run_examples_gpu": "Examples directory",
|
||||||
"run_all_tests_torch_cuda_extensions_gpu": "Torch CUDA extension tests",
|
"run_all_tests_torch_cuda_extensions_gpu": "Torch CUDA extension tests",
|
||||||
"run_tests_quantization_torch_gpu": "Quantization tests",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Remove some entries in `additional_files` if they are not concerned.
|
# Remove some entries in `additional_files` if they are not concerned.
|
||||||
|
|||||||
251
utils/notification_service_quantization.py
Normal file
251
utils/notification_service_quantization.py
Normal file
@@ -0,0 +1,251 @@
|
|||||||
|
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import ast
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
from get_ci_error_statistics import get_jobs
|
||||||
|
from notification_service import (
|
||||||
|
Message,
|
||||||
|
handle_stacktraces,
|
||||||
|
handle_test_results,
|
||||||
|
prepare_reports,
|
||||||
|
retrieve_artifact,
|
||||||
|
retrieve_available_artifacts,
|
||||||
|
)
|
||||||
|
from slack_sdk import WebClient
|
||||||
|
|
||||||
|
|
||||||
|
client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
|
||||||
|
|
||||||
|
|
||||||
|
class QuantizationMessage(Message):
    """Slack report for the quantization test jobs.

    Builds on `Message` (imported from `notification_service`), reusing its block helpers such as
    `header`, `no_failures`, `get_device_report` and `get_reply_blocks`, but computes its own
    statistics from per-quantization-method results.

    Args:
        title: Title of the report.
        results: Mapping from a quantization test folder name to a dict with the keys read below:
            `"success"` (count), `"failed"` (dict with `"single"` and `"multi"` counts),
            `"time_spent"` (`", "`-separated durations) and `"failures"` (dict mapping a device
            to its list of failures).
    """

    def __init__(
        self,
        title: str,
        results: Dict,
    ):
        # NOTE(review): `Message.__init__` is not called here; presumably its signature expects
        # inputs this report does not have — confirm against `notification_service.Message`.
        self.title = title

        # Failures and successes of the quantization tests
        self.n_success = sum(r["success"] for r in results.values())
        self.single_gpu_failures = sum(r["failed"]["single"] for r in results.values())
        self.multi_gpu_failures = sum(r["failed"]["multi"] for r in results.values())
        self.n_failures = self.single_gpu_failures + self.multi_gpu_failures

        self.n_tests = self.n_failures + self.n_success
        self.results = results
        # Set by `post()`; `post_reply()` needs it to attach replies to the main message.
        self.thread_ts = None

    @property
    def payload(self) -> str:
        """Return the JSON-encoded list of Slack blocks for the main message."""
        blocks = [self.header]

        if self.n_failures > 0:
            blocks.append(self.failures_overwiew)
            blocks.append(self.failures_detailed)
        else:
            blocks.append(self.no_failures)

        return json.dumps(blocks)

    @property
    def time(self) -> str:
        """Return the total time spent over all results, formatted as `{h}h{m}m{s}s`."""
        durations = []
        for result in self.results.values():
            if result["time_spent"]:
                durations.extend(x for x in result["time_spent"].split(", ") if x.strip())

        total_secs = 0
        # The loop variable is not named `time`, so the `time` module is not shadowed here.
        for duration in durations:
            parts = duration.split(":")

            # Time can be formatted as xx:xx:xx, as .xx, or as x.xx if the time spent was less than a minute.
            # Left-pad with zeros so that shorter forms are read as seconds (or minutes:seconds).
            parts = ["0"] * (3 - len(parts)) + parts

            hours, minutes, seconds = int(parts[0]), int(parts[1]), float(parts[2])
            total_secs += hours * 3600 + minutes * 60 + seconds

        hours, minutes, seconds = total_secs // 3600, (total_secs % 3600) // 60, total_secs % 60
        return f"{int(hours)}h{int(minutes)}m{int(seconds)}s"

    @property
    def failures_overwiew(self) -> Dict:
        """Return the Slack section summarizing failure counts, run time and a link to the run.

        The (misspelled) property name is kept as is, since it is part of the class interface.
        """
        return {
            "type": "section",
            "text": {
                "type": "plain_text",
                "text": (
                    f"There were {self.n_failures} failures, out of {self.n_tests} tests.\n"
                    f"The suite ran in {self.time}."
                ),
                "emoji": True,
            },
            "accessory": {
                "type": "button",
                "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
                "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}",
            },
        }

    @property
    def failures_detailed(self) -> Dict:
        """Return the Slack section listing, per quantization folder, the single/multi GPU failures."""
        failures = {k: v["failed"] for k, v in self.results.items()}

        individual_reports = []
        for key, value in failures.items():
            device_report = self.get_device_report(value)
            # Only folders with at least one failure get a row in the table.
            if sum(value.values()):
                report = f"{device_report}{key}"
                individual_reports.append(report)

        header = "Single |  Multi | Category\n"
        failures_report = prepare_reports(
            title="The following quantization tests had failures", header=header, reports=individual_reports
        )

        return {"type": "section", "text": {"type": "mrkdwn", "text": failures_report}}

    def post(self):
        """Post the main report to the report channel and remember the response for threading."""
        payload = self.payload
        print("Sending the following payload")
        print(json.dumps({"blocks": json.loads(payload)}))

        text = f"{self.n_failures} failures out of {self.n_tests} tests," if self.n_failures else "All tests passed."

        self.thread_ts = client.chat_postMessage(
            channel=SLACK_REPORT_CHANNEL_ID,
            blocks=payload,
            text=text,
        )

    def post_reply(self):
        """Post one threaded reply per (quantization folder, device) pair that has failures.

        Raises:
            ValueError: If `post()` has not been called yet, so there is no thread to reply to.
        """
        if self.thread_ts is None:
            raise ValueError("Can only post reply if a post has been made.")

        for job, job_result in self.results.items():
            if len(job_result["failures"]):
                for device, failures in job_result["failures"].items():
                    blocks = self.get_reply_blocks(
                        job,
                        job_result,
                        failures,
                        device,
                        text=f'Number of failures: {job_result["failed"][device]}',
                    )

                    print("Sending the following reply")
                    print(json.dumps({"blocks": blocks}))

                    client.chat_postMessage(
                        # Replies must go to the channel `post()` used: `thread_ts` identifies a
                        # message of that channel, not of a hard-coded one.
                        channel=SLACK_REPORT_CHANNEL_ID,
                        text=f"Results for {job}",
                        blocks=blocks,
                        thread_ts=self.thread_ts["ts"],
                    )

                    # Pause between consecutive replies.
                    time.sleep(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: collect the quantization test artifacts of this workflow run and
    # report them to Slack. The first CLI argument is the quantization matrix (a Python list
    # literal such as "['quantization/bnb', ...]").
    setup_status = os.environ.get("SETUP_STATUS")
    SLACK_REPORT_CHANNEL_ID = os.environ["SLACK_REPORT_CHANNEL"]
    setup_failed = setup_status is not None and setup_status != "success"

    # This env. variable is set in the workflow file (under the job `send_results`).
    ci_event = os.environ["CI_EVENT"]

    title = f"🤗 Results of the {ci_event} tests."

    if setup_failed:
        Message.error_out(
            title, ci_title="", runner_not_available=False, runner_failed=False, setup_failed=setup_failed
        )
        # `sys.exit` rather than the `site`-provided `exit` helper, which is not guaranteed to exist.
        sys.exit(0)

    arguments = sys.argv[1]
    try:
        quantization_matrix = ast.literal_eval(arguments)
    except (SyntaxError, ValueError) as err:
        # `literal_eval` raises `SyntaxError` for e.g. an empty string and `ValueError` for
        # input that is not a plain literal; both mean the matrix is unusable.
        Message.error_out(title, ci_title="")
        raise ValueError("Errored out.") from err

    # Need to change from elements like `quantization/bnb` to `quantization_bnb` (the ones used as artifact names).
    quantization_matrix = [x.replace("quantization/", "quantization_") for x in quantization_matrix]

    available_artifacts = retrieve_available_artifacts()

    # One result entry per quantization folder for which an artifact is actually available.
    quantization_results = {
        quant: {
            "failed": {"single": 0, "multi": 0},
            "success": 0,
            "time_spent": "",
            "failures": {},
            "job_link": {},
        }
        for quant in quantization_matrix
        if f"run_tests_quantization_torch_gpu_{quant}" in available_artifacts
    }

    github_actions_jobs = get_jobs(
        workflow_run_id=os.environ["GITHUB_RUN_ID"], token=os.environ["ACCESS_REPO_INFO_TOKEN"]
    )

    # Map each artifact name to its job, using the name of the job's artifact upload step
    # ("Test suite reports artifacts: <artifact name>").
    artifact_step_prefix = "Test suite reports artifacts: "
    artifact_name_to_job_map = {}
    for job in github_actions_jobs:
        for step in job["steps"]:
            if step["name"].startswith(artifact_step_prefix):
                artifact_name = step["name"][len(artifact_step_prefix) :]
                artifact_name_to_job_map[artifact_name] = job
                break

    for quant, quant_result in quantization_results.items():
        for artifact_path in available_artifacts[f"run_tests_quantization_torch_gpu_{quant}"].paths:
            gpu = artifact_path["gpu"]
            artifact = retrieve_artifact(artifact_path["path"], gpu)
            if "stats" in artifact:
                # Link to the GitHub Action job
                job = artifact_name_to_job_map[artifact_path["path"]]
                quant_result["job_link"][gpu] = job["html_url"]
                failed, success, time_spent = handle_test_results(artifact["stats"])
                quant_result["failed"][gpu] += failed
                quant_result["success"] += success
                # Drops the first and last character — presumably the parentheses around a
                # `(h:mm:ss)` value. NOTE(review): confirm what `handle_test_results` returns for
                # runs shorter than a minute, where no parentheses may be present.
                quant_result["time_spent"] += time_spent[1:-1] + ", "

                stacktraces = handle_stacktraces(artifact["failures_line"])

                for line in artifact["summary_short"].split("\n"):
                    if line.startswith("FAILED "):
                        # Keep only the test identifier following the `FAILED ` marker.
                        line = line[len("FAILED ") :]
                        line = line.split()[0].replace("\n", "")

                        # Stack traces are consumed in order, one per failed test line.
                        quant_result["failures"].setdefault(gpu, []).append(
                            {"line": line, "trace": stacktraces.pop(0)}
                        )

    message = QuantizationMessage(
        title,
        results=quantization_results,
    )

    message.post()
    message.post_reply()
|
||||||
Reference in New Issue
Block a user