Add CB (#38085)
* stash for now * initial commit * small updated * up * up * works! * nits and fixes * don't loop too much * finish working example * update * fix the small freeblocks issue * feat: stream inputs to continuous batch * fix: update attn from `eager` to `sdpa` * refactor: fmt * refactor: cleanup unnecessary code * feat: add `update` fn to `PagedAttentionCache` * feat: broken optimal block size computation * fix: debugging invalid cache logic * fix: attention mask * refactor: use custom prompts for example * feat: add streaming output * fix: prefill split refactor: add doc strings and unsound/redundant logic fix: compute optimal blocks logic * fix: send decoded tokens when `prefilling_split` -> `decoding` * refactor: move logic to appropriate parent class * fix: remove truncation as we split prefilling anyways refactor: early return when we have enough selected requests * feat: add paged attention forward * push Ggraoh> * add paged sdpa * update * btter mps defaults * feat: add progress bar for `generate_batch` * feat: add opentelemetry metrics (ttft + batch fill %age) * feat: add tracing * Add cuda graphs (#38059) * draft cudagraphs addition * nits * styling * update * fix * kinda draft of what it should look like * fixes * lol * not sure why inf everywhere * can generate but output is shit * some fixes * we should have a single device synch * broken outputs but it does run * refactor * updates * updates with some fixes * fix mask causality * another commit that casts after * add error * simplify example * update * updates * revert llama changes * fix merge conflicts * fix: tracing and metrics * my updates * update script default values * fix block allocation issue * fix prefill split attnetion mask * no bugs * add paged eager * fix * update * style * feat: add pytorch traces * fix * fix * refactor: remove pytorch profiler data * style * nits * cleanup * draft test file * fix * fix * fix paged and graphs * small renamings * cleanups and push * refactor: move tracing and metrics logic to utils * refactor: trace more blocks of code * nits * nits * update * to profile or not to profile * refactor: create new output object * causal by default * cleanup but generations are still off for IDK what reason * simplifications but not running still * this does work. * small quality of life updates * nits * updaet * fix the scheduler * fix warning * ol * fully fixed * nits * different generation parameters * nice * just style * feat: add cache memory usage * feat: add kv cache free memory * feat: add active/waiting count & req latency * do the sampling * fix: synchronize CUDA only if available and improve error handling in ContinuousBatchingManager * fix on mps * feat: add dashboard & histogram buckets * perf: improve waiting reqs data structures * attempt to compile, but we should only do it on mps AFAIK * feat: decouple scheduling logic * just a draft * c;eanup and fixup * optional * style * update * update * remove the draft documentation * fix import as well * update * fix the test * style doomed --------- Co-authored-by: Luc Georges <luc.sydney.georges@gmail.com>
This commit is contained in:
4
examples/metrics-monitoring/README.md
Normal file
4
examples/metrics-monitoring/README.md
Normal file
@@ -0,0 +1,4 @@
|
||||
# Metrics Monitoring
|
||||
|
||||
## Continuous Batching Metrics in Transformers
|
||||
|
||||
974
examples/metrics-monitoring/continuous-batching-dashboard.json
Normal file
974
examples/metrics-monitoring/continuous-batching-dashboard.json
Normal file
@@ -0,0 +1,974 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"target": {
|
||||
"limit": 100,
|
||||
"matchAny": false,
|
||||
"tags": [],
|
||||
"type": "dashboard"
|
||||
},
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": 2,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"description": "Memory usage of the PagedAttentionCache",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 10737418240,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 5368709120
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 8589934592
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"minVizHeight": 75,
|
||||
"minVizWidth": 75,
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"sizing": "auto"
|
||||
},
|
||||
"pluginVersion": "12.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "kv_cache_memory_bytes",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "KV Cache Memory Usage",
|
||||
"transparent": true,
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "dark-blue"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
},
|
||||
"id": 13,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "12.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "active_requests_count",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "Active Requests",
|
||||
"transparent": true,
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "dark-orange"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 14,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "12.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "waiting_requests_count",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "Waiting Requests",
|
||||
"transparent": true,
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"description": "Ratio of decode tokens to prefill tokens in a batch",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "blue"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 0
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "12.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "decode_prefill_ratio",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "Decode/Prefill Ratio",
|
||||
"transparent": true,
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"barWidthFactor": 0.6,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"hideZeros": false,
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "12.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "rate(decode_tokens_processed_total[$__rate_interval])",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Decode tokens throupught tok/s",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"barWidthFactor": 0.6,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 8
|
||||
},
|
||||
"id": 11,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"hideZeros": false,
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "12.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "rate(prefill_tokens_processed_total[$__rate_interval])",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Prefill rate tok/s",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"barWidthFactor": 0.6,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"id": 9,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"hideZeros": false,
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "12.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.95, sum by(le) (rate(batch_fill_percentage_percent_bucket[$__rate_interval])))",
|
||||
"legendFormat": "p95",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum by(le) (rate(batch_fill_percentage_percent_bucket[$__rate_interval])))",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "p99",
|
||||
"range": true,
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.5, sum by(le) (rate(batch_fill_percentage_percent_bucket[$__rate_interval])))",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "p50",
|
||||
"range": true,
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"title": "Batch fill percentage percentiles",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"description": "KV Cache Memory Usage Over Time",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"barWidthFactor": 0.6,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 20,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 16
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"hideZeros": false,
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "12.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "kv_cache_memory_bytes",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"legendFormat": "Used memory",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "kv_cache_free_memory_bytes",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": false,
|
||||
"legendFormat": "free memory",
|
||||
"range": true,
|
||||
"refId": "B",
|
||||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "KV Cache Memory Usage Over Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "ms"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": false
|
||||
},
|
||||
"maxVizHeight": 300,
|
||||
"minVizHeight": 10,
|
||||
"minVizWidth": 0,
|
||||
"namePlacement": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showUnfilled": true,
|
||||
"sizing": "auto",
|
||||
"valueMode": "color"
|
||||
},
|
||||
"pluginVersion": "12.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.95, sum by(le) (rate(ttft_milliseconds_bucket[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"legendFormat": "p95",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.5, sum by(le) (rate(ttft_milliseconds_bucket[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": true,
|
||||
"legendFormat": "p50",
|
||||
"range": true,
|
||||
"refId": "B",
|
||||
"useBackend": false
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.99, sum by(le) (rate(ttft_milliseconds_bucket[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
"instant": false,
|
||||
"legendFormat": "p99",
|
||||
"range": true,
|
||||
"refId": "C",
|
||||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "Time to First Token (TTFT)",
|
||||
"type": "bargauge"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"barWidthFactor": 0.6,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "ms"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 24
|
||||
},
|
||||
"id": 12,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"hideZeros": false,
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "12.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.5, sum by(le) (rate(request_latency_milliseconds_bucket[$__rate_interval])))",
|
||||
"legendFormat": "p50",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.95, sum by(le) (rate(request_latency_milliseconds_bucket[$__rate_interval])))",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "p95",
|
||||
"range": true,
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum by(le) (rate(request_latency_milliseconds_bucket[$__rate_interval])))",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "p99",
|
||||
"range": true,
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"title": "Request latency percentiles",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"preload": false,
|
||||
"refresh": "5s",
|
||||
"schemaVersion": 41,
|
||||
"tags": [],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-15m",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Transformers Continuous Batching Metrics",
|
||||
"uid": "Lw6CTvVSz",
|
||||
"version": 5
|
||||
}
|
||||
55
examples/metrics-monitoring/docker-compose.yml
Normal file
55
examples/metrics-monitoring/docker-compose.yml
Normal file
@@ -0,0 +1,55 @@
|
||||
services:
|
||||
memcached:
|
||||
image: memcached:1.6.29
|
||||
container_name: memcached
|
||||
ports:
|
||||
- "11211:11211"
|
||||
environment:
|
||||
- MEMCACHED_MAX_MEMORY=64m # Set the maximum memory usage
|
||||
- MEMCACHED_THREADS=4 # Number of threads to use
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
command:
|
||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
||||
- --web.enable-otlp-receiver # Enable OTLP receiver
|
||||
- --web.enable-remote-write-receiver
|
||||
- --enable-feature=exemplar-storage
|
||||
- --enable-feature=native-histograms
|
||||
volumes:
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
ports:
|
||||
- "9090:9090"
|
||||
|
||||
tempo:
|
||||
image: grafana/tempo:latest
|
||||
command: [ "-config.file=/etc/tempo.yaml" ]
|
||||
volumes:
|
||||
- ./tempo.yaml:/etc/tempo.yaml
|
||||
ports:
|
||||
- "14268:14268" # jaeger ingest
|
||||
- "3200:3200" # tempo
|
||||
- "9095:9095" # tempo grpc
|
||||
- "4317:4317" # otlp grpc
|
||||
- "4318:4318" # otlp http
|
||||
- "9411:9411" # zipkin
|
||||
depends_on:
|
||||
- memcached
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
volumes:
|
||||
- ./continuous-batching-dashboard.json:/etc/grafana/provisioning/dashboards/continuous-batching-dashboard.json
|
||||
- ./grafana-dashboard.yaml:/etc/grafana/provisioning/dashboards/grafana-dashboard.yaml
|
||||
- ./grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml
|
||||
environment:
|
||||
- GF_AUTH_ANONYMOUS_ENABLED=true
|
||||
- GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
|
||||
- GF_AUTH_DISABLE_LOGIN_FORM=true
|
||||
- GF_FEATURE_TOGGLES_ENABLE=traceqlEditor metricsSummary
|
||||
- GF_INSTALL_PLUGINS=https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app
|
||||
ports:
|
||||
- "3000:3000"
|
||||
depends_on:
|
||||
- prometheus
|
||||
- tempo
|
||||
11
examples/metrics-monitoring/grafana-dashboard.yaml
Normal file
11
examples/metrics-monitoring/grafana-dashboard.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'Transformers Dashboards'
|
||||
orgId: 1
|
||||
folder: 'Transformers'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /etc/grafana/provisioning/dashboards
|
||||
14
examples/metrics-monitoring/grafana-datasources.yaml
Normal file
14
examples/metrics-monitoring/grafana-datasources.yaml
Normal file
@@ -0,0 +1,14 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
|
||||
- name: Tempo
|
||||
type: tempo
|
||||
access: proxy
|
||||
url: http://tempo:3200
|
||||
uid: tempo
|
||||
48
examples/metrics-monitoring/metrics_example.py
Normal file
48
examples/metrics-monitoring/metrics_example.py
Normal file
@@ -0,0 +1,48 @@
|
||||
# Example usage of the trace and attach_tracer decorators
|
||||
|
||||
from transformers.utils.metrics import attach_tracer, traced
|
||||
|
||||
|
||||
@attach_tracer()
|
||||
class ExampleClass:
|
||||
def __init__(self, name):
|
||||
# The attach_tracer decorator has already created self.tracer for us
|
||||
self.name = name
|
||||
|
||||
@traced # This method will use the tracer from the class instance
|
||||
def process_data(self, data):
|
||||
# This method is traced and can use self.tracer
|
||||
return f"Processed {data} with {self.name}"
|
||||
|
||||
@traced(span_name="custom_operation") # With custom span name
|
||||
def special_operation(self, value):
|
||||
# Also traced, with a custom span name
|
||||
return value * 2
|
||||
|
||||
@traced(
|
||||
additional_attributes=[
|
||||
("name", "object.name", lambda x: x.upper()), # Using a transform function
|
||||
("name", "object.fixed_value", "static_value"), # Using a fixed value
|
||||
]
|
||||
)
|
||||
def operation_with_attributes(self):
|
||||
# This will add the specified attributes to the span
|
||||
return "Operation completed"
|
||||
|
||||
|
||||
# For functions without a class, the traced decorator still works
|
||||
@traced
|
||||
def standalone_function(arg1, arg2):
|
||||
# For functions, a tracer is created based on the module name
|
||||
return arg1 + arg2
|
||||
|
||||
|
||||
# Usage:
|
||||
if __name__ == "__main__":
|
||||
# With OpenTelemetry configured, these will produce traces
|
||||
example = ExampleClass("test_object")
|
||||
example.process_data("sample")
|
||||
example.special_operation(42)
|
||||
example.operation_with_attributes()
|
||||
|
||||
result = standalone_function(1, 2)
|
||||
3
examples/metrics-monitoring/prometheus.yml
Normal file
3
examples/metrics-monitoring/prometheus.yml
Normal file
@@ -0,0 +1,3 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
|
||||
90
examples/metrics-monitoring/tempo.yaml
Normal file
90
examples/metrics-monitoring/tempo.yaml
Normal file
@@ -0,0 +1,90 @@
|
||||
stream_over_http_enabled: true
|
||||
server:
|
||||
http_listen_port: 3200
|
||||
log_level: info
|
||||
|
||||
|
||||
cache:
|
||||
background:
|
||||
writeback_goroutines: 5
|
||||
caches:
|
||||
- roles:
|
||||
- frontend-search
|
||||
memcached:
|
||||
addresses: dns+memcached:11211
|
||||
|
||||
query_frontend:
|
||||
search:
|
||||
duration_slo: 5s
|
||||
throughput_bytes_slo: 1.073741824e+09
|
||||
metadata_slo:
|
||||
duration_slo: 5s
|
||||
throughput_bytes_slo: 1.073741824e+09
|
||||
trace_by_id:
|
||||
duration_slo: 100ms
|
||||
metrics:
|
||||
max_duration: 200h # maximum duration of a metrics query, increase for local setups
|
||||
query_backend_after: 5m
|
||||
duration_slo: 5s
|
||||
throughput_bytes_slo: 1.073741824e+09
|
||||
|
||||
distributor:
|
||||
receivers: # this configuration will listen on all ports and protocols that tempo is capable of.
|
||||
jaeger: # the receives all come from the OpenTelemetry collector. more configuration information can
|
||||
protocols: # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver
|
||||
thrift_http: #
|
||||
endpoint: "tempo:14268" # for a production deployment you should only enable the receivers you need!
|
||||
grpc:
|
||||
endpoint: "tempo:14250"
|
||||
thrift_binary:
|
||||
endpoint: "tempo:6832"
|
||||
thrift_compact:
|
||||
endpoint: "tempo:6831"
|
||||
zipkin:
|
||||
endpoint: "tempo:9411"
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: "tempo:4317"
|
||||
http:
|
||||
endpoint: "tempo:4318"
|
||||
opencensus:
|
||||
endpoint: "tempo:55678"
|
||||
|
||||
ingester:
|
||||
max_block_duration: 5m # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally
|
||||
|
||||
compactor:
|
||||
compaction:
|
||||
block_retention: 720h # overall Tempo trace retention. set for demo purposes
|
||||
|
||||
metrics_generator:
|
||||
registry:
|
||||
external_labels:
|
||||
source: tempo
|
||||
cluster: docker-compose
|
||||
storage:
|
||||
path: /var/tempo/generator/wal
|
||||
remote_write:
|
||||
- url: http://prometheus:9090/api/v1/write
|
||||
send_exemplars: true
|
||||
traces_storage:
|
||||
path: /var/tempo/generator/traces
|
||||
processor:
|
||||
local_blocks:
|
||||
filter_server_spans: false
|
||||
flush_to_storage: true
|
||||
|
||||
storage:
|
||||
trace:
|
||||
backend: local # backend configuration to use
|
||||
wal:
|
||||
path: /var/tempo/wal # where to store the wal locally
|
||||
local:
|
||||
path: /var/tempo/blocks
|
||||
|
||||
overrides:
|
||||
defaults:
|
||||
metrics_generator:
|
||||
processors: [service-graphs, span-metrics, local-blocks] # enables metrics generator
|
||||
generate_native_histograms: both
|
||||
109
examples/pytorch/continuous_batching.py
Normal file
109
examples/pytorch/continuous_batching.py
Normal file
@@ -0,0 +1,109 @@
|
||||
import time
|
||||
|
||||
import datasets
|
||||
import torch
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from transformers.generation import GenerationConfig
|
||||
|
||||
|
||||
torch.set_float32_matmul_precision("high")
|
||||
|
||||
model_id = "meta-llama/Llama-3.2-3b-Instruct"
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id, attn_implementation="sdpa_paged", torch_dtype=torch.bfloat16, device_map="auto"
|
||||
).eval()
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
|
||||
|
||||
generation_config = GenerationConfig(
|
||||
max_new_tokens=512,
|
||||
eos_token_id=tokenizer.eos_token_id,
|
||||
pad_token_id=tokenizer.pad_token_id,
|
||||
use_cache=False,
|
||||
num_blocks=2048,
|
||||
block_size=128,
|
||||
do_sample=True,
|
||||
max_batch_tokens=1024, # Maximum number of tokens to process in a single batch
|
||||
scheduler="prefill_first",
|
||||
)
|
||||
|
||||
train_dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
|
||||
|
||||
# --- Example 1: Simple Version using generate_batch ---
|
||||
print("--- Running CB Generation Example ---")
|
||||
|
||||
|
||||
def tokenize_function(examples):
|
||||
return tokenizer(examples["question"])
|
||||
|
||||
|
||||
tokenized_datasets = train_dataset.map(tokenize_function, batched=True)
|
||||
simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]
|
||||
|
||||
start_time_simple = time.time()
|
||||
# model.forward = torch.compile(model.forward, mode="max-autotune-no-cudagraphs", fullgraph=True)
|
||||
batch_outputs = model.generate_batch(
|
||||
inputs=simple_batch_inputs,
|
||||
generation_config=generation_config,
|
||||
)
|
||||
end_time_simple = time.time()
|
||||
|
||||
for request in batch_outputs:
|
||||
input_text = tokenizer.decode(batch_outputs[request].prompt_ids, skip_special_tokens=False)
|
||||
try:
|
||||
output_text = tokenizer.decode(batch_outputs[request].generated_tokens, skip_special_tokens=False)
|
||||
except Exception as e:
|
||||
print(f"Decoding failed for request {request}: {e}")
|
||||
output_text = tokenizer.decode(batch_outputs[request].generated_tokens[1:], skip_special_tokens=False)
|
||||
if len(output_text) > 0:
|
||||
print("-" * 20)
|
||||
print(f"{request} Input: {input_text}")
|
||||
print(f"{request} Output: {output_text}")
|
||||
else:
|
||||
print("", end="\r\r\r\r")
|
||||
print("-" * 20)
|
||||
print("--- Finished CB Generation Example ---\n\n")
|
||||
|
||||
|
||||
print(f"CB generation took: {end_time_simple - start_time_simple:.2f} seconds")
|
||||
|
||||
|
||||
# train_dataset = train_dataset.select(range(5)) # Use only 5 examples for the simple version
|
||||
|
||||
# tokenized_test_prompts = tokenizer(_TEST_PROMPTS, padding=True, padding_side="left", truncation=True, max_length=512)
|
||||
# simple_batch_inputs = list(tokenized_test_prompts["input_ids"])
|
||||
|
||||
# def tokenize_function(examples):
|
||||
# # Truncate to avoid overly long prompts exceeding max context length
|
||||
# return tokenizer(examples["question"], padding=True, truncation=True, max_length=512)
|
||||
|
||||
|
||||
# tokenized_datasets = train_dataset.map(tokenize_function, batched=True)
|
||||
# simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]
|
||||
|
||||
|
||||
# model.config.attn_implementation = "sdpa"
|
||||
# start_time_simple = time.time()
|
||||
# batch_size = 64
|
||||
# full_outputs = []
|
||||
# from tqdm import tqdm
|
||||
|
||||
# for i in tqdm(range(0, len(simple_batch_inputs)-batch_size, batch_size)):
|
||||
# outputs = model.generate(
|
||||
# torch.tensor(simple_batch_inputs[i:i+batch_size], device=model.device),
|
||||
# generation_config=GenerationConfig(
|
||||
# max_new_tokens=16, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id
|
||||
# ),
|
||||
# )
|
||||
# full_outputs.extend(outputs.tolist())
|
||||
|
||||
# end_time_simple = time.time()
|
||||
# print(f"\nSimple batch generation took: {end_time_simple - start_time_simple:.2f} seconds")
|
||||
|
||||
# print("\nResults from simple generate_batch:")
|
||||
# for i, request in enumerate(full_outputs):
|
||||
# output_text = tokenizer.decode(request, skip_special_tokens=False)
|
||||
# print("-" * 20)
|
||||
# print(f" Output: {output_text}")
|
||||
# print("-" * 20)
|
||||
# print("--- Finished Simple Batch Generation Example ---\n\n")
|
||||
Reference in New Issue
Block a user