diff --git a/engine/docker/dev-host/grafana/dashboards/api.json b/engine/docker/dev-host/grafana/dashboards/api.json index 2623796db4..2af2eb2b2d 100644 --- a/engine/docker/dev-host/grafana/dashboards/api.json +++ b/engine/docker/dev-host/grafana/dashboards/api.json @@ -940,7 +940,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -948,7 +948,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -967,7 +967,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -975,7 +975,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-host/grafana/dashboards/cache.json b/engine/docker/dev-host/grafana/dashboards/cache.json index 282e099a8e..921af7008d 100644 --- a/engine/docker/dev-host/grafana/dashboards/cache.json +++ b/engine/docker/dev-host/grafana/dashboards/cache.json @@ -830,7 +830,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -838,7 +838,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -857,7 
+857,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -865,7 +865,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-host/grafana/dashboards/futures.json b/engine/docker/dev-host/grafana/dashboards/futures.json index fdca7320ea..b57d243698 100644 --- a/engine/docker/dev-host/grafana/dashboards/futures.json +++ b/engine/docker/dev-host/grafana/dashboards/futures.json @@ -128,7 +128,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -136,7 +136,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -157,7 +157,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -165,7 +165,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-host/grafana/dashboards/gasoline.json b/engine/docker/dev-host/grafana/dashboards/gasoline.json index 
0c6616f4e3..64916e56f2 100644 --- a/engine/docker/dev-host/grafana/dashboards/gasoline.json +++ b/engine/docker/dev-host/grafana/dashboards/gasoline.json @@ -91,7 +91,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -127,7 +128,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -194,7 +195,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -230,7 +232,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (workflow_name) (rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) 
(rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -297,7 +299,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -333,7 +336,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -401,7 +404,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -437,7 +441,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name, error) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name, error) (\n max by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) 
group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}: {{error}}", "range": true, @@ -605,7 +609,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -641,7 +646,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (signal_name) (rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (signal_name) (\n max by(signal_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{signal_name}}", "range": true, @@ -1183,11 +1188,187 @@ "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 0, "y": 51 }, + "id": 20, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "" + } + }, + "pluginVersion": 
"11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Core Usage", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 26, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "percentunit" + } + }, + "pluginVersion": "11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Load Shedding Ratio", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": 
{ + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 60 + }, "id": 23, "interval": "15s", "options": { @@ -1274,7 +1455,7 @@ "h": 8, "w": 12, "x": 12, - "y": 51 + "y": 60 }, "id": 24, "interval": "15s", @@ -1354,12 +1535,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1368,6 +1547,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1377,14 +1559,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1397,7 +1578,7 @@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1405,23 +1586,20 @@ "h": 8, "w": 12, "x": 0, - "y": 59 + "y": 68 }, - "id": 13, + "id": 36, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", + "mode": "single", "sort": "none" } }, @@ -1433,14 +1611,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", + "expr": "sum by (sub_workflow_name) 
(rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval]))", + "format": "heatmap", + "legendFormat": "{{sub_workflow_name}}", "range": true, "refId": "A" } ], - "title": "Last Pull Workflows Duration", + "title": "Workflows Dispatched/s", "type": "timeseries" }, { @@ -1459,12 +1637,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1473,6 +1649,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1482,14 +1661,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1502,7 +1680,7 @@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1510,200 +1688,21 @@ "h": 8, "w": 12, "x": 12, - "y": 59 + "y": 68 }, - "id": 14, + "id": 35, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_history_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", - "range": true, - "refId": "A" - } - ], - "title": "Last Pull Workflows History 
Duration", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 67 - }, - "id": 20, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { - "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "" - } - }, - "pluginVersion": "11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", - "format": "heatmap", - "legendFormat": "{{le}}", - "range": true, - "refId": "A" - } - ], - "title": "CPU Core Usage", - "type": "heatmap" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 67 - }, - "id": 26, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - 
"exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "percentunit" + "sort": "none" } }, "pluginVersion": "11.6.7", @@ -1714,15 +1713,15 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=\"\"} [$__rate_interval]))", "format": "heatmap", - "legendFormat": "{{le}}", + "legendFormat": "{{sub_workflow_name}}", "range": true, "refId": "A" } ], - "title": "Load Shedding Ratio", - "type": "heatmap" + "title": "Workflows Dispatched/s (Not From Workflow)", + "type": "timeseries" }, { "datasource": { @@ -2162,41 +2161,9 @@ } ] }, - "unit": "none" + "unit": "sishort" }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "epoxy_coordinator", - "epoxy_purger", - "namespace", - "pegboard_runner", - "pegboard_runner2", - "pegboard_runner_pool", - "pegboard_runner_pool_error_tracker", - "pegboard_serverless_conn" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 9, @@ -2612,7 +2579,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": 
"label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -2620,7 +2587,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2630,7 +2597,9 @@ }, { "current": { - "text": "All", + "text": [ + "All" + ], "value": [ "$__all" ] @@ -2639,7 +2608,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -2647,7 +2616,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2657,9 +2626,7 @@ }, { "current": { - "text": [ - "All" - ], + "text": "All", "value": [ "$__all" ] @@ -2686,12 +2653,12 @@ ] }, "time": { - "from": "now-15m", + "from": "now-30m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Gasoline", "uid": "636d22f9-d18f-4086-8b45-7c50886a105c", - "version": 7 + "version": 9 } \ No newline at end of file diff --git a/engine/docker/dev-host/grafana/dashboards/guard.json b/engine/docker/dev-host/grafana/dashboards/guard.json index f6190ff52c..cd9d61f4a6 100644 --- a/engine/docker/dev-host/grafana/dashboards/guard.json +++ b/engine/docker/dev-host/grafana/dashboards/guard.json @@ -1352,7 +1352,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -1360,7 +1360,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + 
"query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1379,7 +1379,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -1387,7 +1387,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-host/grafana/dashboards/operation.json b/engine/docker/dev-host/grafana/dashboards/operation.json index 7ccc0d3e0c..d4348cec8f 100644 --- a/engine/docker/dev-host/grafana/dashboards/operation.json +++ b/engine/docker/dev-host/grafana/dashboards/operation.json @@ -780,7 +780,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -788,7 +788,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -809,7 +809,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -817,7 +817,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git 
a/engine/docker/dev-host/grafana/dashboards/pegboard.json b/engine/docker/dev-host/grafana/dashboards/pegboard.json index 52bd7422d4..8ed083805d 100644 --- a/engine/docker/dev-host/grafana/dashboards/pegboard.json +++ b/engine/docker/dev-host/grafana/dashboards/pegboard.json @@ -196,9 +196,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -381,9 +381,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n 
)\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -487,7 +487,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_outbound_req_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -591,7 +591,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_serverless_outbound_req_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -695,7 +695,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_connection_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -799,7 +799,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_runner_connection_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -903,7 +903,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_event_multiplexer_count{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1007,7 +1007,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_ingested_events_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - 
"legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1192,7 +1192,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_version_upgrade_drain_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1209,10 +1209,10 @@ { "current": { "text": [ - "prod" + "staging" ], "value": [ - "prod" + "staging" ] }, "datasource": { @@ -1270,5 +1270,5 @@ "timezone": "browser", "title": "Pegboard", "uid": "afa77odsjjk74d", - "version": 1 + "version": 2 } \ No newline at end of file diff --git a/engine/docker/dev-host/grafana/dashboards/tokio.json b/engine/docker/dev-host/grafana/dashboards/tokio.json index e9d9771791..786a3aeada 100644 --- a/engine/docker/dev-host/grafana/dashboards/tokio.json +++ b/engine/docker/dev-host/grafana/dashboards/tokio.json @@ -846,7 +846,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -854,7 +854,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -875,7 +875,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -883,7 +883,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff 
--git a/engine/docker/dev-host/rivet-engine/config.jsonc b/engine/docker/dev-host/rivet-engine/config.jsonc index 87a23b0e07..814d17ecbb 100644 --- a/engine/docker/dev-host/rivet-engine/config.jsonc +++ b/engine/docker/dev-host/rivet-engine/config.jsonc @@ -37,17 +37,6 @@ "native_url": "http://127.0.0.1:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "127.0.0.1", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/api.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/api.json index 2623796db4..2af2eb2b2d 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/api.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/api.json @@ -940,7 +940,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -948,7 +948,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -967,7 +967,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -975,7 +975,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git 
a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/cache.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/cache.json index 282e099a8e..921af7008d 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/cache.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/cache.json @@ -830,7 +830,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -838,7 +838,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -857,7 +857,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -865,7 +865,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/futures.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/futures.json index fdca7320ea..b57d243698 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/futures.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/futures.json @@ -128,7 +128,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -136,7 +136,7 @@ "options": [], "query": { "qryType": 1, - "query": 
"label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -157,7 +157,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -165,7 +165,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/gasoline.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/gasoline.json index 0c6616f4e3..64916e56f2 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/gasoline.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/gasoline.json @@ -91,7 +91,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -127,7 +128,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) 
(rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -194,7 +195,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -230,7 +232,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (workflow_name) (rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -297,7 +299,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -333,7 +336,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max 
by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -401,7 +404,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -437,7 +441,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name, error) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name, error) (\n max by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}: {{error}}", "range": true, @@ -605,7 +609,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -641,7 +646,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (signal_name) (rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (signal_name) (\n max by(signal_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by 
(rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{signal_name}}", "range": true, @@ -1183,11 +1188,187 @@ "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 0, "y": 51 }, + "id": 20, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "" + } + }, + "pluginVersion": "11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Core Usage", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 26, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": 
"scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "percentunit" + } + }, + "pluginVersion": "11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Load Shedding Ratio", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 60 + }, "id": 23, "interval": "15s", "options": { @@ -1274,7 +1455,7 @@ "h": 8, "w": 12, "x": 12, - "y": 51 + "y": 60 }, "id": 24, "interval": "15s", @@ -1354,12 +1535,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1368,6 +1547,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1377,14 +1559,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { 
"mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1397,7 +1578,7 @@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1405,23 +1586,20 @@ "h": 8, "w": 12, "x": 0, - "y": 59 + "y": 68 }, - "id": 13, + "id": 36, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", + "mode": "single", "sort": "none" } }, @@ -1433,14 +1611,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval]))", + "format": "heatmap", + "legendFormat": "{{signal_name}}", "range": true, "refId": "A" } ], - "title": "Last Pull Workflows Duration", + "title": "Workflows Dispatched/s", "type": "timeseries" }, { @@ -1459,12 +1637,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1473,6 +1649,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1482,14 +1661,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1502,7 +1680,7 
@@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1510,200 +1688,21 @@ "h": 8, "w": 12, "x": 12, - "y": 59 + "y": 68 }, - "id": 14, + "id": 35, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_history_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", - "range": true, - "refId": "A" - } - ], - "title": "Last Pull Workflows History Duration", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 67 - }, - "id": 20, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { - "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "" - } - }, - "pluginVersion": 
"11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", - "format": "heatmap", - "legendFormat": "{{le}}", - "range": true, - "refId": "A" - } - ], - "title": "CPU Core Usage", - "type": "heatmap" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 67 - }, - "id": 26, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "percentunit" + "sort": "none" } }, "pluginVersion": "11.6.7", @@ -1714,15 +1713,15 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=\"\"} [$__rate_interval]))", "format": "heatmap", - "legendFormat": "{{le}}", + "legendFormat": "{{sub_workflow_name}}", "range": 
true, "refId": "A" } ], - "title": "Load Shedding Ratio", - "type": "heatmap" + "title": "Workflows Dispatched/s (Not From Workflow)", + "type": "timeseries" }, { "datasource": { @@ -2162,41 +2161,9 @@ } ] }, - "unit": "none" + "unit": "sishort" }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "epoxy_coordinator", - "epoxy_purger", - "namespace", - "pegboard_runner", - "pegboard_runner2", - "pegboard_runner_pool", - "pegboard_runner_pool_error_tracker", - "pegboard_serverless_conn" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 9, @@ -2612,7 +2579,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -2620,7 +2587,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2630,7 +2597,9 @@ }, { "current": { - "text": "All", + "text": [ + "All" + ], "value": [ "$__all" ] @@ -2639,7 +2608,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -2647,7 +2616,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2657,9 +2626,7 @@ }, { "current": { - "text": [ - "All" - 
], + "text": "All", "value": [ "$__all" ] @@ -2686,12 +2653,12 @@ ] }, "time": { - "from": "now-15m", + "from": "now-30m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Gasoline", "uid": "636d22f9-d18f-4086-8b45-7c50886a105c", - "version": 7 + "version": 9 } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/guard.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/guard.json index f6190ff52c..cd9d61f4a6 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/guard.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/guard.json @@ -1352,7 +1352,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -1360,7 +1360,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1379,7 +1379,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -1387,7 +1387,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/operation.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/operation.json index 7ccc0d3e0c..d4348cec8f 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/operation.json +++ 
b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/operation.json @@ -780,7 +780,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -788,7 +788,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -809,7 +809,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -817,7 +817,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/pegboard.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/pegboard.json index 52bd7422d4..8ed083805d 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/pegboard.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/pegboard.json @@ -196,9 +196,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n 
(rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -381,9 +381,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -487,7 +487,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_outbound_req_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -591,7 +591,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_serverless_outbound_req_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": 
"{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -695,7 +695,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_connection_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -799,7 +799,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_runner_connection_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -903,7 +903,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_event_multiplexer_count{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1007,7 +1007,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_ingested_events_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1192,7 +1192,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_version_upgrade_drain_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1209,10 +1209,10 @@ { "current": { "text": [ - "prod" + "staging" ], "value": [ - "prod" + "staging" ] }, "datasource": { @@ -1270,5 +1270,5 @@ "timezone": "browser", "title": "Pegboard", "uid": "afa77odsjjk74d", - "version": 1 + "version": 2 } \ No newline at end of file diff --git 
a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/tokio.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/tokio.json index e9d9771791..786a3aeada 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/tokio.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/tokio.json @@ -846,7 +846,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -854,7 +854,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -875,7 +875,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -883,7 +883,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/0/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/0/config.jsonc index 0f74e2a346..a61a8d52f1 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/0/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/0/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": 
"vector-client-dc-a", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/1/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/1/config.jsonc index 0f74e2a346..a61a8d52f1 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/1/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/1/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-a", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/2/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/2/config.jsonc index 0f74e2a346..a61a8d52f1 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/2/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/2/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-a", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/0/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/0/config.jsonc index 0c940aaf6a..7898758e89 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/0/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/0/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": 
"default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-b", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/1/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/1/config.jsonc index 0c940aaf6a..7898758e89 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/1/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/1/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-b", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/2/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/2/config.jsonc index 0c940aaf6a..7898758e89 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/2/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/2/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-b", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/0/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/0/config.jsonc index b6218cd163..4d40c5693d 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/0/config.jsonc +++ 
b/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/0/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-c", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/1/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/1/config.jsonc index b6218cd163..4d40c5693d 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/1/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/1/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-c", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/2/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/2/config.jsonc index b6218cd163..4d40c5693d 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/2/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/2/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-c", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/api.json b/engine/docker/dev-multidc/core/grafana/dashboards/api.json 
index 2623796db4..2af2eb2b2d 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/api.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/api.json @@ -940,7 +940,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -948,7 +948,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -967,7 +967,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -975,7 +975,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/cache.json b/engine/docker/dev-multidc/core/grafana/dashboards/cache.json index 282e099a8e..921af7008d 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/cache.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/cache.json @@ -830,7 +830,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -838,7 +838,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -857,7 +857,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": 
"label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -865,7 +865,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/futures.json b/engine/docker/dev-multidc/core/grafana/dashboards/futures.json index fdca7320ea..b57d243698 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/futures.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/futures.json @@ -128,7 +128,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -136,7 +136,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -157,7 +157,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -165,7 +165,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/gasoline.json b/engine/docker/dev-multidc/core/grafana/dashboards/gasoline.json index 0c6616f4e3..64916e56f2 100644 --- 
a/engine/docker/dev-multidc/core/grafana/dashboards/gasoline.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/gasoline.json @@ -91,7 +91,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -127,7 +128,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -194,7 +195,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -230,7 +232,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (workflow_name) (rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) 
(rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -297,7 +299,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -333,7 +336,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -401,7 +404,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -437,7 +441,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name, error) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name, error) (\n max by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) 
group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}: {{error}}", "range": true, @@ -605,7 +609,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -641,7 +646,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (signal_name) (rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (signal_name) (\n max by(signal_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{signal_name}}", "range": true, @@ -1183,11 +1188,187 @@ "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 0, "y": 51 }, + "id": 20, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "" + } + }, + "pluginVersion": 
"11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Core Usage", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 26, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "percentunit" + } + }, + "pluginVersion": "11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Load Shedding Ratio", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": 
{ + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 60 + }, "id": 23, "interval": "15s", "options": { @@ -1274,7 +1455,7 @@ "h": 8, "w": 12, "x": 12, - "y": 51 + "y": 60 }, "id": 24, "interval": "15s", @@ -1354,12 +1535,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1368,6 +1547,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1377,14 +1559,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1397,7 +1578,7 @@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1405,23 +1586,20 @@ "h": 8, "w": 12, "x": 0, - "y": 59 + "y": 68 }, - "id": 13, + "id": 36, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", + "mode": "single", "sort": "none" } }, @@ -1433,14 +1611,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", + "expr": "sum by (sub_workflow_name) 
(rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval]))", + "format": "heatmap", + "legendFormat": "{{signal_name}}", "range": true, "refId": "A" } ], - "title": "Last Pull Workflows Duration", + "title": "Workflows Dispatched/s", "type": "timeseries" }, { @@ -1459,12 +1637,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1473,6 +1649,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1482,14 +1661,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1502,7 +1680,7 @@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1510,200 +1688,21 @@ "h": 8, "w": 12, "x": 12, - "y": 59 + "y": 68 }, - "id": 14, + "id": 35, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_history_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", - "range": true, - "refId": "A" - } - ], - "title": "Last Pull Workflows History 
Duration", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 67 - }, - "id": 20, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { - "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "" - } - }, - "pluginVersion": "11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", - "format": "heatmap", - "legendFormat": "{{le}}", - "range": true, - "refId": "A" - } - ], - "title": "CPU Core Usage", - "type": "heatmap" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 67 - }, - "id": 26, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - 
"exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "percentunit" + "sort": "none" } }, "pluginVersion": "11.6.7", @@ -1714,15 +1713,15 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=\"\"} [$__rate_interval]))", "format": "heatmap", - "legendFormat": "{{le}}", + "legendFormat": "{{sub_workflow_name}}", "range": true, "refId": "A" } ], - "title": "Load Shedding Ratio", - "type": "heatmap" + "title": "Workflows Dispatched/s (Not From Workflow)", + "type": "timeseries" }, { "datasource": { @@ -2162,41 +2161,9 @@ } ] }, - "unit": "none" + "unit": "sishort" }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "epoxy_coordinator", - "epoxy_purger", - "namespace", - "pegboard_runner", - "pegboard_runner2", - "pegboard_runner_pool", - "pegboard_runner_pool_error_tracker", - "pegboard_serverless_conn" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 9, @@ -2612,7 +2579,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": 
"label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -2620,7 +2587,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2630,7 +2597,9 @@ }, { "current": { - "text": "All", + "text": [ + "All" + ], "value": [ "$__all" ] @@ -2639,7 +2608,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -2647,7 +2616,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2657,9 +2626,7 @@ }, { "current": { - "text": [ - "All" - ], + "text": "All", "value": [ "$__all" ] @@ -2686,12 +2653,12 @@ ] }, "time": { - "from": "now-15m", + "from": "now-30m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Gasoline", "uid": "636d22f9-d18f-4086-8b45-7c50886a105c", - "version": 7 + "version": 9 } \ No newline at end of file diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/guard.json b/engine/docker/dev-multidc/core/grafana/dashboards/guard.json index f6190ff52c..cd9d61f4a6 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/guard.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/guard.json @@ -1352,7 +1352,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -1360,7 +1360,7 @@ "options": [], "query": { "qryType": 1, - "query": 
"label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1379,7 +1379,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -1387,7 +1387,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/operation.json b/engine/docker/dev-multidc/core/grafana/dashboards/operation.json index 7ccc0d3e0c..d4348cec8f 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/operation.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/operation.json @@ -780,7 +780,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -788,7 +788,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -809,7 +809,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -817,7 +817,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": 
"PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/pegboard.json b/engine/docker/dev-multidc/core/grafana/dashboards/pegboard.json index 52bd7422d4..8ed083805d 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/pegboard.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/pegboard.json @@ -196,9 +196,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -381,9 +381,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) 
(rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -487,7 +487,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_outbound_req_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -591,7 +591,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_serverless_outbound_req_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -695,7 +695,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_connection_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -799,7 +799,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_runner_connection_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -903,7 +903,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_event_multiplexer_count{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1007,7 +1007,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) 
(rate(rivet_pegboard_ingested_events_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1192,7 +1192,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_version_upgrade_drain_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1209,10 +1209,10 @@ { "current": { "text": [ - "prod" + "staging" ], "value": [ - "prod" + "staging" ] }, "datasource": { @@ -1270,5 +1270,5 @@ "timezone": "browser", "title": "Pegboard", "uid": "afa77odsjjk74d", - "version": 1 + "version": 2 } \ No newline at end of file diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/tokio.json b/engine/docker/dev-multidc/core/grafana/dashboards/tokio.json index e9d9771791..786a3aeada 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/tokio.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/tokio.json @@ -846,7 +846,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -854,7 +854,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -875,7 +875,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -883,7 +883,7 @@ "options": [], "query": { "qryType": 1, - "query": 
"label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc/datacenters/dc-a/rivet-engine/config.jsonc b/engine/docker/dev-multidc/datacenters/dc-a/rivet-engine/config.jsonc index dff69c3809..4c9e465d85 100644 --- a/engine/docker/dev-multidc/datacenters/dc-a/rivet-engine/config.jsonc +++ b/engine/docker/dev-multidc/datacenters/dc-a/rivet-engine/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-a", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc/datacenters/dc-b/rivet-engine/config.jsonc b/engine/docker/dev-multidc/datacenters/dc-b/rivet-engine/config.jsonc index 6fa7e3b42d..f35557210f 100644 --- a/engine/docker/dev-multidc/datacenters/dc-b/rivet-engine/config.jsonc +++ b/engine/docker/dev-multidc/datacenters/dc-b/rivet-engine/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-b", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc/datacenters/dc-c/rivet-engine/config.jsonc b/engine/docker/dev-multidc/datacenters/dc-c/rivet-engine/config.jsonc index 95818a2bfb..c12f4bdc1d 100644 --- a/engine/docker/dev-multidc/datacenters/dc-c/rivet-engine/config.jsonc +++ b/engine/docker/dev-multidc/datacenters/dc-c/rivet-engine/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": 
"default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-c", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multinode/grafana/dashboards/api.json b/engine/docker/dev-multinode/grafana/dashboards/api.json index 2623796db4..2af2eb2b2d 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/api.json +++ b/engine/docker/dev-multinode/grafana/dashboards/api.json @@ -940,7 +940,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -948,7 +948,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -967,7 +967,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -975,7 +975,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multinode/grafana/dashboards/cache.json b/engine/docker/dev-multinode/grafana/dashboards/cache.json index 282e099a8e..921af7008d 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/cache.json +++ b/engine/docker/dev-multinode/grafana/dashboards/cache.json @@ -830,7 +830,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", 
"includeAll": true, "label": "Project", "multi": true, @@ -838,7 +838,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -857,7 +857,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -865,7 +865,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multinode/grafana/dashboards/futures.json b/engine/docker/dev-multinode/grafana/dashboards/futures.json index fdca7320ea..b57d243698 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/futures.json +++ b/engine/docker/dev-multinode/grafana/dashboards/futures.json @@ -128,7 +128,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -136,7 +136,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -157,7 +157,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -165,7 +165,7 @@ "options": [], "query": { "qryType": 1, - "query": 
"label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multinode/grafana/dashboards/gasoline.json b/engine/docker/dev-multinode/grafana/dashboards/gasoline.json index 0c6616f4e3..64916e56f2 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/gasoline.json +++ b/engine/docker/dev-multinode/grafana/dashboards/gasoline.json @@ -91,7 +91,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -127,7 +128,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -194,7 +195,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -230,7 +232,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (workflow_name) (rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n 
rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -297,7 +299,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -333,7 +336,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -401,7 +404,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -437,7 +441,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name, error) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name, error) (\n max 
by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}: {{error}}", "range": true, @@ -605,7 +609,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -641,7 +646,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (signal_name) (rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (signal_name) (\n max by(signal_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{signal_name}}", "range": true, @@ -1183,11 +1188,187 @@ "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 0, "y": 51 }, + "id": 20, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": 
"RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "" + } + }, + "pluginVersion": "11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Core Usage", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 26, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "percentunit" + } + }, + "pluginVersion": "11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + 
"expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Load Shedding Ratio", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 60 + }, "id": 23, "interval": "15s", "options": { @@ -1274,7 +1455,7 @@ "h": 8, "w": 12, "x": 12, - "y": 51 + "y": 60 }, "id": 24, "interval": "15s", @@ -1354,12 +1535,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1368,6 +1547,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1377,14 +1559,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1397,7 +1578,7 @@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1405,23 +1586,20 @@ "h": 8, "w": 12, "x": 0, - "y": 59 + "y": 68 }, - "id": 13, + "id": 36, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", + "mode": "single", "sort": 
"none" } }, @@ -1433,14 +1611,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval]))", + "format": "heatmap", + "legendFormat": "{{signal_name}}", "range": true, "refId": "A" } ], - "title": "Last Pull Workflows Duration", + "title": "Workflows Dispatched/s", "type": "timeseries" }, { @@ -1459,12 +1637,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1473,6 +1649,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1482,14 +1661,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1502,7 +1680,7 @@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1510,200 +1688,21 @@ "h": 8, "w": 12, "x": 12, - "y": 59 + "y": 68 }, - "id": 14, + "id": 35, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" 
- }, - "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_history_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", - "range": true, - "refId": "A" - } - ], - "title": "Last Pull Workflows History Duration", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 67 - }, - "id": 20, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { - "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "" - } - }, - "pluginVersion": "11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", - "format": "heatmap", - "legendFormat": "{{le}}", - "range": true, - "refId": "A" - } - ], - "title": "CPU Core Usage", - "type": "heatmap" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, 
- "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 67 - }, - "id": 26, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "percentunit" + "sort": "none" } }, "pluginVersion": "11.6.7", @@ -1714,15 +1713,15 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=\"\"} [$__rate_interval]))", "format": "heatmap", - "legendFormat": "{{le}}", + "legendFormat": "{{sub_workflow_name}}", "range": true, "refId": "A" } ], - "title": "Load Shedding Ratio", - "type": "heatmap" + "title": "Workflows Dispatched/s (Not From Workflow)", + "type": "timeseries" }, { "datasource": { @@ -2162,41 +2161,9 @@ } ] }, - "unit": "none" + "unit": "sishort" }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "epoxy_coordinator", - "epoxy_purger", - "namespace", - "pegboard_runner", - "pegboard_runner2", - "pegboard_runner_pool", - "pegboard_runner_pool_error_tracker", - "pegboard_serverless_conn" - ], - "prefix": "All except:", 
- "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 9, @@ -2612,7 +2579,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -2620,7 +2587,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2630,7 +2597,9 @@ }, { "current": { - "text": "All", + "text": [ + "All" + ], "value": [ "$__all" ] @@ -2639,7 +2608,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -2647,7 +2616,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2657,9 +2626,7 @@ }, { "current": { - "text": [ - "All" - ], + "text": "All", "value": [ "$__all" ] @@ -2686,12 +2653,12 @@ ] }, "time": { - "from": "now-15m", + "from": "now-30m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Gasoline", "uid": "636d22f9-d18f-4086-8b45-7c50886a105c", - "version": 7 + "version": 9 } \ No newline at end of file diff --git a/engine/docker/dev-multinode/grafana/dashboards/guard.json b/engine/docker/dev-multinode/grafana/dashboards/guard.json index f6190ff52c..cd9d61f4a6 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/guard.json +++ b/engine/docker/dev-multinode/grafana/dashboards/guard.json @@ -1352,7 +1352,7 
@@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -1360,7 +1360,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1379,7 +1379,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -1387,7 +1387,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multinode/grafana/dashboards/operation.json b/engine/docker/dev-multinode/grafana/dashboards/operation.json index 7ccc0d3e0c..d4348cec8f 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/operation.json +++ b/engine/docker/dev-multinode/grafana/dashboards/operation.json @@ -780,7 +780,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -788,7 +788,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -809,7 +809,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, 
"label": "Datacenter", "multi": true, @@ -817,7 +817,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multinode/grafana/dashboards/pegboard.json b/engine/docker/dev-multinode/grafana/dashboards/pegboard.json index 52bd7422d4..8ed083805d 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/pegboard.json +++ b/engine/docker/dev-multinode/grafana/dashboards/pegboard.json @@ -196,9 +196,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -381,9 +381,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) 
group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -487,7 +487,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_outbound_req_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -591,7 +591,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_serverless_outbound_req_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -695,7 +695,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_connection_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -799,7 +799,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_runner_connection_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -903,7 +903,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) 
(rivet_pegboard_event_multiplexer_count{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1007,7 +1007,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_ingested_events_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1192,7 +1192,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_version_upgrade_drain_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1209,10 +1209,10 @@ { "current": { "text": [ - "prod" + "staging" ], "value": [ - "prod" + "staging" ] }, "datasource": { @@ -1270,5 +1270,5 @@ "timezone": "browser", "title": "Pegboard", "uid": "afa77odsjjk74d", - "version": 1 + "version": 2 } \ No newline at end of file diff --git a/engine/docker/dev-multinode/grafana/dashboards/tokio.json b/engine/docker/dev-multinode/grafana/dashboards/tokio.json index e9d9771791..786a3aeada 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/tokio.json +++ b/engine/docker/dev-multinode/grafana/dashboards/tokio.json @@ -846,7 +846,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -854,7 +854,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -875,7 +875,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": 
"label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -883,7 +883,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multinode/rivet-engine/0/config.jsonc b/engine/docker/dev-multinode/rivet-engine/0/config.jsonc index 78d35ea7a7..31b5ce77fe 100644 --- a/engine/docker/dev-multinode/rivet-engine/0/config.jsonc +++ b/engine/docker/dev-multinode/rivet-engine/0/config.jsonc @@ -37,17 +37,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multinode/rivet-engine/1/config.jsonc b/engine/docker/dev-multinode/rivet-engine/1/config.jsonc index 78d35ea7a7..31b5ce77fe 100644 --- a/engine/docker/dev-multinode/rivet-engine/1/config.jsonc +++ b/engine/docker/dev-multinode/rivet-engine/1/config.jsonc @@ -37,17 +37,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multinode/rivet-engine/2/config.jsonc b/engine/docker/dev-multinode/rivet-engine/2/config.jsonc index 78d35ea7a7..31b5ce77fe 100644 --- a/engine/docker/dev-multinode/rivet-engine/2/config.jsonc +++ 
b/engine/docker/dev-multinode/rivet-engine/2/config.jsonc @@ -37,17 +37,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev/grafana/dashboards/api.json b/engine/docker/dev/grafana/dashboards/api.json index 2623796db4..2af2eb2b2d 100644 --- a/engine/docker/dev/grafana/dashboards/api.json +++ b/engine/docker/dev/grafana/dashboards/api.json @@ -940,7 +940,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -948,7 +948,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -967,7 +967,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -975,7 +975,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev/grafana/dashboards/cache.json b/engine/docker/dev/grafana/dashboards/cache.json index 282e099a8e..921af7008d 100644 --- a/engine/docker/dev/grafana/dashboards/cache.json +++ b/engine/docker/dev/grafana/dashboards/cache.json @@ -830,7 +830,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": 
"label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -838,7 +838,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -857,7 +857,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -865,7 +865,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev/grafana/dashboards/futures.json b/engine/docker/dev/grafana/dashboards/futures.json index fdca7320ea..b57d243698 100644 --- a/engine/docker/dev/grafana/dashboards/futures.json +++ b/engine/docker/dev/grafana/dashboards/futures.json @@ -128,7 +128,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -136,7 +136,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -157,7 +157,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -165,7 +165,7 @@ "options": [], "query": { "qryType": 1, - "query": 
"label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev/grafana/dashboards/gasoline.json b/engine/docker/dev/grafana/dashboards/gasoline.json index 5bfb0bcb83..64916e56f2 100644 --- a/engine/docker/dev/grafana/dashboards/gasoline.json +++ b/engine/docker/dev/grafana/dashboards/gasoline.json @@ -94,48 +94,7 @@ }, "unit": "sishort" }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "epoxy_coordinator", - "epoxy_purger", - "epoxy_replica", - "namespace", - "namespace_metrics_exporter", - "pegboard_actor", - "pegboard_actor_metrics", - "pegboard_actor_runner_name_selector_backfill", - "pegboard_runner", - "pegboard_runner2", - "pegboard_runner_pool", - "pegboard_runner_pool_error_tracker", - "pegboard_runner_pool_metadata_poller", - "pegboard_serverless_backfill", - "pegboard_serverless_conn", - "pegboard_serverless_runner", - "pegboard_serverless_runner2" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 8, @@ -273,7 +232,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) 
(rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -1229,12 +1188,12 @@ "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 0, "y": 51 }, - "id": 23, + "id": 20, "interval": "15s", "options": { "calculate": false, @@ -1275,7 +1234,7 @@ "max": "60", "min": 0, "reverse": false, - "unit": "s" + "unit": "" } }, "pluginVersion": "11.6.7", @@ -1286,14 +1245,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_pull_workflows_duration_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", "format": "heatmap", "legendFormat": "{{le}}", "range": true, "refId": "A" } ], - "title": "Pull Workflows Duration", + "title": "CPU Core Usage", "type": "heatmap" }, { @@ -1317,12 +1276,12 @@ "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 12, "y": 51 }, - "id": 24, + "id": 26, "interval": "15s", "options": { "calculate": false, @@ -1363,7 +1322,7 @@ "max": "60", "min": 0, "reverse": false, - "unit": "s" + "unit": "percentunit" } }, "pluginVersion": "11.6.7", @@ 
-1374,14 +1333,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_pull_workflows_history_duration_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", "format": "heatmap", "legendFormat": "{{le}}", "range": true, "refId": "A" } ], - "title": "Pull Workflows History Duration", + "title": "Load Shedding Ratio", "type": "heatmap" }, { @@ -1391,59 +1350,16 @@ }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, "scaleDistribution": { "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" } - }, - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" + } }, "overrides": [] }, @@ -1451,129 +1367,50 @@ "h": 8, "w": 12, "x": 0, - "y": 59 + "y": 60 }, - "id": 13, + "id": 23, + "interval": "15s", "options": { - "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } }, - "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "none" - } - }, - 
"pluginVersion": "11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", - "range": true, - "refId": "A" - } - ], - "title": "Last Pull Workflows Duration", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 59 - }, - "id": 14, - "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "show": true + 
}, + "rowsFrame": { + "layout": "auto" }, "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "none" + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "s" } }, "pluginVersion": "11.6.7", @@ -1584,15 +1421,15 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_history_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", + "expr": "sum(increase(rivet_gasoline_pull_workflows_duration_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", "range": true, "refId": "A" } ], - "title": "Last Pull Workflows History Duration", - "type": "timeseries" + "title": "Pull Workflows Duration", + "type": "heatmap" }, { "datasource": { @@ -1615,12 +1452,12 @@ "overrides": [] }, "gridPos": { - "h": 9, + "h": 8, "w": 12, - "x": 0, - "y": 67 + "x": 12, + "y": 60 }, - "id": 20, + "id": 24, "interval": "15s", "options": { "calculate": false, @@ -1661,7 +1498,7 @@ "max": "60", "min": 0, "reverse": false, - "unit": "" + "unit": "s" } }, "pluginVersion": "11.6.7", @@ -1672,14 +1509,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": "sum(increase(rivet_gasoline_pull_workflows_history_duration_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", "format": "heatmap", "legendFormat": "{{le}}", "range": true, "refId": "A" } ], - "title": "CPU Core Usage", + "title": "Pull Workflows History Duration", "type": "heatmap" }, { @@ -1689,67 +1526,81 @@ }, "fieldConfig": { "defaults": { + "color": { + "mode": "palette-classic" + }, 
"custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, "scaleDistribution": { "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" } - } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" }, "overrides": [] }, "gridPos": { - "h": 9, + "h": 8, "w": 12, - "x": 12, - "y": 67 + "x": 0, + "y": 68 }, - "id": 26, + "id": 36, "interval": "15s", "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "percentunit" + "sort": "none" } }, "pluginVersion": "11.6.7", @@ -1760,15 +1611,15 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": 
"sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval]))", "format": "heatmap", - "legendFormat": "{{le}}", + "legendFormat": "{{signal_name}}", "range": true, "refId": "A" } ], - "title": "Load Shedding Ratio", - "type": "heatmap" + "title": "Workflows Dispatched/s", + "type": "timeseries" }, { "datasource": { @@ -1777,68 +1628,81 @@ }, "fieldConfig": { "defaults": { + "color": { + "mode": "palette-classic" + }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, "scaleDistribution": { "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" } - } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 76 + "x": 12, + "y": 68 }, - "id": 34, + "id": 35, "interval": "15s", "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "cellValues": {}, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" + "calcs": [], + "displayMode": 
"list", + "placement": "bottom", + "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "s" + "sort": "none" } }, "pluginVersion": "11.6.7", @@ -1849,15 +1713,15 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_workflow_wake_delta_duration_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval])) by (le)", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=\"\"} [$__rate_interval]))", "format": "heatmap", - "legendFormat": "{{le}}", + "legendFormat": "{{sub_workflow_name}}", "range": true, "refId": "A" } ], - "title": "Workflow Wake Delta", - "type": "heatmap" + "title": "Workflows Dispatched/s (Not From Workflow)", + "type": "timeseries" }, { "datasource": { @@ -1882,10 +1746,10 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, + "x": 0, "y": 76 }, - "id": 35, + "id": 34, "interval": "15s", "options": { "calculate": false, @@ -1927,7 +1791,7 @@ "max": "60", "min": 0, "reverse": false, - "unit": "none" + "unit": "s" } }, "pluginVersion": "11.6.7", @@ -1938,14 +1802,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_workflow_leased_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval])) by (le)", + "expr": "sum(increase(rivet_gasoline_workflow_wake_delta_duration_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval])) by (le)", "format": "heatmap", "legendFormat": "{{le}}", "range": true, "refId": "A" } ], - "title": "Workflow Leases Per Tick", + "title": "Workflow Wake Delta", "type": "heatmap" }, { @@ -2705,17 
+2569,17 @@ { "current": { "text": [ - "All" + "prod" ], "value": [ - "$__all" + "prod" ] }, "datasource": { "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -2723,7 +2587,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2733,7 +2597,9 @@ }, { "current": { - "text": "All", + "text": [ + "All" + ], "value": [ "$__all" ] @@ -2742,7 +2608,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -2750,7 +2616,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2787,7 +2653,7 @@ ] }, "time": { - "from": "now-15m", + "from": "now-30m", "to": "now" }, "timepicker": {}, diff --git a/engine/docker/dev/grafana/dashboards/guard.json b/engine/docker/dev/grafana/dashboards/guard.json index f6190ff52c..cd9d61f4a6 100644 --- a/engine/docker/dev/grafana/dashboards/guard.json +++ b/engine/docker/dev/grafana/dashboards/guard.json @@ -1352,7 +1352,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -1360,7 +1360,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" 
}, "refresh": 1, @@ -1379,7 +1379,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -1387,7 +1387,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev/grafana/dashboards/operation.json b/engine/docker/dev/grafana/dashboards/operation.json index 7ccc0d3e0c..d4348cec8f 100644 --- a/engine/docker/dev/grafana/dashboards/operation.json +++ b/engine/docker/dev/grafana/dashboards/operation.json @@ -780,7 +780,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -788,7 +788,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -809,7 +809,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -817,7 +817,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev/grafana/dashboards/pegboard.json b/engine/docker/dev/grafana/dashboards/pegboard.json index 
52bd7422d4..8ed083805d 100644 --- a/engine/docker/dev/grafana/dashboards/pegboard.json +++ b/engine/docker/dev/grafana/dashboards/pegboard.json @@ -196,9 +196,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -381,9 +381,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } 
@@ -487,7 +487,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_outbound_req_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -591,7 +591,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_serverless_outbound_req_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -695,7 +695,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_connection_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -799,7 +799,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_runner_connection_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -903,7 +903,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_event_multiplexer_count{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1007,7 +1007,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_ingested_events_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1192,7 +1192,7 @@ "editorMode": 
"code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_version_upgrade_drain_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1209,10 +1209,10 @@ { "current": { "text": [ - "prod" + "staging" ], "value": [ - "prod" + "staging" ] }, "datasource": { @@ -1270,5 +1270,5 @@ "timezone": "browser", "title": "Pegboard", "uid": "afa77odsjjk74d", - "version": 1 + "version": 2 } \ No newline at end of file diff --git a/engine/docker/dev/grafana/dashboards/tokio.json b/engine/docker/dev/grafana/dashboards/tokio.json index e9d9771791..786a3aeada 100644 --- a/engine/docker/dev/grafana/dashboards/tokio.json +++ b/engine/docker/dev/grafana/dashboards/tokio.json @@ -846,7 +846,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -854,7 +854,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -875,7 +875,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -883,7 +883,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev/rivet-engine/config.jsonc b/engine/docker/dev/rivet-engine/config.jsonc index 9c5ea86073..3158c9bf72 100644 --- 
a/engine/docker/dev/rivet-engine/config.jsonc +++ b/engine/docker/dev/rivet-engine/config.jsonc @@ -37,17 +37,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/template/src/docker-compose.ts b/engine/docker/template/src/docker-compose.ts index 5e7dd6719b..62ad7279e3 100644 --- a/engine/docker/template/src/docker-compose.ts +++ b/engine/docker/template/src/docker-compose.ts @@ -343,6 +343,8 @@ export function generateDockerCompose(context: TemplateContext) { environment: [ `RIVET_ENDPOINT=http://${context.getServiceHost("rivet-engine", datacenter.name, 0)}:6420`, `RIVET_RUNNER_TOTAL_SLOTS=1000000`, + `AUTOSTART_RUNNER=1`, + `AUTOCONFIGURE_SERVERLESS=0` ], stop_grace_period: "4s", ports: isPrimary && i === 0 ? 
[`5050:5050`] : undefined, diff --git a/engine/docker/template/src/services/edge/rivet-engine.ts b/engine/docker/template/src/services/edge/rivet-engine.ts index 4fa1bf00fa..d388c6dcaa 100644 --- a/engine/docker/template/src/services/edge/rivet-engine.ts +++ b/engine/docker/template/src/services/edge/rivet-engine.ts @@ -63,10 +63,6 @@ export function generateDatacenterRivetEngine( password: "default", secure: false, }, - vector_http: { - host: context.getServiceHost("vector-client", datacenter.name), - port: 5022, - }, }; context.writeDatacenterServiceFile( diff --git a/engine/packages/epoxy/src/http_client.rs b/engine/packages/epoxy/src/http_client.rs index 1eee2eee76..1e3a0d4af0 100644 --- a/engine/packages/epoxy/src/http_client.rs +++ b/engine/packages/epoxy/src/http_client.rs @@ -1,4 +1,4 @@ -use anyhow::*; +use anyhow::{Context, Result, bail}; use epoxy_protocol::{ PROTOCOL_VERSION, protocol::{self, ReplicaId}, @@ -37,7 +37,13 @@ where Fut: Future> + Send, T: Send, { - let quorum_size = utils::calculate_quorum(replica_ids.len(), quorum_type); + let target_responses = utils::calculate_fanout_quorum(replica_ids.len(), quorum_type); + + if target_responses == 0 { + tracing::warn!("no fanout, target is 0"); + + return Ok(Vec::new()); + } // Create futures for all replicas (excluding the sender) let mut responses = futures_util::stream::iter( @@ -57,32 +63,22 @@ where ) .collect::>() .await; - tracing::debug!(?quorum_size, len = ?responses.len(), ?quorum_type, "fanout quorum size"); - - // Choose how many successful responses we need before considering a success - let target_responses = match quorum_type { - // Only require 1 response - utils::QuorumType::Any => 1, - // Include all responses - utils::QuorumType::All => responses.len(), - // Subtract 1 from quorum size since we're not counting ourselves - utils::QuorumType::Fast | utils::QuorumType::Slow => quorum_size - 1, - }; + tracing::debug!(?target_responses, len=?responses.len(), "fanout target"); // Collect 
responses until we reach quorum or all futures complete let mut successful_responses = Vec::new(); while successful_responses.len() < target_responses { if let Some(response) = responses.next().await { match response { - std::result::Result::Ok(result) => match result { - std::result::Result::Ok(response) => { + Ok(result) => match result { + Ok(response) => { successful_responses.push(response); } - std::result::Result::Err(err) => { + Err(err) => { tracing::warn!(?err, "received error from replica"); } }, - std::result::Result::Err(err) => { + Err(err) => { tracing::warn!(?err, "received timeout from replica"); } } @@ -159,8 +155,8 @@ pub async fn send_message_to_address( .await; let response = match response_result { - std::result::Result::Ok(resp) => resp, - std::result::Result::Err(e) => { + Ok(resp) => resp, + Err(e) => { tracing::error!( to_replica = to_replica_id, replica_url = %replica_url, diff --git a/engine/packages/epoxy/src/utils.rs b/engine/packages/epoxy/src/utils.rs index cd7f51953a..a0840a0102 100644 --- a/engine/packages/epoxy/src/utils.rs +++ b/engine/packages/epoxy/src/utils.rs @@ -43,12 +43,54 @@ pub fn get_all_replicas(config: &protocol::ClusterConfig) -> Vec { config.replicas.iter().map(|r| r.replica_id).collect() } +// See EPaxos 4.3 pub fn calculate_quorum(n: usize, q: QuorumType) -> usize { - match q { - QuorumType::Fast => (n * 3) / 4 + 1, - QuorumType::Slow => n / 2 + 1, - QuorumType::All => n, - QuorumType::Any => 1, + match n { + // Nonsensical + 0 => 0, + 1 => 1, + // EPaxos does not apply to clusters with N < 3 because you cannot tolerate any faults. However we can + // still get correctness invariants to hold by requiring both nodes to agree on everything (quorum + // size is always 2) + 2 => match q { + QuorumType::Fast => 2, + QuorumType::Slow => 2, + QuorumType::All => 2, + QuorumType::Any => 1, + }, + // Note that for even N's we don't gain any extra fault tolerance but we get potentially better read + // latency. 
N=4 acts like N=3 in terms of fault tolerance. + n => { + let f = (n - 1) / 2; + + match q { + QuorumType::Fast => f + (f + 1) / 2, + QuorumType::Slow => f + 1, + QuorumType::All => n, + QuorumType::Any => 1, + } + } + } +} + +/// Calculates quorum size assuming the sender is excluded. +pub fn calculate_fanout_quorum(n: usize, q: QuorumType) -> usize { + match n { + // Nonsensical + 0 => 0, + 1 => 0, + // NOTE: See comments in `calculate_quorum` + 2 => 1, + n => { + let f = (n - 1) / 2; + + match q { + QuorumType::Fast => (f + (f + 1) / 2) - 1, + QuorumType::Slow => f, + QuorumType::All => n - 1, + QuorumType::Any => 1, + } + } } } diff --git a/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/conn-error-serialization.ts b/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/conn-error-serialization.ts new file mode 100644 index 0000000000..900943bbc8 --- /dev/null +++ b/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/conn-error-serialization.ts @@ -0,0 +1,27 @@ +import { actor } from "rivetkit"; +import { ActorError } from "@/actor/errors"; + +// Custom error that will be thrown in createConnState +class CustomConnectionError extends ActorError { + constructor(message: string) { + super("connection", "custom_error", message, { public: true }); + } +} + +/** + * Actor that throws a custom error in createConnState to test error serialization + */ +export const connErrorSerializationActor = actor({ + state: { + value: 0, + }, + createConnState: (_c, params: { shouldThrow?: boolean }) => { + if (params.shouldThrow) { + throw new CustomConnectionError("Test error from createConnState"); + } + return { initialized: true }; + }, + actions: { + getValue: (c) => c.state.value, + }, +}); diff --git a/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/registry.ts b/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/registry.ts index d85c7e881e..4b9b840a39 100644 --- 
a/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/registry.ts +++ b/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/registry.ts @@ -75,6 +75,8 @@ import { workflowSleepActor, workflowStopTeardownActor, } from "./workflow"; +import { startStopRaceActor, lifecycleObserver } from "./start-stop-race"; +import { connErrorSerializationActor } from "./conn-error-serialization"; // Consolidated setup with all actors export const registry = setup({ @@ -177,5 +179,10 @@ export const registry = setup({ // From access-control.ts accessControlActor, accessControlNoQueuesActor, + // From start-stop-race.ts + startStopRaceActor, + lifecycleObserver, + // From conn-error-serialization.ts + connErrorSerializationActor, }, }); diff --git a/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/start-stop-race.ts b/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/start-stop-race.ts new file mode 100644 index 0000000000..9fad609233 --- /dev/null +++ b/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/start-stop-race.ts @@ -0,0 +1,71 @@ +import { actor } from "rivetkit"; + +/** + * Actor designed to test start/stop race conditions. + * Has a slow initialization to make race conditions easier to trigger. 
+ */ +export const startStopRaceActor = actor({ + state: { + initialized: false, + startTime: 0, + destroyCalled: false, + startCompleted: false, + }, + onWake: async (c) => { + c.state.startTime = Date.now(); + + // Simulate slow initialization to create window for race condition + await new Promise((resolve) => setTimeout(resolve, 100)); + + c.state.initialized = true; + c.state.startCompleted = true; + }, + onDestroy: (c) => { + c.state.destroyCalled = true; + // Don't save state here - the actor framework will save it automatically + }, + actions: { + getState: (c) => { + return { + initialized: c.state.initialized, + startTime: c.state.startTime, + destroyCalled: c.state.destroyCalled, + startCompleted: c.state.startCompleted, + }; + }, + ping: (c) => { + return "pong"; + }, + destroy: (c) => { + c.destroy(); + }, + }, +}); + +/** + * Observer actor to track lifecycle events from other actors + */ +export const lifecycleObserver = actor({ + state: { + events: [] as Array<{ + actorKey: string; + event: string; + timestamp: number; + }>, + }, + actions: { + recordEvent: (c, params: { actorKey: string; event: string }) => { + c.state.events.push({ + actorKey: params.actorKey, + event: params.event, + timestamp: Date.now(), + }); + }, + getEvents: (c) => { + return c.state.events; + }, + clearEvents: (c) => { + c.state.events = []; + }, + }, +}); diff --git a/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/mod.ts b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/mod.ts index cf3590672e..98db320e1a 100644 --- a/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/mod.ts +++ b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/mod.ts @@ -17,6 +17,7 @@ import { runActorConnTests } from "./tests/actor-conn"; import { runActorConnHibernationTests } from "./tests/actor-conn-hibernation"; import { runActorConnStateTests } from "./tests/actor-conn-state"; import { runActorDbTests } from "./tests/actor-db"; +import { 
runConnErrorSerializationTests } from "./tests/conn-error-serialization"; import { runActorDestroyTests } from "./tests/actor-destroy"; import { runActorDriverTests } from "./tests/actor-driver"; import { runActorErrorHandlingTests } from "./tests/actor-error-handling"; @@ -111,6 +112,8 @@ export function runDriverTests( runActorConnHibernationTests(driverTestConfig); + runConnErrorSerializationTests(driverTestConfig); + runActorDbTests(driverTestConfig); runActorDestroyTests(driverTestConfig); diff --git a/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/actor-driver.ts b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/actor-driver.ts index 438348285a..efa2d96cd9 100644 --- a/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/actor-driver.ts +++ b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/actor-driver.ts @@ -1,5 +1,6 @@ import { describe } from "vitest"; import type { DriverTestConfig } from "../mod"; +import { runActorLifecycleTests } from "./actor-lifecycle"; import { runActorScheduleTests } from "./actor-schedule"; import { runActorSleepTests } from "./actor-sleep"; import { runActorStateTests } from "./actor-state"; @@ -14,5 +15,8 @@ export function runActorDriverTests(driverTestConfig: DriverTestConfig) { // Run actor sleep tests runActorSleepTests(driverTestConfig); + + // Run actor lifecycle tests + runActorLifecycleTests(driverTestConfig); }); } diff --git a/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/actor-lifecycle.ts b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/actor-lifecycle.ts new file mode 100644 index 0000000000..7333cfa977 --- /dev/null +++ b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/actor-lifecycle.ts @@ -0,0 +1,157 @@ +import { describe, expect, test } from "vitest"; +import type { DriverTestConfig } from "../mod"; +import { setupDriverTest } from "../utils"; + +export function 
runActorLifecycleTests(driverTestConfig: DriverTestConfig) { + describe("Actor Lifecycle Tests", () => { + test("actor stop during start waits for start to complete", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + const actorKey = `test-stop-during-start-${Date.now()}`; + + // Create actor - this starts the actor + const actor = client.startStopRaceActor.getOrCreate([actorKey]); + + // Immediately try to call an action and then destroy + // This creates a race where the actor might not be fully started yet + const pingPromise = actor.ping(); + + // Get actor ID + const actorId = await actor.resolve(); + + // Destroy immediately while start might still be in progress + await actor.destroy(); + + // The ping should still complete successfully because destroy waits for start + const result = await pingPromise; + expect(result).toBe("pong"); + + // Verify actor was actually destroyed + let destroyed = false; + try { + await client.startStopRaceActor.getForId(actorId).ping(); + } catch (err: any) { + destroyed = true; + expect(err.group).toBe("actor"); + expect(err.code).toBe("not_found"); + } + expect(destroyed).toBe(true); + }); + + test("actor stop before actor instantiation completes cleans up handler", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + const actorKey = `test-stop-before-instantiation-${Date.now()}`; + + // Create multiple actors rapidly to increase chance of race + const actors = Array.from({ length: 5 }, (_, i) => + client.startStopRaceActor.getOrCreate([ + `${actorKey}-${i}`, + ]), + ); + + // Resolve all actor IDs (this triggers start) + const ids = await Promise.all(actors.map((a) => a.resolve())); + + // Immediately destroy all actors + await Promise.all(actors.map((a) => a.destroy())); + + // Verify all actors were cleaned up + for (const id of ids) { + let destroyed = false; + try { + await client.startStopRaceActor.getForId(id).ping(); + } catch (err: any) { + destroyed = 
true; + expect(err.group).toBe("actor"); + expect(err.code).toBe("not_found"); + } + expect(destroyed, `actor ${id} should be destroyed`).toBe( + true, + ); + } + }); + + test("onBeforeActorStart completes before stop proceeds", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + const actorKey = `test-before-actor-start-${Date.now()}`; + + // Create actor + const actor = client.startStopRaceActor.getOrCreate([actorKey]); + + // Call action to ensure actor is starting + const statePromise = actor.getState(); + + // Destroy immediately + await actor.destroy(); + + // State should be initialized because onBeforeActorStart must complete + const state = await statePromise; + expect(state.initialized).toBe(true); + expect(state.startCompleted).toBe(true); + }); + + test("multiple rapid create/destroy cycles handle race correctly", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + // Perform multiple rapid create/destroy cycles + for (let i = 0; i < 10; i++) { + const actorKey = `test-rapid-cycle-${Date.now()}-${i}`; + const actor = client.startStopRaceActor.getOrCreate([ + actorKey, + ]); + + // Trigger start + const resolvePromise = actor.resolve(); + + // Immediately destroy + const destroyPromise = actor.destroy(); + + // Both should complete without errors + await Promise.all([resolvePromise, destroyPromise]); + } + + // If we get here without errors, the race condition is handled correctly + expect(true).toBe(true); + }); + + test("actor stop called with no actor instance cleans up handler", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + const actorKey = `test-cleanup-no-instance-${Date.now()}`; + + // Create and immediately destroy + const actor = client.startStopRaceActor.getOrCreate([actorKey]); + const id = await actor.resolve(); + await actor.destroy(); + + // Try to recreate with same key - should work without issues + const newActor = 
client.startStopRaceActor.getOrCreate([ + actorKey, + ]); + const result = await newActor.ping(); + expect(result).toBe("pong"); + + // Clean up + await newActor.destroy(); + }); + + test("onDestroy is called even when actor is destroyed during start", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + const actorKey = `test-ondestroy-during-start-${Date.now()}`; + + // Create actor + const actor = client.startStopRaceActor.getOrCreate([actorKey]); + + // Start and immediately destroy + const statePromise = actor.getState(); + await actor.destroy(); + + // Verify onDestroy was called (requires actor to be started) + const state = await statePromise; + expect(state.destroyCalled).toBe(true); + }); + }); +} diff --git a/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/conn-error-serialization.ts b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/conn-error-serialization.ts new file mode 100644 index 0000000000..e5ccf1ef23 --- /dev/null +++ b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/conn-error-serialization.ts @@ -0,0 +1,64 @@ +import { describe, expect, test } from "vitest"; +import type { DriverTestConfig } from "../mod"; +import { setupDriverTest } from "../utils"; + +export function runConnErrorSerializationTests(driverTestConfig: DriverTestConfig) { + describe("Connection Error Serialization Tests", () => { + test("error thrown in createConnState preserves group and code through WebSocket serialization", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + const actorKey = `test-error-serialization-${Date.now()}`; + + // Create actor handle with params that will trigger error in createConnState + const actor = client.connErrorSerializationActor.getOrCreate( + [actorKey], + { params: { shouldThrow: true } }, + ); + + // Try to connect, which will trigger error in createConnState + const conn = actor.connect(); + + // Wait for connection to 
fail + let caughtError: any; + try { + // Try to call an action, which should fail because connection couldn't be established + await conn.getValue(); + } catch (err) { + caughtError = err; + } + + // Verify the error was caught + expect(caughtError).toBeDefined(); + + // Verify the error has the correct group and code from the original error + // Original error: new CustomConnectionError("...") with group="connection", code="custom_error" + expect(caughtError.group).toBe("connection"); + expect(caughtError.code).toBe("custom_error"); + + // Clean up + await conn.dispose(); + }); + + test("successful createConnState does not throw error", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + const actorKey = `test-no-error-${Date.now()}`; + + // Create actor handle with params that will NOT trigger error + const actor = client.connErrorSerializationActor.getOrCreate( + [actorKey], + { params: { shouldThrow: false } }, + ); + + // Connect without triggering error + const conn = actor.connect(); + + // This should succeed + const value = await conn.getValue(); + expect(value).toBe(0); + + // Clean up + await conn.dispose(); + }); + }); +} diff --git a/rivetkit-typescript/packages/rivetkit/src/drivers/engine/actor-driver.ts b/rivetkit-typescript/packages/rivetkit/src/drivers/engine/actor-driver.ts index fb3b50f133..b76c5f69d2 100644 --- a/rivetkit-typescript/packages/rivetkit/src/drivers/engine/actor-driver.ts +++ b/rivetkit-typescript/packages/rivetkit/src/drivers/engine/actor-driver.ts @@ -156,7 +156,7 @@ export class EngineActorDriver implements ActorDriver { onConnected: () => { this.#runnerStarted.resolve(undefined); }, - onDisconnected: (_code, _reason) => {}, + onDisconnected: (_code, _reason) => { }, onShutdown: () => { this.#runnerStopped.resolve(undefined); this.#isRunnerStopped = true; @@ -395,7 +395,7 @@ export class EngineActorDriver implements ActorDriver { async serverlessHandleStart(c: HonoContext): Promise { return 
streamSSE(c, async (stream) => { // NOTE: onAbort does not work reliably - stream.onAbort(() => {}); + stream.onAbort(() => { }); c.req.raw.signal.addEventListener("abort", () => { logger().debug("SSE aborted, shutting down runner"); @@ -514,9 +514,9 @@ export class EngineActorDriver implements ActorDriver { const error = innerError instanceof Error ? new Error( - `Failed to start actor ${actorId}: ${innerError.message}`, - { cause: innerError }, - ) + `Failed to start actor ${actorId}: ${innerError.message}`, + { cause: innerError }, + ) : new Error(`Failed to start actor ${actorId}: ${String(innerError)}`); handler.actor = undefined; handler.actorStartError = error; @@ -559,15 +559,26 @@ export class EngineActorDriver implements ActorDriver { this.#actorStopIntent.delete(actorId); const handler = this.#actors.get(actorId); - if (handler?.actorStartPromise) { - const startError = - handler.actorStartError ?? - new Error(`Actor ${actorId} stopped before start completed`); - handler.actorStartError = startError; - handler.actorStartPromise.reject(startError); - handler.actorStartPromise = undefined; + if (!handler) { + logger().debug({ msg: "no runner actor handler to stop", actorId, reason }); + return; + } + + if (handler.actorStartPromise) { + try { + logger().debug({ msg: "runner actor stopping before it started, waiting", actorId, generation }); + await handler.actorStartPromise.promise; + } catch (err) { + // Start failed, but we still want to clean up the handler + logger().debug({ + msg: "actor start failed during stop, cleaning up handler", + actorId, + err: stringifyError(err), + }); + } } - if (handler?.actor) { + + if (handler.actor) { try { await handler.actor.onStop(reason); } catch (err) { @@ -577,7 +588,8 @@ export class EngineActorDriver implements ActorDriver { }); } } - if (handler) this.#actors.delete(actorId); + + this.#actors.delete(actorId); logger().debug({ msg: "runner actor stopped", actorId, reason }); }