diff --git a/Cargo.lock b/Cargo.lock index 4354e43af6..defad71b69 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -178,25 +178,25 @@ dependencies = [ [[package]] name = "async-nats" -version = "0.33.0" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbc1f1a75fd07f0f517322d103211f12d757658e91676def9a2e688774656c60" +checksum = "df5af9ebfb0a14481d3eaf6101e6391261e4f30d25b26a7635ade8a39482ded0" dependencies = [ - "base64 0.21.7", + "base64 0.22.1", "bytes", - "futures", - "http 0.2.12", + "futures-util", "memchr", "nkeys", "nuid", "once_cell", + "pin-project", + "portable-atomic", "rand 0.8.5", "regex", "ring", - "rustls 0.21.12", "rustls-native-certs", - "rustls-pemfile 1.0.4", - "rustls-webpki 0.101.7", + "rustls-pki-types", + "rustls-webpki 0.102.8", "serde", "serde_json", "serde_nanos", @@ -204,9 +204,12 @@ dependencies = [ "thiserror 1.0.69", "time", "tokio", - "tokio-retry", - "tokio-rustls 0.24.1", + "tokio-rustls", + "tokio-stream", + "tokio-util", + "tokio-websockets", "tracing", + "tryhard", "url", ] @@ -2114,10 +2117,10 @@ dependencies = [ "http 1.3.1", "hyper 1.6.0", "hyper-util", - "rustls 0.23.29", + "rustls", "rustls-pki-types", "tokio", - "tokio-rustls 0.26.2", + "tokio-rustls", "tower-service", "webpki-roots 1.0.2", ] @@ -2860,11 +2863,10 @@ dependencies = [ [[package]] name = "nkeys" -version = "0.3.2" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aad178aad32087b19042ee36dfd450b73f5f934fbfb058b59b198684dfec4c47" +checksum = "879011babc47a1c7fdf5a935ae3cfe94f34645ca0cac1c7f6424b36fc743d1bf" dependencies = [ - "byteorder", "data-encoding", "ed25519", "ed25519-dalek", @@ -3379,6 +3381,7 @@ dependencies = [ "rivet-metrics", "rivet-runner-protocol", "rivet-runtime", + "rivet-types", "serde", "serde_bare", "serde_json", @@ -3772,7 +3775,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls 0.23.29", + "rustls", "socket2 0.6.0", "thiserror 
2.0.12", "tokio", @@ -3792,7 +3795,7 @@ dependencies = [ "rand 0.9.2", "ring", "rustc-hash", - "rustls 0.23.29", + "rustls", "rustls-pki-types", "slab", "thiserror 2.0.12", @@ -4033,7 +4036,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.29", + "rustls", "rustls-pki-types", "serde", "serde_json", @@ -4041,7 +4044,7 @@ dependencies = [ "sync_wrapper", "tokio", "tokio-native-tls", - "tokio-rustls 0.26.2", + "tokio-rustls", "tokio-util", "tower 0.5.2", "tower-http", @@ -4505,8 +4508,8 @@ dependencies = [ "rivet-pools", "rivet-runner-protocol", "rivet-runtime", - "rustls 0.23.29", - "rustls-pemfile 2.2.0", + "rustls", + "rustls-pemfile", "serde", "serde_json", "tokio", @@ -4547,12 +4550,12 @@ dependencies = [ "rivet-runner-protocol", "rivet-runtime", "rivet-util", - "rustls 0.23.29", - "rustls-pemfile 2.2.0", + "rustls", + "rustls-pemfile", "serde", "serde_json", "tokio", - "tokio-rustls 0.26.2", + "tokio-rustls", "tokio-stream", "tokio-tungstenite", "tracing", @@ -4632,8 +4635,8 @@ name = "rivet-postgres-util" version = "2.1.3" dependencies = [ "anyhow", - "rustls 0.23.29", - "rustls-pemfile 2.2.0", + "rustls", + "rustls-pemfile", "tracing", "webpki-roots 0.26.11", ] @@ -4957,18 +4960,6 @@ dependencies = [ "windows-sys 0.60.2", ] -[[package]] -name = "rustls" -version = "0.21.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" -dependencies = [ - "log", - "ring", - "rustls-webpki 0.101.7", - "sct", -] - [[package]] name = "rustls" version = "0.23.29" @@ -4986,25 +4977,17 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.6.3" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5" dependencies = [ "openssl-probe", - 
"rustls-pemfile 1.0.4", + "rustls-pemfile", + "rustls-pki-types", "schannel", "security-framework", ] -[[package]] -name = "rustls-pemfile" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" -dependencies = [ - "base64 0.21.7", -] - [[package]] name = "rustls-pemfile" version = "2.2.0" @@ -5026,11 +5009,11 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.101.7" +version = "0.102.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" dependencies = [ - "ring", + "rustls-pki-types", "untrusted", ] @@ -5175,16 +5158,6 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "sct" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "sdd" version = "4.2.4" @@ -5242,7 +5215,7 @@ checksum = "48b85e25e8a1fc13928885e8bf13abe8a09e15c46993aed05d6405f7755d6e20" dependencies = [ "httpdate", "reqwest", - "rustls 0.23.29", + "rustls", "sentry-anyhow", "sentry-backtrace", "sentry-contexts", @@ -6130,41 +6103,20 @@ checksum = "27d684bad428a0f2481f42241f821db42c54e2dc81d8c00db8536c506b0a0144" dependencies = [ "const-oid", "ring", - "rustls 0.23.29", + "rustls", "tokio", "tokio-postgres", - "tokio-rustls 0.26.2", + "tokio-rustls", "x509-cert", ] -[[package]] -name = "tokio-retry" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f57eb36ecbe0fc510036adff84824dd3c24bb781e21bfa67b69d556aa85214f" -dependencies = [ - "pin-project", - "rand 
0.8.5", - "tokio", -] - -[[package]] -name = "tokio-rustls" -version = "0.24.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" -dependencies = [ - "rustls 0.21.12", - "tokio", -] - [[package]] name = "tokio-rustls" version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" dependencies = [ - "rustls 0.23.29", + "rustls", "tokio", ] @@ -6204,6 +6156,27 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-websockets" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f591660438b3038dd04d16c938271c79e7e06260ad2ea2885a4861bfb238605d" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-core", + "futures-sink", + "http 1.3.1", + "httparse", + "rand 0.8.5", + "ring", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tokio-util", + "webpki-roots 0.26.11", +] + [[package]] name = "toml_datetime" version = "0.6.11" @@ -6441,6 +6414,16 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "tryhard" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fe58ebd5edd976e0fe0f8a14d2a04b7c81ef153ea9a54eebc42e67c2c23b4e5" +dependencies = [ + "pin-project-lite", + "tokio", +] + [[package]] name = "tungstenite" version = "0.26.2" @@ -6617,8 +6600,8 @@ dependencies = [ "base64 0.22.1", "log", "percent-encoding", - "rustls 0.23.29", - "rustls-pemfile 2.2.0", + "rustls", + "rustls-pemfile", "rustls-pki-types", "ureq-proto", "utf-8", diff --git a/Cargo.toml b/Cargo.toml index 4e9e7f5fc2..aa7c1ca63d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,7 @@ members = [ [workspace.dependencies] anyhow = "1.0.82" async-channel = "2.1.1" - async-nats = "0.33" 
+ async-nats = "0.46.0" async-stream = "0.3" async-trait = "0.1" axum-test = "17" diff --git a/engine/artifacts/config-schema.json b/engine/artifacts/config-schema.json index 064defe444..da87de9fb3 100644 --- a/engine/artifacts/config-schema.json +++ b/engine/artifacts/config-schema.json @@ -960,6 +960,15 @@ "format": "uint", "minimum": 0.0 }, + "serverless_drain_grace_period": { + "description": "Drain grace period for serverless runners.\n\nThis time is subtracted from the configured request duration. Once `duration - grace` is reached, the runner is sent stop commands for all of its actors. After the grace period is over (i.e. the full duration is reached) the runner websocket is forcibly closed.\n\nDefault is 10 seconds.\n\nUnit is in milliseconds.\n\n**Experimental**", + "type": [ + "integer", + "null" + ], + "format": "uint64", + "minimum": 0.0 + }, "serverless_retry_reset_duration": { "description": "How long a serverless runner goes without connection failures before it's retry count is reset to 0, effectively resetting its backoff to 0.\n\nUnit is in milliseconds.\n\n**Experimental**", "type": [ @@ -1192,7 +1201,7 @@ "minimum": 0.0 }, "load_shedding_curve": { - "description": "Determine load shedding ratio based on linear mapping on cpu usage. We will gradually pull less workflows as the cpu usage increases. Units are in (permilli overall cpu usage, permilli) Default: | . . 100% | _____ . | .\\ . % wfs | . \\ . | . \\. 5% | . \\_____ |_____.___.______ 0 60% 80% avg cpu usage", + "description": "Determine load shedding ratio based on linear mapping on cpu usage. We will gradually pull less workflows as the cpu usage increases. Units are in (permilli overall cpu usage, permilli) Default: | . . 100% | _____ . | .\\ . % wfs | . \\ . | . \\. 5% | . 
\\_____ |_____.___.______ 0 70% 90% avg cpu usage", "type": [ "array", "null" diff --git a/engine/artifacts/errors/guard.invalid_request_body.json b/engine/artifacts/errors/guard.invalid_request_body.json new file mode 100644 index 0000000000..c9eb742829 --- /dev/null +++ b/engine/artifacts/errors/guard.invalid_request_body.json @@ -0,0 +1,5 @@ +{ + "code": "invalid_request_body", + "group": "guard", + "message": "Unable to parse request body." +} \ No newline at end of file diff --git a/engine/artifacts/errors/guard.invalid_response_body.json b/engine/artifacts/errors/guard.invalid_response_body.json new file mode 100644 index 0000000000..0ac8786239 --- /dev/null +++ b/engine/artifacts/errors/guard.invalid_response_body.json @@ -0,0 +1,5 @@ +{ + "code": "invalid_response_body", + "group": "guard", + "message": "Unable to parse response body." +} \ No newline at end of file diff --git a/engine/docker/dev-host/grafana/dashboards/api.json b/engine/docker/dev-host/grafana/dashboards/api.json index 2623796db4..2af2eb2b2d 100644 --- a/engine/docker/dev-host/grafana/dashboards/api.json +++ b/engine/docker/dev-host/grafana/dashboards/api.json @@ -940,7 +940,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -948,7 +948,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -967,7 +967,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -975,7 +975,7 @@ "options": [], "query": { "qryType": 1, - "query": 
"label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-host/grafana/dashboards/cache.json b/engine/docker/dev-host/grafana/dashboards/cache.json index 282e099a8e..921af7008d 100644 --- a/engine/docker/dev-host/grafana/dashboards/cache.json +++ b/engine/docker/dev-host/grafana/dashboards/cache.json @@ -830,7 +830,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -838,7 +838,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -857,7 +857,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -865,7 +865,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-host/grafana/dashboards/futures.json b/engine/docker/dev-host/grafana/dashboards/futures.json index fdca7320ea..b57d243698 100644 --- a/engine/docker/dev-host/grafana/dashboards/futures.json +++ b/engine/docker/dev-host/grafana/dashboards/futures.json @@ -128,7 +128,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ 
-136,7 +136,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -157,7 +157,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -165,7 +165,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-host/grafana/dashboards/gasoline.json b/engine/docker/dev-host/grafana/dashboards/gasoline.json index 0c6616f4e3..64916e56f2 100644 --- a/engine/docker/dev-host/grafana/dashboards/gasoline.json +++ b/engine/docker/dev-host/grafana/dashboards/gasoline.json @@ -91,7 +91,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -127,7 +128,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) 
(rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -194,7 +195,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -230,7 +232,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (workflow_name) (rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -297,7 +299,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -333,7 +336,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max 
by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -401,7 +404,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -437,7 +441,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name, error) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name, error) (\n max by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}: {{error}}", "range": true, @@ -605,7 +609,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -641,7 +646,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (signal_name) (rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (signal_name) (\n max by(signal_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by 
(rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{signal_name}}", "range": true, @@ -1183,11 +1188,187 @@ "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 0, "y": 51 }, + "id": 20, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "" + } + }, + "pluginVersion": "11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Core Usage", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 26, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": 
"scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "percentunit" + } + }, + "pluginVersion": "11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Load Shedding Ratio", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 60 + }, "id": 23, "interval": "15s", "options": { @@ -1274,7 +1455,7 @@ "h": 8, "w": 12, "x": 12, - "y": 51 + "y": 60 }, "id": 24, "interval": "15s", @@ -1354,12 +1535,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1368,6 +1547,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1377,14 +1559,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { 
"mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1397,7 +1578,7 @@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1405,23 +1586,20 @@ "h": 8, "w": 12, "x": 0, - "y": 59 + "y": 68 }, - "id": 13, + "id": 36, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", + "mode": "single", "sort": "none" } }, @@ -1433,14 +1611,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval]))", + "format": "heatmap", + "legendFormat": "{{signal_name}}", "range": true, "refId": "A" } ], - "title": "Last Pull Workflows Duration", + "title": "Workflows Dispatched/s", "type": "timeseries" }, { @@ -1459,12 +1637,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1473,6 +1649,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1482,14 +1661,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1502,7 +1680,7 
@@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1510,200 +1688,21 @@ "h": 8, "w": 12, "x": 12, - "y": 59 + "y": 68 }, - "id": 14, + "id": 35, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_history_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", - "range": true, - "refId": "A" - } - ], - "title": "Last Pull Workflows History Duration", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 67 - }, - "id": 20, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { - "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "" - } - }, - "pluginVersion": 
"11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", - "format": "heatmap", - "legendFormat": "{{le}}", - "range": true, - "refId": "A" - } - ], - "title": "CPU Core Usage", - "type": "heatmap" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 67 - }, - "id": 26, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "percentunit" + "sort": "none" } }, "pluginVersion": "11.6.7", @@ -1714,15 +1713,15 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=\"\"} [$__rate_interval]))", "format": "heatmap", - "legendFormat": "{{le}}", + "legendFormat": "{{sub_workflow_name}}", "range": 
true, "refId": "A" } ], - "title": "Load Shedding Ratio", - "type": "heatmap" + "title": "Workflows Dispatched/s (Not From Workflow)", + "type": "timeseries" }, { "datasource": { @@ -2162,41 +2161,9 @@ } ] }, - "unit": "none" + "unit": "sishort" }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "epoxy_coordinator", - "epoxy_purger", - "namespace", - "pegboard_runner", - "pegboard_runner2", - "pegboard_runner_pool", - "pegboard_runner_pool_error_tracker", - "pegboard_serverless_conn" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 9, @@ -2612,7 +2579,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -2620,7 +2587,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2630,7 +2597,9 @@ }, { "current": { - "text": "All", + "text": [ + "All" + ], "value": [ "$__all" ] @@ -2639,7 +2608,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -2647,7 +2616,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2657,9 +2626,7 @@ }, { "current": { - "text": [ - "All" - 
], + "text": "All", "value": [ "$__all" ] @@ -2686,12 +2653,12 @@ ] }, "time": { - "from": "now-15m", + "from": "now-30m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Gasoline", "uid": "636d22f9-d18f-4086-8b45-7c50886a105c", - "version": 7 + "version": 9 } \ No newline at end of file diff --git a/engine/docker/dev-host/grafana/dashboards/guard.json b/engine/docker/dev-host/grafana/dashboards/guard.json index f6190ff52c..cd9d61f4a6 100644 --- a/engine/docker/dev-host/grafana/dashboards/guard.json +++ b/engine/docker/dev-host/grafana/dashboards/guard.json @@ -1352,7 +1352,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -1360,7 +1360,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1379,7 +1379,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -1387,7 +1387,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-host/grafana/dashboards/operation.json b/engine/docker/dev-host/grafana/dashboards/operation.json index 7ccc0d3e0c..d4348cec8f 100644 --- a/engine/docker/dev-host/grafana/dashboards/operation.json +++ b/engine/docker/dev-host/grafana/dashboards/operation.json @@ -780,7 +780,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + 
"definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -788,7 +788,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -809,7 +809,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -817,7 +817,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-host/grafana/dashboards/pegboard.json b/engine/docker/dev-host/grafana/dashboards/pegboard.json index 52bd7422d4..8ed083805d 100644 --- a/engine/docker/dev-host/grafana/dashboards/pegboard.json +++ b/engine/docker/dev-host/grafana/dashboards/pegboard.json @@ -196,9 +196,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - 
"legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -381,9 +381,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -487,7 +487,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_outbound_req_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -591,7 +591,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_serverless_outbound_req_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -695,7 +695,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_connection_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, 
"refId": "A" } @@ -799,7 +799,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_runner_connection_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -903,7 +903,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_event_multiplexer_count{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1007,7 +1007,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_ingested_events_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1192,7 +1192,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_version_upgrade_drain_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1209,10 +1209,10 @@ { "current": { "text": [ - "prod" + "staging" ], "value": [ - "prod" + "staging" ] }, "datasource": { @@ -1270,5 +1270,5 @@ "timezone": "browser", "title": "Pegboard", "uid": "afa77odsjjk74d", - "version": 1 + "version": 2 } \ No newline at end of file diff --git a/engine/docker/dev-host/grafana/dashboards/tokio.json b/engine/docker/dev-host/grafana/dashboards/tokio.json index e9d9771791..786a3aeada 100644 --- a/engine/docker/dev-host/grafana/dashboards/tokio.json +++ b/engine/docker/dev-host/grafana/dashboards/tokio.json @@ -846,7 +846,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + 
"definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -854,7 +854,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -875,7 +875,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -883,7 +883,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-host/rivet-engine/config.jsonc b/engine/docker/dev-host/rivet-engine/config.jsonc index 87a23b0e07..814d17ecbb 100644 --- a/engine/docker/dev-host/rivet-engine/config.jsonc +++ b/engine/docker/dev-host/rivet-engine/config.jsonc @@ -37,17 +37,6 @@ "native_url": "http://127.0.0.1:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "127.0.0.1", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/api.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/api.json index 2623796db4..2af2eb2b2d 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/api.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/api.json @@ -940,7 +940,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, 
"label": "Project", "multi": true, @@ -948,7 +948,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -967,7 +967,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -975,7 +975,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/cache.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/cache.json index 282e099a8e..921af7008d 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/cache.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/cache.json @@ -830,7 +830,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -838,7 +838,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -857,7 +857,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -865,7 +865,7 @@ "options": [], "query": { "qryType": 1, - "query": 
"label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/futures.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/futures.json index fdca7320ea..b57d243698 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/futures.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/futures.json @@ -128,7 +128,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -136,7 +136,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -157,7 +157,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -165,7 +165,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/gasoline.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/gasoline.json index 0c6616f4e3..64916e56f2 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/gasoline.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/gasoline.json @@ -91,7 +91,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, 
"overrides": [] }, @@ -127,7 +128,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -194,7 +195,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -230,7 +232,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (workflow_name) (rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -297,7 +299,8 @@ "value": 80 } 
] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -333,7 +336,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -401,7 +404,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -437,7 +441,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name, error) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name, error) (\n max by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}: 
{{error}}", "range": true, @@ -605,7 +609,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -641,7 +646,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (signal_name) (rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (signal_name) (\n max by(signal_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{signal_name}}", "range": true, @@ -1183,11 +1188,187 @@ "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 0, "y": 51 }, + "id": 20, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "" + } + }, + "pluginVersion": "11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": 
"sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Core Usage", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 26, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "percentunit" + } + }, + "pluginVersion": "11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Load Shedding Ratio", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": 
"linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 60 + }, "id": 23, "interval": "15s", "options": { @@ -1274,7 +1455,7 @@ "h": 8, "w": 12, "x": 12, - "y": 51 + "y": 60 }, "id": 24, "interval": "15s", @@ -1354,12 +1535,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1368,6 +1547,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1377,14 +1559,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1397,7 +1578,7 @@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1405,23 +1586,20 @@ "h": 8, "w": 12, "x": 0, - "y": 59 + "y": 68 }, - "id": 13, + "id": 36, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", + "mode": "single", "sort": "none" } }, @@ -1433,14 +1611,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval]))", + "format": "heatmap", + "legendFormat": "{{signal_name}}", 
"range": true, "refId": "A" } ], - "title": "Last Pull Workflows Duration", + "title": "Workflows Dispatched/s", "type": "timeseries" }, { @@ -1459,12 +1637,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1473,6 +1649,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1482,14 +1661,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1502,7 +1680,7 @@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1510,200 +1688,21 @@ "h": 8, "w": 12, "x": 12, - "y": 59 + "y": 68 }, - "id": 14, + "id": 35, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_history_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", - "range": true, - "refId": "A" - } - ], - "title": "Last Pull Workflows History Duration", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": 
false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 67 - }, - "id": 20, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { - "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "" - } - }, - "pluginVersion": "11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", - "format": "heatmap", - "legendFormat": "{{le}}", - "range": true, - "refId": "A" - } - ], - "title": "CPU Core Usage", - "type": "heatmap" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 67 - }, - "id": 26, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 
- }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "percentunit" + "sort": "none" } }, "pluginVersion": "11.6.7", @@ -1714,15 +1713,15 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=\"\"} [$__rate_interval]))", "format": "heatmap", - "legendFormat": "{{le}}", + "legendFormat": "{{sub_workflow_name}}", "range": true, "refId": "A" } ], - "title": "Load Shedding Ratio", - "type": "heatmap" + "title": "Workflows Dispatched/s (Not From Workflow)", + "type": "timeseries" }, { "datasource": { @@ -2162,41 +2161,9 @@ } ] }, - "unit": "none" + "unit": "sishort" }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "epoxy_coordinator", - "epoxy_purger", - "namespace", - "pegboard_runner", - "pegboard_runner2", - "pegboard_runner_pool", - "pegboard_runner_pool_error_tracker", - "pegboard_serverless_conn" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 9, @@ -2612,7 +2579,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -2620,7 +2587,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": 
"label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2630,7 +2597,9 @@ }, { "current": { - "text": "All", + "text": [ + "All" + ], "value": [ "$__all" ] @@ -2639,7 +2608,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -2647,7 +2616,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2657,9 +2626,7 @@ }, { "current": { - "text": [ - "All" - ], + "text": "All", "value": [ "$__all" ] @@ -2686,12 +2653,12 @@ ] }, "time": { - "from": "now-15m", + "from": "now-30m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Gasoline", "uid": "636d22f9-d18f-4086-8b45-7c50886a105c", - "version": 7 + "version": 9 } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/guard.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/guard.json index f6190ff52c..cd9d61f4a6 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/guard.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/guard.json @@ -1352,7 +1352,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -1360,7 +1360,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1379,7 +1379,7 @@ "type": "prometheus", "uid": "prometheus" 
}, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -1387,7 +1387,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/operation.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/operation.json index 7ccc0d3e0c..d4348cec8f 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/operation.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/operation.json @@ -780,7 +780,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -788,7 +788,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -809,7 +809,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -817,7 +817,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/pegboard.json 
b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/pegboard.json index 52bd7422d4..8ed083805d 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/pegboard.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/pegboard.json @@ -196,9 +196,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -381,9 +381,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", 
"instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -487,7 +487,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_outbound_req_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -591,7 +591,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_serverless_outbound_req_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -695,7 +695,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_connection_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -799,7 +799,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_runner_connection_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -903,7 +903,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_event_multiplexer_count{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1007,7 +1007,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_ingested_events_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": 
"{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1192,7 +1192,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_version_upgrade_drain_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1209,10 +1209,10 @@ { "current": { "text": [ - "prod" + "staging" ], "value": [ - "prod" + "staging" ] }, "datasource": { @@ -1270,5 +1270,5 @@ "timezone": "browser", "title": "Pegboard", "uid": "afa77odsjjk74d", - "version": 1 + "version": 2 } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/tokio.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/tokio.json index e9d9771791..786a3aeada 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/tokio.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/tokio.json @@ -846,7 +846,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -854,7 +854,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -875,7 +875,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -883,7 +883,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": 
"PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/0/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/0/config.jsonc index 0f74e2a346..a61a8d52f1 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/0/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/0/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-a", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/1/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/1/config.jsonc index 0f74e2a346..a61a8d52f1 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/1/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/1/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-a", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/2/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/2/config.jsonc index 0f74e2a346..a61a8d52f1 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/2/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-a/rivet-engine/2/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - 
"provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-a", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/0/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/0/config.jsonc index 0c940aaf6a..7898758e89 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/0/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/0/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-b", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/1/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/1/config.jsonc index 0c940aaf6a..7898758e89 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/1/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/1/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-b", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/2/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/2/config.jsonc index 0c940aaf6a..7898758e89 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/2/config.jsonc +++ 
b/engine/docker/dev-multidc-multinode/datacenters/dc-b/rivet-engine/2/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-b", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/0/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/0/config.jsonc index b6218cd163..4d40c5693d 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/0/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/0/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-c", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/1/config.jsonc b/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/1/config.jsonc index b6218cd163..4d40c5693d 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/1/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/1/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-c", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/2/config.jsonc 
b/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/2/config.jsonc index b6218cd163..4d40c5693d 100644 --- a/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/2/config.jsonc +++ b/engine/docker/dev-multidc-multinode/datacenters/dc-c/rivet-engine/2/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-c", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/api.json b/engine/docker/dev-multidc/core/grafana/dashboards/api.json index 2623796db4..2af2eb2b2d 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/api.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/api.json @@ -940,7 +940,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -948,7 +948,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -967,7 +967,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -975,7 +975,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git 
a/engine/docker/dev-multidc/core/grafana/dashboards/cache.json b/engine/docker/dev-multidc/core/grafana/dashboards/cache.json index 282e099a8e..921af7008d 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/cache.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/cache.json @@ -830,7 +830,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -838,7 +838,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -857,7 +857,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -865,7 +865,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/futures.json b/engine/docker/dev-multidc/core/grafana/dashboards/futures.json index fdca7320ea..b57d243698 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/futures.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/futures.json @@ -128,7 +128,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -136,7 +136,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": 
"PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -157,7 +157,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -165,7 +165,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/gasoline.json b/engine/docker/dev-multidc/core/grafana/dashboards/gasoline.json index 0c6616f4e3..64916e56f2 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/gasoline.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/gasoline.json @@ -91,7 +91,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -127,7 +128,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -194,7 +195,8 @@ "value": 80 } ] 
- } + }, + "unit": "sishort" }, "overrides": [] }, @@ -230,7 +232,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (workflow_name) (rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -297,7 +299,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -333,7 +336,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": 
true, @@ -401,7 +404,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -437,7 +441,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name, error) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name, error) (\n max by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}: {{error}}", "range": true, @@ -605,7 +609,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -641,7 +646,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (signal_name) (rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (signal_name) (\n max by(signal_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{signal_name}}", "range": true, @@ -1183,11 
+1188,187 @@ "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 0, "y": 51 }, + "id": 20, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "" + } + }, + "pluginVersion": "11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Core Usage", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 26, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + 
"layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "percentunit" + } + }, + "pluginVersion": "11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Load Shedding Ratio", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 60 + }, "id": 23, "interval": "15s", "options": { @@ -1274,7 +1455,7 @@ "h": 8, "w": 12, "x": 12, - "y": 51 + "y": 60 }, "id": 24, "interval": "15s", @@ -1354,12 +1535,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1368,6 +1547,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1377,14 +1559,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1397,7 +1578,7 @@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1405,23 +1586,20 @@ "h": 8, "w": 12, "x": 0, - "y": 59 + 
"y": 68 }, - "id": 13, + "id": 36, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", + "mode": "single", "sort": "none" } }, @@ -1433,14 +1611,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval]))", + "format": "heatmap", + "legendFormat": "{{signal_name}}", "range": true, "refId": "A" } ], - "title": "Last Pull Workflows Duration", + "title": "Workflows Dispatched/s", "type": "timeseries" }, { @@ -1459,12 +1637,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1473,6 +1649,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1482,14 +1661,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1502,7 +1680,7 @@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1510,200 +1688,21 @@ "h": 8, "w": 12, "x": 12, - "y": 59 + "y": 68 }, - "id": 14, + "id": 35, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" 
- ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_history_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", - "range": true, - "refId": "A" - } - ], - "title": "Last Pull Workflows History Duration", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 67 - }, - "id": 20, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { - "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "" - } - }, - "pluginVersion": "11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} 
[$__rate_interval])) by (le)", - "format": "heatmap", - "legendFormat": "{{le}}", - "range": true, - "refId": "A" - } - ], - "title": "CPU Core Usage", - "type": "heatmap" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 67 - }, - "id": 26, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "percentunit" + "sort": "none" } }, "pluginVersion": "11.6.7", @@ -1714,15 +1713,15 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=\"\"} [$__rate_interval]))", "format": "heatmap", - "legendFormat": "{{le}}", + "legendFormat": "{{sub_workflow_name}}", "range": true, "refId": "A" } ], - "title": "Load Shedding Ratio", - "type": "heatmap" + "title": "Workflows Dispatched/s (Not From Workflow)", + "type": "timeseries" }, { "datasource": { @@ -2162,41 +2161,9 @@ } ] }, - "unit": "none" + "unit": 
"sishort" }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "epoxy_coordinator", - "epoxy_purger", - "namespace", - "pegboard_runner", - "pegboard_runner2", - "pegboard_runner_pool", - "pegboard_runner_pool_error_tracker", - "pegboard_serverless_conn" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 9, @@ -2612,7 +2579,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -2620,7 +2587,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2630,7 +2597,9 @@ }, { "current": { - "text": "All", + "text": [ + "All" + ], "value": [ "$__all" ] @@ -2639,7 +2608,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -2647,7 +2616,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2657,9 +2626,7 @@ }, { "current": { - "text": [ - "All" - ], + "text": "All", "value": [ "$__all" ] @@ -2686,12 +2653,12 @@ ] }, "time": { - "from": "now-15m", + "from": "now-30m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Gasoline", "uid": 
"636d22f9-d18f-4086-8b45-7c50886a105c", - "version": 7 + "version": 9 } \ No newline at end of file diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/guard.json b/engine/docker/dev-multidc/core/grafana/dashboards/guard.json index f6190ff52c..cd9d61f4a6 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/guard.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/guard.json @@ -1352,7 +1352,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -1360,7 +1360,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1379,7 +1379,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -1387,7 +1387,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/operation.json b/engine/docker/dev-multidc/core/grafana/dashboards/operation.json index 7ccc0d3e0c..d4348cec8f 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/operation.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/operation.json @@ -780,7 +780,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -788,7 +788,7 @@ "options": [], 
"query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -809,7 +809,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -817,7 +817,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/pegboard.json b/engine/docker/dev-multidc/core/grafana/dashboards/pegboard.json index 52bd7422d4..8ed083805d 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/pegboard.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/pegboard.json @@ -196,9 +196,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" 
} @@ -381,9 +381,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -487,7 +487,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_outbound_req_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -591,7 +591,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_serverless_outbound_req_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -695,7 +695,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_connection_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -799,7 +799,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) 
(rate(rivet_pegboard_runner_connection_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -903,7 +903,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_event_multiplexer_count{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1007,7 +1007,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_ingested_events_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1192,7 +1192,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_version_upgrade_drain_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1209,10 +1209,10 @@ { "current": { "text": [ - "prod" + "staging" ], "value": [ - "prod" + "staging" ] }, "datasource": { @@ -1270,5 +1270,5 @@ "timezone": "browser", "title": "Pegboard", "uid": "afa77odsjjk74d", - "version": 1 + "version": 2 } \ No newline at end of file diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/tokio.json b/engine/docker/dev-multidc/core/grafana/dashboards/tokio.json index e9d9771791..786a3aeada 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/tokio.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/tokio.json @@ -846,7 +846,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": 
true, "label": "Project", "multi": true, @@ -854,7 +854,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -875,7 +875,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -883,7 +883,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multidc/datacenters/dc-a/rivet-engine/config.jsonc b/engine/docker/dev-multidc/datacenters/dc-a/rivet-engine/config.jsonc index dff69c3809..4c9e465d85 100644 --- a/engine/docker/dev-multidc/datacenters/dc-a/rivet-engine/config.jsonc +++ b/engine/docker/dev-multidc/datacenters/dc-a/rivet-engine/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-a", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc/datacenters/dc-b/rivet-engine/config.jsonc b/engine/docker/dev-multidc/datacenters/dc-b/rivet-engine/config.jsonc index 6fa7e3b42d..f35557210f 100644 --- a/engine/docker/dev-multidc/datacenters/dc-b/rivet-engine/config.jsonc +++ b/engine/docker/dev-multidc/datacenters/dc-b/rivet-engine/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": 
"vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-b", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multidc/datacenters/dc-c/rivet-engine/config.jsonc b/engine/docker/dev-multidc/datacenters/dc-c/rivet-engine/config.jsonc index 95818a2bfb..c12f4bdc1d 100644 --- a/engine/docker/dev-multidc/datacenters/dc-c/rivet-engine/config.jsonc +++ b/engine/docker/dev-multidc/datacenters/dc-c/rivet-engine/config.jsonc @@ -61,17 +61,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client-dc-c", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multinode/grafana/dashboards/api.json b/engine/docker/dev-multinode/grafana/dashboards/api.json index 2623796db4..2af2eb2b2d 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/api.json +++ b/engine/docker/dev-multinode/grafana/dashboards/api.json @@ -940,7 +940,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -948,7 +948,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -967,7 +967,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -975,7 +975,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", 
+ "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multinode/grafana/dashboards/cache.json b/engine/docker/dev-multinode/grafana/dashboards/cache.json index 282e099a8e..921af7008d 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/cache.json +++ b/engine/docker/dev-multinode/grafana/dashboards/cache.json @@ -830,7 +830,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -838,7 +838,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -857,7 +857,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -865,7 +865,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multinode/grafana/dashboards/futures.json b/engine/docker/dev-multinode/grafana/dashboards/futures.json index fdca7320ea..b57d243698 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/futures.json +++ b/engine/docker/dev-multinode/grafana/dashboards/futures.json @@ -128,7 +128,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -136,7 +136,7 @@ 
"options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -157,7 +157,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -165,7 +165,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multinode/grafana/dashboards/gasoline.json b/engine/docker/dev-multinode/grafana/dashboards/gasoline.json index 0c6616f4e3..64916e56f2 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/gasoline.json +++ b/engine/docker/dev-multinode/grafana/dashboards/gasoline.json @@ -91,7 +91,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -127,7 +128,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) 
(rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -194,7 +195,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -230,7 +232,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (workflow_name) (rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -297,7 +299,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -333,7 +336,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name) (\n max by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max 
by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -401,7 +404,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -437,7 +441,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name, error) (rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"})", + "expr": "sum by (workflow_name, error) (\n max by(workflow_name, error, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_dead{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}: {{error}}", "range": true, @@ -605,7 +609,8 @@ "value": 80 } ] - } + }, + "unit": "sishort" }, "overrides": [] }, @@ -641,7 +646,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (signal_name) (rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (signal_name) (\n max by(signal_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_signal_pending{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by 
(rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{signal_name}}", "range": true, @@ -1183,11 +1188,187 @@ "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 0, "y": 51 }, + "id": 20, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "" + } + }, + "pluginVersion": "11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Core Usage", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 26, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": 
"scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "percentunit" + } + }, + "pluginVersion": "11.6.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Load Shedding Ratio", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 60 + }, "id": 23, "interval": "15s", "options": { @@ -1274,7 +1455,7 @@ "h": 8, "w": 12, "x": 12, - "y": 51 + "y": 60 }, "id": 24, "interval": "15s", @@ -1354,12 +1535,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1368,6 +1547,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1377,14 +1559,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { 
"mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1397,7 +1578,7 @@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1405,23 +1586,20 @@ "h": 8, "w": 12, "x": 0, - "y": 59 + "y": 68 }, - "id": 13, + "id": 36, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", + "mode": "single", "sort": "none" } }, @@ -1433,14 +1611,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval]))", + "format": "heatmap", + "legendFormat": "{{signal_name}}", "range": true, "refId": "A" } ], - "title": "Last Pull Workflows Duration", + "title": "Workflows Dispatched/s", "type": "timeseries" }, { @@ -1459,12 +1637,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1473,6 +1649,9 @@ }, "insertNulls": false, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -1482,14 +1661,13 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -1502,7 +1680,7 
@@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, @@ -1510,200 +1688,21 @@ "h": 8, "w": 12, "x": 12, - "y": 59 + "y": 68 }, - "id": 14, + "id": 35, + "interval": "15s", "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "showLegend": true }, "tooltip": { "hideZeros": false, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_history_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", - "range": true, - "refId": "A" - } - ], - "title": "Last Pull Workflows History Duration", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 67 - }, - "id": 20, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { - "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "" - } - }, - "pluginVersion": 
"11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", - "format": "heatmap", - "legendFormat": "{{le}}", - "range": true, - "refId": "A" - } - ], - "title": "CPU Core Usage", - "type": "heatmap" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 67 - }, - "id": 26, - "interval": "15s", - "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "percentunit" + "sort": "none" } }, "pluginVersion": "11.6.7", @@ -1714,15 +1713,15 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=\"\"} [$__rate_interval]))", "format": "heatmap", - "legendFormat": "{{le}}", + "legendFormat": "{{sub_workflow_name}}", "range": 
true, "refId": "A" } ], - "title": "Load Shedding Ratio", - "type": "heatmap" + "title": "Workflows Dispatched/s (Not From Workflow)", + "type": "timeseries" }, { "datasource": { @@ -2162,41 +2161,9 @@ } ] }, - "unit": "none" + "unit": "sishort" }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "epoxy_coordinator", - "epoxy_purger", - "namespace", - "pegboard_runner", - "pegboard_runner2", - "pegboard_runner_pool", - "pegboard_runner_pool_error_tracker", - "pegboard_serverless_conn" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 9, @@ -2612,7 +2579,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -2620,7 +2587,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2630,7 +2597,9 @@ }, { "current": { - "text": "All", + "text": [ + "All" + ], "value": [ "$__all" ] @@ -2639,7 +2608,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -2647,7 +2616,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2657,9 +2626,7 @@ }, { "current": { - "text": [ - "All" - 
], + "text": "All", "value": [ "$__all" ] @@ -2686,12 +2653,12 @@ ] }, "time": { - "from": "now-15m", + "from": "now-30m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Gasoline", "uid": "636d22f9-d18f-4086-8b45-7c50886a105c", - "version": 7 + "version": 9 } \ No newline at end of file diff --git a/engine/docker/dev-multinode/grafana/dashboards/guard.json b/engine/docker/dev-multinode/grafana/dashboards/guard.json index f6190ff52c..cd9d61f4a6 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/guard.json +++ b/engine/docker/dev-multinode/grafana/dashboards/guard.json @@ -1352,7 +1352,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -1360,7 +1360,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1379,7 +1379,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -1387,7 +1387,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multinode/grafana/dashboards/operation.json b/engine/docker/dev-multinode/grafana/dashboards/operation.json index 7ccc0d3e0c..d4348cec8f 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/operation.json +++ b/engine/docker/dev-multinode/grafana/dashboards/operation.json @@ -780,7 +780,7 @@ "type": "prometheus", "uid": "prometheus" }, - 
"definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -788,7 +788,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -809,7 +809,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -817,7 +817,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multinode/grafana/dashboards/pegboard.json b/engine/docker/dev-multinode/grafana/dashboards/pegboard.json index 52bd7422d4..8ed083805d 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/pegboard.json +++ b/engine/docker/dev-multinode/grafana/dashboards/pegboard.json @@ -196,9 +196,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) 
(rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -381,9 +381,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -487,7 +487,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_outbound_req_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -591,7 +591,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_serverless_outbound_req_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -695,7 +695,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) 
(rivet_pegboard_runner_connection_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -799,7 +799,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_runner_connection_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -903,7 +903,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_event_multiplexer_count{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1007,7 +1007,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_ingested_events_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1192,7 +1192,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_version_upgrade_drain_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1209,10 +1209,10 @@ { "current": { "text": [ - "prod" + "staging" ], "value": [ - "prod" + "staging" ] }, "datasource": { @@ -1270,5 +1270,5 @@ "timezone": "browser", "title": "Pegboard", "uid": "afa77odsjjk74d", - "version": 1 + "version": 2 } \ No newline at end of file diff --git a/engine/docker/dev-multinode/grafana/dashboards/tokio.json b/engine/docker/dev-multinode/grafana/dashboards/tokio.json index e9d9771791..786a3aeada 100644 --- 
a/engine/docker/dev-multinode/grafana/dashboards/tokio.json +++ b/engine/docker/dev-multinode/grafana/dashboards/tokio.json @@ -846,7 +846,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -854,7 +854,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -875,7 +875,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -883,7 +883,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev-multinode/rivet-engine/0/config.jsonc b/engine/docker/dev-multinode/rivet-engine/0/config.jsonc index 78d35ea7a7..31b5ce77fe 100644 --- a/engine/docker/dev-multinode/rivet-engine/0/config.jsonc +++ b/engine/docker/dev-multinode/rivet-engine/0/config.jsonc @@ -37,17 +37,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multinode/rivet-engine/1/config.jsonc b/engine/docker/dev-multinode/rivet-engine/1/config.jsonc index 78d35ea7a7..31b5ce77fe 100644 --- a/engine/docker/dev-multinode/rivet-engine/1/config.jsonc +++ 
b/engine/docker/dev-multinode/rivet-engine/1/config.jsonc @@ -37,17 +37,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev-multinode/rivet-engine/2/config.jsonc b/engine/docker/dev-multinode/rivet-engine/2/config.jsonc index 78d35ea7a7..31b5ce77fe 100644 --- a/engine/docker/dev-multinode/rivet-engine/2/config.jsonc +++ b/engine/docker/dev-multinode/rivet-engine/2/config.jsonc @@ -37,17 +37,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/dev/grafana/dashboards/api.json b/engine/docker/dev/grafana/dashboards/api.json index 2623796db4..2af2eb2b2d 100644 --- a/engine/docker/dev/grafana/dashboards/api.json +++ b/engine/docker/dev/grafana/dashboards/api.json @@ -940,7 +940,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -948,7 +948,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -967,7 +967,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -975,7 +975,7 @@ 
"options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev/grafana/dashboards/cache.json b/engine/docker/dev/grafana/dashboards/cache.json index 282e099a8e..921af7008d 100644 --- a/engine/docker/dev/grafana/dashboards/cache.json +++ b/engine/docker/dev/grafana/dashboards/cache.json @@ -830,7 +830,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -838,7 +838,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -857,7 +857,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -865,7 +865,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev/grafana/dashboards/futures.json b/engine/docker/dev/grafana/dashboards/futures.json index fdca7320ea..b57d243698 100644 --- a/engine/docker/dev/grafana/dashboards/futures.json +++ b/engine/docker/dev/grafana/dashboards/futures.json @@ -128,7 +128,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", 
"multi": true, @@ -136,7 +136,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -157,7 +157,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -165,7 +165,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev/grafana/dashboards/gasoline.json b/engine/docker/dev/grafana/dashboards/gasoline.json index 5bfb0bcb83..64916e56f2 100644 --- a/engine/docker/dev/grafana/dashboards/gasoline.json +++ b/engine/docker/dev/grafana/dashboards/gasoline.json @@ -94,48 +94,7 @@ }, "unit": "sishort" }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "epoxy_coordinator", - "epoxy_purger", - "epoxy_replica", - "namespace", - "namespace_metrics_exporter", - "pegboard_actor", - "pegboard_actor_metrics", - "pegboard_actor_runner_name_selector_backfill", - "pegboard_runner", - "pegboard_runner2", - "pegboard_runner_pool", - "pegboard_runner_pool_error_tracker", - "pegboard_runner_pool_metadata_poller", - "pegboard_serverless_backfill", - "pegboard_serverless_conn", - "pegboard_serverless_runner", - "pegboard_serverless_runner2" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 8, @@ -273,7 +232,7 @@ 
"uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", + "expr": "sum by (workflow_name) (\n max by(workflow_name, rivet_project, rivet_datacenter) (\n rivet_gasoline_workflow_sleeping{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, "legendFormat": "{{workflow_name}}", "range": true, @@ -1229,12 +1188,12 @@ "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 0, "y": 51 }, - "id": 23, + "id": 20, "interval": "15s", "options": { "calculate": false, @@ -1275,7 +1234,7 @@ "max": "60", "min": 0, "reverse": false, - "unit": "s" + "unit": "" } }, "pluginVersion": "11.6.7", @@ -1286,14 +1245,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_pull_workflows_duration_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": 
"sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", "format": "heatmap", "legendFormat": "{{le}}", "range": true, "refId": "A" } ], - "title": "Pull Workflows Duration", + "title": "CPU Core Usage", "type": "heatmap" }, { @@ -1317,12 +1276,12 @@ "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 12, "y": 51 }, - "id": 24, + "id": 26, "interval": "15s", "options": { "calculate": false, @@ -1363,7 +1322,7 @@ "max": "60", "min": 0, "reverse": false, - "unit": "s" + "unit": "percentunit" } }, "pluginVersion": "11.6.7", @@ -1374,14 +1333,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_pull_workflows_history_duration_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", "format": "heatmap", "legendFormat": "{{le}}", "range": true, "refId": "A" } ], - "title": "Pull Workflows History Duration", + "title": "Load Shedding Ratio", "type": "heatmap" }, { @@ -1391,59 +1350,16 @@ }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, "scaleDistribution": { "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" } - }, - "mappings": [], - "min": 0, - 
"thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" + } }, "overrides": [] }, @@ -1451,129 +1367,50 @@ "h": 8, "w": 12, "x": 0, - "y": 59 + "y": 60 }, - "id": 13, + "id": 23, + "interval": "15s", "options": { - "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } }, - "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "11.6.7", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", - "range": true, - "refId": "A" - } - ], - "title": "Last Pull Workflows Duration", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMax": 0.1, - "axisSoftMin": 0, - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" 
- }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 59 - }, - "id": 14, - "options": { "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": false, - "sortBy": "Last *", - "sortDesc": true + "show": true + }, + "rowsFrame": { + "layout": "auto" }, "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "none" + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "s" } }, "pluginVersion": "11.6.7", @@ -1584,15 +1421,15 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "max by (worker_id) (rivet_gasoline_last_pull_workflows_history_duration{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", - "instant": false, - "legendFormat": "{{worker_id}}", + "expr": "sum(increase(rivet_gasoline_pull_workflows_duration_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", "range": true, "refId": "A" } ], - "title": "Last Pull Workflows History Duration", - "type": "timeseries" + "title": "Pull Workflows Duration", + "type": "heatmap" }, { "datasource": { @@ -1615,12 +1452,12 @@ "overrides": [] }, "gridPos": { - "h": 9, + "h": 8, "w": 12, - "x": 0, - "y": 67 + "x": 12, + "y": 60 }, - "id": 20, + "id": 24, "interval": "15s", "options": { "calculate": false, @@ -1661,7 +1498,7 @@ "max": "60", "min": 0, "reverse": false, - "unit": "" + "unit": "s" } }, "pluginVersion": "11.6.7", @@ -1672,14 +1509,14 @@ "uid": "prometheus" }, 
"editorMode": "code", - "expr": "sum(increase(rivet_gasoline_cpu_usage_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": "sum(increase(rivet_gasoline_pull_workflows_history_duration_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", "format": "heatmap", "legendFormat": "{{le}}", "range": true, "refId": "A" } ], - "title": "CPU Core Usage", + "title": "Pull Workflows History Duration", "type": "heatmap" }, { @@ -1689,67 +1526,81 @@ }, "fieldConfig": { "defaults": { + "color": { + "mode": "palette-classic" + }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, "scaleDistribution": { "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" } - } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" }, "overrides": [] }, "gridPos": { - "h": 9, + "h": 8, "w": 12, - "x": 12, - "y": 67 + "x": 0, + "y": 68 }, - "id": 26, + "id": 36, "interval": "15s", "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, "legend": { - "show": true - }, - 
"rowsFrame": { - "layout": "auto" + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "percentunit" + "sort": "none" } }, "pluginVersion": "11.6.7", @@ -1760,15 +1611,15 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_load_shedding_ratio_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval])) by (le)", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval]))", "format": "heatmap", - "legendFormat": "{{le}}", + "legendFormat": "{{signal_name}}", "range": true, "refId": "A" } ], - "title": "Load Shedding Ratio", - "type": "heatmap" + "title": "Workflows Dispatched/s", + "type": "timeseries" }, { "datasource": { @@ -1777,68 +1628,81 @@ }, "fieldConfig": { "defaults": { + "color": { + "mode": "palette-classic" + }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, "scaleDistribution": { "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" } - } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" 
}, "overrides": [] }, "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 76 + "x": 12, + "y": 68 }, - "id": 34, + "id": 35, "interval": "15s", "options": { - "calculate": false, - "calculation": { - "xBuckets": { - "mode": "size" - } - }, - "cellGap": 0, - "cellValues": {}, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "RdBu", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "max": "60", - "min": 0, - "reverse": false, - "unit": "s" + "sort": "none" } }, "pluginVersion": "11.6.7", @@ -1849,15 +1713,15 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_workflow_wake_delta_duration_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval])) by (le)", + "expr": "sum by (sub_workflow_name) (rate(rivet_gasoline_workflow_dispatched_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=\"\"} [$__rate_interval]))", "format": "heatmap", - "legendFormat": "{{le}}", + "legendFormat": "{{sub_workflow_name}}", "range": true, "refId": "A" } ], - "title": "Workflow Wake Delta", - "type": "heatmap" + "title": "Workflows Dispatched/s (Not From Workflow)", + "type": "timeseries" }, { "datasource": { @@ -1882,10 +1746,10 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, + "x": 0, "y": 76 }, - "id": 35, + "id": 34, "interval": "15s", "options": { "calculate": false, @@ -1927,7 +1791,7 @@ "max": "60", "min": 0, "reverse": false, - "unit": "none" + "unit": "s" } }, "pluginVersion": "11.6.7", @@ -1938,14 +1802,14 @@ "uid": 
"prometheus" }, "editorMode": "code", - "expr": "sum(increase(rivet_gasoline_workflow_leased_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval])) by (le)", + "expr": "sum(increase(rivet_gasoline_workflow_wake_delta_duration_bucket{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\",workflow_name=~\"$workflow_name\"} [$__rate_interval])) by (le)", "format": "heatmap", "legendFormat": "{{le}}", "range": true, "refId": "A" } ], - "title": "Workflow Leases Per Tick", + "title": "Workflow Wake Delta", "type": "heatmap" }, { @@ -2705,17 +2569,17 @@ { "current": { "text": [ - "All" + "prod" ], "value": [ - "$__all" + "prod" ] }, "datasource": { "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -2723,7 +2587,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2733,7 +2597,9 @@ }, { "current": { - "text": "All", + "text": [ + "All" + ], "value": [ "$__all" ] @@ -2742,7 +2608,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -2750,7 +2616,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -2787,7 +2653,7 @@ ] }, "time": { - "from": "now-15m", + "from": "now-30m", "to": "now" }, "timepicker": {}, diff --git a/engine/docker/dev/grafana/dashboards/guard.json 
b/engine/docker/dev/grafana/dashboards/guard.json index f6190ff52c..cd9d61f4a6 100644 --- a/engine/docker/dev/grafana/dashboards/guard.json +++ b/engine/docker/dev/grafana/dashboards/guard.json @@ -1352,7 +1352,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -1360,7 +1360,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1379,7 +1379,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -1387,7 +1387,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev/grafana/dashboards/operation.json b/engine/docker/dev/grafana/dashboards/operation.json index 7ccc0d3e0c..d4348cec8f 100644 --- a/engine/docker/dev/grafana/dashboards/operation.json +++ b/engine/docker/dev/grafana/dashboards/operation.json @@ -780,7 +780,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -788,7 +788,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -809,7 +809,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": 
"label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -817,7 +817,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev/grafana/dashboards/pegboard.json b/engine/docker/dev/grafana/dashboards/pegboard.json index 52bd7422d4..8ed083805d 100644 --- a/engine/docker/dev/grafana/dashboards/pegboard.json +++ b/engine/docker/dev/grafana/dashboards/pegboard.json @@ -196,9 +196,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n rivet_pegboard_actor_pending_allocation{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -381,9 +381,9 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", + "expr": "sum by (rivet_datacenter) (\n max by(rivet_project, rivet_datacenter) (\n 
rivet_pegboard_serverless_desired_slots{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}\n * on(k8s_pod_name) group_left(rivet_project, rivet_datacenter)\n (\n (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} == bool\n on(rivet_project, rivet_datacenter) group_left()\n max by (rivet_project, rivet_datacenter) (rivet_gasoline_worker_last_metrics_publish{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"}))\n )\n )\n)", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -487,7 +487,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_serverless_outbound_req_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -591,7 +591,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_serverless_outbound_req_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -695,7 +695,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_connection_active{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -799,7 +799,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_runner_connection_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -903,7 +903,7 @@ "editorMode": "code", "expr": 
"sum by (rivet_datacenter) (rivet_pegboard_event_multiplexer_count{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1007,7 +1007,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rate(rivet_pegboard_ingested_events_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"} [$__rate_interval]))", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1192,7 +1192,7 @@ "editorMode": "code", "expr": "sum by (rivet_datacenter) (rivet_pegboard_runner_version_upgrade_drain_total{rivet_project=~\"$project\",rivet_datacenter=~\"$datacenter\"})", "instant": false, - "legendFormat": "{{workflow_name}}", + "legendFormat": "{{rivet_datacenter}}", "range": true, "refId": "A" } @@ -1209,10 +1209,10 @@ { "current": { "text": [ - "prod" + "staging" ], "value": [ - "prod" + "staging" ] }, "datasource": { @@ -1270,5 +1270,5 @@ "timezone": "browser", "title": "Pegboard", "uid": "afa77odsjjk74d", - "version": 1 + "version": 2 } \ No newline at end of file diff --git a/engine/docker/dev/grafana/dashboards/tokio.json b/engine/docker/dev/grafana/dashboards/tokio.json index e9d9771791..786a3aeada 100644 --- a/engine/docker/dev/grafana/dashboards/tokio.json +++ b/engine/docker/dev/grafana/dashboards/tokio.json @@ -846,7 +846,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(rivet_project)", + "definition": "label_values(up, rivet_project)", "includeAll": true, "label": "Project", "multi": true, @@ -854,7 +854,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(rivet_project)", + "query": "label_values(up, rivet_project)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -875,7 +875,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": 
"label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "definition": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "includeAll": true, "label": "Datacenter", "multi": true, @@ -883,7 +883,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values({rivet_project=~\"$project\"},rivet_datacenter)", + "query": "label_values(up{rivet_project=~\"$project\"},rivet_datacenter)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/engine/docker/dev/rivet-engine/config.jsonc b/engine/docker/dev/rivet-engine/config.jsonc index 9c5ea86073..3158c9bf72 100644 --- a/engine/docker/dev/rivet-engine/config.jsonc +++ b/engine/docker/dev/rivet-engine/config.jsonc @@ -37,17 +37,6 @@ "native_url": "http://clickhouse:9301", "username": "system", "password": "default", - "provision_users": { - "vector": { - "username": "vector", - "password": "vector", - "role": "write" - } - }, "secure": false - }, - "vector_http": { - "host": "vector-client", - "port": 5022 } } \ No newline at end of file diff --git a/engine/docker/template/src/docker-compose.ts b/engine/docker/template/src/docker-compose.ts index 5e7dd6719b..62ad7279e3 100644 --- a/engine/docker/template/src/docker-compose.ts +++ b/engine/docker/template/src/docker-compose.ts @@ -343,6 +343,8 @@ export function generateDockerCompose(context: TemplateContext) { environment: [ `RIVET_ENDPOINT=http://${context.getServiceHost("rivet-engine", datacenter.name, 0)}:6420`, `RIVET_RUNNER_TOTAL_SLOTS=1000000`, + `AUTOSTART_RUNNER=1`, + `AUTOCONFIGURE_SERVERLESS=0` ], stop_grace_period: "4s", ports: isPrimary && i === 0 ? 
[`5050:5050`] : undefined, diff --git a/engine/docker/template/src/services/edge/rivet-engine.ts b/engine/docker/template/src/services/edge/rivet-engine.ts index 4fa1bf00fa..d388c6dcaa 100644 --- a/engine/docker/template/src/services/edge/rivet-engine.ts +++ b/engine/docker/template/src/services/edge/rivet-engine.ts @@ -63,10 +63,6 @@ export function generateDatacenterRivetEngine( password: "default", secure: false, }, - vector_http: { - host: context.getServiceHost("vector-client", datacenter.name), - port: 5022, - }, }; context.writeDatacenterServiceFile( diff --git a/engine/packages/config/src/config/pegboard.rs b/engine/packages/config/src/config/pegboard.rs index d3817dc548..75e9df57c4 100644 --- a/engine/packages/config/src/config/pegboard.rs +++ b/engine/packages/config/src/config/pegboard.rs @@ -178,6 +178,19 @@ pub struct Pegboard { pub runner_event_demuxer_gc_interval_ms: Option, /// Max time since last seen before actor is considered stale, in milliseconds. pub runner_event_demuxer_max_last_seen_ms: Option, + + /// Drain grace period for serverless runners. + /// + /// This time is subtracted from the configured request duration. Once `duration - grace` is reached, the + /// runner is sent stop commands for all of its actors. After the grace period is over (i.e. the full + /// duration is reached) the runner websocket is forcibly closed. + /// + /// Default is 10 seconds. + /// + /// Unit is in milliseconds. 
+ /// + /// **Experimental** + pub serverless_drain_grace_period: Option, } impl Pegboard { @@ -370,4 +383,8 @@ impl Pegboard { pub fn runner_event_demuxer_max_last_seen_ms(&self) -> u64 { self.runner_event_demuxer_max_last_seen_ms.unwrap_or(30_000) } + + pub fn serverless_drain_grace_period(&self) -> u64 { + self.serverless_drain_grace_period.unwrap_or(10_000) + } } diff --git a/engine/packages/config/src/config/runtime.rs b/engine/packages/config/src/config/runtime.rs index 2f2c495b85..31b9e92ac5 100644 --- a/engine/packages/config/src/config/runtime.rs +++ b/engine/packages/config/src/config/runtime.rs @@ -56,7 +56,7 @@ pub struct Worker { /// | . \. /// 5% | . \_____ /// |_____.___.______ - /// 0 60% 80% + /// 0 70% 90% /// avg cpu usage load_shedding_curve: Option<[(u64, u64); 2]>, /// Time (in seconds) to allow for the gasoline worker engine to stop gracefully after receiving SIGTERM. @@ -66,7 +66,7 @@ pub struct Worker { impl Worker { pub fn load_shedding_curve(&self) -> [(u64, u64); 2] { - self.load_shedding_curve.unwrap_or([(600, 1000), (800, 50)]) + self.load_shedding_curve.unwrap_or([(700, 1000), (900, 50)]) } pub fn shutdown_duration(&self) -> Duration { diff --git a/engine/packages/epoxy/src/http_client.rs b/engine/packages/epoxy/src/http_client.rs index 1eee2eee76..1e3a0d4af0 100644 --- a/engine/packages/epoxy/src/http_client.rs +++ b/engine/packages/epoxy/src/http_client.rs @@ -1,4 +1,4 @@ -use anyhow::*; +use anyhow::{Context, Result, bail}; use epoxy_protocol::{ PROTOCOL_VERSION, protocol::{self, ReplicaId}, @@ -37,7 +37,13 @@ where Fut: Future> + Send, T: Send, { - let quorum_size = utils::calculate_quorum(replica_ids.len(), quorum_type); + let target_responses = utils::calculate_fanout_quorum(replica_ids.len(), quorum_type); + + if target_responses == 0 { + tracing::warn!("no fanout, target is 0"); + + return Ok(Vec::new()); + } // Create futures for all replicas (excluding the sender) let mut responses = futures_util::stream::iter( @@ -57,32 
+63,22 @@ where ) .collect::>() .await; - tracing::debug!(?quorum_size, len = ?responses.len(), ?quorum_type, "fanout quorum size"); - - // Choose how many successful responses we need before considering a success - let target_responses = match quorum_type { - // Only require 1 response - utils::QuorumType::Any => 1, - // Include all responses - utils::QuorumType::All => responses.len(), - // Subtract 1 from quorum size since we're not counting ourselves - utils::QuorumType::Fast | utils::QuorumType::Slow => quorum_size - 1, - }; + tracing::debug!(?target_responses, len=?responses.len(), "fanout target"); // Collect responses until we reach quorum or all futures complete let mut successful_responses = Vec::new(); while successful_responses.len() < target_responses { if let Some(response) = responses.next().await { match response { - std::result::Result::Ok(result) => match result { - std::result::Result::Ok(response) => { + Ok(result) => match result { + Ok(response) => { successful_responses.push(response); } - std::result::Result::Err(err) => { + Err(err) => { tracing::warn!(?err, "received error from replica"); } }, - std::result::Result::Err(err) => { + Err(err) => { tracing::warn!(?err, "received timeout from replica"); } } @@ -159,8 +155,8 @@ pub async fn send_message_to_address( .await; let response = match response_result { - std::result::Result::Ok(resp) => resp, - std::result::Result::Err(e) => { + Ok(resp) => resp, + Err(e) => { tracing::error!( to_replica = to_replica_id, replica_url = %replica_url, diff --git a/engine/packages/epoxy/src/utils.rs b/engine/packages/epoxy/src/utils.rs index cd7f51953a..a0840a0102 100644 --- a/engine/packages/epoxy/src/utils.rs +++ b/engine/packages/epoxy/src/utils.rs @@ -43,12 +43,54 @@ pub fn get_all_replicas(config: &protocol::ClusterConfig) -> Vec { config.replicas.iter().map(|r| r.replica_id).collect() } +// See EPaxos 4.3 pub fn calculate_quorum(n: usize, q: QuorumType) -> usize { - match q { - QuorumType::Fast => (n 
* 3) / 4 + 1, - QuorumType::Slow => n / 2 + 1, - QuorumType::All => n, - QuorumType::Any => 1, + match n { + // Nonsensical + 0 => 0, + 1 => 1, + // EPaxos does not apply to clusters with N < 3 because you cannot tolerate any faults. However we can + // still get correctness invariants to hold by requiring both nodes to agree on everything (quorum + // size is always 2) + 2 => match q { + QuorumType::Fast => 2, + QuorumType::Slow => 2, + QuorumType::All => 2, + QuorumType::Any => 1, + }, + // Note that for even N's we don't gain any extra fault tolerance but we get potentially better read + // latency. N=4 acts like N=3 in terms of fault tolerance. + n => { + let f = (n - 1) / 2; + + match q { + QuorumType::Fast => f + (f + 1) / 2, + QuorumType::Slow => f + 1, + QuorumType::All => n, + QuorumType::Any => 1, + } + } + } +} + +/// Calculates quorum size assuming the sender is excluded. +pub fn calculate_fanout_quorum(n: usize, q: QuorumType) -> usize { + match n { + // Nonsensical + 0 => 0, + 1 => 0, + // NOTE: See comments in `calculate_quorum` + 2 => 1, + n => { + let f = (n - 1) / 2; + + match q { + QuorumType::Fast => (f + (f + 1) / 2) - 1, + QuorumType::Slow => f, + QuorumType::All => n - 1, + QuorumType::Any => 1, + } + } } } diff --git a/engine/packages/gasoline/src/ctx/test.rs b/engine/packages/gasoline/src/ctx/test.rs index 25e822336c..e31928977d 100644 --- a/engine/packages/gasoline/src/ctx/test.rs +++ b/engine/packages/gasoline/src/ctx/test.rs @@ -94,6 +94,19 @@ impl TestCtx { &*self.debug_db } + pub fn standalone(&self) -> Result { + StandaloneCtx::new( + self.db.clone(), + self.config.clone(), + self.pools.clone(), + self.cache.clone(), + &self.name, + self.ray_id, + Id::new_v1(self.config.dc_label()), + ) + .map_err(Into::into) + } + pub async fn shutdown(&mut self) -> Result<()> { if let Some(worker_handle) = self.worker_handle.take() { tracing::info!("stopping workflow worker"); diff --git a/engine/packages/gasoline/src/db/kv/mod.rs 
b/engine/packages/gasoline/src/db/kv/mod.rs index 1f51e40295..3be187cd45 100644 --- a/engine/packages/gasoline/src/db/kv/mod.rs +++ b/engine/packages/gasoline/src/db/kv/mod.rs @@ -481,7 +481,7 @@ impl Database for DatabaseKv { let start = Instant::now(); let now = rivet_util::timestamp::now(); - let mut last_ping_cache: Vec<(Id, i64)> = Vec::new(); + let mut last_ping_cache = HashMap::::new(); let mut lost_worker_ids = HashSet::new(); let mut expired_workflow_count = 0; @@ -518,8 +518,8 @@ impl Database for DatabaseKv { let last_ping_ts_key = keys::worker::LastPingTsKey::new(worker_id); // Get last ping of worker for this lease - let last_ping_ts = if let Some((_, last_ping_ts)) = - last_ping_cache.iter().find(|(k, _)| k == &worker_id) + let last_ping_ts = if let Some(last_ping_ts) = + last_ping_cache.get(&worker_id) { *last_ping_ts } else if let Some(last_ping_entry) = tx @@ -534,12 +534,12 @@ impl Database for DatabaseKv { let last_ping_ts = last_ping_ts_key.deserialize(&last_ping_entry)?; // Update cache - last_ping_cache.push((worker_id, last_ping_ts)); + last_ping_cache.insert(worker_id, last_ping_ts); last_ping_ts } else { // Update cache - last_ping_cache.push((worker_id, 0)); + last_ping_cache.insert(worker_id, 0); 0 }; diff --git a/engine/packages/gasoline/src/history/cursor.rs b/engine/packages/gasoline/src/history/cursor.rs index 5f707a2380..5b315abdf5 100644 --- a/engine/packages/gasoline/src/history/cursor.rs +++ b/engine/packages/gasoline/src/history/cursor.rs @@ -683,12 +683,12 @@ mod tests { Event { coordinate: coord![2, 1], version: 1, - data: EventData::VersionCheck, + data: EventData::Branch, }, Event { coordinate: coord![4], version: 1, - data: EventData::VersionCheck, + data: EventData::Branch, }, ], )] @@ -722,12 +722,12 @@ mod tests { Event { coordinate: coord![2, 1], version: 1, - data: EventData::VersionCheck, + data: EventData::Branch, }, Event { coordinate: coord![3], version: 1, - data: EventData::VersionCheck, + data: EventData::Branch, 
}, ], )] @@ -755,12 +755,12 @@ mod tests { Event { coordinate: coord![2, 1], version: 1, - data: EventData::VersionCheck, + data: EventData::Branch, }, Event { coordinate: coord![2, 2], version: 1, - data: EventData::VersionCheck, + data: EventData::Branch, }, ], )] @@ -783,12 +783,12 @@ mod tests { Event { coordinate: coord![2, 1], version: 1, - data: EventData::VersionCheck, + data: EventData::Branch, }, Event { coordinate: coord![2, 1, 1], version: 1, - data: EventData::VersionCheck, + data: EventData::Branch, }, ], )] @@ -810,17 +810,17 @@ mod tests { Event { coordinate: coord![1], version: 1, - data: EventData::VersionCheck, + data: EventData::Branch, }, Event { coordinate: coord![3], version: 1, - data: EventData::VersionCheck, + data: EventData::Branch, }, Event { coordinate: coord![6], version: 1, - data: EventData::VersionCheck, + data: EventData::Branch, }, ], )] @@ -852,17 +852,17 @@ mod tests { Event { coordinate: coord![1], version: 1, - data: EventData::VersionCheck, + data: EventData::Branch, }, Event { coordinate: coord![3], version: 1, - data: EventData::VersionCheck, + data: EventData::Branch, }, Event { coordinate: coord![6], version: 1, - data: EventData::VersionCheck, + data: EventData::Branch, }, ], )] diff --git a/engine/packages/guard-core/src/errors.rs b/engine/packages/guard-core/src/errors.rs index da75f5bd71..dc774b2a42 100644 --- a/engine/packages/guard-core/src/errors.rs +++ b/engine/packages/guard-core/src/errors.rs @@ -1,6 +1,24 @@ use rivet_error::*; use serde::{Deserialize, Serialize}; +#[derive(RivetError, Serialize, Deserialize)] +#[error( + "guard", + "invalid_request_body", + "Unable to parse request body.", + "Unable to parse request body: {0}." +)] +pub struct InvalidRequestBody(pub String); + +#[derive(RivetError, Serialize, Deserialize)] +#[error( + "guard", + "invalid_response_body", + "Unable to parse response body.", + "Unable to parse response body: {0}." 
+)] +pub struct InvalidResponseBody(pub String); + #[derive(RivetError, Serialize, Deserialize)] #[error( "guard", @@ -42,7 +60,7 @@ pub struct UriParseError(pub String); pub struct RequestBuildError(pub String); #[derive(RivetError)] -#[error("guard", "upstream_error", "Upstream error.", "Upstream error: {0}")] +#[error("guard", "upstream_error", "Upstream error.", "Upstream error: {0}.")] pub struct UpstreamError(pub String); #[derive(RivetError, Serialize, Deserialize)] diff --git a/engine/packages/guard-core/src/proxy_service.rs b/engine/packages/guard-core/src/proxy_service.rs index a63f53c6ca..bda3624061 100644 --- a/engine/packages/guard-core/src/proxy_service.rs +++ b/engine/packages/guard-core/src/proxy_service.rs @@ -1,7 +1,7 @@ use anyhow::{Context, Result, bail, ensure}; use bytes::Bytes; use futures_util::{SinkExt, StreamExt}; -use http_body_util::{BodyExt, Full}; +use http_body_util::{BodyExt, Full, Limited}; use hyper::{ Request, Response, StatusCode, body::Incoming as BodyIncoming, @@ -40,6 +40,7 @@ use crate::{ pub const X_FORWARDED_FOR: HeaderName = HeaderName::from_static("x-forwarded-for"); pub const X_RIVET_ERROR: HeaderName = HeaderName::from_static("x-rivet-error"); +pub const MAX_BODY_SIZE: usize = rivet_util::size::mebibytes(20) as usize; const PROXY_STATE_CACHE_TTL: Duration = Duration::from_secs(60 * 60); // 1 hour const WEBSOCKET_CLOSE_LINGER: Duration = Duration::from_millis(100); // Keep TCP connection open briefly after WebSocket close @@ -723,13 +724,11 @@ impl ProxyService { ResolveRouteOutput::Target(mut target) => { // Read the request body before proceeding with retries let (req_parts, body) = req.into_parts(); - let req_body = match http_body_util::BodyExt::collect(body).await { - Ok(collected) => collected.to_bytes(), - Err(err) => { - tracing::debug!(?err, "Failed to read request body"); - Bytes::new() - } - }; + let req_body = Limited::new(body, MAX_BODY_SIZE) + .collect() + .await + .map_err(|err| 
errors::InvalidRequestBody(err.to_string()).build())? + .to_bytes(); // Use a value-returning loop to handle both errors and successful responses let mut attempts = 0; @@ -742,7 +741,8 @@ impl ProxyService { // Create the final request with body let proxied_req = builder - .body(Full::::new(req_body.clone())) + // NOTE: the `Bytes` type is cheaply cloneable, this is not resource intensive + .body(Full::new(req_body.clone())) .map_err(|err| errors::RequestBuildError(err.to_string()).build())?; // Send the request with timeout @@ -800,10 +800,13 @@ impl ProxyService { return Ok(Response::from_parts(parts, streaming_body)); } else { // For non-streaming responses, buffer as before - let body_bytes = match BodyExt::collect(body).await { - Ok(collected) => collected.to_bytes(), - Err(_) => Bytes::new(), - }; + let body_bytes = Limited::new(body, MAX_BODY_SIZE) + .collect() + .await + .map_err(|err| { + errors::InvalidResponseBody(err.to_string()).build() + })? + .to_bytes(); let full_body = ResponseBody::Full(Full::new(body_bytes)); return Ok(Response::from_parts(parts, full_body)); @@ -857,15 +860,13 @@ impl ProxyService { ResolveRouteOutput::CustomServe(mut handler) => { // Collect request body let (req_parts, body) = req.into_parts(); - let collected_body = match http_body_util::BodyExt::collect(body).await { - Ok(collected) => collected.to_bytes(), - Err(err) => { - tracing::debug!(?err, "Failed to read request body"); - Bytes::new() - } - }; + let req_body = Limited::new(body, MAX_BODY_SIZE) + .collect() + .await + .map_err(|err| errors::InvalidRequestBody(err.to_string()).build())? 
+ .to_bytes(); let req_collected = - hyper::Request::from_parts(req_parts, Full::::new(collected_body)); + hyper::Request::from_parts(req_parts, Full::::new(req_body)); // Attempt request let mut attempts = 0; diff --git a/engine/packages/guard-core/src/utils.rs b/engine/packages/guard-core/src/utils.rs index 8d3ecb17b6..5e1090d3df 100644 --- a/engine/packages/guard-core/src/utils.rs +++ b/engine/packages/guard-core/src/utils.rs @@ -181,6 +181,8 @@ pub(crate) fn err_into_response(err: anyhow::Error) -> Result StatusCode::SERVICE_UNAVAILABLE, ("guard", "actor_ready_timeout") => StatusCode::SERVICE_UNAVAILABLE, ("guard", "no_route") => StatusCode::NOT_FOUND, + ("guard", "invalid_request_body") => StatusCode::PAYLOAD_TOO_LARGE, + ("guard", "invalid_response_body") => StatusCode::BAD_GATEWAY, _ => StatusCode::BAD_REQUEST, }; diff --git a/engine/packages/pegboard-gateway/src/lib.rs b/engine/packages/pegboard-gateway/src/lib.rs index 73c7bf85f2..bd267ad50f 100644 --- a/engine/packages/pegboard-gateway/src/lib.rs +++ b/engine/packages/pegboard-gateway/src/lib.rs @@ -154,6 +154,7 @@ impl PegboardGateway { max_age: None, }); + // NOTE: Size constraints have already been applied by guard let body_bytes = req .into_body() .collect() diff --git a/engine/packages/pegboard-runner/Cargo.toml b/engine/packages/pegboard-runner/Cargo.toml index b350657a30..3566407f40 100644 --- a/engine/packages/pegboard-runner/Cargo.toml +++ b/engine/packages/pegboard-runner/Cargo.toml @@ -26,6 +26,7 @@ rivet-guard-core.workspace = true rivet-metrics.workspace = true rivet-runner-protocol.workspace = true rivet-runtime.workspace = true +rivet-types.workspace = true serde_bare.workspace = true serde_json.workspace = true serde.workspace = true diff --git a/engine/packages/pegboard-runner/src/conn.rs b/engine/packages/pegboard-runner/src/conn.rs index 4b3b5accb6..60e02a3788 100644 --- a/engine/packages/pegboard-runner/src/conn.rs +++ b/engine/packages/pegboard-runner/src/conn.rs @@ -6,13 +6,13 @@ use 
std::{ use anyhow::Context; use futures_util::StreamExt; use futures_util::TryStreamExt; -use gas::prelude::Id; use gas::prelude::*; use hyper_tungstenite::tungstenite::Message; use pegboard::ops::runner::update_alloc_idx::{Action, RunnerEligibility}; use rivet_data::converted::{ActorNameKeyData, MetadataKeyData}; use rivet_guard_core::WebSocketHandle; use rivet_runner_protocol::{self as protocol, versioned}; +use rivet_types::runner_configs::RunnerConfigKind; use universaldb::prelude::*; use vbare::OwnedVersionedData; @@ -280,9 +280,13 @@ pub async fn handle_init( ) })?; - let missed_commands = ctx - .udb()? - .run(|tx| { + let udb = ctx.udb()?; + let (runner_config_res, missed_commands) = tokio::try_join!( + ctx.op(pegboard::ops::runner_config::get::Input { + runners: vec![(conn.namespace_id, conn.runner_name.clone())], + bypass_cache: false, + }), + udb.run(|tx| { let init = init.clone(); async move { let tx = tx.with_subspace(pegboard::keys::subspace()); @@ -367,15 +371,23 @@ pub async fn handle_init( .await } }) - .custom_instrument(tracing::info_span!("runner_process_init_tx")) - .await?; + .custom_instrument(tracing::info_span!("runner_process_init_tx")), + )?; + + let is_serverless = runner_config_res.first().map_or(false, |c| { + matches!(c.config.kind, RunnerConfigKind::Serverless { .. 
}) + }); + let pb = ctx.config().pegboard(); // Send init packet let init_msg = versioned::ToClientMk2::wrap_latest(protocol::mk2::ToClient::ToClientInit( protocol::mk2::ToClientInit { runner_id: conn.runner_id.to_string(), metadata: protocol::mk2::ProtocolMetadata { - runner_lost_threshold: ctx.config().pegboard().runner_lost_threshold(), + runner_lost_threshold: pb.runner_lost_threshold(), + actor_stop_threshold: pb.actor_stop_threshold(), + serverless_drain_grace_period: is_serverless + .then(|| pb.serverless_drain_grace_period() as i64), }, }, )); diff --git a/engine/packages/pegboard-runner/src/errors.rs b/engine/packages/pegboard-runner/src/errors.rs index 0799858cf1..2c8fc3111d 100644 --- a/engine/packages/pegboard-runner/src/errors.rs +++ b/engine/packages/pegboard-runner/src/errors.rs @@ -1,18 +1,6 @@ use rivet_error::*; use serde::{Deserialize, Serialize}; -#[derive(RivetError, Serialize, Deserialize)] -#[error( - "guard", - "response_body_too_large", - "Response body too large.", - "Response body size {size} bytes exceeds maximum allowed {max_size} bytes." 
-)] -pub struct ResponseBodyTooLarge { - pub size: usize, - pub max_size: usize, -} - #[derive(RivetError, Debug)] #[error("ws")] pub enum WsError { diff --git a/engine/packages/pegboard-runner/src/ws_to_tunnel_task.rs b/engine/packages/pegboard-runner/src/ws_to_tunnel_task.rs index 4268dfeb27..bddedd416b 100644 --- a/engine/packages/pegboard-runner/src/ws_to_tunnel_task.rs +++ b/engine/packages/pegboard-runner/src/ws_to_tunnel_task.rs @@ -6,6 +6,7 @@ use gas::prelude::*; use hyper_tungstenite::tungstenite::Message; use pegboard::actor_kv; use pegboard::pubsub_subjects::GatewayReceiverSubject; +use rivet_guard_core::proxy_service::MAX_BODY_SIZE; use rivet_guard_core::websocket_handle::WebSocketReceiver; use rivet_runner_protocol::{self as protocol, PROTOCOL_MK2_VERSION, versioned}; use std::sync::{Arc, atomic::Ordering}; @@ -783,29 +784,15 @@ async fn handle_tunnel_message_mk2( ctx: &StandaloneCtx, msg: protocol::mk2::ToServerTunnelMessage, ) -> Result<()> { - // Check response body size limit for HTTP responses - if let protocol::mk2::ToServerTunnelMessageKind::ToServerResponseStart(ref resp) = - msg.message_kind - { - if let Some(ref body) = resp.body { - let max_response_body_size = - ctx.config().pegboard().runner_http_max_response_body_size(); - if body.len() > max_response_body_size { - return Err(errors::ResponseBodyTooLarge { - size: body.len(), - max_size: max_response_body_size, - } - .build()); - } - } + // Extract inner data length before consuming msg + let inner_data_len = tunnel_message_inner_data_len_mk2(&msg.message_kind); + + // Enforce incoming payload size + if inner_data_len > ctx.config().pegboard().runner_http_max_response_body_size() { + return Err(errors::WsError::InvalidPacket("payload too large".to_string()).build()); } - // Publish message to UPS let gateway_reply_to = GatewayReceiverSubject::new(msg.message_id.gateway_id).to_string(); - - // Extract inner data length before consuming msg - let inner_data_len = 
tunnel_message_inner_data_len(&msg.message_kind); - let msg_serialized = versioned::ToGateway::wrap_latest(protocol::mk2::ToGateway::ToServerTunnelMessage(msg)) .serialize_with_embedded_version(PROTOCOL_MK2_VERSION) @@ -817,6 +804,7 @@ async fn handle_tunnel_message_mk2( "publishing tunnel message to gateway" ); + // Publish message to UPS ctx.ups() .context("failed to get UPS instance for tunnel message")? .publish(&gateway_reply_to, &msg_serialized, PublishOpts::one()) @@ -831,22 +819,6 @@ async fn handle_tunnel_message_mk2( Ok(()) } -/// Returns the length of the inner data payload for a tunnel message kind. -fn tunnel_message_inner_data_len(kind: &protocol::mk2::ToServerTunnelMessageKind) -> usize { - use protocol::mk2::ToServerTunnelMessageKind; - match kind { - ToServerTunnelMessageKind::ToServerResponseStart(resp) => { - resp.body.as_ref().map_or(0, |b| b.len()) - } - ToServerTunnelMessageKind::ToServerResponseChunk(chunk) => chunk.body.len(), - ToServerTunnelMessageKind::ToServerWebSocketMessage(msg) => msg.data.len(), - ToServerTunnelMessageKind::ToServerResponseAbort - | ToServerTunnelMessageKind::ToServerWebSocketOpen(_) - | ToServerTunnelMessageKind::ToServerWebSocketMessageAck(_) - | ToServerTunnelMessageKind::ToServerWebSocketClose(_) => 0, - } -} - #[tracing::instrument(skip_all)] async fn handle_tunnel_message_mk1( ctx: &StandaloneCtx, @@ -860,24 +832,17 @@ async fn handle_tunnel_message_mk1( return Ok(()); } - // Check response body size limit for HTTP responses - if let protocol::ToServerTunnelMessageKind::ToServerResponseStart(ref resp) = msg.message_kind { - if let Some(ref body) = resp.body { - let max_response_body_size = - ctx.config().pegboard().runner_http_max_response_body_size(); - if body.len() > max_response_body_size { - return Err(errors::ResponseBodyTooLarge { - size: body.len(), - max_size: max_response_body_size, - } - .build()); - } - } + // Extract inner data length before consuming msg + let inner_data_len = 
tunnel_message_inner_data_len_mk1(&msg.message_kind); + + // Enforce incoming payload size + if inner_data_len > ctx.config().pegboard().runner_http_max_response_body_size() { + return Err(errors::WsError::InvalidPacket("payload too large".to_string()).build()); } // Publish message to UPS let gateway_reply_to = GatewayReceiverSubject::new(msg.message_id.gateway_id).to_string(); - let msg_serialized = versioned::ToGateway::v3_to_v4(versioned::ToGateway::V3( + let msg_serialized = versioned::ToGateway::v3_to_v6(versioned::ToGateway::V3( protocol::ToGateway::ToServerTunnelMessage(msg), ))? .serialize_with_embedded_version(PROTOCOL_MK2_VERSION) @@ -896,6 +861,39 @@ async fn handle_tunnel_message_mk1( Ok(()) } +/// Returns the length of the inner data payload for a tunnel message kind. +fn tunnel_message_inner_data_len_mk2(kind: &protocol::mk2::ToServerTunnelMessageKind) -> usize { + use protocol::mk2::ToServerTunnelMessageKind; + match kind { + ToServerTunnelMessageKind::ToServerResponseStart(resp) => { + resp.body.as_ref().map_or(0, |b| b.len()) + } + ToServerTunnelMessageKind::ToServerResponseChunk(chunk) => chunk.body.len(), + ToServerTunnelMessageKind::ToServerWebSocketMessage(msg) => msg.data.len(), + ToServerTunnelMessageKind::ToServerResponseAbort + | ToServerTunnelMessageKind::ToServerWebSocketOpen(_) + | ToServerTunnelMessageKind::ToServerWebSocketMessageAck(_) + | ToServerTunnelMessageKind::ToServerWebSocketClose(_) => 0, + } +} + +/// Returns the length of the inner data payload for a tunnel message kind. 
+fn tunnel_message_inner_data_len_mk1(kind: &protocol::ToServerTunnelMessageKind) -> usize { + use protocol::ToServerTunnelMessageKind; + match kind { + ToServerTunnelMessageKind::ToServerResponseStart(resp) => { + resp.body.as_ref().map_or(0, |b| b.len()) + } + ToServerTunnelMessageKind::ToServerResponseChunk(chunk) => chunk.body.len(), + ToServerTunnelMessageKind::ToServerWebSocketMessage(msg) => msg.data.len(), + ToServerTunnelMessageKind::ToServerResponseAbort + | ToServerTunnelMessageKind::ToServerWebSocketOpen(_) + | ToServerTunnelMessageKind::ToServerWebSocketMessageAck(_) + | ToServerTunnelMessageKind::ToServerWebSocketClose(_) + | ToServerTunnelMessageKind::DeprecatedTunnelAck => 0, + } +} + /// Send ack message for deprecated tunnel versions. /// /// We have to parse as specifically a v2 message since we need the exact request & message ID diff --git a/engine/packages/pegboard/src/workflows/actor/mod.rs b/engine/packages/pegboard/src/workflows/actor/mod.rs index dc832ca020..c182ecb27d 100644 --- a/engine/packages/pegboard/src/workflows/actor/mod.rs +++ b/engine/packages/pegboard/src/workflows/actor/mod.rs @@ -260,8 +260,6 @@ pub async fn pegboard_actor(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> ctx.listen_n::
(256).await? }; - let now = util::timestamp::now(); - for sig in signals { match sig { // NOTE: This is only received when allocated to mk1 runner @@ -584,6 +582,8 @@ pub async fn pegboard_actor(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> Main::Wake(sig) => { // Clear alarm if let Some(alarm_ts) = state.alarm_ts { + let now = ctx.v(3).activity(GetTsInput {}).await?; + if now >= alarm_ts { state.alarm_ts = None; } @@ -1225,6 +1225,14 @@ async fn handle_stopped( Ok(StoppedResult::Continue) } +#[derive(Debug, Serialize, Deserialize, Hash)] +struct GetTsInput {} + +#[activity(GetTs)] +async fn get_ts(ctx: &ActivityCtx, input: &GetTsInput) -> Result { + Ok(util::timestamp::now()) +} + #[message("pegboard_actor_create_complete")] pub struct CreateComplete {} diff --git a/engine/packages/pegboard/src/workflows/actor/runtime.rs b/engine/packages/pegboard/src/workflows/actor/runtime.rs index aa8f093e90..3655fc0079 100644 --- a/engine/packages/pegboard/src/workflows/actor/runtime.rs +++ b/engine/packages/pegboard/src/workflows/actor/runtime.rs @@ -8,10 +8,7 @@ use rand::prelude::SliceRandom; use rivet_runner_protocol::{ self as protocol, PROTOCOL_MK1_VERSION, PROTOCOL_MK2_VERSION, versioned, }; -use rivet_types::{ - actors::CrashPolicy, keys::namespace::runner_config::RunnerConfigVariant, - runner_configs::RunnerConfigKind, -}; +use rivet_types::{actors::CrashPolicy, keys::namespace::runner_config::RunnerConfigVariant}; use super::FailureReason; use std::time::Instant; @@ -246,21 +243,6 @@ async fn allocate_actor_v2( let crash_policy = state.crash_policy; let runner_name_selector = &state.runner_name_selector; - // Check if valid serverless config exists for the current ns + runner name - let runner_config_res = ctx - .op(crate::ops::runner_config::get::Input { - runners: vec![(namespace_id, runner_name_selector.clone())], - bypass_cache: false, - }) - .await?; - let has_valid_serverless = runner_config_res - .first() - .map(|runner| match &runner.config.kind { - 
RunnerConfigKind::Serverless { max_runners, .. } => *max_runners != 0, - _ => false, - }) - .unwrap_or_default(); - let runner_eligible_threshold = ctx.config().pegboard().runner_eligible_threshold(); let actor_allocation_candidate_sample_size = ctx .config() @@ -274,21 +256,40 @@ async fn allocate_actor_v2( .run(|tx| async move { let ping_threshold_ts = util::timestamp::now() - runner_eligible_threshold; - // Check if runner is an serverless runner - let for_serverless = tx - .with_subspace(namespace::keys::subspace()) - .exists( - &keys::runner_config::ByVariantKey::new( - namespace_id, - RunnerConfigVariant::Serverless, - runner_name_selector.clone(), - ), - Serializable, - ) - .await?; - let tx = tx.with_subspace(keys::subspace()); + // Check if a queue exists + let pending_actor_subspace = keys::subspace().subspace( + &keys::ns::PendingActorByRunnerNameSelectorKey::subspace( + namespace_id, + runner_name_selector.clone(), + ), + ); + + let ns_tx = tx.with_subspace(namespace::keys::subspace()); + let runner_config_variant_key = keys::runner_config::ByVariantKey::new( + namespace_id, + RunnerConfigVariant::Serverless, + runner_name_selector.clone(), + ); + let mut queue_stream = tx.get_ranges_keyvalues( + universaldb::RangeOption { + mode: StreamingMode::Exact, + limit: Some(1), + ..(&pending_actor_subspace).into() + }, + // NOTE: This is not Serializable because we don't want to conflict with other + // inserts/clears to this range + Snapshot, + ); + let (for_serverless_res, queue_exists_res) = tokio::join!( + // Check if runner is an serverless runner + ns_tx.exists(&runner_config_variant_key, Serializable), + queue_stream.next(), + ); + let for_serverless = for_serverless_res?; + let queue_exists = queue_exists_res.is_some(); + if for_serverless { tx.atomic_op( &rivet_types::keys::pegboard::ns::ServerlessDesiredSlotsKey::new( @@ -300,28 +301,6 @@ async fn allocate_actor_v2( ); } - // Check if a queue exists - let pending_actor_subspace = 
keys::subspace().subspace( - &keys::ns::PendingActorByRunnerNameSelectorKey::subspace( - namespace_id, - runner_name_selector.clone(), - ), - ); - let queue_exists = tx - .get_ranges_keyvalues( - universaldb::RangeOption { - mode: StreamingMode::Exact, - limit: Some(1), - ..(&pending_actor_subspace).into() - }, - // NOTE: This is not Serializable because we don't want to conflict with other - // inserts/clears to this range - Snapshot, - ) - .next() - .await - .is_some(); - if !queue_exists { let runner_alloc_subspace = keys::subspace().subspace(&keys::ns::RunnerAllocIdxKey::subspace( @@ -454,9 +433,9 @@ async fn allocate_actor_v2( // At this point in the txn there is no availability - match (crash_policy, input.force_allocate, has_valid_serverless) { + match (crash_policy, input.force_allocate, for_serverless) { (CrashPolicy::Sleep, false, false) => Ok(AllocateActorOutputV2 { - serverless: for_serverless, + serverless: false, status: AllocateActorStatus::Sleep, }), // Write the actor to the alloc queue to wait @@ -904,6 +883,31 @@ pub async fn spawn_actor( }) .await?; } + // Bump the pool so it can scale down + else if allocate_res.serverless { + let res = ctx + .v(2) + .signal(crate::workflows::runner_pool::Bump::default()) + .to_workflow::() + .tag("namespace_id", input.namespace_id) + .tag("runner_name", input.runner_name_selector.clone()) + .send() + .await; + + if let Some(WorkflowError::WorkflowNotFound) = res + .as_ref() + .err() + .and_then(|x| x.chain().find_map(|x| x.downcast_ref::())) + { + tracing::warn!( + namespace_id=%input.namespace_id, + runner_name=%input.runner_name_selector, + "serverless pool workflow not found, respective runner config likely deleted" + ); + } else { + res?; + } + } Ok(SpawnActorOutput::Destroy) } @@ -990,6 +994,31 @@ pub async fn spawn_actor( runner_protocol_version, }) } else { + // Bump the pool so it can scale down + if allocate_res.serverless { + let res = ctx + .v(2) + 
.signal(crate::workflows::runner_pool::Bump::default()) + .to_workflow::() + .tag("namespace_id", input.namespace_id) + .tag("runner_name", input.runner_name_selector.clone()) + .send() + .await; + + if let Some(WorkflowError::WorkflowNotFound) = + res.as_ref().err().and_then(|x| { + x.chain().find_map(|x| x.downcast_ref::()) + }) { + tracing::warn!( + namespace_id=%input.namespace_id, + runner_name=%input.runner_name_selector, + "serverless pool workflow not found, respective runner config likely deleted" + ); + } else { + res?; + } + } + Ok(SpawnActorOutput::Sleep) } } @@ -1099,18 +1128,18 @@ pub async fn clear_pending_allocation( let cleared = ctx .udb()? .run(|tx| async move { - let pending_alloc_key = - keys::subspace().pack(&keys::ns::PendingActorByRunnerNameSelectorKey::new( - input.namespace_id, - input.runner_name_selector.clone(), - input.pending_allocation_ts, - input.actor_id, - )); + let tx = tx.with_subspace(keys::subspace()); - let exists = tx.get(&pending_alloc_key, Serializable).await?.is_some(); + let pending_alloc_key = keys::ns::PendingActorByRunnerNameSelectorKey::new( + input.namespace_id, + input.runner_name_selector.clone(), + input.pending_allocation_ts, + input.actor_id, + ); + let exists = tx.exists(&pending_alloc_key, Serializable).await?; if exists { - tx.clear(&pending_alloc_key); + tx.delete(&pending_alloc_key); // If the pending actor key still exists, we must clear its desired slot because after this // activity the actor will go to sleep or be destroyed. 
We don't clear the slot if the key diff --git a/engine/packages/pegboard/src/workflows/runner2.rs b/engine/packages/pegboard/src/workflows/runner2.rs index 860214d0a5..90133f4ed8 100644 --- a/engine/packages/pegboard/src/workflows/runner2.rs +++ b/engine/packages/pegboard/src/workflows/runner2.rs @@ -636,6 +636,8 @@ pub(crate) struct AllocatePendingActorsInput { #[derive(Debug, Serialize, Deserialize)] pub(crate) struct AllocatePendingActorsOutput { pub allocations: Vec, + #[serde(default)] + pub attempted: usize, } #[derive(Debug, Serialize, Deserialize)] @@ -697,6 +699,7 @@ pub(crate) async fn allocate_pending_actors( // Shuffle for good measure pending_actors.shuffle(&mut rand::thread_rng()); + let attempted = pending_actors.len(); let runner_eligible_threshold = ctx.config().pegboard().runner_eligible_threshold(); let actor_allocation_candidate_sample_size = ctx .config() @@ -875,7 +878,10 @@ pub(crate) async fn allocate_pending_actors( .collect() .await; - Ok(AllocatePendingActorsOutput { allocations }) + Ok(AllocatePendingActorsOutput { + allocations, + attempted, + }) } #[derive(Debug, Serialize, Deserialize, Hash)] diff --git a/engine/packages/pegboard/src/workflows/runner_pool.rs b/engine/packages/pegboard/src/workflows/runner_pool.rs index 8ea4bf9f1e..74e568712a 100644 --- a/engine/packages/pegboard/src/workflows/runner_pool.rs +++ b/engine/packages/pegboard/src/workflows/runner_pool.rs @@ -47,121 +47,124 @@ pub async fn pegboard_runner_pool(ctx: &mut WorkflowCtx, input: &Input) -> Resul .dispatch() .await?; - ctx.loope(LifecycleState::default(), |ctx, state| { - let input = input.clone(); - async move { - // Get desired count -> drain and start counts - let ReadDesiredOutput::Desired { - desired_count, - details_hash, - } = ctx.activity(ReadDesiredInput { - namespace_id: input.namespace_id, - runner_name: input.runner_name.clone(), - }) - .await? 
- else { - // Drain all - for runner in &state.runners { - ctx.signal(serverless::receiver::Drain {}) - .to_workflow_id(runner.receiver_wf_id) - .send() - .await?; - } - - return Ok(Loop::Break(())); - }; - - // Remove runners that have an outdated hash. This is done outside of the below draining mechanism - // because we drain specific runners, not just a number of runners - let (new, outdated) = std::mem::take(&mut state.runners) - .into_iter() - .partition::, _>(|r| r.details_hash == details_hash); - state.runners = new; - - for runner in outdated { - // TODO: Spawn sub wf to process these so this is not blocking the loop - ctx.signal(serverless::receiver::Drain {}) - .to_workflow_id(runner.receiver_wf_id) - .send() - .await?; - } + ctx.lupe() + .commit_interval(5) + .with_state(LifecycleState::default()) + .run(|ctx, state| { + let input = input.clone(); + async move { + // Get desired count -> drain and start counts + let ReadDesiredOutput::Desired { + desired_count, + details_hash, + } = ctx.activity(ReadDesiredInput { + namespace_id: input.namespace_id, + runner_name: input.runner_name.clone(), + }) + .await? + else { + // Drain all + for runner in &state.runners { + ctx.signal(serverless::receiver::Drain {}) + .to_workflow_id(runner.receiver_wf_id) + .send() + .await?; + } - let drain_count = state.runners.len().saturating_sub(desired_count); - let start_count = desired_count.saturating_sub(state.runners.len()); + return Ok(Loop::Break(())); + }; - // Drain unnecessary runners - if drain_count != 0 { - // TODO: Implement smart logic of draining runners with the lowest allocated actors - let draining_runners = state.runners.iter().take(drain_count).collect::>(); + // Remove runners that have an outdated hash. 
This is done outside of the below draining mechanism + // because we drain specific runners, not just a number of runners + let (new, outdated) = std::mem::take(&mut state.runners) + .into_iter() + .partition::, _>(|r| r.details_hash == details_hash); + state.runners = new; - // TODO: Spawn sub wf to process these so this is not blocking the loop - for runner in draining_runners { + for runner in outdated { + // TODO: Spawn sub wf to process these so this is not blocking the loop ctx.signal(serverless::receiver::Drain {}) .to_workflow_id(runner.receiver_wf_id) .send() .await?; } - } - // Dispatch new runner workflows - if start_count != 0 { - // TODO: Spawn sub wf to process these so this is not blocking the loop - for _ in 0..start_count { - let receiver_wf_id = ctx - .workflow(serverless::receiver::Input { - pool_wf_id: ctx.workflow_id(), - namespace_id: input.namespace_id, - runner_name: input.runner_name.clone(), - }) - .tag("namespace_id", input.namespace_id) - .tag("runner_name", input.runner_name.clone()) - .dispatch() - .await?; + let drain_count = state.runners.len().saturating_sub(desired_count); + let start_count = desired_count.saturating_sub(state.runners.len()); + + // Drain unnecessary runners + if drain_count != 0 { + // TODO: Implement smart logic of draining runners with the lowest allocated actors + let draining_runners = + state.runners.iter().take(drain_count).collect::>(); + + // TODO: Spawn sub wf to process these so this is not blocking the loop + for runner in draining_runners { + ctx.signal(serverless::receiver::Drain {}) + .to_workflow_id(runner.receiver_wf_id) + .send() + .await?; + } + } - state.runners.push(RunnerState { - receiver_wf_id, - details_hash, - }); + // Dispatch new runner workflows + if start_count != 0 { + // TODO: Spawn sub wf to process these so this is not blocking the loop + for _ in 0..start_count { + let receiver_wf_id = ctx + .workflow(serverless::receiver::Input { + pool_wf_id: ctx.workflow_id(), + namespace_id: 
input.namespace_id, + runner_name: input.runner_name.clone(), + }) + .tag("namespace_id", input.namespace_id) + .tag("runner_name", input.runner_name.clone()) + .dispatch() + .await?; + + state.runners.push(RunnerState { + receiver_wf_id, + details_hash, + }); + } } - } - // Wait for Bump or serverless signals until we tick again - for sig in ctx.listen_n::
(512).await? { - match sig { - Main::OutboundConnDrainStarted(sig) => { - let (new, drain_started) = - std::mem::take(&mut state.runners) + // Wait for Bump or serverless signals until we tick again + for sig in ctx.listen_n::
(256).await? { + match sig { + Main::OutboundConnDrainStarted(sig) => { + let (new, drain_started) = std::mem::take(&mut state.runners) .into_iter() .partition::, _>(|r| r.receiver_wf_id != sig.receiver_wf_id); - state.runners = new; - - for runner in drain_started { - // TODO: Spawn sub wf to process these so this is not blocking the loop - ctx.signal(serverless::receiver::Drain {}) - .to_workflow_id(runner.receiver_wf_id) - .send() - .await?; + state.runners = new; + + for runner in drain_started { + // TODO: Spawn sub wf to process these so this is not blocking the loop + ctx.signal(serverless::receiver::Drain {}) + .to_workflow_id(runner.receiver_wf_id) + .send() + .await?; + } } - } - Main::Bump(bump) => { - if bump.endpoint_config_changed { - // Forward to metadata poller to trigger immediate metadata fetch - ctx.signal(runner_pool_metadata_poller::EndpointConfigChanged {}) - .to_workflow::() - .tag("namespace_id", input.namespace_id) - .tag("runner_name", &input.runner_name) - .send() - .await?; + Main::Bump(bump) => { + if bump.endpoint_config_changed { + // Forward to metadata poller to trigger immediate metadata fetch + ctx.signal(runner_pool_metadata_poller::EndpointConfigChanged {}) + .to_workflow::() + .tag("namespace_id", input.namespace_id) + .tag("runner_name", &input.runner_name) + .send() + .await?; + } } } } - } - Ok(Loop::Continue) - } - .boxed() - }) - .await?; + Ok(Loop::Continue) + } + .boxed() + }) + .await?; Ok(()) } diff --git a/engine/packages/pegboard/src/workflows/serverless/conn.rs b/engine/packages/pegboard/src/workflows/serverless/conn.rs index 6646d45a2b..b00031139c 100644 --- a/engine/packages/pegboard/src/workflows/serverless/conn.rs +++ b/engine/packages/pegboard/src/workflows/serverless/conn.rs @@ -25,8 +25,6 @@ const X_RIVET_TOTAL_SLOTS: HeaderName = HeaderName::from_static("x-rivet-total-s const X_RIVET_RUNNER_NAME: HeaderName = HeaderName::from_static("x-rivet-runner-name"); const X_RIVET_NAMESPACE_NAME: HeaderName = 
HeaderName::from_static("x-rivet-namespace-name"); -const DRAIN_GRACE_PERIOD: Duration = Duration::from_secs(10); - #[derive(Debug, Serialize, Deserialize, Clone)] pub struct Input { pub pool_wf_id: Id, @@ -412,8 +410,9 @@ async fn outbound_req_inner( .with_label_values(&[&input.namespace_id.to_string(), &input.runner_name]) .inc(); - let sleep_until_drain = - Duration::from_secs(request_lifespan as u64).saturating_sub(DRAIN_GRACE_PERIOD); + let sleep_until_drain = Duration::from_secs(request_lifespan as u64).saturating_sub( + Duration::from_millis(ctx.config().pegboard().serverless_drain_grace_period()), + ); tokio::select! { res = stream_handler => { match res { @@ -518,9 +517,7 @@ async fn finish_non_critical_draining( // Wait for runner to shut down tokio::select! { res = wait_for_shutdown_fut => return res.map_err(Into::into), - _ = tokio::time::sleep(DRAIN_GRACE_PERIOD) => { - tracing::debug!(?runner_id, "reached drain grace period before runner shut down") - } + _ = tokio::time::sleep(Duration::from_millis(ctx.config().pegboard().serverless_drain_grace_period())) => {} _ = term_signal.recv() => {} } @@ -540,7 +537,7 @@ async fn finish_non_critical_draining( #[tracing::instrument(skip_all)] async fn drain_runner(ctx: &ActivityCtx, runner_id: Id) -> Result<()> { let res = ctx - .signal(crate::workflows::runner::Stop { + .signal(crate::workflows::runner2::Stop { reset_actor_rescheduling: true, }) // This is ok, because runner_id changes every retry of outbound_req diff --git a/engine/packages/pegboard/tests/kv_list_edge_cases.rs b/engine/packages/pegboard/tests/kv_list_edge_cases.rs index 4c98d40704..d18b0b5001 100644 --- a/engine/packages/pegboard/tests/kv_list_edge_cases.rs +++ b/engine/packages/pegboard/tests/kv_list_edge_cases.rs @@ -1,8 +1,7 @@ use anyhow::Result; -use pegboard_actor_kv as kv; -use rivet_runner_protocol as rp; -use rivet_util_id::Id; -use uuid::Uuid; +use gas::prelude::*; +use pegboard::actor_kv as kv; +use rivet_runner_protocol::mk2 as 
rp; #[tokio::test] async fn test_list_edge_cases() -> Result<()> { @@ -37,18 +36,23 @@ async fn test_list_edge_cases() -> Result<()> { let db = &test_deps.pools.udb()?; let actor_id = Id::new_v1(dc_label); + let recipient = kv::Recipient { + actor_id, + namespace_id: Id::new_v1(dc_label), + name: "default".to_string(), + }; // Test 1: List when empty tracing::info!("test 1: list when empty"); let (empty_keys, _, _) = - kv::list(db, actor_id, rp::KvListQuery::KvListAllQuery, false, None).await?; + kv::list(db, &recipient, rp::KvListQuery::KvListAllQuery, false, None).await?; assert_eq!(empty_keys.len(), 0, "should return empty list"); // Test 2: Prefix that matches nothing tracing::info!("test 2: prefix that matches nothing"); kv::put( db, - actor_id, + &recipient, vec![b"foo".to_vec(), b"bar".to_vec()], vec![b"1".to_vec(), b"2".to_vec()], ) @@ -56,7 +60,7 @@ async fn test_list_edge_cases() -> Result<()> { let (no_match, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { key: b"xyz".to_vec(), }), @@ -74,7 +78,7 @@ async fn test_list_edge_cases() -> Result<()> { tracing::info!("test 3: range where start > end"); let (backwards_range, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListRangeQuery(rp::KvListRangeQuery { start: b"z".to_vec(), end: b"a".to_vec(), @@ -94,7 +98,7 @@ async fn test_list_edge_cases() -> Result<()> { tracing::info!("test 4: range where start == end"); let (same_inclusive, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListRangeQuery(rp::KvListRangeQuery { start: b"foo".to_vec(), end: b"foo".to_vec(), @@ -112,7 +116,7 @@ async fn test_list_edge_cases() -> Result<()> { let (same_exclusive, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListRangeQuery(rp::KvListRangeQuery { start: b"foo".to_vec(), end: b"foo".to_vec(), @@ -128,27 +132,27 @@ async fn test_list_edge_cases() -> Result<()> { "same key exclusive range should return 0" ); 
- kv::delete_all(db, actor_id).await?; + kv::delete_all(db, &recipient).await?; // Test 5: Keys with null bytes (0x00) tracing::info!("test 5: keys with null bytes"); let null_key = vec![b'a', 0x00, b'b']; kv::put( db, - actor_id, + &recipient, vec![null_key.clone(), b"abc".to_vec()], vec![b"null_value".to_vec(), b"normal_value".to_vec()], ) .await?; - let (null_keys, null_values, _) = kv::get(db, actor_id, vec![null_key.clone()]).await?; + let (null_keys, null_values, _) = kv::get(db, &recipient, vec![null_key.clone()]).await?; assert_eq!(null_keys.len(), 1, "should retrieve key with null byte"); assert_eq!(null_values[0], b"null_value"); // Prefix query should work with null bytes let (null_prefix, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { key: vec![b'a', 0x00], }), @@ -163,29 +167,29 @@ async fn test_list_edge_cases() -> Result<()> { ); assert_eq!(null_prefix[0], null_key); - kv::delete_all(db, actor_id).await?; + kv::delete_all(db, &recipient).await?; // Test 6: Keys with 0xFF bytes tracing::info!("test 6: keys with 0xFF bytes"); let ff_key = vec![b'a', 0xFF, b'b']; kv::put( db, - actor_id, + &recipient, vec![ff_key.clone()], vec![b"ff_value".to_vec()], ) .await?; - let (ff_keys, _, _) = kv::get(db, actor_id, vec![ff_key.clone()]).await?; + let (ff_keys, _, _) = kv::get(db, &recipient, vec![ff_key.clone()]).await?; assert_eq!(ff_keys.len(), 1, "should retrieve key with 0xFF byte"); - kv::delete_all(db, actor_id).await?; + kv::delete_all(db, &recipient).await?; // Test 7: Empty prefix (should match all keys) tracing::info!("test 7: empty prefix"); kv::put( db, - actor_id, + &recipient, vec![b"a".to_vec(), b"b".to_vec(), b"c".to_vec()], vec![b"1".to_vec(), b"2".to_vec(), b"3".to_vec()], ) @@ -193,7 +197,7 @@ async fn test_list_edge_cases() -> Result<()> { let (empty_prefix, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { key: vec![] }), 
false, None, @@ -201,15 +205,15 @@ async fn test_list_edge_cases() -> Result<()> { .await?; assert_eq!(empty_prefix.len(), 3, "empty prefix should match all keys"); - kv::delete_all(db, actor_id).await?; + kv::delete_all(db, &recipient).await?; // Test 8: Prefix longer than any stored key tracing::info!("test 8: prefix longer than stored keys"); - kv::put(db, actor_id, vec![b"ab".to_vec()], vec![b"val".to_vec()]).await?; + kv::put(db, &recipient, vec![b"ab".to_vec()], vec![b"val".to_vec()]).await?; let (long_prefix, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { key: b"abcdefghijk".to_vec(), }), @@ -223,7 +227,7 @@ async fn test_list_edge_cases() -> Result<()> { "prefix longer than keys should return empty" ); - kv::delete_all(db, actor_id).await?; + kv::delete_all(db, &recipient).await?; // Test 9: Keys that differ only in last byte tracing::info!("test 9: keys differing only in last byte"); @@ -239,11 +243,11 @@ async fn test_list_edge_cases() -> Result<()> { b"v2".to_vec(), b"vFF".to_vec(), ]; - kv::put(db, actor_id, keys.clone(), values.clone()).await?; + kv::put(db, &recipient, keys.clone(), values.clone()).await?; let (prefix_match, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { key: b"key".to_vec(), }), @@ -266,7 +270,7 @@ async fn test_list_edge_cases() -> Result<()> { // Range from key\x00 to key\x02 inclusive should get 3 keys let (byte_range, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListRangeQuery(rp::KvListRangeQuery { start: b"key\x00".to_vec(), end: b"key\x02".to_vec(), @@ -278,13 +282,13 @@ async fn test_list_edge_cases() -> Result<()> { .await?; assert_eq!(byte_range.len(), 3, "byte range should get 3 keys"); - kv::delete_all(db, actor_id).await?; + kv::delete_all(db, &recipient).await?; // Test 10: Limit of 0 tracing::info!("test 10: limit of 0"); kv::put( db, - actor_id, + &recipient, vec![b"a".to_vec(), 
b"b".to_vec()], vec![b"1".to_vec(), b"2".to_vec()], ) @@ -292,7 +296,7 @@ async fn test_list_edge_cases() -> Result<()> { let (zero_limit, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListAllQuery, false, Some(0), @@ -304,7 +308,7 @@ async fn test_list_edge_cases() -> Result<()> { tracing::info!("test 11: limit of 1"); let (one_limit, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListAllQuery, false, Some(1), @@ -316,7 +320,7 @@ async fn test_list_edge_cases() -> Result<()> { tracing::info!("test 12: limit larger than total"); let (large_limit, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListAllQuery, false, Some(1000), @@ -328,20 +332,26 @@ async fn test_list_edge_cases() -> Result<()> { "should return all keys when limit > total" ); - kv::delete_all(db, actor_id).await?; + kv::delete_all(db, &recipient).await?; // Test 13: Reverse with limit tracing::info!("test 13: reverse with limit"); kv::put( db, - actor_id, + &recipient, vec![b"a".to_vec(), b"b".to_vec(), b"c".to_vec(), b"d".to_vec()], vec![b"1".to_vec(), b"2".to_vec(), b"3".to_vec(), b"4".to_vec()], ) .await?; - let (reverse_limited, _, _) = - kv::list(db, actor_id, rp::KvListQuery::KvListAllQuery, true, Some(2)).await?; + let (reverse_limited, _, _) = kv::list( + db, + &recipient, + rp::KvListQuery::KvListAllQuery, + true, + Some(2), + ) + .await?; assert_eq!( reverse_limited.len(), 2, @@ -355,7 +365,7 @@ async fn test_list_edge_cases() -> Result<()> { tracing::info!("test 14: prefix with reverse"); let (prefix_reverse, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { key: vec![] }), true, None, diff --git a/engine/packages/pegboard/tests/kv_operations.rs b/engine/packages/pegboard/tests/kv_operations.rs index c2fea6b951..a17c4247e7 100644 --- a/engine/packages/pegboard/tests/kv_operations.rs +++ b/engine/packages/pegboard/tests/kv_operations.rs @@ -1,8 +1,7 @@ use anyhow::Result; 
-use pegboard_actor_kv as kv; -use rivet_runner_protocol as rp; -use rivet_util_id::Id; -use uuid::Uuid; +use gas::prelude::*; +use pegboard::actor_kv as kv; +use rivet_runner_protocol::mk2 as rp; #[tokio::test] async fn test_kv_operations() -> Result<()> { @@ -38,6 +37,11 @@ async fn test_kv_operations() -> Result<()> { let db = &test_deps.pools.udb()?; let actor_id = Id::new_v1(dc_label); + let recipient = kv::Recipient { + actor_id, + namespace_id: Id::new_v1(dc_label), + name: "default".to_string(), + }; tracing::info!(?actor_id, "starting kv operations test"); @@ -58,12 +62,12 @@ async fn test_kv_operations() -> Result<()> { b"other_value".to_vec(), ]; - kv::put(db, actor_id, keys.clone(), values.clone()).await?; + kv::put(db, &recipient, keys.clone(), values.clone()).await?; tracing::info!("successfully put {} keys", keys.len()); // Test 2: Get the keys back tracing::info!("test 2: getting keys"); - let (got_keys, got_values, got_metadata) = kv::get(db, actor_id, keys.clone()).await?; + let (got_keys, got_values, got_metadata) = kv::get(db, &recipient, keys.clone()).await?; assert_eq!(got_keys.len(), 5, "should get 5 keys back"); assert_eq!(got_values.len(), 5, "should get 5 values back"); @@ -85,7 +89,7 @@ async fn test_kv_operations() -> Result<()> { "metadata should have version" ); assert!( - got_metadata[got_idx].create_ts > 0, + got_metadata[got_idx].update_ts > 0, "metadata should have timestamp" ); } @@ -94,7 +98,7 @@ async fn test_kv_operations() -> Result<()> { // Test 3: List all keys tracing::info!("test 3: listing all keys"); let (list_keys, list_values, list_metadata) = - kv::list(db, actor_id, rp::KvListQuery::KvListAllQuery, false, None).await?; + kv::list(db, &recipient, rp::KvListQuery::KvListAllQuery, false, None).await?; assert_eq!(list_keys.len(), 5, "should list 5 keys"); assert_eq!(list_values.len(), 5, "should list 5 values"); @@ -105,7 +109,7 @@ async fn test_kv_operations() -> Result<()> { tracing::info!("test 4: listing with 
limit"); let (limited_keys, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListAllQuery, false, Some(2), @@ -118,9 +122,9 @@ async fn test_kv_operations() -> Result<()> { // Test 5: List with reverse tracing::info!("test 5: listing in reverse"); let (forward_keys, _, _) = - kv::list(db, actor_id, rp::KvListQuery::KvListAllQuery, false, None).await?; + kv::list(db, &recipient, rp::KvListQuery::KvListAllQuery, false, None).await?; let (reverse_keys, _, _) = - kv::list(db, actor_id, rp::KvListQuery::KvListAllQuery, true, None).await?; + kv::list(db, &recipient, rp::KvListQuery::KvListAllQuery, true, None).await?; assert_eq!(forward_keys.len(), reverse_keys.len()); // Keys should be in opposite order @@ -151,12 +155,12 @@ async fn test_kv_operations() -> Result<()> { b"Post 2".to_vec(), b"Comment 100".to_vec(), ]; - kv::put(db, actor_id, prefix_keys.clone(), prefix_values.clone()).await?; + kv::put(db, &recipient, prefix_keys.clone(), prefix_values.clone()).await?; // Query with "users:" prefix let (users_keys, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { key: b"users:".to_vec(), }), @@ -172,7 +176,7 @@ async fn test_kv_operations() -> Result<()> { // Query with "posts:" prefix let (posts_keys, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { key: b"posts:".to_vec(), }), @@ -188,13 +192,13 @@ async fn test_kv_operations() -> Result<()> { tracing::info!("successfully listed keys with prefix"); // Clean up the prefix test keys - kv::delete(db, actor_id, prefix_keys).await?; + kv::delete(db, &recipient, prefix_keys).await?; // Test 7: List with range tracing::info!("test 7: listing with range"); let (range_keys, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListRangeQuery(rp::KvListRangeQuery { start: b"key1".to_vec(), end: b"key2".to_vec(), @@ -218,7 +222,7 @@ async fn test_kv_operations() -> Result<()> { 
tracing::info!("test 8: listing with exclusive range"); let (exclusive_range_keys, _, _) = kv::list( db, - actor_id, + &recipient, rp::KvListQuery::KvListRangeQuery(rp::KvListRangeQuery { start: b"key1".to_vec(), end: b"key2".to_vec(), @@ -237,11 +241,11 @@ async fn test_kv_operations() -> Result<()> { // Test 9: Delete specific keys tracing::info!("test 9: deleting specific keys"); let keys_to_delete = vec![b"key1".to_vec(), b"key2".to_vec()]; - kv::delete(db, actor_id, keys_to_delete.clone()).await?; + kv::delete(db, &recipient, keys_to_delete.clone()).await?; // Verify keys are deleted let (remaining_keys, _, _) = - kv::list(db, actor_id, rp::KvListQuery::KvListAllQuery, false, None).await?; + kv::list(db, &recipient, rp::KvListQuery::KvListAllQuery, false, None).await?; assert_eq!(remaining_keys.len(), 3, "should have 3 keys remaining"); assert!(!remaining_keys.contains(&b"key1".to_vec())); assert!(!remaining_keys.contains(&b"key2".to_vec())); @@ -249,18 +253,20 @@ async fn test_kv_operations() -> Result<()> { // Test 10: Delete all keys tracing::info!("test 10: deleting all keys"); - kv::delete_all(db, actor_id).await?; + kv::delete_all(db, &recipient).await?; // Verify all keys are deleted let (all_keys, _, _) = - kv::list(db, actor_id, rp::KvListQuery::KvListAllQuery, false, None).await?; + kv::list(db, &recipient, rp::KvListQuery::KvListAllQuery, false, None).await?; assert_eq!(all_keys.len(), 0, "should have no keys remaining"); tracing::info!("successfully deleted all keys"); // Test 11: Test storage size tracing::info!("test 11: testing storage size"); - let subspace = pegboard::keys::actor_kv_subspace().subspace(&actor_id); - let size = kv::get_subspace_size(db, &subspace).await?; + let size = db + .run(|tx| async move { kv::estimate_kv_size(&tx, actor_id).await }) + .await + .unwrap(); assert_eq!(size, 0, "storage size should be 0 after delete_all"); tracing::info!("successfully verified storage size"); @@ -269,13 +275,14 @@ async fn 
test_kv_operations() -> Result<()> { let large_value = vec![42u8; 50_000]; // 50 KB, will be split into chunks kv::put( db, - actor_id, + &recipient, vec![b"large_key".to_vec()], vec![large_value.clone()], ) .await?; - let (large_keys, large_values, _) = kv::get(db, actor_id, vec![b"large_key".to_vec()]).await?; + let (large_keys, large_values, _) = + kv::get(db, &recipient, vec![b"large_key".to_vec()]).await?; assert_eq!(large_keys.len(), 1); assert_eq!(large_values[0], large_value, "large value should match"); tracing::info!("successfully stored and retrieved large value"); @@ -283,7 +290,10 @@ async fn test_kv_operations() -> Result<()> { // Test 13: Verify storage size increased // Note: Storage size estimation may not be accurate on all backends (e.g., FileSystem) tracing::info!("test 13: verifying storage size with data"); - let size_with_data = kv::get_subspace_size(db, &subspace).await?; + let size_with_data = db + .run(|tx| async move { kv::estimate_kv_size(&tx, actor_id).await }) + .await + .unwrap(); tracing::info!( ?size_with_data, "storage size with data (may be 0 on some backends)" diff --git a/engine/packages/pools/src/db/ups.rs b/engine/packages/pools/src/db/ups.rs index 83acfacaf0..9d098c2bf9 100644 --- a/engine/packages/pools/src/db/ups.rs +++ b/engine/packages/pools/src/db/ups.rs @@ -44,13 +44,19 @@ pub async fn setup(config: &Config, client_name: &str) -> Result { tracing::debug!(?server_addrs, "nats reconnected"); } async_nats::Event::Disconnected => { - tracing::error!(?server_addrs, "nats disconnected"); + tracing::warn!(?server_addrs, "nats disconnected"); } async_nats::Event::LameDuckMode => { tracing::warn!(?server_addrs, "nats lame duck mode"); } - async_nats::Event::SlowConsumer(_) => { - tracing::warn!(?server_addrs, "nats slow consumer"); + async_nats::Event::Draining => { + tracing::warn!(?server_addrs, "nats draining"); + } + async_nats::Event::Closed => { + tracing::error!(?server_addrs, "nats closed"); + } + 
async_nats::Event::SlowConsumer(sid) => { + tracing::warn!(?server_addrs, ?sid, "nats slow consumer"); } async_nats::Event::ServerError(err) => { tracing::error!(?server_addrs, ?err, "nats server error"); diff --git a/engine/packages/test-deps-docker/src/database.rs b/engine/packages/test-deps-docker/src/database.rs index 13b2401502..11532ebf25 100644 --- a/engine/packages/test-deps-docker/src/database.rs +++ b/engine/packages/test-deps-docker/src/database.rs @@ -89,48 +89,56 @@ impl TestDatabase { } } - /// Wait for Postgres to be ready to accept connections - pub async fn wait_for_postgres_ready(port: u16, max_attempts: u32) -> Result<()> { - use std::str::FromStr; - use tokio_postgres::Config; - - let connection_string = - format!("postgres://postgres:test_password@127.0.0.1:{port}/test_db"); - - for attempt in 1..=max_attempts { - tracing::debug!(attempt, max_attempts, "Checking if Postgres is ready"); - - match Config::from_str(&connection_string)? - .connect(tokio_postgres::NoTls) - .await - { - std::result::Result::Ok((client, connection)) => { - // Spawn connection handler - tokio::spawn(async move { - if let Err(e) = connection.await { - tracing::debug!(error = ?e, "Connection error"); - } - }); + pub async fn wait_for_ready(&self, docker_config: &DockerRunConfig) -> Result<()> { + match self { + TestDatabase::Postgres => { + wait_for_postgres_ready(docker_config.port_mapping.0, 10).await + } + TestDatabase::FileSystem => Ok(()), + } + } +} - // Try a simple query - if client.simple_query("SELECT 1").await.is_ok() { - tracing::debug!("Postgres is ready"); - return Ok(()); +/// Wait for Postgres to be ready to accept connections +async fn wait_for_postgres_ready(port: u16, max_attempts: u32) -> Result<()> { + use std::str::FromStr; + use tokio_postgres::Config; + + let connection_string = format!("postgres://postgres:test_password@127.0.0.1:{port}/test_db"); + + for attempt in 1..=max_attempts { + tracing::debug!(attempt, max_attempts, "Checking if Postgres 
is ready"); + + match Config::from_str(&connection_string)? + .connect(tokio_postgres::NoTls) + .await + { + std::result::Result::Ok((client, connection)) => { + // Spawn connection handler + tokio::spawn(async move { + if let Err(e) = connection.await { + tracing::debug!(error = ?e, "Connection error"); } - } - Err(e) => { - tracing::debug!(error = ?e, attempt, "Postgres not ready yet"); + }); + + // Try a simple query + if client.simple_query("SELECT 1").await.is_ok() { + tracing::debug!("Postgres is ready"); + return Ok(()); } } - - if attempt < max_attempts { - sleep(Duration::from_millis(500)).await; + Err(e) => { + tracing::debug!(error = ?e, attempt, "Postgres not ready yet"); } } - anyhow::bail!( - "Postgres failed to become ready after {} attempts", - max_attempts - ) + if attempt < max_attempts { + sleep(Duration::from_millis(500)).await; + } } + + anyhow::bail!( + "Postgres failed to become ready after {} attempts", + max_attempts + ) } diff --git a/engine/packages/test-deps/src/datacenter.rs b/engine/packages/test-deps/src/datacenter.rs index 19971b826c..6ffdb4f087 100644 --- a/engine/packages/test-deps/src/datacenter.rs +++ b/engine/packages/test-deps/src/datacenter.rs @@ -37,14 +37,13 @@ pub async fn setup_single_datacenter( let was_started = docker_config.start().await?; container_names.push(docker_config.container_name.clone()); - // If Postgres was just started, wait for it to be ready - if was_started && test_database == TestDatabase::Postgres { + if was_started { tracing::info!( dc = dc.datacenter_label, port = docker_config.port_mapping.0, - "waiting for Postgres to be ready" + "waiting for database to be ready" ); - TestDatabase::wait_for_postgres_ready(docker_config.port_mapping.0, 10).await?; + test_database.wait_for_ready(&docker_config).await?; } } diff --git a/engine/packages/universalpubsub/src/chunking.rs b/engine/packages/universalpubsub/src/chunking.rs index 93d86dc864..57950fdf42 100644 --- 
a/engine/packages/universalpubsub/src/chunking.rs +++ b/engine/packages/universalpubsub/src/chunking.rs @@ -110,6 +110,17 @@ impl ChunkTracker { } } +/// Returns the number of bytes needed to encode `n` as a BARE unsigned integer (LEB128). +fn bare_uint_len(n: usize) -> usize { + let mut len = 1; + let mut v = n >> 7; + while v > 0 { + len += 1; + v >>= 7; + } + len +} + /// Splits a payload into chunks that fit within message size limits. /// /// This function handles chunking by accounting for different overhead @@ -160,9 +171,21 @@ pub fn split_payload_into_chunks( .serialize_with_embedded_version(PROTOCOL_VERSION)? .len(); - // Calculate max payload sizes - let first_chunk_max_payload = max_message_size.saturating_sub(start_overhead); - let other_chunk_max_payload = max_message_size.saturating_sub(chunk_overhead); + // Calculate max payload sizes, correcting for the variable-length encoding of the + // data length prefix. The overhead above was computed with an empty payload + // (uint(0) = 1 byte). For payloads >= 128 bytes the length prefix grows (LEB128 + // encoding), so we subtract those extra bytes to ensure every encoded chunk fits + // within max_message_size. 
+ let first_chunk_max_payload = { + let raw = max_message_size.saturating_sub(start_overhead); + let extra = bare_uint_len(raw).saturating_sub(1); + raw.saturating_sub(extra) + }; + let other_chunk_max_payload = { + let raw = max_message_size.saturating_sub(chunk_overhead); + let extra = bare_uint_len(raw).saturating_sub(1); + raw.saturating_sub(extra) + }; if first_chunk_max_payload == 0 || other_chunk_max_payload == 0 { bail!("message overhead exceeds max message size"); diff --git a/engine/packages/universalpubsub/tests/chunking.rs b/engine/packages/universalpubsub/tests/chunking.rs new file mode 100644 index 0000000000..3177aa3a0d --- /dev/null +++ b/engine/packages/universalpubsub/tests/chunking.rs @@ -0,0 +1,298 @@ +use universalpubsub::chunking::{ChunkTracker, encode_chunk, split_payload_into_chunks}; + +fn setup_logging() { + let _ = tracing_subscriber::fmt() + .with_env_filter("debug") + .with_ansi(false) + .with_test_writer() + .try_init(); +} + +/// Encodes a payload through the full chunking pipeline and reassembles it. +/// +/// Returns `(reassembled_payload, reply_subject)`. 
+fn roundtrip( + payload: &[u8], + max_message_size: usize, + reply_subject: Option<&str>, +) -> (Vec, Option) { + let message_id = [0u8; 16]; + let chunks = split_payload_into_chunks(payload, max_message_size, message_id, reply_subject) + .expect("split failed"); + let chunk_count = chunks.len() as u32; + + let mut tracker = ChunkTracker::new(); + let mut final_result = None; + + for (i, chunk_payload) in chunks.into_iter().enumerate() { + let encoded = encode_chunk( + chunk_payload, + i as u32, + chunk_count, + message_id, + reply_subject.map(|s| s.to_string()), + ) + .expect("encode failed"); + + let result = tracker + .process_chunk(&encoded) + .expect("process_chunk failed"); + + if i < (chunk_count as usize - 1) { + assert!( + result.is_none(), + "expected None for intermediate chunk {}", + i + ); + } else { + assert!(result.is_some(), "expected Some for final chunk"); + final_result = result; + } + } + + final_result.unwrap() +} + +#[test] +fn test_single_chunk_small_payload() { + setup_logging(); + + let payload = b"hello world"; + let (reassembled, reply) = roundtrip(payload, 1024, None); + assert_eq!(reassembled, payload); + assert_eq!(reply, None); +} + +#[test] +fn test_multi_chunk_roundtrip() { + setup_logging(); + + let payload: Vec = (0..10000_usize).map(|i| (i % 256) as u8).collect(); + let (reassembled, reply) = roundtrip(&payload, 512, None); + assert_eq!(reassembled, payload); + assert_eq!(reply, None); +} + +#[test] +fn test_empty_payload() { + setup_logging(); + + let payload = b""; + let (reassembled, reply) = roundtrip(payload, 512, None); + assert_eq!(reassembled, payload); + assert_eq!(reply, None); +} + +#[test] +fn test_reply_subject_preserved_single_chunk() { + setup_logging(); + + let payload = b"hello"; + let (reassembled, reply) = roundtrip(payload, 1024, Some("_INBOX.abc")); + assert_eq!(reassembled, payload); + assert_eq!(reply, Some("_INBOX.abc".to_string())); +} + +#[test] +fn test_reply_subject_preserved_multi_chunk() { + 
setup_logging(); + + let payload: Vec = (0..5000_usize).map(|i| (i % 256) as u8).collect(); + let (reassembled, reply) = roundtrip(&payload, 512, Some("_INBOX.xyz")); + assert_eq!(reassembled, payload); + assert_eq!(reply, Some("_INBOX.xyz".to_string())); +} + +/// Verifies that every encoded chunk fits within the declared max_message_size. +#[test] +fn test_encoded_chunks_fit_within_limit() { + setup_logging(); + + let max_message_size = 512; + let payload: Vec = (0..5000_usize).map(|i| (i % 256) as u8).collect(); + let message_id = [1u8; 16]; + + let chunks = split_payload_into_chunks(&payload, max_message_size, message_id, None).unwrap(); + let chunk_count = chunks.len() as u32; + assert!(chunk_count > 1, "expected multi-chunk message"); + + for (i, chunk_payload) in chunks.into_iter().enumerate() { + let encoded = encode_chunk(chunk_payload, i as u32, chunk_count, message_id, None).unwrap(); + assert!( + encoded.len() <= max_message_size, + "chunk {} is {} bytes, exceeds limit of {}", + i, + encoded.len(), + max_message_size + ); + } +} + +/// Verifies that encoded chunks including the reply_subject fit within the limit. 
+#[test] +fn test_encoded_chunks_with_reply_fit_within_limit() { + setup_logging(); + + let max_message_size = 512; + let reply_subject = "_INBOX.some-reply-subject"; + let payload: Vec = (0..5000_usize).map(|i| (i % 256) as u8).collect(); + let message_id = [2u8; 16]; + + let chunks = + split_payload_into_chunks(&payload, max_message_size, message_id, Some(reply_subject)) + .unwrap(); + let chunk_count = chunks.len() as u32; + + for (i, chunk_payload) in chunks.into_iter().enumerate() { + let reply = if i == 0 { + Some(reply_subject.to_string()) + } else { + None + }; + let encoded = + encode_chunk(chunk_payload, i as u32, chunk_count, message_id, reply).unwrap(); + assert!( + encoded.len() <= max_message_size, + "chunk {} is {} bytes, exceeds limit of {}", + i, + encoded.len(), + max_message_size + ); + } +} + +/// Two messages with different IDs can be tracked simultaneously, even when +/// their chunks arrive interleaved. +#[test] +fn test_multiple_concurrent_messages() { + setup_logging(); + + let message_id_1 = [1u8; 16]; + let message_id_2 = [2u8; 16]; + let max_message_size = 512; + + let payload1: Vec = (0..2000_usize).map(|i| (i % 256) as u8).collect(); + let payload2: Vec = (0..2000_usize).map(|i| ((i + 128) % 256) as u8).collect(); + + let chunks1 = + split_payload_into_chunks(&payload1, max_message_size, message_id_1, None).unwrap(); + let chunks2 = + split_payload_into_chunks(&payload2, max_message_size, message_id_2, None).unwrap(); + assert!(chunks1.len() > 1, "expected multi-chunk for message 1"); + assert!(chunks2.len() > 1, "expected multi-chunk for message 2"); + + let chunk_count1 = chunks1.len() as u32; + let chunk_count2 = chunks2.len() as u32; + + let encoded1: Vec> = chunks1 + .into_iter() + .enumerate() + .map(|(i, p)| encode_chunk(p, i as u32, chunk_count1, message_id_1, None).unwrap()) + .collect(); + let encoded2: Vec> = chunks2 + .into_iter() + .enumerate() + .map(|(i, p)| encode_chunk(p, i as u32, chunk_count2, message_id_2, 
None).unwrap()) + .collect(); + + let mut tracker = ChunkTracker::new(); + let mut result1 = None; + let mut result2 = None; + + // Feed chunks from both messages in alternating order. + let max_len = encoded1.len().max(encoded2.len()); + for i in 0..max_len { + if i < encoded1.len() { + let r = tracker.process_chunk(&encoded1[i]).unwrap(); + if r.is_some() { + result1 = r; + } + } + if i < encoded2.len() { + let r = tracker.process_chunk(&encoded2[i]).unwrap(); + if r.is_some() { + result2 = r; + } + } + } + + assert_eq!(result1.expect("message 1 not reassembled").0, payload1); + assert_eq!(result2.expect("message 2 not reassembled").0, payload2); +} + +/// Sending a later chunk before an earlier one returns an error. +#[test] +fn test_out_of_order_chunk_error() { + setup_logging(); + + let message_id = [3u8; 16]; + let max_message_size = 256; + let payload: Vec = (0..3000_usize).map(|i| (i % 256) as u8).collect(); + + let chunks = split_payload_into_chunks(&payload, max_message_size, message_id, None).unwrap(); + let chunk_count = chunks.len() as u32; + assert!( + chunk_count >= 3, + "need at least 3 chunks, got {}", + chunk_count + ); + + let encoded: Vec> = chunks + .into_iter() + .enumerate() + .map(|(i, p)| encode_chunk(p, i as u32, chunk_count, message_id, None).unwrap()) + .collect(); + + let mut tracker = ChunkTracker::new(); + + // First chunk is accepted. + assert!(tracker.process_chunk(&encoded[0]).unwrap().is_none()); + + // Skipping chunk 1 and sending chunk 2 should fail. + let err = tracker.process_chunk(&encoded[2]).unwrap_err(); + assert!( + err.to_string().contains("expected chunk"), + "expected order error, got: {}", + err + ); +} + +/// A MessageChunk with no preceding MessageStart returns an error. 
+#[test] +fn test_orphan_chunk_without_start() { + setup_logging(); + + let message_id = [4u8; 16]; + let encoded = encode_chunk(b"orphan".to_vec(), 1, 3, message_id, None).unwrap(); + + let mut tracker = ChunkTracker::new(); + let err = tracker.process_chunk(&encoded).unwrap_err(); + assert!( + err.to_string().contains("no matching buffer found"), + "expected missing buffer error, got: {}", + err + ); +} + +#[test] +fn test_split_count_single_vs_multi() { + setup_logging(); + + let message_id = [5u8; 16]; + let max_message_size = 256; + + let small = vec![0u8; 10]; + let chunks = split_payload_into_chunks(&small, max_message_size, message_id, None).unwrap(); + assert_eq!( + chunks.len(), + 1, + "small payload should produce exactly 1 chunk" + ); + + let large = vec![0u8; max_message_size * 10]; + let chunks = split_payload_into_chunks(&large, max_message_size, message_id, None).unwrap(); + assert!( + chunks.len() > 1, + "large payload should produce multiple chunks" + ); +} diff --git a/engine/sdks/rust/runner-protocol/src/lib.rs b/engine/sdks/rust/runner-protocol/src/lib.rs index 2b6ddcdb65..befa759806 100644 --- a/engine/sdks/rust/runner-protocol/src/lib.rs +++ b/engine/sdks/rust/runner-protocol/src/lib.rs @@ -6,10 +6,10 @@ pub mod versioned; // Re-export latest pub use generated::v3::*; -pub use generated::v5 as mk2; +pub use generated::v6 as mk2; pub const PROTOCOL_MK1_VERSION: u16 = 3; -pub const PROTOCOL_MK2_VERSION: u16 = 5; +pub const PROTOCOL_MK2_VERSION: u16 = 6; pub fn is_mk2(protocol_version: u16) -> bool { protocol_version > PROTOCOL_MK1_VERSION diff --git a/engine/sdks/rust/runner-protocol/src/versioned.rs b/engine/sdks/rust/runner-protocol/src/versioned.rs index 1f440010c6..f3cca5d099 100644 --- a/engine/sdks/rust/runner-protocol/src/versioned.rs +++ b/engine/sdks/rust/runner-protocol/src/versioned.rs @@ -2,23 +2,24 @@ use anyhow::{Ok, Result, bail}; use vbare::OwnedVersionedData; use crate::PROTOCOL_MK1_VERSION; -use crate::generated::{v1, v2, v3, 
v4, v5}; +use crate::generated::{v1, v2, v3, v4, v5, v6}; use crate::uuid_compat::{decode_bytes_from_uuid, encode_bytes_to_uuid}; pub enum ToClientMk2 { V4(v4::ToClient), V5(v5::ToClient), + V6(v6::ToClient), } impl OwnedVersionedData for ToClientMk2 { - type Latest = v5::ToClient; + type Latest = v6::ToClient; - fn wrap_latest(latest: v5::ToClient) -> Self { - ToClientMk2::V5(latest) + fn wrap_latest(latest: v6::ToClient) -> Self { + ToClientMk2::V6(latest) } fn unwrap_latest(self) -> Result { - if let ToClientMk2::V5(data) = self { + if let ToClientMk2::V6(data) = self { Ok(data) } else { bail!("version not latest"); @@ -29,6 +30,7 @@ impl OwnedVersionedData for ToClientMk2 { match version { 4 => Ok(ToClientMk2::V4(serde_bare::from_slice(payload)?)), 5 => Ok(ToClientMk2::V5(serde_bare::from_slice(payload)?)), + 6 => Ok(ToClientMk2::V6(serde_bare::from_slice(payload)?)), _ => bail!("invalid version: {version}"), } } @@ -37,17 +39,18 @@ impl OwnedVersionedData for ToClientMk2 { match self { ToClientMk2::V4(data) => serde_bare::to_vec(&data).map_err(Into::into), ToClientMk2::V5(data) => serde_bare::to_vec(&data).map_err(Into::into), + ToClientMk2::V6(data) => serde_bare::to_vec(&data).map_err(Into::into), } } fn deserialize_converters() -> Vec Result> { // No changes between v1 and v4 - vec![Ok, Ok, Ok, Self::v4_to_v5] + vec![Ok, Ok, Ok, Self::v4_to_v5, Self::v5_to_v6] } fn serialize_converters() -> Vec Result> { // No changes between v1 and v4 - vec![Self::v5_to_v4, Ok, Ok, Ok] + vec![Self::v6_to_v5, Self::v5_to_v4, Ok, Ok, Ok] } } @@ -225,22 +228,192 @@ impl ToClientMk2 { bail!("unexpected version"); } } + + fn v5_to_v6(self) -> Result { + if let ToClientMk2::V5(x) = self { + let inner = match x { + v5::ToClient::ToClientInit(init) => v6::ToClient::ToClientInit(v6::ToClientInit { + runner_id: init.runner_id, + metadata: v6::ProtocolMetadata { + runner_lost_threshold: init.metadata.runner_lost_threshold, + actor_stop_threshold: 0, + serverless_drain_grace_period: 
None, + }, + }), + v5::ToClient::ToClientCommands(commands) => v6::ToClient::ToClientCommands( + commands + .into_iter() + .map(|cmd| v6::CommandWrapper { + checkpoint: v6::ActorCheckpoint { + actor_id: cmd.checkpoint.actor_id, + generation: cmd.checkpoint.generation, + index: cmd.checkpoint.index, + }, + inner: match cmd.inner { + v5::Command::CommandStartActor(start) => { + v6::Command::CommandStartActor(v6::CommandStartActor { + config: v6::ActorConfig { + name: start.config.name, + key: start.config.key, + create_ts: start.config.create_ts, + input: start.config.input, + }, + hibernating_requests: start + .hibernating_requests + .into_iter() + .map(|req| v6::HibernatingRequest { + gateway_id: req.gateway_id, + request_id: req.request_id, + }) + .collect(), + }) + } + v5::Command::CommandStopActor => v6::Command::CommandStopActor, + }, + }) + .collect(), + ), + v5::ToClient::ToClientAckEvents(ack) => { + v6::ToClient::ToClientAckEvents(v6::ToClientAckEvents { + last_event_checkpoints: ack + .last_event_checkpoints + .into_iter() + .map(|cp| v6::ActorCheckpoint { + actor_id: cp.actor_id, + generation: cp.generation, + index: cp.index, + }) + .collect(), + }) + } + v5::ToClient::ToClientKvResponse(resp) => { + v6::ToClient::ToClientKvResponse(v6::ToClientKvResponse { + request_id: resp.request_id, + data: convert_kv_response_data_v5_to_v6(resp.data), + }) + } + v5::ToClient::ToClientTunnelMessage(msg) => { + v6::ToClient::ToClientTunnelMessage(v6::ToClientTunnelMessage { + message_id: v6::MessageId { + gateway_id: msg.message_id.gateway_id, + request_id: msg.message_id.request_id, + message_index: msg.message_id.message_index, + }, + message_kind: convert_to_client_tunnel_message_kind_v5_to_v6( + msg.message_kind, + ), + }) + } + v5::ToClient::ToClientPing(ping) => { + v6::ToClient::ToClientPing(v6::ToClientPing { ts: ping.ts }) + } + }; + + Ok(ToClientMk2::V6(inner)) + } else { + bail!("unexpected version"); + } + } + + fn v6_to_v5(self) -> Result { + if let 
ToClientMk2::V6(x) = self { + let inner = match x { + v6::ToClient::ToClientInit(init) => v5::ToClient::ToClientInit(v5::ToClientInit { + runner_id: init.runner_id, + metadata: v5::ProtocolMetadata { + runner_lost_threshold: init.metadata.runner_lost_threshold, + }, + }), + v6::ToClient::ToClientCommands(commands) => v5::ToClient::ToClientCommands( + commands + .into_iter() + .map(|cmd| v5::CommandWrapper { + checkpoint: v5::ActorCheckpoint { + actor_id: cmd.checkpoint.actor_id, + generation: cmd.checkpoint.generation, + index: cmd.checkpoint.index, + }, + inner: match cmd.inner { + v6::Command::CommandStartActor(start) => { + v5::Command::CommandStartActor(v5::CommandStartActor { + config: v5::ActorConfig { + name: start.config.name, + key: start.config.key, + create_ts: start.config.create_ts, + input: start.config.input, + }, + hibernating_requests: start + .hibernating_requests + .into_iter() + .map(|req| v5::HibernatingRequest { + gateway_id: req.gateway_id, + request_id: req.request_id, + }) + .collect(), + }) + } + v6::Command::CommandStopActor => v5::Command::CommandStopActor, + }, + }) + .collect(), + ), + v6::ToClient::ToClientAckEvents(ack) => { + v5::ToClient::ToClientAckEvents(v5::ToClientAckEvents { + last_event_checkpoints: ack + .last_event_checkpoints + .into_iter() + .map(|cp| v5::ActorCheckpoint { + actor_id: cp.actor_id, + generation: cp.generation, + index: cp.index, + }) + .collect(), + }) + } + v6::ToClient::ToClientKvResponse(resp) => { + v5::ToClient::ToClientKvResponse(v5::ToClientKvResponse { + request_id: resp.request_id, + data: convert_kv_response_data_v6_to_v5(resp.data), + }) + } + v6::ToClient::ToClientTunnelMessage(msg) => { + v5::ToClient::ToClientTunnelMessage(v5::ToClientTunnelMessage { + message_id: v5::MessageId { + gateway_id: msg.message_id.gateway_id, + request_id: msg.message_id.request_id, + message_index: msg.message_id.message_index, + }, + message_kind: convert_to_client_tunnel_message_kind_v6_to_v5( + 
msg.message_kind, + ), + }) + } + v6::ToClient::ToClientPing(ping) => { + v5::ToClient::ToClientPing(v5::ToClientPing { ts: ping.ts }) + } + }; + + Ok(ToClientMk2::V5(inner)) + } else { + bail!("unexpected version"); + } + } } pub enum ToServerMk2 { V4(v4::ToServer), - V5(v5::ToServer), + V6(v6::ToServer), } impl OwnedVersionedData for ToServerMk2 { - type Latest = v5::ToServer; + type Latest = v6::ToServer; - fn wrap_latest(latest: v5::ToServer) -> Self { - ToServerMk2::V5(latest) + fn wrap_latest(latest: v6::ToServer) -> Self { + ToServerMk2::V6(latest) } fn unwrap_latest(self) -> Result { - if let ToServerMk2::V5(data) = self { + if let ToServerMk2::V6(data) = self { Ok(data) } else { bail!("version not latest"); @@ -250,7 +423,8 @@ impl OwnedVersionedData for ToServerMk2 { fn deserialize_version(payload: &[u8], version: u16) -> Result { match version { 4 => Ok(ToServerMk2::V4(serde_bare::from_slice(payload)?)), - 5 => Ok(ToServerMk2::V5(serde_bare::from_slice(payload)?)), + // v5 and v6 have the same ToServer binary format + 5 | 6 => Ok(ToServerMk2::V6(serde_bare::from_slice(payload)?)), _ => bail!("invalid version: {version}"), } } @@ -258,26 +432,26 @@ impl OwnedVersionedData for ToServerMk2 { fn serialize_version(self, _version: u16) -> Result> { match self { ToServerMk2::V4(data) => serde_bare::to_vec(&data).map_err(Into::into), - ToServerMk2::V5(data) => serde_bare::to_vec(&data).map_err(Into::into), + ToServerMk2::V6(data) => serde_bare::to_vec(&data).map_err(Into::into), } } fn deserialize_converters() -> Vec Result> { - // No changes between v1 and v4 - vec![Ok, Ok, Ok, Self::v4_to_v5] + // No changes between v1 and v4, no changes between v5 and v6 + vec![Ok, Ok, Ok, Self::v4_to_v6, Ok] } fn serialize_converters() -> Vec Result> { - // No changes between v1 and v4 - vec![Self::v5_to_v4, Ok, Ok, Ok] + // No changes between v1 and v4, no changes between v5 and v6 + vec![Ok, Self::v6_to_v4, Ok, Ok, Ok] } } impl ToServerMk2 { - fn v4_to_v5(self) -> Result { 
+ fn v4_to_v6(self) -> Result { if let ToServerMk2::V4(x) = self { let inner = match x { - v4::ToServer::ToServerInit(init) => v5::ToServer::ToServerInit(v5::ToServerInit { + v4::ToServer::ToServerInit(init) => v6::ToServer::ToServerInit(v6::ToServerInit { name: init.name, version: init.version, total_slots: init.total_slots, @@ -286,7 +460,7 @@ impl ToServerMk2 { .map(|(k, v)| { ( k, - v5::ActorName { + v6::ActorName { metadata: v.metadata, }, ) @@ -295,7 +469,7 @@ impl ToServerMk2 { }), metadata: init.metadata, }), - v4::ToServer::ToServerEvents(events) => v5::ToServer::ToServerEvents( + v4::ToServer::ToServerEvents(events) => v6::ToServer::ToServerEvents( events .into_iter() .map(|event| { @@ -305,27 +479,27 @@ impl ToServerMk2 { v4::Event::EventActorSetAlarm(alarm) => alarm.generation, }; - v5::EventWrapper { - checkpoint: v5::ActorCheckpoint { + v6::EventWrapper { + checkpoint: v6::ActorCheckpoint { actor_id: event.checkpoint.actor_id, generation, index: event.checkpoint.index, }, inner: match event.inner { v4::Event::EventActorIntent(intent) => { - v5::Event::EventActorIntent(v5::EventActorIntent { - intent: convert_actor_intent_v4_to_v5(intent.intent), + v6::Event::EventActorIntent(v6::EventActorIntent { + intent: convert_actor_intent_v4_to_v6(intent.intent), }) } v4::Event::EventActorStateUpdate(state) => { - v5::Event::EventActorStateUpdate( - v5::EventActorStateUpdate { - state: convert_actor_state_v4_to_v5(state.state), + v6::Event::EventActorStateUpdate( + v6::EventActorStateUpdate { + state: convert_actor_state_v4_to_v6(state.state), }, ) } v4::Event::EventActorSetAlarm(alarm) => { - v5::Event::EventActorSetAlarm(v5::EventActorSetAlarm { + v6::Event::EventActorSetAlarm(v6::EventActorSetAlarm { alarm_ts: alarm.alarm_ts, }) } @@ -335,11 +509,11 @@ impl ToServerMk2 { .collect(), ), v4::ToServer::ToServerAckCommands(ack) => { - v5::ToServer::ToServerAckCommands(v5::ToServerAckCommands { + v6::ToServer::ToServerAckCommands(v6::ToServerAckCommands { 
last_command_checkpoints: ack .last_command_checkpoints .into_iter() - .map(|cp| v5::ActorCheckpoint { + .map(|cp| v6::ActorCheckpoint { actor_id: cp.actor_id, generation: 0, // Unknown in v4, use default index: cp.index, @@ -347,41 +521,41 @@ impl ToServerMk2 { .collect(), }) } - v4::ToServer::ToServerStopping => v5::ToServer::ToServerStopping, + v4::ToServer::ToServerStopping => v6::ToServer::ToServerStopping, v4::ToServer::ToServerPong(pong) => { - v5::ToServer::ToServerPong(v5::ToServerPong { ts: pong.ts }) + v6::ToServer::ToServerPong(v6::ToServerPong { ts: pong.ts }) } v4::ToServer::ToServerKvRequest(req) => { - v5::ToServer::ToServerKvRequest(v5::ToServerKvRequest { + v6::ToServer::ToServerKvRequest(v6::ToServerKvRequest { actor_id: req.actor_id, request_id: req.request_id, - data: convert_kv_request_data_v4_to_v5(req.data), + data: convert_kv_request_data_v4_to_v6(req.data), }) } v4::ToServer::ToServerTunnelMessage(msg) => { - v5::ToServer::ToServerTunnelMessage(v5::ToServerTunnelMessage { - message_id: v5::MessageId { + v6::ToServer::ToServerTunnelMessage(v6::ToServerTunnelMessage { + message_id: v6::MessageId { gateway_id: msg.message_id.gateway_id, request_id: msg.message_id.request_id, message_index: msg.message_id.message_index, }, - message_kind: convert_to_server_tunnel_message_kind_v4_to_v5( + message_kind: convert_to_server_tunnel_message_kind_v4_to_v6( msg.message_kind, ), }) } }; - Ok(ToServerMk2::V5(inner)) + Ok(ToServerMk2::V6(inner)) } else { bail!("unexpected version"); } } - fn v5_to_v4(self) -> Result { - if let ToServerMk2::V5(x) = self { + fn v6_to_v4(self) -> Result { + if let ToServerMk2::V6(x) = self { let inner = match x { - v5::ToServer::ToServerInit(init) => v4::ToServer::ToServerInit(v4::ToServerInit { + v6::ToServer::ToServerInit(init) => v4::ToServer::ToServerInit(v4::ToServerInit { name: init.name, version: init.version, total_slots: init.total_slots, @@ -399,7 +573,7 @@ impl ToServerMk2 { }), metadata: init.metadata, }), - 
v5::ToServer::ToServerEvents(events) => v4::ToServer::ToServerEvents( + v6::ToServer::ToServerEvents(events) => v4::ToServer::ToServerEvents( events .into_iter() .map(|event| v4::EventWrapper { @@ -408,21 +582,21 @@ impl ToServerMk2 { index: event.checkpoint.index, }, inner: match event.inner { - v5::Event::EventActorIntent(intent) => { + v6::Event::EventActorIntent(intent) => { v4::Event::EventActorIntent(v4::EventActorIntent { actor_id: event.checkpoint.actor_id, generation: event.checkpoint.generation, - intent: convert_actor_intent_v5_to_v4(intent.intent), + intent: convert_actor_intent_v6_to_v4(intent.intent), }) } - v5::Event::EventActorStateUpdate(state) => { + v6::Event::EventActorStateUpdate(state) => { v4::Event::EventActorStateUpdate(v4::EventActorStateUpdate { actor_id: event.checkpoint.actor_id, generation: event.checkpoint.generation, - state: convert_actor_state_v5_to_v4(state.state), + state: convert_actor_state_v6_to_v4(state.state), }) } - v5::Event::EventActorSetAlarm(alarm) => { + v6::Event::EventActorSetAlarm(alarm) => { v4::Event::EventActorSetAlarm(v4::EventActorSetAlarm { actor_id: event.checkpoint.actor_id, generation: event.checkpoint.generation, @@ -433,7 +607,7 @@ impl ToServerMk2 { }) .collect(), ), - v5::ToServer::ToServerAckCommands(ack) => { + v6::ToServer::ToServerAckCommands(ack) => { v4::ToServer::ToServerAckCommands(v4::ToServerAckCommands { last_command_checkpoints: ack .last_command_checkpoints @@ -445,25 +619,25 @@ impl ToServerMk2 { .collect(), }) } - v5::ToServer::ToServerStopping => v4::ToServer::ToServerStopping, - v5::ToServer::ToServerPong(pong) => { + v6::ToServer::ToServerStopping => v4::ToServer::ToServerStopping, + v6::ToServer::ToServerPong(pong) => { v4::ToServer::ToServerPong(v4::ToServerPong { ts: pong.ts }) } - v5::ToServer::ToServerKvRequest(req) => { + v6::ToServer::ToServerKvRequest(req) => { v4::ToServer::ToServerKvRequest(v4::ToServerKvRequest { actor_id: req.actor_id, request_id: req.request_id, - data: 
convert_kv_request_data_v5_to_v4(req.data), + data: convert_kv_request_data_v6_to_v4(req.data), }) } - v5::ToServer::ToServerTunnelMessage(msg) => { + v6::ToServer::ToServerTunnelMessage(msg) => { v4::ToServer::ToServerTunnelMessage(v4::ToServerTunnelMessage { message_id: v4::MessageId { gateway_id: msg.message_id.gateway_id, request_id: msg.message_id.request_id, message_index: msg.message_id.message_index, }, - message_kind: convert_to_server_tunnel_message_kind_v5_to_v4( + message_kind: convert_to_server_tunnel_message_kind_v6_to_v4( msg.message_kind, )?, }) @@ -479,18 +653,18 @@ impl ToServerMk2 { pub enum ToRunnerMk2 { V4(v4::ToRunner), - V5(v5::ToRunner), + V6(v6::ToRunner), } impl OwnedVersionedData for ToRunnerMk2 { - type Latest = v5::ToRunner; + type Latest = v6::ToRunner; - fn wrap_latest(latest: v5::ToRunner) -> Self { - ToRunnerMk2::V5(latest) + fn wrap_latest(latest: v6::ToRunner) -> Self { + ToRunnerMk2::V6(latest) } fn unwrap_latest(self) -> Result { - if let ToRunnerMk2::V5(data) = self { + if let ToRunnerMk2::V6(data) = self { Ok(data) } else { bail!("version not latest"); @@ -500,7 +674,8 @@ impl OwnedVersionedData for ToRunnerMk2 { fn deserialize_version(payload: &[u8], version: u16) -> Result { match version { 4 => Ok(ToRunnerMk2::V4(serde_bare::from_slice(payload)?)), - 5 => Ok(ToRunnerMk2::V5(serde_bare::from_slice(payload)?)), + // v5 and v6 have the same ToRunner binary format + 5 | 6 => Ok(ToRunnerMk2::V6(serde_bare::from_slice(payload)?)), _ => bail!("invalid version: {version}"), } } @@ -508,36 +683,36 @@ impl OwnedVersionedData for ToRunnerMk2 { fn serialize_version(self, _version: u16) -> Result> { match self { ToRunnerMk2::V4(data) => serde_bare::to_vec(&data).map_err(Into::into), - ToRunnerMk2::V5(data) => serde_bare::to_vec(&data).map_err(Into::into), + ToRunnerMk2::V6(data) => serde_bare::to_vec(&data).map_err(Into::into), } } fn deserialize_converters() -> Vec Result> { - // No changes between v1 and v4 - vec![Ok, Ok, Ok, 
Self::v4_to_v5] + // No changes between v1 and v4, no changes between v5 and v6 + vec![Ok, Ok, Ok, Self::v4_to_v6, Ok] } fn serialize_converters() -> Vec Result> { - // No changes between v1 and v4 - vec![Self::v5_to_v4, Ok, Ok, Ok] + // No changes between v1 and v4, no changes between v5 and v6 + vec![Ok, Self::v6_to_v4, Ok, Ok, Ok] } } impl ToRunnerMk2 { - fn v4_to_v5(self) -> Result { + fn v4_to_v6(self) -> Result { if let ToRunnerMk2::V4(x) = self { let inner = match x { - v4::ToRunner::ToRunnerPing(ping) => v5::ToRunner::ToRunnerPing(v5::ToRunnerPing { + v4::ToRunner::ToRunnerPing(ping) => v6::ToRunner::ToRunnerPing(v6::ToRunnerPing { gateway_id: ping.gateway_id, request_id: ping.request_id, ts: ping.ts, }), - v4::ToRunner::ToRunnerClose => v5::ToRunner::ToRunnerClose, - v4::ToRunner::ToClientCommands(commands) => v5::ToRunner::ToClientCommands( + v4::ToRunner::ToRunnerClose => v6::ToRunner::ToRunnerClose, + v4::ToRunner::ToClientCommands(commands) => v6::ToRunner::ToClientCommands( commands .into_iter() - .map(|cmd| v5::CommandWrapper { - checkpoint: v5::ActorCheckpoint { + .map(|cmd| v6::CommandWrapper { + checkpoint: v6::ActorCheckpoint { actor_id: cmd.checkpoint.actor_id, generation: match &cmd.inner { v4::Command::CommandStartActor(start) => start.generation, @@ -547,8 +722,8 @@ impl ToRunnerMk2 { }, inner: match cmd.inner { v4::Command::CommandStartActor(start) => { - v5::Command::CommandStartActor(v5::CommandStartActor { - config: v5::ActorConfig { + v6::Command::CommandStartActor(v6::CommandStartActor { + config: v6::ActorConfig { name: start.config.name, key: start.config.key, create_ts: start.config.create_ts, @@ -557,24 +732,24 @@ impl ToRunnerMk2 { hibernating_requests: start .hibernating_requests .into_iter() - .map(|req| v5::HibernatingRequest { + .map(|req| v6::HibernatingRequest { gateway_id: req.gateway_id, request_id: req.request_id, }) .collect(), }) } - v4::Command::CommandStopActor(_) => v5::Command::CommandStopActor, + 
v4::Command::CommandStopActor(_) => v6::Command::CommandStopActor, }, }) .collect(), ), v4::ToRunner::ToClientAckEvents(ack) => { - v5::ToRunner::ToClientAckEvents(v5::ToClientAckEvents { + v6::ToRunner::ToClientAckEvents(v6::ToClientAckEvents { last_event_checkpoints: ack .last_event_checkpoints .into_iter() - .map(|cp| v5::ActorCheckpoint { + .map(|cp| v6::ActorCheckpoint { actor_id: cp.actor_id, generation: 0, // Unknown in v4, use default index: cp.index, @@ -583,35 +758,35 @@ impl ToRunnerMk2 { }) } v4::ToRunner::ToClientTunnelMessage(msg) => { - v5::ToRunner::ToClientTunnelMessage(v5::ToClientTunnelMessage { - message_id: v5::MessageId { + v6::ToRunner::ToClientTunnelMessage(v6::ToClientTunnelMessage { + message_id: v6::MessageId { gateway_id: msg.message_id.gateway_id, request_id: msg.message_id.request_id, message_index: msg.message_id.message_index, }, - message_kind: convert_to_client_tunnel_message_kind_v4_to_v5( + message_kind: convert_to_client_tunnel_message_kind_v4_to_v6( msg.message_kind, ), }) } }; - Ok(ToRunnerMk2::V5(inner)) + Ok(ToRunnerMk2::V6(inner)) } else { bail!("unexpected version"); } } - fn v5_to_v4(self) -> Result { - if let ToRunnerMk2::V5(x) = self { + fn v6_to_v4(self) -> Result { + if let ToRunnerMk2::V6(x) = self { let inner = match x { - v5::ToRunner::ToRunnerPing(ping) => v4::ToRunner::ToRunnerPing(v4::ToRunnerPing { + v6::ToRunner::ToRunnerPing(ping) => v4::ToRunner::ToRunnerPing(v4::ToRunnerPing { gateway_id: ping.gateway_id, request_id: ping.request_id, ts: ping.ts, }), - v5::ToRunner::ToRunnerClose => v4::ToRunner::ToRunnerClose, - v5::ToRunner::ToClientCommands(commands) => v4::ToRunner::ToClientCommands( + v6::ToRunner::ToRunnerClose => v4::ToRunner::ToRunnerClose, + v6::ToRunner::ToClientCommands(commands) => v4::ToRunner::ToClientCommands( commands .into_iter() .map(|cmd| v4::CommandWrapper { @@ -620,7 +795,7 @@ impl ToRunnerMk2 { index: cmd.checkpoint.index, }, inner: match cmd.inner { - 
v5::Command::CommandStartActor(start) => { + v6::Command::CommandStartActor(start) => { v4::Command::CommandStartActor(v4::CommandStartActor { generation: cmd.checkpoint.generation, config: v4::ActorConfig { @@ -639,7 +814,7 @@ impl ToRunnerMk2 { .collect(), }) } - v5::Command::CommandStopActor => { + v6::Command::CommandStopActor => { v4::Command::CommandStopActor(v4::CommandStopActor { generation: cmd.checkpoint.generation, }) @@ -648,7 +823,7 @@ impl ToRunnerMk2 { }) .collect(), ), - v5::ToRunner::ToClientAckEvents(ack) => { + v6::ToRunner::ToClientAckEvents(ack) => { v4::ToRunner::ToClientAckEvents(v4::ToClientAckEvents { last_event_checkpoints: ack .last_event_checkpoints @@ -660,14 +835,14 @@ impl ToRunnerMk2 { .collect(), }) } - v5::ToRunner::ToClientTunnelMessage(msg) => { + v6::ToRunner::ToClientTunnelMessage(msg) => { v4::ToRunner::ToClientTunnelMessage(v4::ToClientTunnelMessage { message_id: v4::MessageId { gateway_id: msg.message_id.gateway_id, request_id: msg.message_id.request_id, message_index: msg.message_id.message_index, }, - message_kind: convert_to_client_tunnel_message_kind_v5_to_v4( + message_kind: convert_to_client_tunnel_message_kind_v6_to_v4( msg.message_kind, ), }) @@ -1411,19 +1586,19 @@ impl OwnedVersionedData for ToRunner { pub enum ToGateway { V3(v3::ToGateway), - V5(v5::ToGateway), + V6(v6::ToGateway), } impl OwnedVersionedData for ToGateway { - type Latest = v5::ToGateway; + type Latest = v6::ToGateway; - fn wrap_latest(latest: v5::ToGateway) -> Self { - ToGateway::V5(latest) + fn wrap_latest(latest: v6::ToGateway) -> Self { + ToGateway::V6(latest) } fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] - if let ToGateway::V5(data) = self { + if let ToGateway::V6(data) = self { Ok(data) } else { bail!("version not latest"); @@ -1433,7 +1608,8 @@ impl OwnedVersionedData for ToGateway { fn deserialize_version(payload: &[u8], version: u16) -> Result { match version { 1 | 2 | 3 => 
Ok(ToGateway::V3(serde_bare::from_slice(payload)?)), - 4 | 5 => Ok(ToGateway::V5(serde_bare::from_slice(payload)?)), + // v4, v5, and v6 have the same ToGateway binary format + 4 | 5 | 6 => Ok(ToGateway::V6(serde_bare::from_slice(payload)?)), _ => bail!("invalid version: {version}"), } } @@ -1441,34 +1617,34 @@ impl OwnedVersionedData for ToGateway { fn serialize_version(self, _version: u16) -> Result> { match self { ToGateway::V3(data) => serde_bare::to_vec(&data).map_err(Into::into), - ToGateway::V5(data) => serde_bare::to_vec(&data).map_err(Into::into), + ToGateway::V6(data) => serde_bare::to_vec(&data).map_err(Into::into), } } fn deserialize_converters() -> Vec Result> { - // No changes between v1-v5 but we need a converter to bridge mk1 to mk2 - vec![Ok, Ok, Self::v3_to_v4, Ok] + // No changes between v1-v6 but we need a converter to bridge mk1 to mk2 + vec![Ok, Ok, Self::v3_to_v6, Ok, Ok] } fn serialize_converters() -> Vec Result> { - // No changes between v1-v5 but we need a converter to bridge mk2 to mk1 - vec![Ok, Self::v4_to_v3, Ok, Ok] + // No changes between v1-v6 but we need a converter to bridge mk2 to mk1 + vec![Ok, Ok, Self::v6_to_v3, Ok, Ok] } } impl ToGateway { - pub fn v3_to_v4(self) -> Result { + pub fn v3_to_v6(self) -> Result { if let ToGateway::V3(x) = self { let inner = match x { v3::ToGateway::ToGatewayPong(pong) => { - v5::ToGateway::ToGatewayPong(v5::ToGatewayPong { + v6::ToGateway::ToGatewayPong(v6::ToGatewayPong { request_id: pong.request_id, ts: pong.ts, }) } v3::ToGateway::ToServerTunnelMessage(msg) => { - v5::ToGateway::ToServerTunnelMessage(v5::ToServerTunnelMessage { - message_id: v5::MessageId { + v6::ToGateway::ToServerTunnelMessage(v6::ToServerTunnelMessage { + message_id: v6::MessageId { gateway_id: msg.message_id.gateway_id, request_id: msg.message_id.request_id, message_index: msg.message_id.message_index, @@ -1480,22 +1656,22 @@ impl ToGateway { } }; - Ok(ToGateway::V5(inner)) + Ok(ToGateway::V6(inner)) } else { 
bail!("unexpected version"); } } - fn v4_to_v3(self) -> Result { - if let ToGateway::V5(x) = self { + fn v6_to_v3(self) -> Result { + if let ToGateway::V6(x) = self { let inner = match x { - v5::ToGateway::ToGatewayPong(pong) => { + v6::ToGateway::ToGatewayPong(pong) => { v3::ToGateway::ToGatewayPong(v3::ToGatewayPong { request_id: pong.request_id, ts: pong.ts, }) } - v5::ToGateway::ToServerTunnelMessage(msg) => { + v6::ToGateway::ToServerTunnelMessage(msg) => { v3::ToGateway::ToServerTunnelMessage(v3::ToServerTunnelMessage { message_id: v3::MessageId { gateway_id: msg.message_id.gateway_id, @@ -1518,19 +1694,19 @@ impl ToGateway { pub enum ToServerlessServer { V3(v3::ToServerlessServer), - V5(v5::ToServerlessServer), + V6(v6::ToServerlessServer), } impl OwnedVersionedData for ToServerlessServer { - type Latest = v5::ToServerlessServer; + type Latest = v6::ToServerlessServer; - fn wrap_latest(latest: v5::ToServerlessServer) -> Self { - ToServerlessServer::V5(latest) + fn wrap_latest(latest: v6::ToServerlessServer) -> Self { + ToServerlessServer::V6(latest) } fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] - if let ToServerlessServer::V5(data) = self { + if let ToServerlessServer::V6(data) = self { Ok(data) } else { bail!("version not latest"); @@ -1540,7 +1716,8 @@ impl OwnedVersionedData for ToServerlessServer { fn deserialize_version(payload: &[u8], version: u16) -> Result { match version { 1 | 2 | 3 => Ok(ToServerlessServer::V3(serde_bare::from_slice(payload)?)), - 4 | 5 => Ok(ToServerlessServer::V5(serde_bare::from_slice(payload)?)), + // v4, v5, and v6 have the same ToServerlessServer binary format + 4 | 5 | 6 => Ok(ToServerlessServer::V6(serde_bare::from_slice(payload)?)), _ => bail!("invalid version: {version}"), } } @@ -1548,43 +1725,43 @@ impl OwnedVersionedData for ToServerlessServer { fn serialize_version(self, _version: u16) -> Result> { match self { ToServerlessServer::V3(data) => serde_bare::to_vec(&data).map_err(Into::into), - 
ToServerlessServer::V5(data) => serde_bare::to_vec(&data).map_err(Into::into), + ToServerlessServer::V6(data) => serde_bare::to_vec(&data).map_err(Into::into), } } fn deserialize_converters() -> Vec Result> { - // No changes between v1-v3, v4-v5 - vec![Ok, Ok, Self::v3_to_v4, Ok] + // No changes between v1-v3, v4-v6 + vec![Ok, Ok, Self::v3_to_v6, Ok, Ok] } fn serialize_converters() -> Vec Result> { - // No changes between v1-v3, v4-v5 - vec![Ok, Self::v4_to_v3, Ok, Ok] + // No changes between v1-v3, v4-v6 + vec![Ok, Ok, Self::v6_to_v3, Ok, Ok] } } impl ToServerlessServer { - fn v3_to_v4(self) -> Result { + fn v3_to_v6(self) -> Result { if let ToServerlessServer::V3(x) = self { let inner = match x { v3::ToServerlessServer::ToServerlessServerInit(init) => { - v5::ToServerlessServer::ToServerlessServerInit(v5::ToServerlessServerInit { + v6::ToServerlessServer::ToServerlessServerInit(v6::ToServerlessServerInit { runner_id: init.runner_id, runner_protocol_version: PROTOCOL_MK1_VERSION, }) } }; - Ok(ToServerlessServer::V5(inner)) + Ok(ToServerlessServer::V6(inner)) } else { bail!("unexpected version"); } } - fn v4_to_v3(self) -> Result { - if let ToServerlessServer::V5(x) = self { + fn v6_to_v3(self) -> Result { + if let ToServerlessServer::V6(x) = self { let inner = match x { - v5::ToServerlessServer::ToServerlessServerInit(init) => { + v6::ToServerlessServer::ToServerlessServerInit(init) => { v3::ToServerlessServer::ToServerlessServerInit(v3::ToServerlessServerInit { runner_id: init.runner_id, }) @@ -1600,18 +1777,18 @@ impl ToServerlessServer { pub enum ActorCommandKeyData { V4(v4::ActorCommandKeyData), - V5(v5::ActorCommandKeyData), + V6(v6::ActorCommandKeyData), } impl OwnedVersionedData for ActorCommandKeyData { - type Latest = v5::ActorCommandKeyData; + type Latest = v6::ActorCommandKeyData; - fn wrap_latest(latest: v5::ActorCommandKeyData) -> Self { - ActorCommandKeyData::V5(latest) + fn wrap_latest(latest: v6::ActorCommandKeyData) -> Self { + 
ActorCommandKeyData::V6(latest) } fn unwrap_latest(self) -> Result { - if let ActorCommandKeyData::V5(data) = self { + if let ActorCommandKeyData::V6(data) = self { Ok(data) } else { bail!("version not latest"); @@ -1621,7 +1798,8 @@ impl OwnedVersionedData for ActorCommandKeyData { fn deserialize_version(payload: &[u8], version: u16) -> Result { match version { 4 => Ok(ActorCommandKeyData::V4(serde_bare::from_slice(payload)?)), - 5 => Ok(ActorCommandKeyData::V5(serde_bare::from_slice(payload)?)), + // v5 and v6 have the same ActorCommandKeyData binary format + 5 | 6 => Ok(ActorCommandKeyData::V6(serde_bare::from_slice(payload)?)), _ => bail!("invalid version: {version}"), } } @@ -1629,28 +1807,28 @@ impl OwnedVersionedData for ActorCommandKeyData { fn serialize_version(self, _version: u16) -> Result> { match self { ActorCommandKeyData::V4(data) => serde_bare::to_vec(&data).map_err(Into::into), - ActorCommandKeyData::V5(data) => serde_bare::to_vec(&data).map_err(Into::into), + ActorCommandKeyData::V6(data) => serde_bare::to_vec(&data).map_err(Into::into), } } fn deserialize_converters() -> Vec Result> { - // No changes between v1 and v4 - vec![Ok, Ok, Ok, Self::v4_to_v5] + // No changes between v1 and v4, no changes between v5 and v6 + vec![Ok, Ok, Ok, Self::v4_to_v6, Ok] } fn serialize_converters() -> Vec Result> { - // No changes between v1 and v4 - vec![Self::v5_to_v4, Ok, Ok, Ok] + // No changes between v1 and v4, no changes between v5 and v6 + vec![Ok, Self::v6_to_v4, Ok, Ok, Ok] } } impl ActorCommandKeyData { - fn v4_to_v5(self) -> Result { + fn v4_to_v6(self) -> Result { if let ActorCommandKeyData::V4(x) = self { let inner = match x { v4::ActorCommandKeyData::CommandStartActor(start) => { - v5::ActorCommandKeyData::CommandStartActor(v5::CommandStartActor { - config: v5::ActorConfig { + v6::ActorCommandKeyData::CommandStartActor(v6::CommandStartActor { + config: v6::ActorConfig { name: start.config.name, key: start.config.key, create_ts: 
start.config.create_ts, @@ -1659,7 +1837,7 @@ impl ActorCommandKeyData { hibernating_requests: start .hibernating_requests .into_iter() - .map(|req| v5::HibernatingRequest { + .map(|req| v6::HibernatingRequest { gateway_id: req.gateway_id, request_id: req.request_id, }) @@ -1667,22 +1845,21 @@ impl ActorCommandKeyData { }) } v4::ActorCommandKeyData::CommandStopActor(_) => { - v5::ActorCommandKeyData::CommandStopActor + v6::ActorCommandKeyData::CommandStopActor } }; - Ok(ActorCommandKeyData::V5(inner)) + Ok(ActorCommandKeyData::V6(inner)) } else { bail!("unexpected version"); } } - fn v5_to_v4(self) -> Result { - if let ActorCommandKeyData::V5(x) = self { - // Since v4 commands have generation but v5 doesn't, we can't fully convert back - // We'll use generation 0 as a placeholder + fn v6_to_v4(self) -> Result { + if let ActorCommandKeyData::V6(x) = self { + // Since v4 commands have generation but v6 doesn't, use generation 0 as a placeholder let inner = match x { - v5::ActorCommandKeyData::CommandStartActor(start) => { + v6::ActorCommandKeyData::CommandStartActor(start) => { v4::ActorCommandKeyData::CommandStartActor(v4::CommandStartActor { generation: 0, // Lost during conversion config: v4::ActorConfig { @@ -1701,7 +1878,7 @@ impl ActorCommandKeyData { .collect(), }) } - v5::ActorCommandKeyData::CommandStopActor => { + v6::ActorCommandKeyData::CommandStopActor => { v4::ActorCommandKeyData::CommandStopActor(v4::CommandStopActor { generation: 0, // Lost during conversion }) @@ -2625,10 +2802,10 @@ fn convert_kv_metadata_v3_to_v2(metadata: v3::KvMetadata) -> v2::KvMetadata { fn convert_to_server_tunnel_message_kind_v3_to_v4( kind: v3::ToServerTunnelMessageKind, -) -> v5::ToServerTunnelMessageKind { +) -> v6::ToServerTunnelMessageKind { match kind { v3::ToServerTunnelMessageKind::ToServerResponseStart(resp) => { - v5::ToServerTunnelMessageKind::ToServerResponseStart(v5::ToServerResponseStart { + 
v6::ToServerTunnelMessageKind::ToServerResponseStart(v6::ToServerResponseStart { status: resp.status, headers: resp.headers, body: resp.body, @@ -2636,32 +2813,32 @@ fn convert_to_server_tunnel_message_kind_v3_to_v4( }) } v3::ToServerTunnelMessageKind::ToServerResponseChunk(chunk) => { - v5::ToServerTunnelMessageKind::ToServerResponseChunk(v5::ToServerResponseChunk { + v6::ToServerTunnelMessageKind::ToServerResponseChunk(v6::ToServerResponseChunk { body: chunk.body, finish: chunk.finish, }) } v3::ToServerTunnelMessageKind::ToServerResponseAbort => { - v5::ToServerTunnelMessageKind::ToServerResponseAbort + v6::ToServerTunnelMessageKind::ToServerResponseAbort } v3::ToServerTunnelMessageKind::ToServerWebSocketOpen(open) => { - v5::ToServerTunnelMessageKind::ToServerWebSocketOpen(v5::ToServerWebSocketOpen { + v6::ToServerTunnelMessageKind::ToServerWebSocketOpen(v6::ToServerWebSocketOpen { can_hibernate: open.can_hibernate, }) } v3::ToServerTunnelMessageKind::ToServerWebSocketMessage(msg) => { - v5::ToServerTunnelMessageKind::ToServerWebSocketMessage(v5::ToServerWebSocketMessage { + v6::ToServerTunnelMessageKind::ToServerWebSocketMessage(v6::ToServerWebSocketMessage { data: msg.data, binary: msg.binary, }) } v3::ToServerTunnelMessageKind::ToServerWebSocketMessageAck(ack) => { - v5::ToServerTunnelMessageKind::ToServerWebSocketMessageAck( - v5::ToServerWebSocketMessageAck { index: ack.index }, + v6::ToServerTunnelMessageKind::ToServerWebSocketMessageAck( + v6::ToServerWebSocketMessageAck { index: ack.index }, ) } v3::ToServerTunnelMessageKind::ToServerWebSocketClose(close) => { - v5::ToServerTunnelMessageKind::ToServerWebSocketClose(v5::ToServerWebSocketClose { + v6::ToServerTunnelMessageKind::ToServerWebSocketClose(v6::ToServerWebSocketClose { code: close.code, reason: close.reason, hibernate: close.hibernate, @@ -2670,16 +2847,16 @@ fn convert_to_server_tunnel_message_kind_v3_to_v4( v3::ToServerTunnelMessageKind::DeprecatedTunnelAck => { // v4 removed 
DeprecatedTunnelAck, this should not occur in practice // but if it does, we'll convert it to a response abort as a safe fallback - v5::ToServerTunnelMessageKind::ToServerResponseAbort + v6::ToServerTunnelMessageKind::ToServerResponseAbort } } } fn convert_to_server_tunnel_message_kind_v4_to_v3( - kind: v5::ToServerTunnelMessageKind, + kind: v6::ToServerTunnelMessageKind, ) -> Result { Ok(match kind { - v5::ToServerTunnelMessageKind::ToServerResponseStart(resp) => { + v6::ToServerTunnelMessageKind::ToServerResponseStart(resp) => { v3::ToServerTunnelMessageKind::ToServerResponseStart(v3::ToServerResponseStart { status: resp.status, headers: resp.headers, @@ -2687,32 +2864,32 @@ fn convert_to_server_tunnel_message_kind_v4_to_v3( stream: resp.stream, }) } - v5::ToServerTunnelMessageKind::ToServerResponseChunk(chunk) => { + v6::ToServerTunnelMessageKind::ToServerResponseChunk(chunk) => { v3::ToServerTunnelMessageKind::ToServerResponseChunk(v3::ToServerResponseChunk { body: chunk.body, finish: chunk.finish, }) } - v5::ToServerTunnelMessageKind::ToServerResponseAbort => { + v6::ToServerTunnelMessageKind::ToServerResponseAbort => { v3::ToServerTunnelMessageKind::ToServerResponseAbort } - v5::ToServerTunnelMessageKind::ToServerWebSocketOpen(open) => { + v6::ToServerTunnelMessageKind::ToServerWebSocketOpen(open) => { v3::ToServerTunnelMessageKind::ToServerWebSocketOpen(v3::ToServerWebSocketOpen { can_hibernate: open.can_hibernate, }) } - v5::ToServerTunnelMessageKind::ToServerWebSocketMessage(msg) => { + v6::ToServerTunnelMessageKind::ToServerWebSocketMessage(msg) => { v3::ToServerTunnelMessageKind::ToServerWebSocketMessage(v3::ToServerWebSocketMessage { data: msg.data, binary: msg.binary, }) } - v5::ToServerTunnelMessageKind::ToServerWebSocketMessageAck(ack) => { + v6::ToServerTunnelMessageKind::ToServerWebSocketMessageAck(ack) => { v3::ToServerTunnelMessageKind::ToServerWebSocketMessageAck( v3::ToServerWebSocketMessageAck { index: ack.index }, ) } - 
v5::ToServerTunnelMessageKind::ToServerWebSocketClose(close) => { + v6::ToServerTunnelMessageKind::ToServerWebSocketClose(close) => { v3::ToServerTunnelMessageKind::ToServerWebSocketClose(v3::ToServerWebSocketClose { code: close.code, reason: close.reason, @@ -2724,7 +2901,7 @@ fn convert_to_server_tunnel_message_kind_v4_to_v3( // Used specifically for the gateway because there were no changes between mk2 and mk1 for the tunnel messages pub fn to_client_tunnel_message_mk2_to_mk1( - msg: v5::ToClientTunnelMessage, + msg: v6::ToClientTunnelMessage, ) -> v3::ToClientTunnelMessage { v3::ToClientTunnelMessage { message_id: v3::MessageId { @@ -2737,10 +2914,10 @@ pub fn to_client_tunnel_message_mk2_to_mk1( } fn convert_to_client_tunnel_message_kind_mk2_to_mk1( - kind: v5::ToClientTunnelMessageKind, + kind: v6::ToClientTunnelMessageKind, ) -> v3::ToClientTunnelMessageKind { match kind { - v5::ToClientTunnelMessageKind::ToClientRequestStart(req) => { + v6::ToClientTunnelMessageKind::ToClientRequestStart(req) => { v3::ToClientTunnelMessageKind::ToClientRequestStart(v3::ToClientRequestStart { actor_id: req.actor_id, method: req.method, @@ -2750,29 +2927,29 @@ fn convert_to_client_tunnel_message_kind_mk2_to_mk1( stream: req.stream, }) } - v5::ToClientTunnelMessageKind::ToClientRequestChunk(chunk) => { + v6::ToClientTunnelMessageKind::ToClientRequestChunk(chunk) => { v3::ToClientTunnelMessageKind::ToClientRequestChunk(v3::ToClientRequestChunk { body: chunk.body, finish: chunk.finish, }) } - v5::ToClientTunnelMessageKind::ToClientRequestAbort => { + v6::ToClientTunnelMessageKind::ToClientRequestAbort => { v3::ToClientTunnelMessageKind::ToClientRequestAbort } - v5::ToClientTunnelMessageKind::ToClientWebSocketOpen(ws) => { + v6::ToClientTunnelMessageKind::ToClientWebSocketOpen(ws) => { v3::ToClientTunnelMessageKind::ToClientWebSocketOpen(v3::ToClientWebSocketOpen { actor_id: ws.actor_id, path: ws.path, headers: ws.headers, }) } - 
v5::ToClientTunnelMessageKind::ToClientWebSocketMessage(msg) => { + v6::ToClientTunnelMessageKind::ToClientWebSocketMessage(msg) => { v3::ToClientTunnelMessageKind::ToClientWebSocketMessage(v3::ToClientWebSocketMessage { data: msg.data, binary: msg.binary, }) } - v5::ToClientTunnelMessageKind::ToClientWebSocketClose(close) => { + v6::ToClientTunnelMessageKind::ToClientWebSocketClose(close) => { v3::ToClientTunnelMessageKind::ToClientWebSocketClose(v3::ToClientWebSocketClose { code: close.code, reason: close.reason, @@ -2781,106 +2958,6 @@ fn convert_to_client_tunnel_message_kind_mk2_to_mk1( } } -// Helper conversion functions for v4 <-> v5 - -fn convert_actor_intent_v4_to_v5(intent: v4::ActorIntent) -> v5::ActorIntent { - match intent { - v4::ActorIntent::ActorIntentSleep => v5::ActorIntent::ActorIntentSleep, - v4::ActorIntent::ActorIntentStop => v5::ActorIntent::ActorIntentStop, - } -} - -fn convert_actor_intent_v5_to_v4(intent: v5::ActorIntent) -> v4::ActorIntent { - match intent { - v5::ActorIntent::ActorIntentSleep => v4::ActorIntent::ActorIntentSleep, - v5::ActorIntent::ActorIntentStop => v4::ActorIntent::ActorIntentStop, - } -} - -fn convert_actor_state_v4_to_v5(state: v4::ActorState) -> v5::ActorState { - match state { - v4::ActorState::ActorStateRunning => v5::ActorState::ActorStateRunning, - v4::ActorState::ActorStateStopped(stopped) => { - v5::ActorState::ActorStateStopped(v5::ActorStateStopped { - code: convert_stop_code_v4_to_v5(stopped.code), - message: stopped.message, - }) - } - } -} - -fn convert_actor_state_v5_to_v4(state: v5::ActorState) -> v4::ActorState { - match state { - v5::ActorState::ActorStateRunning => v4::ActorState::ActorStateRunning, - v5::ActorState::ActorStateStopped(stopped) => { - v4::ActorState::ActorStateStopped(v4::ActorStateStopped { - code: convert_stop_code_v5_to_v4(stopped.code), - message: stopped.message, - }) - } - } -} - -fn convert_stop_code_v4_to_v5(code: v4::StopCode) -> v5::StopCode { - match code { - 
v4::StopCode::Ok => v5::StopCode::Ok, - v4::StopCode::Error => v5::StopCode::Error, - } -} - -fn convert_stop_code_v5_to_v4(code: v5::StopCode) -> v4::StopCode { - match code { - v5::StopCode::Ok => v4::StopCode::Ok, - v5::StopCode::Error => v4::StopCode::Error, - } -} - -fn convert_kv_request_data_v4_to_v5(data: v4::KvRequestData) -> v5::KvRequestData { - match data { - v4::KvRequestData::KvGetRequest(req) => { - v5::KvRequestData::KvGetRequest(v5::KvGetRequest { keys: req.keys }) - } - v4::KvRequestData::KvListRequest(req) => { - v5::KvRequestData::KvListRequest(v5::KvListRequest { - query: convert_kv_list_query_v4_to_v5(req.query), - reverse: req.reverse, - limit: req.limit, - }) - } - v4::KvRequestData::KvPutRequest(req) => v5::KvRequestData::KvPutRequest(v5::KvPutRequest { - keys: req.keys, - values: req.values, - }), - v4::KvRequestData::KvDeleteRequest(req) => { - v5::KvRequestData::KvDeleteRequest(v5::KvDeleteRequest { keys: req.keys }) - } - v4::KvRequestData::KvDropRequest => v5::KvRequestData::KvDropRequest, - } -} - -fn convert_kv_request_data_v5_to_v4(data: v5::KvRequestData) -> v4::KvRequestData { - match data { - v5::KvRequestData::KvGetRequest(req) => { - v4::KvRequestData::KvGetRequest(v4::KvGetRequest { keys: req.keys }) - } - v5::KvRequestData::KvListRequest(req) => { - v4::KvRequestData::KvListRequest(v4::KvListRequest { - query: convert_kv_list_query_v5_to_v4(req.query), - reverse: req.reverse, - limit: req.limit, - }) - } - v5::KvRequestData::KvPutRequest(req) => v4::KvRequestData::KvPutRequest(v4::KvPutRequest { - keys: req.keys, - values: req.values, - }), - v5::KvRequestData::KvDeleteRequest(req) => { - v4::KvRequestData::KvDeleteRequest(v4::KvDeleteRequest { keys: req.keys }) - } - v5::KvRequestData::KvDropRequest => v4::KvRequestData::KvDropRequest, - } -} - fn convert_kv_response_data_v4_to_v5(data: v4::KvResponseData) -> v5::KvResponseData { match data { v4::KvResponseData::KvErrorResponse(err) => { @@ -2951,38 +3028,6 @@ fn 
convert_kv_response_data_v5_to_v4(data: v5::KvResponseData) -> v4::KvResponse } } -fn convert_kv_list_query_v4_to_v5(query: v4::KvListQuery) -> v5::KvListQuery { - match query { - v4::KvListQuery::KvListAllQuery => v5::KvListQuery::KvListAllQuery, - v4::KvListQuery::KvListRangeQuery(range) => { - v5::KvListQuery::KvListRangeQuery(v5::KvListRangeQuery { - start: range.start, - end: range.end, - exclusive: range.exclusive, - }) - } - v4::KvListQuery::KvListPrefixQuery(prefix) => { - v5::KvListQuery::KvListPrefixQuery(v5::KvListPrefixQuery { key: prefix.key }) - } - } -} - -fn convert_kv_list_query_v5_to_v4(query: v5::KvListQuery) -> v4::KvListQuery { - match query { - v5::KvListQuery::KvListAllQuery => v4::KvListQuery::KvListAllQuery, - v5::KvListQuery::KvListRangeQuery(range) => { - v4::KvListQuery::KvListRangeQuery(v4::KvListRangeQuery { - start: range.start, - end: range.end, - exclusive: range.exclusive, - }) - } - v5::KvListQuery::KvListPrefixQuery(prefix) => { - v4::KvListQuery::KvListPrefixQuery(v4::KvListPrefixQuery { key: prefix.key }) - } - } -} - fn convert_kv_metadata_v4_to_v5(metadata: v4::KvMetadata) -> v5::KvMetadata { v5::KvMetadata { version: metadata.version, @@ -3087,12 +3132,234 @@ fn convert_to_client_tunnel_message_kind_v5_to_v4( } } -fn convert_to_server_tunnel_message_kind_v4_to_v5( +// MARK: v4 <-> v6 helpers (ToServer and ToRunner; v5 and v6 are structurally identical) + +fn convert_actor_intent_v4_to_v6(intent: v4::ActorIntent) -> v6::ActorIntent { + match intent { + v4::ActorIntent::ActorIntentSleep => v6::ActorIntent::ActorIntentSleep, + v4::ActorIntent::ActorIntentStop => v6::ActorIntent::ActorIntentStop, + } +} + +fn convert_actor_intent_v6_to_v4(intent: v6::ActorIntent) -> v4::ActorIntent { + match intent { + v6::ActorIntent::ActorIntentSleep => v4::ActorIntent::ActorIntentSleep, + v6::ActorIntent::ActorIntentStop => v4::ActorIntent::ActorIntentStop, + } +} + +fn convert_actor_state_v4_to_v6(state: v4::ActorState) -> v6::ActorState { + 
match state { + v4::ActorState::ActorStateRunning => v6::ActorState::ActorStateRunning, + v4::ActorState::ActorStateStopped(stopped) => { + v6::ActorState::ActorStateStopped(v6::ActorStateStopped { + code: convert_stop_code_v4_to_v6(stopped.code), + message: stopped.message, + }) + } + } +} + +fn convert_actor_state_v6_to_v4(state: v6::ActorState) -> v4::ActorState { + match state { + v6::ActorState::ActorStateRunning => v4::ActorState::ActorStateRunning, + v6::ActorState::ActorStateStopped(stopped) => { + v4::ActorState::ActorStateStopped(v4::ActorStateStopped { + code: convert_stop_code_v6_to_v4(stopped.code), + message: stopped.message, + }) + } + } +} + +fn convert_stop_code_v4_to_v6(code: v4::StopCode) -> v6::StopCode { + match code { + v4::StopCode::Ok => v6::StopCode::Ok, + v4::StopCode::Error => v6::StopCode::Error, + } +} + +fn convert_stop_code_v6_to_v4(code: v6::StopCode) -> v4::StopCode { + match code { + v6::StopCode::Ok => v4::StopCode::Ok, + v6::StopCode::Error => v4::StopCode::Error, + } +} + +fn convert_kv_request_data_v4_to_v6(data: v4::KvRequestData) -> v6::KvRequestData { + match data { + v4::KvRequestData::KvGetRequest(req) => { + v6::KvRequestData::KvGetRequest(v6::KvGetRequest { keys: req.keys }) + } + v4::KvRequestData::KvListRequest(req) => { + v6::KvRequestData::KvListRequest(v6::KvListRequest { + query: convert_kv_list_query_v4_to_v6(req.query), + reverse: req.reverse, + limit: req.limit, + }) + } + v4::KvRequestData::KvPutRequest(req) => v6::KvRequestData::KvPutRequest(v6::KvPutRequest { + keys: req.keys, + values: req.values, + }), + v4::KvRequestData::KvDeleteRequest(req) => { + v6::KvRequestData::KvDeleteRequest(v6::KvDeleteRequest { keys: req.keys }) + } + v4::KvRequestData::KvDropRequest => v6::KvRequestData::KvDropRequest, + } +} + +fn convert_kv_request_data_v6_to_v4(data: v6::KvRequestData) -> v4::KvRequestData { + match data { + v6::KvRequestData::KvGetRequest(req) => { + v4::KvRequestData::KvGetRequest(v4::KvGetRequest { keys: 
req.keys }) + } + v6::KvRequestData::KvListRequest(req) => { + v4::KvRequestData::KvListRequest(v4::KvListRequest { + query: convert_kv_list_query_v6_to_v4(req.query), + reverse: req.reverse, + limit: req.limit, + }) + } + v6::KvRequestData::KvPutRequest(req) => v4::KvRequestData::KvPutRequest(v4::KvPutRequest { + keys: req.keys, + values: req.values, + }), + v6::KvRequestData::KvDeleteRequest(req) => { + v4::KvRequestData::KvDeleteRequest(v4::KvDeleteRequest { keys: req.keys }) + } + v6::KvRequestData::KvDropRequest => v4::KvRequestData::KvDropRequest, + } +} + +fn convert_kv_list_query_v4_to_v6(query: v4::KvListQuery) -> v6::KvListQuery { + match query { + v4::KvListQuery::KvListAllQuery => v6::KvListQuery::KvListAllQuery, + v4::KvListQuery::KvListRangeQuery(range) => { + v6::KvListQuery::KvListRangeQuery(v6::KvListRangeQuery { + start: range.start, + end: range.end, + exclusive: range.exclusive, + }) + } + v4::KvListQuery::KvListPrefixQuery(prefix) => { + v6::KvListQuery::KvListPrefixQuery(v6::KvListPrefixQuery { key: prefix.key }) + } + } +} + +fn convert_kv_list_query_v6_to_v4(query: v6::KvListQuery) -> v4::KvListQuery { + match query { + v6::KvListQuery::KvListAllQuery => v4::KvListQuery::KvListAllQuery, + v6::KvListQuery::KvListRangeQuery(range) => { + v4::KvListQuery::KvListRangeQuery(v4::KvListRangeQuery { + start: range.start, + end: range.end, + exclusive: range.exclusive, + }) + } + v6::KvListQuery::KvListPrefixQuery(prefix) => { + v4::KvListQuery::KvListPrefixQuery(v4::KvListPrefixQuery { key: prefix.key }) + } + } +} + +fn convert_to_client_tunnel_message_kind_v4_to_v6( + kind: v4::ToClientTunnelMessageKind, +) -> v6::ToClientTunnelMessageKind { + match kind { + v4::ToClientTunnelMessageKind::ToClientRequestStart(req) => { + v6::ToClientTunnelMessageKind::ToClientRequestStart(v6::ToClientRequestStart { + actor_id: req.actor_id, + method: req.method, + path: req.path, + headers: req.headers, + body: req.body, + stream: req.stream, + }) + } + 
v4::ToClientTunnelMessageKind::ToClientRequestChunk(chunk) => { + v6::ToClientTunnelMessageKind::ToClientRequestChunk(v6::ToClientRequestChunk { + body: chunk.body, + finish: chunk.finish, + }) + } + v4::ToClientTunnelMessageKind::ToClientRequestAbort => { + v6::ToClientTunnelMessageKind::ToClientRequestAbort + } + v4::ToClientTunnelMessageKind::ToClientWebSocketOpen(ws) => { + v6::ToClientTunnelMessageKind::ToClientWebSocketOpen(v6::ToClientWebSocketOpen { + actor_id: ws.actor_id, + path: ws.path, + headers: ws.headers, + }) + } + v4::ToClientTunnelMessageKind::ToClientWebSocketMessage(msg) => { + v6::ToClientTunnelMessageKind::ToClientWebSocketMessage(v6::ToClientWebSocketMessage { + data: msg.data, + binary: msg.binary, + }) + } + v4::ToClientTunnelMessageKind::ToClientWebSocketClose(close) => { + v6::ToClientTunnelMessageKind::ToClientWebSocketClose(v6::ToClientWebSocketClose { + code: close.code, + reason: close.reason, + }) + } + } +} + +fn convert_to_client_tunnel_message_kind_v6_to_v4( + kind: v6::ToClientTunnelMessageKind, +) -> v4::ToClientTunnelMessageKind { + match kind { + v6::ToClientTunnelMessageKind::ToClientRequestStart(req) => { + v4::ToClientTunnelMessageKind::ToClientRequestStart(v4::ToClientRequestStart { + actor_id: req.actor_id, + method: req.method, + path: req.path, + headers: req.headers, + body: req.body, + stream: req.stream, + }) + } + v6::ToClientTunnelMessageKind::ToClientRequestChunk(chunk) => { + v4::ToClientTunnelMessageKind::ToClientRequestChunk(v4::ToClientRequestChunk { + body: chunk.body, + finish: chunk.finish, + }) + } + v6::ToClientTunnelMessageKind::ToClientRequestAbort => { + v4::ToClientTunnelMessageKind::ToClientRequestAbort + } + v6::ToClientTunnelMessageKind::ToClientWebSocketOpen(ws) => { + v4::ToClientTunnelMessageKind::ToClientWebSocketOpen(v4::ToClientWebSocketOpen { + actor_id: ws.actor_id, + path: ws.path, + headers: ws.headers, + }) + } + v6::ToClientTunnelMessageKind::ToClientWebSocketMessage(msg) => { + 
v4::ToClientTunnelMessageKind::ToClientWebSocketMessage(v4::ToClientWebSocketMessage { + data: msg.data, + binary: msg.binary, + }) + } + v6::ToClientTunnelMessageKind::ToClientWebSocketClose(close) => { + v4::ToClientTunnelMessageKind::ToClientWebSocketClose(v4::ToClientWebSocketClose { + code: close.code, + reason: close.reason, + }) + } + } +} + +fn convert_to_server_tunnel_message_kind_v4_to_v6( kind: v4::ToServerTunnelMessageKind, -) -> v5::ToServerTunnelMessageKind { +) -> v6::ToServerTunnelMessageKind { match kind { v4::ToServerTunnelMessageKind::ToServerResponseStart(resp) => { - v5::ToServerTunnelMessageKind::ToServerResponseStart(v5::ToServerResponseStart { + v6::ToServerTunnelMessageKind::ToServerResponseStart(v6::ToServerResponseStart { status: resp.status, headers: resp.headers, body: resp.body, @@ -3100,32 +3367,32 @@ fn convert_to_server_tunnel_message_kind_v4_to_v5( }) } v4::ToServerTunnelMessageKind::ToServerResponseChunk(chunk) => { - v5::ToServerTunnelMessageKind::ToServerResponseChunk(v5::ToServerResponseChunk { + v6::ToServerTunnelMessageKind::ToServerResponseChunk(v6::ToServerResponseChunk { body: chunk.body, finish: chunk.finish, }) } v4::ToServerTunnelMessageKind::ToServerResponseAbort => { - v5::ToServerTunnelMessageKind::ToServerResponseAbort + v6::ToServerTunnelMessageKind::ToServerResponseAbort } v4::ToServerTunnelMessageKind::ToServerWebSocketOpen(open) => { - v5::ToServerTunnelMessageKind::ToServerWebSocketOpen(v5::ToServerWebSocketOpen { + v6::ToServerTunnelMessageKind::ToServerWebSocketOpen(v6::ToServerWebSocketOpen { can_hibernate: open.can_hibernate, }) } v4::ToServerTunnelMessageKind::ToServerWebSocketMessage(msg) => { - v5::ToServerTunnelMessageKind::ToServerWebSocketMessage(v5::ToServerWebSocketMessage { + v6::ToServerTunnelMessageKind::ToServerWebSocketMessage(v6::ToServerWebSocketMessage { data: msg.data, binary: msg.binary, }) } v4::ToServerTunnelMessageKind::ToServerWebSocketMessageAck(ack) => { - 
v5::ToServerTunnelMessageKind::ToServerWebSocketMessageAck( - v5::ToServerWebSocketMessageAck { index: ack.index }, + v6::ToServerTunnelMessageKind::ToServerWebSocketMessageAck( + v6::ToServerWebSocketMessageAck { index: ack.index }, ) } v4::ToServerTunnelMessageKind::ToServerWebSocketClose(close) => { - v5::ToServerTunnelMessageKind::ToServerWebSocketClose(v5::ToServerWebSocketClose { + v6::ToServerTunnelMessageKind::ToServerWebSocketClose(v6::ToServerWebSocketClose { code: close.code, reason: close.reason, hibernate: close.hibernate, @@ -3134,11 +3401,11 @@ fn convert_to_server_tunnel_message_kind_v4_to_v5( } } -fn convert_to_server_tunnel_message_kind_v5_to_v4( - kind: v5::ToServerTunnelMessageKind, +fn convert_to_server_tunnel_message_kind_v6_to_v4( + kind: v6::ToServerTunnelMessageKind, ) -> Result { Ok(match kind { - v5::ToServerTunnelMessageKind::ToServerResponseStart(resp) => { + v6::ToServerTunnelMessageKind::ToServerResponseStart(resp) => { v4::ToServerTunnelMessageKind::ToServerResponseStart(v4::ToServerResponseStart { status: resp.status, headers: resp.headers, @@ -3146,32 +3413,32 @@ fn convert_to_server_tunnel_message_kind_v5_to_v4( stream: resp.stream, }) } - v5::ToServerTunnelMessageKind::ToServerResponseChunk(chunk) => { + v6::ToServerTunnelMessageKind::ToServerResponseChunk(chunk) => { v4::ToServerTunnelMessageKind::ToServerResponseChunk(v4::ToServerResponseChunk { body: chunk.body, finish: chunk.finish, }) } - v5::ToServerTunnelMessageKind::ToServerResponseAbort => { + v6::ToServerTunnelMessageKind::ToServerResponseAbort => { v4::ToServerTunnelMessageKind::ToServerResponseAbort } - v5::ToServerTunnelMessageKind::ToServerWebSocketOpen(open) => { + v6::ToServerTunnelMessageKind::ToServerWebSocketOpen(open) => { v4::ToServerTunnelMessageKind::ToServerWebSocketOpen(v4::ToServerWebSocketOpen { can_hibernate: open.can_hibernate, }) } - v5::ToServerTunnelMessageKind::ToServerWebSocketMessage(msg) => { + 
v6::ToServerTunnelMessageKind::ToServerWebSocketMessage(msg) => { v4::ToServerTunnelMessageKind::ToServerWebSocketMessage(v4::ToServerWebSocketMessage { data: msg.data, binary: msg.binary, }) } - v5::ToServerTunnelMessageKind::ToServerWebSocketMessageAck(ack) => { + v6::ToServerTunnelMessageKind::ToServerWebSocketMessageAck(ack) => { v4::ToServerTunnelMessageKind::ToServerWebSocketMessageAck( v4::ToServerWebSocketMessageAck { index: ack.index }, ) } - v5::ToServerTunnelMessageKind::ToServerWebSocketClose(close) => { + v6::ToServerTunnelMessageKind::ToServerWebSocketClose(close) => { v4::ToServerTunnelMessageKind::ToServerWebSocketClose(v4::ToServerWebSocketClose { code: close.code, reason: close.reason, @@ -3180,3 +3447,179 @@ fn convert_to_server_tunnel_message_kind_v5_to_v4( } }) } + +// MARK: v5 <-> v6 helpers (ToClient; only ProtocolMetadata changed, other types are identical) + +fn convert_kv_response_data_v5_to_v6(data: v5::KvResponseData) -> v6::KvResponseData { + match data { + v5::KvResponseData::KvErrorResponse(err) => { + v6::KvResponseData::KvErrorResponse(v6::KvErrorResponse { + message: err.message, + }) + } + v5::KvResponseData::KvGetResponse(resp) => { + v6::KvResponseData::KvGetResponse(v6::KvGetResponse { + keys: resp.keys, + values: resp.values, + metadata: resp + .metadata + .into_iter() + .map(convert_kv_metadata_v5_to_v6) + .collect(), + }) + } + v5::KvResponseData::KvListResponse(resp) => { + v6::KvResponseData::KvListResponse(v6::KvListResponse { + keys: resp.keys, + values: resp.values, + metadata: resp + .metadata + .into_iter() + .map(convert_kv_metadata_v5_to_v6) + .collect(), + }) + } + v5::KvResponseData::KvPutResponse => v6::KvResponseData::KvPutResponse, + v5::KvResponseData::KvDeleteResponse => v6::KvResponseData::KvDeleteResponse, + v5::KvResponseData::KvDropResponse => v6::KvResponseData::KvDropResponse, + } +} + +fn convert_kv_response_data_v6_to_v5(data: v6::KvResponseData) -> v5::KvResponseData { + match data { + 
v6::KvResponseData::KvErrorResponse(err) => { + v5::KvResponseData::KvErrorResponse(v5::KvErrorResponse { + message: err.message, + }) + } + v6::KvResponseData::KvGetResponse(resp) => { + v5::KvResponseData::KvGetResponse(v5::KvGetResponse { + keys: resp.keys, + values: resp.values, + metadata: resp + .metadata + .into_iter() + .map(convert_kv_metadata_v6_to_v5) + .collect(), + }) + } + v6::KvResponseData::KvListResponse(resp) => { + v5::KvResponseData::KvListResponse(v5::KvListResponse { + keys: resp.keys, + values: resp.values, + metadata: resp + .metadata + .into_iter() + .map(convert_kv_metadata_v6_to_v5) + .collect(), + }) + } + v6::KvResponseData::KvPutResponse => v5::KvResponseData::KvPutResponse, + v6::KvResponseData::KvDeleteResponse => v5::KvResponseData::KvDeleteResponse, + v6::KvResponseData::KvDropResponse => v5::KvResponseData::KvDropResponse, + } +} + +fn convert_kv_metadata_v5_to_v6(metadata: v5::KvMetadata) -> v6::KvMetadata { + v6::KvMetadata { + version: metadata.version, + update_ts: metadata.update_ts, + } +} + +fn convert_kv_metadata_v6_to_v5(metadata: v6::KvMetadata) -> v5::KvMetadata { + v5::KvMetadata { + version: metadata.version, + update_ts: metadata.update_ts, + } +} + +fn convert_to_client_tunnel_message_kind_v5_to_v6( + kind: v5::ToClientTunnelMessageKind, +) -> v6::ToClientTunnelMessageKind { + match kind { + v5::ToClientTunnelMessageKind::ToClientRequestStart(req) => { + v6::ToClientTunnelMessageKind::ToClientRequestStart(v6::ToClientRequestStart { + actor_id: req.actor_id, + method: req.method, + path: req.path, + headers: req.headers, + body: req.body, + stream: req.stream, + }) + } + v5::ToClientTunnelMessageKind::ToClientRequestChunk(chunk) => { + v6::ToClientTunnelMessageKind::ToClientRequestChunk(v6::ToClientRequestChunk { + body: chunk.body, + finish: chunk.finish, + }) + } + v5::ToClientTunnelMessageKind::ToClientRequestAbort => { + v6::ToClientTunnelMessageKind::ToClientRequestAbort + } + 
v5::ToClientTunnelMessageKind::ToClientWebSocketOpen(ws) => { + v6::ToClientTunnelMessageKind::ToClientWebSocketOpen(v6::ToClientWebSocketOpen { + actor_id: ws.actor_id, + path: ws.path, + headers: ws.headers, + }) + } + v5::ToClientTunnelMessageKind::ToClientWebSocketMessage(msg) => { + v6::ToClientTunnelMessageKind::ToClientWebSocketMessage(v6::ToClientWebSocketMessage { + data: msg.data, + binary: msg.binary, + }) + } + v5::ToClientTunnelMessageKind::ToClientWebSocketClose(close) => { + v6::ToClientTunnelMessageKind::ToClientWebSocketClose(v6::ToClientWebSocketClose { + code: close.code, + reason: close.reason, + }) + } + } +} + +fn convert_to_client_tunnel_message_kind_v6_to_v5( + kind: v6::ToClientTunnelMessageKind, +) -> v5::ToClientTunnelMessageKind { + match kind { + v6::ToClientTunnelMessageKind::ToClientRequestStart(req) => { + v5::ToClientTunnelMessageKind::ToClientRequestStart(v5::ToClientRequestStart { + actor_id: req.actor_id, + method: req.method, + path: req.path, + headers: req.headers, + body: req.body, + stream: req.stream, + }) + } + v6::ToClientTunnelMessageKind::ToClientRequestChunk(chunk) => { + v5::ToClientTunnelMessageKind::ToClientRequestChunk(v5::ToClientRequestChunk { + body: chunk.body, + finish: chunk.finish, + }) + } + v6::ToClientTunnelMessageKind::ToClientRequestAbort => { + v5::ToClientTunnelMessageKind::ToClientRequestAbort + } + v6::ToClientTunnelMessageKind::ToClientWebSocketOpen(ws) => { + v5::ToClientTunnelMessageKind::ToClientWebSocketOpen(v5::ToClientWebSocketOpen { + actor_id: ws.actor_id, + path: ws.path, + headers: ws.headers, + }) + } + v6::ToClientTunnelMessageKind::ToClientWebSocketMessage(msg) => { + v5::ToClientTunnelMessageKind::ToClientWebSocketMessage(v5::ToClientWebSocketMessage { + data: msg.data, + binary: msg.binary, + }) + } + v6::ToClientTunnelMessageKind::ToClientWebSocketClose(close) => { + v5::ToClientTunnelMessageKind::ToClientWebSocketClose(v5::ToClientWebSocketClose { + code: close.code, + reason: 
close.reason, + }) + } + } +} diff --git a/engine/sdks/schemas/runner-protocol/v6.bare b/engine/sdks/schemas/runner-protocol/v6.bare new file mode 100644 index 0000000000..d469f724fb --- /dev/null +++ b/engine/sdks/schemas/runner-protocol/v6.bare @@ -0,0 +1,432 @@ +# Runner Protocol v6 + +# MARK: Core Primitives + +type Id str +type Json str + +type GatewayId data[4] +type RequestId data[4] +type MessageIndex u16 + +# MARK: KV + +# Basic types +type KvKey data +type KvValue data +type KvMetadata struct { + version: data + updateTs: i64 +} + +# Query types +type KvListAllQuery void +type KvListRangeQuery struct { + start: KvKey + end: KvKey + exclusive: bool +} + +type KvListPrefixQuery struct { + key: KvKey +} + +type KvListQuery union { + KvListAllQuery | + KvListRangeQuery | + KvListPrefixQuery +} + +# Request types +type KvGetRequest struct { + keys: list +} + +type KvListRequest struct { + query: KvListQuery + reverse: optional + limit: optional +} + +type KvPutRequest struct { + keys: list + values: list +} + +type KvDeleteRequest struct { + keys: list +} + +type KvDropRequest void + +# Response types +type KvErrorResponse struct { + message: str +} + +type KvGetResponse struct { + keys: list + values: list + metadata: list +} + +type KvListResponse struct { + keys: list + values: list + metadata: list +} + +type KvPutResponse void +type KvDeleteResponse void +type KvDropResponse void + +# Request/Response unions +type KvRequestData union { + KvGetRequest | + KvListRequest | + KvPutRequest | + KvDeleteRequest | + KvDropRequest +} + +type KvResponseData union { + KvErrorResponse | + KvGetResponse | + KvListResponse | + KvPutResponse | + KvDeleteResponse | + KvDropResponse +} + +# MARK: Actor + +# Core +type StopCode enum { + OK + ERROR +} + +type ActorName struct { + metadata: Json +} + +type ActorConfig struct { + name: str + key: optional + createTs: i64 + input: optional +} + +type ActorCheckpoint struct { + actorId: Id + generation: u32 + index: i64 +} + +# 
Intent +type ActorIntentSleep void + +type ActorIntentStop void + +type ActorIntent union { + ActorIntentSleep | + ActorIntentStop +} + +# State +type ActorStateRunning void + +type ActorStateStopped struct { + code: StopCode + message: optional +} + +type ActorState union { + ActorStateRunning | + ActorStateStopped +} + +# MARK: Events +type EventActorIntent struct { + intent: ActorIntent +} + +type EventActorStateUpdate struct { + state: ActorState +} + +type EventActorSetAlarm struct { + alarmTs: optional +} + +type Event union { + EventActorIntent | + EventActorStateUpdate | + EventActorSetAlarm +} + +type EventWrapper struct { + checkpoint: ActorCheckpoint + inner: Event +} + +# MARK: Commands + +type HibernatingRequest struct { + gatewayId: GatewayId + requestId: RequestId +} + +type CommandStartActor struct { + config: ActorConfig + hibernatingRequests: list +} + +type CommandStopActor void + +type Command union { + CommandStartActor | + CommandStopActor +} + +type CommandWrapper struct { + checkpoint: ActorCheckpoint + inner: Command +} + +# We redeclare this so it's top-level +type ActorCommandKeyData union { + CommandStartActor | + CommandStopActor +} + +# MARK: Tunnel + +# Message ID + +type MessageId struct { + # Globally unique ID + gatewayId: GatewayId + # Unique ID to the gateway + requestId: RequestId + # Unique ID to the request + messageIndex: MessageIndex +} + + +# HTTP +type ToClientRequestStart struct { + actorId: Id + method: str + path: str + headers: map + body: optional + stream: bool +} + +type ToClientRequestChunk struct { + body: data + finish: bool +} + +type ToClientRequestAbort void + +type ToServerResponseStart struct { + status: u16 + headers: map + body: optional + stream: bool +} + +type ToServerResponseChunk struct { + body: data + finish: bool +} + +type ToServerResponseAbort void + +# WebSocket +type ToClientWebSocketOpen struct { + actorId: Id + path: str + headers: map +} + +type ToClientWebSocketMessage struct { + data: data 
+ binary: bool +} + +type ToClientWebSocketClose struct { + code: optional + reason: optional +} + +type ToServerWebSocketOpen struct { + canHibernate: bool +} + +type ToServerWebSocketMessage struct { + data: data + binary: bool +} + +type ToServerWebSocketMessageAck struct { + index: MessageIndex +} + +type ToServerWebSocketClose struct { + code: optional + reason: optional + hibernate: bool +} + +# To Server +type ToServerTunnelMessageKind union { + # HTTP + ToServerResponseStart | + ToServerResponseChunk | + ToServerResponseAbort | + + # WebSocket + ToServerWebSocketOpen | + ToServerWebSocketMessage | + ToServerWebSocketMessageAck | + ToServerWebSocketClose +} + +type ToServerTunnelMessage struct { + messageId: MessageId + messageKind: ToServerTunnelMessageKind +} + +# To Client +type ToClientTunnelMessageKind union { + # HTTP + ToClientRequestStart | + ToClientRequestChunk | + ToClientRequestAbort | + + # WebSocket + ToClientWebSocketOpen | + ToClientWebSocketMessage | + ToClientWebSocketClose +} + +type ToClientTunnelMessage struct { + messageId: MessageId + messageKind: ToClientTunnelMessageKind +} + +type ToClientPing struct { + ts: i64 +} + +# MARK: To Server +type ToServerInit struct { + name: str + version: u32 + totalSlots: u32 + prepopulateActorNames: optional> + metadata: optional +} + +type ToServerEvents list + +type ToServerAckCommands struct { + lastCommandCheckpoints: list +} + +type ToServerStopping void + +type ToServerPong struct { + ts: i64 +} + +type ToServerKvRequest struct { + actorId: Id + requestId: u32 + data: KvRequestData +} + +type ToServer union { + ToServerInit | + ToServerEvents | + ToServerAckCommands | + ToServerStopping | + ToServerPong | + ToServerKvRequest | + ToServerTunnelMessage +} + +# MARK: To Client +type ProtocolMetadata struct { + runnerLostThreshold: i64 + actorStopThreshold: i64 + serverlessDrainGracePeriod: optional +} + +type ToClientInit struct { + runnerId: Id + metadata: ProtocolMetadata +} + +type 
ToClientCommands list + +type ToClientAckEvents struct { + lastEventCheckpoints: list +} + +type ToClientKvResponse struct { + requestId: u32 + data: KvResponseData +} + +type ToClient union { + ToClientInit | + ToClientCommands | + ToClientAckEvents | + ToClientKvResponse | + ToClientTunnelMessage | + ToClientPing +} + +# MARK: To Runner +type ToRunnerPing struct { + gatewayId: GatewayId + requestId: RequestId + ts: i64 +} + +type ToRunnerClose void + +# We have to re-declare the entire union since BARE will not generate the +# ser/de for ToClient if it's not a top-level type +type ToRunner union { + ToRunnerPing | + ToRunnerClose | + ToClientCommands | + ToClientAckEvents | + ToClientTunnelMessage +} + +# MARK: To Gateway +type ToGatewayPong struct { + requestId: RequestId + ts: i64 +} + +type ToGateway union { + ToGatewayPong | + ToServerTunnelMessage +} + +# MARK: Serverless +type ToServerlessServerInit struct { + runnerId: Id + runnerProtocolVersion: u16 +} + +type ToServerlessServer union { + ToServerlessServerInit +} diff --git a/engine/sdks/typescript/runner-protocol/src/index.ts b/engine/sdks/typescript/runner-protocol/src/index.ts index b6b7619a84..0e391552ab 100644 --- a/engine/sdks/typescript/runner-protocol/src/index.ts +++ b/engine/sdks/typescript/runner-protocol/src/index.ts @@ -1743,16 +1743,22 @@ export function decodeToServer(bytes: Uint8Array): ToServer { */ export type ProtocolMetadata = { readonly runnerLostThreshold: i64 + readonly actorStopThreshold: i64 + readonly serverlessDrainGracePeriod: i64 | null } export function readProtocolMetadata(bc: bare.ByteCursor): ProtocolMetadata { return { runnerLostThreshold: bare.readI64(bc), + actorStopThreshold: bare.readI64(bc), + serverlessDrainGracePeriod: read7(bc), } } export function writeProtocolMetadata(bc: bare.ByteCursor, x: ProtocolMetadata): void { bare.writeI64(bc, x.runnerLostThreshold) + bare.writeI64(bc, x.actorStopThreshold) + write7(bc, x.serverlessDrainGracePeriod) } export type 
ToClientInit = { diff --git a/engine/sdks/typescript/runner/src/mod.ts b/engine/sdks/typescript/runner/src/mod.ts index 80834e10dc..405d28443f 100644 --- a/engine/sdks/typescript/runner/src/mod.ts +++ b/engine/sdks/typescript/runner/src/mod.ts @@ -208,8 +208,10 @@ export class Runner { #reconnectAttempt: number = 0; #reconnectTimeout?: NodeJS.Timeout; + // Protocol metadata + #protocolMetadata?: protocol.ProtocolMetadata; + // Runner lost threshold management - #runnerLostThreshold?: number; #runnerLostTimeout?: NodeJS.Timeout; // Event storage for resending @@ -839,14 +841,11 @@ export class Runner { this.#stopAllActors(); } - // Store the runner lost threshold from metadata - this.#runnerLostThreshold = init.metadata?.runnerLostThreshold - ? Number(init.metadata.runnerLostThreshold) - : undefined; + this.#protocolMetadata = init.metadata; this.log?.info({ msg: "received init", - runnerLostThreshold: this.#runnerLostThreshold, + protocolMetadata: this.#protocolMetadata, }); // Resend pending events @@ -888,27 +887,7 @@ export class Runner { }); if (!this.#shutdown) { - // Start runner lost timeout if we have a threshold and are not shutting down - if ( - !this.#runnerLostTimeout && - this.#runnerLostThreshold && - this.#runnerLostThreshold > 0 - ) { - this.log?.info({ - msg: "starting runner lost timeout", - seconds: this.#runnerLostThreshold / 1000, - }); - this.#runnerLostTimeout = setTimeout(() => { - try { - this.#handleLost(); - } catch (err) { - this.log?.error({ - msg: "error handling runner lost", - error: stringifyError(err), - }); - } - }, this.#runnerLostThreshold); - } + this.#startRunnerLostTimeout(); // Attempt to reconnect if not stopped this.#scheduleReconnect(); @@ -944,27 +923,7 @@ export class Runner { this.#ackInterval = undefined; } - // Start runner lost timeout if we have a threshold and are not shutting down - if ( - !this.#runnerLostTimeout && - this.#runnerLostThreshold && - this.#runnerLostThreshold > 0 - ) { - this.log?.info({ - msg: 
"starting runner lost timeout", - seconds: this.#runnerLostThreshold / 1000, - }); - this.#runnerLostTimeout = setTimeout(() => { - try { - this.#handleLost(); - } catch (err) { - this.log?.error({ - msg: "error handling runner lost", - error: stringifyError(err), - }); - } - }, this.#runnerLostThreshold); - } + this.#startRunnerLostTimeout(); // Attempt to reconnect if not stopped this.#scheduleReconnect(); @@ -976,6 +935,30 @@ export class Runner { }); } + #startRunnerLostTimeout() { + // Start runner lost timeout if we have a threshold and are not shutting down + if ( + !this.#runnerLostTimeout && + this.#protocolMetadata && + this.#protocolMetadata.runnerLostThreshold > 0 + ) { + this.log?.info({ + msg: "starting runner lost timeout", + seconds: this.#protocolMetadata.runnerLostThreshold / 1000n, + }); + this.#runnerLostTimeout = setTimeout(() => { + try { + this.#handleLost(); + } catch (err) { + this.log?.error({ + msg: "error handling runner lost", + error: stringifyError(err), + }); + } + }, Number(this.#protocolMetadata.runnerLostThreshold)); + } + } + #handleCommands(commands: protocol.ToClientCommands) { this.log?.info({ msg: "received commands", @@ -1859,4 +1842,8 @@ export class Runner { //this.#log?.log(`Cleaned up ${toDelete.length} expired KV requests`); } } + + getProtocolMetadata(): protocol.ProtocolMetadata | undefined { + return this.#protocolMetadata; + } } diff --git a/engine/sdks/typescript/runner/src/tunnel.ts b/engine/sdks/typescript/runner/src/tunnel.ts index 0175d15454..0894eaa3e0 100644 --- a/engine/sdks/typescript/runner/src/tunnel.ts +++ b/engine/sdks/typescript/runner/src/tunnel.ts @@ -15,7 +15,7 @@ import { stringifyToClientTunnelMessageKind, stringifyToServerTunnelMessageKind, } from "./stringify"; -import { arraysEqual, idToStr, stringifyError, unreachable } from "./utils"; +import { arraysEqual, idToStr, MAX_BODY_SIZE, stringifyError, unreachable } from "./utils"; import { HIBERNATABLE_SYMBOL, WebSocketTunnelAdapter, @@ -855,6 
+855,10 @@ export class Tunnel { // Read the body first to get the actual content const body = response.body ? await response.arrayBuffer() : null; + if (body && body.byteLength > MAX_BODY_SIZE) { + throw new Error("Response body too large"); + } + // Convert headers to map and add Content-Length if not present const headers = new Map(); response.headers.forEach((value, key) => { @@ -1079,7 +1083,7 @@ export class Tunnel { }); if (clientMessageIndex < 0 || clientMessageIndex > 65535) - throw new Error("invalid websocket ack index"); + throw new Error("Invalid websocket ack index"); // Get the actor to find the gatewayId // @@ -1157,7 +1161,7 @@ function buildRequestForWebSocket( }; if (!path.startsWith("/")) { - throw new Error("path must start with leading slash"); + throw new Error("Path must start with leading slash"); } const request = new Request(`http://actor${path}`, { diff --git a/engine/sdks/typescript/runner/src/utils.ts b/engine/sdks/typescript/runner/src/utils.ts index c21c68ed64..b01d06be85 100644 --- a/engine/sdks/typescript/runner/src/utils.ts +++ b/engine/sdks/typescript/runner/src/utils.ts @@ -1,5 +1,8 @@ import { logger } from "./log"; +// 20MiB. 
Keep in sync with MAX_BODY_SIZE from engine/packages/guard-core/src/proxy_service.rs +export const MAX_BODY_SIZE = 20 * 1024 * 1024; + export function unreachable(x: never): never { throw `Unreachable: ${x}`; } diff --git a/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts b/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts index 42eaaf137d..75f1f6fbb8 100644 --- a/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts +++ b/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts @@ -1,7 +1,7 @@ import type { Logger } from "pino"; import { VirtualWebSocket, type UniversalWebSocket, type RivetMessageEvent } from "@rivetkit/virtual-websocket"; import type { Tunnel } from "./tunnel"; -import { wrappingAddU16, wrappingLteU16, wrappingSubU16 } from "./utils"; +import { MAX_BODY_SIZE, wrappingAddU16, wrappingLteU16, wrappingSubU16 } from "./utils"; export const HIBERNATABLE_SYMBOL = Symbol("hibernatable"); @@ -70,11 +70,20 @@ export class WebSocketTunnelAdapter { let messageData: string | ArrayBuffer; if (typeof data === "string") { + const encoder = new TextEncoder(); + if (encoder.encode(data).byteLength > MAX_BODY_SIZE) { + throw new Error("WebSocket message too large"); + } + messageData = data; } else if (data instanceof ArrayBuffer) { + if (data.byteLength > MAX_BODY_SIZE) throw new Error("WebSocket message too large"); + isBinary = true; messageData = data; } else if (ArrayBuffer.isView(data)) { + if (data.byteLength > MAX_BODY_SIZE) throw new Error("WebSocket message too large"); + isBinary = true; const view = data; const buffer = view.buffer instanceof SharedArrayBuffer diff --git a/engine/sdks/typescript/test-runner/src/index.ts b/engine/sdks/typescript/test-runner/src/index.ts index de576b2eba..938b14f1fe 100644 --- a/engine/sdks/typescript/test-runner/src/index.ts +++ b/engine/sdks/typescript/test-runner/src/index.ts @@ -70,6 +70,10 @@ app.get("/has-actor", async (c) => { return c.text("ok"); }); +app.get("/health", (c) 
=> { + return c.text("ok"); +}); + app.get("/shutdown", async (c) => { await runner?.shutdown(true); return c.text("ok"); diff --git a/k8s/engine/07-nats-configmap.yaml b/k8s/engine/07-nats-configmap.yaml index 576a2c1d6d..37b98e58fe 100644 --- a/k8s/engine/07-nats-configmap.yaml +++ b/k8s/engine/07-nats-configmap.yaml @@ -9,6 +9,10 @@ data: # Clustering configuration cluster { + # Static cluster name. Must be identical across all pods; without it NATS + # generates a random name on every startup and pods reject each other. + name: rivet-engine + # Cluster port for peer-to-peer communication port: 6222 diff --git a/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/conn-error-serialization.ts b/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/conn-error-serialization.ts new file mode 100644 index 0000000000..900943bbc8 --- /dev/null +++ b/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/conn-error-serialization.ts @@ -0,0 +1,27 @@ +import { actor } from "rivetkit"; +import { ActorError } from "@/actor/errors"; + +// Custom error that will be thrown in createConnState +class CustomConnectionError extends ActorError { + constructor(message: string) { + super("connection", "custom_error", message, { public: true }); + } +} + +/** + * Actor that throws a custom error in createConnState to test error serialization + */ +export const connErrorSerializationActor = actor({ + state: { + value: 0, + }, + createConnState: (_c, params: { shouldThrow?: boolean }) => { + if (params.shouldThrow) { + throw new CustomConnectionError("Test error from createConnState"); + } + return { initialized: true }; + }, + actions: { + getValue: (c) => c.state.value, + }, +}); diff --git a/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/registry.ts b/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/registry.ts index d85c7e881e..4b9b840a39 100644 --- 
a/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/registry.ts +++ b/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/registry.ts @@ -75,6 +75,8 @@ import { workflowSleepActor, workflowStopTeardownActor, } from "./workflow"; +import { startStopRaceActor, lifecycleObserver } from "./start-stop-race"; +import { connErrorSerializationActor } from "./conn-error-serialization"; // Consolidated setup with all actors export const registry = setup({ @@ -177,5 +179,10 @@ export const registry = setup({ // From access-control.ts accessControlActor, accessControlNoQueuesActor, + // From start-stop-race.ts + startStopRaceActor, + lifecycleObserver, + // From conn-error-serialization.ts + connErrorSerializationActor, }, }); diff --git a/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/start-stop-race.ts b/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/start-stop-race.ts new file mode 100644 index 0000000000..9fad609233 --- /dev/null +++ b/rivetkit-typescript/packages/rivetkit/fixtures/driver-test-suite/start-stop-race.ts @@ -0,0 +1,71 @@ +import { actor } from "rivetkit"; + +/** + * Actor designed to test start/stop race conditions. + * Has a slow initialization to make race conditions easier to trigger. 
+ */ +export const startStopRaceActor = actor({ + state: { + initialized: false, + startTime: 0, + destroyCalled: false, + startCompleted: false, + }, + onWake: async (c) => { + c.state.startTime = Date.now(); + + // Simulate slow initialization to create window for race condition + await new Promise((resolve) => setTimeout(resolve, 100)); + + c.state.initialized = true; + c.state.startCompleted = true; + }, + onDestroy: (c) => { + c.state.destroyCalled = true; + // Don't save state here - the actor framework will save it automatically + }, + actions: { + getState: (c) => { + return { + initialized: c.state.initialized, + startTime: c.state.startTime, + destroyCalled: c.state.destroyCalled, + startCompleted: c.state.startCompleted, + }; + }, + ping: (c) => { + return "pong"; + }, + destroy: (c) => { + c.destroy(); + }, + }, +}); + +/** + * Observer actor to track lifecycle events from other actors + */ +export const lifecycleObserver = actor({ + state: { + events: [] as Array<{ + actorKey: string; + event: string; + timestamp: number; + }>, + }, + actions: { + recordEvent: (c, params: { actorKey: string; event: string }) => { + c.state.events.push({ + actorKey: params.actorKey, + event: params.event, + timestamp: Date.now(), + }); + }, + getEvents: (c) => { + return c.state.events; + }, + clearEvents: (c) => { + c.state.events = []; + }, + }, +}); diff --git a/rivetkit-typescript/packages/rivetkit/src/actor/config.ts b/rivetkit-typescript/packages/rivetkit/src/actor/config.ts index 06a1b77ca5..ae64722afe 100644 --- a/rivetkit-typescript/packages/rivetkit/src/actor/config.ts +++ b/rivetkit-typescript/packages/rivetkit/src/actor/config.ts @@ -208,9 +208,7 @@ export const ActorConfigSchema = z createVarsTimeout: z.number().positive().default(5000), createConnStateTimeout: z.number().positive().default(5000), onConnectTimeout: z.number().positive().default(5000), - // This must be less than engine config > pegboard.actor_stop_threshold onSleepTimeout: 
z.number().positive().default(5000), - // This must be less than engine config > pegboard.actor_stop_threshold onDestroyTimeout: z.number().positive().default(5000), stateSaveInterval: z.number().positive().default(10_000), actionTimeout: z.number().positive().default(60_000), diff --git a/rivetkit-typescript/packages/rivetkit/src/actor/instance/mod.ts b/rivetkit-typescript/packages/rivetkit/src/actor/instance/mod.ts index 197cc6948d..41d0d048f0 100644 --- a/rivetkit-typescript/packages/rivetkit/src/actor/instance/mod.ts +++ b/rivetkit-typescript/packages/rivetkit/src/actor/instance/mod.ts @@ -195,6 +195,19 @@ export class ActorInstance< // MARK: - Tracing #traces!: Traces; + // MARK: - Driver Overrides + /** + * Per-instance config option overrides applied by the driver after creation. + * When set, the effective option value is the minimum of the base config + * value and the override value. + */ + overrides: { + onSleepTimeout?: number; + onDestroyTimeout?: number; + runStopTimeout?: number; + waitUntilTimeout?: number; + } = {}; + // MARK: - Constructor constructor(config: ActorConfig) { this.#config = config; @@ -495,7 +508,7 @@ export class ActorInstance< } catch { } // Wait for run handler to complete - await this.#waitForRunHandler(this.#config.options.runStopTimeout); + await this.#waitForRunHandler(this.overrides.runStopTimeout !== undefined ? Math.min(this.#config.options.runStopTimeout, this.overrides.runStopTimeout) : this.#config.options.runStopTimeout); // Call onStop lifecycle if (mode === "sleep") { @@ -511,7 +524,7 @@ export class ActorInstance< // Wait for background tasks await this.#waitBackgroundPromises( - this.#config.options.waitUntilTimeout, + this.overrides.waitUntilTimeout !== undefined ? 
Math.min(this.#config.options.waitUntilTimeout, this.overrides.waitUntilTimeout) : this.#config.options.waitUntilTimeout, ); // Clear timeouts and save state @@ -1265,7 +1278,7 @@ export class ActorInstance< if (result instanceof Promise) { await deadline( result, - this.#config.options.onSleepTimeout, + this.overrides.onSleepTimeout !== undefined ? Math.min(this.#config.options.onSleepTimeout, this.overrides.onSleepTimeout) : this.#config.options.onSleepTimeout, ); } }, @@ -1297,7 +1310,7 @@ export class ActorInstance< if (result instanceof Promise) { await deadline( result, - this.#config.options.onDestroyTimeout, + this.overrides.onDestroyTimeout !== undefined ? Math.min(this.#config.options.onDestroyTimeout, this.overrides.onDestroyTimeout) : this.#config.options.onDestroyTimeout, ); } }, diff --git a/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/mod.ts b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/mod.ts index cf3590672e..98db320e1a 100644 --- a/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/mod.ts +++ b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/mod.ts @@ -17,6 +17,7 @@ import { runActorConnTests } from "./tests/actor-conn"; import { runActorConnHibernationTests } from "./tests/actor-conn-hibernation"; import { runActorConnStateTests } from "./tests/actor-conn-state"; import { runActorDbTests } from "./tests/actor-db"; +import { runConnErrorSerializationTests } from "./tests/conn-error-serialization"; import { runActorDestroyTests } from "./tests/actor-destroy"; import { runActorDriverTests } from "./tests/actor-driver"; import { runActorErrorHandlingTests } from "./tests/actor-error-handling"; @@ -111,6 +112,8 @@ export function runDriverTests( runActorConnHibernationTests(driverTestConfig); + runConnErrorSerializationTests(driverTestConfig); + runActorDbTests(driverTestConfig); runActorDestroyTests(driverTestConfig); diff --git 
a/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/actor-driver.ts b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/actor-driver.ts index 438348285a..efa2d96cd9 100644 --- a/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/actor-driver.ts +++ b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/actor-driver.ts @@ -1,5 +1,6 @@ import { describe } from "vitest"; import type { DriverTestConfig } from "../mod"; +import { runActorLifecycleTests } from "./actor-lifecycle"; import { runActorScheduleTests } from "./actor-schedule"; import { runActorSleepTests } from "./actor-sleep"; import { runActorStateTests } from "./actor-state"; @@ -14,5 +15,8 @@ export function runActorDriverTests(driverTestConfig: DriverTestConfig) { // Run actor sleep tests runActorSleepTests(driverTestConfig); + + // Run actor lifecycle tests + runActorLifecycleTests(driverTestConfig); }); } diff --git a/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/actor-lifecycle.ts b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/actor-lifecycle.ts new file mode 100644 index 0000000000..7333cfa977 --- /dev/null +++ b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/actor-lifecycle.ts @@ -0,0 +1,157 @@ +import { describe, expect, test } from "vitest"; +import type { DriverTestConfig } from "../mod"; +import { setupDriverTest } from "../utils"; + +export function runActorLifecycleTests(driverTestConfig: DriverTestConfig) { + describe("Actor Lifecycle Tests", () => { + test("actor stop during start waits for start to complete", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + const actorKey = `test-stop-during-start-${Date.now()}`; + + // Create actor - this starts the actor + const actor = client.startStopRaceActor.getOrCreate([actorKey]); + + // Immediately try to call an action and then destroy + // This creates a race where the actor might not be fully 
started yet + const pingPromise = actor.ping(); + + // Get actor ID + const actorId = await actor.resolve(); + + // Destroy immediately while start might still be in progress + await actor.destroy(); + + // The ping should still complete successfully because destroy waits for start + const result = await pingPromise; + expect(result).toBe("pong"); + + // Verify actor was actually destroyed + let destroyed = false; + try { + await client.startStopRaceActor.getForId(actorId).ping(); + } catch (err: any) { + destroyed = true; + expect(err.group).toBe("actor"); + expect(err.code).toBe("not_found"); + } + expect(destroyed).toBe(true); + }); + + test("actor stop before actor instantiation completes cleans up handler", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + const actorKey = `test-stop-before-instantiation-${Date.now()}`; + + // Create multiple actors rapidly to increase chance of race + const actors = Array.from({ length: 5 }, (_, i) => + client.startStopRaceActor.getOrCreate([ + `${actorKey}-${i}`, + ]), + ); + + // Resolve all actor IDs (this triggers start) + const ids = await Promise.all(actors.map((a) => a.resolve())); + + // Immediately destroy all actors + await Promise.all(actors.map((a) => a.destroy())); + + // Verify all actors were cleaned up + for (const id of ids) { + let destroyed = false; + try { + await client.startStopRaceActor.getForId(id).ping(); + } catch (err: any) { + destroyed = true; + expect(err.group).toBe("actor"); + expect(err.code).toBe("not_found"); + } + expect(destroyed, `actor ${id} should be destroyed`).toBe( + true, + ); + } + }); + + test("onBeforeActorStart completes before stop proceeds", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + const actorKey = `test-before-actor-start-${Date.now()}`; + + // Create actor + const actor = client.startStopRaceActor.getOrCreate([actorKey]); + + // Call action to ensure actor is starting + const statePromise = 
actor.getState(); + + // Destroy immediately + await actor.destroy(); + + // State should be initialized because onBeforeActorStart must complete + const state = await statePromise; + expect(state.initialized).toBe(true); + expect(state.startCompleted).toBe(true); + }); + + test("multiple rapid create/destroy cycles handle race correctly", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + // Perform multiple rapid create/destroy cycles + for (let i = 0; i < 10; i++) { + const actorKey = `test-rapid-cycle-${Date.now()}-${i}`; + const actor = client.startStopRaceActor.getOrCreate([ + actorKey, + ]); + + // Trigger start + const resolvePromise = actor.resolve(); + + // Immediately destroy + const destroyPromise = actor.destroy(); + + // Both should complete without errors + await Promise.all([resolvePromise, destroyPromise]); + } + + // If we get here without errors, the race condition is handled correctly + expect(true).toBe(true); + }); + + test("actor stop called with no actor instance cleans up handler", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + const actorKey = `test-cleanup-no-instance-${Date.now()}`; + + // Create and immediately destroy + const actor = client.startStopRaceActor.getOrCreate([actorKey]); + const id = await actor.resolve(); + await actor.destroy(); + + // Try to recreate with same key - should work without issues + const newActor = client.startStopRaceActor.getOrCreate([ + actorKey, + ]); + const result = await newActor.ping(); + expect(result).toBe("pong"); + + // Clean up + await newActor.destroy(); + }); + + test("onDestroy is called even when actor is destroyed during start", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + const actorKey = `test-ondestroy-during-start-${Date.now()}`; + + // Create actor + const actor = client.startStopRaceActor.getOrCreate([actorKey]); + + // Start and immediately destroy + const statePromise = 
actor.getState(); + await actor.destroy(); + + // Verify onDestroy was called (requires actor to be started) + const state = await statePromise; + expect(state.destroyCalled).toBe(true); + }); + }); +} diff --git a/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/conn-error-serialization.ts b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/conn-error-serialization.ts new file mode 100644 index 0000000000..e5ccf1ef23 --- /dev/null +++ b/rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/conn-error-serialization.ts @@ -0,0 +1,64 @@ +import { describe, expect, test } from "vitest"; +import type { DriverTestConfig } from "../mod"; +import { setupDriverTest } from "../utils"; + +export function runConnErrorSerializationTests(driverTestConfig: DriverTestConfig) { + describe("Connection Error Serialization Tests", () => { + test("error thrown in createConnState preserves group and code through WebSocket serialization", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + const actorKey = `test-error-serialization-${Date.now()}`; + + // Create actor handle with params that will trigger error in createConnState + const actor = client.connErrorSerializationActor.getOrCreate( + [actorKey], + { params: { shouldThrow: true } }, + ); + + // Try to connect, which will trigger error in createConnState + const conn = actor.connect(); + + // Wait for connection to fail + let caughtError: any; + try { + // Try to call an action, which should fail because connection couldn't be established + await conn.getValue(); + } catch (err) { + caughtError = err; + } + + // Verify the error was caught + expect(caughtError).toBeDefined(); + + // Verify the error has the correct group and code from the original error + // Original error: new CustomConnectionError("...") with group="connection", code="custom_error" + expect(caughtError.group).toBe("connection"); + expect(caughtError.code).toBe("custom_error"); + + // 
Clean up + await conn.dispose(); + }); + + test("successful createConnState does not throw error", async (c) => { + const { client } = await setupDriverTest(c, driverTestConfig); + + const actorKey = `test-no-error-${Date.now()}`; + + // Create actor handle with params that will NOT trigger error + const actor = client.connErrorSerializationActor.getOrCreate( + [actorKey], + { params: { shouldThrow: false } }, + ); + + // Connect without triggering error + const conn = actor.connect(); + + // This should succeed + const value = await conn.getValue(); + expect(value).toBe(0); + + // Clean up + await conn.dispose(); + }); + }); +} diff --git a/rivetkit-typescript/packages/rivetkit/src/drivers/engine/actor-driver.ts b/rivetkit-typescript/packages/rivetkit/src/drivers/engine/actor-driver.ts index fb3b50f133..4adb8959fa 100644 --- a/rivetkit-typescript/packages/rivetkit/src/drivers/engine/actor-driver.ts +++ b/rivetkit-typescript/packages/rivetkit/src/drivers/engine/actor-driver.ts @@ -156,7 +156,7 @@ export class EngineActorDriver implements ActorDriver { onConnected: () => { this.#runnerStarted.resolve(undefined); }, - onDisconnected: (_code, _reason) => {}, + onDisconnected: (_code, _reason) => { }, onShutdown: () => { this.#runnerStopped.resolve(undefined); this.#isRunnerStopped = true; @@ -395,7 +395,7 @@ export class EngineActorDriver implements ActorDriver { async serverlessHandleStart(c: HonoContext): Promise { return streamSSE(c, async (stream) => { // NOTE: onAbort does not work reliably - stream.onAbort(() => {}); + stream.onAbort(() => { }); c.req.raw.signal.addEventListener("abort", () => { logger().debug("SSE aborted, shutting down runner"); @@ -497,8 +497,28 @@ export class EngineActorDriver implements ActorDriver { // Create actor instance const definition = lookupInRegistry(this.#config, actorConfig.name); + handler.actor = await definition.instantiate(); + // Apply protocol limits as per-instance overrides without mutating the shared definition + const 
protocolMetadata = this.#runner.getProtocolMetadata(); + if (protocolMetadata) { + logger().debug({ + msg: "applying config limits from protocol", + protocolMetadata, + }); + + const stopThresholdMax = Math.max(Number(protocolMetadata.actorStopThreshold) - 1000, 0); + handler.actor.overrides.onSleepTimeout = stopThresholdMax; + handler.actor.overrides.onDestroyTimeout = stopThresholdMax; + + if (protocolMetadata.serverlessDrainGracePeriod) { + const drainMax = Math.max(Number(protocolMetadata.serverlessDrainGracePeriod) - 1000, 0); + handler.actor.overrides.runStopTimeout = drainMax; + handler.actor.overrides.waitUntilTimeout = drainMax; + } + } + // Start actor await handler.actor.start( this, @@ -514,9 +534,9 @@ export class EngineActorDriver implements ActorDriver { const error = innerError instanceof Error ? new Error( - `Failed to start actor ${actorId}: ${innerError.message}`, - { cause: innerError }, - ) + `Failed to start actor ${actorId}: ${innerError.message}`, + { cause: innerError }, + ) : new Error(`Failed to start actor ${actorId}: ${String(innerError)}`); handler.actor = undefined; handler.actorStartError = error; @@ -559,15 +579,26 @@ export class EngineActorDriver implements ActorDriver { this.#actorStopIntent.delete(actorId); const handler = this.#actors.get(actorId); - if (handler?.actorStartPromise) { - const startError = - handler.actorStartError ?? 
- new Error(`Actor ${actorId} stopped before start completed`); - handler.actorStartError = startError; - handler.actorStartPromise.reject(startError); - handler.actorStartPromise = undefined; + if (!handler) { + logger().debug({ msg: "no runner actor handler to stop", actorId, reason }); + return; } - if (handler?.actor) { + + if (handler.actorStartPromise) { + try { + logger().debug({ msg: "runner actor stopping before it started, waiting", actorId, generation }); + await handler.actorStartPromise.promise; + } catch (err) { + // Start failed, but we still want to clean up the handler + logger().debug({ + msg: "actor start failed during stop, cleaning up handler", + actorId, + err: stringifyError(err), + }); + } + } + + if (handler.actor) { try { await handler.actor.onStop(reason); } catch (err) { @@ -577,7 +608,8 @@ export class EngineActorDriver implements ActorDriver { }); } } - if (handler) this.#actors.delete(actorId); + + this.#actors.delete(actorId); logger().debug({ msg: "runner actor stopped", actorId, reason }); } @@ -762,7 +794,7 @@ export class EngineActorDriver implements ActorDriver { entry.bufferedMessageSize >= CONN_BUFFERED_MESSAGE_SIZE_THRESHOLD ) { - // Reset buffered message size immeidatley (instead + // Reset buffered message size immediately (instead // of waiting for onAfterPersistConn) since we may // receive more messages before onAfterPersistConn // is called, which would called saveState diff --git a/scripts/misc/endian-converter.ts b/scripts/debug/endian-converter.ts similarity index 100% rename from scripts/misc/endian-converter.ts rename to scripts/debug/endian-converter.ts diff --git a/scripts/tests/load-test/actor-lifecycle.js b/scripts/tests/load-test/actor-lifecycle.js index c9e67fa09d..b26e81f52a 100644 --- a/scripts/tests/load-test/actor-lifecycle.js +++ b/scripts/tests/load-test/actor-lifecycle.js @@ -87,8 +87,6 @@ export const options = { 'actor_destroy_success': ['rate>0.95'], 'actor_ping_success': ['rate>0.95'], 
'websocket_success': ['rate>0.90'], - 'http_req_duration': ['p(95)<5000', 'p(99)<10000'], - // 'actor_create_duration': ['p(95)<3000'], }, noConnectionReuse: false, userAgent: 'k6-actor-lifecycle-test',