Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .github/perf-thresholds.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"_comment": "Per-test perf thresholds. Schema: <test_name>: { <metric_name>: <max value> }. A run is a regression iff metric value > max. Units match the metric (e.g. *_total_time_ns is in nanoseconds; 1 s = 1e9 ns).",

"SvDsoStoreIngestionPerformanceTest": {
"splice_perf_ingestion_total_time_ns": 60000000000
},

"ScanStoreIngestionPerformanceTest": {
"splice_perf_ingestion_total_time_ns": 600000000000
},

"UpdateHistoryIngestionPerformanceTest": {
"splice_perf_ingestion_total_time_ns": 1800000000000
},

"UpdateHistoryReadPerformanceTest-getUpdate": {
"splice_perf_read_total_time_ns": 5000000000
},

"UpdateHistoryReadPerformanceTest-encodeUpdate": {
"splice_perf_read_total_time_ns": 5000000000
}
}
15 changes: 15 additions & 0 deletions .github/perf-thresholds.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Per-test performance thresholds.
# Layout: <test_name> -> <metric_name> -> max.
# The *_total_time_ns metrics are expressed in nanoseconds (1 s = 1e9 ns); the
# trailing comment on each limit gives the same value in seconds.
# NOTE(review): presumably read by scripts/performance/check_perf_thresholds.py,
# with a metric value above `max` counting as a breach - confirm against the script.
# NOTE(review): .github/perf-thresholds.json carries the same limits in a flatter
# <metric_name>: <max value> schema; keep the two in sync or drop one of them.
SvDsoStoreIngestionPerformanceTest:
  splice_perf_ingestion_total_time_ns:
    max: 60000000000  # 60 s
ScanStoreIngestionPerformanceTest:
  splice_perf_ingestion_total_time_ns:
    max: 600000000000  # 600 s
UpdateHistoryIngestionPerformanceTest:
  splice_perf_ingestion_total_time_ns:
    max: 1800000000000  # 1800 s (30 min)
UpdateHistoryReadPerformanceTest-getUpdate:
  splice_perf_read_total_time_ns:
    max: 5000000000  # 5 s
UpdateHistoryReadPerformanceTest-encodeUpdate:
  splice_perf_read_total_time_ns:
    max: 5000000000  # 5 s
42 changes: 38 additions & 4 deletions .github/workflows/performance_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,31 @@ jobs:
echo "Downloading updates from mainnet for performance tests"
curl -sSL --fail-with-body "https://storage.googleapis.com/mainnet-history-dumps/mainnetupdates.json" -o /tmp/mainnetupdates.json
ls -lah /tmp/mainnetupdates.json
cmd: '"apps-app / Test / runMain org.lfdecentralizedtrust.splice.performance.SplicePerf run -t DbSvDsoStore -c ./apps/app/src/test/resources/performance/tests.conf -d /tmp/mainnetupdates.json" "apps-app / Test / runMain org.lfdecentralizedtrust.splice.performance.SplicePerf run -t DbScanStore -c ./apps/app/src/test/resources/performance/tests.conf -d /tmp/mainnetupdates.json" "apps-app / Test / runMain org.lfdecentralizedtrust.splice.performance.SplicePerf run -t UpdateHistory -c ./apps/app/src/test/resources/performance/tests.conf -d /tmp/mainnetupdates.json"'
cmd: '"apps-app / Test / runMain org.lfdecentralizedtrust.splice.performance.SplicePerf run -t DbSvDsoStore -c ./apps/app/src/test/resources/performance/tests.conf -d /tmp/mainnetupdates.json"'
#cmd: '"apps-app / Test / runMain org.lfdecentralizedtrust.splice.performance.SplicePerf run -t DbSvDsoStore -c ./apps/app/src/test/resources/performance/tests.conf -d /tmp/mainnetupdates.json" "apps-app / Test / runMain org.lfdecentralizedtrust.splice.performance.SplicePerf run -t DbScanStore -c ./apps/app/src/test/resources/performance/tests.conf -d /tmp/mainnetupdates.json" "apps-app / Test / runMain org.lfdecentralizedtrust.splice.performance.SplicePerf run -t UpdateHistory -c ./apps/app/src/test/resources/performance/tests.conf -d /tmp/mainnetupdates.json"'

- name: Push ingestion metrics to Prometheus Pushgateway
if: ${{ success() }}
# TODO: re-enable once <reason>; replace `false` with the original predicate below.
if: false
# if: ${{ success() }}
run: |
python3 scripts/performance/push_store_ingestion_perf_metrics.py

# Publish the contents of /tmp/store-ingestion-perf-metrics as a run artifact.
# always() keeps this step running even when an earlier step failed, and
# if-no-files-found: ignore keeps it from complaining when nothing was written.
# NOTE(review): actions/checkout in this workflow is pinned to a commit SHA,
# while this action uses the mutable v4 tag - consider SHA-pinning it as well.
- name: Upload ingestion metrics artifact
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
# The run id keeps artifact names distinct across workflow runs.
name: store-ingestion-perf-metrics-${{ github.run_id }}
path: /tmp/store-ingestion-perf-metrics
if-no-files-found: ignore

# Compare the collected ingestion metrics against the configured thresholds.
# The trailing `|| true` keeps this step green even when the script exits
# non-zero; the "Report Failures on Slack & Github" step instead looks at
# steps.perf_check.outputs.has_breaches, which is why this step needs an id.
# NOTE(review): presumably the script writes has_breaches to $GITHUB_OUTPUT -
# confirm. `|| true` also hides a crash of the script itself.
- name: Check ingestion thresholds
id: perf_check
# Schedule-only gate, currently commented out in favour of the plain success() check below.
#if: ${{ success() && github.event_name == 'schedule' }}
if: ${{ success() }}
run: |
python3 scripts/performance/check_perf_thresholds.py /tmp/store-ingestion-perf-metrics || true

# sanity check
- name: Check logs for errors
uses: ./splice-shared-gha/.github/actions/sbt/execute_sbt_command
Expand All @@ -86,7 +104,8 @@ jobs:
name: "logs-ingestion-performance-tests"

- name: Report Failures on Slack & Github
if: failure() && github.event_name == 'schedule'
#if: ${{ github.event_name == 'schedule' && (failure() || steps.perf_check.outputs.has_breaches == 'true') }}
if: ${{ failure() || steps.perf_check.outputs.has_breaches == 'true' }}
uses: ./splice-shared-gha/.github/actions/tests/failure_notifications
with:
job_subname: "Daily Store Ingestion Performance Tests"
Expand All @@ -101,7 +120,9 @@ jobs:
container:
image: us-central1-docker.pkg.dev/da-cn-shared/ghcr/digital-asset/decentralized-canton-sync-dev/docker/splice-test-ci:0.3.12
name: Read Performance Tests
if: github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name
# TODO: re-enable once <reason>; replace `false` with the original predicate below.
if: false
# if: github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name
steps:
- name: Check out repository code
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
Expand Down Expand Up @@ -149,6 +170,19 @@ jobs:
run: |
python3 scripts/performance/push_store_ingestion_perf_metrics.py /tmp/store-read-perf-metrics

# Publish the contents of /tmp/store-read-perf-metrics as a run artifact.
# always() keeps this step running even when an earlier step failed, and
# if-no-files-found: ignore keeps it from complaining when nothing was written.
# NOTE(review): actions/checkout in this workflow is pinned to a commit SHA,
# while this action uses the mutable v4 tag - consider SHA-pinning it as well.
- name: Upload read metrics artifact
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
# The run id keeps artifact names distinct across workflow runs.
name: store-read-perf-metrics-${{ github.run_id }}
path: /tmp/store-read-perf-metrics
if-no-files-found: ignore

# Compare the collected read metrics against the configured thresholds.
# Runs only on scheduled executions and only when every earlier step succeeded.
# Unlike the "Check ingestion thresholds" step there is no `|| true` here, so a
# non-zero exit from the script fails this step (and therefore the job).
- name: Check read thresholds
if: ${{ success() && github.event_name == 'schedule' }}
run: |
python3 scripts/performance/check_perf_thresholds.py /tmp/store-read-perf-metrics

# sanity check
- name: Check logs for errors
uses: ./splice-shared-gha/.github/actions/sbt/execute_sbt_command
Expand Down
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

btw, do you know where the alerts from the splice cluster end up?

Copy link
Copy Markdown
Contributor Author

@jagathweerasinghe-da jagathweerasinghe-da Apr 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It ends up in team-canton-network-internal-ci
./cluster/deployment/splice/.envrc.vars
export SLACK_ALERT_NOTIFICATION_CHANNEL_FULL_NAME="team-canton-network-internal-ci"

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pretty sure that that channel is mostly ignored in favor of the test failures dashboard, you might want to ask for opinions on #internal

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@OriolMunoz-da

As per the internal discussion we had, let's go with the GH issue. Thanks for bringing up this point.

There are two routes to create a GH issue:

  1. GH Action->Prometheus->Grafana Alerts->GH Issue
  2. GH Action->GH Issue

I would like to go ahead with the second approach, which keeps the integration clean and simple. WDYT?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, 2 seems easiest

Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Grafana alert-rule provisioning: warns when the total time of a store
# ingestion performance test exceeds its per-test limit.
apiVersion: 1
groups:
  - orgId: 1
    name: Store Ingestion Performance
    folder: database
    # All rules in this group are evaluated once per hour.
    interval: 1h
    rules:
      # Single multi-dimensional rule: one alert instance per `test` label.
      # To add a new test, append one `or label_replace(...)` line below with the test name and its threshold (in seconds).
      - uid: store-ingestion-total-time
        title: Store Ingestion Total Time Too High
        # The rule fires on the result of expression C (see below).
        condition: C
        data:
          # A: ratio of measured total time to the per-test limit.
          - refId: A
            # Query window: the last 86400 s (24 h) up to now.
            relativeTimeRange:
              from: 86400
              to: 0
            datasourceUid: prometheus
            model:
              refId: A
              # The numerator converts the metric from nanoseconds to seconds.
              # The denominator is a constant vector carrying one limit (in
              # seconds) per `test` label value. A quotient above 1 means the
              # limit was exceeded. A test with no label_replace entry has no
              # match on the right-hand side, yields no series, and therefore
              # never alerts.
              # NOTE(review): these limits (300 / 300 / 1200 s) differ from
              # .github/perf-thresholds.yaml (600 / 60 / 1800 s for Scan /
              # SvDso / UpdateHistory) - confirm that this is intentional.
              expr: |
                (splice_perf_ingestion_total_time_ns / 1e9)
                / on(test) group_left() (
                label_replace(vector(300), "test", "ScanStoreIngestionPerformanceTest", "", "")
                or label_replace(vector(300), "test", "SvDsoStoreIngestionPerformanceTest", "", "")
                or label_replace(vector(1200), "test", "UpdateHistoryIngestionPerformanceTest","", "")
                )
          # B: collapse each series from A to its most recent value.
          - refId: B
            datasourceUid: __expr__
            model:
              refId: B
              type: reduce
              expression: A
              reducer: last
          # C: alert condition - the reduced ratio is greater than 1.
          - refId: C
            datasourceUid: __expr__
            model:
              refId: C
              type: threshold
              expression: B
              conditions:
                - evaluator:
                    type: gt
                    params: [ 1 ]
        # NOTE(review): the pending period (5m) is shorter than the group's 1h
        # evaluation interval, so a pending alert can presumably only start
        # firing at the next hourly evaluation - confirm this is the intent.
        for: 5m
        # Missing data is treated as healthy; query/evaluation errors raise the alert.
        noDataState: OK
        execErrState: Alerting
        annotations:
          summary: 'Store ingestion total time for {{ $labels.test }} exceeded its threshold (ratio: {{ $values.B }})'
        labels:
          severity: warning
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Grafana alert-rule provisioning: warns when the total time of a store read
# performance test exceeds its per-test limit.
apiVersion: 1
groups:
  - orgId: 1
    name: Store Read Performance
    folder: database
    # All rules in this group are evaluated once per hour.
    interval: 1h
    rules:
      # Single multi-dimensional rule: one alert instance per `test` label.
      # To add a new read test/operation, append one `or label_replace(...)` line below with the test name and its threshold (in seconds).
      - uid: store-read-total-time
        title: Store Read Total Time Too High
        # The rule fires on the result of expression C (see below).
        condition: C
        data:
          # A: ratio of measured total read time to the per-test limit.
          - refId: A
            # Query window: the last 86400 s (24 h) up to now.
            relativeTimeRange:
              from: 86400
              to: 0
            datasourceUid: prometheus
            model:
              refId: A
              # The numerator converts the metric from nanoseconds to seconds.
              # The denominator is a constant vector carrying one limit (in
              # seconds) per `test` label value. A quotient above 1 means the
              # limit was exceeded. A test with no label_replace entry has no
              # match on the right-hand side, yields no series, and therefore
              # never alerts.
              # NOTE(review): the 1 s limits here differ from the 5 s limits in
              # .github/perf-thresholds.yaml - confirm that this is intentional.
              expr: |
                (splice_perf_read_total_time_ns / 1e9)
                / on(test) group_left() (
                label_replace(vector(1), "test", "UpdateHistoryReadPerformanceTest-getUpdate", "", "")
                or label_replace(vector(1), "test", "UpdateHistoryReadPerformanceTest-encodeUpdate", "", "")
                )
          # B: collapse each series from A to its most recent value.
          - refId: B
            datasourceUid: __expr__
            model:
              refId: B
              type: reduce
              expression: A
              reducer: last
          # C: alert condition - the reduced ratio is greater than 1.
          - refId: C
            datasourceUid: __expr__
            model:
              refId: C
              type: threshold
              expression: B
              conditions:
                - evaluator:
                    type: gt
                    params: [ 1 ]
        # NOTE(review): the pending period (5m) is shorter than the group's 1h
        # evaluation interval, so a pending alert can presumably only start
        # firing at the next hourly evaluation - confirm this is the intent.
        for: 5m
        # Missing data is treated as healthy; query/evaluation errors raise the alert.
        noDataState: OK
        execErrState: Alerting
        annotations:
          summary: 'Store read total time for {{ $labels.test }} exceeded its threshold (ratio: {{ $values.B }})'
        labels:
          severity: warning
6 changes: 6 additions & 0 deletions cluster/pulumi/observability/src/observability.ts
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,12 @@ function createGrafanaAlerting(namespace: Input<string>) {
}
: {}),
...{
'store_ingestion_performance_alerts.yaml': readGrafanaAlertingFile(
'store_ingestion_performance_alerts.yaml'
),
'store_read_performance_alerts.yaml': readGrafanaAlertingFile(
'store_read_performance_alerts.yaml'
),
'acs-stores_alerts.yaml': readGrafanaAlertingFile('acs-stores_alerts.yaml').replaceAll(
'$NODATA',
loadTesterConfig?.enable ? 'Alerting' : 'OK'
Expand Down
Loading
Loading