Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CELEBORN-1831] Add ratis commitIndex metrics #3063

Closed
wants to merge 10 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add difference value metrics
zaynt4606 committed Jan 13, 2025
commit c6b61f58e4d580f1e308544606934a6f05e5f28c
93 changes: 93 additions & 0 deletions assets/grafana/celeborn-dashboard.json
Original file line number Diff line number Diff line change
@@ -1375,6 +1375,99 @@
"title": "metrics_MasterCommitIndex_Value",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "The difference value of commit index of the masters in HA mode.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 74
},
"id": 136,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"maxHeight": 600,
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "metrics_MasterCommitIndexDiff_Value{instance=~\"${instance}\"}",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
"title": "metrics_MasterCommitIndexDiff_Value",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
1 change: 1 addition & 0 deletions docs/monitoring.md
Original file line number Diff line number Diff line change
@@ -110,6 +110,7 @@ These metrics are exposed by Celeborn master.
| DecommissionWorkerCount | The count of workers in decommission list. |
| IsActiveMaster | Whether the current master is active. |
| MasterCommitIndex | The commit index of the current master in HA mode. |
| MasterCommitIndexDiff | The difference value of commit index of the masters in HA mode. |
| PartitionSize | The size of estimated shuffle partition. |
| OfferSlotsTime | The time for masters to handle `RequestSlots` request when registering shuffle. |

Original file line number Diff line number Diff line change
@@ -26,6 +26,7 @@ import java.util.concurrent.atomic.AtomicBoolean
import java.util.function.ToLongFunction

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.util.Random

import com.google.common.annotations.VisibleForTesting
@@ -285,7 +286,9 @@ private[celeborn] class Master(
statusSystem.decommissionWorkers.size()
}

masterSource.addGauge(MasterSource.MASTER_COMMIT_INDEX) { () => getMasterRaftCommitIndex }
masterSource.addGauge(MasterSource.MASTER_COMMIT_INDEX) { () => getMasterRaftCommitIndex._1 }

masterSource.addGauge(MasterSource.MASTER_COMMIT_INDEX_DIFF) { () => getMasterRaftCommitIndex._2 }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

only addGauge if haEnabled

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, has been updated.


private val threadsStarted: AtomicBoolean = new AtomicBoolean(false)
rpcEnv.setupEndpoint(RpcNameConstants.MASTER_EP, this)
@@ -1480,25 +1483,35 @@ private[celeborn] class Master(
}
}

private def getMasterRaftCommitIndex: Long = {
private def getMasterRaftCommitIndex: (Long, Long) = {
if (conf.haEnabled) {
val ratisServer = statusSystem.asInstanceOf[HAMasterMetaManager].getRatisServer.getServer
if (ratisServer == null) {
0
(0, 0)
} else {
val raftPeerProto = ratisServer.getPeer.getRaftPeerProto
val peerProtoId = ratisServer.getPeer.getRaftPeerProto.getId.toStringUtf8
val groupInfo = statusSystem.asInstanceOf[HAMasterMetaManager].getRatisServer.getGroupInfo
val commitInfos = groupInfo.getCommitInfos
var commitIndex: Long = 0
var minIndex = Long.MaxValue
var maxIndex: Long = 0
commitInfos.asScala.foreach { commitInfo =>
if (commitInfo.getServer.equals(raftPeerProto)) {
commitIndex = commitInfo.getCommitIndex
val indexPeerProtoId = commitInfo.getServer.getId.toStringUtf8
val peerCommitIndex = commitInfo.getCommitIndex
if (indexPeerProtoId.equals(peerProtoId)) {
commitIndex = peerCommitIndex
}
if (minIndex > peerCommitIndex) {
minIndex = peerCommitIndex
}
if (maxIndex < peerCommitIndex) {
maxIndex = peerCommitIndex
}
}
commitIndex
(commitIndex, Math.max(0, maxIndex - minIndex))
}
} else {
0
(0, 0)
}
}

Original file line number Diff line number Diff line change
@@ -62,6 +62,8 @@ object MasterSource {

val MASTER_COMMIT_INDEX = "MasterCommitIndex"

val MASTER_COMMIT_INDEX_DIFF = "MasterCommitIndexDiff"
Copy link
Member

@turboFei turboFei Jan 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RatisCommitIndex and RatisCommitIndexDiff are more straight forward.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done~


// Capacity
val DEVICE_CELEBORN_FREE_CAPACITY = "DeviceCelebornFreeBytes"
val DEVICE_CELEBORN_TOTAL_CAPACITY = "DeviceCelebornTotalBytes"