Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

My release 0.21 from dev branch #115

Open
wants to merge 49 commits into
base: development
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
0f8f66b
nodes: Add partition and feature_set labels to nodes collector
fihuer Dec 13, 2019
f8fb30d
queue: Collect jobs with user, partition and reason information
fihuer Dec 13, 2019
2ff4fba
scheduler: Collect rpc stats
fihuer Dec 13, 2019
7538040
vendor dependencies
fihuer Dec 16, 2019
895c7c6
Merge branch '74_Correct_spec_file_path'
mtds Mar 29, 2022
df50ddf
nodes: Add partition and feature_set labels to nodes collector
fihuer Dec 13, 2019
214e48a
queue: Collect jobs with user, partition and reason information
fihuer Dec 13, 2019
dff6df3
scheduler: Collect rpc stats
fihuer Dec 13, 2019
4b7ac9f
Create go.yml
fihuer Jul 1, 2024
867002b
nodes_tests: Fix tests with new test data according to new way of par…
fihuer Jul 3, 2024
fd17889
lint & static checks fixes
fihuer Jul 3, 2024
8b24889
nodes: Also count the nodes that are not parsed correctly
fihuer Jul 3, 2024
b909964
Merge branch 'master' into rebase_020
fihuer Jul 3, 2024
903630b
Merge pull request #2 from cea-hpc/rebase_020
fihuer Jul 3, 2024
228bd94
Create go-ossf-slsa3-publish.yml
fihuer Jul 3, 2024
fc1338b
Create go.yml
fihuer Jul 3, 2024
641544a
Remove tests that cannot run without slurm
fihuer Jul 3, 2024
73303e7
Update go-ossf-slsa3-publish.yml
fihuer Jul 3, 2024
d6fd92e
Add the "planned" node state
fihuer Jul 3, 2024
c57324f
Add goreleaser configuration files
fihuer Jul 3, 2024
ee0e8a7
Add .slsa-goreleaser.yml configuration file
fihuer Jul 3, 2024
fc28823
Update gpus.go for Slurm>=19.05.0rc1
SckyzO Sep 11, 2024
6721bb3
import packages update
SckyzO Sep 11, 2024
ecc3d5c
Add TLS & BasicAuth support
SckyzO Sep 11, 2024
9efe0d1
update gitignore
SckyzO Sep 11, 2024
b1dd3b1
fix log package + go.sum + rename bin to slurm_exporter
Sep 18, 2024
1ef9293
update README
Sep 18, 2024
3c10b49
Add CGO_ENABLED
Sep 18, 2024
39a5021
update workflow to go 1.20wq
Sep 18, 2024
81776ef
Create FUNDING.yml
SckyzO Oct 9, 2024
7075a83
Update README.md
SckyzO Oct 9, 2024
b4ba665
Update README.md
SckyzO Oct 9, 2024
14fa4d9
Add new metric slurm_node_status
Oct 9, 2024
7e35a22
Merge branch 'master' of github.com:SckyzO/slurm_exporter
Oct 9, 2024
00339fc
replace io/ioutil by io (deprecated)
Oct 9, 2024
614337b
update gitignore
Oct 9, 2024
8a7e81a
update makefile
Oct 9, 2024
b0717ae
update go modules
Oct 9, 2024
12f9363
add Slurm Binaries informations metrics
Oct 9, 2024
7ce0a14
add release workflow
Oct 9, 2024
75d30ad
update slurm_binary_info
Oct 9, 2024
b377b21
update go version
Oct 9, 2024
3866f2d
update workflow
Oct 9, 2024
6e8207f
update workflow
Oct 10, 2024
b3c8a67
dev update workflow
Oct 10, 2024
faef9f9
update
Oct 10, 2024
48e8c8d
Merge branch 'development' into master
SckyzO Jan 14, 2025
ceea63c
Update go-slsa-release.yml
SckyzO Feb 19, 2025
81a78c7
Update go-release.yml
SckyzO Feb 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
scheduler: Collect rpc stats
fihuer committed Dec 13, 2019
commit 2ff4fbadb7b5de0f5b790f883235b6d88b7ba294
424 changes: 289 additions & 135 deletions scheduler.go
Original file line number Diff line number Diff line change
@@ -16,13 +16,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */
package main

import (
	"io/ioutil"
	"os/exec"
	"regexp"
	"strconv"
	"strings"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/log"
)

/*
@@ -33,78 +33,168 @@ import (

// Basic metrics for the scheduler
type SchedulerMetrics struct {
threads float64
queue_size float64
last_cycle float64
mean_cycle float64
cycle_per_minute float64
backfill_last_cycle float64
backfill_mean_cycle float64
backfill_depth_mean float64
threads float64
queue_size float64
last_cycle float64
mean_cycle float64
cycle_per_minute float64
backfill_last_cycle float64
backfill_mean_cycle float64
backfill_depth_mean float64
rpc_stats_count map[string]float64
rpc_stats_avg_time map[string]float64
rpc_stats_total_time map[string]float64
user_rpc_stats_count map[string]float64
user_rpc_stats_avg_time map[string]float64
user_rpc_stats_total_time map[string]float64
}

// Execute the sdiag command and return its output
func SchedulerData() []byte {
cmd := exec.Command("sdiag")
stdout, err := cmd.StdoutPipe()
if err != nil { log.Fatal(err) }
if err := cmd.Start(); err != nil { log.Fatal(err) }
out, _ := ioutil.ReadAll(stdout)
if err := cmd.Wait(); err != nil { log.Fatal(err) }
return out
cmd := exec.Command("/usr/bin/sdiag")
stdout, err := cmd.StdoutPipe()
if err != nil {
log.Fatal(err)
}
if err := cmd.Start(); err != nil {
log.Fatal(err)
}
out, _ := ioutil.ReadAll(stdout)
if err := cmd.Wait(); err != nil {
log.Fatal(err)
}
return out
}

// Extract the relevant metrics from the sdiag output
func ParseSchedulerMetrics(input []byte) *SchedulerMetrics {
var sm SchedulerMetrics
lines := strings.Split(string(input),"\n")
// Guard variables to check for string repetitions in the output of sdiag
// (two 'Last cycle', two 'Mean cycle')
lc_count := 0
mc_count := 0
for _, line := range lines {
if strings.Contains(line, ":") {
state := strings.Split(line, ":")[0]
st := regexp.MustCompile(`^Server thread`)
qs := regexp.MustCompile(`^Agent queue`)
lc := regexp.MustCompile(`^[\s]+Last cycle$`)
mc := regexp.MustCompile(`^[\s]+Mean cycle$`)
cpm := regexp.MustCompile(`^[\s]+Cycles per`)
dpm := regexp.MustCompile(`^[\s]+Depth Mean$`)
switch {
case st.MatchString(state) == true:
sm.threads, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
case qs.MatchString(state) == true:
sm.queue_size, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
case lc.MatchString(state) == true:
if lc_count == 0 {
sm.last_cycle, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
lc_count = 1
}
if lc_count == 1 {
sm.backfill_last_cycle, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
}
case mc.MatchString(state) == true:
if mc_count == 0 {
sm.mean_cycle, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
mc_count = 1
}
if mc_count == 1 {
sm.backfill_mean_cycle, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
}
case cpm.MatchString(state) == true:
sm.cycle_per_minute, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
case dpm.MatchString(state) == true:
sm.backfill_depth_mean, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
}
}
}
return &sm
var sm SchedulerMetrics
lines := strings.Split(string(input), "\n")
// Guard variables to check for string repetitions in the output of sdiag
// (two 'Last cycle', two 'Mean cycle')
lc_count := 0
mc_count := 0
for _, line := range lines {
if strings.Contains(line, ":") {
state := strings.Split(line, ":")[0]
st := regexp.MustCompile(`^Server thread`)
qs := regexp.MustCompile(`^Agent queue`)
lc := regexp.MustCompile(`^[\s]+Last cycle$`)
mc := regexp.MustCompile(`^[\s]+Mean cycle$`)
cpm := regexp.MustCompile(`^[\s]+Cycles per`)
dpm := regexp.MustCompile(`^[\s]+Depth Mean$`)
switch {
case st.MatchString(state) == true:
sm.threads, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
case qs.MatchString(state) == true:
sm.queue_size, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
case lc.MatchString(state) == true:
if lc_count == 0 {
sm.last_cycle, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
lc_count = 1
}
if lc_count == 1 {
sm.backfill_last_cycle, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
}
case mc.MatchString(state) == true:
if mc_count == 0 {
sm.mean_cycle, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
mc_count = 1
}
if mc_count == 1 {
sm.backfill_mean_cycle, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
}
case cpm.MatchString(state) == true:
sm.cycle_per_minute, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
case dpm.MatchString(state) == true:
sm.backfill_depth_mean, _ = strconv.ParseFloat(strings.TrimSpace(strings.Split(line, ":")[1]), 64)
}
}
}
rpc_stats := ParseRpcStats(lines)
sm.rpc_stats_count = rpc_stats[0]
sm.rpc_stats_avg_time = rpc_stats[1]
sm.rpc_stats_total_time = rpc_stats[2]
sm.user_rpc_stats_count = rpc_stats[3]
sm.user_rpc_stats_avg_time = rpc_stats[4]
sm.user_rpc_stats_total_time = rpc_stats[5]
return &sm
}

// Helper function to split a single line from the sdiag output
func SplitColonValueToFloat(input string) float64 {
str := strings.Split(input, ":")
if len(str) == 1 {
return 0
} else {
rvalue := strings.TrimSpace(str[1])
flt, _ := strconv.ParseFloat(rvalue, 64)
return flt
}
}

// Helper function to return RPC stats from sdiag output
func ParseRpcStats(lines []string) []map[string]float64 {
var in_rpc bool
var in_rpc_per_user bool
var count_stats map[string]float64
var avg_stats map[string]float64
var total_stats map[string]float64
var user_count_stats map[string]float64
var user_avg_stats map[string]float64
var user_total_stats map[string]float64

count_stats = make(map[string]float64)
avg_stats = make(map[string]float64)
total_stats = make(map[string]float64)
user_count_stats = make(map[string]float64)
user_avg_stats = make(map[string]float64)
user_total_stats = make(map[string]float64)

in_rpc = false
in_rpc_per_user = false

stat_line_re := regexp.MustCompile(`^\s*([A-Za-z0-9_]*).*count:([0-9]*)\s*ave_time:([0-9]*)\s\s*total_time:([0-9]*)\s*$`)

for _, line := range lines {
if strings.Contains(line, "Remote Procedure Call statistics by message type") {
in_rpc = true
in_rpc_per_user = false
} else if strings.Contains(line, "Remote Procedure Call statistics by user") {
in_rpc = false
in_rpc_per_user = true
}
if in_rpc || in_rpc_per_user {
re_match := stat_line_re.FindAllStringSubmatch(line, -1)
if re_match != nil {
re_match_first := re_match[0]
if in_rpc {
count_stats[re_match_first[1]], _ = strconv.ParseFloat(re_match_first[2], 64)
avg_stats[re_match_first[1]], _ = strconv.ParseFloat(re_match_first[3], 64)
total_stats[re_match_first[1]], _ = strconv.ParseFloat(re_match_first[4], 64)
} else if in_rpc_per_user {
user_count_stats[re_match_first[1]], _ = strconv.ParseFloat(re_match_first[2], 64)
user_avg_stats[re_match_first[1]], _ = strconv.ParseFloat(re_match_first[3], 64)
user_total_stats[re_match_first[1]], _ = strconv.ParseFloat(re_match_first[4], 64)
}
}
}
}

rpc_stats_final := []map[string]float64{
count_stats,
avg_stats,
total_stats,
user_count_stats,
user_avg_stats,
user_total_stats,
}
return rpc_stats_final
}

// Returns the scheduler metrics
func SchedulerGetMetrics() *SchedulerMetrics {
return ParseSchedulerMetrics(SchedulerData())
return ParseSchedulerMetrics(SchedulerData())
}

/*
@@ -113,86 +203,150 @@ func SchedulerGetMetrics() *SchedulerMetrics {
* https://godoc.org/github.com/prometheus/client_golang/prometheus#Collector
*/


// Collector strcture
type SchedulerCollector struct {
threads *prometheus.Desc
queue_size *prometheus.Desc
last_cycle *prometheus.Desc
mean_cycle *prometheus.Desc
cycle_per_minute *prometheus.Desc
backfill_last_cycle *prometheus.Desc
backfill_mean_cycle *prometheus.Desc
backfill_depth_mean *prometheus.Desc
threads *prometheus.Desc
queue_size *prometheus.Desc
last_cycle *prometheus.Desc
mean_cycle *prometheus.Desc
cycle_per_minute *prometheus.Desc
backfill_last_cycle *prometheus.Desc
backfill_mean_cycle *prometheus.Desc
backfill_depth_mean *prometheus.Desc
rpc_stats_count *prometheus.Desc
rpc_stats_avg_time *prometheus.Desc
rpc_stats_total_time *prometheus.Desc
user_rpc_stats_count *prometheus.Desc
user_rpc_stats_avg_time *prometheus.Desc
user_rpc_stats_total_time *prometheus.Desc
}

// Send all metric descriptions
func (c *SchedulerCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- c.threads
ch <- c.queue_size
ch <- c.last_cycle
ch <- c.mean_cycle
ch <- c.cycle_per_minute
ch <- c.backfill_last_cycle
ch <- c.backfill_mean_cycle
ch <- c.backfill_depth_mean
ch <- c.threads
ch <- c.queue_size
ch <- c.last_cycle
ch <- c.mean_cycle
ch <- c.cycle_per_minute
ch <- c.backfill_last_cycle
ch <- c.backfill_mean_cycle
ch <- c.backfill_depth_mean
ch <- c.rpc_stats_count
ch <- c.rpc_stats_avg_time
ch <- c.rpc_stats_total_time
ch <- c.user_rpc_stats_count
ch <- c.user_rpc_stats_avg_time
ch <- c.user_rpc_stats_total_time
}

// Send the values of all metrics
func (sc *SchedulerCollector) Collect(ch chan<- prometheus.Metric) {
sm := SchedulerGetMetrics()
ch <- prometheus.MustNewConstMetric(sc.threads, prometheus.GaugeValue, sm.threads)
ch <- prometheus.MustNewConstMetric(sc.queue_size, prometheus.GaugeValue, sm.queue_size)
ch <- prometheus.MustNewConstMetric(sc.last_cycle, prometheus.GaugeValue, sm.last_cycle)
ch <- prometheus.MustNewConstMetric(sc.mean_cycle, prometheus.GaugeValue, sm.mean_cycle)
ch <- prometheus.MustNewConstMetric(sc.cycle_per_minute, prometheus.GaugeValue, sm.cycle_per_minute)
ch <- prometheus.MustNewConstMetric(sc.backfill_last_cycle, prometheus.GaugeValue, sm.backfill_last_cycle)
ch <- prometheus.MustNewConstMetric(sc.backfill_mean_cycle, prometheus.GaugeValue, sm.backfill_mean_cycle)
ch <- prometheus.MustNewConstMetric(sc.backfill_depth_mean, prometheus.GaugeValue, sm.backfill_depth_mean)
sm := SchedulerGetMetrics()
ch <- prometheus.MustNewConstMetric(sc.threads, prometheus.GaugeValue, sm.threads)
ch <- prometheus.MustNewConstMetric(sc.queue_size, prometheus.GaugeValue, sm.queue_size)
ch <- prometheus.MustNewConstMetric(sc.last_cycle, prometheus.GaugeValue, sm.last_cycle)
ch <- prometheus.MustNewConstMetric(sc.mean_cycle, prometheus.GaugeValue, sm.mean_cycle)
ch <- prometheus.MustNewConstMetric(sc.cycle_per_minute, prometheus.GaugeValue, sm.cycle_per_minute)
ch <- prometheus.MustNewConstMetric(sc.backfill_last_cycle, prometheus.GaugeValue, sm.backfill_last_cycle)
ch <- prometheus.MustNewConstMetric(sc.backfill_mean_cycle, prometheus.GaugeValue, sm.backfill_mean_cycle)
ch <- prometheus.MustNewConstMetric(sc.backfill_depth_mean, prometheus.GaugeValue, sm.backfill_depth_mean)
for rpc_type, value := range sm.rpc_stats_count {
ch <- prometheus.MustNewConstMetric(sc.rpc_stats_count, prometheus.GaugeValue, value, rpc_type)
}
for rpc_type, value := range sm.rpc_stats_avg_time {
ch <- prometheus.MustNewConstMetric(sc.rpc_stats_avg_time, prometheus.GaugeValue, value, rpc_type)
}
for rpc_type, value := range sm.rpc_stats_total_time {
ch <- prometheus.MustNewConstMetric(sc.rpc_stats_total_time, prometheus.GaugeValue, value, rpc_type)
}
for user, value := range sm.user_rpc_stats_count {
ch <- prometheus.MustNewConstMetric(sc.user_rpc_stats_count, prometheus.GaugeValue, value, user)
}
for user, value := range sm.user_rpc_stats_avg_time {
ch <- prometheus.MustNewConstMetric(sc.user_rpc_stats_avg_time, prometheus.GaugeValue, value, user)
}
for user, value := range sm.user_rpc_stats_total_time {
ch <- prometheus.MustNewConstMetric(sc.user_rpc_stats_total_time, prometheus.GaugeValue, value, user)
}

}

// Returns the Slurm scheduler collector, used to register with the prometheus client
func NewSchedulerCollector() *SchedulerCollector {
return &SchedulerCollector{
threads: prometheus.NewDesc(
"slurm_scheduler_threads",
"Information provided by the Slurm sdiag command, number of scheduler threads ",
nil,
nil),
queue_size: prometheus.NewDesc(
"slurm_scheduler_queue_size",
"Information provided by the Slurm sdiag command, length of the scheduler queue",
nil,
nil),
last_cycle: prometheus.NewDesc(
"slurm_scheduler_last_cycle",
"Information provided by the Slurm sdiag command, scheduler last cycle time in (microseconds)",
nil,
nil),
mean_cycle: prometheus.NewDesc(
"slurm_scheduler_mean_cycle",
"Information provided by the Slurm sdiag command, scheduler mean cycle time in (microseconds)",
nil,
nil),
cycle_per_minute: prometheus.NewDesc(
"slurm_scheduler_cycle_per_minute",
"Information provided by the Slurm sdiag command, number scheduler cycles per minute",
nil,
nil),
backfill_last_cycle: prometheus.NewDesc(
"slurm_scheduler_backfill_last_cycle",
"Information provided by the Slurm sdiag command, scheduler backfill last cycle time in (microseconds)",
nil,
nil),
backfill_mean_cycle: prometheus.NewDesc(
"slurm_scheduler_backfill_mean_cycle",
"Information provided by the Slurm sdiag command, scheduler backfill mean cycle time in (microseconds)",
nil,
nil),
backfill_depth_mean: prometheus.NewDesc(
"slurm_scheduler_backfill_depth_mean",
"Information provided by the Slurm sdiag command, scheduler backfill mean depth",
nil,
nil),
}
rpc_stats_labels := make([]string, 0, 1)
rpc_stats_labels = append(rpc_stats_labels, "operation")
user_rpc_stats_labels := make([]string, 0, 1)
user_rpc_stats_labels = append(user_rpc_stats_labels, "user")
return &SchedulerCollector{
threads: prometheus.NewDesc(
"slurm_scheduler_threads",
"Information provided by the Slurm sdiag command, number of scheduler threads ",
nil,
nil),
queue_size: prometheus.NewDesc(
"slurm_scheduler_queue_size",
"Information provided by the Slurm sdiag command, length of the scheduler queue",
nil,
nil),
last_cycle: prometheus.NewDesc(
"slurm_scheduler_last_cycle",
"Information provided by the Slurm sdiag command, scheduler last cycle time in (microseconds)",
nil,
nil),
mean_cycle: prometheus.NewDesc(
"slurm_scheduler_mean_cycle",
"Information provided by the Slurm sdiag command, scheduler mean cycle time in (microseconds)",
nil,
nil),
cycle_per_minute: prometheus.NewDesc(
"slurm_scheduler_cycle_per_minute",
"Information provided by the Slurm sdiag command, number scheduler cycles per minute",
nil,
nil),
backfill_last_cycle: prometheus.NewDesc(
"slurm_scheduler_backfill_last_cycle",
"Information provided by the Slurm sdiag command, scheduler backfill last cycle time in (microseconds)",
nil,
nil),
backfill_mean_cycle: prometheus.NewDesc(
"slurm_scheduler_backfill_mean_cycle",
"Information provided by the Slurm sdiag command, scheduler backfill mean cycle time in (microseconds)",
nil,
nil),
backfill_depth_mean: prometheus.NewDesc(
"slurm_scheduler_backfill_depth_mean",
"Information provided by the Slurm sdiag command, scheduler backfill mean depth",
nil,
nil),
rpc_stats_count: prometheus.NewDesc(
"slurm_rpc_stats",
"Information provided by the Slurm sdiag command, rpc count statistic",
rpc_stats_labels,
nil),
rpc_stats_avg_time: prometheus.NewDesc(
"slurm_rpc_stats_avg_time",
"Information provided by the Slurm sdiag command, rpc average time statistic",
rpc_stats_labels,
nil),
rpc_stats_total_time: prometheus.NewDesc(
"slurm_rpc_stats_total_time",
"Information provided by the Slurm sdiag command, rpc total time statistic",
rpc_stats_labels,
nil),
user_rpc_stats_count: prometheus.NewDesc(
"slurm_user_rpc_stats",
"Information provided by the Slurm sdiag command, rpc count statistic per user",
user_rpc_stats_labels,
nil),
user_rpc_stats_avg_time: prometheus.NewDesc(
"slurm_user_rpc_stats_avg_time",
"Information provided by the Slurm sdiag command, rpc average time statistic per user",
user_rpc_stats_labels,
nil),
user_rpc_stats_total_time: prometheus.NewDesc(
"slurm_user_rpc_stats_total_time",
"Information provided by the Slurm sdiag command, rpc total time statistic per user",
user_rpc_stats_labels,
nil),
}
}