Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate IB network with MNNVL topology #12

Merged
merged 6 commits into from
Oct 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pkg/ib/ib.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ type Switch struct {
}

func GenerateTopologyConfig(data []byte) (*common.Vertex, error) {
switches, hca, err := parseIbnetdiscoverFile(data)
switches, hca, err := ParseIbnetdiscoverFile(data)
if err != nil {
return nil, fmt.Errorf("unable to parse ibnetdiscover file: %v", err)
}
Expand Down Expand Up @@ -111,7 +111,7 @@ func (sw *Switch) getHeight() int {
}

// process output of ibnetdiscover
func parseIbnetdiscoverFile(data []byte) (map[string]*Switch, map[string]string, error) {
func ParseIbnetdiscoverFile(data []byte) (map[string]*Switch, map[string]string, error) {
switches := make(map[string]*Switch)
hca := make(map[string]string)
var entry *Switch
Expand Down
2 changes: 1 addition & 1 deletion pkg/ib/ib_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ Switch 41 "S-b8cef603008032b8" # "MF0;SJC3-A04-IB-ComputeLeaf-007:MQM8700/U1" e

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
switches, hca, err := parseIbnetdiscoverFile(tt.input)
switches, hca, err := ParseIbnetdiscoverFile(tt.input)
if (err != nil) != tt.expectedErr {
t.Fatalf("expected error: %v, got: %v", tt.expectedErr, err)
}
Expand Down
140 changes: 130 additions & 10 deletions pkg/providers/baremetal/mnnvl.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"strings"

"github.com/NVIDIA/topograph/pkg/common"
"github.com/NVIDIA/topograph/pkg/ib"
"github.com/NVIDIA/topograph/pkg/utils"
)

Expand Down Expand Up @@ -36,6 +37,122 @@ func domainIDExists(id string, domainMap map[string]domain) bool {
return false
}

func getIbTree(ctx context.Context, nodes []string) (*common.Vertex, error) {
nodeVisited := make(map[string]bool)
treeRoot := &common.Vertex{
Vertices: make(map[string]*common.Vertex),
}
ibPrefix := "IB"
ibCount := 0
partitionNodeMap := make(map[string][]string)
partitionVisitedMap := make(map[string]bool)

args := []string{"-h"}
stdout, err := utils.Exec(ctx, "sinfo", args, nil)
if err != nil {
return nil, fmt.Errorf("Exec error in sinfo\n")
}

scanner := bufio.NewScanner(stdout)
for scanner.Scan() {
nodeLine := scanner.Text()
arr := strings.Fields(nodeLine)
if arr[3] == "0" {
continue
}
partitionName := strings.TrimSpace(arr[0])
state := strings.TrimSpace(arr[4])
nodeList := strings.TrimSpace(arr[5])
if strings.HasPrefix(state, "down") || strings.HasSuffix(state, "*") {
continue
}
nodesArr := deCompressNodeNames(nodeList)
partitionNodeMap[partitionName] = append(partitionNodeMap[partitionName], nodesArr...)
}
for pName, nodes := range partitionNodeMap {
if _, exists := partitionVisitedMap[pName]; !exists {
for _, node := range nodes {
if _, exists := nodeVisited[node]; !exists {
args := []string{"-N", "-R", "ssh", "-w", node, "sudo ibnetdiscover"}
stdout, err := utils.Exec(ctx, "pdsh", args, nil)
if err != nil {
return nil, fmt.Errorf("Exec error while pdsh IB command\n")
}
if strings.Contains(stdout.String(), "Topology file:") {
_, hca, _ := ib.ParseIbnetdiscoverFile(stdout.Bytes())
for _, nodeName := range hca {
nodeVisited[nodeName] = true
}
partitionVisitedMap[pName] = true
ibRoot, err := ib.GenerateTopologyConfig(stdout.Bytes())
if err != nil {
return nil, fmt.Errorf("IB GenerateTopologyConfig failed: %v\n", err)
}
ibCount++
ibKey := ibPrefix + strconv.Itoa(ibCount)
treeRoot.Vertices[ibKey] = ibRoot
break
}
} else {
partitionVisitedMap[pName] = true
}
}
}
}
return treeRoot, nil
}

// deCompressNodeNames returns array of node names
func deCompressNodeNames(nodeList string) []string {
nodeArr := []string{}
arr := strings.Split(nodeList, ",")
prefix := ""
var nodeName string
for _, entry := range arr {
if strings.Contains(entry, "[") {
tuple := strings.Split(entry, "[")
prefix = tuple[0]
if strings.Contains(tuple[1], "-") {
nr := strings.Split(tuple[1], "-")
start, _ := strconv.Atoi(nr[0])
end, _ := strconv.Atoi(nr[1])
for i := start; i <= end; i++ {
nodeName = prefix + strconv.Itoa(i)
nodeArr = append(nodeArr, nodeName)
}
continue
} else {
nv := tuple[1]
nodeName = prefix + nv
}
} else { // no [ means, this could be whole nodename or suffix
if len(prefix) > 0 { //prefix exists, so must be a suffix.
if strings.HasSuffix(entry, "]") { //if suffix has ], reset prefix
nv := strings.Split(entry, "]")
nodeName = prefix + nv[0]
prefix = ""
} else if strings.Contains(entry, "-") { // suffix containing range of nodes
nr := strings.Split(entry, "-")
start, _ := strconv.Atoi(nr[0])
end, _ := strconv.Atoi(nr[1])
for i := start; i <= end; i++ {
nodeName = prefix + strconv.Itoa(i)
nodeArr = append(nodeArr, nodeName)
}
continue
} else {
nodeName = prefix + entry
}
} else { // no prefix yet, must be whole nodename
nodeName = entry
}

}
nodeArr = append(nodeArr, nodeName)
}
return nodeArr
}

// getClusterOutput reads output from nodeInfo and populates the structs
func getClusterOutput(ctx context.Context, domainMap map[string]domain, nodes []string, cmd string) error {
args := []string{"-R", "ssh", "-w", strings.Join(nodes, ","), cmd}
Expand Down Expand Up @@ -63,31 +180,29 @@ func getClusterOutput(ctx context.Context, domainMap map[string]domain, nodes []
}
return nil
}
func toGraph(domainMap map[string]domain) *common.Vertex {
func toGraph(domainMap map[string]domain, treeRoot *common.Vertex) *common.Vertex {
root := &common.Vertex{
Vertices: make(map[string]*common.Vertex),
Metadata: make(map[string]string),
}
blockSize := -1
blockRoot := &common.Vertex{
Vertices: make(map[string]*common.Vertex),
}
root.Vertices[common.ValTopologyTree] = treeRoot
for domainName, domain := range domainMap {
tree := &common.Vertex{
ID: domainName,
Vertices: make(map[string]*common.Vertex),
}
for node := range domain.nodeMap {
tree.Vertices[node] = &common.Vertex{Name: node, ID: node}
if blockSize == -1 {
blockSize = len(domain.nodeMap)
} else {
fmt.Printf("blockSize different between NVL domains")
}
}
root.Vertices[domainName] = tree
blockRoot.Vertices[domainName] = tree
}
// add root metadata
root.Metadata[common.KeyEngine] = common.EngineSLURM
root.Metadata[common.KeyPlugin] = common.ValTopologyBlock
root.Metadata[common.KeyBlockSizes] = strconv.Itoa(blockSize)
root.Vertices[common.ValTopologyBlock] = blockRoot
return root
}

Expand All @@ -98,5 +213,10 @@ func generateTopologyConfig(ctx context.Context, cis []common.ComputeInstances)
if err != nil {
return nil, fmt.Errorf("getClusterOutput failed: %v\n", err)
}
return toGraph(domainMap), nil
// get ibnetdiscover output from all unvisited nodes
treeRoot, err := getIbTree(ctx, nodes)
if err != nil {
return nil, fmt.Errorf("getIbTree failed: %v\n", err)
}
return toGraph(domainMap, treeRoot), nil
}
Loading
Loading