Skip to content

Commit

Permalink
Merge pull request #20 from NVIDIA/config-parameters
Browse files Browse the repository at this point in the history
Config parameters
  • Loading branch information
henryh2 authored Oct 31, 2024
2 parents a2e0233 + e436cf9 commit 24d2801
Show file tree
Hide file tree
Showing 16 changed files with 327 additions and 322 deletions.
22 changes: 19 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@ http:
# ssl: enables HTTPS protocol if set to `true` (optional).
ssl: false

# provider: the provider that topograph will use (optional)
# Valid options include "aws", "oci", "gcp", "cw", "baremetal" or "test".
# Can be overridden if the provider is specified in a topology request to topograph
provider: "aws"

# engine: the engine that topograph will use (optional)
# Valid options include "slurm", "k8s", or "test".
# Can be overridden if the engine is specified in a topology request to topograph
engine: "slurm"

# request_aggregation_delay: defines the delay before processing a request (required).
# Topograph aggregates multiple sequential requests within this delay into a single request,
# processing only if no new requests arrive during the specified duration.
Expand Down Expand Up @@ -109,10 +119,12 @@ Topograph offers three endpoints for interacting with the service. Below are the
- **URL:** `http://<server>:<port>/v1/generate`
- **Description:** This endpoint is used to request a new cluster topology.
- **Payload:** The payload is a JSON object that includes the following fields:
- **provider name**: (mandatory) A string specifying the Service Provider, such as `aws`, `oci`, `gcp`, `cw`, `baremetal` or `test`.
- **provider name**: (optional) A string specifying the Service Provider, such as `aws`, `oci`, `gcp`, `cw`, `baremetal` or `test`. This parameter will override the provider set in the topograph config.
- **provider credentials**: (optional) A key-value map with provider-specific parameters for authentication.
- **engine name**: (mandatory) A string specifying the topology output, either `slurm` or `k8s`.
- **engine parameters**: A key-value map with engine-specific parameters.
- **provider parameters**: (optional) A key-value map with parameters that are used for provider simulation with toposim.
- **model_path**: (optional) A string parameter that points to the model file to use for simulating topology.
- **engine name**: (optional) A string specifying the topology output, one of `slurm`, `k8s`, or `test`. This parameter will override the engine set in the topograph config.
- **engine parameters**: (optional) A key-value map with engine-specific parameters.
- **slurm parameters**:
- **topology_config_path**: (optional) A string specifying the file path for the topology configuration. If omitted, the topology config content is returned in the HTTP response.
- **plugin**: (optional) A string specifying the topology plugin. Defaults to `topology/tree`.
Expand All @@ -133,6 +145,10 @@ Topograph offers three endpoints for interacting with the service. Below are the
"creds": {
"access_key_id": "id",
"secret_access_key": "secret"
},
"params": {
"use_simulation": "false",
"model_path": ""
}
},
"engine": {
Expand Down
7 changes: 7 additions & 0 deletions config/topograph-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,16 @@ http:
port: 49021
ssl: false

# Set provider and engine for topograph to use
# provider: "test"
# engine: "test"

# waiting period before processing a request
request_aggregation_delay: 15s

# URL of an external gRPC service for request processing (optional)
# forward_service_url:

# number of results per API call (optional)
page_size: 100

Expand Down
10 changes: 6 additions & 4 deletions docs/slurm.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,10 @@ curl http://localhost:49021/healthz
```

#### Using Toposim
To test the service on a simulated cluster, first add the following line to `/etc/topograph/topograph-config.yaml` so that any topology requests are forwarded to toposim.
To test the service on a simulated cluster, first add the following lines to `/etc/topograph/topograph-config.yaml` so that topograph knows to run topology in simulation and to forward any topology requests to toposim.
```yaml
provider: "test"
engine: "test"
forward_service_url: dns:localhost:49025
```
Then run the topograph service as normal.
Expand All @@ -48,15 +50,15 @@ You must then start the toposim service as such, setting the path to the test mo
/usr/local/bin/topograph -m /usr/local/bin/tests/models/<cluster-model>.yaml
```

You can then verify the topology results via simulation by querying topograph using the `test` provider and engine, and specifying the test model path as a parameter to the provider.
You can then verify the topology results via simulation by querying topograph, and specifying the test model path as a parameter to the provider.
If you want to view the tree topology, then use the command:
```bash
id=$(curl -s -X POST -H "Content-Type: application/json" -d '{"provider":{"name":"test", "params":{"model_path":"/usr/local/bin/topograph/tests/models/<cluster-model>.yaml"}},"engine":{"name":"test"}}' http://localhost:49021/v1/generate)
id=$(curl -s -X POST -H "Content-Type: application/json" -d '{"provider":{"params":{"model_path":"/usr/local/bin/tests/models/<cluster-model>.yaml"}}}' http://localhost:49021/v1/generate)
```

And if you want to view the block topology (with specified block sizes), use the command:
```bash
id=$(curl -s -X POST -H "Content-Type: application/json" -d '{"provider":{"name":"test", "params":{"model_path":"/usr/local/bin/topograph/tests/models/<cluster-model>.yaml"}},"engine":{"name":"test", "params":{"plugin":"topology/block", "block_sizes": <block-sizes>}}}' http://localhost:49021/v1/generate)
id=$(curl -s -X POST -H "Content-Type: application/json" -d '{"provider":{"params":{"model_path":"/usr/local/bin/tests/models/<cluster-model>.yaml"}},"engine":{"params":{"plugin":"topology/block", "block_sizes": "4,8"}}}' http://localhost:49021/v1/generate)
```

You can query the results of either topology request with:
Expand Down
17 changes: 17 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,15 @@ import (
"gopkg.in/yaml.v3"
"k8s.io/klog/v2"

"github.com/NVIDIA/topograph/pkg/common"
"github.com/NVIDIA/topograph/pkg/utils"
)

type Config struct {
HTTP Endpoint `yaml:"http"`
RequestAggregationDelay time.Duration `yaml:"request_aggregation_delay"`
Provider string `yaml:"provider,omitempty"`
Engine string `yaml:"engine,omitempty"`
PageSize int `yaml:"page_size,omitempty"`
SSL *SSL `yaml:"ssl,omitempty"`
CredsPath *string `yaml:"credentials_path,omitempty"`
Expand Down Expand Up @@ -76,6 +79,20 @@ func (cfg *Config) validate() error {
return fmt.Errorf("port is not set")
}

switch cfg.Provider {
case common.ProviderAWS, common.ProviderOCI, common.ProviderGCP, common.ProviderCW, common.ProviderBM, common.ProviderTest, "":
//nop
default:
return fmt.Errorf("unsupported provider %s", cfg.Provider)
}

switch cfg.Engine {
case common.EngineK8S, common.EngineSLURM, common.EngineTest, "":
//nop
default:
return fmt.Errorf("unsupported engine %s", cfg.Engine)
}

if cfg.RequestAggregationDelay == 0 {
return fmt.Errorf("request_aggregation_delay is not set")
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/engines/k8s/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func (eng *K8sEngine) GenerateOutput(ctx context.Context, tree *common.Vertex, p
return nil, err
}
buf := &bytes.Buffer{}
err := translate.ToSLURM(buf, tree)
err := translate.ToGraph(buf, tree)
if err != nil {
return nil, err
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/engines/k8s/labeler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"fmt"
"testing"

"github.com/NVIDIA/topograph/pkg/common"
"github.com/NVIDIA/topograph/pkg/translate"
"github.com/stretchr/testify/require"
)
Expand Down Expand Up @@ -49,7 +50,7 @@ func TestApplyNodeLabels(t *testing.T) {
"Node306": {"topology.kubernetes.io/network-level-1": "xf946c4acef2d5939", "topology.kubernetes.io/network-level-2": "S1"},
}

err := NewTopologyLabeler().ApplyNodeLabels(context.TODO(), root, labeler)
err := NewTopologyLabeler().ApplyNodeLabels(context.TODO(), root.Vertices[common.ValTopologyTree], labeler)
require.NoError(t, err)
require.Equal(t, data, labeler.data)
}
2 changes: 1 addition & 1 deletion pkg/engines/slurm/slurm.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ func GenerateOutput(ctx context.Context, tree *common.Vertex, params map[string]
tree.Metadata[common.KeyBlockSizes] = blockSize
}

err := translate.ToSLURM(buf, tree)
err := translate.ToGraph(buf, tree)
if err != nil {
return nil, err
}
Expand Down
13 changes: 5 additions & 8 deletions pkg/factory/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import (
"fmt"
"net/http"

"k8s.io/klog/v2"

"github.com/NVIDIA/topograph/pkg/common"
"github.com/NVIDIA/topograph/pkg/models"
"github.com/NVIDIA/topograph/pkg/providers/aws"
Expand Down Expand Up @@ -69,21 +71,16 @@ type testProvider struct {
func GetTestProvider(params map[string]string) (*testProvider, error) {
p := &testProvider{}

var modelPath string
if len(params) != 0 {
modelPath = params[common.KeyModelPath]
}

if len(modelPath) == 0 {
if path, ok := params[common.KeyModelPath]; !ok || len(path) == 0 {
p.tree, p.instance2node = translate.GetTreeTestSet(false)
} else {
model, err := models.NewModelFromFile(modelPath)
klog.InfoS("Using simulated topology", "model path", params[common.KeyModelPath])
model, err := models.NewModelFromFile(params[common.KeyModelPath])
if err != nil {
return nil, err // Wrapped by models.NewModelFromFile
}
p.tree, p.instance2node = model.ToTree()
}

return p, nil
}

Expand Down
20 changes: 18 additions & 2 deletions pkg/models/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ func (model *Model) ToTree() (*common.Vertex, map[string]string) {
nodeVertexMap := make(map[string]*common.Vertex)
swVertexMap := make(map[string]*common.Vertex)
swRootMap := make(map[string]bool)
blockVertexMap := make(map[string]*common.Vertex)

// Create all the vertices for each node
for k, v := range model.Nodes {
Expand All @@ -158,6 +159,14 @@ func (model *Model) ToTree() (*common.Vertex, map[string]string) {
swRootMap[sw.Name] = true
}

// Initializes all the block vertices
for _, cb := range model.CapacityBlocks {
blockVertexMap[cb.Name] = &common.Vertex{ID: cb.Name, Vertices: make(map[string]*common.Vertex)}
for _, node := range cb.Nodes {
blockVertexMap[cb.Name].Vertices[node] = nodeVertexMap[node]
}
}

// Connect all the switches to their sub-switches and sub-nodes
for _, sw := range model.Switches {
for _, subsw := range sw.Switches {
Expand All @@ -177,11 +186,18 @@ func (model *Model) ToTree() (*common.Vertex, map[string]string) {
}

// Connects all root vertices to the hidden root
root := &common.Vertex{Vertices: make(map[string]*common.Vertex)}
treeRoot := &common.Vertex{Vertices: make(map[string]*common.Vertex)}
for k, v := range swRootMap {
if v {
root.Vertices[k] = swVertexMap[k]
treeRoot.Vertices[k] = swVertexMap[k]
}
}
blockRoot := &common.Vertex{Vertices: make(map[string]*common.Vertex)}
for k, v := range blockVertexMap {
blockRoot.Vertices[k] = v
}
root := &common.Vertex{
Vertices: map[string]*common.Vertex{common.ValTopologyBlock: blockRoot, common.ValTopologyTree: treeRoot},
}
return root, instance2node
}
Loading

0 comments on commit 24d2801

Please sign in to comment.