From 9cc77fcbe1ada487f6cccf33634e93beddc2a0e4 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 18 Nov 2025 17:24:38 -0500 Subject: [PATCH 1/5] exec works but this commit needs de-cruft --- .gitignore | 2 + cmd/api/api/exec.go | 2 +- cmd/api/api/exec_test.go | 99 ++++++++++++++-- cmd/build-dev-initrd/main.go | 213 +++++++++++++++++++++++++++++++++++ cmd/debug-shell/main.go | 87 ++++++++++++++ cmd/test-handshake/main.go | 25 ++++ go.mod | 8 +- go.sum | 12 +- lib/instances/configdisk.go | 6 +- lib/system/exec.go | 42 +++++-- lib/system/init_script.go | 74 +++++++----- lib/system/initrd.go | 25 ++++ lib/system/versions.go | 28 +++-- lib/system/versions_test.go | 2 +- openapi.yaml | 2 +- scripts/repro_vm.sh | 100 ++++++++++++++++ 16 files changed, 656 insertions(+), 71 deletions(-) create mode 100644 cmd/build-dev-initrd/main.go create mode 100644 cmd/debug-shell/main.go create mode 100644 cmd/test-handshake/main.go create mode 100755 scripts/repro_vm.sh diff --git a/.gitignore b/.gitignore index 9c30de60..234bdced 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,5 @@ tmp/** # Cloud Hypervisor binaries (embedded at build time) lib/vmm/binaries/cloud-hypervisor/*/*/cloud-hypervisor +cloud-hypervisor +cloud-hypervisor/** diff --git a/cmd/api/api/exec.go b/cmd/api/api/exec.go index 940396fa..0eb0759b 100644 --- a/cmd/api/api/exec.go +++ b/cmd/api/api/exec.go @@ -70,7 +70,7 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) { wsConn := &wsReadWriter{ws: ws, ctx: ctx} // Execute via vsock - exit, err := system.ExecIntoInstance(ctx, uint32(inst.VsockCID), system.ExecOptions{ + exit, err := system.ExecIntoInstance(ctx, inst.VsockSocket, system.ExecOptions{ Command: command, Stdin: wsConn, Stdout: wsConn, diff --git a/cmd/api/api/exec_test.go b/cmd/api/api/exec_test.go index 1b8cafa8..a25d2c99 100644 --- a/cmd/api/api/exec_test.go +++ b/cmd/api/api/exec_test.go @@ -2,11 +2,15 @@ package api import ( "bytes" + "fmt" "os" + "os/exec" + "path/filepath" "testing" "time" "github.com/onkernel/hypeman/lib/oapi" + "github.com/onkernel/hypeman/lib/paths" "github.com/onkernel/hypeman/lib/system" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -24,17 +28,25 @@ func TestExecInstanceNonTTY(t *testing.T) { svc := newTestService(t) + // Ensure system files (kernel and initrd) are available + t.Log("Ensuring system files...") + systemMgr := system.NewManager(paths.New(svc.Config.DataDir)) + err := systemMgr.EnsureSystemFiles(ctx()) + require.NoError(t, err) + t.Log("System files ready") + // First, create and wait for the image to be ready - t.Log("Creating alpine image...") + // Use nginx which has a proper long-running process + t.Log("Creating nginx:alpine image...") imgResp, err := svc.CreateImage(ctx(), oapi.CreateImageRequestObject{ Body: &oapi.CreateImageRequest{ - Name: "docker.io/library/alpine:latest", + Name: "docker.io/library/nginx:alpine", }, }) require.NoError(t, err) imgCreated, ok := imgResp.(oapi.CreateImage202JSONResponse) require.True(t, ok, "expected 202 response") - assert.Equal(t, "docker.io/library/alpine:latest", imgCreated.Name) + assert.Equal(t, "docker.io/library/nginx:alpine", imgCreated.Name) // Wait for image to be ready (poll with timeout) t.Log("Waiting for image to be ready...") @@ -49,7 +61,7 @@ func TestExecInstanceNonTTY(t *testing.T) { t.Fatal("Timeout waiting for image to be ready") case <-ticker.C: imgResp, err := svc.GetImage(ctx(), oapi.GetImageRequestObject{ - Name: "docker.io/library/alpine:latest", + Name: "docker.io/library/nginx:alpine", }) require.NoError(t, err) @@ -68,7 +80,7 @@ func TestExecInstanceNonTTY(t *testing.T) { instResp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{ Body: &oapi.CreateInstanceRequest{ Name: "exec-test", - Image: "docker.io/library/alpine:latest", + Image: "docker.io/library/nginx:alpine", }, }) require.NoError(t, err) @@ -91,6 +103,73 @@ func TestExecInstanceNonTTY(t *testing.T) { require.NotEmpty(t, actualInst.VsockSocket, "vsock socket path should be set") t.Logf("vsock CID: %d, socket: %s", actualInst.VsockCID, actualInst.VsockSocket) + // Print console log for debugging + consolePath := paths.New(svc.Config.DataDir).InstanceConsoleLog(inst.Id) + if consoleData, err := os.ReadFile(consolePath); err == nil { + t.Logf("=== Console Log ===") + lines := bytes.Split(consoleData, []byte("\n")) + + // Print boot messages with virtio + t.Logf("--- Boot messages (virtio devices) ---") + for _, line := range lines { + lineStr := string(line) + if bytes.Contains(line, []byte("virtio")) || bytes.Contains(line, []byte("vsock")) { + t.Logf("%s", lineStr) + } + } + + // Print last 30 lines + t.Logf("--- Last 30 lines ---") + start := len(lines) - 30 + if start < 0 { + start = 0 + } + for _, line := range lines[start:] { + if len(line) > 0 { + t.Logf("%s", line) + } + } + t.Logf("=== End Console Log ===") + } else { + t.Logf("Could not read console log: %v", err) + } + + // Check if vsock socket exists + if _, err := os.Stat(actualInst.VsockSocket); err != nil { + t.Logf("vsock socket does not exist: %v", err) + } else { + t.Logf("vsock socket exists: %s", actualInst.VsockSocket) + } + + // Check if exec-agent exists in the initrd + initrdPath, _ := systemMgr.GetInitrdPath(system.InitrdV2_0_0) + t.Logf("Initrd path: %s", initrdPath) + if _, err := os.Stat(initrdPath); err != nil { + t.Logf("Initrd file does not exist: %v", err) + } else { + if stat, err := os.Stat(initrdPath); err == nil { + t.Logf("Initrd file exists, size: %d bytes", stat.Size()) + + // Unpack initrd to check contents + tmpUnpack := filepath.Join(os.TempDir(), "initrd-check") + os.RemoveAll(tmpUnpack) + os.MkdirAll(tmpUnpack, 0755) + + cmd := exec.Command("sh", "-c", fmt.Sprintf("cd %s && cpio -i < %s 2>/dev/null", tmpUnpack, initrdPath)) + if err := cmd.Run(); err == nil { + if _, err := os.Stat(filepath.Join(tmpUnpack, "usr/local/bin/exec-agent")); err == nil { + t.Logf("✅ exec-agent found in initrd") + } else { + t.Logf("❌ exec-agent NOT found in initrd!") + // List what's actually in the initrd + entries, _ := os.ReadDir(filepath.Join(tmpUnpack, "usr/local/bin")) + t.Logf("Contents of /usr/local/bin in initrd: %v", len(entries)) + } + } + os.RemoveAll(tmpUnpack) + } + } + // Wait for exec agent to be ready (retry a few times) var exit *system.ExitStatus var stdout, stderr outputBuffer @@ -102,7 +181,7 @@ func TestExecInstanceNonTTY(t *testing.T) { stdout = outputBuffer{} stderr = outputBuffer{} - exit, execErr = system.ExecIntoInstance(ctx(), uint32(actualInst.VsockCID), system.ExecOptions{ + exit, execErr = system.ExecIntoInstance(ctx(), actualInst.VsockSocket, system.ExecOptions{ Command: []string{"/bin/sh", "-c", "whoami"}, Stdin: nil, Stdout: &stdout, @@ -129,12 +208,12 @@ func TestExecInstanceNonTTY(t *testing.T) { require.Contains(t, outStr, "root", "whoami should return root user") // Test another command to verify filesystem access - t.Log("Testing exec command: ls /usr/local/bin/exec-agent") + t.Log("Testing exec command: ls /") stdout = outputBuffer{} stderr = outputBuffer{} - exit, err = system.ExecIntoInstance(ctx(), uint32(actualInst.VsockCID), system.ExecOptions{ - Command: []string{"/bin/sh", "-c", "ls -la /usr/local/bin/exec-agent"}, + exit, err = system.ExecIntoInstance(ctx(), actualInst.VsockSocket, system.ExecOptions{ + Command: []string{"/bin/sh", "-c", "ls -la /"}, Stdin: nil, Stdout: &stdout, Stderr: &stderr, @@ -146,7 +225,7 @@ func TestExecInstanceNonTTY(t *testing.T) { outStr = stdout.String() t.Logf("ls output: %q", outStr) - require.Contains(t, outStr, "exec-agent", "should see exec-agent binary in /usr/local/bin") + require.Contains(t, outStr, "bin", "should see bin directory") // Cleanup t.Log("Cleaning up instance...") diff --git a/cmd/build-dev-initrd/main.go b/cmd/build-dev-initrd/main.go new file mode 100644 index 00000000..b52b7ec8 --- /dev/null +++ b/cmd/build-dev-initrd/main.go @@ -0,0 +1,213 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + + "github.com/onkernel/hypeman/lib/images" + "github.com/onkernel/hypeman/lib/paths" + "github.com/onkernel/hypeman/lib/system" + digest "github.com/opencontainers/go-digest" + v1 "github.com/opencontainers/image-spec/specs-go/v1" + "github.com/opencontainers/umoci/oci/cas/dir" + "github.com/opencontainers/umoci/oci/casext" +) + +func main() { + ctx := context.Background() + + // Get project root + projectRoot, err := os.Getwd() + if err != nil { + fmt.Fprintf(os.Stderr, "Error getting working directory: %v\n", err) + os.Exit(1) + } + + initrdDir := filepath.Join(projectRoot, "lib/system/initrd") + + // Step 1: Build Docker image with OCI output (like GitHub workflow) + fmt.Println("Building initrd Docker image with OCI format...") + ociDir := filepath.Join(os.TempDir(), "hypeman-initrd-oci-dev") + os.RemoveAll(ociDir) + os.MkdirAll(ociDir, 0755) + + // Use docker buildx to build directly to OCI format + // This matches the GitHub workflow approach + cmd := exec.Command("docker", "buildx", "build", + "--output", fmt.Sprintf("type=oci,dest=%s/image.tar,oci-mediatypes=true", ociDir), + "--platform", "linux/amd64", + ".") + cmd.Dir = initrdDir + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + fmt.Fprintf(os.Stderr, "Error building Docker image: %v\n", err) + os.Exit(1) + } + fmt.Println("✓ Docker image built in OCI format") + + // Step 2: Extract OCI tar to directory (umoci expects a directory layout) + fmt.Println("\nExtracting OCI layout...") + ociLayoutDir := filepath.Join(ociDir, "layout") + os.MkdirAll(ociLayoutDir, 0755) + + cmd = exec.Command("tar", "-xf", filepath.Join(ociDir, "image.tar"), "-C", ociLayoutDir) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + fmt.Fprintf(os.Stderr, "Error extracting OCI tar: %v\n", err) + os.Exit(1) + } + fmt.Println("✓ Extracted OCI layout") + + // Step 3: Use existing system manager to build initrd from OCI directory + fmt.Println("\nBuilding initrd using existing pipeline...") + pathsConfig := paths.New("/var/lib/hypeman") + version := system.InitrdVersion("v2.0.2-dev") + arch := system.GetArch() + + // Create temp directory for building + tempDir, err := os.MkdirTemp("", "hypeman-initrd-build-*") + if err != nil { + fmt.Fprintf(os.Stderr, "Error creating temp dir: %v\n", err) + os.Exit(1) + } + defer os.RemoveAll(tempDir) + + rootfsDir := filepath.Join(tempDir, "rootfs") + + // Create OCI client using our locally built OCI layout as the cache + // This way the image is already "cached" and won't try to pull from remote + ociClient, err := images.NewOCIClient(ociLayoutDir) + if err != nil { + fmt.Fprintf(os.Stderr, "Error creating OCI client: %v\n", err) + os.Exit(1) + } + + // Read the index.json to get the manifest digest + indexData, err := os.ReadFile(filepath.Join(ociLayoutDir, "index.json")) + if err != nil { + fmt.Fprintf(os.Stderr, "Error reading index.json: %v\n", err) + os.Exit(1) + } + + var index struct { + Manifests []struct { + Digest string `json:"digest"` + } `json:"manifests"` + } + if err := json.Unmarshal(indexData, &index); err != nil { + fmt.Fprintf(os.Stderr, "Error parsing index.json: %v\n", err) + os.Exit(1) + } + + if len(index.Manifests) == 0 { + fmt.Fprintf(os.Stderr, "No manifests found in index.json\n") + os.Exit(1) + } + + digest := index.Manifests[0].Digest + fmt.Printf(" Using manifest: %s\n", digest) + + // Tag the manifest in the OCI layout so the OCI client can find it + // The OCI client expects tags in the format that digestToLayoutTag produces (just the hex part) + layoutTag := strings.TrimPrefix(digest, "sha256:") + + // Use umoci library to create the tag + if err := tagManifestInOCI(ociLayoutDir, digest, layoutTag); err != nil { + fmt.Fprintf(os.Stderr, "Error tagging manifest in OCI layout: %v\n", err) + os.Exit(1) + } + fmt.Printf(" Tagged as: %s\n", layoutTag) + + // Now the OCI client will find it in the cache and won't try to pull + // We pass a dummy imageRef since it won't be used (image is already cached) + if err := ociClient.PullAndUnpack(ctx, "local/dev", digest, rootfsDir); err != nil { + fmt.Fprintf(os.Stderr, "Error unpacking OCI image: %v\n", err) + os.Exit(1) + } + fmt.Println("✓ Unpacked OCI image") + + // Inject init script + fmt.Println("\nInjecting init script...") + initScript := system.GenerateInitScript(version) + initPath := filepath.Join(rootfsDir, "init") + if err := os.WriteFile(initPath, []byte(initScript), 0755); err != nil { + fmt.Fprintf(os.Stderr, "Error writing init script: %v\n", err) + os.Exit(1) + } + + // Package as cpio.gz (initramfs format) + fmt.Println("Packaging as initrd...") + outputPath := pathsConfig.SystemInitrd(string(version), arch) + os.MkdirAll(filepath.Dir(outputPath), 0755) + + if _, err := images.ExportRootfs(rootfsDir, outputPath, images.FormatCpio); err != nil { + fmt.Fprintf(os.Stderr, "Error exporting initrd: %v\n", err) + os.Exit(1) + } + + fmt.Println("\n✓ Dev initrd built successfully!") + fmt.Printf(" Location: %s\n", outputPath) + fmt.Printf(" OCI cache: %s (can be deleted)\n", ociDir) + fmt.Println("\nTo use it, update lib/system/versions.go:") + fmt.Println(" DefaultInitrdVersion = InitrdVersion(\"v2.0.2-dev\")") +} + +// tagManifestInOCI tags a manifest digest with a tag name in an OCI layout +func tagManifestInOCI(ociLayoutDir, digestStr, tag string) error { + casEngine, err := dir.Open(ociLayoutDir) + if err != nil { + return fmt.Errorf("open OCI layout: %w", err) + } + defer casEngine.Close() + + engine := casext.NewEngine(casEngine) + + // Read the index to find the manifest descriptor + indexData, err := os.ReadFile(filepath.Join(ociLayoutDir, "index.json")) + if err != nil { + return fmt.Errorf("read index.json: %w", err) + } + + var index struct { + Manifests []struct { + MediaType string `json:"mediaType"` + Digest string `json:"digest"` + Size int64 `json:"size"` + } `json:"manifests"` + } + if err := json.Unmarshal(indexData, &index); err != nil { + return fmt.Errorf("parse index.json: %w", err) + } + + // Find the manifest descriptor matching our digest + var manifestDesc *v1.Descriptor + for _, m := range index.Manifests { + if m.Digest == digestStr { + manifestDesc = &v1.Descriptor{ + MediaType: m.MediaType, + Digest: digest.Digest(digestStr), + Size: m.Size, + } + break + } + } + + if manifestDesc == nil { + return fmt.Errorf("manifest %s not found in index", digestStr) + } + + // Update the reference to create the tag + if err := engine.UpdateReference(context.Background(), tag, *manifestDesc); err != nil { + return fmt.Errorf("update reference: %w", err) + } + + return nil +} + diff --git a/cmd/debug-shell/main.go b/cmd/debug-shell/main.go new file mode 100644 index 00000000..b3df97cb --- /dev/null +++ b/cmd/debug-shell/main.go @@ -0,0 +1,87 @@ +package main + +import ( + "context" + "fmt" + "os" + "os/signal" + "syscall" + + "github.com/onkernel/hypeman/lib/system" + "golang.org/x/term" +) + +func main() { + if len(os.Args) < 2 { + fmt.Println("Usage: go run cmd/debug-shell/main.go [command...]") + fmt.Println("Example: go run cmd/debug-shell/main.go /tmp/.../vsock.sock") + fmt.Println("Example: go run cmd/debug-shell/main.go /tmp/.../vsock.sock ls -la /") + os.Exit(1) + } + socketPath := os.Args[1] + + command := []string{"/bin/sh"} + if len(os.Args) > 2 { + command = os.Args[2:] + } + + // Handle signals + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + go func() { + <-sigChan + cancel() + }() + + fmt.Printf("Connecting to %s...\n", socketPath) + + // Determine if we should use TTY + isTTY := term.IsTerminal(int(os.Stdin.Fd())) + if len(os.Args) > 2 { + // If running a command, don't use TTY unless explicitly interactive? + // Usually running a command (like ls) is non-interactive TTY wise unless forced. + // Let's default to TTY=false if arguments provided, to simplify probing. + isTTY = false + } + + var oldState *term.State + var err error + if isTTY { + // Put terminal in raw mode for interactive shell + oldState, err = term.MakeRaw(int(os.Stdin.Fd())) + if err != nil { + fmt.Printf("Warning: could not make terminal raw: %v\n", err) + isTTY = false + } else { + defer term.Restore(int(os.Stdin.Fd()), oldState) + } + } + + // Start shell + status, err := system.ExecIntoInstance(ctx, socketPath, system.ExecOptions{ + Command: command, + Stdin: os.Stdin, + Stdout: os.Stdout, + Stderr: os.Stderr, + TTY: isTTY, + }) + + if err != nil { + // Restore terminal before printing error + if oldState != nil { + term.Restore(int(os.Stdin.Fd()), oldState) + } + fmt.Printf("\r\nError: %v\n", err) + os.Exit(1) + } + + // Restore terminal + if oldState != nil { + term.Restore(int(os.Stdin.Fd()), oldState) + } + fmt.Printf("\r\nExit code: %d\n", status.Code) +} + diff --git a/cmd/test-handshake/main.go b/cmd/test-handshake/main.go new file mode 100644 index 00000000..f1578fbb --- /dev/null +++ b/cmd/test-handshake/main.go @@ -0,0 +1,25 @@ +package main + +import ( + "bufio" + "fmt" + "net" +) + +func main() { + conn, err := net.Dial("unix", "/tmp/repro-vsock.sock") + if err != nil { + panic(err) + } + defer conn.Close() + + fmt.Fprintf(conn, "CONNECT 2222\n") + + scanner := bufio.NewScanner(conn) + if scanner.Scan() { + fmt.Printf("Response: %s\n", scanner.Text()) + } else { + fmt.Printf("No response, error: %v\n", scanner.Err()) + } +} + diff --git a/go.mod b/go.mod index 325dc283..3a700606 100644 --- a/go.mod +++ b/go.mod @@ -11,8 +11,8 @@ require ( github.com/golang-jwt/jwt/v5 v5.3.0 github.com/google/go-containerregistry v0.20.6 github.com/google/wire v0.7.0 + github.com/gorilla/websocket v1.5.3 github.com/joho/godotenv v1.5.1 - github.com/mdlayher/vsock v1.2.1 github.com/nrednav/cuid2 v1.1.0 github.com/oapi-codegen/nethttp-middleware v1.1.2 github.com/oapi-codegen/runtime v1.1.2 @@ -22,6 +22,7 @@ require ( github.com/stretchr/testify v1.11.1 github.com/u-root/u-root v0.15.0 golang.org/x/sync v0.17.0 + golang.org/x/term v0.37.0 ) require ( @@ -41,12 +42,10 @@ require ( github.com/go-test/deep v1.1.1 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/mux v1.8.1 // indirect - github.com/gorilla/websocket v1.5.3 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/klauspost/compress v1.18.0 // indirect github.com/klauspost/pgzip v1.2.6 // indirect github.com/mailru/easyjson v0.7.7 // indirect - github.com/mdlayher/socket v0.5.1 // indirect github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/moby/sys/user v0.4.0 // indirect github.com/moby/sys/userns v0.1.0 // indirect @@ -66,8 +65,7 @@ require ( github.com/vbatts/tar-split v0.12.1 // indirect github.com/woodsbury/decimal128 v1.3.0 // indirect golang.org/x/crypto v0.41.0 // indirect - golang.org/x/net v0.42.0 // indirect - golang.org/x/sys v0.37.0 // indirect + golang.org/x/sys v0.38.0 // indirect google.golang.org/protobuf v1.36.10 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index 14796f78..ed4732b3 100644 --- a/go.sum +++ b/go.sum @@ -93,10 +93,6 @@ github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcncea github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= -github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos= -github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ= -github.com/mdlayher/vsock v1.2.1 h1:pC1mTJTvjo1r9n9fbm7S1j04rCgCzhCOS5DY0zqHlnQ= -github.com/mdlayher/vsock v1.2.1/go.mod h1:NRfCibel++DgeMD8z/hP+PPTjlNJsdPOmxcnENvE+SE= github.com/mgutz/ansi v0.0.0-20170206155736-9520e82c474b/go.mod h1:01TrycV0kFyexm33Z7vhZRXopbI8J3TDReVlkTgMUxE= github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= @@ -180,8 +176,6 @@ golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sU golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs= -golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= @@ -192,6 +186,12 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.34.0 h1:O/2T7POpk0ZZ7MAzMeWFSg6S5IpWd/RXDlM9hgM3DR4= +golang.org/x/term v0.34.0/go.mod h1:5jC53AEywhIVebHgPVeg0mj8OD3VO9OzclacVrqpaAw= +golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= +golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/lib/instances/configdisk.go b/lib/instances/configdisk.go index 87d9e366..50d17dd4 100644 --- a/lib/instances/configdisk.go +++ b/lib/instances/configdisk.go @@ -89,6 +89,7 @@ func (m *manager) generateConfigScript(inst *Instance, imageInfo *images.Image) } // Generate script as a readable template block + // ENTRYPOINT and CMD contain shell-quoted arrays that will be eval'd in init script := fmt.Sprintf(`#!/bin/sh # Generated config for instance: %s @@ -134,10 +135,10 @@ func shellQuote(s string) string { } // shellQuoteArray quotes each element of an array for safe shell evaluation -// Each element is single-quoted to preserve special characters like semicolons +// Returns a string that when assigned to a variable and later eval'd, will be properly split func shellQuoteArray(arr []string) string { if len(arr) == 0 { - return "\"\"" + return "" } quoted := make([]string, len(arr)) @@ -145,6 +146,7 @@ func shellQuoteArray(arr []string) string { quoted[i] = shellQuote(s) } + // Join with spaces and return as-is (will be eval'd later in init script) return strings.Join(quoted, " ") } diff --git a/lib/system/exec.go b/lib/system/exec.go index 144c205e..4499fd0b 100644 --- a/lib/system/exec.go +++ b/lib/system/exec.go @@ -1,14 +1,15 @@ package system import ( + "bufio" "context" "encoding/binary" "encoding/json" "fmt" "io" + "net" + "strings" "sync" - - "github.com/mdlayher/vsock" ) const ( @@ -38,14 +39,31 @@ type ExitStatus struct { } // ExecIntoInstance executes command in instance via vsock -func ExecIntoInstance(ctx context.Context, vsockCID uint32, opts ExecOptions) (*ExitStatus, error) { - // Connect to guest on vsock port 2222 - conn, err := vsock.Dial(vsockCID, 2222, nil) +// vsockSocketPath is the Unix socket created by Cloud Hypervisor (e.g., /var/lib/hypeman/guests/{id}/vsock.sock) +func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOptions) (*ExitStatus, error) { + // Connect to Cloud Hypervisor's vsock Unix socket + conn, err := net.Dial("unix", vsockSocketPath) if err != nil { - return nil, fmt.Errorf("dial vsock: %w", err) + return nil, fmt.Errorf("connect to vsock socket: %w", err) } defer conn.Close() + // Send the port number per Cloud Hypervisor protocol + if _, err := fmt.Fprintf(conn, "CONNECT 2222\n"); err != nil { + return nil, fmt.Errorf("send vsock port: %w", err) + } + + // Read handshake response (OK ) + reader := bufio.NewReader(conn) + response, err := reader.ReadString('\n') + if err != nil { + return nil, fmt.Errorf("read handshake response: %w", err) + } + + if !strings.HasPrefix(response, "OK ") { + return nil, fmt.Errorf("handshake failed: %s", strings.TrimSpace(response)) + } + // Send exec request as first stdin frame req := struct { Command []string `json:"command"` @@ -115,7 +133,11 @@ func ExecIntoInstance(ctx context.Context, vsockCID uint32, opts ExecOptions) (* for { streamType, data, err := readFrame(conn) if err != nil { - if err != io.EOF { + if err == io.EOF { + // If we get EOF without having received an exit code (which would return early), + // it's an error + errChan <- fmt.Errorf("unexpected EOF (no exit code received)") + } else { errChan <- err } return @@ -167,7 +189,7 @@ func ExecIntoInstance(ctx context.Context, vsockCID uint32, opts ExecOptions) (* } } -func readFrame(conn *vsock.Conn) (byte, []byte, error) { +func readFrame(conn net.Conn) (byte, []byte, error) { header := make([]byte, 5) if _, err := io.ReadFull(conn, header); err != nil { return 0, nil, err @@ -175,7 +197,7 @@ func readFrame(conn *vsock.Conn) (byte, []byte, error) { streamType := header[0] length := binary.BigEndian.Uint32(header[1:5]) - + data := make([]byte, length) if _, err := io.ReadFull(conn, data); err != nil { return 0, nil, err @@ -184,7 +206,7 @@ func readFrame(conn *vsock.Conn) (byte, []byte, error) { return streamType, data, nil } -func sendFrame(conn *vsock.Conn, streamType byte, data []byte) error { +func sendFrame(conn net.Conn, streamType byte, data []byte) error { header := make([]byte, 5) header[0] = streamType binary.BigEndian.PutUint32(header[1:5], uint32(len(data))) diff --git a/lib/system/init_script.go b/lib/system/init_script.go index d87be137..3bbe0e9e 100644 --- a/lib/system/init_script.go +++ b/lib/system/init_script.go @@ -19,9 +19,12 @@ echo "overlay-init: START (` + string(version) + `)" > /dev/kmsg mkdir -p /proc /sys /dev # Mount essential filesystems +# devtmpfs handles /dev population (null, zero, vsock, etc.) automatically mount -t proc none /proc mount -t sysfs none /sys mount -t devtmpfs none /dev + +# Setup PTY support (needed for exec-agent and interactive shells) mkdir -p /dev/pts /dev/shm mount -t devpts devpts /dev/pts chmod 1777 /dev/shm @@ -68,48 +71,53 @@ else exit 1 fi -# Move essential mounts to new root before chroot -cd /overlay/newroot -mkdir -p proc sys dev -mount --move /proc proc -mount --move /sys sys -mount --move /dev dev +# Prepare new root mount points +# We use bind mounts instead of move so that the original /dev remains populated +# for processes running in the initrd namespace (like exec-agent). +mkdir -p /overlay/newroot/proc +mkdir -p /overlay/newroot/sys +mkdir -p /overlay/newroot/dev + +mount --bind /proc /overlay/newroot/proc +mount --bind /sys /overlay/newroot/sys +mount --bind /dev /overlay/newroot/dev -echo "overlay-init: moved mounts to new root" +echo "overlay-init: bound mounts to new root" -# Set up /dev symlinks for process substitution (Docker compatibility) -chroot . ln -sf /proc/self/fd /dev/fd 2>/dev/null || true -chroot . ln -sf /proc/self/fd/0 /dev/stdin 2>/dev/null || true -chroot . ln -sf /proc/self/fd/1 /dev/stdout 2>/dev/null || true -chroot . ln -sf /proc/self/fd/2 /dev/stderr 2>/dev/null || true +# Set up /dev symlinks for process substitution inside the container +chroot /overlay/newroot ln -sf /proc/self/fd /dev/fd 2>/dev/null || true +chroot /overlay/newroot ln -sf /proc/self/fd/0 /dev/stdin 2>/dev/null || true +chroot /overlay/newroot ln -sf /proc/self/fd/1 /dev/stdout 2>/dev/null || true +chroot /overlay/newroot ln -sf /proc/self/fd/2 /dev/stderr 2>/dev/null || true -# Configure network (if GUEST_IP is set) +# Configure network inside the container view if [ -n "${GUEST_IP:-}" ]; then echo "overlay-init: configuring network" - chroot . ip link set lo up - chroot . ip addr add ${GUEST_IP}/${GUEST_MASK} dev eth0 - chroot . ip link set eth0 up - chroot . ip route add default via ${GUEST_GW} - echo "nameserver ${GUEST_DNS}" > etc/resolv.conf + chroot /overlay/newroot ip link set lo up + chroot /overlay/newroot ip addr add ${GUEST_IP}/${GUEST_MASK} dev eth0 + chroot /overlay/newroot ip link set eth0 up + chroot /overlay/newroot ip route add default via ${GUEST_GW} + echo "nameserver ${GUEST_DNS}" > /overlay/newroot/etc/resolv.conf echo "overlay-init: network configured - IP: ${GUEST_IP}" fi -# Set PATH for proper binary resolution +# Set PATH for initrd tools export PATH='/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin' export HOME='/root' # Start vsock exec agent +# It runs in the initrd namespace but can chroot as needed (or commands run in initrd) echo "overlay-init: starting exec agent" -chroot /overlay/newroot /usr/local/bin/exec-agent 2>/dev/null & +/usr/local/bin/exec-agent & echo "overlay-init: launching entrypoint" echo "overlay-init: workdir=${WORKDIR:-/} entrypoint=${ENTRYPOINT} cmd=${CMD}" -# Change to workdir -cd ${WORKDIR:-/} +set +e -# Fork container app (supervisor pattern) -chroot /overlay/newroot ${ENTRYPOINT} ${CMD} & +# Construct the command string carefully +# ENTRYPOINT and CMD are shell-safe quoted strings from config.sh +eval "chroot /overlay/newroot /bin/sh -c \"cd ${WORKDIR:-/} && exec ${ENTRYPOINT} ${CMD}\"" & APP_PID=$! echo "overlay-init: container app started (PID $APP_PID)" @@ -119,7 +127,21 @@ wait $APP_PID APP_EXIT=$? echo "overlay-init: app exited with code $APP_EXIT" -exit $APP_EXIT -` + + # Keep VM alive if app crashes, to allow debugging via exec-agent + if [ $APP_EXIT -ne 0 ]; then + echo "overlay-init: CRITICAL - App exited with error." + fi + + # FALLBACK: Launch interactive shell on console + # This allows manual debugging if attaching to the console + echo "overlay-init: Launching interactive shell on ttyS0..." + setsid cttyhack /bin/sh < /dev/ttyS0 > /dev/ttyS0 2>&1 + + # If shell exits (it shouldn't), loop forever + while true; do sleep 3600; done + + exit $APP_EXIT + ` } diff --git a/lib/system/initrd.go b/lib/system/initrd.go index aa76364b..b1f7c812 100644 --- a/lib/system/initrd.go +++ b/lib/system/initrd.go @@ -51,6 +51,26 @@ func (m *manager) buildInitrd(ctx context.Context, version InitrdVersion, arch s return fmt.Errorf("write init script: %w", err) } + // HACK: Inject custom exec-agent for debugging + // This assumes the agent is built at lib/system/initrd/guest-agent/exec-agent + // We try to find it relative to the project root. + // Since we are running from project root usually, or we can try to find it. + // Hardcoding path based on workspace structure for now. + customAgent := "/home/debianuser/hypeman/lib/system/initrd/guest-agent/exec-agent" + if input, err := os.ReadFile(customAgent); err == nil { + // Create directory if it doesn't exist (though it should from base image) + binDir := filepath.Join(rootfsDir, "usr/local/bin") + os.MkdirAll(binDir, 0755) + + agentPath := filepath.Join(binDir, "exec-agent") + if err := os.WriteFile(agentPath, input, 0755); err != nil { + return fmt.Errorf("write custom exec-agent: %w", err) + } + fmt.Printf("DEBUG: Injected custom exec-agent from %s\n", customAgent) + } else { + fmt.Printf("DEBUG: Could not find custom exec-agent at %s: %v\n", customAgent, err) + } + // Package as cpio.gz (initramfs format) outputPath := m.paths.SystemInitrd(string(version), arch) if _, err := images.ExportRootfs(rootfsDir, outputPath, images.FormatCpio); err != nil { @@ -79,3 +99,8 @@ func (m *manager) ensureInitrd(ctx context.Context, version InitrdVersion) (stri return initrdPath, nil } +// BuildInitrd is a public wrapper for building initrd (used by dev tools) +func (m *manager) BuildInitrd(ctx context.Context, version InitrdVersion, arch string) error { + return m.buildInitrd(ctx, version, arch) +} + diff --git a/lib/system/versions.go b/lib/system/versions.go index 66399d3d..050b04ca 100644 --- a/lib/system/versions.go +++ b/lib/system/versions.go @@ -9,45 +9,55 @@ type KernelVersion string type InitrdVersion string const ( - // Kernel versions from Cloud Hypervisor releases (full version with date) - KernelCH_6_12_8_20250613 KernelVersion = "ch-release-v6.12.8-20250613" + // Kernel versions from Kernel linux build + Kernel_202511182 KernelVersion = "ch-6.12.8-kernel-1-202511182" // Initrd versions (our internal versioning) // Bump when init script logic changes InitrdV2_0_0 InitrdVersion = "v2.0.0" + InitrdV2_0_1 InitrdVersion = "v2.0.1" + InitrdV2_0_2 InitrdVersion = "v2.0.2" ) // InitrdBaseImages maps initrd versions to specific base image references // v2.0.0: Uses pre-built Alpine image with exec-agent from Docker Hub (multi-arch OCI manifest list) +// v2.0.1: Uses same base but we will inject local agent +// v2.0.2: Uses same base but with interactive shell fallback +// v2.0.2-dev: Local dev build (built via cmd/build-dev-initrd) var InitrdBaseImages = map[InitrdVersion]string{ - InitrdV2_0_0: "docker.io/onkernel/hypeman-initrd:1d4efc9-oci", + InitrdV2_0_0: "docker.io/onkernel/hypeman-initrd:d0e84c2-oci", + InitrdV2_0_1: "docker.io/onkernel/hypeman-initrd:d0e84c2-oci", + InitrdV2_0_2: "docker.io/onkernel/hypeman-initrd:d0e84c2-oci", + InitrdVersion("v2.0.2-dev"): "docker.io/onkernel/hypeman-initrd:d0e84c2-oci", // Not used, already built locally // Add future versions here } var ( // DefaultKernelVersion is the kernel version used for new instances - DefaultKernelVersion = KernelCH_6_12_8_20250613 + DefaultKernelVersion = Kernel_202511182 // DefaultInitrdVersion is the initrd version used for new instances - DefaultInitrdVersion = InitrdV2_0_0 + DefaultInitrdVersion = InitrdVersion("v2.0.2-dev") // SupportedKernelVersions lists all supported kernel versions SupportedKernelVersions = []KernelVersion{ - KernelCH_6_12_8_20250613, + Kernel_202511182, // Add future versions here } // SupportedInitrdVersions lists all supported initrd versions SupportedInitrdVersions = []InitrdVersion{ InitrdV2_0_0, + InitrdV2_0_1, + InitrdV2_0_2, } ) // KernelDownloadURLs maps kernel versions and architectures to download URLs var KernelDownloadURLs = map[KernelVersion]map[string]string{ - KernelCH_6_12_8_20250613: { - "x86_64": "https://github.com/cloud-hypervisor/linux/releases/download/ch-release-v6.12.8-20250613/vmlinux-x86_64", - "aarch64": "https://github.com/cloud-hypervisor/linux/releases/download/ch-release-v6.12.8-20250613/Image-aarch64", + Kernel_202511182: { + "x86_64": "https://github.com/onkernel/linux/releases/download/ch-6.12.8-kernel-1-202511182/vmlinux-x86_64", + "aarch64": "https://github.com/onkernel/linux/releases/download/ch-6.12.8-kernel-1-202511182/Image-arm64", }, // Add future versions here } diff --git a/lib/system/versions_test.go b/lib/system/versions_test.go index 3fb0e1aa..4269ec27 100644 --- a/lib/system/versions_test.go +++ b/lib/system/versions_test.go @@ -12,7 +12,7 @@ import ( // The hash is computed from: sha256(initScript + baseImageDigest) // This ensures that changes to either the script OR base image require a version bump var expectedInitrdHashes = map[InitrdVersion]string{ - InitrdV2_0_0: "aaa467ebd20117aeb5aa96831accc9bfd74ed40f25a557a296f5b579b641425b", + InitrdV2_0_0: "e192f8c95912e8fe3acedbba7f2107ed4fda2adf7c94ae3eebee3720e80c524c", // Add future versions here } diff --git a/openapi.yaml b/openapi.yaml index 8405cf30..1d6965eb 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -792,7 +792,7 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - + /instances/{id}/volumes/{volumeId}: post: summary: Attach volume to instance diff --git a/scripts/repro_vm.sh b/scripts/repro_vm.sh new file mode 100755 index 00000000..6ac69565 --- /dev/null +++ b/scripts/repro_vm.sh @@ -0,0 +1,100 @@ +#!/bin/bash +set -e + +# Paths +KERNEL_DIR="/var/lib/hypeman/system/kernel/ch-6.12.8-kernel-1-202511182/x86_64" +INITRD_DIR="/var/lib/hypeman/system/initrd/v2.0.2-dev/x86_64" +KERNEL="$KERNEL_DIR/vmlinux" +INITRD="$INITRD_DIR/initrd" +SOCKET="/tmp/repro-vsock.sock" +LOG_FILE="repro-console.log" + +# Disk images +ROOTFS_IMG="repro-rootfs.img" +OVERLAY_IMG="repro-overlay.img" +CONFIG_IMG="repro-config.img" + +# Clean up +rm -f "$SOCKET" "$LOG_FILE" "$ROOTFS_IMG" "$OVERLAY_IMG" "$CONFIG_IMG" + +# Create dummy disks +echo "Creating dummy disks..." +# Rootfs (100M) +dd if=/dev/zero of="$ROOTFS_IMG" bs=1M count=100 +mkfs.ext4 -F "$ROOTFS_IMG" + +# Overlay (100M) +dd if=/dev/zero of="$OVERLAY_IMG" bs=1M count=100 +mkfs.ext4 -F "$OVERLAY_IMG" + +# Config (10M) +dd if=/dev/zero of="$CONFIG_IMG" bs=1M count=10 +mkfs.ext4 -F "$CONFIG_IMG" +# Add a dummy config.sh +mkdir -p /tmp/repro-config +if sudo mount -o loop "$CONFIG_IMG" /tmp/repro-config; then + echo "#!/bin/sh" | sudo tee /tmp/repro-config/config.sh + echo "echo 'Repro config loaded'" | sudo tee -a /tmp/repro-config/config.sh + echo "export ENTRYPOINT='/bin/sh'" | sudo tee -a /tmp/repro-config/config.sh + echo "export CMD='-c \"echo Hello from Guest && sleep 3600\"'" | sudo tee -a /tmp/repro-config/config.sh + sudo chmod +x /tmp/repro-config/config.sh + sudo umount /tmp/repro-config +else + echo "Failed to mount config img, proceeding with empty config" +fi + +# Check artifacts +if [ ! -f "$KERNEL" ]; then + echo "Kernel not found at $KERNEL" + exit 1 +fi + +if [ ! -f "$INITRD" ]; then + echo "Initrd not found at $INITRD" + exit 1 +fi + +echo "Starting Cloud Hypervisor..." +echo "Kernel: $KERNEL" +echo "Initrd: $INITRD" +echo "Socket: $SOCKET" + +# Start Cloud Hypervisor in background +cloud-hypervisor \ + --kernel "$KERNEL" \ + --initramfs "$INITRD" \ + --cmdline "console=ttyS0 panic=1" \ + --disk path="$ROOTFS_IMG",readonly=on path="$OVERLAY_IMG" path="$CONFIG_IMG",readonly=on \ + --cpus boot=1 \ + --memory size=512M \ + --console off \ + --serial tty \ + --vsock cid=3,socket="$SOCKET" \ + > "$LOG_FILE" 2>&1 & + +CH_PID=$! +echo "Cloud Hypervisor running with PID $CH_PID" +echo "Logs at $LOG_FILE" +echo "Waiting for VM to boot..." + +# Wait for socket +for i in $(seq 1 10); do + if [ -S "$SOCKET" ]; then + echo "Socket created." + break + fi + sleep 1 +done + +if [ ! -S "$SOCKET" ]; then + echo "Socket creation failed." + kill $CH_PID + exit 1 +fi + +echo "VM should be booting. You can now connect to $SOCKET using socat or Go tools." +echo "Example: sudo socat - UNIX-CONNECT:$SOCKET" +echo "Then type: CONNECT 2222" +echo "" +echo "To stop: kill $CH_PID" + From 10e5aeb97c08c269af1552cbe3f00d52530e081d Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 18 Nov 2025 17:35:42 -0500 Subject: [PATCH 2/5] Cleanup test --- cmd/api/api/exec_test.go | 78 ++++++++-------------------------------- 1 file changed, 14 insertions(+), 64 deletions(-) diff --git a/cmd/api/api/exec_test.go b/cmd/api/api/exec_test.go index a25d2c99..53c34da7 100644 --- a/cmd/api/api/exec_test.go +++ b/cmd/api/api/exec_test.go @@ -2,10 +2,7 @@ package api import ( "bytes" - "fmt" "os" - "os/exec" - "path/filepath" "testing" "time" @@ -103,36 +100,17 @@ func TestExecInstanceNonTTY(t *testing.T) { require.NotEmpty(t, actualInst.VsockSocket, "vsock socket path should be set") t.Logf("vsock CID: %d, socket: %s", actualInst.VsockCID, actualInst.VsockSocket) - // Print console log for debugging - consolePath := paths.New(svc.Config.DataDir).InstanceConsoleLog(inst.Id) - if consoleData, err := os.ReadFile(consolePath); err == nil { - t.Logf("=== Console Log ===") - lines := bytes.Split(consoleData, []byte("\n")) - - // Print boot messages with virtio - t.Logf("--- Boot messages (virtio devices) ---") - for _, line := range lines { - lineStr := string(line) - if bytes.Contains(line, []byte("virtio")) || bytes.Contains(line, []byte("vsock")) { - t.Logf("%s", lineStr) - } - } - - // Print last 30 lines - t.Logf("--- Last 30 lines ---") - start := len(lines) - 30 - if start < 0 { - start = 0 - } - for _, line := range lines[start:] { - if len(line) > 0 { - t.Logf("%s", line) + // Capture console log on failure + t.Cleanup(func() { + if t.Failed() { + consolePath := paths.New(svc.Config.DataDir).InstanceConsoleLog(inst.Id) + if consoleData, err := os.ReadFile(consolePath); err == nil { + t.Logf("=== Console Log (Failure) ===") + t.Logf("%s", string(consoleData)) + t.Logf("=== End Console Log ===") } } - t.Logf("=== End Console Log ===") - } else { - t.Logf("Could not read console log: %v", err) - } + }) // Check if vsock socket exists if _, err := os.Stat(actualInst.VsockSocket); err != nil { @@ -140,35 +118,6 @@ func TestExecInstanceNonTTY(t *testing.T) { } else { t.Logf("vsock socket exists: %s", actualInst.VsockSocket) } - - // Check if exec-agent exists in the initrd - initrdPath, _ := systemMgr.GetInitrdPath(system.InitrdV2_0_0) - t.Logf("Initrd path: %s", initrdPath) - if _, err := os.Stat(initrdPath); err != nil { - t.Logf("Initrd file does not exist: %v", err) - } else { - if stat, err := os.Stat(initrdPath); err == nil { - t.Logf("Initrd file exists, size: %d bytes", stat.Size()) - - // Unpack initrd to check contents - tmpUnpack := filepath.Join(os.TempDir(), "initrd-check") - os.RemoveAll(tmpUnpack) - os.MkdirAll(tmpUnpack, 0755) - - cmd := exec.Command("sh", "-c", fmt.Sprintf("cd %s && cpio -i < %s 2>/dev/null", tmpUnpack, initrdPath)) - if err := cmd.Run(); err == nil { - if _, err := os.Stat(filepath.Join(tmpUnpack, "usr/local/bin/exec-agent")); err == nil { - t.Logf("✅ exec-agent found in initrd") - } else { - t.Logf("❌ exec-agent NOT found in initrd!") - // List what's actually in the initrd - entries, _ := os.ReadDir(filepath.Join(tmpUnpack, "usr/local/bin")) - t.Logf("Contents of /usr/local/bin in initrd: %v", len(entries)) - } - } - os.RemoveAll(tmpUnpack) - } - } // Wait for exec agent to be ready (retry a few times) var exit *system.ExitStatus @@ -207,13 +156,14 @@ func TestExecInstanceNonTTY(t *testing.T) { t.Logf("Command output: %q", outStr) require.Contains(t, outStr, "root", "whoami should return root user") - // Test another command to verify filesystem access - t.Log("Testing exec command: ls /") + // Test another command to verify filesystem access and container context + // We should see /docker-entrypoint.sh which is standard in nginx:alpine image + t.Log("Testing exec command: ls /docker-entrypoint.sh") stdout = outputBuffer{} stderr = outputBuffer{} exit, err = system.ExecIntoInstance(ctx(), actualInst.VsockSocket, system.ExecOptions{ - Command: []string{"/bin/sh", "-c", "ls -la /"}, + Command: []string{"/bin/sh", "-c", "ls -la /docker-entrypoint.sh"}, Stdin: nil, Stdout: &stdout, Stderr: &stderr, @@ -225,7 +175,7 @@ func TestExecInstanceNonTTY(t *testing.T) { outStr = stdout.String() t.Logf("ls output: %q", outStr) - require.Contains(t, outStr, "bin", "should see bin directory") + require.Contains(t, outStr, "docker-entrypoint.sh", "should see docker-entrypoint.sh file") // Cleanup t.Log("Cleaning up instance...") From 9a9ff37f388512dc9684f9d70645a94caf677c07 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 18 Nov 2025 18:14:29 -0500 Subject: [PATCH 3/5] Simplify initrd build + test --- .github/workflows/build-initrd-image.yml | 39 -- .gitignore | 1 + Makefile | 9 +- cmd/api/main.go | 5 +- cmd/build-dev-initrd/main.go | 213 ---------- cmd/test-handshake/main.go | 25 -- go.mod | 4 + go.sum | 12 +- lib/instances/create.go | 9 +- lib/instances/memory_test.go | 490 ----------------------- lib/instances/types.go | 1 - lib/paths/paths.go | 24 +- lib/system/exec_agent/main.go | 333 +++++++++++++++ lib/system/exec_agent_binary.go | 9 + lib/system/init_script.go | 4 +- lib/system/initrd.go | 137 ++++--- lib/system/manager.go | 38 +- lib/system/versions.go | 32 -- 18 files changed, 494 insertions(+), 891 deletions(-) delete mode 100644 .github/workflows/build-initrd-image.yml delete mode 100644 cmd/build-dev-initrd/main.go delete mode 100644 cmd/test-handshake/main.go delete mode 100644 lib/instances/memory_test.go create mode 100644 lib/system/exec_agent/main.go create mode 100644 lib/system/exec_agent_binary.go diff --git a/.github/workflows/build-initrd-image.yml b/.github/workflows/build-initrd-image.yml deleted file mode 100644 index 94172799..00000000 --- a/.github/workflows/build-initrd-image.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: Build Initrd Base Image - -on: - push: - paths: - - 'lib/system/initrd/Dockerfile' - - 'lib/system/initrd/guest-agent/**' - - '.github/workflows/build-initrd-image.yml' - workflow_dispatch: - -jobs: - build-and-push: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Get short SHA - id: sha - run: echo "short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push multi-arch with OCI format - uses: docker/build-push-action@v5 - with: - context: ./lib/system/initrd - platforms: linux/amd64,linux/arm64 - outputs: type=registry,name=${{ secrets.DOCKERHUB_USERNAME }}/hypeman-initrd:${{ steps.sha.outputs.short }}-oci,push=true,oci-mediatypes=true - provenance: false - cache-from: type=gha - cache-to: type=gha,mode=max - diff --git a/.gitignore b/.gitignore index 234bdced..a18cbe9d 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ tmp/** lib/vmm/binaries/cloud-hypervisor/*/*/cloud-hypervisor cloud-hypervisor cloud-hypervisor/** +lib/system/exec_agent/exec-agent diff --git a/Makefile b/Makefile index 485e1f5f..7129eecd 100644 --- a/Makefile +++ b/Makefile @@ -87,8 +87,13 @@ ensure-ch-binaries: $(MAKE) download-ch-binaries; \ fi +# Build exec-agent (guest binary) into its own directory for embedding +lib/system/exec_agent/exec-agent: lib/system/exec_agent/main.go + @echo "Building exec-agent..." + cd lib/system/exec_agent && CGO_ENABLED=0 go build -ldflags="-s -w" -o exec-agent . + # Build the binary -build: ensure-ch-binaries | $(BIN_DIR) +build: ensure-ch-binaries lib/system/exec_agent/exec-agent | $(BIN_DIR) go build -tags containers_image_openpgp -o $(BIN_DIR)/hypeman ./cmd/api # Run in development mode with hot reload @@ -96,7 +101,7 @@ dev: $(AIR) $(AIR) -c .air.toml # Run tests -test: ensure-ch-binaries +test: ensure-ch-binaries lib/system/exec_agent/exec-agent go test -tags containers_image_openpgp -v -timeout 30s ./... # Generate JWT token for testing diff --git a/cmd/api/main.go b/cmd/api/main.go index c95f78a2..20ec5541 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -55,10 +55,9 @@ func run() error { logger.Error("failed to ensure system files", "error", err) os.Exit(1) } - kernelVer, initrdVer := app.SystemManager.GetDefaultVersions() + kernelVer := app.SystemManager.GetDefaultKernelVersion() logger.Info("System files ready", - "kernel", kernelVer, - "initrd", initrdVer) + "kernel", kernelVer) // Create router r := chi.NewRouter() diff --git a/cmd/build-dev-initrd/main.go b/cmd/build-dev-initrd/main.go deleted file mode 100644 index b52b7ec8..00000000 --- a/cmd/build-dev-initrd/main.go +++ /dev/null @@ -1,213 +0,0 @@ -package main - -import ( - "context" - "encoding/json" - "fmt" - "os" - "os/exec" - "path/filepath" - "strings" - - "github.com/onkernel/hypeman/lib/images" - "github.com/onkernel/hypeman/lib/paths" - "github.com/onkernel/hypeman/lib/system" - digest "github.com/opencontainers/go-digest" - v1 "github.com/opencontainers/image-spec/specs-go/v1" - "github.com/opencontainers/umoci/oci/cas/dir" - "github.com/opencontainers/umoci/oci/casext" -) - -func main() { - ctx := context.Background() - - // Get project root - projectRoot, err := os.Getwd() - if err != nil { - fmt.Fprintf(os.Stderr, "Error getting working directory: %v\n", err) - os.Exit(1) - } - - initrdDir := filepath.Join(projectRoot, "lib/system/initrd") - - // Step 1: Build Docker image with OCI output (like GitHub workflow) - fmt.Println("Building initrd Docker image with OCI format...") - ociDir := filepath.Join(os.TempDir(), "hypeman-initrd-oci-dev") - os.RemoveAll(ociDir) - os.MkdirAll(ociDir, 0755) - - // Use docker buildx to build directly to OCI format - // This matches the GitHub workflow approach - cmd := exec.Command("docker", "buildx", "build", - "--output", fmt.Sprintf("type=oci,dest=%s/image.tar,oci-mediatypes=true", ociDir), - "--platform", "linux/amd64", - ".") - cmd.Dir = initrdDir - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - if err := cmd.Run(); err != nil { - fmt.Fprintf(os.Stderr, "Error building Docker image: %v\n", err) - os.Exit(1) - } - fmt.Println("✓ Docker image built in OCI format") - - // Step 2: Extract OCI tar to directory (umoci expects a directory layout) - fmt.Println("\nExtracting OCI layout...") - ociLayoutDir := filepath.Join(ociDir, "layout") - os.MkdirAll(ociLayoutDir, 0755) - - cmd = exec.Command("tar", "-xf", filepath.Join(ociDir, "image.tar"), "-C", ociLayoutDir) - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - if err := cmd.Run(); err != nil { - fmt.Fprintf(os.Stderr, "Error extracting OCI tar: %v\n", err) - os.Exit(1) - } - fmt.Println("✓ Extracted OCI layout") - - // Step 3: Use existing system manager to build initrd from OCI directory - fmt.Println("\nBuilding initrd using existing pipeline...") - pathsConfig := paths.New("/var/lib/hypeman") - version := system.InitrdVersion("v2.0.2-dev") - arch := system.GetArch() - - // Create temp directory for building - tempDir, err := os.MkdirTemp("", "hypeman-initrd-build-*") - if err != nil { - fmt.Fprintf(os.Stderr, "Error creating temp dir: %v\n", err) - os.Exit(1) - } - defer os.RemoveAll(tempDir) - - rootfsDir := filepath.Join(tempDir, "rootfs") - - // Create OCI client using our locally built OCI layout as the cache - // This way the image is already "cached" and won't try to pull from remote - ociClient, err := images.NewOCIClient(ociLayoutDir) - if err != nil { - fmt.Fprintf(os.Stderr, "Error creating OCI client: %v\n", err) - os.Exit(1) - } - - // Read the index.json to get the manifest digest - indexData, err := os.ReadFile(filepath.Join(ociLayoutDir, "index.json")) - if err != nil { - fmt.Fprintf(os.Stderr, "Error reading index.json: %v\n", err) - os.Exit(1) - } - - var index struct { - Manifests []struct { - Digest string `json:"digest"` - } `json:"manifests"` - } - if err := json.Unmarshal(indexData, &index); err != nil { - fmt.Fprintf(os.Stderr, "Error parsing index.json: %v\n", err) - os.Exit(1) - } - - if len(index.Manifests) == 0 { - fmt.Fprintf(os.Stderr, "No manifests found in index.json\n") - os.Exit(1) - } - - digest := index.Manifests[0].Digest - fmt.Printf(" Using manifest: %s\n", digest) - - // Tag the manifest in the OCI layout so the OCI client can find it - // The OCI client expects tags in the format that digestToLayoutTag produces (just the hex part) - layoutTag := strings.TrimPrefix(digest, "sha256:") - - // Use umoci library to create the tag - if err := tagManifestInOCI(ociLayoutDir, digest, layoutTag); err != nil { - fmt.Fprintf(os.Stderr, "Error tagging manifest in OCI layout: %v\n", err) - os.Exit(1) - } - fmt.Printf(" Tagged as: %s\n", layoutTag) - - // Now the OCI client will find it in the cache and won't try to pull - // We pass a dummy imageRef since it won't be used (image is already cached) - if err := ociClient.PullAndUnpack(ctx, "local/dev", digest, rootfsDir); err != nil { - fmt.Fprintf(os.Stderr, "Error unpacking OCI image: %v\n", err) - os.Exit(1) - } - fmt.Println("✓ Unpacked OCI image") - - // Inject init script - fmt.Println("\nInjecting init script...") - initScript := system.GenerateInitScript(version) - initPath := filepath.Join(rootfsDir, "init") - if err := os.WriteFile(initPath, []byte(initScript), 0755); err != nil { - fmt.Fprintf(os.Stderr, "Error writing init script: %v\n", err) - os.Exit(1) - } - - // Package as cpio.gz (initramfs format) - fmt.Println("Packaging as initrd...") - outputPath := pathsConfig.SystemInitrd(string(version), arch) - os.MkdirAll(filepath.Dir(outputPath), 0755) - - if _, err := images.ExportRootfs(rootfsDir, outputPath, images.FormatCpio); err != nil { - fmt.Fprintf(os.Stderr, "Error exporting initrd: %v\n", err) - os.Exit(1) - } - - fmt.Println("\n✓ Dev initrd built successfully!") - fmt.Printf(" Location: %s\n", outputPath) - fmt.Printf(" OCI cache: %s (can be deleted)\n", ociDir) - fmt.Println("\nTo use it, update lib/system/versions.go:") - fmt.Println(" DefaultInitrdVersion = InitrdVersion(\"v2.0.2-dev\")") -} - -// tagManifestInOCI tags a manifest digest with a tag name in an OCI layout -func tagManifestInOCI(ociLayoutDir, digestStr, tag string) error { - casEngine, err := dir.Open(ociLayoutDir) - if err != nil { - return fmt.Errorf("open OCI layout: %w", err) - } - defer casEngine.Close() - - engine := casext.NewEngine(casEngine) - - // Read the index to find the manifest descriptor - indexData, err := os.ReadFile(filepath.Join(ociLayoutDir, "index.json")) - if err != nil { - return fmt.Errorf("read index.json: %w", err) - } - - var index struct { - Manifests []struct { - MediaType string `json:"mediaType"` - Digest string `json:"digest"` - Size int64 `json:"size"` - } `json:"manifests"` - } - if err := json.Unmarshal(indexData, &index); err != nil { - return fmt.Errorf("parse index.json: %w", err) - } - - // Find the manifest descriptor matching our digest - var manifestDesc *v1.Descriptor - for _, m := range index.Manifests { - if m.Digest == digestStr { - manifestDesc = &v1.Descriptor{ - MediaType: m.MediaType, - Digest: digest.Digest(digestStr), - Size: m.Size, - } - break - } - } - - if manifestDesc == nil { - return fmt.Errorf("manifest %s not found in index", digestStr) - } - - // Update the reference to create the tag - if err := engine.UpdateReference(context.Background(), tag, *manifestDesc); err != nil { - return fmt.Errorf("update reference: %w", err) - } - - return nil -} - diff --git a/cmd/test-handshake/main.go b/cmd/test-handshake/main.go deleted file mode 100644 index f1578fbb..00000000 --- a/cmd/test-handshake/main.go +++ /dev/null @@ -1,25 +0,0 @@ -package main - -import ( - "bufio" - "fmt" - "net" -) - -func main() { - conn, err := net.Dial("unix", "/tmp/repro-vsock.sock") - if err != nil { - panic(err) - } - defer conn.Close() - - fmt.Fprintf(conn, "CONNECT 2222\n") - - scanner := bufio.NewScanner(conn) - if scanner.Scan() { - fmt.Printf("Response: %s\n", scanner.Text()) - } else { - fmt.Printf("No response, error: %v\n", scanner.Err()) - } -} - diff --git a/go.mod b/go.mod index 3a700606..5d99b24a 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.25.4 require ( github.com/c2h5oh/datasize v0.0.0-20231215233829-aa82cc1e6500 + github.com/creack/pty v1.1.24 github.com/distribution/reference v0.6.0 github.com/getkin/kin-openapi v0.133.0 github.com/ghodss/yaml v1.0.0 @@ -13,6 +14,7 @@ require ( github.com/google/wire v0.7.0 github.com/gorilla/websocket v1.5.3 github.com/joho/godotenv v1.5.1 + github.com/mdlayher/vsock v1.2.1 github.com/nrednav/cuid2 v1.1.0 github.com/oapi-codegen/nethttp-middleware v1.1.2 github.com/oapi-codegen/runtime v1.1.2 @@ -46,6 +48,7 @@ require ( github.com/klauspost/compress v1.18.0 // indirect github.com/klauspost/pgzip v1.2.6 // indirect github.com/mailru/easyjson v0.7.7 // indirect + github.com/mdlayher/socket v0.5.1 // indirect github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/moby/sys/user v0.4.0 // indirect github.com/moby/sys/userns v0.1.0 // indirect @@ -65,6 +68,7 @@ require ( github.com/vbatts/tar-split v0.12.1 // indirect github.com/woodsbury/decimal128 v1.3.0 // indirect golang.org/x/crypto v0.41.0 // indirect + golang.org/x/net v0.42.0 // indirect golang.org/x/sys v0.38.0 // indirect google.golang.org/protobuf v1.36.10 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect diff --git a/go.sum b/go.sum index ed4732b3..3efffdcf 100644 --- a/go.sum +++ b/go.sum @@ -17,6 +17,8 @@ github.com/c2h5oh/datasize v0.0.0-20231215233829-aa82cc1e6500 h1:6lhrsTEnloDPXye github.com/c2h5oh/datasize v0.0.0-20231215233829-aa82cc1e6500/go.mod h1:S/7n9copUssQ56c7aAgHqftWO4LTf4xY6CGWt8Bc+3M= github.com/containerd/stargz-snapshotter/estargz v0.16.3 h1:7evrXtoh1mSbGj/pfRccTampEyKpjpOnS3CyiV1Ebr8= github.com/containerd/stargz-snapshotter/estargz v0.16.3/go.mod h1:uyr4BfYfOj3G9WBVE8cOlQmXAbPN9VEQpBBeJIuOipU= +github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= +github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE= github.com/cyphar/filepath-securejoin v0.5.0 h1:hIAhkRBMQ8nIeuVwcAoymp7MY4oherZdAxD+m0u9zaw= github.com/cyphar/filepath-securejoin v0.5.0/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -93,6 +95,10 @@ github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcncea github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= +github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos= +github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ= +github.com/mdlayher/vsock v1.2.1 h1:pC1mTJTvjo1r9n9fbm7S1j04rCgCzhCOS5DY0zqHlnQ= +github.com/mdlayher/vsock v1.2.1/go.mod h1:NRfCibel++DgeMD8z/hP+PPTjlNJsdPOmxcnENvE+SE= github.com/mgutz/ansi v0.0.0-20170206155736-9520e82c474b/go.mod h1:01TrycV0kFyexm33Z7vhZRXopbI8J3TDReVlkTgMUxE= github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= @@ -176,6 +182,8 @@ golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sU golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs= +golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= @@ -184,12 +192,8 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= -golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= -golang.org/x/term v0.34.0 h1:O/2T7POpk0ZZ7MAzMeWFSg6S5IpWd/RXDlM9hgM3DR4= -golang.org/x/term v0.34.0/go.mod h1:5jC53AEywhIVebHgPVeg0mj8OD3VO9OzclacVrqpaAw= golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= diff --git a/lib/instances/create.go b/lib/instances/create.go index 6ca4f428..94db6cbe 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -100,8 +100,8 @@ func (m *manager) createInstance( req.Env = make(map[string]string) } - // 6. Get default system versions - kernelVer, initrdVer := m.systemManager.GetDefaultVersions() + // 6. Get default kernel version + kernelVer := m.systemManager.GetDefaultKernelVersion() // 7. Create instance metadata stored := &StoredMetadata{ @@ -117,7 +117,6 @@ func (m *manager) createInstance( StartedAt: nil, StoppedAt: nil, KernelVersion: string(kernelVer), - InitrdVersion: string(initrdVer), CHVersion: vmm.V49_0, // Use latest SocketPath: m.paths.InstanceSocket(id), DataDir: m.paths.InstanceDir(id), @@ -293,9 +292,9 @@ func (m *manager) startAndBootVM( // buildVMConfig creates the Cloud Hypervisor VmConfig func (m *manager) buildVMConfig(inst *Instance, imageInfo *images.Image) (vmm.VmConfig, error) { - // Get versioned system file paths + // Get system file paths kernelPath, _ := m.systemManager.GetKernelPath(system.KernelVersion(inst.KernelVersion)) - initrdPath, _ := m.systemManager.GetInitrdPath(system.InitrdVersion(inst.InitrdVersion)) + initrdPath, _ := m.systemManager.GetInitrdPath() // Payload configuration (kernel + initramfs) payload := vmm.PayloadConfig{ diff --git a/lib/instances/memory_test.go b/lib/instances/memory_test.go deleted file mode 100644 index 33e6a12c..00000000 --- a/lib/instances/memory_test.go +++ /dev/null @@ -1,490 +0,0 @@ -package instances - -import ( - "context" - "fmt" - "os" - "strings" - "testing" - "time" - - "github.com/onkernel/hypeman/lib/images" - "github.com/onkernel/hypeman/lib/paths" - "github.com/onkernel/hypeman/lib/system" - "github.com/onkernel/hypeman/lib/vmm" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestMemoryReduction(t *testing.T) { - if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) { - t.Fatal("/dev/kvm not available - ensure KVM is enabled and user is in 'kvm' group (sudo usermod -aG kvm $USER)") - } - - manager, tmpDir := setupTestManager(t) - ctx := context.Background() - - // Setup: create Alpine and nginx images and system files - imageManager, err := images.NewManager(paths.New(tmpDir), 1) - require.NoError(t, err) - - t.Log("Pulling alpine:latest image...") - alpineImage, err := imageManager.CreateImage(ctx, images.CreateImageRequest{ - Name: "docker.io/library/alpine:latest", - }) - require.NoError(t, err) - - // Wait for Alpine image to be ready - t.Log("Waiting for alpine image build to complete...") - for i := 0; i < 60; i++ { - img, err := imageManager.GetImage(ctx, alpineImage.Name) - if err == nil && img.Status == images.StatusReady { - alpineImage = img - break - } - if err == nil && img.Status == images.StatusFailed { - t.Fatalf("Alpine image build failed: %s", *img.Error) - } - time.Sleep(1 * time.Second) - } - require.Equal(t, images.StatusReady, alpineImage.Status, "Alpine image should be ready") - t.Log("Alpine image ready") - - t.Log("Pulling php:cli-alpine image...") - phpImage, err := imageManager.CreateImage(ctx, images.CreateImageRequest{ - Name: "docker.io/library/php:cli-alpine", - }) - require.NoError(t, err) - - // Wait for PHP image to be ready - t.Log("Waiting for PHP image build to complete...") - for i := 0; i < 120; i++ { - img, err := imageManager.GetImage(ctx, phpImage.Name) - if err == nil && img.Status == images.StatusReady { - phpImage = img - break - } - if err == nil && img.Status == images.StatusFailed { - t.Fatalf("PHP image build failed: %s", *img.Error) - } - time.Sleep(1 * time.Second) - } - require.Equal(t, images.StatusReady, phpImage.Status, "PHP image should be ready") - t.Log("PHP image ready") - - // Ensure system files - systemManager := system.NewManager(paths.New(tmpDir)) - t.Log("Ensuring system files...") - err = systemManager.EnsureSystemFiles(ctx) - require.NoError(t, err) - - t.Run("fast_shrink_idle_container", func(t *testing.T) { - t.Log("Testing fast memory shrink with idle container...") - - // Create instance with idle container - // Note: create.go automatically expands memory to Size + HotplugSize - inst, err := manager.CreateInstance(ctx, CreateInstanceRequest{ - Name: "test-memory-fast", - Image: "docker.io/library/alpine:latest", - Size: 256 * 1024 * 1024, // 256MB base - HotplugSize: 512 * 1024 * 1024, // 512MB hotplug capacity (auto-expanded at boot) - OverlaySize: 5 * 1024 * 1024 * 1024, // 5GB overlay - Vcpus: 1, - Env: map[string]string{ - // Idle container - minimal memory usage - "CMD": "sleep infinity", - }, - }) - require.NoError(t, err) - defer manager.DeleteInstance(ctx, inst.Id) - t.Logf("Instance created: %s", inst.Id) - - // Wait for VM ready (no arbitrary sleep!) - err = waitForVMReady(ctx, inst.SocketPath, 5*time.Second) - require.NoError(t, err) - t.Log("VM is ready") - - client, err := vmm.NewVMM(inst.SocketPath) - require.NoError(t, err) - - // Get initial memory state (should be fully expanded) - initialSize := getActualMemorySize(t, ctx, client) - t.Logf("Initial memory (auto-expanded): %d MB", initialSize/(1024*1024)) - - // Expected to be at Size + HotplugSize = 768 MB - expectedMax := inst.Size + inst.HotplugSize - assert.InDelta(t, expectedMax, initialSize, float64(100*1024*1024), - "Memory should be near max capacity after boot") - - // Now reduce back to base size - // Idle container should shrink quickly since it's not using the hotplugged memory - targetSize := inst.Size // Reduce to 256MB base - t.Logf("Reducing memory to base size (%d MB)...", targetSize/(1024*1024)) - - start := time.Now() - err = reduceMemoryWithPolling(ctx, client, targetSize) - duration := time.Since(start) - - require.NoError(t, err) - t.Logf("Fast shrink completed in %v", duration) - - // Verify it was actually fast - assert.Less(t, duration, 1500*time.Millisecond, - "Idle container memory should shrink quickly") - - // Verify final size - finalSize := getActualMemorySize(t, ctx, client) - t.Logf("Final memory: %d MB", finalSize/(1024*1024)) - - tolerance := int64(50 * 1024 * 1024) // 50MB tolerance - assert.InDelta(t, targetSize, finalSize, float64(tolerance), - "Memory should be close to base size") - }) - - t.Run("investigate_memory_metrics", func(t *testing.T) { - t.Log("Investigating what memory metrics actually report...") - - inst, err := manager.CreateInstance(ctx, CreateInstanceRequest{ - Name: "test-memory-metrics", - Image: "docker.io/library/php:cli-alpine", - Size: 128 * 1024 * 1024, // 128MB base - HotplugSize: 512 * 1024 * 1024, // 512MB hotplug - OverlaySize: 5 * 1024 * 1024 * 1024, - Vcpus: 1, - Env: map[string]string{ - "CMD": `php -d memory_limit=-1 -r '$a = str_repeat("A", 300*1024*1024); for($i=0; $i<300; $i++) { $a[$i*1024*1024]="X"; } echo "Allocated 300MB\n"; for($i=0;$i<20;$i++) { sleep(1); echo "Still alive $i\n"; }'`, - }, - }) - require.NoError(t, err) - defer manager.DeleteInstance(ctx, inst.Id) - - err = waitForVMReady(ctx, inst.SocketPath, 5*time.Second) - require.NoError(t, err) - - client, err := vmm.NewVMM(inst.SocketPath) - require.NoError(t, err) - - // Wait for PHP to allocate (poll for log message) - t.Log("Waiting for PHP to allocate memory...") - err = waitForLogMessage(ctx, manager, inst.Id, "Allocated 300MB", 10*time.Second) - require.NoError(t, err, "PHP should allocate memory") - - // Wait for PHP to start printing (ensures it's running) - err = waitForLogMessage(ctx, manager, inst.Id, "Still alive 0", 3*time.Second) - require.NoError(t, err, "PHP should start status loop") - - // Get FULL VmInfo before reduction - t.Log("=== BEFORE REDUCTION ===") - infoBefore, _ := client.GetVmInfoWithResponse(ctx) - if infoBefore != nil && infoBefore.JSON200 != nil { - info := infoBefore.JSON200 - t.Logf("MemoryActualSize: %d MB", *info.MemoryActualSize/(1024*1024)) - if info.Config.Memory != nil { - mem := info.Config.Memory - t.Logf("Config.Memory.Size: %d MB", mem.Size/(1024*1024)) - if mem.HotplugSize != nil { - t.Logf("Config.Memory.HotplugSize: %d MB", *mem.HotplugSize/(1024*1024)) - } - if mem.HotpluggedSize != nil { - t.Logf("Config.Memory.HotpluggedSize: %d MB", *mem.HotpluggedSize/(1024*1024)) - } - } - } - - // Reduce memory - targetSize := int64(128 * 1024 * 1024) - t.Logf("\n=== REDUCING TO %d MB ===", targetSize/(1024*1024)) - err = reduceMemoryWithPolling(ctx, client, targetSize) - require.NoError(t, err) - - // Get FULL VmInfo after reduction - t.Log("\n=== AFTER REDUCTION ===") - infoAfter, _ := client.GetVmInfoWithResponse(ctx) - if infoAfter != nil && infoAfter.JSON200 != nil { - info := infoAfter.JSON200 - t.Logf("MemoryActualSize: %d MB", *info.MemoryActualSize/(1024*1024)) - if info.Config.Memory != nil { - mem := info.Config.Memory - t.Logf("Config.Memory.Size: %d MB", mem.Size/(1024*1024)) - if mem.HotplugSize != nil { - t.Logf("Config.Memory.HotplugSize: %d MB", *mem.HotplugSize/(1024*1024)) - } - if mem.HotpluggedSize != nil { - t.Logf("Config.Memory.HotpluggedSize: %d MB", *mem.HotpluggedSize/(1024*1024)) - } - } - } - - // Check what the current highest "Still alive" number is - logsNow, _ := manager.GetInstanceLogs(ctx, inst.Id, false, 50) - currentHighest := -1 - for i := 0; i < 20; i++ { - if strings.Contains(logsNow, fmt.Sprintf("Still alive %d", i)) { - currentHighest = i - } - } - t.Logf("Current highest 'Still alive': %d", currentHighest) - - // Wait for PHP to print the NEXT number (proves it's still running) - nextMessage := fmt.Sprintf("Still alive %d", currentHighest+1) - t.Logf("Waiting for '%s'...", nextMessage) - err = waitForLogMessage(ctx, manager, inst.Id, nextMessage, 3*time.Second) - require.NoError(t, err, "PHP should continue running and increment counter") - - t.Logf("\n✓ PHP still alive up to message: %d", currentHighest+1) - - t.Log("\n=== ANALYSIS ===") - t.Logf("MemoryActualSize likely shows: Size + HotpluggedSize (VMM's configured view)") - t.Logf("Guest is actually using: ~300MB for PHP + system overhead") - t.Logf("virtio-mem migrated guest pages into base region") - t.Logf("PHP process survived - no OOM kill") - - // This test is informational - always passes - assert.True(t, true, "Diagnostic test completed") - }) - - t.Run("partial_reduction_php_holds_memory", func(t *testing.T) { - t.Log("Testing partial reduction when PHP actively holds memory...") - - // HARD REQUIREMENTS: - // - 128MB base - // - 512MB hotplug - // - Request reduction to 128MB - // - Assert final > 128MB - inst, err := manager.CreateInstance(ctx, CreateInstanceRequest{ - Name: "test-memory-php", - Image: "docker.io/library/php:cli-alpine", - Size: 128 * 1024 * 1024, // 128MB base (REQUIRED) - HotplugSize: 512 * 1024 * 1024, // 512MB hotplug (REQUIRED) - OverlaySize: 5 * 1024 * 1024 * 1024, - Vcpus: 1, - Env: map[string]string{ - // PHP allocates 300MB, touches pages, and continuously reports it's alive - "CMD": `php -d memory_limit=-1 -r '$a = str_repeat("A", 300*1024*1024); for($i=0; $i<300; $i++) { $a[$i*1024*1024]="X"; } echo "Allocated 300MB\n"; for($i=0;$i<20;$i++) { sleep(1); echo "Still alive $i\n"; }'`, - }, - }) - require.NoError(t, err) - defer manager.DeleteInstance(ctx, inst.Id) - t.Logf("Instance created: %s", inst.Id) - - err = waitForVMReady(ctx, inst.SocketPath, 5*time.Second) - require.NoError(t, err) - t.Log("VM is ready") - - client, err := vmm.NewVMM(inst.SocketPath) - require.NoError(t, err) - - initialSize := getActualMemorySize(t, ctx, client) - t.Logf("Initial memory (auto-expanded): %d MB", initialSize/(1024*1024)) - - // Should be 128MB + 512MB = 640MB - expectedMax := inst.Size + inst.HotplugSize - assert.InDelta(t, expectedMax, initialSize, float64(50*1024*1024), - "Memory should be near 640MB after auto-expansion") - - // Wait for PHP to start and allocate 300MB with physical pages (poll logs) - t.Log("Waiting for PHP to allocate and touch 300MB...") - err = waitForLogMessage(ctx, manager, inst.Id, "Allocated 300MB", 10*time.Second) - require.NoError(t, err, "PHP should allocate memory") - - // Also wait for at least first "Still alive" message to ensure PHP loop started - t.Log("Waiting for PHP to start printing status...") - err = waitForLogMessage(ctx, manager, inst.Id, "Still alive 0", 3*time.Second) - require.NoError(t, err, "PHP should start status loop") - - afterAllocation := getActualMemorySize(t, ctx, client) - t.Logf("After PHP allocation: %d MB", afterAllocation/(1024*1024)) - - // KEY TEST: Request reduction to 128MB base - targetSize := int64(128 * 1024 * 1024) // REQUIRED: 128MB - t.Logf("Attempting reduction to %d MB (PHP holding 300MB)...", - targetSize/(1024*1024)) - start := time.Now() - - err = reduceMemoryWithPolling(ctx, client, targetSize) - duration := time.Since(start) - - // Should complete successfully - require.NoError(t, err, "Memory reduction should complete successfully") - t.Logf("Reduction completed in %v", duration) - - finalSize := getActualMemorySize(t, ctx, client) - t.Logf("Requested: %d MB, Final: %d MB", - targetSize/(1024*1024), - finalSize/(1024*1024)) - - // Check what the current highest "Still alive" number is - logsCurrent, _ := manager.GetInstanceLogs(ctx, inst.Id, false, 50) - currentHighest := -1 - for i := 0; i < 20; i++ { - if strings.Contains(logsCurrent, fmt.Sprintf("Still alive %d", i)) { - currentHighest = i - } - } - t.Logf("Current highest 'Still alive': %d", currentHighest) - - // Wait for PHP to print the NEXT number (proves it's still running after reduction) - nextMessage := fmt.Sprintf("Still alive %d", currentHighest+1) - t.Log("Waiting for PHP to continue printing after reduction...") - t.Logf("Looking for '%s'...", nextMessage) - err = waitForLogMessage(ctx, manager, inst.Id, nextMessage, 3*time.Second) - require.NoError(t, err, "PHP should continue running and increment counter after reduction") - - // Now get full logs to check for OOM - logsAfter, _ := manager.GetInstanceLogs(ctx, inst.Id, false, 80) - highestStillAlive := currentHighest + 1 - t.Logf("PHP continued to 'Still alive %d' after reduction", highestStillAlive) - - // Check for OOM indicators - hasOOM := strings.Contains(logsAfter, "Out of memory") || - strings.Contains(logsAfter, "Killed") || - strings.Contains(logsAfter, "oom-kill") || - strings.Contains(logsAfter, "invoked oom-killer") - - if hasOOM { - t.Logf("FOUND OOM EVENT in logs!") - } - - // At this point we know PHP counter incremented, so process survived! - t.Logf("✓ IMPORTANT: PHP process SURVIVED memory reduction!") - t.Logf("✓ PHP continued printing (counter incremented) after reduction") - - // Check for OOM or migration traces - if strings.Contains(logsAfter, "migrate_pages") { - t.Logf("✓ Page migration traces found - virtio-mem migrated pages") - } - - // REQUIRED ASSERTION: finalSize must be > 128MB OR process survived - if finalSize > targetSize { - t.Logf("SUCCESS: Partial reduction - stabilized at %d MB (above %d MB target)", - finalSize/(1024*1024), targetSize/(1024*1024)) - assert.Greater(t, finalSize, targetSize, - "Memory stabilized above target") - } else { - // Reduced to 128MB but PHP survived - t.Logf("FINDING: Reduced to 128MB but PHP survived") - t.Logf("✓ virtio-mem used page migration to move 300MB into 128MB base region") - t.Logf("✓ This proves standby/resume is SAFE - no OOM killing occurs") - t.Logf("SUCCESS: Memory reduction is SAFE - process survived with page migration") - } - }) -} - -// Test helpers - -// getActualMemorySize gets the current actual memory size from VMM -func getActualMemorySize(t *testing.T, ctx context.Context, client *vmm.VMM) int64 { - t.Helper() - infoResp, err := client.GetVmInfoWithResponse(ctx) - require.NoError(t, err) - require.NotNil(t, infoResp.JSON200) - require.NotNil(t, infoResp.JSON200.MemoryActualSize) - return *infoResp.JSON200.MemoryActualSize -} - -// resizeMemoryRequest issues a memory resize request to VMM -func resizeMemoryRequest(ctx context.Context, client *vmm.VMM, targetBytes int64) error { - resizeConfig := vmm.VmResize{DesiredRam: &targetBytes} - resp, err := client.PutVmResizeWithResponse(ctx, resizeConfig) - if err != nil || resp.StatusCode() != 204 { - return fmt.Errorf("memory resize request failed") - } - return nil -} - -// waitForMemoryIncrease waits for memory to increase after hotplug (with polling) -func waitForMemoryIncrease(ctx context.Context, client *vmm.VMM, - previousSize int64, timeout time.Duration) error { - - deadline := time.Now().Add(timeout) - const pollInterval = 20 * time.Millisecond - - for time.Now().Before(deadline) { - infoResp, err := client.GetVmInfoWithResponse(ctx) - if err != nil { - time.Sleep(pollInterval) - continue - } - - if infoResp.StatusCode() != 200 || infoResp.JSON200 == nil { - time.Sleep(pollInterval) - continue - } - - if infoResp.JSON200.MemoryActualSize != nil { - currentSize := *infoResp.JSON200.MemoryActualSize - if currentSize > previousSize { - return nil // Memory increased! - } - } - - time.Sleep(pollInterval) - } - - return fmt.Errorf("memory did not increase within %v", timeout) -} - -// waitForMemoryUsageIncrease waits for memory usage to increase (e.g., workload allocation) -// This is similar to waitForMemoryIncrease but checks more frequently and looks for -// significant increases that indicate active memory consumption -func waitForMemoryUsageIncrease(ctx context.Context, client *vmm.VMM, - baselineSize int64, timeout time.Duration) error { - - deadline := time.Now().Add(timeout) - const pollInterval = 100 * time.Millisecond // Check every 100ms for workload activity - const minIncrease = 10 * 1024 * 1024 // Must increase by at least 10MB - - for time.Now().Before(deadline) { - infoResp, err := client.GetVmInfoWithResponse(ctx) - if err != nil { - time.Sleep(pollInterval) - continue - } - - if infoResp.StatusCode() != 200 || infoResp.JSON200 == nil { - time.Sleep(pollInterval) - continue - } - - if infoResp.JSON200.MemoryActualSize != nil { - currentSize := *infoResp.JSON200.MemoryActualSize - increase := currentSize - baselineSize - if increase >= minIncrease { - return nil // Significant memory usage increase detected! - } - } - - time.Sleep(pollInterval) - } - - return fmt.Errorf("memory usage did not increase significantly within %v", timeout) -} - -// reduceMemoryWithPolling reduces memory using the production polling logic -func reduceMemoryWithPolling(ctx context.Context, client *vmm.VMM, targetBytes int64) error { - resizeConfig := vmm.VmResize{DesiredRam: &targetBytes} - if resp, err := client.PutVmResizeWithResponse(ctx, resizeConfig); err != nil || resp.StatusCode() != 204 { - return fmt.Errorf("memory resize failed") - } - - // Reuse the production polling logic! - return pollVMMemory(ctx, client, targetBytes, 5*time.Second) -} - -// waitForLogMessage polls instance logs for a specific message -func waitForLogMessage(ctx context.Context, manager Manager, instanceID string, message string, timeout time.Duration) error { - deadline := time.Now().Add(timeout) - const pollInterval = 200 * time.Millisecond // Check logs every 200ms - - for time.Now().Before(deadline) { - logs, err := manager.GetInstanceLogs(ctx, instanceID, false, 50) - if err == nil && strings.Contains(logs, message) { - return nil // Found the message! - } - - time.Sleep(pollInterval) - } - - return fmt.Errorf("log message %q not found within %v", message, timeout) -} - diff --git a/lib/instances/types.go b/lib/instances/types.go index dc474706..2bf40b75 100644 --- a/lib/instances/types.go +++ b/lib/instances/types.go @@ -41,7 +41,6 @@ type StoredMetadata struct { // Versions KernelVersion string // Kernel version (e.g., "ch-v6.12.9") - InitrdVersion string // Initrd version (e.g., "v1.0.0") CHVersion vmm.CHVersion // Cloud Hypervisor version CHPID *int // Cloud Hypervisor process ID (may be stale after host restart) diff --git a/lib/paths/paths.go b/lib/paths/paths.go index 1a0a9379..a2abcf3d 100644 --- a/lib/paths/paths.go +++ b/lib/paths/paths.go @@ -5,7 +5,8 @@ // {dataDir}/ // system/ // kernel/{version}/{arch}/vmlinux -// initrd/{version}/{arch}/initrd +// initrd/{arch}/{timestamp}/initrd +// initrd/{arch}/latest -> {timestamp} // binaries/{version}/{arch}/cloud-hypervisor // oci-cache/ // builds/{ref}/ @@ -43,9 +44,24 @@ func (p *Paths) SystemKernel(version, arch string) string { return filepath.Join(p.dataDir, "system", "kernel", version, arch, "vmlinux") } -// SystemInitrd returns the path to an initrd file. -func (p *Paths) SystemInitrd(version, arch string) string { - return filepath.Join(p.dataDir, "system", "initrd", version, arch, "initrd") +// SystemInitrd returns the path to the latest initrd symlink. +func (p *Paths) SystemInitrd(arch string) string { + return filepath.Join(p.dataDir, "system", "initrd", arch, "latest") +} + +// SystemInitrdTimestamp returns the path to a specific timestamped initrd build. +func (p *Paths) SystemInitrdTimestamp(timestamp, arch string) string { + return filepath.Join(p.dataDir, "system", "initrd", arch, timestamp, "initrd") +} + +// SystemInitrdLatest returns the path to the latest symlink (same as SystemInitrd). +func (p *Paths) SystemInitrdLatest(arch string) string { + return filepath.Join(p.dataDir, "system", "initrd", arch, "latest") +} + +// SystemInitrdDir returns the directory for initrd builds for an architecture. +func (p *Paths) SystemInitrdDir(arch string) string { + return filepath.Join(p.dataDir, "system", "initrd", arch) } // SystemOCICache returns the path to the OCI cache directory. diff --git a/lib/system/exec_agent/main.go b/lib/system/exec_agent/main.go new file mode 100644 index 00000000..afc4c351 --- /dev/null +++ b/lib/system/exec_agent/main.go @@ -0,0 +1,333 @@ +package main + +import ( + "encoding/binary" + "encoding/json" + "fmt" + "io" + "log" + "net" + "os" + "os/exec" + "time" + + "github.com/creack/pty" + "github.com/mdlayher/vsock" +) + +const ( + StreamStdin byte = 0 + StreamStdout byte = 1 + StreamStderr byte = 2 + StreamError byte = 3 + StreamResize byte = 4 +) + +type ExecRequest struct { + Command []string `json:"command"` + TTY bool `json:"tty"` +} + +type ResizeMessage struct { + Width uint16 `json:"width"` + Height uint16 `json:"height"` +} + +type ExitMessage struct { + Status struct { + Code int `json:"code"` + } `json:"status"` +} + +func main() { + // Listen on vsock port 2222 using socket API + // Retry a few times as virtio-vsock device may take a moment to initialize + var l net.Listener + var err error + + for i := 0; i < 10; i++ { + l, err = vsock.Listen(2222, nil) + if err == nil { + break + } + log.Printf("vsock listen attempt %d/10 failed: %v (retrying in 1s)", i+1, err) + time.Sleep(1 * time.Second) + } + + if err != nil { + log.Fatalf("failed to listen on vsock port 2222 after retries: %v", err) + } + defer l.Close() + + log.Println("exec-agent: listening on vsock port 2222") + + for { + conn, err := l.Accept() + if err != nil { + log.Printf("accept error: %v", err) + continue + } + + log.Printf("accepted connection from %s", conn.RemoteAddr()) + go handleConnection(conn) + } +} + +func handleConnection(conn net.Conn) { + defer func() { + if r := recover(); r != nil { + log.Printf("handleConnection panicked: %v", r) + } + conn.Close() + }() + + log.Printf("handling connection from %s", conn.RemoteAddr()) + + // Read first frame (should be exec request on stdin stream) + streamType, data, err := readFrame(conn) + if err != nil { + log.Printf("read request: %v", err) + return + } + + if streamType != StreamStdin { + sendError(conn, "first message must be stdin with exec request") + return + } + + var req ExecRequest + if err := json.Unmarshal(data, &req); err != nil { + sendError(conn, fmt.Sprintf("invalid request: %v", err)) + return + } + + if len(req.Command) == 0 { + req.Command = []string{"/bin/sh"} + } + + log.Printf("exec: command=%v tty=%v", req.Command, req.TTY) + + if req.TTY { + executeTTY(conn, req.Command) + } else { + executeNoTTY(conn, req.Command) + } +} + +func executeTTY(conn net.Conn, command []string) { + // Chroot into container before executing + cmd := exec.Command("chroot", append([]string{"/overlay/newroot"}, command...)...) + cmd.Env = os.Environ() + + // Start with PTY + ptmx, err := pty.Start(cmd) + if err != nil { + sendError(conn, fmt.Sprintf("start pty: %v", err)) + return + } + defer ptmx.Close() + + done := make(chan struct{}) + + // Handle input (stdin + resize) + go func() { + defer close(done) + for { + streamType, data, err := readFrame(conn) + if err != nil { + return + } + + switch streamType { + case StreamStdin: + ptmx.Write(data) + case StreamResize: + var resize ResizeMessage + if err := json.Unmarshal(data, &resize); err == nil { + pty.Setsize(ptmx, &pty.Winsize{ + Rows: resize.Height, + Cols: resize.Width, + }) + } + } + } + }() + + // Stream output + go func() { + buf := make([]byte, 32*1024) + for { + n, err := ptmx.Read(buf) + if n > 0 { + sendFrame(conn, StreamStdout, buf[:n]) + } + if err != nil { + return + } + } + }() + + <-done + cmd.Wait() + + // Send exit code + exitCode := 0 + if cmd.ProcessState != nil { + exitCode = cmd.ProcessState.ExitCode() + } + sendExit(conn, exitCode) // Ignore error in TTY mode + + // Graceful shutdown + if tcpConn, ok := conn.(*net.TCPConn); ok { + tcpConn.CloseWrite() + } else if unixConn, ok := conn.(*net.UnixConn); ok { + unixConn.CloseWrite() + } + io.Copy(io.Discard, conn) +} + +func executeNoTTY(conn net.Conn, command []string) { + // Chroot into container before executing + cmd := exec.Command("chroot", append([]string{"/overlay/newroot"}, command...)...) + cmd.Env = os.Environ() + + stdin, _ := cmd.StdinPipe() + stdout, _ := cmd.StdoutPipe() + stderr, _ := cmd.StderrPipe() + + if err := cmd.Start(); err != nil { + sendError(conn, fmt.Sprintf("start: %v", err)) + return + } + + // Handle stdin in background (don't block on it) + go func() { + defer stdin.Close() + for { + streamType, data, err := readFrame(conn) + if err != nil { + return + } + if streamType == StreamStdin { + stdin.Write(data) + } + } + }() + + // Use channels to wait for stdout/stderr to finish + stdoutDone := make(chan struct{}) + stderrDone := make(chan struct{}) + + // Stream stdout + go func() { + defer close(stdoutDone) + buf := make([]byte, 32*1024) + for { + n, err := stdout.Read(buf) + if n > 0 { + sendFrame(conn, StreamStdout, buf[:n]) + } + if err != nil { + return + } + } + }() + + // Stream stderr + go func() { + defer close(stderrDone) + buf := make([]byte, 32*1024) + for { + n, err := stderr.Read(buf) + if n > 0 { + sendFrame(conn, StreamStderr, buf[:n]) + } + if err != nil { + return + } + } + }() + + // Wait for command to finish (don't wait for stdin) + err := cmd.Wait() + + log.Printf("command finished: err=%v", err) + + // Wait for stdout/stderr goroutines to finish reading all data + <-stdoutDone + <-stderrDone + log.Printf("stdout/stderr streams closed") + + exitCode := 0 + if cmd.ProcessState != nil { + exitCode = cmd.ProcessState.ExitCode() + } + + log.Printf("sending exit code: %d", exitCode) + if err := sendExit(conn, exitCode); err != nil { + log.Printf("error sending exit: %v", err) + return + } + log.Printf("exit sent successfully") + + // Close the write side to signal we're done + // This sends a FIN packet but keeps the connection open for reading + if tcpConn, ok := conn.(*net.TCPConn); ok { + tcpConn.CloseWrite() + } else if unixConn, ok := conn.(*net.UnixConn); ok { + unixConn.CloseWrite() + } + + // Wait for client to close the connection by reading until EOF + // This ensures the client has received all data including the exit code + // properly before we fully close the socket. + io.Copy(io.Discard, conn) + + log.Printf("connection closed by client") +} + +func readFrame(conn net.Conn) (byte, []byte, error) { + header := make([]byte, 5) + if _, err := io.ReadFull(conn, header); err != nil { + return 0, nil, err + } + + streamType := header[0] + length := binary.BigEndian.Uint32(header[1:5]) + + data := make([]byte, length) + if _, err := io.ReadFull(conn, data); err != nil { + return 0, nil, err + } + + return streamType, data, nil +} + +func sendFrame(conn net.Conn, streamType byte, data []byte) error { + header := make([]byte, 5) + header[0] = streamType + binary.BigEndian.PutUint32(header[1:5], uint32(len(data))) + + if _, err := conn.Write(header); err != nil { + return err + } + if _, err := conn.Write(data); err != nil { + return err + } + return nil +} + +func sendError(conn net.Conn, msg string) { + sendFrame(conn, StreamError, []byte(msg)) +} + +func sendExit(conn net.Conn, code int) error { + exit := ExitMessage{} + exit.Status.Code = code + data, err := json.Marshal(exit) + if err != nil { + return err + } + return sendFrame(conn, StreamError, data) +} + diff --git a/lib/system/exec_agent_binary.go b/lib/system/exec_agent_binary.go new file mode 100644 index 00000000..eb7ac875 --- /dev/null +++ b/lib/system/exec_agent_binary.go @@ -0,0 +1,9 @@ +package system + +import _ "embed" + +// ExecAgentBinary contains the embedded exec-agent binary +// This is built by the Makefile before the main binary is compiled +//go:embed exec_agent/exec-agent +var ExecAgentBinary []byte + diff --git a/lib/system/init_script.go b/lib/system/init_script.go index 3bbe0e9e..01dd90dc 100644 --- a/lib/system/init_script.go +++ b/lib/system/init_script.go @@ -9,11 +9,11 @@ package system // 3. Mounts and sources config disk (/dev/vdc) // 4. Configures networking (if enabled) // 5. Executes container entrypoint -func GenerateInitScript(version InitrdVersion) string { +func GenerateInitScript() string { return `#!/bin/sh set -xe -echo "overlay-init: START (` + string(version) + `)" > /dev/kmsg +echo "overlay-init: START" > /dev/kmsg # Create mount points mkdir -p /proc /sys /dev diff --git a/lib/system/initrd.go b/lib/system/initrd.go index b1f7c812..595a7621 100644 --- a/lib/system/initrd.go +++ b/lib/system/initrd.go @@ -2,105 +2,130 @@ package system import ( "context" + "crypto/sha256" + "encoding/hex" "fmt" "os" "path/filepath" + "strconv" + "time" "github.com/onkernel/hypeman/lib/images" ) -// buildInitrd builds initrd from base image + custom init script -func (m *manager) buildInitrd(ctx context.Context, version InitrdVersion, arch string) error { +const alpineBaseImage = "alpine:3.22" + +// buildInitrd builds initrd from Alpine base + embedded exec-agent + generated init script +func (m *manager) buildInitrd(ctx context.Context, arch string) (string, error) { // Create temp directory for building tempDir, err := os.MkdirTemp("", "hypeman-initrd-*") if err != nil { - return fmt.Errorf("create temp dir: %w", err) + return "", fmt.Errorf("create temp dir: %w", err) } defer os.RemoveAll(tempDir) rootfsDir := filepath.Join(tempDir, "rootfs") - // Get base image for this initrd version - baseImageRef, ok := InitrdBaseImages[version] - if !ok { - return fmt.Errorf("no base image defined for initrd %s", version) - } - - // Create a temporary OCI client (reuses image manager's cache) + // Create OCI client (reuses image manager's cache) cacheDir := m.paths.SystemOCICache() ociClient, err := images.NewOCIClient(cacheDir) if err != nil { - return fmt.Errorf("create oci client: %w", err) + return "", fmt.Errorf("create oci client: %w", err) } - // Inspect to get digest - digest, err := ociClient.InspectManifest(ctx, baseImageRef) + // Inspect Alpine base to get digest + digest, err := ociClient.InspectManifest(ctx, alpineBaseImage) if err != nil { - return fmt.Errorf("inspect base image manifest: %w", err) + return "", fmt.Errorf("inspect alpine manifest: %w", err) + } + + // Pull and unpack Alpine base + if err := ociClient.PullAndUnpack(ctx, alpineBaseImage, digest, rootfsDir); err != nil { + return "", fmt.Errorf("pull alpine base: %w", err) } - // Pull and unpack base image - if err := ociClient.PullAndUnpack(ctx, baseImageRef, digest, rootfsDir); err != nil { - return fmt.Errorf("pull base image: %w", err) + // Write embedded exec-agent binary + binDir := filepath.Join(rootfsDir, "usr/local/bin") + if err := os.MkdirAll(binDir, 0755); err != nil { + return "", fmt.Errorf("create bin dir: %w", err) + } + + agentPath := filepath.Join(binDir, "exec-agent") + if err := os.WriteFile(agentPath, ExecAgentBinary, 0755); err != nil { + return "", fmt.Errorf("write exec-agent: %w", err) } - // Inject init script - initScript := GenerateInitScript(version) + // Write generated init script + initScript := GenerateInitScript() initPath := filepath.Join(rootfsDir, "init") if err := os.WriteFile(initPath, []byte(initScript), 0755); err != nil { - return fmt.Errorf("write init script: %w", err) + return "", fmt.Errorf("write init script: %w", err) } - // HACK: Inject custom exec-agent for debugging - // This assumes the agent is built at lib/system/initrd/guest-agent/exec-agent - // We try to find it relative to the project root. - // Since we are running from project root usually, or we can try to find it. - // Hardcoding path based on workspace structure for now. - customAgent := "/home/debianuser/hypeman/lib/system/initrd/guest-agent/exec-agent" - if input, err := os.ReadFile(customAgent); err == nil { - // Create directory if it doesn't exist (though it should from base image) - binDir := filepath.Join(rootfsDir, "usr/local/bin") - os.MkdirAll(binDir, 0755) - - agentPath := filepath.Join(binDir, "exec-agent") - if err := os.WriteFile(agentPath, input, 0755); err != nil { - return fmt.Errorf("write custom exec-agent: %w", err) - } - fmt.Printf("DEBUG: Injected custom exec-agent from %s\n", customAgent) - } else { - fmt.Printf("DEBUG: Could not find custom exec-agent at %s: %v\n", customAgent, err) + // Generate timestamp for this build + timestamp := strconv.FormatInt(time.Now().Unix(), 10) + + // Package as cpio.gz + outputPath := m.paths.SystemInitrdTimestamp(timestamp, arch) + if err := os.MkdirAll(filepath.Dir(outputPath), 0755); err != nil { + return "", fmt.Errorf("create output dir: %w", err) } - - // Package as cpio.gz (initramfs format) - outputPath := m.paths.SystemInitrd(string(version), arch) + if _, err := images.ExportRootfs(rootfsDir, outputPath, images.FormatCpio); err != nil { - return fmt.Errorf("export initrd: %w", err) + return "", fmt.Errorf("export initrd: %w", err) } - return nil + // Update 'latest' symlink + latestLink := m.paths.SystemInitrdLatest(arch) + // Remove old symlink if it exists + os.Remove(latestLink) + // Create new symlink (relative path) + if err := os.Symlink(timestamp, latestLink); err != nil { + return "", fmt.Errorf("create latest symlink: %w", err) + } + + return outputPath, nil } -// ensureInitrd ensures initrd exists, builds if missing -func (m *manager) ensureInitrd(ctx context.Context, version InitrdVersion) (string, error) { +// ensureInitrd ensures initrd exists and is up-to-date, builds if missing or stale +func (m *manager) ensureInitrd(ctx context.Context) (string, error) { arch := GetArch() - - initrdPath := m.paths.SystemInitrd(string(version), arch) - - // Check if already exists - if _, err := os.Stat(initrdPath); err == nil { - return initrdPath, nil + latestLink := m.paths.SystemInitrdLatest(arch) + + // Check if latest symlink exists + if target, err := os.Readlink(latestLink); err == nil { + // Symlink exists, check if the actual file exists + initrdPath := m.paths.SystemInitrdTimestamp(target, arch) + if _, err := os.Stat(initrdPath); err == nil { + // File exists, check if it's stale by comparing embedded binary hash + if !m.isInitrdStale(initrdPath) { + return initrdPath, nil + } + } } - // Build initrd - if err := m.buildInitrd(ctx, version, arch); err != nil { + // Build new initrd + initrdPath, err := m.buildInitrd(ctx, arch) + if err != nil { return "", fmt.Errorf("build initrd: %w", err) } return initrdPath, nil } -// BuildInitrd is a public wrapper for building initrd (used by dev tools) -func (m *manager) BuildInitrd(ctx context.Context, version InitrdVersion, arch string) error { - return m.buildInitrd(ctx, version, arch) +// isInitrdStale checks if the initrd needs rebuilding by comparing embedded binary hash +func (m *manager) isInitrdStale(initrdPath string) bool { + // For now, we'll consider it stale if the embedded binary has changed + // We could store a hash file alongside the initrd and compare + // For simplicity, we'll just rebuild on every run for now + // TODO: Implement proper hash-based staleness check + return false } +// computeInitrdHash computes a hash of the embedded binary and init script +func computeInitrdHash() string { + h := sha256.New() + h.Write(ExecAgentBinary) + h.Write([]byte(GenerateInitScript())) + return hex.EncodeToString(h.Sum(nil))[:16] +} diff --git a/lib/system/manager.go b/lib/system/manager.go index 1f538435..87fc4282 100644 --- a/lib/system/manager.go +++ b/lib/system/manager.go @@ -3,6 +3,7 @@ package system import ( "context" "fmt" + "os" "github.com/onkernel/hypeman/lib/paths" ) @@ -15,11 +16,11 @@ type Manager interface { // GetKernelPath returns path to kernel file GetKernelPath(version KernelVersion) (string, error) - // GetInitrdPath returns path to initrd file - GetInitrdPath(version InitrdVersion) (string, error) + // GetInitrdPath returns path to current initrd file + GetInitrdPath() (string, error) - // GetDefaultVersions returns the default kernel and initrd versions - GetDefaultVersions() (KernelVersion, InitrdVersion) + // GetDefaultKernelVersion returns the default kernel version + GetDefaultKernelVersion() KernelVersion } type manager struct { @@ -35,16 +36,16 @@ func NewManager(p *paths.Paths) Manager { // EnsureSystemFiles ensures default kernel and initrd exist, downloading/building if needed func (m *manager) EnsureSystemFiles(ctx context.Context) error { - kernelVer, initrdVer := m.GetDefaultVersions() + kernelVer := m.GetDefaultKernelVersion() // Ensure kernel exists if _, err := m.ensureKernel(kernelVer); err != nil { return fmt.Errorf("ensure kernel %s: %w", kernelVer, err) } - // Ensure initrd exists - if _, err := m.ensureInitrd(ctx, initrdVer); err != nil { - return fmt.Errorf("ensure initrd %s: %w", initrdVer, err) + // Ensure initrd exists (builds if missing or stale) + if _, err := m.ensureInitrd(ctx); err != nil { + return fmt.Errorf("ensure initrd: %w", err) } return nil @@ -57,15 +58,22 @@ func (m *manager) GetKernelPath(version KernelVersion) (string, error) { return path, nil } -// GetInitrdPath returns the path to an initrd version -func (m *manager) GetInitrdPath(version InitrdVersion) (string, error) { +// GetInitrdPath returns the path to the current initrd file +func (m *manager) GetInitrdPath() (string, error) { arch := GetArch() - path := m.paths.SystemInitrd(string(version), arch) - return path, nil + latestLink := m.paths.SystemInitrdLatest(arch) + + // Read the symlink to get the timestamp + target, err := os.Readlink(latestLink) + if err != nil { + return "", fmt.Errorf("read latest symlink: %w", err) + } + + return m.paths.SystemInitrdTimestamp(target, arch), nil } -// GetDefaultVersions returns the default kernel and initrd versions -func (m *manager) GetDefaultVersions() (KernelVersion, InitrdVersion) { - return DefaultKernelVersion, DefaultInitrdVersion +// GetDefaultKernelVersion returns the default kernel version +func (m *manager) GetDefaultKernelVersion() KernelVersion { + return DefaultKernelVersion } diff --git a/lib/system/versions.go b/lib/system/versions.go index 050b04ca..167cb4d9 100644 --- a/lib/system/versions.go +++ b/lib/system/versions.go @@ -5,52 +5,20 @@ import "runtime" // KernelVersion represents a Cloud Hypervisor kernel version type KernelVersion string -// InitrdVersion represents our internal initrd version -type InitrdVersion string - const ( // Kernel versions from Kernel linux build Kernel_202511182 KernelVersion = "ch-6.12.8-kernel-1-202511182" - - // Initrd versions (our internal versioning) - // Bump when init script logic changes - InitrdV2_0_0 InitrdVersion = "v2.0.0" - InitrdV2_0_1 InitrdVersion = "v2.0.1" - InitrdV2_0_2 InitrdVersion = "v2.0.2" ) -// InitrdBaseImages maps initrd versions to specific base image references -// v2.0.0: Uses pre-built Alpine image with exec-agent from Docker Hub (multi-arch OCI manifest list) -// v2.0.1: Uses same base but we will inject local agent -// v2.0.2: Uses same base but with interactive shell fallback -// v2.0.2-dev: Local dev build (built via cmd/build-dev-initrd) -var InitrdBaseImages = map[InitrdVersion]string{ - InitrdV2_0_0: "docker.io/onkernel/hypeman-initrd:d0e84c2-oci", - InitrdV2_0_1: "docker.io/onkernel/hypeman-initrd:d0e84c2-oci", - InitrdV2_0_2: "docker.io/onkernel/hypeman-initrd:d0e84c2-oci", - InitrdVersion("v2.0.2-dev"): "docker.io/onkernel/hypeman-initrd:d0e84c2-oci", // Not used, already built locally - // Add future versions here -} - var ( // DefaultKernelVersion is the kernel version used for new instances DefaultKernelVersion = Kernel_202511182 - // DefaultInitrdVersion is the initrd version used for new instances - DefaultInitrdVersion = InitrdVersion("v2.0.2-dev") - // SupportedKernelVersions lists all supported kernel versions SupportedKernelVersions = []KernelVersion{ Kernel_202511182, // Add future versions here } - - // SupportedInitrdVersions lists all supported initrd versions - SupportedInitrdVersions = []InitrdVersion{ - InitrdV2_0_0, - InitrdV2_0_1, - InitrdV2_0_2, - } ) // KernelDownloadURLs maps kernel versions and architectures to download URLs From 7b3cf6ce167eb85e5b3967ca07257c55b24c38f2 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 18 Nov 2025 18:36:13 -0500 Subject: [PATCH 4/5] Simplify initrd by removing versioning and removing docker build --- cmd/debug-shell/main.go | 87 ----------------------------------- lib/instances/manager_test.go | 1 - lib/system/init_script.go | 17 +------ lib/system/initrd.go | 25 +++++++--- lib/system/manager_test.go | 23 +++------ lib/system/versions_test.go | 74 ----------------------------- 6 files changed, 27 insertions(+), 200 deletions(-) delete mode 100644 cmd/debug-shell/main.go delete mode 100644 lib/system/versions_test.go diff --git a/cmd/debug-shell/main.go b/cmd/debug-shell/main.go deleted file mode 100644 index b3df97cb..00000000 --- a/cmd/debug-shell/main.go +++ /dev/null @@ -1,87 +0,0 @@ -package main - -import ( - "context" - "fmt" - "os" - "os/signal" - "syscall" - - "github.com/onkernel/hypeman/lib/system" - "golang.org/x/term" -) - -func main() { - if len(os.Args) < 2 { - fmt.Println("Usage: go run cmd/debug-shell/main.go [command...]") - fmt.Println("Example: go run cmd/debug-shell/main.go /tmp/.../vsock.sock") - fmt.Println("Example: go run cmd/debug-shell/main.go /tmp/.../vsock.sock ls -la /") - os.Exit(1) - } - socketPath := os.Args[1] - - command := []string{"/bin/sh"} - if len(os.Args) > 2 { - command = os.Args[2:] - } - - // Handle signals - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - sigChan := make(chan os.Signal, 1) - signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) - go func() { - <-sigChan - cancel() - }() - - fmt.Printf("Connecting to %s...\n", socketPath) - - // Determine if we should use TTY - isTTY := term.IsTerminal(int(os.Stdin.Fd())) - if len(os.Args) > 2 { - // If running a command, don't use TTY unless explicitly interactive? - // Usually running a command (like ls) is non-interactive TTY wise unless forced. - // Let's default to TTY=false if arguments provided, to simplify probing. - isTTY = false - } - - var oldState *term.State - var err error - if isTTY { - // Put terminal in raw mode for interactive shell - oldState, err = term.MakeRaw(int(os.Stdin.Fd())) - if err != nil { - fmt.Printf("Warning: could not make terminal raw: %v\n", err) - isTTY = false - } else { - defer term.Restore(int(os.Stdin.Fd()), oldState) - } - } - - // Start shell - status, err := system.ExecIntoInstance(ctx, socketPath, system.ExecOptions{ - Command: command, - Stdin: os.Stdin, - Stdout: os.Stdout, - Stderr: os.Stderr, - TTY: isTTY, - }) - - if err != nil { - // Restore terminal before printing error - if oldState != nil { - term.Restore(int(os.Stdin.Fd()), oldState) - } - fmt.Printf("\r\nError: %v\n", err) - os.Exit(1) - } - - // Restore terminal - if oldState != nil { - term.Restore(int(os.Stdin.Fd()), oldState) - } - fmt.Printf("\r\nExit code: %d\n", status.Code) -} - diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index 85f55098..56682573 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -177,7 +177,6 @@ func TestCreateAndDeleteInstance(t *testing.T) { assert.Equal(t, StateRunning, inst.State) assert.False(t, inst.HasSnapshot) assert.NotEmpty(t, inst.KernelVersion) - assert.NotEmpty(t, inst.InitrdVersion) // Verify directories exist p := paths.New(tmpDir) diff --git a/lib/system/init_script.go b/lib/system/init_script.go index 01dd90dc..76d0f973 100644 --- a/lib/system/init_script.go +++ b/lib/system/init_script.go @@ -127,21 +127,6 @@ wait $APP_PID APP_EXIT=$? echo "overlay-init: app exited with code $APP_EXIT" - - # Keep VM alive if app crashes, to allow debugging via exec-agent - if [ $APP_EXIT -ne 0 ]; then - echo "overlay-init: CRITICAL - App exited with error." - fi - - # FALLBACK: Launch interactive shell on console - # This allows manual debugging if attaching to the console - echo "overlay-init: Launching interactive shell on ttyS0..." - setsid cttyhack /bin/sh < /dev/ttyS0 > /dev/ttyS0 2>&1 - - # If shell exits (it shouldn't), loop forever - while true; do sleep 3600; done - - exit $APP_EXIT - ` +exit $APP_EXIT` } diff --git a/lib/system/initrd.go b/lib/system/initrd.go index 595a7621..c409ec70 100644 --- a/lib/system/initrd.go +++ b/lib/system/initrd.go @@ -75,6 +75,13 @@ func (m *manager) buildInitrd(ctx context.Context, arch string) (string, error) return "", fmt.Errorf("export initrd: %w", err) } + // Store hash for staleness detection + hashPath := filepath.Join(filepath.Dir(outputPath), ".hash") + currentHash := computeInitrdHash() + if err := os.WriteFile(hashPath, []byte(currentHash), 0644); err != nil { + return "", fmt.Errorf("write hash file: %w", err) + } + // Update 'latest' symlink latestLink := m.paths.SystemInitrdLatest(arch) // Remove old symlink if it exists @@ -113,13 +120,19 @@ func (m *manager) ensureInitrd(ctx context.Context) (string, error) { return initrdPath, nil } -// isInitrdStale checks if the initrd needs rebuilding by comparing embedded binary hash +// isInitrdStale checks if the initrd needs rebuilding by comparing hashes func (m *manager) isInitrdStale(initrdPath string) bool { - // For now, we'll consider it stale if the embedded binary has changed - // We could store a hash file alongside the initrd and compare - // For simplicity, we'll just rebuild on every run for now - // TODO: Implement proper hash-based staleness check - return false + // Read stored hash + hashPath := filepath.Join(filepath.Dir(initrdPath), ".hash") + storedHash, err := os.ReadFile(hashPath) + if err != nil { + // No hash file, consider stale + return true + } + + // Compare with current hash + currentHash := computeInitrdHash() + return string(storedHash) != currentHash } // computeInitrdHash computes a hash of the embedded binary and init script diff --git a/lib/system/manager_test.go b/lib/system/manager_test.go index 5711a597..3540e448 100644 --- a/lib/system/manager_test.go +++ b/lib/system/manager_test.go @@ -9,30 +9,23 @@ import ( "github.com/stretchr/testify/require" ) -func TestGetDefaultVersions(t *testing.T) { +func TestGetDefaultKernelVersion(t *testing.T) { tmpDir := t.TempDir() mgr := NewManager(paths.New(tmpDir)) - kernelVer, initrdVer := mgr.GetDefaultVersions() + kernelVer := mgr.GetDefaultKernelVersion() assert.Equal(t, DefaultKernelVersion, kernelVer) - assert.Equal(t, DefaultInitrdVersion, initrdVer) } -func TestGetPaths(t *testing.T) { +func TestGetKernelPath(t *testing.T) { tmpDir := t.TempDir() mgr := NewManager(paths.New(tmpDir)) // Get kernel path - kernelPath, err := mgr.GetKernelPath(KernelCH_6_12_8_20250613) + kernelPath, err := mgr.GetKernelPath(DefaultKernelVersion) require.NoError(t, err) - assert.Contains(t, kernelPath, "kernel/ch-release-v6.12.8-20250613") + assert.Contains(t, kernelPath, "kernel") assert.Contains(t, kernelPath, "vmlinux") - - // Get initrd path - initrdPath, err := mgr.GetInitrdPath(InitrdV2_0_0) - require.NoError(t, err) - assert.Contains(t, initrdPath, "initrd/v2.0.0") - assert.Contains(t, initrdPath, "initrd") } func TestEnsureSystemFiles(t *testing.T) { @@ -56,7 +49,7 @@ func TestEnsureSystemFiles(t *testing.T) { assert.FileExists(t, kernelPath) // Verify initrd exists - initrdPath, err := mgr.GetInitrdPath(DefaultInitrdVersion) + initrdPath, err := mgr.GetInitrdPath() require.NoError(t, err) assert.FileExists(t, initrdPath) @@ -66,7 +59,7 @@ func TestEnsureSystemFiles(t *testing.T) { } func TestInitScriptGeneration(t *testing.T) { - script := GenerateInitScript(InitrdV2_0_0) + script := GenerateInitScript() // Verify script contains essential components assert.Contains(t, script, "#!/bin/sh") @@ -76,7 +69,5 @@ func TestInitScriptGeneration(t *testing.T) { assert.Contains(t, script, "/dev/vdc") // config disk assert.Contains(t, script, "exec-agent") // vsock exec agent assert.Contains(t, script, "${ENTRYPOINT}") - assert.Contains(t, script, "v2.0.0") // Version in script assert.Contains(t, script, "wait $APP_PID") // Supervisor pattern } - diff --git a/lib/system/versions_test.go b/lib/system/versions_test.go deleted file mode 100644 index 4269ec27..00000000 --- a/lib/system/versions_test.go +++ /dev/null @@ -1,74 +0,0 @@ -package system - -import ( - "crypto/sha256" - "fmt" - "testing" - - "github.com/stretchr/testify/require" -) - -// expectedInitrdHashes maps initrd versions to their expected content hash -// The hash is computed from: sha256(initScript + baseImageDigest) -// This ensures that changes to either the script OR base image require a version bump -var expectedInitrdHashes = map[InitrdVersion]string{ - InitrdV2_0_0: "e192f8c95912e8fe3acedbba7f2107ed4fda2adf7c94ae3eebee3720e80c524c", - // Add future versions here -} - -func TestInitrdVersionIntegrity(t *testing.T) { - for version, expectedHash := range expectedInitrdHashes { - t.Run(string(version), func(t *testing.T) { - // Get the base image digest for this version - baseImageDigest, ok := InitrdBaseImages[version] - require.True(t, ok, "Missing base image digest for %s", version) - - // Compute hash from script + digest - script := GenerateInitScript(version) - combined := script + baseImageDigest - actualHash := fmt.Sprintf("%x", sha256.Sum256([]byte(combined))) - - if expectedHash == "PLACEHOLDER" { - t.Fatalf("Initrd %s needs hash to be set.\n"+ - "Add this to expectedInitrdHashes in versions_test.go:\n"+ - " InitrdV2_0_0: %q,\n", - version, actualHash) - } - - require.Equal(t, expectedHash, actualHash, - "Initrd %s content changed!\n"+ - "Expected hash: %s\n"+ - "Actual hash: %s\n\n"+ - "If this is intentional, create a new version:\n"+ - "1. Add new constant in versions.go: InitrdV2_1_0 = \"v2.1.0\"\n"+ - "2. Add base image digest to InitrdBaseImages map\n"+ - "3. Add to SupportedInitrdVersions list\n"+ - "4. Add this hash to expectedInitrdHashes in versions_test.go:\n"+ - " InitrdV2_1_0: %q,\n"+ - "5. Update DefaultInitrdVersion if this should be the new default\n", - version, expectedHash, actualHash, actualHash) - }) - } -} - -func TestInitrdBaseImagesArePinned(t *testing.T) { - // Ensure all initrd versions have valid image references - // Tags are acceptable since the OCI client resolves them to digests - for version, baseImageRef := range InitrdBaseImages { - require.NotEmpty(t, baseImageRef, - "base image for %s must not be empty", - version) - require.Contains(t, baseImageRef, "docker.io/", - "base image for %s must be a fully qualified reference", - version) - } -} - -func TestAllInitrdVersionsHaveExpectedHash(t *testing.T) { - // Ensure every initrd version in InitrdBaseImages has a corresponding hash - for version := range InitrdBaseImages { - _, ok := expectedInitrdHashes[version] - require.True(t, ok, "Initrd version %s is missing from expectedInitrdHashes map in versions_test.go", version) - } -} - From 453a1bb3532deaa0fae269b4e5e0f526eec1fa5d Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 18 Nov 2025 18:39:18 -0500 Subject: [PATCH 5/5] Delete extra file --- scripts/repro_vm.sh | 100 -------------------------------------------- 1 file changed, 100 deletions(-) delete mode 100755 scripts/repro_vm.sh diff --git a/scripts/repro_vm.sh b/scripts/repro_vm.sh deleted file mode 100755 index 6ac69565..00000000 --- a/scripts/repro_vm.sh +++ /dev/null @@ -1,100 +0,0 @@ -#!/bin/bash -set -e - -# Paths -KERNEL_DIR="/var/lib/hypeman/system/kernel/ch-6.12.8-kernel-1-202511182/x86_64" -INITRD_DIR="/var/lib/hypeman/system/initrd/v2.0.2-dev/x86_64" -KERNEL="$KERNEL_DIR/vmlinux" -INITRD="$INITRD_DIR/initrd" -SOCKET="/tmp/repro-vsock.sock" -LOG_FILE="repro-console.log" - -# Disk images -ROOTFS_IMG="repro-rootfs.img" -OVERLAY_IMG="repro-overlay.img" -CONFIG_IMG="repro-config.img" - -# Clean up -rm -f "$SOCKET" "$LOG_FILE" "$ROOTFS_IMG" "$OVERLAY_IMG" "$CONFIG_IMG" - -# Create dummy disks -echo "Creating dummy disks..." -# Rootfs (100M) -dd if=/dev/zero of="$ROOTFS_IMG" bs=1M count=100 -mkfs.ext4 -F "$ROOTFS_IMG" - -# Overlay (100M) -dd if=/dev/zero of="$OVERLAY_IMG" bs=1M count=100 -mkfs.ext4 -F "$OVERLAY_IMG" - -# Config (10M) -dd if=/dev/zero of="$CONFIG_IMG" bs=1M count=10 -mkfs.ext4 -F "$CONFIG_IMG" -# Add a dummy config.sh -mkdir -p /tmp/repro-config -if sudo mount -o loop "$CONFIG_IMG" /tmp/repro-config; then - echo "#!/bin/sh" | sudo tee /tmp/repro-config/config.sh - echo "echo 'Repro config loaded'" | sudo tee -a /tmp/repro-config/config.sh - echo "export ENTRYPOINT='/bin/sh'" | sudo tee -a /tmp/repro-config/config.sh - echo "export CMD='-c \"echo Hello from Guest && sleep 3600\"'" | sudo tee -a /tmp/repro-config/config.sh - sudo chmod +x /tmp/repro-config/config.sh - sudo umount /tmp/repro-config -else - echo "Failed to mount config img, proceeding with empty config" -fi - -# Check artifacts -if [ ! -f "$KERNEL" ]; then - echo "Kernel not found at $KERNEL" - exit 1 -fi - -if [ ! -f "$INITRD" ]; then - echo "Initrd not found at $INITRD" - exit 1 -fi - -echo "Starting Cloud Hypervisor..." -echo "Kernel: $KERNEL" -echo "Initrd: $INITRD" -echo "Socket: $SOCKET" - -# Start Cloud Hypervisor in background -cloud-hypervisor \ - --kernel "$KERNEL" \ - --initramfs "$INITRD" \ - --cmdline "console=ttyS0 panic=1" \ - --disk path="$ROOTFS_IMG",readonly=on path="$OVERLAY_IMG" path="$CONFIG_IMG",readonly=on \ - --cpus boot=1 \ - --memory size=512M \ - --console off \ - --serial tty \ - --vsock cid=3,socket="$SOCKET" \ - > "$LOG_FILE" 2>&1 & - -CH_PID=$! -echo "Cloud Hypervisor running with PID $CH_PID" -echo "Logs at $LOG_FILE" -echo "Waiting for VM to boot..." - -# Wait for socket -for i in $(seq 1 10); do - if [ -S "$SOCKET" ]; then - echo "Socket created." - break - fi - sleep 1 -done - -if [ ! -S "$SOCKET" ]; then - echo "Socket creation failed." - kill $CH_PID - exit 1 -fi - -echo "VM should be booting. You can now connect to $SOCKET using socat or Go tools." -echo "Example: sudo socat - UNIX-CONNECT:$SOCKET" -echo "Then type: CONNECT 2222" -echo "" -echo "To stop: kill $CH_PID" -