diff --git a/cmd/api/api/api_test.go b/cmd/api/api/api_test.go index 09c6f2bf..2a7e8787 100644 --- a/cmd/api/api/api_test.go +++ b/cmd/api/api/api_test.go @@ -6,14 +6,17 @@ import ( "os" "syscall" "testing" + "time" "github.com/onkernel/hypeman/cmd/api/config" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/instances" "github.com/onkernel/hypeman/lib/network" + "github.com/onkernel/hypeman/lib/oapi" "github.com/onkernel/hypeman/lib/paths" "github.com/onkernel/hypeman/lib/system" "github.com/onkernel/hypeman/lib/volumes" + "github.com/stretchr/testify/require" ) // newTestService creates an ApiService for testing with automatic cleanup @@ -51,23 +54,23 @@ func newTestService(t *testing.T) *ApiService { func cleanupOrphanedProcesses(t *testing.T, dataDir string) { p := paths.New(dataDir) guestsDir := p.GuestsDir() - + entries, err := os.ReadDir(guestsDir) if err != nil { return // No guests directory } - + for _, entry := range entries { if !entry.IsDir() { continue } - + metaPath := p.InstanceMetadata(entry.Name()) data, err := os.ReadFile(metaPath) if err != nil { continue } - + // Parse just the CHPID field var meta struct { CHPID *int `json:"CHPID"` @@ -75,11 +78,11 @@ func cleanupOrphanedProcesses(t *testing.T, dataDir string) { if err := json.Unmarshal(data, &meta); err != nil { continue } - + // If metadata has a PID, try to kill it if meta.CHPID != nil { pid := *meta.CHPID - + // Check if process exists if err := syscall.Kill(pid, 0); err == nil { t.Logf("Cleaning up orphaned Cloud Hypervisor process: PID %d", pid) @@ -92,3 +95,46 @@ func cleanupOrphanedProcesses(t *testing.T, dataDir string) { func ctx() context.Context { return context.Background() } + +// createAndWaitForImage creates an image and waits for it to be ready. +// Returns the image name on success, or fails the test on error/timeout. +func createAndWaitForImage(t *testing.T, svc *ApiService, imageName string, timeout time.Duration) string { + t.Helper() + + t.Logf("Creating image %s...", imageName) + imgResp, err := svc.CreateImage(ctx(), oapi.CreateImageRequestObject{ + Body: &oapi.CreateImageRequest{ + Name: imageName, + }, + }) + require.NoError(t, err) + + imgCreated, ok := imgResp.(oapi.CreateImage202JSONResponse) + require.True(t, ok, "expected 202 response for image creation") + + t.Log("Waiting for image to be ready...") + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + imgResp, err := svc.GetImage(ctx(), oapi.GetImageRequestObject{ + Name: imageName, + }) + require.NoError(t, err) + + img, ok := imgResp.(oapi.GetImage200JSONResponse) + if ok { + switch img.Status { + case "ready": + t.Log("Image is ready") + return imgCreated.Name + case "failed": + t.Fatalf("Image build failed: %v", img.Error) + default: + t.Logf("Image status: %s", img.Status) + } + } + time.Sleep(1 * time.Second) + } + + t.Fatalf("Timeout waiting for image %s to be ready", imageName) + return "" +} diff --git a/cmd/api/api/exec_test.go b/cmd/api/api/exec_test.go index ec00a6e8..8e5992f2 100644 --- a/cmd/api/api/exec_test.go +++ b/cmd/api/api/exec_test.go @@ -34,58 +34,20 @@ func TestExecInstanceNonTTY(t *testing.T) { require.NoError(t, err) t.Log("System files ready") - // First, create and wait for the image to be ready - // Use nginx which has a proper long-running process - t.Log("Creating nginx:alpine image...") - imgResp, err := svc.CreateImage(ctx(), oapi.CreateImageRequestObject{ - Body: &oapi.CreateImageRequest{ - Name: "docker.io/library/nginx:alpine", - }, - }) - require.NoError(t, err) - imgCreated, ok := imgResp.(oapi.CreateImage202JSONResponse) - require.True(t, ok, "expected 202 response") - assert.Equal(t, "docker.io/library/nginx:alpine", imgCreated.Name) - - // Wait for image to be ready (poll with timeout) - t.Log("Waiting for image to be ready...") - timeout := time.After(30 * time.Second) - ticker := time.NewTicker(1 * time.Second) - defer ticker.Stop() - - imageReady := false - for !imageReady { - select { - case <-timeout: - t.Fatal("Timeout waiting for image to be ready") - case <-ticker.C: - imgResp, err := svc.GetImage(ctx(), oapi.GetImageRequestObject{ - Name: "docker.io/library/nginx:alpine", - }) - require.NoError(t, err) - - img, ok := imgResp.(oapi.GetImage200JSONResponse) - if ok && img.Status == "ready" { - imageReady = true - t.Log("Image is ready") - } else if ok { - t.Logf("Image status: %s", img.Status) - } - } - } + // Create and wait for nginx image (has a proper long-running process) + createAndWaitForImage(t, svc, "docker.io/library/nginx:alpine", 30*time.Second) // Create instance t.Log("Creating instance...") - networkDisabled := false + networkEnabled := false instResp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{ Body: &oapi.CreateInstanceRequest{ Name: "exec-test", Image: "docker.io/library/nginx:alpine", Network: &struct { - Enabled *bool `json:"enabled,omitempty"` - Name *string `json:"name,omitempty"` + Enabled *bool `json:"enabled,omitempty"` }{ - Enabled: &networkDisabled, + Enabled: &networkEnabled, }, }, }) @@ -108,8 +70,8 @@ func TestExecInstanceNonTTY(t *testing.T) { case <-nginxTimeout: t.Fatal("Timeout waiting for nginx to start") case <-nginxTicker.C: - logs, err := svc.InstanceManager.GetInstanceLogs(ctx(), inst.Id, false, 100) - if err == nil && strings.Contains(logs, "start worker processes") { + logs := collectTestLogs(t, svc, inst.Id, 100) + if strings.Contains(logs, "start worker processes") { nginxReady = true t.Log("Nginx is ready") } @@ -132,7 +94,7 @@ func TestExecInstanceNonTTY(t *testing.T) { consolePath := paths.New(svc.Config.DataDir).InstanceConsoleLog(inst.Id) if consoleData, err := os.ReadFile(consolePath); err == nil { lines := strings.Split(string(consoleData), "\n") - + // Print exec-agent specific logs t.Logf("=== Exec Agent Logs ===") for _, line := range lines { @@ -155,13 +117,13 @@ func TestExecInstanceNonTTY(t *testing.T) { var exit *exec.ExitStatus var stdout, stderr outputBuffer var execErr error - + t.Log("Testing exec command: whoami") maxRetries := 10 for i := 0; i < maxRetries; i++ { stdout = outputBuffer{} stderr = outputBuffer{} - + exit, execErr = exec.ExecIntoInstance(ctx(), actualInst.VsockSocket, exec.ExecOptions{ Command: []string{"/bin/sh", "-c", "whoami"}, Stdin: nil, @@ -169,26 +131,25 @@ func TestExecInstanceNonTTY(t *testing.T) { Stderr: &stderr, TTY: false, }) - + if execErr == nil { break } - + t.Logf("Exec attempt %d/%d failed, retrying: %v", i+1, maxRetries, execErr) time.Sleep(1 * time.Second) } - + // Assert exec worked require.NoError(t, execErr, "exec should succeed after retries") require.NotNil(t, exit, "exit status should be returned") require.Equal(t, 0, exit.Code, "whoami should exit with code 0") - // Verify output outStr := stdout.String() t.Logf("Command output: %q", outStr) require.Contains(t, outStr, "root", "whoami should return root user") - + // Cleanup t.Log("Cleaning up instance...") delResp, err := svc.DeleteInstance(ctx(), oapi.DeleteInstanceRequestObject{ @@ -199,6 +160,139 @@ func TestExecInstanceNonTTY(t *testing.T) { require.True(t, ok, "expected 204 response") } +// TestExecWithDebianMinimal tests exec with a minimal Debian image. +// This test specifically catches issues that wouldn't appear with Alpine-based images: +// 1. Debian's default entrypoint (bash) exits immediately without a TTY +// 2. exec-agent must keep running even after the main app exits +// 3. The VM must not kernel panic when the entrypoint exits +func TestExecWithDebianMinimal(t *testing.T) { + // Require KVM access for VM creation + if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) { + t.Fatal("/dev/kvm not available - ensure KVM is enabled and user is in 'kvm' group (sudo usermod -aG kvm $USER)") + } + + if testing.Short() { + t.Skip("Skipping integration test in short mode") + } + + svc := newTestService(t) + + // Ensure system files (kernel and initrd) are available + t.Log("Ensuring system files...") + systemMgr := system.NewManager(paths.New(svc.Config.DataDir)) + err := systemMgr.EnsureSystemFiles(ctx()) + require.NoError(t, err) + t.Log("System files ready") + + // Create Debian 12 slim image (minimal, no iproute2) + createAndWaitForImage(t, svc, "docker.io/library/debian:12-slim", 60*time.Second) + + // Create instance (network disabled in test environment) + t.Log("Creating Debian instance...") + networkEnabled := false + instResp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{ + Body: &oapi.CreateInstanceRequest{ + Name: "debian-exec-test", + Image: "docker.io/library/debian:12-slim", + Network: &struct { + Enabled *bool `json:"enabled,omitempty"` + }{ + Enabled: &networkEnabled, + }, + }, + }) + require.NoError(t, err) + + inst, ok := instResp.(oapi.CreateInstance201JSONResponse) + require.True(t, ok, "expected 201 response") + require.NotEmpty(t, inst.Id) + t.Logf("Instance created: %s", inst.Id) + + // Cleanup on exit + t.Cleanup(func() { + t.Log("Cleaning up instance...") + svc.DeleteInstance(ctx(), oapi.DeleteInstanceRequestObject{Id: inst.Id}) + }) + + // Get actual instance to access vsock fields + actualInst, err := svc.InstanceManager.GetInstance(ctx(), inst.Id) + require.NoError(t, err) + require.NotNil(t, actualInst) + + // Wait for exec-agent to be ready by checking logs + // This is the key difference: we wait for exec-agent, not the app (which exits immediately) + t.Log("Waiting for exec-agent to start...") + execAgentReady := false + agentTimeout := time.After(15 * time.Second) + agentTicker := time.NewTicker(500 * time.Millisecond) + defer agentTicker.Stop() + + var logs string + for !execAgentReady { + select { + case <-agentTimeout: + // Dump logs on failure for debugging + logs = collectTestLogs(t, svc, inst.Id, 200) + t.Logf("Console logs:\n%s", logs) + t.Fatal("Timeout waiting for exec-agent to start") + case <-agentTicker.C: + logs = collectTestLogs(t, svc, inst.Id, 100) + if strings.Contains(logs, "[exec-agent] listening on vsock port 2222") { + execAgentReady = true + t.Log("exec-agent is ready") + } + } + } + + // Verify the app exited but VM is still usable (key behavior this test validates) + logs = collectTestLogs(t, svc, inst.Id, 200) + assert.Contains(t, logs, "overlay-init: app exited with code", "App should have exited") + + // Test exec commands work even though the main app (bash) has exited + t.Log("Testing exec command: echo") + var stdout, stderr outputBuffer + exit, err := exec.ExecIntoInstance(ctx(), actualInst.VsockSocket, exec.ExecOptions{ + Command: []string{"echo", "hello from debian"}, + Stdout: &stdout, + Stderr: &stderr, + TTY: false, + }) + require.NoError(t, err, "exec should succeed") + require.NotNil(t, exit) + require.Equal(t, 0, exit.Code, "echo should exit with code 0") + assert.Contains(t, stdout.String(), "hello from debian") + + // Verify we're actually in Debian + t.Log("Verifying OS release...") + stdout = outputBuffer{} + exit, err = exec.ExecIntoInstance(ctx(), actualInst.VsockSocket, exec.ExecOptions{ + Command: []string{"cat", "/etc/os-release"}, + Stdout: &stdout, + TTY: false, + }) + require.NoError(t, err) + require.Equal(t, 0, exit.Code) + assert.Contains(t, stdout.String(), "Debian", "Should be running Debian") + assert.Contains(t, stdout.String(), "bookworm", "Should be Debian 12 (bookworm)") + t.Logf("OS: %s", strings.Split(stdout.String(), "\n")[0]) + +} + +// collectTestLogs collects logs from an instance (non-streaming) +func collectTestLogs(t *testing.T, svc *ApiService, instanceID string, n int) string { + logChan, err := svc.InstanceManager.StreamInstanceLogs(ctx(), instanceID, n, false) + if err != nil { + return "" + } + + var lines []string + for line := range logChan { + lines = append(lines, line) + } + + return strings.Join(lines, "\n") +} + // outputBuffer is a simple buffer for capturing exec output type outputBuffer struct { buf bytes.Buffer diff --git a/cmd/api/api/instances_test.go b/cmd/api/api/instances_test.go index 0c1a32c4..03c5b261 100644 --- a/cmd/api/api/instances_test.go +++ b/cmd/api/api/instances_test.go @@ -44,46 +44,13 @@ func TestCreateInstance_ParsesHumanReadableSizes(t *testing.T) { svc := newTestService(t) - // First, create and wait for the image to be ready - t.Log("Creating alpine image...") - imgResp, err := svc.CreateImage(ctx(), oapi.CreateImageRequestObject{ - Body: &oapi.CreateImageRequest{ - Name: "docker.io/library/alpine:latest", - }, - }) - require.NoError(t, err) - - imgCreated, ok := imgResp.(oapi.CreateImage202JSONResponse) - require.True(t, ok, "expected 202 accepted response for image creation") - img := oapi.Image(imgCreated) - - // Wait for image to be ready - t.Log("Waiting for image to be ready...") - imageName := img.Name - var image *oapi.Image - for i := 0; i < 60; i++ { - getImgResp, err := svc.GetImage(ctx(), oapi.GetImageRequestObject{Name: imageName}) - require.NoError(t, err) - - if getImg, ok := getImgResp.(oapi.GetImage200JSONResponse); ok { - img := oapi.Image(getImg) - if img.Status == "ready" { - image = &img - break - } - if img.Status == "failed" { - t.Fatalf("Image build failed: %v", img.Error) - } - } - time.Sleep(100 * time.Millisecond) - } - require.NotNil(t, image, "image should be ready within 6 seconds") - t.Log("Image ready!") + // Create and wait for alpine image + createAndWaitForImage(t, svc, "docker.io/library/alpine:latest", 30*time.Second) // Ensure system files (kernel and initramfs) are available t.Log("Ensuring system files (kernel and initramfs)...") systemMgr := system.NewManager(paths.New(svc.Config.DataDir)) - err = systemMgr.EnsureSystemFiles(ctx()) + err := systemMgr.EnsureSystemFiles(ctx()) require.NoError(t, err) t.Log("System files ready!") @@ -138,7 +105,7 @@ func TestCreateInstance_InvalidSizeFormat(t *testing.T) { // Test with invalid size format invalidSize := "not-a-size" - networkDisabled := false + networkEnabled := false resp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{ Body: &oapi.CreateInstanceRequest{ @@ -148,7 +115,7 @@ func TestCreateInstance_InvalidSizeFormat(t *testing.T) { Network: &struct { Enabled *bool `json:"enabled,omitempty"` }{ - Enabled: &networkDisabled, + Enabled: &networkEnabled, }, }, }) diff --git a/lib/exec/client.go b/lib/exec/client.go index dce0ae97..35dc2786 100644 --- a/lib/exec/client.go +++ b/lib/exec/client.go @@ -5,13 +5,24 @@ import ( "context" "fmt" "io" + "log/slog" "net" "strings" + "time" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" ) +const ( + // vsockDialTimeout is the timeout for connecting to the vsock Unix socket + vsockDialTimeout = 5 * time.Second + // vsockHandshakeTimeout is the timeout for the Cloud Hypervisor vsock handshake + vsockHandshakeTimeout = 5 * time.Second + // vsockGuestPort is the port the exec-agent listens on inside the guest + vsockGuestPort = 2222 +) + // ExitStatus represents command exit information type ExitStatus struct { Code int @@ -29,41 +40,24 @@ type ExecOptions struct { Timeout int32 // Execution timeout in seconds (0 = no timeout) } +// bufferedConn wraps a net.Conn with a bufio.Reader to ensure any buffered +// data from the handshake is properly drained before reading from the connection +type bufferedConn struct { + net.Conn + reader *bufio.Reader +} + +func (c *bufferedConn) Read(p []byte) (int, error) { + return c.reader.Read(p) +} + // ExecIntoInstance executes command in instance via vsock using gRPC // vsockSocketPath is the Unix socket created by Cloud Hypervisor (e.g., /var/lib/hypeman/guests/{id}/vsock.sock) func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOptions) (*ExitStatus, error) { // Connect to Cloud Hypervisor's vsock Unix socket with custom dialer grpcConn, err := grpc.NewClient("passthrough:///vsock", grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) { - // Connect to CH's Unix socket - conn, err := net.Dial("unix", vsockSocketPath) - if err != nil { - return nil, fmt.Errorf("dial unix socket: %w", err) - } - - // Perform Cloud Hypervisor vsock handshake - if _, err := fmt.Fprintf(conn, "CONNECT 2222\n"); err != nil { - conn.Close() - return nil, fmt.Errorf("send handshake: %w", err) - } - - // Read handshake response - reader := bufio.NewReader(conn) - response, err := reader.ReadString('\n') - if err != nil { - conn.Close() - return nil, fmt.Errorf("read handshake response: %w", err) - } - - if !strings.HasPrefix(response, "OK ") { - conn.Close() - return nil, fmt.Errorf("handshake failed: %s", strings.TrimSpace(response)) - } - - // Return the connection for gRPC to use - // Note: bufio.Reader may have buffered data, but since we only read one line - // and gRPC will start fresh, this should be safe - return conn, nil + return dialVsock(ctx, vsockSocketPath) }), grpc.WithTransportCredentials(insecure.NewCredentials()), ) @@ -139,3 +133,64 @@ func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOpti } } +// dialVsock connects to Cloud Hypervisor's vsock Unix socket and performs the handshake +func dialVsock(ctx context.Context, vsockSocketPath string) (net.Conn, error) { + slog.Debug("connecting to vsock", "socket", vsockSocketPath) + + // Use dial timeout, respecting context deadline if shorter + dialTimeout := vsockDialTimeout + if deadline, ok := ctx.Deadline(); ok { + if remaining := time.Until(deadline); remaining < dialTimeout { + dialTimeout = remaining + } + } + + // Connect to CH's Unix socket with timeout + dialer := net.Dialer{Timeout: dialTimeout} + conn, err := dialer.DialContext(ctx, "unix", vsockSocketPath) + if err != nil { + return nil, fmt.Errorf("dial vsock socket %s: %w", vsockSocketPath, err) + } + + slog.Debug("connected to vsock socket, performing handshake", "port", vsockGuestPort) + + // Set deadline for handshake + if err := conn.SetDeadline(time.Now().Add(vsockHandshakeTimeout)); err != nil { + conn.Close() + return nil, fmt.Errorf("set handshake deadline: %w", err) + } + + // Perform Cloud Hypervisor vsock handshake + handshakeCmd := fmt.Sprintf("CONNECT %d\n", vsockGuestPort) + if _, err := conn.Write([]byte(handshakeCmd)); err != nil { + conn.Close() + return nil, fmt.Errorf("send vsock handshake: %w", err) + } + + // Read handshake response + reader := bufio.NewReader(conn) + response, err := reader.ReadString('\n') + if err != nil { + conn.Close() + return nil, fmt.Errorf("read vsock handshake response (is exec-agent running in guest?): %w", err) + } + + // Clear deadline after successful handshake + if err := conn.SetDeadline(time.Time{}); err != nil { + conn.Close() + return nil, fmt.Errorf("clear deadline: %w", err) + } + + response = strings.TrimSpace(response) + if !strings.HasPrefix(response, "OK ") { + conn.Close() + return nil, fmt.Errorf("vsock handshake failed: %s", response) + } + + slog.Debug("vsock handshake successful", "response", response) + + // Return wrapped connection that uses the bufio.Reader + // This ensures any bytes buffered during handshake are not lost + return &bufferedConn{Conn: conn, reader: reader}, nil +} + diff --git a/lib/system/init_script.go b/lib/system/init_script.go index 7df0a70a..cd36c85c 100644 --- a/lib/system/init_script.go +++ b/lib/system/init_script.go @@ -92,13 +92,14 @@ chroot /overlay/newroot ln -sf /proc/self/fd/0 /dev/stdin 2>/dev/null || true chroot /overlay/newroot ln -sf /proc/self/fd/1 /dev/stdout 2>/dev/null || true chroot /overlay/newroot ln -sf /proc/self/fd/2 /dev/stderr 2>/dev/null || true -# Configure network inside the container view +# Configure network from initrd (using busybox ip, not container's ip) +# Network interfaces are shared, so we can configure them from here if [ -n "${GUEST_IP:-}" ]; then echo "overlay-init: configuring network" - chroot /overlay/newroot ip link set lo up - chroot /overlay/newroot ip addr add ${GUEST_IP}/${GUEST_CIDR} dev eth0 - chroot /overlay/newroot ip link set eth0 up - chroot /overlay/newroot ip route add default via ${GUEST_GW} + ip link set lo up + ip addr add ${GUEST_IP}/${GUEST_CIDR} dev eth0 + ip link set eth0 up + ip route add default via ${GUEST_GW} echo "nameserver ${GUEST_DNS}" > /overlay/newroot/etc/resolv.conf echo "overlay-init: network configured - IP: ${GUEST_IP}/${GUEST_CIDR}" fi @@ -134,6 +135,8 @@ wait $APP_PID APP_EXIT=$? echo "overlay-init: app exited with code $APP_EXIT" -exit $APP_EXIT` -} +# Wait for all background jobs (exec-agent runs forever, keeping init alive) +# This prevents kernel panic from killing init (PID 1) +wait` +}