Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/api/api/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,7 @@ func instanceToOAPI(inst instances.Instance) oapi.Instance {
Name: inst.Name,
Image: inst.Image,
State: oapi.InstanceState(inst.State),
StateError: inst.StateError,
Size: lo.ToPtr(sizeStr),
HotplugSize: lo.ToPtr(hotplugSizeStr),
OverlaySize: lo.ToPtr(overlaySizeStr),
Expand Down
20 changes: 10 additions & 10 deletions lib/instances/delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ func (m *manager) deleteInstance(
) error {
log := logger.FromContext(ctx)
log.InfoContext(ctx, "deleting instance", "id", id)

// 1. Load instance
meta, err := m.loadMetadata(id)
if err != nil {
Expand All @@ -40,7 +40,8 @@ func (m *manager) deleteInstance(
}

// 3. If VMM might be running, force kill it
if inst.State.RequiresVMM() {
// Also attempt kill for StateUnknown since we can't be sure if VMM is running
if inst.State.RequiresVMM() || inst.State == StateUnknown {
log.DebugContext(ctx, "stopping VMM", "id", id, "state", inst.State)
if err := m.killVMM(ctx, &inst); err != nil {
// Log error but continue with cleanup
Expand Down Expand Up @@ -85,11 +86,11 @@ func (m *manager) deleteInstance(
// For operations that need graceful shutdown (like standby), use the VMM API directly.
func (m *manager) killVMM(ctx context.Context, inst *Instance) error {
log := logger.FromContext(ctx)

// If we have a PID, kill the process immediately
if inst.CHPID != nil {
pid := *inst.CHPID

// Check if process exists
if err := syscall.Kill(pid, 0); err == nil {
// Process exists - kill it immediately with SIGKILL
Expand All @@ -98,7 +99,7 @@ func (m *manager) killVMM(ctx context.Context, inst *Instance) error {
if err := syscall.Kill(pid, syscall.SIGKILL); err != nil {
log.WarnContext(ctx, "failed to kill VMM process", "id", inst.Id, "pid", pid, "error", err)
}

// Wait for process to die and reap it to prevent zombies
// SIGKILL should be instant, but give it a moment
for i := 0; i < 50; i++ { // 50 * 100ms = 5 seconds
Expand All @@ -118,18 +119,18 @@ func (m *manager) killVMM(ctx context.Context, inst *Instance) error {
log.DebugContext(ctx, "VMM process not running", "id", inst.Id, "pid", pid)
}
}

// Clean up socket if it still exists
os.Remove(inst.SocketPath)

return nil
}

// WaitForProcessExit polls for a process to exit, returns true if exited within timeout.
// Exported for use in tests.
func WaitForProcessExit(pid int, timeout time.Duration) bool {
deadline := time.Now().Add(timeout)

for time.Now().Before(deadline) {
// Check if process still exists (signal 0 doesn't kill, just checks existence)
if err := syscall.Kill(pid, 0); err != nil {
Expand All @@ -140,8 +141,7 @@ func WaitForProcessExit(pid int, timeout time.Duration) bool {
// 10ms polling interval balances responsiveness with CPU usage
time.Sleep(10 * time.Millisecond)
}

// Timeout reached, process still exists
return false
}

84 changes: 56 additions & 28 deletions lib/instances/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,63 +2,90 @@ package instances

import (
"context"
"fmt"
"os"
"path/filepath"

"github.com/onkernel/hypeman/lib/logger"
"github.com/onkernel/hypeman/lib/vmm"
)

// deriveState determines instance state by checking socket and querying VMM
func (m *manager) deriveState(ctx context.Context, stored *StoredMetadata) State {
// stateResult holds the result of state derivation
type stateResult struct {
State State
Error *string // Non-nil if state couldn't be determined
}

// deriveState determines instance state by checking socket and querying VMM.
// Returns StateUnknown with an error message if the socket exists but VMM is unreachable.
func (m *manager) deriveState(ctx context.Context, stored *StoredMetadata) stateResult {
log := logger.FromContext(ctx)

// 1. Check if socket exists
if _, err := os.Stat(stored.SocketPath); err != nil {
// No socket - check for snapshot to distinguish Stopped vs Standby
if m.hasSnapshot(stored.DataDir) {
return StateStandby
return stateResult{State: StateStandby}
}
return StateStopped
return stateResult{State: StateStopped}
}

// 2. Socket exists - query VMM for actual state
client, err := vmm.NewVMM(stored.SocketPath)
if err != nil {
// Stale socket - check for snapshot to distinguish Stopped vs Standby
if m.hasSnapshot(stored.DataDir) {
return StateStandby
}
return StateStopped
// Failed to create client - this is unexpected if socket exists
errMsg := fmt.Sprintf("failed to create VMM client: %v", err)
log.WarnContext(ctx, "failed to determine instance state",
"instance_id", stored.Id,
"socket", stored.SocketPath,
"error", err,
)
return stateResult{State: StateUnknown, Error: &errMsg}
}

resp, err := client.GetVmInfoWithResponse(ctx)
if err != nil {
// VMM unreachable - stale socket, check for snapshot
if m.hasSnapshot(stored.DataDir) {
return StateStandby
}
return StateStopped
// Socket exists but VMM is unreachable - this is unexpected
errMsg := fmt.Sprintf("failed to query VMM: %v", err)
log.WarnContext(ctx, "failed to query VMM state",
"instance_id", stored.Id,
"socket", stored.SocketPath,
"error", err,
)
return stateResult{State: StateUnknown, Error: &errMsg}
}

if resp.StatusCode() != 200 || resp.JSON200 == nil {
// VMM returned error - check for snapshot
if m.hasSnapshot(stored.DataDir) {
return StateStandby
}
return StateStopped
// VMM returned an error - log it and return Unknown
body := string(resp.Body)
errMsg := fmt.Sprintf("VMM returned error (status %d): %s", resp.StatusCode(), body)
log.WarnContext(ctx, "VMM returned error response",
"instance_id", stored.Id,
"socket", stored.SocketPath,
"status_code", resp.StatusCode(),
"body", body,
)
return stateResult{State: StateUnknown, Error: &errMsg}
}

// 3. Map CH state to our state
switch resp.JSON200.State {
case vmm.Created:
return StateCreated
return stateResult{State: StateCreated}
case vmm.Running:
return StateRunning
return stateResult{State: StateRunning}
case vmm.Paused:
return StatePaused
return stateResult{State: StatePaused}
case vmm.Shutdown:
return StateShutdown
return stateResult{State: StateShutdown}
default:
return StateStopped
// Unknown CH state - log and return Unknown
errMsg := fmt.Sprintf("unexpected VMM state: %s", resp.JSON200.State)
log.WarnContext(ctx, "VMM returned unexpected state",
"instance_id", stored.Id,
"vmm_state", resp.JSON200.State,
)
return stateResult{State: StateUnknown, Error: &errMsg}
}
}

Expand All @@ -83,9 +110,11 @@ func (m *manager) hasSnapshot(dataDir string) bool {

// toInstance converts stored metadata to Instance with derived fields
func (m *manager) toInstance(ctx context.Context, meta *metadata) Instance {
result := m.deriveState(ctx, &meta.StoredMetadata)
inst := Instance{
StoredMetadata: meta.StoredMetadata,
State: m.deriveState(ctx, &meta.StoredMetadata),
State: result.State,
StateError: result.Error,
HasSnapshot: m.hasSnapshot(meta.StoredMetadata.DataDir),
}
return inst
Expand All @@ -95,7 +124,7 @@ func (m *manager) toInstance(ctx context.Context, meta *metadata) Instance {
func (m *manager) listInstances(ctx context.Context) ([]Instance, error) {
log := logger.FromContext(ctx)
log.DebugContext(ctx, "listing all instances")

files, err := m.listMetadataFiles()
if err != nil {
log.ErrorContext(ctx, "failed to list metadata files", "error", err)
Expand Down Expand Up @@ -127,7 +156,7 @@ func (m *manager) listInstances(ctx context.Context) ([]Instance, error) {
func (m *manager) getInstance(ctx context.Context, id string) (*Instance, error) {
log := logger.FromContext(ctx)
log.DebugContext(ctx, "getting instance", "id", id)

meta, err := m.loadMetadata(id)
if err != nil {
log.ErrorContext(ctx, "failed to load instance metadata", "id", id, "error", err)
Expand All @@ -138,4 +167,3 @@ func (m *manager) getInstance(ctx context.Context, id string) (*Instance, error)
log.DebugContext(ctx, "retrieved instance", "id", id, "state", inst.State)
return &inst, nil
}

9 changes: 6 additions & 3 deletions lib/instances/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ var ValidTransitions = map[State][]State{
StatePaused, // start VMM + restore (atomic operation)
StateStopped, // delete snapshot + cleanup (terminal)
},
// StateUnknown means we failed to determine state - no transitions allowed.
// Operations on instances in Unknown state should fail with an error
// until the underlying issue is resolved.
// Can still Delete the instance.
StateUnknown: {},
}

// CanTransitionTo checks if a transition from current state to target state is valid
Expand Down Expand Up @@ -65,11 +70,9 @@ func (s State) RequiresVMM() bool {
switch s {
case StateCreated, StateRunning, StatePaused, StateShutdown:
return true
case StateStopped, StateStandby:
case StateStopped, StateStandby, StateUnknown:
return false
default:
return false
}
}


6 changes: 4 additions & 2 deletions lib/instances/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ const (
StatePaused State = "Paused" // VM paused (CH native)
StateShutdown State = "Shutdown" // VM shutdown, VMM exists (CH native)
StateStandby State = "Standby" // No VMM, snapshot exists
StateUnknown State = "Unknown" // Failed to determine state (VMM query failed)
)

// VolumeAttachment represents a volume attached to an instance
Expand Down Expand Up @@ -73,8 +74,9 @@ type Instance struct {
StoredMetadata

// Derived fields (not stored in metadata.json)
State State // Derived from socket + VMM query
HasSnapshot bool // Derived from filesystem check
State State // Derived from socket + VMM query
StateError *string // Error message if state couldn't be determined (non-nil when State=Unknown)
HasSnapshot bool // Derived from filesystem check
}

// CreateInstanceRequest is the domain request for creating an instance
Expand Down
Loading
Loading