From 5ce83d47a8bca6c632fcc4cc7ddf65b1e928319f Mon Sep 17 00:00:00 2001
From: Rory Ford <me@roryford.com>
Date: Mon, 15 Jun 2026 19:45:05 +1000
Subject: [PATCH] feat(inference): ResidentModelStatus snapshot +
 queuedRequestCount

Adds read-only runtime observability to InferenceService:
- ResidentModelStatus (new public struct) snapshots modelID, backend, estimated
  footprint, loadedAt, lastActivityAt, and a live idleDuration computed property.
- queuedRequestCount (Int) exposes queue depth, complementing the Bool-only hasQueuedRequests.
- lastActivityTimestamp tracked in GenerationQueue at enqueue, dequeue-to-active,
  and request completion; exposed via lastActivityAt.
- loadedAt and residentFootprintBytes tracked in ModelLifecycleCoordinator, set at
  commitLoadIfCurrent and cleared at unloadModel; footprint threaded from
  ModelLoadPlan.outcome.totalEstimatedBytes for local loads (nil for cloud/debug init).

Closes #1880

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../ResidentModelStatus.swift                 |  44 ++++++
 .../Services/GenerationQueue.swift            |  15 ++
 .../Services/InferenceService.swift           |  25 ++++
 .../Services/ModelLifecycleCoordinator.swift  |  32 ++++-
 .../ResidentModelStatusTests.swift            | 128 ++++++++++++++++++
 5 files changed, 241 insertions(+), 3 deletions(-)
 create mode 100644 Sources/ManifoldInference/ResidentModelStatus.swift
 create mode 100644 Tests/ManifoldInferenceTests/ResidentModelStatusTests.swift

diff --git a/Sources/ManifoldInference/ResidentModelStatus.swift b/Sources/ManifoldInference/ResidentModelStatus.swift
new file mode 100644
index 000000000..ca2e6adc7
--- /dev/null
+++ b/Sources/ManifoldInference/ResidentModelStatus.swift
@@ -0,0 +1,44 @@
+import Foundation
+
+/// A point-in-time snapshot of the currently resident model's identity and
+/// runtime characteristics.
+///
+/// Obtained via ``InferenceService/residentModelStatus``. Returns `nil` when
+/// no model is loaded. Useful for dashboards, telemetry, and idle-eviction
+/// policies that need to know which model is loaded, how long it has been idle,
+/// and how much memory it is estimated to occupy.
+///
+/// ```swift
+/// if let status = inferenceService.residentModelStatus {
+///     print("Model: \(status.modelID) via \(status.backend)")
+///     print("Idle for \(status.idleDuration.formatted()) seconds")
+/// }
+/// ```
+public struct ResidentModelStatus: Sendable {
+
+    /// The human-readable model identifier (e.g. `ModelInfo.name`).
+    public let modelID: String
+
+    /// The backend engine label (e.g. `"Mock"`, `"llama"`, `"ollama"`).
+    public let backend: String
+
+    /// Best-effort selection-time footprint estimate in bytes.
+    ///
+    /// Sourced from `ModelLoadPlan.outcome.totalEstimatedBytes` for local
+    /// (on-disk) loads. `nil` for cloud / system-managed endpoints where no
+    /// local memory estimate is computed.
+    public let estimatedFootprintBytes: UInt64?
+
+    /// The moment the model transitioned to `isModelLoaded == true`.
+    public let loadedAt: Date
+
+    /// The timestamp of the most recent queue activity: enqueue, dequeue-to-active,
+    /// or request completion.
+    public let lastActivityAt: Date
+
+    /// Seconds elapsed since `lastActivityAt`, evaluated against the wall clock on
+    /// every access so a stored snapshot never needs refreshing. Clamped to zero so a
+    /// backwards clock adjustment (or a future-dated timestamp) cannot yield a
+    /// negative idle time.
+    public var idleDuration: TimeInterval { max(0, Date().timeIntervalSince(lastActivityAt)) }
+}
diff --git a/Sources/ManifoldInference/Services/GenerationQueue.swift b/Sources/ManifoldInference/Services/GenerationQueue.swift
index 400cf0451..5c152a5ee 100644
--- a/Sources/ManifoldInference/Services/GenerationQueue.swift
+++ b/Sources/ManifoldInference/Services/GenerationQueue.swift
@@ -197,10 +197,22 @@ final class GenerationQueue {
     private var continuations: [GenerationRequestToken: AsyncThrowingStream<GenerationEvent, Error>.Continuation] = [:]
     private let maxQueueDepth = 8
 
+    /// Timestamp of the most recent queue activity: enqueue, dequeue-to-active, or
+    /// completion. Starts at `.distantPast`, so until the first request arrives any
+    /// idle duration derived from it is effectively unbounded rather than zero.
+    private var lastActivityTimestamp: Date = .distantPast
+
     // MARK: - Computed
 
     var hasQueuedRequests: Bool { !requestQueue.isEmpty }
 
+    /// Number of requests currently waiting in the queue (not including the
+    /// active request being generated). Exposed publicly via `InferenceService`.
+    var queuedRequestCount: Int { requestQueue.count }
+
+    /// Timestamp of the most recent queue activity.
+    var lastActivityAt: Date { lastActivityTimestamp }
+
     var lastTokenUsage: (promptTokens: Int, completionTokens: Int)? {
         (currentBackend as? TokenUsageProvider)?.lastUsage
     }
@@ -601,6 +613,7 @@ final class GenerationQueue {
             requestQueue.append(request)
         }
 
+        lastActivityTimestamp = Date()
         drainQueue()
         return (token: token, stream: stream)
     }
@@ -769,6 +782,7 @@ final class GenerationQueue {
 
         activeRequest = next
         isGenerating = true
+        lastActivityTimestamp = Date()
         next.stream.setPhase(.connecting)
 
         activeTask = Task { [weak self] in
@@ -795,6 +809,7 @@ final class GenerationQueue {
                     }
                 }
                 if self.activeRequest?.token == next.token {
+                    self.lastActivityTimestamp = Date()
                     self.activeRequest = nil
                     self.activeTask = nil
                     self.isGenerating = false
diff --git a/Sources/ManifoldInference/Services/InferenceService.swift b/Sources/ManifoldInference/Services/InferenceService.swift
index 46f527044..9b0ec8c3c 100644
--- a/Sources/ManifoldInference/Services/InferenceService.swift
+++ b/Sources/ManifoldInference/Services/InferenceService.swift
@@ -688,6 +688,31 @@ public final class InferenceService {
         return generation.hasQueuedRequests
     }
 
+    /// Count of requests waiting in the generation queue, excluding the request
+    /// that is currently being generated. Zero when nothing is waiting.
+    public var queuedRequestCount: Int {
+        ensureProviderWired()
+        return generation.queuedRequestCount
+    }
+
+    /// A point-in-time snapshot of the resident model (see ``ResidentModelStatus``),
+    /// or `nil` when no model is loaded. `lastActivityAt` is floored at `loadedAt`:
+    /// the queue's timestamp starts at `.distantPast` and is not reset on load, so a
+    /// freshly loaded model would otherwise look idle for an unbounded time.
+    public var residentModelStatus: ResidentModelStatus? {
+        guard lifecycle.isModelLoaded,
+              let modelID = lifecycle.activeModelName,
+              let backend = lifecycle.activeBackendName,
+              let loadedAt = lifecycle.loadedAt else { return nil }
+        return ResidentModelStatus(
+            modelID: modelID,
+            backend: backend,
+            estimatedFootprintBytes: lifecycle.residentFootprintBytes,
+            loadedAt: loadedAt,
+            lastActivityAt: max(generation.lastActivityAt, loadedAt)
+        )
+    }
+
     public func resetConversation() {
         lifecycle.resetConversation()
     }
diff --git a/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift b/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift
index 35bd9ee21..29341430a 100644
--- a/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift
+++ b/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift
@@ -19,6 +19,16 @@ final class ModelLifecycleCoordinator {
     private(set) var activeModelName: String?
     private(set) var modelLoadProgress: Double?
 
+    /// Timestamp of the moment `isModelLoaded` transitioned to `true` for the
+    /// current resident model. Cleared to `nil` on unload.
+    private(set) var loadedAt: Date?
+
+    /// Best-effort selection-time footprint estimate for the resident model, in
+    /// bytes. Sourced from `ModelLoadPlan.outcome.totalEstimatedBytes` when a
+    /// plan-based load is used. `nil` for cloud endpoints or when no plan was
+    /// computed (e.g. the `#if DEBUG` test init path).
+    private(set) var residentFootprintBytes: UInt64?
+
     /// Identity of the ``APIEndpointRecord`` backing the active endpoint
     /// backend, or `nil` for on-disk model loads (which have no endpoint
     /// record). Threaded through the load commit so usage accounting can
@@ -98,6 +108,7 @@ final class ModelLifecycleCoordinator {
         self.isModelLoaded = true
         self.activeBackendName = name
         self.activeModelName = modelName
+        self.loadedAt = Date()
         let request = LoadRequestToken(rawValue: 1)
         self.nextLoadRequestToken = request
         self.latestRequestedLoadToken = request
@@ -238,12 +249,20 @@ final class ModelLifecycleCoordinator {
         let url = modelInfo.url
         let mmprojURL = modelInfo.mmprojURL
         let dispatchPlan = effectivePlan
+        // Capture the plan's footprint estimate so `commitLoadIfCurrent` can
+        // store it as `residentFootprintBytes`. Only non-zero estimates are
+        // meaningful; zero is the plan's unset default for cloud/system-managed
+        // backends and is stored as `nil` to signal "unknown".
+        let footprint: UInt64? = plan.outcome.totalEstimatedBytes > 0
+            ? plan.outcome.totalEstimatedBytes
+            : nil
         try await runLoad(
             source: "local",
             target: modelTypeLogLabel(modelInfo.modelType),
             backendName: backendName,
             backend: newBackend,
-            modelName: modelInfo.name
+            modelName: modelInfo.name,
+            footprintBytes: footprint
         ) {
             (newBackend as? MultimodalProjectorConfigurable)?.setMmprojURL(mmprojURL)
             try await newBackend.loadModel(from: url, plan: dispatchPlan)
@@ -262,6 +281,7 @@ final class ModelLifecycleCoordinator {
         backend newBackend: any InferenceBackend,
         modelName: String,
         endpointID: UUID? = nil,
+        footprintBytes: UInt64? = nil,
         loadOperation: @escaping @Sendable () async throws -> Void
     ) async throws {
         let request = beginLoadRequest(
@@ -290,7 +310,8 @@ final class ModelLifecycleCoordinator {
             backend: newBackend,
             backendName: backendName,
             modelName: modelName,
-            endpointID: endpointID
+            endpointID: endpointID,
+            footprintBytes: footprintBytes
         ) else {
             newBackend.unloadModel()
             logLoadEvent("load.suppress", request: request, reason: "stale-success", clearMetadata: true)
@@ -369,6 +390,8 @@ final class ModelLifecycleCoordinator {
         activeBackendName = nil
         activeModelName = nil
         activeEndpointID = nil
+        loadedAt = nil
+        residentFootprintBytes = nil
     }
 
     // MARK: - Capability Queries
@@ -527,7 +550,8 @@ final class ModelLifecycleCoordinator {
         backend newBackend: any InferenceBackend,
         backendName: String,
         modelName: String,
-        endpointID: UUID? = nil
+        endpointID: UUID? = nil,
+        footprintBytes: UInt64? = nil
     ) -> Bool {
         guard canCommitLoad(request) else { return false }
         backend = newBackend
@@ -536,6 +560,8 @@ final class ModelLifecycleCoordinator {
         activeBackendName = backendName
         activeModelName = modelName
         activeEndpointID = endpointID
+        loadedAt = Date()
+        residentFootprintBytes = footprintBytes
         loadPhase = .loaded(request: request)
         logLoadEvent("load.commit", request: request, clearMetadata: true)
         return true
diff --git a/Tests/ManifoldInferenceTests/ResidentModelStatusTests.swift b/Tests/ManifoldInferenceTests/ResidentModelStatusTests.swift
new file mode 100644
index 000000000..efef620bd
--- /dev/null
+++ b/Tests/ManifoldInferenceTests/ResidentModelStatusTests.swift
@@ -0,0 +1,128 @@
+import XCTest
+@testable import ManifoldInference
+import ManifoldTestSupport
+
+/// Tests for ``ResidentModelStatus`` and ``InferenceService/queuedRequestCount``.
+///
+/// Uses XCTestCase (not Swift Testing) per the two-runner constraint (#681):
+/// mixing Swift Testing and XCTest in one process causes libmalloc SIGABRT.
+@MainActor
+final class ResidentModelStatusTests: XCTestCase {
+
+    // MARK: - Helpers
+
+    private func makeService(modelName: String = "TestModel") -> (InferenceService, MockInferenceBackend) {
+        let backend = MockInferenceBackend()
+        backend.isModelLoaded = true
+        backend.tokensToYield = ["hello"]
+        let service = InferenceService(backend: backend, name: "Mock", modelName: modelName)
+        return (service, backend)
+    }
+
+    // MARK: - residentModelStatus — loaded
+
+    func test_residentModelStatus_nonNil_whenModelLoaded() {
+        let (service, _) = makeService(modelName: "Llama-3")
+        XCTAssertNotNil(service.residentModelStatus,
+                        "residentModelStatus must be non-nil when a model is loaded")
+    }
+
+    func test_residentModelStatus_modelID_matchesModelName() {
+        let (service, _) = makeService(modelName: "Llama-3")
+        XCTAssertEqual(service.residentModelStatus?.modelID, "Llama-3",
+                       "modelID must match the name supplied at load time")
+    }
+
+    func test_residentModelStatus_backend_matchesBackendName() {
+        let (service, _) = makeService()
+        XCTAssertEqual(service.residentModelStatus?.backend, "Mock",
+                       "backend must match the name supplied at load time")
+    }
+
+    func test_residentModelStatus_loadedAt_isRecent() throws {
+        let before = Date()
+        let (service, _) = makeService()
+        let after = Date()
+        // XCTUnwrap fails the test at this line if no snapshot is available,
+        // so the bounds checks below only run against a real timestamp.
+        let status = try XCTUnwrap(service.residentModelStatus,
+                                   "residentModelStatus must be non-nil when a model is loaded")
+        XCTAssertGreaterThanOrEqual(status.loadedAt, before,
+                                    "loadedAt must not precede service creation")
+        XCTAssertLessThanOrEqual(status.loadedAt, after,
+                                 "loadedAt must not be in the future")
+    }
+
+    func test_residentModelStatus_estimatedFootprintBytes_nilForDebugInit() {
+        // The #if DEBUG init path (used by InferenceService(backend:name:)) does
+        // not have access to a ModelLoadPlan so footprint is always nil there.
+        let (service, _) = makeService()
+        XCTAssertNil(service.residentModelStatus?.estimatedFootprintBytes,
+                     "estimatedFootprintBytes must be nil when loaded via the debug-init path (no plan)")
+    }
+
+    // MARK: - residentModelStatus — unloaded
+
+    func test_residentModelStatus_nil_whenNoModelLoaded() {
+        let service = InferenceService()
+        XCTAssertNil(service.residentModelStatus,
+                     "residentModelStatus must be nil when no model is loaded")
+    }
+
+    func test_residentModelStatus_nil_afterUnload() {
+        let (service, _) = makeService()
+        XCTAssertNotNil(service.residentModelStatus, "pre-condition: should be loaded")
+        service.unloadModel()
+        XCTAssertNil(service.residentModelStatus,
+                     "residentModelStatus must be nil after unloadModel()")
+    }
+
+    // MARK: - queuedRequestCount
+
+    func test_queuedRequestCount_zeroInitially() {
+        let (service, _) = makeService()
+        XCTAssertEqual(service.queuedRequestCount, 0,
+                       "queuedRequestCount must be 0 before any requests are enqueued")
+    }
+
+    func test_queuedRequestCount_zeroAfterGenerationCompletes() async throws {
+        let (service, _) = makeService()
+        let (_, stream) = try service.enqueue(
+            messages: [Message.user("hi")]
+        )
+        // Drain the stream to let the queue settle.
+        for try await _ in stream.events {}
+        XCTAssertEqual(service.queuedRequestCount, 0,
+                       "queuedRequestCount must return to 0 after a generation finishes")
+    }
+
+    // MARK: - lastActivityAt
+
+    func test_lastActivityAt_updatedAfterGenerationCompletes() async throws {
+        let (service, _) = makeService()
+        let before = Date()
+        let (_, stream) = try service.enqueue(
+            messages: [Message.user("hi")]
+        )
+        for try await _ in stream.events {}
+        let after = Date()
+        // XCTUnwrap records a single failure and stops here when the snapshot
+        // is missing, instead of falling through to the bounds checks.
+        let status = try XCTUnwrap(service.residentModelStatus,
+                                   "residentModelStatus must be non-nil after generation")
+        XCTAssertGreaterThanOrEqual(status.lastActivityAt, before,
+                                    "lastActivityAt must be >= the moment before enqueue")
+        XCTAssertLessThanOrEqual(status.lastActivityAt, after,
+                                 "lastActivityAt must not be in the future")
+    }
+
+    func test_idleDuration_isNonNegative() async throws {
+        let (service, _) = makeService()
+        let (_, stream) = try service.enqueue(messages: [Message.user("hi")])
+        for try await _ in stream.events {}
+        // Unwrap instead of substituting a `-1` sentinel for a missing snapshot.
+        let status = try XCTUnwrap(service.residentModelStatus)
+        XCTAssertGreaterThanOrEqual(status.idleDuration, 0,
+                                    "idleDuration must always be >= 0")
+    }
+}