From 5ce83d47a8bca6c632fcc4cc7ddf65b1e928319f Mon Sep 17 00:00:00 2001
From: Rory Ford
Date: Mon, 15 Jun 2026 19:45:05 +1000
Subject: [PATCH] feat(inference): ResidentModelStatus snapshot + queuedRequestCount

Adds read-only runtime observability to InferenceService:
- ResidentModelStatus (new public struct) snapshots modelID, backend,
  estimated footprint, loadedAt, lastActivityAt, and a live idleDuration
  computed property.
- queuedRequestCount (Int) exposes queue depth without the Bool-only
  hasQueuedRequests.
- lastActivityTimestamp tracked in GenerationQueue at enqueue,
  dequeue-to-active, and request completion; exposed via lastActivityAt.
- loadedAt and residentFootprintBytes tracked in ModelLifecycleCoordinator,
  set at commitLoadIfCurrent and cleared at unloadModel; footprint threaded
  from ModelLoadPlan.outcome.totalEstimatedBytes for local loads (nil for
  cloud/debug init).
- residentModelStatus floors lastActivityAt at loadedAt (the queue's
  timestamp starts at .distantPast and is tracked independently of model
  loads), and idleDuration is clamped to >= 0 so a wall-clock adjustment
  cannot produce a negative idle time.

Closes #1880

Co-Authored-By: Claude Sonnet 4.6
---
 .../ResidentModelStatus.swift                 |  51 +++++++
 .../Services/GenerationQueue.swift            |  16 ++
 .../Services/InferenceService.swift           |  29 ++++
 .../Services/ModelLifecycleCoordinator.swift  |  32 ++++-
 .../ResidentModelStatusTests.swift            | 140 ++++++++++++++++++
 5 files changed, 265 insertions(+), 3 deletions(-)
 create mode 100644 Sources/ManifoldInference/ResidentModelStatus.swift
 create mode 100644 Tests/ManifoldInferenceTests/ResidentModelStatusTests.swift

diff --git a/Sources/ManifoldInference/ResidentModelStatus.swift b/Sources/ManifoldInference/ResidentModelStatus.swift
new file mode 100644
index 000000000..ca2e6adc7
--- /dev/null
+++ b/Sources/ManifoldInference/ResidentModelStatus.swift
@@ -0,0 +1,51 @@
+import Foundation
+
+/// A point-in-time snapshot of the currently resident model's identity and
+/// runtime characteristics.
+///
+/// Obtained via ``InferenceService/residentModelStatus``. Returns `nil` when
+/// no model is loaded. Useful for dashboards, telemetry, and idle-eviction
+/// policies that need to know which model is loaded, how long it has been idle,
+/// and how much memory it is estimated to occupy.
+///
+/// ```swift
+/// if let status = inferenceService.residentModelStatus {
+///     print("Model: \(status.modelID) via \(status.backend)")
+///     print("Idle for \(status.idleDuration.formatted()) seconds")
+/// }
+/// ```
+public struct ResidentModelStatus: Sendable {
+
+    /// The human-readable model identifier (e.g. `ModelInfo.name`).
+    public let modelID: String
+
+    /// The backend engine label (e.g. `"Mock"`, `"llama"`, `"ollama"`).
+    public let backend: String
+
+    /// Best-effort selection-time footprint estimate in bytes.
+    ///
+    /// Sourced from `ModelLoadPlan.outcome.totalEstimatedBytes` for local
+    /// (on-disk) loads. `nil` for cloud / system-managed endpoints where no
+    /// local memory estimate is computed.
+    public let estimatedFootprintBytes: UInt64?
+
+    /// The moment the model transitioned to `isModelLoaded == true`.
+    public let loadedAt: Date
+
+    /// The timestamp of the most recent queue activity: enqueue, dequeue-to-active,
+    /// or request completion.
+    ///
+    /// Snapshots produced by ``InferenceService/residentModelStatus`` floor this
+    /// at ``loadedAt``, so a model that has served no requests yet reports its
+    /// load time here.
+    public let lastActivityAt: Date
+
+    /// Seconds elapsed since the last queue activity; never negative.
+    ///
+    /// Computed on access so the value reflects wall time without requiring a
+    /// periodic refresh of the snapshot itself. Clamped at zero because `Date()`
+    /// is wall-clock time and can move backwards after a clock adjustment.
+    public var idleDuration: TimeInterval {
+        max(0, Date().timeIntervalSince(lastActivityAt))
+    }
+}
diff --git a/Sources/ManifoldInference/Services/GenerationQueue.swift b/Sources/ManifoldInference/Services/GenerationQueue.swift
index 400cf0451..5c152a5ee 100644
--- a/Sources/ManifoldInference/Services/GenerationQueue.swift
+++ b/Sources/ManifoldInference/Services/GenerationQueue.swift
@@ -197,10 +197,23 @@ final class GenerationQueue {
     private var continuations: [GenerationRequestToken: AsyncThrowingStream.Continuation] = [:]
     private let maxQueueDepth = 8
 
+    /// Timestamp of the most recent queue activity: enqueue, dequeue-to-active, or
+    /// completion. `.distantPast` is a sentinel meaning "no activity yet"; readers
+    /// that derive an idle duration should floor it (e.g. at the model's load
+    /// time) rather than treat it as a real timestamp.
+    private var lastActivityTimestamp: Date = .distantPast
+
     // MARK: - Computed
 
     var hasQueuedRequests: Bool { !requestQueue.isEmpty }
 
+    /// Number of requests currently waiting in the queue (not including the
+    /// active request being generated). Exposed publicly via `InferenceService`.
+    var queuedRequestCount: Int { requestQueue.count }
+
+    /// Timestamp of the most recent queue activity.
+    var lastActivityAt: Date { lastActivityTimestamp }
+
     var lastTokenUsage: (promptTokens: Int, completionTokens: Int)? {
         (currentBackend as? TokenUsageProvider)?.lastUsage
     }
@@ -601,6 +614,7 @@ final class GenerationQueue {
             requestQueue.append(request)
         }
 
+        lastActivityTimestamp = Date()
         drainQueue()
         return (token: token, stream: stream)
     }
@@ -769,6 +783,7 @@ final class GenerationQueue {
         activeRequest = next
         isGenerating = true
 
+        lastActivityTimestamp = Date()
         next.stream.setPhase(.connecting)
 
         activeTask = Task { [weak self] in
@@ -795,6 +810,7 @@ final class GenerationQueue {
                     }
                 }
                 if self.activeRequest?.token == next.token {
+                    self.lastActivityTimestamp = Date()
                     self.activeRequest = nil
                     self.activeTask = nil
                     self.isGenerating = false
diff --git a/Sources/ManifoldInference/Services/InferenceService.swift b/Sources/ManifoldInference/Services/InferenceService.swift
index 46f527044..9b0ec8c3c 100644
--- a/Sources/ManifoldInference/Services/InferenceService.swift
+++ b/Sources/ManifoldInference/Services/InferenceService.swift
@@ -688,6 +688,35 @@ public final class InferenceService {
         return generation.hasQueuedRequests
     }
 
+    /// Number of requests waiting in the generation queue (not counting the
+    /// request currently being generated). Zero when the queue is idle.
+    public var queuedRequestCount: Int {
+        ensureProviderWired()
+        return generation.queuedRequestCount
+    }
+
+    /// A point-in-time snapshot of the resident model's identity and runtime
+    /// characteristics, or `nil` when no model is loaded.
+    ///
+    /// `lastActivityAt` is floored at `loadedAt`: the queue's own timestamp starts
+    /// at `.distantPast` and is tracked independently of model loads, so using it
+    /// unfloored would report a freshly loaded model as idle since before it loaded.
+    ///
+    /// See ``ResidentModelStatus`` for field documentation.
+    public var residentModelStatus: ResidentModelStatus? {
+        guard lifecycle.isModelLoaded,
+              let modelID = lifecycle.activeModelName,
+              let backend = lifecycle.activeBackendName,
+              let loadedAt = lifecycle.loadedAt else { return nil }
+        return ResidentModelStatus(
+            modelID: modelID,
+            backend: backend,
+            estimatedFootprintBytes: lifecycle.residentFootprintBytes,
+            loadedAt: loadedAt,
+            lastActivityAt: max(loadedAt, generation.lastActivityAt)
+        )
+    }
+
     public func resetConversation() {
         lifecycle.resetConversation()
     }
diff --git a/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift b/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift
index 35bd9ee21..29341430a 100644
--- a/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift
+++ b/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift
@@ -19,6 +19,16 @@ final class ModelLifecycleCoordinator {
     private(set) var activeModelName: String?
     private(set) var modelLoadProgress: Double?
 
+    /// Timestamp of the moment `isModelLoaded` transitioned to `true` for the
+    /// current resident model. Cleared to `nil` on unload.
+    private(set) var loadedAt: Date?
+
+    /// Best-effort selection-time footprint estimate for the resident model, in
+    /// bytes. Sourced from `ModelLoadPlan.outcome.totalEstimatedBytes` when a
+    /// plan-based load is used. `nil` for cloud endpoints or when no plan was
+    /// computed (e.g. the `#if DEBUG` test init path).
+    private(set) var residentFootprintBytes: UInt64?
+
     /// Identity of the ``APIEndpointRecord`` backing the active endpoint
     /// backend, or `nil` for on-disk model loads (which have no endpoint
     /// record). Threaded through the load commit so usage accounting can
@@ -98,6 +108,7 @@ final class ModelLifecycleCoordinator {
         self.isModelLoaded = true
         self.activeBackendName = name
         self.activeModelName = modelName
+        self.loadedAt = Date()
         let request = LoadRequestToken(rawValue: 1)
         self.nextLoadRequestToken = request
         self.latestRequestedLoadToken = request
@@ -238,12 +249,20 @@ final class ModelLifecycleCoordinator {
         let url = modelInfo.url
         let mmprojURL = modelInfo.mmprojURL
         let dispatchPlan = effectivePlan
+        // Capture the plan's footprint estimate so `commitLoadIfCurrent` can
+        // store it as `residentFootprintBytes`. Only non-zero estimates are
+        // meaningful; zero is the plan's unset default for cloud/system-managed
+        // backends and is stored as `nil` to signal "unknown".
+        let footprint: UInt64? = plan.outcome.totalEstimatedBytes > 0
+            ? plan.outcome.totalEstimatedBytes
+            : nil
         try await runLoad(
             source: "local",
             target: modelTypeLogLabel(modelInfo.modelType),
             backendName: backendName,
             backend: newBackend,
-            modelName: modelInfo.name
+            modelName: modelInfo.name,
+            footprintBytes: footprint
         ) {
             (newBackend as? MultimodalProjectorConfigurable)?.setMmprojURL(mmprojURL)
             try await newBackend.loadModel(from: url, plan: dispatchPlan)
@@ -262,6 +281,7 @@ final class ModelLifecycleCoordinator {
         backend newBackend: any InferenceBackend,
         modelName: String,
         endpointID: UUID? = nil,
+        footprintBytes: UInt64? = nil,
         loadOperation: @escaping @Sendable () async throws -> Void
     ) async throws {
         let request = beginLoadRequest(
@@ -290,7 +310,8 @@ final class ModelLifecycleCoordinator {
                 backend: newBackend,
                 backendName: backendName,
                 modelName: modelName,
-                endpointID: endpointID
+                endpointID: endpointID,
+                footprintBytes: footprintBytes
             ) else {
                 newBackend.unloadModel()
                 logLoadEvent("load.suppress", request: request, reason: "stale-success", clearMetadata: true)
@@ -369,6 +390,8 @@ final class ModelLifecycleCoordinator {
         activeBackendName = nil
         activeModelName = nil
         activeEndpointID = nil
+        loadedAt = nil
+        residentFootprintBytes = nil
     }
 
     // MARK: - Capability Queries
@@ -527,7 +550,8 @@ final class ModelLifecycleCoordinator {
         backend newBackend: any InferenceBackend,
         backendName: String,
         modelName: String,
-        endpointID: UUID? = nil
+        endpointID: UUID? = nil,
+        footprintBytes: UInt64? = nil
     ) -> Bool {
         guard canCommitLoad(request) else { return false }
         backend = newBackend
@@ -536,6 +560,8 @@ final class ModelLifecycleCoordinator {
         activeBackendName = backendName
         activeModelName = modelName
         activeEndpointID = endpointID
+        loadedAt = Date()
+        residentFootprintBytes = footprintBytes
         loadPhase = .loaded(request: request)
         logLoadEvent("load.commit", request: request, clearMetadata: true)
         return true
diff --git a/Tests/ManifoldInferenceTests/ResidentModelStatusTests.swift b/Tests/ManifoldInferenceTests/ResidentModelStatusTests.swift
new file mode 100644
index 000000000..efef620bd
--- /dev/null
+++ b/Tests/ManifoldInferenceTests/ResidentModelStatusTests.swift
@@ -0,0 +1,140 @@
+import XCTest
+@testable import ManifoldInference
+import ManifoldTestSupport
+
+/// Tests for ``ResidentModelStatus`` and ``InferenceService/queuedRequestCount``.
+///
+/// Uses XCTestCase (not Swift Testing) per the two-runner constraint (#681):
+/// mixing Swift Testing and XCTest in one process causes libmalloc SIGABRT.
+@MainActor
+final class ResidentModelStatusTests: XCTestCase {
+
+    // MARK: - Helpers
+
+    private func makeService(modelName: String = "TestModel") -> (InferenceService, MockInferenceBackend) {
+        let backend = MockInferenceBackend()
+        backend.isModelLoaded = true
+        backend.tokensToYield = ["hello"]
+        let service = InferenceService(backend: backend, name: "Mock", modelName: modelName)
+        return (service, backend)
+    }
+
+    // MARK: - residentModelStatus — loaded
+
+    func test_residentModelStatus_nonNil_whenModelLoaded() {
+        let (service, _) = makeService(modelName: "Llama-3")
+        XCTAssertNotNil(service.residentModelStatus,
+                        "residentModelStatus must be non-nil when a model is loaded")
+    }
+
+    func test_residentModelStatus_modelID_matchesModelName() {
+        let (service, _) = makeService(modelName: "Llama-3")
+        XCTAssertEqual(service.residentModelStatus?.modelID, "Llama-3",
+                       "modelID must match the name supplied at load time")
+    }
+
+    func test_residentModelStatus_backend_matchesBackendName() {
+        let (service, _) = makeService()
+        XCTAssertEqual(service.residentModelStatus?.backend, "Mock",
+                       "backend must match the name supplied at load time")
+    }
+
+    func test_residentModelStatus_loadedAt_isRecent() {
+        let before = Date()
+        let (service, _) = makeService()
+        let after = Date()
+        guard let loadedAt = service.residentModelStatus?.loadedAt else {
+            XCTFail("loadedAt must be non-nil when a model is loaded")
+            return
+        }
+        XCTAssertGreaterThanOrEqual(loadedAt, before,
+                                    "loadedAt must not precede service creation")
+        XCTAssertLessThanOrEqual(loadedAt, after,
+                                 "loadedAt must not be in the future")
+    }
+
+    func test_residentModelStatus_estimatedFootprintBytes_nilForDebugInit() {
+        // The #if DEBUG init path (used by InferenceService(backend:name:)) does
+        // not have access to a ModelLoadPlan so footprint is always nil there.
+        let (service, _) = makeService()
+        XCTAssertNil(service.residentModelStatus?.estimatedFootprintBytes,
+                     "estimatedFootprintBytes must be nil when loaded via the debug-init path (no plan)")
+    }
+
+    // MARK: - residentModelStatus — unloaded
+
+    func test_residentModelStatus_nil_whenNoModelLoaded() {
+        let service = InferenceService()
+        XCTAssertNil(service.residentModelStatus,
+                     "residentModelStatus must be nil when no model is loaded")
+    }
+
+    func test_residentModelStatus_nil_afterUnload() {
+        let (service, _) = makeService()
+        XCTAssertNotNil(service.residentModelStatus, "pre-condition: should be loaded")
+        service.unloadModel()
+        XCTAssertNil(service.residentModelStatus,
+                     "residentModelStatus must be nil after unloadModel()")
+    }
+
+    // MARK: - queuedRequestCount
+
+    func test_queuedRequestCount_zeroInitially() {
+        let (service, _) = makeService()
+        XCTAssertEqual(service.queuedRequestCount, 0,
+                       "queuedRequestCount must be 0 before any requests are enqueued")
+    }
+
+    func test_queuedRequestCount_zeroAfterGenerationCompletes() async throws {
+        let (service, _) = makeService()
+        let (_, stream) = try service.enqueue(
+            messages: [Message.user("hi")]
+        )
+        // Drain the stream to let the queue settle.
+        for try await _ in stream.events {}
+        XCTAssertEqual(service.queuedRequestCount, 0,
+                       "queuedRequestCount must return to 0 after a generation finishes")
+    }
+
+    // MARK: - lastActivityAt
+
+    func test_lastActivityAt_updatedAfterGenerationCompletes() async throws {
+        let (service, _) = makeService()
+        let before = Date()
+        let (_, stream) = try service.enqueue(
+            messages: [Message.user("hi")]
+        )
+        for try await _ in stream.events {}
+        let after = Date()
+        guard let status = service.residentModelStatus else {
+            XCTFail("residentModelStatus must be non-nil after generation")
+            return
+        }
+        XCTAssertGreaterThanOrEqual(status.lastActivityAt, before,
+                                    "lastActivityAt must be >= the moment before enqueue")
+        XCTAssertLessThanOrEqual(status.lastActivityAt, after,
+                                 "lastActivityAt must not be in the future")
+    }
+
+    func test_lastActivityAt_neverPrecedesLoadedAt() {
+        // Before any request the queue still holds its `.distantPast` sentinel;
+        // the snapshot must floor it at `loadedAt` so idle time stays sane.
+        let (service, _) = makeService()
+        guard let status = service.residentModelStatus else {
+            XCTFail("residentModelStatus must be non-nil when a model is loaded")
+            return
+        }
+        XCTAssertGreaterThanOrEqual(status.lastActivityAt, status.loadedAt,
+                                    "lastActivityAt must never precede loadedAt")
+    }
+
+    func test_idleDuration_isNonNegative() async throws {
+        let (service, _) = makeService()
+        let (_, stream) = try service.enqueue(messages: [Message.user("hi")])
+        for try await _ in stream.events {}
+        let status = service.residentModelStatus
+        XCTAssertNotNil(status)
+        XCTAssertGreaterThanOrEqual(status?.idleDuration ?? -1, 0,
+                                    "idleDuration must always be >= 0")
+    }
+}