Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions Sources/ManifoldInference/ResidentModelStatus.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import Foundation

/// A point-in-time snapshot of the currently resident model's identity and
/// runtime characteristics.
///
/// Obtained via ``InferenceService/residentModelStatus``, which returns `nil`
/// when no model is loaded. Useful for dashboards, telemetry, and idle-eviction
/// policies that need to know which model is loaded, how long it has been idle,
/// and how much memory it is estimated to occupy.
///
/// The stored fields are frozen at the moment the snapshot is taken; only
/// ``idleDuration`` is re-evaluated against the current clock on each access.
///
/// ```swift
/// if let status = inferenceService.residentModelStatus {
///     print("Model: \(status.modelID) via \(status.backend)")
///     print("Idle for \(status.idleDuration.formatted()) seconds")
/// }
/// ```
public struct ResidentModelStatus: Sendable {

    /// The human-readable model identifier (e.g. `ModelInfo.name`).
    public let modelID: String

    /// The backend engine label (e.g. `"Mock"`, `"llama"`, `"ollama"`).
    public let backend: String

    /// Best-effort selection-time footprint estimate in bytes.
    ///
    /// Sourced from `ModelLoadPlan.outcome.totalEstimatedBytes` for local
    /// (on-disk) loads. `nil` for cloud / system-managed endpoints where no
    /// local memory estimate is computed.
    public let estimatedFootprintBytes: UInt64?

    /// The moment the model transitioned to `isModelLoaded == true`.
    public let loadedAt: Date

    /// The timestamp of the most recent queue activity: enqueue, dequeue-to-active,
    /// or request completion.
    ///
    /// This is the raw queue timestamp. It can be earlier than ``loadedAt``
    /// (including `Date.distantPast`) when the resident model has not served
    /// any request since it was loaded.
    public let lastActivityAt: Date

    /// Seconds the resident model has been idle. Never negative.
    ///
    /// Measured from the later of ``lastActivityAt`` and ``loadedAt``: a model
    /// cannot have been idle for longer than it has been resident, so a freshly
    /// loaded model that has not yet served a request reports the time since
    /// load rather than the time since a stale (or `distantPast`) queue
    /// timestamp.
    ///
    /// Computed on access so the value reflects wall time without requiring a
    /// periodic refresh of the snapshot itself.
    public var idleDuration: TimeInterval {
        let idleSince = max(lastActivityAt, loadedAt)
        // `Date()` is wall-clock time and may step backwards (e.g. clock
        // adjustment); clamp so callers never observe a negative duration.
        return max(0, Date().timeIntervalSince(idleSince))
    }
}
15 changes: 15 additions & 0 deletions Sources/ManifoldInference/Services/GenerationQueue.swift
Original file line number Diff line number Diff line change
Expand Up @@ -197,10 +197,22 @@ final class GenerationQueue {
/// Stream continuations keyed by the token of the request they belong to.
private var continuations: [GenerationRequestToken: AsyncThrowingStream<GenerationEvent, Error>.Continuation] = [:]
// NOTE(review): presumably the upper bound on pending requests; the check that
// enforces it is not visible in this hunk — confirm at the enqueue site.
private let maxQueueDepth = 8

/// Timestamp of the most recent queue activity: enqueue, dequeue-to-active, or
/// completion.
///
/// Initialized to `.distantPast` rather than being optional, so readers always
/// get a concrete `Date`. Until the first request is enqueued, any elapsed-time
/// value derived from it will therefore be extremely large.
private var lastActivityTimestamp: Date = .distantPast

// MARK: - Computed

/// `true` while at least one request is waiting in `requestQueue`.
var hasQueuedRequests: Bool {
    requestQueue.isEmpty == false
}

/// How many requests are waiting in the queue right now. The request that is
/// actively generating is not part of this count. Surfaced publicly through
/// `InferenceService`.
var queuedRequestCount: Int {
    return requestQueue.count
}

/// Read-only view of `lastActivityTimestamp`, the time of the most recent
/// queue activity.
var lastActivityAt: Date {
    return lastActivityTimestamp
}

/// Token counts reported by the current backend's `lastUsage`, or `nil` when
/// the current backend does not conform to `TokenUsageProvider`.
var lastTokenUsage: (promptTokens: Int, completionTokens: Int)? {
    guard let usageSource = currentBackend as? TokenUsageProvider else {
        return nil
    }
    return usageSource.lastUsage
}
Expand Down Expand Up @@ -601,6 +613,7 @@ final class GenerationQueue {
requestQueue.append(request)
}

lastActivityTimestamp = Date()
drainQueue()
return (token: token, stream: stream)
}
Expand Down Expand Up @@ -769,6 +782,7 @@ final class GenerationQueue {

activeRequest = next
isGenerating = true
lastActivityTimestamp = Date()
next.stream.setPhase(.connecting)

activeTask = Task { [weak self] in
Expand All @@ -795,6 +809,7 @@ final class GenerationQueue {
}
}
if self.activeRequest?.token == next.token {
self.lastActivityTimestamp = Date()
self.activeRequest = nil
self.activeTask = nil
self.isGenerating = false
Expand Down
25 changes: 25 additions & 0 deletions Sources/ManifoldInference/Services/InferenceService.swift
Original file line number Diff line number Diff line change
Expand Up @@ -688,6 +688,31 @@ public final class InferenceService {
return generation.hasQueuedRequests
}

/// How many requests are waiting in the generation queue. The request that is
/// currently being generated is not counted, so this is zero when the queue is
/// idle.
public var queuedRequestCount: Int {
    // Same preamble as the neighbouring queue accessors: wire the provider
    // before reading queue state.
    ensureProviderWired()
    let pendingCount = generation.queuedRequestCount
    return pendingCount
}

/// Snapshot of the resident model's identity and runtime characteristics.
///
/// Evaluates to `nil` unless the lifecycle coordinator reports a loaded model
/// and can supply its model name, backend name, and load timestamp.
///
/// See ``ResidentModelStatus`` for field documentation.
public var residentModelStatus: ResidentModelStatus? {
    guard lifecycle.isModelLoaded else { return nil }
    guard
        let residentName = lifecycle.activeModelName,
        let engineLabel = lifecycle.activeBackendName,
        let loadTimestamp = lifecycle.loadedAt
    else {
        return nil
    }

    // The footprint is passed through as-is; it may legitimately be nil.
    let snapshot = ResidentModelStatus(
        modelID: residentName,
        backend: engineLabel,
        estimatedFootprintBytes: lifecycle.residentFootprintBytes,
        loadedAt: loadTimestamp,
        lastActivityAt: generation.lastActivityAt
    )
    return snapshot
}

/// Resets the current conversation by forwarding to the lifecycle
/// coordinator's `resetConversation()`.
public func resetConversation() {
lifecycle.resetConversation()
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,16 @@ final class ModelLifecycleCoordinator {
/// Name of the model recorded when the current load was committed; `nil` when
/// no model is resident.
private(set) var activeModelName: String?
// NOTE(review): presumably the progress of an in-flight load, `nil` when no
// load is running — the writers are not visible in this hunk; confirm.
private(set) var modelLoadProgress: Double?

/// Timestamp of the moment `isModelLoaded` transitioned to `true` for the
/// current resident model. Cleared to `nil` on unload.
private(set) var loadedAt: Date?

/// Best-effort selection-time footprint estimate for the resident model, in
/// bytes. Sourced from `ModelLoadPlan.outcome.totalEstimatedBytes` when a
/// plan-based load is used. `nil` for cloud endpoints or when no plan was
/// computed (e.g. the `#if DEBUG` test init path). Cleared to `nil` alongside
/// `loadedAt`.
private(set) var residentFootprintBytes: UInt64?

/// Identity of the ``APIEndpointRecord`` backing the active endpoint
/// backend, or `nil` for on-disk model loads (which have no endpoint
/// record). Threaded through the load commit so usage accounting can
Expand Down Expand Up @@ -98,6 +108,7 @@ final class ModelLifecycleCoordinator {
self.isModelLoaded = true
self.activeBackendName = name
self.activeModelName = modelName
self.loadedAt = Date()
let request = LoadRequestToken(rawValue: 1)
self.nextLoadRequestToken = request
self.latestRequestedLoadToken = request
Expand Down Expand Up @@ -238,12 +249,20 @@ final class ModelLifecycleCoordinator {
let url = modelInfo.url
let mmprojURL = modelInfo.mmprojURL
let dispatchPlan = effectivePlan
// Capture the plan's footprint estimate so `commitLoadIfCurrent` can
// store it as `residentFootprintBytes`. Only non-zero estimates are
// meaningful; zero is the plan's unset default for cloud/system-managed
// backends and is stored as `nil` to signal "unknown".
let footprint: UInt64? = plan.outcome.totalEstimatedBytes > 0
? plan.outcome.totalEstimatedBytes
: nil
try await runLoad(
source: "local",
target: modelTypeLogLabel(modelInfo.modelType),
backendName: backendName,
backend: newBackend,
modelName: modelInfo.name
modelName: modelInfo.name,
footprintBytes: footprint
) {
(newBackend as? MultimodalProjectorConfigurable)?.setMmprojURL(mmprojURL)
try await newBackend.loadModel(from: url, plan: dispatchPlan)
Expand All @@ -262,6 +281,7 @@ final class ModelLifecycleCoordinator {
backend newBackend: any InferenceBackend,
modelName: String,
endpointID: UUID? = nil,
footprintBytes: UInt64? = nil,
loadOperation: @escaping @Sendable () async throws -> Void
) async throws {
let request = beginLoadRequest(
Expand Down Expand Up @@ -290,7 +310,8 @@ final class ModelLifecycleCoordinator {
backend: newBackend,
backendName: backendName,
modelName: modelName,
endpointID: endpointID
endpointID: endpointID,
footprintBytes: footprintBytes
) else {
newBackend.unloadModel()
logLoadEvent("load.suppress", request: request, reason: "stale-success", clearMetadata: true)
Expand Down Expand Up @@ -369,6 +390,8 @@ final class ModelLifecycleCoordinator {
activeBackendName = nil
activeModelName = nil
activeEndpointID = nil
loadedAt = nil
residentFootprintBytes = nil
}

// MARK: - Capability Queries
Expand Down Expand Up @@ -527,7 +550,8 @@ final class ModelLifecycleCoordinator {
backend newBackend: any InferenceBackend,
backendName: String,
modelName: String,
endpointID: UUID? = nil
endpointID: UUID? = nil,
footprintBytes: UInt64? = nil
) -> Bool {
guard canCommitLoad(request) else { return false }
backend = newBackend
Expand All @@ -536,6 +560,8 @@ final class ModelLifecycleCoordinator {
activeBackendName = backendName
activeModelName = modelName
activeEndpointID = endpointID
loadedAt = Date()
residentFootprintBytes = footprintBytes
loadPhase = .loaded(request: request)
logLoadEvent("load.commit", request: request, clearMetadata: true)
return true
Expand Down
128 changes: 128 additions & 0 deletions Tests/ManifoldInferenceTests/ResidentModelStatusTests.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import XCTest
@testable import ManifoldInference
import ManifoldTestSupport

/// Tests for ``ResidentModelStatus`` and ``InferenceService/queuedRequestCount``.
///
/// Uses XCTestCase (not Swift Testing) per the two-runner constraint (#681):
/// mixing Swift Testing and XCTest in one process causes libmalloc SIGABRT.
///
/// Optional values that a test needs in order to continue are obtained with
/// `XCTUnwrap`, so a missing status fails the test once, with a clear message,
/// instead of falling through to a misleading secondary assertion.
@MainActor
final class ResidentModelStatusTests: XCTestCase {

    // MARK: - Helpers

    /// Builds an `InferenceService` around a mock backend that is already
    /// marked as loaded and yields a single token.
    ///
    /// - Parameter modelName: Model name handed to the service initializer.
    /// - Returns: The service together with the mock backend it wraps.
    private func makeService(modelName: String = "TestModel") -> (InferenceService, MockInferenceBackend) {
        let backend = MockInferenceBackend()
        backend.isModelLoaded = true
        backend.tokensToYield = ["hello"]
        let service = InferenceService(backend: backend, name: "Mock", modelName: modelName)
        return (service, backend)
    }

    // MARK: - residentModelStatus — loaded

    func test_residentModelStatus_nonNil_whenModelLoaded() {
        let (service, _) = makeService(modelName: "Llama-3")
        XCTAssertNotNil(service.residentModelStatus,
                        "residentModelStatus must be non-nil when a model is loaded")
    }

    func test_residentModelStatus_modelID_matchesModelName() {
        let (service, _) = makeService(modelName: "Llama-3")
        XCTAssertEqual(service.residentModelStatus?.modelID, "Llama-3",
                       "modelID must match the name supplied at load time")
    }

    func test_residentModelStatus_backend_matchesBackendName() {
        let (service, _) = makeService()
        XCTAssertEqual(service.residentModelStatus?.backend, "Mock",
                       "backend must match the name supplied at load time")
    }

    func test_residentModelStatus_loadedAt_isRecent() throws {
        // Bracket service creation with two timestamps; loadedAt must fall between them.
        let before = Date()
        let (service, _) = makeService()
        let after = Date()
        let loadedAt = try XCTUnwrap(service.residentModelStatus?.loadedAt,
                                     "loadedAt must be non-nil when a model is loaded")
        XCTAssertGreaterThanOrEqual(loadedAt, before,
                                    "loadedAt must not precede service creation")
        XCTAssertLessThanOrEqual(loadedAt, after,
                                 "loadedAt must not be in the future")
    }

    func test_residentModelStatus_estimatedFootprintBytes_nilForDebugInit() {
        // The #if DEBUG init path (used by InferenceService(backend:name:)) does
        // not have access to a ModelLoadPlan so footprint is always nil there.
        let (service, _) = makeService()
        XCTAssertNil(service.residentModelStatus?.estimatedFootprintBytes,
                     "estimatedFootprintBytes must be nil when loaded via the debug-init path (no plan)")
    }

    // MARK: - residentModelStatus — unloaded

    func test_residentModelStatus_nil_whenNoModelLoaded() {
        let service = InferenceService()
        XCTAssertNil(service.residentModelStatus,
                     "residentModelStatus must be nil when no model is loaded")
    }

    func test_residentModelStatus_nil_afterUnload() {
        let (service, _) = makeService()
        XCTAssertNotNil(service.residentModelStatus, "pre-condition: should be loaded")
        service.unloadModel()
        XCTAssertNil(service.residentModelStatus,
                     "residentModelStatus must be nil after unloadModel()")
    }

    // MARK: - queuedRequestCount

    func test_queuedRequestCount_zeroInitially() {
        let (service, _) = makeService()
        XCTAssertEqual(service.queuedRequestCount, 0,
                       "queuedRequestCount must be 0 before any requests are enqueued")
    }

    func test_queuedRequestCount_zeroAfterGenerationCompletes() async throws {
        let (service, _) = makeService()
        let (_, stream) = try service.enqueue(
            messages: [Message.user("hi")]
        )
        // Drain the stream to let the queue settle.
        for try await _ in stream.events {}
        XCTAssertEqual(service.queuedRequestCount, 0,
                       "queuedRequestCount must return to 0 after a generation finishes")
    }

    // MARK: - lastActivityAt

    func test_lastActivityAt_updatedAfterGenerationCompletes() async throws {
        let (service, _) = makeService()
        let before = Date()
        let (_, stream) = try service.enqueue(
            messages: [Message.user("hi")]
        )
        for try await _ in stream.events {}
        let after = Date()
        let status = try XCTUnwrap(service.residentModelStatus,
                                   "residentModelStatus must be non-nil after generation")
        XCTAssertGreaterThanOrEqual(status.lastActivityAt, before,
                                    "lastActivityAt must be >= the moment before enqueue")
        XCTAssertLessThanOrEqual(status.lastActivityAt, after,
                                 "lastActivityAt must not be in the future")
    }

    func test_idleDuration_isNonNegative() async throws {
        let (service, _) = makeService()
        let (_, stream) = try service.enqueue(messages: [Message.user("hi")])
        for try await _ in stream.events {}
        let status = try XCTUnwrap(service.residentModelStatus,
                                   "residentModelStatus must be non-nil after generation")
        XCTAssertGreaterThanOrEqual(status.idleDuration, 0,
                                    "idleDuration must always be >= 0")
    }
}