diff --git a/.github/api-breakage-allowlist.txt b/.github/api-breakage-allowlist.txt
index ace26956..27539e78 100644
--- a/.github/api-breakage-allowlist.txt
+++ b/.github/api-breakage-allowlist.txt
@@ -103,3 +103,4 @@ API breakage: constructor ToolCallTransform.init(markers:) has been removed
 API breakage: struct InferenceMetric has been removed
 API breakage: protocol InferenceMetricSink has been removed
 API breakage: class InMemoryMetricSink has been removed
+API breakage: enumelement UnloadReason.idleTimeout has been added as a new enum case
diff --git a/Sources/ManifoldHardware/MemoryPressureEvent.swift b/Sources/ManifoldHardware/MemoryPressureEvent.swift
index ca0d99ab..6ede8eb4 100644
--- a/Sources/ManifoldHardware/MemoryPressureEvent.swift
+++ b/Sources/ManifoldHardware/MemoryPressureEvent.swift
@@ -10,6 +10,8 @@ public enum UnloadReason: Sendable, Equatable {
     case userRequested
     /// The model was evicted while the app was in the background.
     case backgroundEviction
+    /// The model exceeded the configured idle keep-alive TTL and was automatically unloaded.
+    case idleTimeout
 }
 
 // MARK: - MemoryPressureEvent
diff --git a/Sources/ManifoldInference/KeepAlivePolicy.swift b/Sources/ManifoldInference/KeepAlivePolicy.swift
new file mode 100644
index 00000000..daf87c3a
--- /dev/null
+++ b/Sources/ManifoldInference/KeepAlivePolicy.swift
@@ -0,0 +1,40 @@
+import Foundation
+
+/// Describes how long a resident model may sit idle before it is unloaded.
+///
+/// A policy whose ``idleTimeout`` is non-nil makes ``InferenceService`` arm a
+/// background watch task after every successful model load. That task polls
+/// ``GenerationQueue/idleDuration`` and, once the idle time reaches the
+/// configured threshold, calls ``InferenceService/unloadModel(reason:)`` with
+/// ``UnloadReason/idleTimeout``.
+///
+/// Generation activity resets the idle clock, so a busy model is never
+/// evicted mid-turn.
+///
+/// ```swift
+/// // Unload after 5 minutes of silence (Ollama-style keep-alive):
+/// inferenceService.keepAlivePolicy = KeepAlivePolicy(idleTimeout: 5 * 60)
+///
+/// // Disable auto-unload (the default):
+/// inferenceService.keepAlivePolicy = .never
+/// ```
+public struct KeepAlivePolicy: Sendable, Equatable {
+
+    /// The policy that never unloads automatically; this is the default.
+    public static let never = Self(idleTimeout: nil)
+
+    /// Idle time, in seconds, after which the model is unloaded automatically.
+    ///
+    /// `nil` turns auto-unload off. A value of `0` unloads the model at the
+    /// watch task's next poll; prefer a small positive value (≥ 1 second)
+    /// in practice.
+    public var idleTimeout: TimeInterval?
+
+    /// Creates a policy with the given idle timeout.
+    ///
+    /// - Parameter idleTimeout: Seconds of idle time after which the model
+    ///   is automatically unloaded. Pass `nil` to disable auto-unload.
+    public init(idleTimeout: TimeInterval?) {
+        self.idleTimeout = idleTimeout
+    }
+}
diff --git a/Sources/ManifoldInference/Services/GenerationQueue.swift b/Sources/ManifoldInference/Services/GenerationQueue.swift
index 005e8ddc..75f4ade6 100644
--- a/Sources/ManifoldInference/Services/GenerationQueue.swift
+++ b/Sources/ManifoldInference/Services/GenerationQueue.swift
@@ -202,6 +202,14 @@ final class GenerationQueue {
     /// meaningful even before the first request.
     private var lastActivityTimestamp: Date = .distantPast
 
+    /// Seconds elapsed since the most recent generation activity. Returns `.infinity`
+    /// when no generation has ever occurred so a freshly started service is treated
+    /// as maximally idle by the keep-alive policy.
+    var idleDuration: TimeInterval {
+        let neverActive = lastActivityTimestamp == .distantPast
+        return neverActive ? .infinity : Date.now.timeIntervalSince(lastActivityTimestamp)
+    }
+
     // MARK: - Computed
 
     var hasQueuedRequests: Bool { !requestQueue.isEmpty }
diff --git a/Sources/ManifoldInference/Services/InferenceService.swift b/Sources/ManifoldInference/Services/InferenceService.swift
index 9b0ec8c3..d4aedec0 100644
--- a/Sources/ManifoldInference/Services/InferenceService.swift
+++ b/Sources/ManifoldInference/Services/InferenceService.swift
@@ -138,6 +138,31 @@ public final class InferenceService {
         set { lifecycle.selectedPromptTemplate = newValue }
     }
 
+    // MARK: - Keep-Alive Policy
+
+    /// Governs automatic unloading of the resident model once it goes idle.
+    ///
+    /// Assigning a ``KeepAlivePolicy`` whose `idleTimeout` is non-nil turns on
+    /// Ollama-style keep-alive: once the model has been idle for the configured
+    /// period it is unloaded automatically, emitting
+    /// ``MemoryPressureEvent/willUnload(modelID:reason:)`` and
+    /// ``MemoryPressureEvent/didUnload(modelID:reason:)`` with reason
+    /// ``UnloadReason/idleTimeout``. Any generation activity resets the clock.
+    ///
+    /// The default is ``KeepAlivePolicy/never``, which disables auto-unload.
+    ///
+    /// ```swift
+    /// // Unload after 5 minutes of silence:
+    /// inferenceService.keepAlivePolicy = KeepAlivePolicy(idleTimeout: 5 * 60)
+    ///
+    /// // Disable auto-unload:
+    /// inferenceService.keepAlivePolicy = .never
+    /// ```
+    public var keepAlivePolicy: KeepAlivePolicy {
+        get { lifecycle.keepAlivePolicy }
+        set(policy) { lifecycle.keepAlivePolicy = policy }
+    }
+
     // MARK: - Computed
 
     public var capabilities: BackendCapabilities? { lifecycle.capabilities }
@@ -998,12 +1023,27 @@ extension InferenceService {
     /// Replaces the former ``GenerationContextProvider`` protocol conformance.
     /// Each closure reads through `lifecycle` so the queue always sees the
     /// current backend / template state — never a cached snapshot.
+    ///
+    /// The keep-alive seams are wired here as well: the ``ModelLifecycleCoordinator``
+    /// gets closures for requesting an idle unload and for reading the generation
+    /// queue's idle duration, each capturing the service weakly.
     fileprivate func wireGenerationContext() {
         generation.bindContext(
             currentBackend: { [weak self] in self?.lifecycle.backend },
             isBackendLoaded: { [weak self] in self?.lifecycle.isModelLoaded ?? false },
             selectedPromptTemplate: { [weak self] in self?.lifecycle.selectedPromptTemplate ?? .chatML }
         )
+        // Keep-alive seams: hand the coordinator closures instead of object
+        // references so lifecycle ↔ service never form a retain cycle. They are
+        // assigned once here; each call reads live state through `self`.
+        lifecycle.unloadRequestHandler = { [weak self] reason in
+            guard let self else { return }
+            self.unloadModel(reason: reason)
+        }
+        lifecycle.idleDurationProvider = { [weak self] in
+            guard let self else { return .infinity }
+            return self.generation.idleDuration
+        }
     }
 }
 
diff --git a/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift b/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift
index 29341430..46c5ad92 100644
--- a/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift
+++ b/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift
@@ -47,6 +47,39 @@ final class ModelLifecycleCoordinator {
     /// `didSet` so tests and custom gates can swap it before each load.
     var denyPolicy: LoadDenyPolicy = .platformDefault
 
+    // MARK: - Keep-Alive Policy
+
+    /// Policy that controls automatic idle unloading. Defaults to `.never` (disabled).
+    ///
+    /// When set to a non-nil `idleTimeout`, an idle watch task is armed after each
+    /// successful model load. The task polls the generation queue's idle duration
+    /// and calls the facade's `unloadModel(reason:)` with `.idleTimeout` when the
+    /// model has been idle longer than the threshold.
+    ///
+    /// Written through the ``InferenceService`` facade's `keepAlivePolicy` setter so
+    /// callers interact only with the public-facing property.
+    var keepAlivePolicy: KeepAlivePolicy = .never {
+        didSet { applyKeepAlivePolicy() }
+    }
+
+    /// Closure through which the idle watch task requests an unload on the facade.
+    ///
+    /// Injected by ``InferenceService`` after init (the coordinator cannot hold a
+    /// strong reference to the service directly — that would create a retain cycle
+    /// since the service owns the coordinator). The closure is `@Sendable` so the
+    /// watch `Task { }` can capture it safely under Swift 6 strict concurrency.
+    var unloadRequestHandler: (@MainActor @Sendable (UnloadReason) -> Void)?
+
+    /// Closure that returns the current generation queue idle duration.
+    ///
+    /// Injected by ``InferenceService`` after init so the coordinator can poll
+    /// idle time without holding a direct reference to the generation queue.
+    var idleDurationProvider: (@MainActor @Sendable () -> TimeInterval)?
+
+    /// The running idle watch task, if any. Cancelled when a model unloads or
+    /// when `keepAlivePolicy` changes to `.never`.
+    private var idleWatchTask: Task<Void, Never>?
+
     // MARK: - Prompt Template
 
     var selectedPromptTemplate: PromptTemplate = .chatML
@@ -383,6 +416,7 @@ final class ModelLifecycleCoordinator {
     /// Does NOT stop generation — that is the facade's responsibility.
     /// The facade calls `stopGeneration()` before delegating here.
     func unloadModel() {
+        cancelIdleWatchTask()
         invalidateOutstandingLoads()
         backend?.unloadModel()
         backend = nil
@@ -564,6 +598,7 @@ final class ModelLifecycleCoordinator {
         residentFootprintBytes = footprintBytes
         loadPhase = .loaded(request: request)
         logLoadEvent("load.commit", request: request, clearMetadata: true)
+        armIdleWatchTaskIfNeeded()
         return true
     }
 
@@ -632,4 +667,74 @@ final class ModelLifecycleCoordinator {
             return "Apple Foundation Models require iOS 26 / macOS 26 or later."
         }
     }
+
+    // MARK: - Keep-Alive Idle Watch (Private)
+
+    /// Re-evaluates the keep-alive policy whenever it changes.
+    ///
+    /// Called via `keepAlivePolicy.didSet`. If a model is already loaded and
+    /// the new policy has a non-nil timeout, re-arms the watch task. If the
+    /// new policy is `.never`, the current watch task (if any) is cancelled.
+    private func applyKeepAlivePolicy() {
+        guard isModelLoaded else { return }
+        if keepAlivePolicy.idleTimeout != nil {
+            armIdleWatchTaskIfNeeded()
+        } else {
+            cancelIdleWatchTask()
+        }
+    }
+
+    /// Arms the idle watch task when a policy with a non-nil timeout is active.
+    ///
+    /// Any previously running watch task is cancelled first so rearming after a
+    /// new model load or policy change is always clean. Arming also starts a
+    /// fresh idle window: the model is never unloaded sooner than one full
+    /// timeout after this call, even if the generation queue reports a longer
+    /// (or infinite) idle duration.
+    private func armIdleWatchTaskIfNeeded() {
+        guard let timeout = keepAlivePolicy.idleTimeout else { return }
+        cancelIdleWatchTask()
+
+        // Poll interval: a quarter of the timeout window, clamped to the range
+        // 0.1 s ... 10 s. The floor keeps tiny timeouts from busy-polling; the
+        // ceiling bounds how late a long timeout can be noticed. Short
+        // timeouts (e.g. 0.5 s in tests) still fire promptly.
+        let pollInterval = min(max(timeout / 4, 0.1), 10.0)
+
+        // The idle provider reports `.infinity` until the first generation, so
+        // on its own it would evict a just-loaded model at the very first poll.
+        // Capping the idle time by the time elapsed since arming guarantees
+        // the model a full timeout window before it can be unloaded.
+        let armedAt = Date.now
+
+        idleWatchTask = Task { [weak self] in
+            while !Task.isCancelled {
+                do {
+                    try await Task.sleep(for: .seconds(pollInterval))
+                } catch {
+                    // Sleep was cancelled — exit cleanly.
+                    return
+                }
+                guard !Task.isCancelled else { return }
+                guard let self else { return }
+
+                // Check idle duration against the current policy (the policy
+                // may have been updated since the task was armed).
+                guard let currentTimeout = self.keepAlivePolicy.idleTimeout else { return }
+                let queueIdle = self.idleDurationProvider?() ?? TimeInterval.infinity
+                let idle = min(queueIdle, Date.now.timeIntervalSince(armedAt))
+                if idle >= currentTimeout {
+                    Log.inference.info("KeepAlivePolicy: idle \(idle, privacy: .public)s >= timeout \(currentTimeout, privacy: .public)s — requesting auto-unload")
+                    self.unloadRequestHandler?(.idleTimeout)
+                    return
+                }
+            }
+        }
+    }
+
+    /// Cancels and nils the idle watch task.
+    private func cancelIdleWatchTask() {
+        idleWatchTask?.cancel()
+        idleWatchTask = nil
+    }
 }
diff --git a/Tests/ManifoldInferenceTests/KeepAlivePolicyTests.swift b/Tests/ManifoldInferenceTests/KeepAlivePolicyTests.swift
new file mode 100644
index 00000000..74fc2850
--- /dev/null
+++ b/Tests/ManifoldInferenceTests/KeepAlivePolicyTests.swift
@@ -0,0 +1,182 @@
+import XCTest
+@testable import ManifoldInference
+import ManifoldTestSupport
+
+/// Tests for ``KeepAlivePolicy`` and its idle auto-unload integration with
+/// ``InferenceService``.
+///
+/// Timer-based tests use short idle timeouts (0.3 – 0.5 s) and poll
+/// `isModelLoaded` against 2-second deadlines so they run fast on CI.
+@MainActor
+final class KeepAlivePolicyTests: XCTestCase {
+
+    // MARK: - Helpers
+
+    private func makeService() -> InferenceService {
+        let backend = MockInferenceBackend()
+        backend.isModelLoaded = true
+        return InferenceService(backend: backend)
+    }
+
+    // MARK: - KeepAlivePolicy value semantics
+
+    func test_keepAlivePolicy_defaultIsNever() {
+        let service = makeService()
+        XCTAssertNil(service.keepAlivePolicy.idleTimeout)
+        XCTAssertEqual(service.keepAlivePolicy, .never)
+    }
+
+    func test_keepAlivePolicy_equatable() {
+        XCTAssertEqual(KeepAlivePolicy.never, KeepAlivePolicy(idleTimeout: nil))
+        XCTAssertNotEqual(KeepAlivePolicy(idleTimeout: 30), KeepAlivePolicy(idleTimeout: 60))
+        XCTAssertNotEqual(KeepAlivePolicy.never, KeepAlivePolicy(idleTimeout: 30))
+    }
+
+    // MARK: - Default .never does NOT auto-unload
+
+    func test_never_policy_doesNotUnload() async throws {
+        let service = makeService()
+        // .never is the default; model is loaded.
+        XCTAssertTrue(service.isModelLoaded)
+
+        // Wait briefly — no unload should occur.
+        try await Task.sleep(for: .milliseconds(300))
+        XCTAssertTrue(
+            service.isModelLoaded,
+            "Model should remain loaded when keepAlivePolicy is .never"
+        )
+    }
+
+    // MARK: - Policy fires after idle timeout
+
+    func test_idleTimeout_unloadsAfterIdle() async throws {
+        let service = makeService()
+        XCTAssertTrue(service.isModelLoaded)
+
+        // Record any unload events so we can verify the reason.
+        var unloadReasons: [UnloadReason] = []
+        let stream = service.memoryPressureEvents()
+        let eventTask = Task { @MainActor in
+            for await event in stream {
+                if case .didUnload(_, let reason) = event {
+                    unloadReasons.append(reason)
+                }
+            }
+        }
+        defer { eventTask.cancel() }
+
+        // Set a 0.3-second idle timeout — short enough that the watch task fires
+        // before our 2-second test deadline, long enough to not be flaky.
+        service.keepAlivePolicy = KeepAlivePolicy(idleTimeout: 0.3)
+
+        // Wait up to 2 seconds for the model to be auto-unloaded.
+        let deadline = Date.now.addingTimeInterval(2.0)
+        while service.isModelLoaded && Date.now < deadline {
+            try await Task.sleep(for: .milliseconds(50))
+        }
+
+        XCTAssertFalse(service.isModelLoaded, "Model should have been auto-unloaded after idle timeout")
+
+        // Allow the event stream task a tick to collect the event.
+        try await Task.sleep(for: .milliseconds(50))
+        XCTAssertTrue(
+            unloadReasons.contains(.idleTimeout),
+            "MemoryPressureEvent should carry UnloadReason.idleTimeout; got \(unloadReasons)"
+        )
+    }
+
+    // MARK: - Activity resets the idle clock
+
+    func test_activity_resetsIdleClock() async throws {
+        let backend = MockInferenceBackend()
+        backend.isModelLoaded = true
+        // Provide tokens so the generation stream has something to consume.
+        backend.tokensToYield = ["hello"]
+        let service = InferenceService(backend: backend)
+
+        // Set a 0.4-second idle timeout.
+        service.keepAlivePolicy = KeepAlivePolicy(idleTimeout: 0.4)
+
+        // Enqueue and fully consume a generation ~0.1 s after setting the policy,
+        // which should reset the idle clock to now.
+        // NOTE(review): the watch task's first poll (0.1 s for this timeout) lands
+        // at about the same time as this sleep ends, and the queue reports infinite
+        // idle time before the first generation — possible race; confirm.
+        try await Task.sleep(for: .milliseconds(100))
+        let (_, genStream) = try service.enqueue(
+            messages: [.user("hi")],
+            config: GenerationConfig()
+        )
+        for try await _ in genStream.events {}
+
+        // Shortly after the generation completes the model should still be loaded
+        // (idle clock was just reset).
+        try await Task.sleep(for: .milliseconds(100))
+        XCTAssertTrue(
+            service.isModelLoaded,
+            "Model should still be loaded — activity reset the idle clock"
+        )
+
+        // But eventually (well after 0.4 s of silence post-generation) it should
+        // auto-unload.
+        let deadline = Date.now.addingTimeInterval(2.0)
+        while service.isModelLoaded && Date.now < deadline {
+            try await Task.sleep(for: .milliseconds(50))
+        }
+        XCTAssertFalse(
+            service.isModelLoaded,
+            "Model should eventually auto-unload after activity silence"
+        )
+    }
+
+    // MARK: - Policy disabled at runtime before timeout fires
+
+    func test_policyDisabledBeforeTimeout_keepsModelLoaded() async throws {
+        let service = makeService()
+        XCTAssertTrue(service.isModelLoaded)
+
+        // Arm a 0.5-second timeout.
+        service.keepAlivePolicy = KeepAlivePolicy(idleTimeout: 0.5)
+
+        // Immediately disable it — the watch task should be cancelled.
+        service.keepAlivePolicy = .never
+
+        // Wait beyond what the timeout would have been.
+        try await Task.sleep(for: .milliseconds(700))
+
+        XCTAssertTrue(
+            service.isModelLoaded,
+            "Model should remain loaded after the policy was reverted to .never"
+        )
+    }
+
+    // MARK: - Explicit unload cancels the watch task (no double-unload)
+
+    func test_explicitUnload_doesNotTriggerSecondUnload() async throws {
+        let service = makeService()
+        XCTAssertTrue(service.isModelLoaded)
+
+        var didUnloadCount = 0
+        let stream = service.memoryPressureEvents()
+        let eventTask = Task { @MainActor in
+            for await event in stream {
+                if case .didUnload = event { didUnloadCount += 1 }
+            }
+        }
+        defer { eventTask.cancel() }
+
+        service.keepAlivePolicy = KeepAlivePolicy(idleTimeout: 0.5)
+
+        // Explicitly unload before the timer fires.
+        service.unloadModel()
+        XCTAssertFalse(service.isModelLoaded)
+
+        // Wait beyond the timeout window.
+        try await Task.sleep(for: .milliseconds(700))
+
+        // Allow the event stream task a tick to collect events.
+        try await Task.sleep(for: .milliseconds(50))
+
+        XCTAssertEqual(didUnloadCount, 1, "Only one unload event should fire — the explicit one")
+    }
+}