Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/api-breakage-allowlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,4 @@ API breakage: constructor ToolCallTransform.init(markers:) has been removed
API breakage: struct InferenceMetric has been removed
API breakage: protocol InferenceMetricSink has been removed
API breakage: class InMemoryMetricSink has been removed
API breakage: enumelement UnloadReason.idleTimeout has been added as a new enum case
2 changes: 2 additions & 0 deletions Sources/ManifoldHardware/MemoryPressureEvent.swift
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ public enum UnloadReason: Sendable, Equatable {
case userRequested
/// The model was evicted while the app was in the background.
case backgroundEviction
/// The model exceeded the configured idle keep-alive TTL and was automatically unloaded.
case idleTimeout
}

// MARK: - MemoryPressureEvent
Expand Down
40 changes: 40 additions & 0 deletions Sources/ManifoldInference/KeepAlivePolicy.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import Foundation

/// Describes how long a loaded model may sit idle before it is unloaded automatically.
///
/// With a non-nil ``idleTimeout``, ``InferenceService`` arms a background
/// watch task after each successful model load. The task periodically compares
/// the generation queue's ``GenerationQueue/idleDuration`` with the timeout and
/// calls ``InferenceService/unloadModel(reason:)`` with
/// ``UnloadReason/idleTimeout`` once the threshold has been reached.
///
/// ```swift
/// // Keep the model for five minutes after the last activity (Ollama-style):
/// inferenceService.keepAlivePolicy = KeepAlivePolicy(idleTimeout: 5 * 60)
///
/// // Never unload automatically (the default):
/// inferenceService.keepAlivePolicy = .never
/// ```
public struct KeepAlivePolicy: Sendable, Equatable {

    /// The policy that disables automatic unloading; this is the default.
    public static let never = Self(idleTimeout: nil)

    /// Idle time, in seconds, tolerated before the model is unloaded.
    ///
    /// `nil` turns auto-unload off. A value of `0` makes the model eligible
    /// for unloading at the very next idle check, so prefer a small positive
    /// value (≥ 1 second) in practice.
    public var idleTimeout: TimeInterval?

    /// Creates a policy with the given idle timeout.
    ///
    /// - Parameter idleTimeout: Seconds of idle time after which the model
    ///   is automatically unloaded, or `nil` to disable auto-unload.
    public init(idleTimeout: TimeInterval?) {
        self.idleTimeout = idleTimeout
    }
}
8 changes: 8 additions & 0 deletions Sources/ManifoldInference/Services/GenerationQueue.swift
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,14 @@ final class GenerationQueue {
/// meaningful even before the first request.
private var lastActivityTimestamp: Date = .distantPast

/// Time, in seconds, since generation activity was last recorded.
///
/// While `lastActivityTimestamp` still holds its `.distantPast` sentinel
/// (no generation has happened yet) the result is `.infinity`, so the
/// keep-alive policy sees a freshly started service as maximally idle.
var idleDuration: TimeInterval {
    if lastActivityTimestamp == .distantPast {
        return .infinity
    }
    return Date().timeIntervalSince(lastActivityTimestamp)
}

// MARK: - Computed

var hasQueuedRequests: Bool { !requestQueue.isEmpty }
Expand Down
38 changes: 38 additions & 0 deletions Sources/ManifoldInference/Services/InferenceService.swift
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,31 @@ public final class InferenceService {
set { lifecycle.selectedPromptTemplate = newValue }
}

// MARK: - Keep-Alive Policy

/// The keep-alive policy governing automatic unloading of the resident model.
///
/// Assign a ``KeepAlivePolicy`` whose `idleTimeout` is non-nil to get
/// Ollama-style keep-alive behaviour: once the model has been idle for the
/// configured period it is unloaded automatically, emitting
/// ``MemoryPressureEvent/willUnload(modelID:reason:)`` and
/// ``MemoryPressureEvent/didUnload(modelID:reason:)`` with reason
/// ``UnloadReason/idleTimeout``. Generation activity resets the idle clock.
///
/// The value is stored on the lifecycle coordinator; this property only
/// forwards reads and writes to it. Defaults to ``KeepAlivePolicy/never``.
///
/// ```swift
/// // Unload after 5 minutes of silence:
/// inferenceService.keepAlivePolicy = KeepAlivePolicy(idleTimeout: 5 * 60)
///
/// // Disable auto-unload:
/// inferenceService.keepAlivePolicy = .never
/// ```
public var keepAlivePolicy: KeepAlivePolicy {
    get {
        return lifecycle.keepAlivePolicy
    }
    set(policy) {
        lifecycle.keepAlivePolicy = policy
    }
}

// MARK: - Computed

public var capabilities: BackendCapabilities? { lifecycle.capabilities }
Expand Down Expand Up @@ -998,12 +1023,25 @@ extension InferenceService {
/// Replaces the former ``GenerationContextProvider`` protocol conformance.
/// Each closure reads through `lifecycle` so the queue always sees the
/// current backend / template state — never a cached snapshot.
///
/// Also wires the keep-alive policy closures so the ``ModelLifecycleCoordinator``
/// can request idle unloads and read the generation queue's idle duration without
/// holding strong references that would create retain cycles.
fileprivate func wireGenerationContext() {
    // Every closure captures the service weakly and reads live state from
    // `lifecycle`, falling back to a neutral value once the service is gone.
    generation.bindContext(
        currentBackend: { [weak self] in self?.lifecycle.backend },
        isBackendLoaded: { [weak self] in
            guard let self else { return false }
            return self.lifecycle.isModelLoaded
        },
        selectedPromptTemplate: { [weak self] in
            guard let self else { return .chatML }
            return self.lifecycle.selectedPromptTemplate
        }
    )

    // Keep-alive seams: the coordinator receives closures instead of object
    // references, so it never holds the service (or the queue) strongly.
    // They are installed once here; each call observes current state.
    lifecycle.idleDurationProvider = { [weak self] in
        guard let self else { return .infinity }
        return self.generation.idleDuration
    }
    lifecycle.unloadRequestHandler = { [weak self] reason in
        self?.unloadModel(reason: reason)
    }
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,39 @@ final class ModelLifecycleCoordinator {
/// `didSet` so tests and custom gates can swap it before each load.
var denyPolicy: LoadDenyPolicy = .platformDefault

// MARK: - Keep-Alive Policy

/// Policy that controls automatic idle unloading. Defaults to `.never` (disabled).
///
/// When the policy carries a non-nil `idleTimeout`, an idle watch task is armed
/// after each successful model load. The task polls `idleDurationProvider` and
/// calls `unloadRequestHandler` with `.idleTimeout` once the reported idle
/// duration reaches the threshold.
///
/// Written through the `InferenceService` facade's `keepAlivePolicy` setter, so
/// callers interact only with the public-facing property. Every assignment runs
/// `applyKeepAlivePolicy()`, which is a no-op while no model is loaded.
var keepAlivePolicy: KeepAlivePolicy = .never {
didSet { applyKeepAlivePolicy() }
}

/// Closure through which the idle watch task requests an unload on the facade.
///
/// Injected by ``InferenceService`` after init (the coordinator cannot hold a
/// strong reference to the service directly — that would create a retain cycle
/// since the service owns the coordinator). The injected closure captures the
/// service weakly. When this is `nil`, an expired idle timeout ends the watch
/// task without unloading anything.
var unloadRequestHandler: (@MainActor @Sendable (UnloadReason) -> Void)?

/// Closure that returns the current generation queue idle duration, in seconds.
///
/// Injected by ``InferenceService`` after init so the coordinator can poll
/// idle time without holding a direct reference to the generation queue.
/// When this is `nil`, the watch task treats the idle duration as `.infinity`.
var idleDurationProvider: (@MainActor @Sendable () -> TimeInterval)?

/// The running idle watch task, if any. Cancelled when a model unloads, when
/// `keepAlivePolicy` changes to a nil timeout while a model is loaded, and
/// before a replacement task is armed.
private var idleWatchTask: Task<Void, Never>?

// MARK: - Prompt Template

var selectedPromptTemplate: PromptTemplate = .chatML
Expand Down Expand Up @@ -383,6 +416,7 @@ final class ModelLifecycleCoordinator {
/// Does NOT stop generation — that is the facade's responsibility.
/// The facade calls `stopGeneration()` before delegating here.
func unloadModel() {
cancelIdleWatchTask()
invalidateOutstandingLoads()
backend?.unloadModel()
backend = nil
Expand Down Expand Up @@ -564,6 +598,7 @@ final class ModelLifecycleCoordinator {
residentFootprintBytes = footprintBytes
loadPhase = .loaded(request: request)
logLoadEvent("load.commit", request: request, clearMetadata: true)
armIdleWatchTaskIfNeeded()
return true
}

Expand Down Expand Up @@ -632,4 +667,63 @@ final class ModelLifecycleCoordinator {
return "Apple Foundation Models require iOS 26 / macOS 26 or later."
}
}

// MARK: - Keep-Alive Idle Watch (Private)

/// Brings the idle watch task in line with the current keep-alive policy.
///
/// Invoked from `keepAlivePolicy.didSet`. Nothing happens while no model is
/// loaded. With a model resident, a non-nil timeout (re-)arms the watch task
/// and a nil timeout cancels whatever watch task is running.
private func applyKeepAlivePolicy() {
    guard isModelLoaded else { return }
    switch keepAlivePolicy.idleTimeout {
    case .some:
        armIdleWatchTaskIfNeeded()
    case .none:
        cancelIdleWatchTask()
    }
}

/// Arms the idle watch task when a policy with a non-nil timeout is active.
///
/// Any previously running watch task is cancelled first, so rearming after a
/// new model load or a policy change always leaves exactly one watcher.
///
/// The idle time compared with the timeout is capped at the time elapsed
/// since this watch was armed. The value from `idleDurationProvider` can be
/// `.infinity` (nothing generated yet) or can predate the current load;
/// without the cap, a model that was just loaded would be unloaded at the
/// first poll instead of after a full timeout window. A consequence is that
/// rearming (new load or policy change) starts a fresh timeout window.
private func armIdleWatchTaskIfNeeded() {
    guard let timeout = keepAlivePolicy.idleTimeout else { return }
    cancelIdleWatchTask()

    // Poll every quarter of the timeout window, clamped to 0.1 s ... 10 s:
    // the lower bound prevents a busy loop for tiny timeouts (e.g. 0.5 s in
    // tests still fires promptly), the upper bound keeps long timeouts
    // responsive to activity and policy changes.
    let pollInterval = min(max(timeout / 4, 0.1), 10.0)

    // Reference point for the "idle since armed" cap described above.
    let armedAt = Date()

    idleWatchTask = Task { [weak self] in
        while !Task.isCancelled {
            do {
                try await Task.sleep(for: .seconds(pollInterval))
            } catch {
                // Sleep was cancelled — exit cleanly.
                return
            }
            guard !Task.isCancelled else { return }
            // Bind `self` per iteration so the task never keeps the
            // coordinator alive across a sleep.
            guard let self else { return }

            // Check idle duration against the current policy (the policy
            // may have been updated since the task was armed).
            guard let currentTimeout = self.keepAlivePolicy.idleTimeout else { return }
            let reportedIdle = self.idleDurationProvider?() ?? TimeInterval.infinity
            let idle = min(reportedIdle, Date().timeIntervalSince(armedAt))
            if idle >= currentTimeout {
                Log.inference.info("KeepAlivePolicy: idle \(idle, privacy: .public)s >= timeout \(currentTimeout, privacy: .public)s — requesting auto-unload")
                self.unloadRequestHandler?(.idleTimeout)
                return
            }
        }
    }
}

/// Stops the idle watch task, if one is running, and forgets the handle.
private func cancelIdleWatchTask() {
    guard let runningWatch = idleWatchTask else { return }
    idleWatchTask = nil
    runningWatch.cancel()
}
}
Loading