diff --git a/.github/api-breakage-allowlist.txt b/.github/api-breakage-allowlist.txt
index ace26956..27539e78 100644
--- a/.github/api-breakage-allowlist.txt
+++ b/.github/api-breakage-allowlist.txt
@@ -103,3 +103,4 @@ API breakage: constructor ToolCallTransform.init(markers:) has been removed
 API breakage: struct InferenceMetric has been removed
 API breakage: protocol InferenceMetricSink has been removed
 API breakage: class InMemoryMetricSink has been removed
+API breakage: enumelement UnloadReason.idleTimeout has been added as a new enum case
diff --git a/Sources/ManifoldHardware/MemoryPressureEvent.swift b/Sources/ManifoldHardware/MemoryPressureEvent.swift
index ca0d99ab..6ede8eb4 100644
--- a/Sources/ManifoldHardware/MemoryPressureEvent.swift
+++ b/Sources/ManifoldHardware/MemoryPressureEvent.swift
@@ -10,6 +10,8 @@ public enum UnloadReason: Sendable, Equatable {
     case userRequested
     /// The model was evicted while the app was in the background.
     case backgroundEviction
+    /// The model exceeded the configured idle keep-alive TTL and was automatically unloaded.
+    case idleTimeout
 }
 
 // MARK: - MemoryPressureEvent
diff --git a/Sources/ManifoldInference/KeepAlivePolicy.swift b/Sources/ManifoldInference/KeepAlivePolicy.swift
new file mode 100644
index 00000000..daf87c3a
--- /dev/null
+++ b/Sources/ManifoldInference/KeepAlivePolicy.swift
@@ -0,0 +1,40 @@
+import Foundation
+
+/// Policy that controls how long an idle resident model is kept in memory.
+///
+/// When ``idleTimeout`` is non-nil, ``InferenceService`` starts a background
+/// watch task after each successful model load. The task polls the generation
+/// queue's ``GenerationQueue/idleDuration`` and calls
+/// ``InferenceService/unloadModel(reason:)`` with ``UnloadReason/idleTimeout``
+/// when the model has been idle for longer than the configured threshold.
+///
+/// Any generation activity resets the idle clock, so a busy model is never
+/// evicted mid-turn.
+///
+/// ```swift
+/// // Unload after 5 minutes of silence (Ollama-style keep-alive):
+/// inferenceService.keepAlivePolicy = KeepAlivePolicy(idleTimeout: 5 * 60)
+///
+/// // Disable auto-unload (the default):
+/// inferenceService.keepAlivePolicy = .never
+/// ```
+public struct KeepAlivePolicy: Sendable, Equatable {
+
+    /// How long the model may be idle before being automatically unloaded.
+    ///
+    /// `nil` disables auto-unload. A value of `0` (or below) is met by any idle
+    /// reading, so the watch task unloads the model at its first poll; prefer a
+    /// positive value (≥ 1 second) in practice.
+    public var idleTimeout: TimeInterval?
+
+    /// The default policy: no automatic unloading.
+    public static let never = KeepAlivePolicy(idleTimeout: nil)
+
+    /// Creates a policy with the given idle timeout.
+    ///
+    /// - Parameter idleTimeout: Seconds of idle time after which the model
+    ///   is automatically unloaded. Pass `nil` to disable auto-unload.
+    public init(idleTimeout: TimeInterval?) {
+        self.idleTimeout = idleTimeout
+    }
+}
diff --git a/Sources/ManifoldInference/Services/GenerationQueue.swift b/Sources/ManifoldInference/Services/GenerationQueue.swift
index 005e8ddc..75f4ade6 100644
--- a/Sources/ManifoldInference/Services/GenerationQueue.swift
+++ b/Sources/ManifoldInference/Services/GenerationQueue.swift
@@ -202,6 +202,14 @@ final class GenerationQueue {
     /// meaningful even before the first request.
     private var lastActivityTimestamp: Date = .distantPast
 
+    /// Time in seconds since generation activity was last recorded. While
+    /// `lastActivityTimestamp` still holds `.distantPast` the result is `.infinity`,
+    /// so the keep-alive policy sees a never-used queue as maximally idle.
+    var idleDuration: TimeInterval {
+        let neverActive = lastActivityTimestamp == .distantPast
+        return neverActive ? .infinity : Date.now.timeIntervalSince(lastActivityTimestamp)
+    }
+
     // MARK: - Computed
 
     var hasQueuedRequests: Bool { !requestQueue.isEmpty }
diff --git a/Sources/ManifoldInference/Services/InferenceService.swift b/Sources/ManifoldInference/Services/InferenceService.swift
index 9b0ec8c3..d4aedec0 100644
--- a/Sources/ManifoldInference/Services/InferenceService.swift
+++ b/Sources/ManifoldInference/Services/InferenceService.swift
@@ -138,6 +138,31 @@ public final class InferenceService {
         set { lifecycle.selectedPromptTemplate = newValue }
     }
 
+    // MARK: - Keep-Alive Policy
+
+    /// Policy controlling automatic idle unloading of the resident model.
+    ///
+    /// Set to a ``KeepAlivePolicy`` with a non-nil `idleTimeout` to enable
+    /// Ollama-style keep-alive behaviour: the model is automatically unloaded
+    /// after the configured period of idle time, emitting
+    /// ``MemoryPressureEvent/willUnload(modelID:reason:)`` and
+    /// ``MemoryPressureEvent/didUnload(modelID:reason:)`` with reason
+    /// ``UnloadReason/idleTimeout``. Any generation activity resets the clock.
+    /// Assigning forwards to the lifecycle coordinator, which re-evaluates its
+    /// idle watch at once. Defaults to ``KeepAlivePolicy/never`` (disabled).
+    ///
+    /// ```swift
+    /// // Unload after 5 minutes of silence:
+    /// inferenceService.keepAlivePolicy = KeepAlivePolicy(idleTimeout: 5 * 60)
+    ///
+    /// // Disable auto-unload:
+    /// inferenceService.keepAlivePolicy = .never
+    /// ```
+    public var keepAlivePolicy: KeepAlivePolicy {
+        get { lifecycle.keepAlivePolicy }
+        set { lifecycle.keepAlivePolicy = newValue }
+    }
+
     // MARK: - Computed
 
     public var capabilities: BackendCapabilities? { lifecycle.capabilities }
@@ -998,12 +1023,25 @@ extension InferenceService {
     /// Replaces the former ``GenerationContextProvider`` protocol conformance.
     /// Each closure reads through `lifecycle` so the queue always sees the
     /// current backend / template state — never a cached snapshot.
+    ///
+    /// Also wires the keep-alive policy closures so the ``ModelLifecycleCoordinator``
+    /// can request idle unloads and read the generation queue's idle duration without
+    /// holding strong references that would create retain cycles.
     fileprivate func wireGenerationContext() {
         generation.bindContext(
             currentBackend: { [weak self] in self?.lifecycle.backend },
             isBackendLoaded: { [weak self] in self?.lifecycle.isModelLoaded ?? false },
             selectedPromptTemplate: { [weak self] in self?.lifecycle.selectedPromptTemplate ?? .chatML }
         )
+        // Keep-alive policy seams: inject weak-`self` closures rather than references
+        // to avoid the lifecycle ↔ service retain cycle. Set once at wiring time; they
+        // read current state, and report `.infinity` idle once the service is gone.
+        lifecycle.unloadRequestHandler = { [weak self] reason in
+            self?.unloadModel(reason: reason)
+        }
+        lifecycle.idleDurationProvider = { [weak self] in
+            self?.generation.idleDuration ?? .infinity
+        }
     }
 }
 
diff --git a/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift b/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift
index 29341430..46c5ad92 100644
--- a/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift
+++ b/Sources/ManifoldInference/Services/ModelLifecycleCoordinator.swift
@@ -47,6 +47,39 @@ final class ModelLifecycleCoordinator {
     /// `didSet` so tests and custom gates can swap it before each load.
     var denyPolicy: LoadDenyPolicy = .platformDefault
 
+    // MARK: - Keep-Alive Policy
+
+    /// Policy that controls automatic idle unloading. Defaults to `.never` (disabled).
+    ///
+    /// When set to a non-nil `idleTimeout`, an idle watch task is armed after each
+    /// successful model load. The task polls the generation queue's idle duration
+    /// and calls the facade's `unloadModel(reason:)` with `.idleTimeout` when the
+    /// model has been idle longer than the threshold.
+    ///
+    /// Written via the InferenceService facade's `keepAlivePolicy` setter so callers
+    /// use only the public property; the `didSet` below applies each new value.
+    var keepAlivePolicy: KeepAlivePolicy = .never {
+        didSet { applyKeepAlivePolicy() }
+    }
+
+    /// Closure through which the idle watch task requests an unload on the facade.
+    ///
+    /// Injected by ``InferenceService`` after init (the coordinator cannot hold a
+    /// strong reference to the service directly — that would create a retain cycle
+    /// since the service owns the coordinator). The closure is `@MainActor @Sendable`;
+    /// the watch `Task { }` reaches it through a weakly captured `self`.
+    var unloadRequestHandler: (@MainActor @Sendable (UnloadReason) -> Void)?
+
+    /// Closure that returns the current generation queue idle duration.
+    ///
+    /// Injected by ``InferenceService`` after init so the coordinator can poll
+    /// idle time without holding a direct reference to the generation queue.
+    var idleDurationProvider: (@MainActor @Sendable () -> TimeInterval)?
+
+    /// The running idle watch task, if any. Cancelled when a model unloads, when
+    /// `keepAlivePolicy` changes to `.never`, or just before the watch is re-armed.
+    private var idleWatchTask: Task<Void, Never>?
+
     // MARK: - Prompt Template
 
     var selectedPromptTemplate: PromptTemplate = .chatML
@@ -383,6 +416,7 @@ final class ModelLifecycleCoordinator {
     /// Does NOT stop generation — that is the facade's responsibility.
     /// The facade calls `stopGeneration()` before delegating here.
     func unloadModel() {
+        cancelIdleWatchTask()
         invalidateOutstandingLoads()
         backend?.unloadModel()
         backend = nil
@@ -564,6 +598,7 @@ final class ModelLifecycleCoordinator {
         residentFootprintBytes = footprintBytes
         loadPhase = .loaded(request: request)
         logLoadEvent("load.commit", request: request, clearMetadata: true)
+        armIdleWatchTaskIfNeeded()
         return true
     }
 
@@ -632,4 +667,63 @@ final class ModelLifecycleCoordinator {
             return "Apple Foundation Models require iOS 26 / macOS 26 or later."
         }
     }
+
+    // MARK: - Keep-Alive Idle Watch (Private)
+
+    /// Reacts to a change of `keepAlivePolicy` (invoked from its `didSet`).
+    ///
+    /// Does nothing while no model is loaded. Otherwise a policy that carries a
+    /// timeout (re)arms the idle watch task, and a policy without one cancels
+    /// whatever watch task is currently running.
+    private func applyKeepAlivePolicy() {
+        guard isModelLoaded else { return }
+        guard keepAlivePolicy.idleTimeout == nil else {
+            armIdleWatchTaskIfNeeded()
+            return
+        }
+        cancelIdleWatchTask()
+    }
+
+    /// Arms the idle watch task when a policy with a non-nil timeout is active.
+    ///
+    /// Any previously running watch task is cancelled first. Arming counts as
+    /// activity: idle time is capped by the time since this call, so a model
+    /// that was just loaded always gets a full timeout window before eviction.
+    private func armIdleWatchTaskIfNeeded() {
+        guard let timeout = keepAlivePolicy.idleTimeout else { return }
+        cancelIdleWatchTask()
+
+        // Poll every quarter of the timeout, clamped to 0.1–10 s, so short timeouts
+        // (e.g. 0.5 s in tests) still fire promptly.
+        let pollInterval = min(max(timeout / 4, 0.1), 10.0)
+        let armedAt = Date.now
+
+        idleWatchTask = Task { [weak self] in
+            while !Task.isCancelled {
+                do {
+                    try await Task.sleep(for: .seconds(pollInterval))
+                } catch {
+                    return  // Sleep was cancelled — exit cleanly.
+                }
+                guard !Task.isCancelled, let self else { return }
+                // Re-read the policy: it may have changed since the task was armed.
+                guard let currentTimeout = self.keepAlivePolicy.idleTimeout else { return }
+                // The queue reports `.infinity` before any generation; cap it by the time
+                // since arming so a just-loaded model is not evicted on the first poll.
+                let queueIdle = self.idleDurationProvider?() ?? TimeInterval.infinity
+                let idle = min(queueIdle, Date.now.timeIntervalSince(armedAt))
+                if idle >= currentTimeout {
+                    Log.inference.info("KeepAlivePolicy: idle \(idle, privacy: .public)s >= timeout \(currentTimeout, privacy: .public)s — requesting auto-unload")
+                    self.unloadRequestHandler?(.idleTimeout)
+                    return
+                }
+            }
+        }
+    }
+
+    /// Stops the idle watch task, if one is running, and drops the reference to it.
+    private func cancelIdleWatchTask() {
+        defer { idleWatchTask = nil }
+        idleWatchTask?.cancel()
+    }
 }
diff --git a/Tests/ManifoldInferenceTests/KeepAlivePolicyTests.swift b/Tests/ManifoldInferenceTests/KeepAlivePolicyTests.swift
new file mode 100644
index 00000000..74fc2850
--- /dev/null
+++ b/Tests/ManifoldInferenceTests/KeepAlivePolicyTests.swift
@@ -0,0 +1,179 @@
+import XCTest
+@testable import ManifoldInference
+import ManifoldTestSupport
+
+/// Tests for ``KeepAlivePolicy`` and its idle auto-unload integration with
+/// ``InferenceService``.
+///
+/// Timer-based tests use short idle timeouts (0.3 – 0.5 s) and poll `isModelLoaded`
+/// against 2-second `Date` deadlines so they run fast on CI without being flaky.
+@MainActor
+final class KeepAlivePolicyTests: XCTestCase {
+
+    // MARK: - Helpers
+
+    private func makeService() -> InferenceService {
+        let backend = MockInferenceBackend()
+        backend.isModelLoaded = true
+        return InferenceService(backend: backend)
+    }
+
+    // MARK: - KeepAlivePolicy value semantics
+
+    func test_keepAlivePolicy_defaultIsNever() {
+        let service = makeService()
+        XCTAssertNil(service.keepAlivePolicy.idleTimeout)
+        XCTAssertEqual(service.keepAlivePolicy, .never)
+    }
+
+    func test_keepAlivePolicy_equatable() {
+        XCTAssertEqual(KeepAlivePolicy.never, KeepAlivePolicy(idleTimeout: nil))
+        XCTAssertNotEqual(KeepAlivePolicy(idleTimeout: 30), KeepAlivePolicy(idleTimeout: 60))
+        XCTAssertNotEqual(KeepAlivePolicy.never, KeepAlivePolicy(idleTimeout: 30))
+    }
+
+    // MARK: - Default .never does NOT auto-unload
+
+    func test_never_policy_doesNotUnload() async throws {
+        let service = makeService()
+        // .never is the default; model is loaded.
+        XCTAssertTrue(service.isModelLoaded)
+
+        // Wait briefly — no unload should occur.
+        try await Task.sleep(for: .milliseconds(300))
+        XCTAssertTrue(
+            service.isModelLoaded,
+            "Model should remain loaded when keepAlivePolicy is .never"
+        )
+    }
+
+    // MARK: - Policy fires after idle timeout
+
+    func test_idleTimeout_unloadsAfterIdle() async throws {
+        let service = makeService()
+        XCTAssertTrue(service.isModelLoaded)
+
+        // Record any unload events so we can verify the reason.
+        var unloadReasons: [UnloadReason] = []
+        let stream = service.memoryPressureEvents()
+        let eventTask = Task { @MainActor in
+            for await event in stream {
+                if case .didUnload(_, let reason) = event {
+                    unloadReasons.append(reason)
+                }
+            }
+        }
+        defer { eventTask.cancel() }
+
+        // Set a 0.3-second idle timeout — short enough that the watch task fires
+        // before our 2-second test deadline, long enough to not be flaky.
+        service.keepAlivePolicy = KeepAlivePolicy(idleTimeout: 0.3)
+
+        // Wait up to 2 seconds for the model to be auto-unloaded.
+        let deadline = Date.now.addingTimeInterval(2.0)
+        while service.isModelLoaded && Date.now < deadline {
+            try await Task.sleep(for: .milliseconds(50))
+        }
+
+        XCTAssertFalse(service.isModelLoaded, "Model should have been auto-unloaded after idle timeout")
+
+        // Allow the event stream task a tick to collect the event.
+        try await Task.sleep(for: .milliseconds(50))
+        XCTAssertTrue(
+            unloadReasons.contains(.idleTimeout),
+            "MemoryPressureEvent should carry UnloadReason.idleTimeout; got \(unloadReasons)"
+        )
+    }
+
+    // MARK: - Activity resets the idle clock
+
+    func test_activity_resetsIdleClock() async throws {
+        let backend = MockInferenceBackend()
+        backend.isModelLoaded = true
+        // Provide tokens so the generation stream has something to consume.
+        backend.tokensToYield = ["hello"]
+        let service = InferenceService(backend: backend)
+
+        // Set a 0.4-second idle timeout.
+        service.keepAlivePolicy = KeepAlivePolicy(idleTimeout: 0.4)
+
+        // Enqueue and fully consume a generation ~0.1 s after setting the policy,
+        // which should reset the idle clock to now.
+        try await Task.sleep(for: .milliseconds(100))
+        let (_, genStream) = try service.enqueue(
+            messages: [.user("hi")],
+            config: GenerationConfig()
+        )
+        for try await _ in genStream.events {}
+
+        // Shortly after the generation completes the model should still be loaded
+        // (idle clock was just reset).
+        try await Task.sleep(for: .milliseconds(100))
+        XCTAssertTrue(
+            service.isModelLoaded,
+            "Model should still be loaded — activity reset the idle clock"
+        )
+
+        // But eventually (well after 0.4 s of silence post-generation) it should
+        // auto-unload.
+        let deadline = Date.now.addingTimeInterval(2.0)
+        while service.isModelLoaded && Date.now < deadline {
+            try await Task.sleep(for: .milliseconds(50))
+        }
+        XCTAssertFalse(
+            service.isModelLoaded,
+            "Model should eventually auto-unload after activity silence"
+        )
+    }
+
+    // MARK: - Policy disabled at runtime before timeout fires
+
+    func test_policyDisabledBeforeTimeout_keepsModelLoaded() async throws {
+        let service = makeService()
+        XCTAssertTrue(service.isModelLoaded)
+
+        // Arm a 0.5-second timeout.
+        service.keepAlivePolicy = KeepAlivePolicy(idleTimeout: 0.5)
+
+        // Immediately disable it — the watch task should be cancelled.
+        service.keepAlivePolicy = .never
+
+        // Wait beyond what the timeout would have been.
+        try await Task.sleep(for: .milliseconds(700))
+
+        XCTAssertTrue(
+            service.isModelLoaded,
+            "Model should remain loaded after the policy was reverted to .never"
+        )
+    }
+
+    // MARK: - Explicit unload cancels the watch task (no double-unload)
+
+    func test_explicitUnload_doesNotTriggerSecondUnload() async throws {
+        let service = makeService()
+        XCTAssertTrue(service.isModelLoaded)
+
+        var didUnloadCount = 0
+        let stream = service.memoryPressureEvents()
+        let eventTask = Task { @MainActor in
+            for await event in stream {
+                if case .didUnload = event { didUnloadCount += 1 }
+            }
+        }
+        defer { eventTask.cancel() }
+
+        service.keepAlivePolicy = KeepAlivePolicy(idleTimeout: 0.5)
+
+        // Explicitly unload before the timer fires.
+        service.unloadModel()
+        XCTAssertFalse(service.isModelLoaded)
+
+        // Wait beyond the timeout window.
+        try await Task.sleep(for: .milliseconds(700))
+
+        // Allow the event stream task a tick to collect events.
+        try await Task.sleep(for: .milliseconds(50))
+
+        XCTAssertEqual(didUnloadCount, 1, "Only one unload event should fire — the explicit one")
+    }
+}