roryford · roryford · Jun 15, 2026 · Jun 15, 2026
@@ -100,3 +100,6 @@ API breakage: func ModelLoadCoordinator.dispatchLoad(_:) has been renamed to fun
 API breakage: enumelement GenerationEvent.toolCallParseFailed has been added as a new enum case
 API breakage: enumelement GenerationEvent.toolCallTruncated has been added as a new enum case
 API breakage: constructor ToolCallTransform.init(markers:) has been removed
+API breakage: struct InferenceMetric has been removed
+API breakage: protocol InferenceMetricSink has been removed
+API breakage: class InMemoryMetricSink has been removed
@@ -0,0 +1,13 @@
+// Source compatibility shim — InferenceMetric, InferenceMetricSink, and
+// InMemoryMetricSink were relocated from ManifoldCloudCore to ManifoldInference
+// in the observability train so that ManifoldFoundation (which depends on
+// ManifoldInference but not ManifoldCloudCore) can reach them.
+//
+// @_exported re-surfaces the entire ManifoldInference surface through
+// ManifoldCloudCore so all existing `import ManifoldCloudCore` consumers
+// continue to resolve InferenceMetric / InferenceMetricSink / InMemoryMetricSink
+// at the same import depth — no source changes required downstream.
+//
+// ManifoldCloudCore already takes a direct dependency on ManifoldInference in
+// Package.swift, so this is a pure source-compat promotion, not a new dep.
+@_exported import ManifoldInference
@@ -110,12 +110,27 @@ struct SSEGenerationTaskRunner {
         }
 
         if let sink = context.metricSink {
+            // Compute cost in ManifoldCloudCore where InferenceCostEstimator lives,
+            // then pass the pre-resolved values to the ManifoldInference record helper.
+            let usage = context.readUsage()
+            let promptTokens = usage?.promptTokens ?? 0
+            let completionTokens = usage?.completionTokens ?? 0
+            let (costUSD, isApprox) = InferenceCostEstimator.estimatedCost(
+                provider: context.backendName,
+                model: context.modelName,
+                promptTokens: promptTokens,
+                completionTokens: completionTokens
+            )
             SSEGenerationMetrics.record(
                 to: sink,
                 tracker: metricTracker,
                 provider: context.backendName,
                 model: context.modelName,
-                usage: context.readUsage(),
+                promptTokens: promptTokens,
+                completionTokens: completionTokens,
+                estimatedCostUSD: costUSD,
+                isCostApproximate: isApprox,
+                costTableDate: InferenceCostEstimator.costTableDate,
                 errorClass: streamError.map { SSECloudBackend.classifyError($0) }
             )
         }

@@ -6,6 +6,10 @@ import os
 // surface only (InferenceBackend, GenerationConfig, GenerationEvent, …) — no
 // engine state. ManifoldContract re-exports the P1 leaf types it needs.
 import ManifoldContract
+// InferenceMetricSink and InMemoryMetricSink live in ManifoldInference since
+// the observability train relocated them from ManifoldCloudCore so that this
+// backend can reach them without a ManifoldCloudCore dependency.
+import ManifoldInference
 
 /// Apple FoundationModels inference backend for on-device Apple Intelligence models.
 ///
@@ -195,6 +199,12 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
     /// real Apple Intelligence entitlement. Production uses the system default.
     private let availabilityResolver: @Sendable () -> SystemLanguageModel.Availability
 
+    /// The sink that receives an ``InferenceMetric`` after every generation call.
+    ///
+    /// Defaults to ``InMemoryMetricSink/shared`` so callers can read recent
+    /// metrics without any configuration. Set to `nil` to disable metric emission.
+    public var metricSink: (any InferenceMetricSink)? = InMemoryMetricSink.shared
+
     /// Structured conversation history installed by ``GenerationHistoryInstaller``
     /// through the ``StructuredHistoryReceiver`` opt-in.
     ///
@@ -496,14 +506,34 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
         // returning and the Task being scheduled by the cooperative executor.
         // The retain cycle (backend → generationTask → backend) is broken in the
         // `defer` block when `generationTask` is nilled out on completion.
+        let metricTracker = GenerationMetricTracker()
+        let capturedMetricSink = withStateLock { metricSink }
         let task = Task { [self, generationStream] in
+            var streamError: Error?
             defer {
                 withStateLock {
                     if generationSequence == generationID {
                         _isGenerating = false
                         generationTask = nil
                     }
                 }
+                // Emit an InferenceMetric after every generation (success or
+                // failure). Cost is zero / approximate because the Foundation
+                // Models framework does not expose token-level billing.
+                if let sink = capturedMetricSink {
+                    SSEGenerationMetrics.record(
+                        to: sink,
+                        tracker: metricTracker,
+                        provider: "FoundationModels",
+                        model: "apple-foundation",
+                        promptTokens: 0,
+                        completionTokens: 0,
+                        estimatedCostUSD: 0,
+                        isCostApproximate: true,
+                        costTableDate: "",
+                        errorClass: streamError.map { String(describing: type(of: $0)) }
+                    )
+                }
                 Self.logger.debug("Foundation generate finished")
             }
 
@@ -527,6 +557,8 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
                 // iterator was dropped before returning nil.
                 withStateLock { _sessionIsClean = false }
 
+                metricTracker.start()
+
                 let result: StreamResult
                 if let toolEnvelope {
                     result = try await runToolAwareStream(
@@ -535,15 +567,17 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
                         schema: toolEnvelope,
                         options: options,
                         continuation: continuation,
-                        generationStream: generationStream
+                        generationStream: generationStream,
+                        metricTracker: capturedMetricSink != nil ? metricTracker : nil
                     )
                 } else {
                     result = try await runTextOnlyStream(
                         session: activeSession,
                         prompt: prompt,
                         options: options,
                         continuation: continuation,
-                        generationStream: generationStream
+                        generationStream: generationStream,
+                        metricTracker: capturedMetricSink != nil ? metricTracker : nil
                     )
                 }
 
@@ -579,6 +613,7 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
 
                 await MainActor.run { generationStream.setPhase(.done) }
             } catch {
+                streamError = error
                 if !Task.isCancelled {
                     Self.logger.error("Foundation generation error: \(error)")
                     await MainActor.run { generationStream.setPhase(.failed(error.localizedDescription)) }
@@ -620,7 +655,8 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
         prompt: String,
         options: GenerationOptions,
         continuation: AsyncThrowingStream<GenerationEvent, Error>.Continuation,
-        generationStream: GenerationStream
+        generationStream: GenerationStream,
+        metricTracker: GenerationMetricTracker?
     ) async throws -> StreamResult {
         let responseStream = session.streamResponse(to: prompt, options: options)
 
@@ -641,6 +677,7 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
                     await MainActor.run { generationStream.setPhase(.streaming) }
                     isFirstToken = false
                 }
+                metricTracker?.recordToken()
                 continuation.yield(.token(newContent))
                 eventsEmitted += 1
                 previousCount = currentText.count
@@ -662,7 +699,8 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
         schema: GenerationSchema,
         options: GenerationOptions,
         continuation: AsyncThrowingStream<GenerationEvent, Error>.Continuation,
-        generationStream: GenerationStream
+        generationStream: GenerationStream,
+        metricTracker: GenerationMetricTracker?
     ) async throws -> StreamResult {
         let responseStream = session.streamResponse(
             to: prompt,
@@ -699,6 +737,7 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
                         await MainActor.run { generationStream.setPhase(.streaming) }
                         isFirstToken = false
                     }
+                    metricTracker?.recordToken()
                     continuation.yield(.token(delta))
                     eventsEmitted += 1
                     lastTextLength = textSoFar.count

@@ -1,19 +1,20 @@
 import Foundation
-import ManifoldInference
 
 /// Accumulates per-token timing data for a single generation call.
 ///
 /// Thread-safety via `NSLock`. Updated from the generation task (arbitrary
 /// thread); read after the task completes to build the final ``InferenceMetric``.
-final class GenerationMetricTracker: @unchecked Sendable {
+package final class GenerationMetricTracker: @unchecked Sendable {
     private let lock = NSLock()
     private var wallStart: ContinuousClock.Instant = ContinuousClock.now
     private var dispatchDate: Date = Date()
     private var firstTokenInstant: ContinuousClock.Instant?
     private var lastTokenInstant: ContinuousClock.Instant?
     private var interTokenGapsNs: [Int64] = []
 
-    func start() {
+    package init() {}
+
+    package func start() {
         lock.lock()
         defer { lock.unlock() }
         wallStart = ContinuousClock.now
@@ -22,7 +23,7 @@ final class GenerationMetricTracker: @unchecked Sendable {
         dispatchDate = Date()
     }
 
-    func recordToken() {
+    package func recordToken() {
         lock.lock()
         defer { lock.unlock() }
         let now = ContinuousClock.now
@@ -37,7 +38,7 @@ final class GenerationMetricTracker: @unchecked Sendable {
         lastTokenInstant = now
     }
 
-    func buildMetric(
+    package func buildMetric(
         provider: String,
         model: String,
         promptTokens: Int,
@@ -88,8 +89,8 @@ final class GenerationMetricTracker: @unchecked Sendable {
     }
 }
 
-enum SSEGenerationMetrics {
-    static func observing(
+package enum SSEGenerationMetrics {
+    package static func observing(
         _ stream: AsyncThrowingStream<GenerationEvent, Error>,
         tracker: GenerationMetricTracker,
         enabled: Bool
@@ -115,31 +116,35 @@ enum SSEGenerationMetrics {
         }
     }
 
-    static func record(
+    /// Records a metric to `sink` using pre-built tracker data.
+    ///
+    /// Cost fields are passed explicitly so this method remains in
+    /// `ManifoldInference` without a dependency on `InferenceCostEstimator`,
+    /// which lives in `ManifoldCloudCore`. Cloud backends compute cost before
+    /// calling this method; local backends (Foundation) pass zero cost with
+    /// `isCostApproximate: true`.
+    package static func record(
         to sink: any InferenceMetricSink,
         tracker: GenerationMetricTracker,
         provider: String,
         model: String,
-        usage: (promptTokens: Int, completionTokens: Int)?,
+        promptTokens: Int,
+        completionTokens: Int,
+        cachedPromptTokens: Int = 0,
+        estimatedCostUSD: Double,
+        isCostApproximate: Bool,
+        costTableDate: String,
         errorClass: String?
     ) {
-        let promptTokens = usage?.promptTokens ?? 0
-        let completionTokens = usage?.completionTokens ?? 0
-        let (costUSD, isApprox) = InferenceCostEstimator.estimatedCost(
-            provider: provider,
-            model: model,
-            promptTokens: promptTokens,
-            completionTokens: completionTokens
-        )
         let metric = tracker.buildMetric(
             provider: provider,
             model: model,
             promptTokens: promptTokens,
-            cachedPromptTokens: 0,
+            cachedPromptTokens: cachedPromptTokens,
             completionTokens: completionTokens,
-            estimatedCostUSD: costUSD,
-            isCostApproximate: isApprox,
-            costTableDate: InferenceCostEstimator.costTableDate,
+            estimatedCostUSD: estimatedCostUSD,
+            isCostApproximate: isCostApproximate,
+            costTableDate: costTableDate,
             errorClass: errorClass
         )
         Task { await sink.record(metric) }

@@ -1,14 +1,14 @@
 import Foundation
 
 /// A snapshot of latency, cost, and token-count data produced after a single
-/// cloud inference call.
+/// inference call.
 ///
-/// Emitted by ``SSECloudBackend`` after every generation (success or failure)
-/// and forwarded to the configured ``InferenceMetricSink``. Consumers use this
-/// to power dashboards, cost alerts, and latency regression detection without
-/// having to instrument individual backends.
+/// Emitted by backends after every generation (success or failure) and forwarded
+/// to the configured ``InferenceMetricSink``. Consumers use this to power
+/// dashboards, cost alerts, and latency regression detection without having to
+/// instrument individual backends.
 public struct InferenceMetric: Sendable {
-    /// Human-readable backend name (e.g. "Claude", "OpenAI").
+    /// Human-readable backend name (e.g. "Claude", "OpenAI", "FoundationModels").
     public let provider: String
     /// Model identifier used for the call (e.g. "claude-sonnet-4-6").
     public let model: String
@@ -75,7 +75,7 @@ public struct InferenceMetric: Sendable {
 
 // MARK: - Sink Protocol
 
-/// A type that receives ``InferenceMetric`` values produced by cloud backends.
+/// A type that receives ``InferenceMetric`` values produced by backends.
 ///
 /// Conform to this protocol to route metrics into observability systems (Datadog,
 /// OpenTelemetry, a local ring buffer, etc.) without coupling the backend layer
@@ -89,16 +89,16 @@ public protocol InferenceMetricSink: AnyObject, Sendable {
 
 /// A thread-safe, bounded ring buffer of ``InferenceMetric`` values.
 ///
-/// The shared singleton is the default sink wired into ``SSECloudBackend``.
+/// The shared singleton is the default sink wired into cloud and local backends.
 /// Tests and host apps can inject their own sink; this actor is useful as a
 /// lightweight diagnostic tool in debug builds.
 ///
 /// When the buffer is full the oldest entry is evicted before the new one is
 /// appended, so memory usage stays constant regardless of call volume.
 public actor InMemoryMetricSink: InferenceMetricSink {
 
-    /// Shared singleton. ``SSECloudBackend`` defaults to this sink so callers
-    /// can read recent metrics without configuring anything.
+    /// Shared singleton. Backends default to this sink so callers can read
+    /// recent metrics without configuring anything.
     public static let shared = InMemoryMetricSink()
 
     private var metrics: [InferenceMetric] = []