Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/api-breakage-allowlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,6 @@ API breakage: func ModelLoadCoordinator.dispatchLoad(_:) has been renamed to fun
API breakage: enumelement GenerationEvent.toolCallParseFailed has been added as a new enum case
API breakage: enumelement GenerationEvent.toolCallTruncated has been added as a new enum case
API breakage: constructor ToolCallTransform.init(markers:) has been removed
API breakage: struct InferenceMetric has been removed
API breakage: protocol InferenceMetricSink has been removed
API breakage: class InMemoryMetricSink has been removed
13 changes: 13 additions & 0 deletions Sources/ManifoldCloudCore/MetricTypeAliases.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Source compatibility shim — InferenceMetric, InferenceMetricSink, and
// InMemoryMetricSink were relocated from ManifoldCloudCore to ManifoldInference
// in the observability train so that ManifoldFoundation (which depends on
// ManifoldInference but not ManifoldCloudCore) can reach them.
//
// @_exported re-surfaces the entire ManifoldInference surface through
// ManifoldCloudCore so all existing `import ManifoldCloudCore` consumers
// continue to resolve InferenceMetric / InferenceMetricSink / InMemoryMetricSink
// at the same import depth — no source changes required downstream.
//
// ManifoldCloudCore already takes a direct dependency on ManifoldInference in
// Package.swift, so this is a pure source-compat promotion, not a new dep.
@_exported import ManifoldInference
17 changes: 16 additions & 1 deletion Sources/ManifoldCloudCore/SSEGenerationTaskRunner.swift
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,27 @@ struct SSEGenerationTaskRunner {
}

if let sink = context.metricSink {
// Compute cost in ManifoldCloudCore where InferenceCostEstimator lives,
// then pass the pre-resolved values to the ManifoldInference record helper.
let usage = context.readUsage()
let promptTokens = usage?.promptTokens ?? 0
let completionTokens = usage?.completionTokens ?? 0
let (costUSD, isApprox) = InferenceCostEstimator.estimatedCost(
provider: context.backendName,
model: context.modelName,
promptTokens: promptTokens,
completionTokens: completionTokens
)
SSEGenerationMetrics.record(
to: sink,
tracker: metricTracker,
provider: context.backendName,
model: context.modelName,
usage: context.readUsage(),
promptTokens: promptTokens,
completionTokens: completionTokens,
estimatedCostUSD: costUSD,
isCostApproximate: isApprox,
costTableDate: InferenceCostEstimator.costTableDate,
errorClass: streamError.map { SSECloudBackend.classifyError($0) }
)
}
Expand Down
47 changes: 43 additions & 4 deletions Sources/ManifoldFoundation/FoundationBackend.swift
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ import os
// surface only (InferenceBackend, GenerationConfig, GenerationEvent, …) — no
// engine state. ManifoldContract re-exports the P1 leaf types it needs.
import ManifoldContract
// InferenceMetricSink and InMemoryMetricSink live in ManifoldInference since
// the observability train relocated them from ManifoldCloudCore so that this
// backend can reach them without a ManifoldCloudCore dependency.
import ManifoldInference

/// Apple FoundationModels inference backend for on-device Apple Intelligence models.
///
Expand Down Expand Up @@ -195,6 +199,12 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
/// real Apple Intelligence entitlement. Production uses the system default.
private let availabilityResolver: @Sendable () -> SystemLanguageModel.Availability

/// The sink that receives an ``InferenceMetric`` after every generation call.
///
/// Defaults to ``InMemoryMetricSink/shared`` so callers can read recent
/// metrics without any configuration. Set to `nil` to disable metric emission.
public var metricSink: (any InferenceMetricSink)? = InMemoryMetricSink.shared

/// Structured conversation history installed by ``GenerationHistoryInstaller``
/// through the ``StructuredHistoryReceiver`` opt-in.
///
Expand Down Expand Up @@ -496,14 +506,34 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
// returning and the Task being scheduled by the cooperative executor.
// The retain cycle (backend → generationTask → backend) is broken in the
// `defer` block when `generationTask` is nilled out on completion.
let metricTracker = GenerationMetricTracker()
let capturedMetricSink = withStateLock { metricSink }
let task = Task { [self, generationStream] in
var streamError: Error?
defer {
withStateLock {
if generationSequence == generationID {
_isGenerating = false
generationTask = nil
}
}
// Emit an InferenceMetric after every generation (success or
// failure). Cost is zero / approximate because the Foundation
// Models framework does not expose token-level billing.
if let sink = capturedMetricSink {
SSEGenerationMetrics.record(
to: sink,
tracker: metricTracker,
provider: "FoundationModels",
model: "apple-foundation",
promptTokens: 0,
completionTokens: 0,
estimatedCostUSD: 0,
isCostApproximate: true,
costTableDate: "",
errorClass: streamError.map { String(describing: type(of: $0)) }
)
}
Self.logger.debug("Foundation generate finished")
}

Expand All @@ -527,6 +557,8 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
// iterator was dropped before returning nil.
withStateLock { _sessionIsClean = false }

metricTracker.start()

let result: StreamResult
if let toolEnvelope {
result = try await runToolAwareStream(
Expand All @@ -535,15 +567,17 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
schema: toolEnvelope,
options: options,
continuation: continuation,
generationStream: generationStream
generationStream: generationStream,
metricTracker: capturedMetricSink != nil ? metricTracker : nil
)
} else {
result = try await runTextOnlyStream(
session: activeSession,
prompt: prompt,
options: options,
continuation: continuation,
generationStream: generationStream
generationStream: generationStream,
metricTracker: capturedMetricSink != nil ? metricTracker : nil
)
}

Expand Down Expand Up @@ -579,6 +613,7 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {

await MainActor.run { generationStream.setPhase(.done) }
} catch {
streamError = error
if !Task.isCancelled {
Self.logger.error("Foundation generation error: \(error)")
await MainActor.run { generationStream.setPhase(.failed(error.localizedDescription)) }
Expand Down Expand Up @@ -620,7 +655,8 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
prompt: String,
options: GenerationOptions,
continuation: AsyncThrowingStream<GenerationEvent, Error>.Continuation,
generationStream: GenerationStream
generationStream: GenerationStream,
metricTracker: GenerationMetricTracker?
) async throws -> StreamResult {
let responseStream = session.streamResponse(to: prompt, options: options)

Expand All @@ -641,6 +677,7 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
await MainActor.run { generationStream.setPhase(.streaming) }
isFirstToken = false
}
metricTracker?.recordToken()
continuation.yield(.token(newContent))
eventsEmitted += 1
previousCount = currentText.count
Expand All @@ -662,7 +699,8 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
schema: GenerationSchema,
options: GenerationOptions,
continuation: AsyncThrowingStream<GenerationEvent, Error>.Continuation,
generationStream: GenerationStream
generationStream: GenerationStream,
metricTracker: GenerationMetricTracker?
) async throws -> StreamResult {
let responseStream = session.streamResponse(
to: prompt,
Expand Down Expand Up @@ -699,6 +737,7 @@ public final class FoundationBackend: InferenceBackend, @unchecked Sendable {
await MainActor.run { generationStream.setPhase(.streaming) }
isFirstToken = false
}
metricTracker?.recordToken()
continuation.yield(.token(delta))
eventsEmitted += 1
lastTextLength = textSoFar.count
Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
import Foundation
import ManifoldInference

/// Accumulates per-token timing data for a single generation call.
///
/// Thread-safety via `NSLock`. Updated from the generation task (arbitrary
/// thread); read after the task completes to build the final ``InferenceMetric``.
final class GenerationMetricTracker: @unchecked Sendable {
package final class GenerationMetricTracker: @unchecked Sendable {
private let lock = NSLock()
private var wallStart: ContinuousClock.Instant = ContinuousClock.now
private var dispatchDate: Date = Date()
private var firstTokenInstant: ContinuousClock.Instant?
private var lastTokenInstant: ContinuousClock.Instant?
private var interTokenGapsNs: [Int64] = []

func start() {
package init() {}

package func start() {
lock.lock()
defer { lock.unlock() }
wallStart = ContinuousClock.now
Expand All @@ -22,7 +23,7 @@ final class GenerationMetricTracker: @unchecked Sendable {
dispatchDate = Date()
}

func recordToken() {
package func recordToken() {
lock.lock()
defer { lock.unlock() }
let now = ContinuousClock.now
Expand All @@ -37,7 +38,7 @@ final class GenerationMetricTracker: @unchecked Sendable {
lastTokenInstant = now
}

func buildMetric(
package func buildMetric(
provider: String,
model: String,
promptTokens: Int,
Expand Down Expand Up @@ -88,8 +89,8 @@ final class GenerationMetricTracker: @unchecked Sendable {
}
}

enum SSEGenerationMetrics {
static func observing(
package enum SSEGenerationMetrics {
package static func observing(
_ stream: AsyncThrowingStream<GenerationEvent, Error>,
tracker: GenerationMetricTracker,
enabled: Bool
Expand All @@ -115,31 +116,35 @@ enum SSEGenerationMetrics {
}
}

static func record(
/// Records a metric to `sink` using pre-built tracker data.
///
/// Cost fields are passed explicitly so this method remains in
/// `ManifoldInference` without a dependency on `InferenceCostEstimator`,
/// which lives in `ManifoldCloudCore`. Cloud backends compute cost before
/// calling this method; local backends (Foundation) pass zero cost with
/// `isCostApproximate: true`.
package static func record(
to sink: any InferenceMetricSink,
tracker: GenerationMetricTracker,
provider: String,
model: String,
usage: (promptTokens: Int, completionTokens: Int)?,
promptTokens: Int,
completionTokens: Int,
cachedPromptTokens: Int = 0,
estimatedCostUSD: Double,
isCostApproximate: Bool,
costTableDate: String,
errorClass: String?
) {
let promptTokens = usage?.promptTokens ?? 0
let completionTokens = usage?.completionTokens ?? 0
let (costUSD, isApprox) = InferenceCostEstimator.estimatedCost(
provider: provider,
model: model,
promptTokens: promptTokens,
completionTokens: completionTokens
)
let metric = tracker.buildMetric(
provider: provider,
model: model,
promptTokens: promptTokens,
cachedPromptTokens: 0,
cachedPromptTokens: cachedPromptTokens,
completionTokens: completionTokens,
estimatedCostUSD: costUSD,
isCostApproximate: isApprox,
costTableDate: InferenceCostEstimator.costTableDate,
estimatedCostUSD: estimatedCostUSD,
isCostApproximate: isCostApproximate,
costTableDate: costTableDate,
errorClass: errorClass
)
Task { await sink.record(metric) }
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import Foundation

/// A snapshot of latency, cost, and token-count data produced after a single
/// cloud inference call.
/// inference call.
///
/// Emitted by ``SSECloudBackend`` after every generation (success or failure)
/// and forwarded to the configured ``InferenceMetricSink``. Consumers use this
/// to power dashboards, cost alerts, and latency regression detection without
/// having to instrument individual backends.
/// Emitted by backends after every generation (success or failure) and forwarded
/// to the configured ``InferenceMetricSink``. Consumers use this to power
/// dashboards, cost alerts, and latency regression detection without having to
/// instrument individual backends.
public struct InferenceMetric: Sendable {
/// Human-readable backend name (e.g. "Claude", "OpenAI").
/// Human-readable backend name (e.g. "Claude", "OpenAI", "FoundationModels").
public let provider: String
/// Model identifier used for the call (e.g. "claude-sonnet-4-6").
public let model: String
Expand Down Expand Up @@ -75,7 +75,7 @@ public struct InferenceMetric: Sendable {

// MARK: - Sink Protocol

/// A type that receives ``InferenceMetric`` values produced by cloud backends.
/// A type that receives ``InferenceMetric`` values produced by backends.
///
/// Conform to this protocol to route metrics into observability systems (Datadog,
/// OpenTelemetry, a local ring buffer, etc.) without coupling the backend layer
Expand All @@ -89,16 +89,16 @@ public protocol InferenceMetricSink: AnyObject, Sendable {

/// A thread-safe, bounded ring buffer of ``InferenceMetric`` values.
///
/// The shared singleton is the default sink wired into ``SSECloudBackend``.
/// The shared singleton is the default sink wired into cloud and local backends.
/// Tests and host apps can inject their own sink; this actor is useful as a
/// lightweight diagnostic tool in debug builds.
///
/// When the buffer is full the oldest entry is evicted before the new one is
/// appended, so memory usage stays constant regardless of call volume.
public actor InMemoryMetricSink: InferenceMetricSink {

/// Shared singleton. ``SSECloudBackend`` defaults to this sink so callers
/// can read recent metrics without configuring anything.
/// Shared singleton. Backends default to this sink so callers can read
/// recent metrics without configuring anything.
public static let shared = InMemoryMetricSink()

private var metrics: [InferenceMetric] = []
Expand Down
Loading