From 1b5813ccff98c9f402ba542ccf4ea8470b74fdfe Mon Sep 17 00:00:00 2001 From: Zhongxuan Wang Date: Thu, 25 Jun 2026 17:45:32 -0700 Subject: [PATCH] docs: document and stabilize token/cost field semantics (RELAY-243) Add a canonical "Token and Cost Field Semantics" section to the provider response codecs page: a Usage and CostEstimate field reference, the per-provider token normalization table, an exporter field-mapping table (ATOF/ATIF/OpenInference/OpenTelemetry), and a stability contract. Add brief field pointers from the OpenTelemetry, OpenInference, and ATIF exporter pages, and a Known Issues entry noting ATIF derives token/cost from the raw event payload rather than the codec annotation. Lock the contract with two characterization tests: OpenTelemetry LLM end events emit cost only (no token-count or gen_ai attributes), and Usage ignores unmodeled provider subfields. Existing tests already cover the other projections and the USD-only/currency-aware cost behavior. No runtime behavior change. Signed-off-by: Zhongxuan Wang --- .../core/tests/unit/codec/response_tests.rs | 15 ++++ .../tests/unit/observability/otel_tests.rs | 37 ++++++++ .../release-notes/known-issues.mdx | 7 ++ .../provider-response-codecs.mdx | 89 ++++++++++++++++++- docs/observability-plugin/atif.mdx | 7 ++ docs/observability-plugin/openinference.mdx | 6 ++ docs/observability-plugin/opentelemetry.mdx | 6 ++ 7 files changed, 164 insertions(+), 3 deletions(-) diff --git a/crates/core/tests/unit/codec/response_tests.rs b/crates/core/tests/unit/codec/response_tests.rs index c51ed246c..17b23c1e3 100644 --- a/crates/core/tests/unit/codec/response_tests.rs +++ b/crates/core/tests/unit/codec/response_tests.rs @@ -1456,6 +1456,21 @@ fn test_unknown_model_pricing_returns_none_without_blocking_usage() { assert_eq!(usage.prompt_tokens, Some(1_000)); } +#[test] +fn test_usage_ignores_unmodeled_provider_subfields() { + // Usage has no serde catch-all, so unknown provider usage keys are dropped. + let usage: Usage = serde_json::from_value(json!({ + "prompt_tokens": 5, + "completion_tokens": 7, + "some_future_field": 99 + })) + .unwrap(); + assert_eq!(usage.prompt_tokens, Some(5)); + assert_eq!(usage.completion_tokens, Some(7)); + let reserialized = serde_json::to_value(&usage).unwrap(); + assert!(reserialized.get("some_future_field").is_none()); +} + // ------------------------------------------------------------------- // FinishReason serialization // ------------------------------------------------------------------- diff --git a/crates/core/tests/unit/observability/otel_tests.rs b/crates/core/tests/unit/observability/otel_tests.rs index 661fe660a..7927fbb1c 100644 --- a/crates/core/tests/unit/observability/otel_tests.rs +++ b/crates/core/tests/unit/observability/otel_tests.rs @@ -971,6 +971,43 @@ fn llm_end_with_unannotated_openai_response_uses_codec_cost() { ); } +#[test] +fn llm_end_emits_cost_only_no_token_or_gen_ai_attributes() { + let _pricing_guard = pricing_test_mutex().lock().unwrap(); + install_openai_disambiguation_pricing("priced-model"); + let _reset_guard = ResetPricingResolverGuard; + + let event = make_end_event( + Uuid::now_v7(), + None, + "other", + ScopeType::Llm, + Some(openai_chat_provider_response("priced-model")), + ); + + let attributes = attr_map(&end_attributes(&event)); + + // Cost is the only nemo_relay.llm.* surface; token counts are not emitted as + // discrete attributes, and no foreign gen_ai.* namespace is used. + let llm_keys: Vec<&String> = attributes + .keys() + .filter(|key| key.starts_with("nemo_relay.llm.")) + .collect(); + assert_eq!( + llm_keys.len(), + 2, + "unexpected nemo_relay.llm.* keys: {llm_keys:?}" + ); + assert!(attributes.contains_key("nemo_relay.llm.cost.total")); + assert!(attributes.contains_key("nemo_relay.llm.cost.currency")); + assert!( + attributes + .keys() + .all(|key| !key.starts_with("gen_ai") && !key.contains("token_count")), + "no token-count attributes expected" + ); +} + #[test] fn llm_end_with_unpriced_response_model_uses_requested_model_cost() { let _pricing_guard = pricing_test_mutex().lock().unwrap(); diff --git a/docs/about-nemo-relay/release-notes/known-issues.mdx b/docs/about-nemo-relay/release-notes/known-issues.mdx index 8ead12a6f..550b8ba9c 100644 --- a/docs/about-nemo-relay/release-notes/known-issues.mdx +++ b/docs/about-nemo-relay/release-notes/known-issues.mdx @@ -42,6 +42,13 @@ and limitations apply to NeMo Relay 0.4: - Pricing estimates depend on configured pricing sources and the freshness of the source catalog. Unknown model pricing and missing token data leave cost absent instead of defaulting to zero. +- ATIF token and cost metrics are derived from the raw LLM event payload, not + from the codec-normalized `annotated_response.usage`. A managed call whose + usage or cost exists only in the codec annotation (for example, a custom + response codec, or a framework that reports usage out of band) is reflected in + OpenInference token attributes and in OpenTelemetry and OpenInference cost + attributes, but not in ATIF step or final metrics. A future release will align + ATIF with the codec annotation. - ATOF streaming endpoints depend on collector availability. Failed endpoints are skipped or retried without blocking file output or other configured endpoints. diff --git a/docs/integrate-into-frameworks/provider-response-codecs.mdx b/docs/integrate-into-frameworks/provider-response-codecs.mdx index e70ed3d78..8cbfd9ebe 100644 --- a/docs/integrate-into-frameworks/provider-response-codecs.mdx +++ b/docs/integrate-into-frameworks/provider-response-codecs.mdx @@ -310,9 +310,92 @@ so provider-reported cost remains authoritative in the annotation. Observability exporters prefer an explicit cost in the raw payload, then normalized `Usage.cost`, then a derived estimate from model pricing. When cost is -available, ATIF step metrics and final metrics include `cost_usd`, -OpenInference includes the USD-denominated `llm.cost.total`, and OpenTelemetry -includes `nemo_relay.llm.cost.total` and `nemo_relay.llm.cost.currency`. +available, each exporter projects it per the [exporter field mapping](#exporter-field-mapping) +below. + +## Token and Cost Field Semantics + +This is the stable reference for the token and cost fields on LLM end events. +Token and cost data are reported per LLM call on `AnnotatedLlmResponse.usage`. +The only cross-call rollup is the ATIF trajectory `final_metrics`; OpenTelemetry +and OpenInference spans are per call. + +### Usage fields + +All fields are optional and unset when the provider omits them. `Usage` has no +catch-all, so provider usage fields that Relay does not model are not retained. + +| Field | Meaning | +|---|---| +| `prompt_tokens` | Input/prompt tokens. | +| `completion_tokens` | Output/completion tokens. For OpenAI Responses, reasoning tokens are included in this value rather than counted separately. | +| `total_tokens` | Provider-reported total, or codec-computed as `prompt + completion` when the provider omits it. Relay does not recompute it for every source. | +| `cache_read_tokens` | Prompt-cache read tokens, when the provider reports prompt caching. | +| `cache_write_tokens` | Prompt-cache write tokens (Anthropic-style providers). | +| `cost` | Normalized `CostEstimate`, when reported by the provider or estimable from configured pricing. | + +Built-in codecs normalize provider field names as follows: + +| Normalized field | OpenAI Chat | OpenAI Responses | Anthropic Messages | +|---|---|---|---| +| `prompt_tokens` | `prompt_tokens` | `input_tokens` | `input_tokens` | +| `completion_tokens` | `completion_tokens` | `output_tokens` | `output_tokens` | +| `total_tokens` | `total_tokens` | `total_tokens` | computed | +| `cache_read_tokens` | `prompt_tokens_details.cached_tokens` | `input_tokens_details.cached_tokens` | `cache_read_input_tokens` | +| `cache_write_tokens` | — | — | `cache_creation_input_tokens` | + +Built-in codecs preserve only modeled provider-specific usage details under +`api_specific`; other usage fields are dropped. For example, OpenAI Responses +reasoning token counts remain under `api_specific` (`output_tokens_details`), +while OpenAI Chat `completion_tokens_details` and unmodeled usage fields are not +retained. + +### Cost fields + +`CostEstimate` carries currency-neutral amounts plus provenance. See +[Cost Estimation](#cost-estimation) above for resolution order and pricing setup. + +| Field | Meaning | +|---|---| +| `total` | Optional total cost in `currency`. When absent, some exporters derive a total from the component amounts. | +| `currency` | ISO 4217 code; defaults to `USD`. | +| `input` / `output` / `cache_read` / `cache_write` | Per-category amounts in `currency`. | +| `source` | `provider_reported` (authoritative) or `model_pricing` (estimated). | +| `pricing_provider` / `pricing_model` / `pricing_as_of` / `pricing_source` | Estimate provenance, for auditing stale pricing. | + +Missing is not zero: an absent `cost` or token field means unknown, while an +explicit `0` is a reported value and is preserved. Relay does not convert +currencies. + +### Exporter field mapping + +Each exporter projects `usage`/`cost` differently; projections do not change the +canonical fields above. + +| | ATOF | ATIF step / `final_metrics` | OpenInference | OpenTelemetry | +|---|---|---|---|---| +| Prompt tokens | full `usage` preserved | `prompt_tokens` / `total_prompt_tokens` | `llm.token_count.prompt` | not emitted | +| Completion tokens | preserved | `completion_tokens` / `total_completion_tokens` | `llm.token_count.completion` | not emitted | +| Total tokens | preserved | in `metrics.extra` (no first-class field) | `llm.token_count.total` | not emitted | +| Cache read / write | preserved | summed into `cached_tokens` / `total_cached_tokens` | `llm.token_count.prompt_details.cache_read` / `…cache_write` | not emitted | +| Cost | full `cost` preserved | `cost_usd` / `total_cost_usd` (USD only) | `llm.cost.total` (USD only) | `nemo_relay.llm.cost.total` + `nemo_relay.llm.cost.currency` (any currency) | + +OpenTelemetry carries cost in any currency; ATIF and OpenInference report cost +only when it is USD-denominated and otherwise omit it. ATIF carries unmapped +usage keys, such as reasoning token counts, in `metrics.extra`. + +### Stability + +The `Usage` and `CostEstimate` field names and meanings, and the exporter +mappings above, are stable as of ATOF `0.1` (ATIF schema `ATIF-v1.7`, pricing +catalog `version: 1`). New optional fields may be added; renames or removals are +breaking and called out in release notes. Behavior that is intentional in this +release but may change later: OpenTelemetry emits cost only, not token counts; +ATIF and OpenInference report cost only in USD; ATIF derives metrics from the raw +event payload rather than the codec annotation (see +[Known Issues](/about-nemo-relay/release-notes/known-issues)); reasoning tokens +are not a first-class `Usage` field; and bindings expose `usage`/`cost` as +snake_case JSON rather than typed objects. ## Built-in Response Codecs diff --git a/docs/observability-plugin/atif.mdx b/docs/observability-plugin/atif.mdx index f509a5274..a5761d333 100644 --- a/docs/observability-plugin/atif.mdx +++ b/docs/observability-plugin/atif.mdx @@ -199,6 +199,13 @@ agent scope UUID. Each step's `extra.ancestry.function_id` is the event UUID, and `extra.ancestry.parent_id` is the parent event UUID. Trace spans expose the same values as `nemo_relay.uuid` and `nemo_relay.parent_uuid` attributes. +Each step's `metrics` carries `prompt_tokens`, `completion_tokens`, `cached_tokens` +(cache read + write), and USD `cost_usd`; the trajectory `final_metrics` sums these +as `total_*`. See +[Token and Cost Field Semantics](/integrate-into-frameworks/provider-response-codecs#token-and-cost-field-semantics) +for the full mapping, including how ATIF sources these values from the raw event +payload. + ATIF is a trajectory projection over NeMo Relay events. It should preserve the meaning of scope parentage, event UUIDs, codec annotations, and exporter-local lineage rules without becoming the source of truth for runtime ownership, diff --git a/docs/observability-plugin/openinference.mdx b/docs/observability-plugin/openinference.mdx index 3bad14871..f3b0f221f 100644 --- a/docs/observability-plugin/openinference.mdx +++ b/docs/observability-plugin/openinference.mdx @@ -87,6 +87,12 @@ attributes. These values match ATIF `step.extra.ancestry.function_id` and the root agent span's `nemo_relay.uuid` also matches the ATIF `session_id`. Backend-native `trace_id` and `span_id` values are not written into ATIF. +LLM token counts appear as `llm.token_count.prompt`, `llm.token_count.completion`, +`llm.token_count.total`, and `llm.token_count.prompt_details.cache_read`/`cache_write`; +cost appears as USD-denominated `llm.cost.total`. See +[Token and Cost Field Semantics](/integrate-into-frameworks/provider-response-codecs#token-and-cost-field-semantics) +for the full mapping. + Redact sensitive event payloads with sanitize guardrails before production export. diff --git a/docs/observability-plugin/opentelemetry.mdx b/docs/observability-plugin/opentelemetry.mdx index c26ade52a..fd3405157 100644 --- a/docs/observability-plugin/opentelemetry.mdx +++ b/docs/observability-plugin/opentelemetry.mdx @@ -73,6 +73,12 @@ attributes. These values match ATIF `step.extra.ancestry.function_id` and the root agent span's `nemo_relay.uuid` also matches the ATIF `session_id`. Backend-native `trace_id` and `span_id` values are not written into ATIF. +For LLM end spans, cost is emitted as `nemo_relay.llm.cost.total` and +`nemo_relay.llm.cost.currency` (any currency). Token counts are not emitted as +discrete attributes. See +[Token and Cost Field Semantics](/integrate-into-frameworks/provider-response-codecs#token-and-cost-field-semantics) +for the full mapping. + Register the plugin before the first instrumented request, use stable service identity fields, keep credentials outside source code, and flush during graceful shutdown.