From 1b5813ccff98c9f402ba542ccf4ea8470b74fdfe Mon Sep 17 00:00:00 2001
From: Zhongxuan Wang <daniewang@nvidia.com>
Date: Thu, 25 Jun 2026 17:45:32 -0700
Subject: [PATCH] docs: document and stabilize token/cost field semantics
 (RELAY-243)

Add a canonical "Token and Cost Field Semantics" section to the provider response codecs page: a Usage and CostEstimate field reference, the per-provider token normalization table, an exporter field-mapping table (ATOF/ATIF/OpenInference/OpenTelemetry), and a stability contract. Add brief field pointers from the OpenTelemetry, OpenInference, and ATIF exporter pages, and a Known Issues entry noting ATIF derives token/cost from the raw event payload rather than the codec annotation.

Lock the contract with two characterization tests: OpenTelemetry LLM end events emit cost only (no token-count or gen_ai attributes), and Usage ignores unmodeled provider subfields. Existing tests already cover the other projections and the USD-only/currency-aware cost behavior.

No runtime behavior change.

Signed-off-by: Zhongxuan Wang <daniewang@nvidia.com>
---
 .../core/tests/unit/codec/response_tests.rs   | 15 ++++
 .../tests/unit/observability/otel_tests.rs    | 37 ++++++++
 .../release-notes/known-issues.mdx            |  7 ++
 .../provider-response-codecs.mdx              | 89 ++++++++++++++++++-
 docs/observability-plugin/atif.mdx            |  7 ++
 docs/observability-plugin/openinference.mdx   |  6 ++
 docs/observability-plugin/opentelemetry.mdx   |  6 ++
 7 files changed, 164 insertions(+), 3 deletions(-)

diff --git a/crates/core/tests/unit/codec/response_tests.rs b/crates/core/tests/unit/codec/response_tests.rs
index c51ed246c..17b23c1e3 100644
--- a/crates/core/tests/unit/codec/response_tests.rs
+++ b/crates/core/tests/unit/codec/response_tests.rs
@@ -1456,6 +1456,21 @@ fn test_unknown_model_pricing_returns_none_without_blocking_usage() {
     assert_eq!(usage.prompt_tokens, Some(1_000));
 }
 
+#[test]
+fn test_usage_ignores_unmodeled_provider_subfields() {
+    // Usage has no serde catch-all: an unknown key must not survive a serde round trip.
+    let usage: Usage = serde_json::from_value(json!({
+        "prompt_tokens": 5,
+        "completion_tokens": 7,
+        "some_future_field": 99
+    }))
+    .unwrap();
+    assert_eq!(usage.prompt_tokens, Some(5));
+    assert_eq!(usage.completion_tokens, Some(7));
+    let reserialized = serde_json::to_value(&usage).unwrap();
+    assert!(reserialized.get("some_future_field").is_none());
+}
+
 // -------------------------------------------------------------------
 // FinishReason serialization
 // -------------------------------------------------------------------
diff --git a/crates/core/tests/unit/observability/otel_tests.rs b/crates/core/tests/unit/observability/otel_tests.rs
index 661fe660a..7927fbb1c 100644
--- a/crates/core/tests/unit/observability/otel_tests.rs
+++ b/crates/core/tests/unit/observability/otel_tests.rs
@@ -971,6 +971,43 @@ fn llm_end_with_unannotated_openai_response_uses_codec_cost() {
     );
 }
 
+#[test]
+fn llm_end_emits_cost_only_no_token_or_gen_ai_attributes() {
+    let _pricing_guard = pricing_test_mutex().lock().unwrap();
+    install_openai_disambiguation_pricing("priced-model");
+    let _reset_guard = ResetPricingResolverGuard;
+
+    let event = make_end_event(
+        Uuid::now_v7(),
+        None,
+        "other",
+        ScopeType::Llm,
+        Some(openai_chat_provider_response("priced-model")),
+    );
+
+    let attributes = attr_map(&end_attributes(&event));
+
+    // Cost (total + currency, hence exactly 2 keys) is the only nemo_relay.llm.* surface;
+    // token counts are not emitted as attributes, and no foreign gen_ai.* namespace is used.
+    let llm_keys: Vec<&String> = attributes
+        .keys()
+        .filter(|key| key.starts_with("nemo_relay.llm."))
+        .collect();
+    assert_eq!(
+        llm_keys.len(),
+        2,
+        "unexpected nemo_relay.llm.* keys: {llm_keys:?}"
+    );
+    assert!(attributes.contains_key("nemo_relay.llm.cost.total"));
+    assert!(attributes.contains_key("nemo_relay.llm.cost.currency"));
+    assert!(
+        attributes
+            .keys()
+            .all(|key| !key.starts_with("gen_ai") && !key.contains("token_count")),
+        "no gen_ai.* or token_count attributes expected"
+    );
+}
+
 #[test]
 fn llm_end_with_unpriced_response_model_uses_requested_model_cost() {
     let _pricing_guard = pricing_test_mutex().lock().unwrap();
diff --git a/docs/about-nemo-relay/release-notes/known-issues.mdx b/docs/about-nemo-relay/release-notes/known-issues.mdx
index 8ead12a6f..550b8ba9c 100644
--- a/docs/about-nemo-relay/release-notes/known-issues.mdx
+++ b/docs/about-nemo-relay/release-notes/known-issues.mdx
@@ -42,6 +42,13 @@ and limitations apply to NeMo Relay 0.4:
 - Pricing estimates depend on configured pricing sources and the freshness of
   the source catalog. Unknown model pricing and missing token data leave cost
   absent instead of defaulting to zero.
+- ATIF token and cost metrics are derived from the raw LLM event payload, not
+  from the codec-normalized `annotated_response.usage`. A managed call whose
+  usage or cost exists only in the codec annotation (for example, a custom
+  response codec, or a framework that reports usage out of band) is reflected in
+  OpenInference token attributes and in OpenTelemetry and OpenInference cost
+  attributes, but not in ATIF step or final metrics. A future release will align
+  ATIF with the codec annotation.
 - ATOF streaming endpoints depend on collector availability. Failed endpoints
   are skipped or retried without blocking file output or other configured
   endpoints.
diff --git a/docs/integrate-into-frameworks/provider-response-codecs.mdx b/docs/integrate-into-frameworks/provider-response-codecs.mdx
index e70ed3d78..8cbfd9ebe 100644
--- a/docs/integrate-into-frameworks/provider-response-codecs.mdx
+++ b/docs/integrate-into-frameworks/provider-response-codecs.mdx
@@ -310,9 +310,92 @@ so provider-reported cost remains authoritative in the annotation.
 
 Observability exporters prefer an explicit cost in the raw payload, then
 normalized `Usage.cost`, then a derived estimate from model pricing. When cost is
-available, ATIF step metrics and final metrics include `cost_usd`,
-OpenInference includes the USD-denominated `llm.cost.total`, and OpenTelemetry
-includes `nemo_relay.llm.cost.total` and `nemo_relay.llm.cost.currency`.
+available, each exporter projects it per the [exporter field mapping](#exporter-field-mapping)
+below.
+
+## Token and Cost Field Semantics
+
+This is the stable reference for the token and cost fields on LLM end events.
+Token and cost data are reported per LLM call on `AnnotatedLlmResponse.usage`.
+The only cross-call rollup is the ATIF trajectory `final_metrics`; OpenTelemetry
+and OpenInference spans are per call.
+
+### Usage fields
+
+All fields are optional and, unless noted in the table below, unset when the provider omits them. `Usage` has no
+catch-all, so provider usage fields that Relay does not model are not retained.
+
+| Field | Meaning |
+|---|---|
+| `prompt_tokens` | Input/prompt tokens. |
+| `completion_tokens` | Output/completion tokens. For OpenAI Responses, reasoning tokens are included in this value rather than counted separately. |
+| `total_tokens` | Provider-reported total, or codec-computed as `prompt + completion` when the provider omits it. Relay does not recompute it for every source. |
+| `cache_read_tokens` | Prompt-cache read tokens, when the provider reports prompt caching. |
+| `cache_write_tokens` | Prompt-cache write tokens (Anthropic-style providers). |
+| `cost` | Normalized `CostEstimate`, when reported by the provider or estimable from configured pricing. |
+
+Built-in codecs normalize provider field names as follows:
+
+| Normalized field | OpenAI Chat | OpenAI Responses | Anthropic Messages |
+|---|---|---|---|
+| `prompt_tokens` | `prompt_tokens` | `input_tokens` | `input_tokens` |
+| `completion_tokens` | `completion_tokens` | `output_tokens` | `output_tokens` |
+| `total_tokens` | `total_tokens` | `total_tokens` | computed |
+| `cache_read_tokens` | `prompt_tokens_details.cached_tokens` | `input_tokens_details.cached_tokens` | `cache_read_input_tokens` |
+| `cache_write_tokens` | — | — | `cache_creation_input_tokens` |
+
+Built-in codecs preserve only modeled provider-specific usage details under
+`api_specific`; other usage fields are dropped. For example, OpenAI Responses
+reasoning token counts remain under `api_specific` (`output_tokens_details`),
+while OpenAI Chat `completion_tokens_details` and unmodeled usage fields are not
+retained.
+
+### Cost fields
+
+`CostEstimate` carries currency-neutral amounts plus provenance. See
+[Cost Estimation](#cost-estimation) above for resolution order and pricing setup.
+
+| Field | Meaning |
+|---|---|
+| `total` | Optional total cost in `currency`. When absent, some exporters derive a total from the component amounts. |
+| `currency` | ISO 4217 code; defaults to `USD`. |
+| `input` / `output` / `cache_read` / `cache_write` | Per-category amounts in `currency`. |
+| `source` | `provider_reported` (authoritative) or `model_pricing` (estimated). |
+| `pricing_provider` / `pricing_model` / `pricing_as_of` / `pricing_source` | Estimate provenance, for auditing stale pricing. |
+
+Missing is not zero: an absent `cost` or token field means unknown, while an
+explicit `0` is a reported value and is preserved. Relay does not convert
+currencies.
+
+### Exporter field mapping
+
+Each exporter projects `usage`/`cost` differently; projections do not change the
+canonical fields above.
+
+| | ATOF | ATIF step / `final_metrics` | OpenInference | OpenTelemetry |
+|---|---|---|---|---|
+| Prompt tokens | full `usage` preserved | `prompt_tokens` / `total_prompt_tokens` | `llm.token_count.prompt` | not emitted |
+| Completion tokens | preserved | `completion_tokens` / `total_completion_tokens` | `llm.token_count.completion` | not emitted |
+| Total tokens | preserved | in `metrics.extra` (no first-class field) | `llm.token_count.total` | not emitted |
+| Cache read / write | preserved | summed into `cached_tokens` / `total_cached_tokens` | `llm.token_count.prompt_details.cache_read` / `llm.token_count.prompt_details.cache_write` | not emitted |
+| Cost | full `cost` preserved | `cost_usd` / `total_cost_usd` (USD only) | `llm.cost.total` (USD only) | `nemo_relay.llm.cost.total` + `nemo_relay.llm.cost.currency` (any currency) |
+
+OpenTelemetry carries cost in any currency; ATIF and OpenInference report cost
+only when it is USD-denominated and otherwise omit it. ATIF carries unmapped
+usage keys, such as reasoning token counts, in `metrics.extra`.
+
+### Stability
+
+The `Usage` and `CostEstimate` field names and meanings, and the exporter
+mappings above, are stable as of ATOF `0.1` (ATIF schema `ATIF-v1.7`, pricing
+catalog `version: 1`). New optional fields may be added; renames or removals are
+breaking and called out in release notes. Behavior that is intentional in this
+release but may change later: OpenTelemetry emits cost only, not token counts;
+ATIF and OpenInference report cost only in USD; ATIF derives metrics from the raw
+event payload rather than the codec annotation (see
+[Known Issues](/about-nemo-relay/release-notes/known-issues)); reasoning tokens
+are not a first-class `Usage` field; and bindings expose `usage`/`cost` as
+snake_case JSON rather than typed objects.
 
 ## Built-in Response Codecs
 
diff --git a/docs/observability-plugin/atif.mdx b/docs/observability-plugin/atif.mdx
index f509a5274..a5761d333 100644
--- a/docs/observability-plugin/atif.mdx
+++ b/docs/observability-plugin/atif.mdx
@@ -199,6 +199,13 @@ agent scope UUID. Each step's `extra.ancestry.function_id` is the event UUID,
 and `extra.ancestry.parent_id` is the parent event UUID. Trace spans expose the
 same values as `nemo_relay.uuid` and `nemo_relay.parent_uuid` attributes.
 
+Each step's `metrics` carries `prompt_tokens`, `completion_tokens`, `cached_tokens`
+(cache read + write), and `cost_usd` (USD-denominated cost only); the trajectory
+`final_metrics` sums these as `total_*`. See
+[Token and Cost Field Semantics](/integrate-into-frameworks/provider-response-codecs#token-and-cost-field-semantics)
+for the full mapping, including how ATIF sources these values from the raw event
+payload.
+
 ATIF is a trajectory projection over NeMo Relay events. It should preserve the
 meaning of scope parentage, event UUIDs, codec annotations, and exporter-local
 lineage rules without becoming the source of truth for runtime ownership,
diff --git a/docs/observability-plugin/openinference.mdx b/docs/observability-plugin/openinference.mdx
index 3bad14871..f3b0f221f 100644
--- a/docs/observability-plugin/openinference.mdx
+++ b/docs/observability-plugin/openinference.mdx
@@ -87,6 +87,12 @@ attributes. These values match ATIF `step.extra.ancestry.function_id` and
 the root agent span's `nemo_relay.uuid` also matches the ATIF `session_id`.
 Backend-native `trace_id` and `span_id` values are not written into ATIF.
 
+LLM token counts appear as `llm.token_count.prompt`, `llm.token_count.completion`,
+`llm.token_count.total`, `llm.token_count.prompt_details.cache_read`, and `llm.token_count.prompt_details.cache_write`;
+cost appears as USD-denominated `llm.cost.total`. See
+[Token and Cost Field Semantics](/integrate-into-frameworks/provider-response-codecs#token-and-cost-field-semantics)
+for the full mapping.
+
 Redact sensitive event payloads with sanitize guardrails before production
 export.
 
diff --git a/docs/observability-plugin/opentelemetry.mdx b/docs/observability-plugin/opentelemetry.mdx
index c26ade52a..fd3405157 100644
--- a/docs/observability-plugin/opentelemetry.mdx
+++ b/docs/observability-plugin/opentelemetry.mdx
@@ -73,6 +73,12 @@ attributes. These values match ATIF `step.extra.ancestry.function_id` and
 the root agent span's `nemo_relay.uuid` also matches the ATIF `session_id`.
 Backend-native `trace_id` and `span_id` values are not written into ATIF.
 
+For LLM end spans, cost is emitted as `nemo_relay.llm.cost.total` and
+`nemo_relay.llm.cost.currency` (any currency). Token counts are not emitted as
+discrete attributes. See
+[Token and Cost Field Semantics](/integrate-into-frameworks/provider-response-codecs#token-and-cost-field-semantics)
+for the full mapping.
+
 Register the plugin before the first instrumented request, use stable service
 identity fields, keep credentials outside source code, and flush during
 graceful shutdown.