Skip to content

Commit c893a7f

Browse files
fix(tracing-limit): group info!(message = "foo") and info!("foo") in same bucket
1 parent 16429fa commit c893a7f

File tree

1 file changed

+73
-36
lines changed

1 file changed

+73
-36
lines changed

lib/tracing-limit/src/lib.rs

Lines changed: 73 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,17 @@
1616
//! # Rate limit grouping
1717
//!
1818
//! Events are rate limited independently based on a combination of:
19-
//! - **Callsite**: The code location where the log statement appears
19+
//! - **Message content**: The actual log message text
2020
//! - **Contextual fields**: Any fields attached to the event or its parent spans
2121
//!
2222
//! ## How fields contribute to grouping
2323
//!
24-
//! **Only these fields create distinct rate limit groups:**
24+
//! **These fields create distinct rate limit groups:**
25+
//! - `message` - Different messages are rate limited independently
2526
//! - `component_id` - Different components are rate limited independently
2627
//!
2728
//! **All other fields are ignored for grouping**, including:
2829
//! - `fanout_id`, `input_id`, `output_id` - Not used for grouping to avoid resource/cost implications from high-cardinality tags
29-
//! - `message` - The log message itself doesn't differentiate groups
3030
//! - `internal_log_rate_limit` - Control field for enabling/disabling rate limiting
3131
//! - `internal_log_rate_secs` - Control field for customizing the rate limit window
3232
//! - Any custom fields you add
@@ -39,42 +39,46 @@
3939
//! info!(component_id = "transform_2", "Processing event"); // Group B
4040
//! // Even though the message is identical, these are rate limited independently
4141
//!
42-
//! // Example 2: Only component_id matters for grouping
43-
//! info!(component_id = "router", fanout_id = "output_1", "Routing event"); // Group C
44-
//! info!(component_id = "router", fanout_id = "output_2", "Routing event"); // Group C (same group!)
45-
//! info!(component_id = "router", fanout_id = "output_1", "Routing event"); // Group C (same group!)
46-
//! info!(component_id = "router", fanout_id = "output_1", input_id = "kafka", "Routing event"); // Group C (same!)
47-
//! // All of these share the same group because they have the same component_id
42+
//! // Example 2: Same message with same component_id = same group
43+
//! info!(message = "Hello", component_id = "router"); // Group C
44+
//! info!("Hello", component_id = "router"); // Group C
45+
//! // Both forms of logging with the same message are now treated identically
46+
//!
47+
//! // Example 3: Only component_id and message matter for grouping
48+
//! info!(component_id = "router", fanout_id = "output_1", "Routing event"); // Group D
49+
//! info!(component_id = "router", fanout_id = "output_2", "Routing event"); // Group D
50+
//! info!(component_id = "router", input_id = "kafka", "Routing event"); // Group D
51+
//! // All of these share the same group because they have the same component_id and message
4852
//! // The fanout_id and input_id fields are ignored to avoid resource/cost implications
4953
//!
50-
//! // Example 3: Span fields contribute to grouping
54+
//! // Example 4: Different messages create different groups
55+
//! info!(component_id = "router", "Message A"); // Group E
56+
//! info!(component_id = "router", "Message B"); // Group F
57+
//! // Different messages are rate limited independently
58+
//!
59+
//! // Example 5: Span fields contribute to grouping
5160
//! let span = info_span!("process", component_id = "transform_1");
5261
//! let _enter = span.enter();
53-
//! info!("Processing event"); // Group E: callsite + component_id from span
62+
//! info!("Processing event"); // Group G: message + component_id from span
5463
//! drop(_enter);
5564
//!
5665
//! let span = info_span!("process", component_id = "transform_2");
5766
//! let _enter = span.enter();
58-
//! info!("Processing event"); // Group F: same callsite but different component_id
67+
//! info!("Processing event"); // Group H: same message but different component_id
5968
//!
60-
//! // Example 4: Nested spans - child span fields take precedence
69+
//! // Example 6: Nested spans - child span fields take precedence
6170
//! let outer = info_span!("outer", component_id = "parent");
6271
//! let _outer_guard = outer.enter();
6372
//! let inner = info_span!("inner", component_id = "child");
6473
//! let _inner_guard = inner.enter();
65-
//! info!("Nested event"); // Grouped by component_id = "child"
66-
//!
67-
//! // Example 5: Same callsite with no fields = single rate limit group
68-
//! info!("Simple message"); // Group G
69-
//! info!("Simple message"); // Group G
70-
//! info!("Simple message"); // Group G
74+
//! info!("Nested event"); // Grouped by component_id = "child" + message = "Nested event"
7175
//!
72-
//! // Example 6: Custom fields are ignored for grouping
73-
//! info!(component_id = "source", input_id = "in_1", "Received data"); // Group H
74-
//! info!(component_id = "source", input_id = "in_2", "Received data"); // Group H (same group!)
75-
//! // The input_id field is ignored - only component_id matters
76+
//! // Example 7: Same message with no component_id = single rate limit group
77+
//! info!("Simple message"); // Group I
78+
//! info!("Simple message"); // Group I
79+
//! info!("Simple message"); // Group I
7680
//!
77-
//! // Example 7: Disabling rate limiting for specific logs
81+
//! // Example 8: Disabling rate limiting for specific logs
7882
//! // Rate limiting is ON by default - explicitly disable for important logs
7983
//! warn!(
8084
//! component_id = "critical_component",
@@ -83,7 +87,7 @@
8387
//! );
8488
//! // This event will NEVER be rate limited, regardless of how often it fires
8589
//!
86-
//! // Example 8: Custom rate limit window for specific events
90+
//! // Example 9: Custom rate limit window for specific events
8791
//! info!(
8892
//! component_id = "noisy_component",
8993
//! message = "Frequent status update",
@@ -92,15 +96,14 @@
9296
//! // Override the default window for this specific log
9397
//! ```
9498
//!
95-
//! This ensures logs from different components are rate limited independently,
99+
//! This ensures logs from different components and with different messages are rate limited independently,
96100
//! while avoiding resource/cost implications from high-cardinality tags.
97101
98102
use std::fmt;
99103

100104
use dashmap::DashMap;
101105
use tracing_core::{
102106
Event, Metadata, Subscriber,
103-
callsite::Identifier,
104107
field::{Field, Value, Visit, display},
105108
span,
106109
subscriber::Interest,
@@ -127,7 +130,6 @@ const COMPONENT_ID_FIELD: &str = "component_id";
127130

128131
#[derive(Eq, PartialEq, Hash, Clone)]
129132
struct RateKeyIdentifier {
130-
callsite: Identifier,
131133
rate_limit_key_values: RateLimitedSpanKeys,
132134
}
133135

@@ -243,16 +245,16 @@ where
243245
// Build a composite key from event fields and span context to determine the rate limit group.
244246
// This multi-step process ensures we capture all relevant contextual information:
245247
//
246-
// 1. Start with event-level fields (e.g., fields directly on the log macro call)
248+
// 1. Start with event-level fields (e.g., message and fields directly on the log macro call)
247249
// 2. Walk up the span hierarchy from root to current span
248250
// 3. Merge in fields from each span, with child spans taking precedence
249251
//
250252
// This means an event's rate limit group is determined by the combination of:
251-
// - Its callsite (handled separately via RateKeyIdentifier)
252-
// - All contextual fields from both the event and its span ancestry
253+
// - Its message content (ensures info!(message = "foo") and info!("foo") are grouped together)
254+
// - All contextual fields from both the event and its span ancestry (e.g., component_id)
253255
//
254-
// Example: The same `info!("msg")` callsite in different component contexts becomes
255-
// distinct rate limit groups, allowing fine-grained control over log flooding.
256+
// Example: The same message in different component contexts becomes distinct rate limit groups,
257+
// allowing fine-grained control over log flooding.
256258
let rate_limit_key_values = {
257259
let mut keys = RateLimitedSpanKeys::default();
258260
// Capture fields directly on this event
@@ -275,7 +277,6 @@ where
275277
// not, we'll initialize an entry for it.
276278
let metadata = event.metadata();
277279
let id = RateKeyIdentifier {
278-
callsite: metadata.callsite(),
279280
rate_limit_key_values,
280281
};
281282

@@ -480,25 +481,32 @@ impl From<String> for TraceValue {
480481
///
481482
/// **Tracked fields** (only these create distinct rate limit groups):
482483
/// - `component_id` - Different components are rate limited independently
484+
/// - `message` - Different messages are rate limited independently
483485
///
484486
/// **Ignored fields**: All other fields are ignored for grouping purposes. This avoids resource/cost implications from high-cardinality tags.
485487
/// ```
486488
#[derive(Default, Eq, PartialEq, Hash, Clone)]
487489
struct RateLimitedSpanKeys {
488490
component_id: Option<TraceValue>,
491+
message: Option<TraceValue>,
489492
}
490493

491494
impl RateLimitedSpanKeys {
492495
fn record(&mut self, field: &Field, value: TraceValue) {
493-
if field.name() == COMPONENT_ID_FIELD {
494-
self.component_id = Some(value);
496+
match field.name() {
497+
COMPONENT_ID_FIELD => self.component_id = Some(value),
498+
MESSAGE_FIELD => self.message = Some(value),
499+
_ => {}
495500
}
496501
}
497502

498503
fn merge(&mut self, other: &Self) {
499504
if let Some(component_id) = &other.component_id {
500505
self.component_id = Some(component_id.clone());
501506
}
507+
if let Some(message) = &other.message {
508+
self.message = Some(message.clone());
509+
}
502510
}
503511
}
504512

@@ -1052,4 +1060,33 @@ mod test {
10521060
]
10531061
);
10541062
}
1063+
1064+
#[test]
1065+
#[serial]
1066+
fn message_field_explicit_vs_implicit_same_bucket() {
1067+
// info!(message = "Hello") and info!("Hello") should be grouped under the same bucket
1068+
let (events, sub) = setup_test(1);
1069+
tracing::subscriber::with_default(sub, || {
1070+
for _ in 0..30 {
1071+
info!(message = "Hello");
1072+
info!("Hello");
1073+
}
1074+
});
1075+
1076+
let events = events.lock().unwrap();
1077+
1078+
assert_eq!(
1079+
events.len(),
1080+
2,
1081+
"Expected 2 events (1 message + 1 suppression warning), got {}",
1082+
events.len()
1083+
);
1084+
1085+
assert_eq!(events[0].message, "Hello");
1086+
assert!(
1087+
events[1]
1088+
.message
1089+
.contains("is being suppressed to avoid flooding")
1090+
);
1091+
}
10551092
}

0 commit comments

Comments
 (0)