Skip to content

Commit dc0c8de

Browse files
committed
feat: add back basic metrics to multipath
1 parent 1a7a88b commit dc0c8de

File tree

2 files changed

+90
-50
lines changed

2 files changed

+90
-50
lines changed

iroh/src/magicsock/endpoint_map/endpoint_state.rs

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -243,11 +243,7 @@ impl EndpointStateActor {
243243
self.handle_path_event(id, evt);
244244
}
245245
Some(conn_id) = self.connections_close.next(), if !self.connections_close.is_empty() => {
246-
self.connections.remove(&conn_id);
247-
if self.connections.is_empty() {
248-
trace!("last connection closed - clearing selected_path");
249-
self.selected_path.set(None).ok();
250-
}
246+
self.handle_connection_close(conn_id);
251247
}
252248
_ = self.local_addrs.updated() => {
253249
trace!("local addrs updated, triggering holepunching");
@@ -369,6 +365,7 @@ impl EndpointStateActor {
369365
) {
370366
let pub_open_paths = Watchable::default();
371367
if let Some(conn) = handle.upgrade() {
368+
self.metrics.num_conns_opened.inc();
372369
// Remove any conflicting stable_ids from the local state.
373370
let conn_id = ConnId(conn.stable_id());
374371
self.connections.remove(&conn_id);
@@ -387,6 +384,7 @@ impl EndpointStateActor {
387384
paths: Default::default(),
388385
open_paths: Default::default(),
389386
path_ids: Default::default(),
387+
transport_summary: TransportSummary::default(),
390388
})
391389
.into_mut();
392390

@@ -571,6 +569,17 @@ impl EndpointStateActor {
571569
tx.send(rtt).ok();
572570
}
573571

572+
fn handle_connection_close(&mut self, conn_id: ConnId) {
573+
if let Some(state) = self.connections.remove(&conn_id) {
574+
self.metrics.num_conns_closed.inc();
575+
state.transport_summary.record(&self.metrics);
576+
}
577+
if self.connections.is_empty() {
578+
trace!("last connection closed - clearing selected_path");
579+
self.selected_path.set(None).ok();
580+
}
581+
}
582+
574583
/// Triggers holepunching to the remote endpoint.
575584
///
576585
/// This will manage the entire process of holepunching with the remote endpoint.
@@ -1114,6 +1123,8 @@ struct ConnectionState {
11141123
open_paths: FxHashMap<PathId, transports::Addr>,
11151124
/// Reverse map of [`Self::paths`].
11161125
path_ids: FxHashMap<transports::Addr, PathId>,
1126+
/// Summary over transports used in this connection, for metrics tracking.
1127+
transport_summary: TransportSummary,
11171128
}
11181129

11191130
impl ConnectionState {
@@ -1125,10 +1136,10 @@ impl ConnectionState {
11251136

11261137
/// Tracks an open path for the connection.
11271138
fn add_open_path(&mut self, remote: transports::Addr, path_id: PathId) {
1139+
self.transport_summary.add_path(&remote);
11281140
self.paths.insert(path_id, remote.clone());
11291141
self.open_paths.insert(path_id, remote.clone());
11301142
self.path_ids.insert(remote, path_id);
1131-
11321143
self.update_pub_path_info();
11331144
}
11341145

@@ -1372,3 +1383,41 @@ impl Future for OnClosed {
13721383
Poll::Ready(self.conn_id)
13731384
}
13741385
}
1386+
1387+
/// Used for metrics tracking.
1388+
#[derive(Debug, Clone, Copy, Default)]
1389+
enum TransportSummary {
1390+
#[default]
1391+
None,
1392+
IpOnly,
1393+
RelayOnly,
1394+
IpAndRelay,
1395+
}
1396+
1397+
impl TransportSummary {
1398+
fn add_path(&mut self, addr: &transports::Addr) {
1399+
use transports::Addr;
1400+
*self = match (*self, addr) {
1401+
(TransportSummary::None | TransportSummary::IpOnly, Addr::Ip(_)) => Self::IpOnly,
1402+
(TransportSummary::None | TransportSummary::RelayOnly, Addr::Relay(_, _)) => {
1403+
Self::RelayOnly
1404+
}
1405+
_ => Self::IpAndRelay,
1406+
}
1407+
}
1408+
1409+
fn record(&self, metrics: &MagicsockMetrics) {
1410+
match self {
1411+
TransportSummary::IpOnly => {
1412+
metrics.num_conns_transport_ip_only.inc();
1413+
}
1414+
TransportSummary::RelayOnly => {
1415+
metrics.num_conns_transport_relay_only.inc();
1416+
}
1417+
TransportSummary::IpAndRelay => {
1418+
metrics.num_conns_transport_ip_and_relay.inc();
1419+
}
1420+
TransportSummary::None => {}
1421+
}
1422+
}
1423+
}

iroh/src/magicsock/metrics.rs

Lines changed: 35 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use iroh_metrics::{Counter, Histogram, MetricsGroup};
1+
use iroh_metrics::{Counter, MetricsGroup};
22
use serde::{Deserialize, Serialize};
33

44
/// Enum of metrics for the module
@@ -14,11 +14,8 @@ pub struct Metrics {
1414
pub send_ipv4: Counter,
1515
pub send_ipv6: Counter,
1616
pub send_relay: Counter,
17-
pub send_relay_error: Counter,
1817

1918
// Data packets (non-disco)
20-
pub send_data: Counter,
21-
pub send_data_network_down: Counter,
2219
pub recv_data_relay: Counter,
2320
pub recv_data_ipv4: Counter,
2421
pub recv_data_ipv6: Counter,
@@ -50,15 +47,20 @@ pub struct Metrics {
5047

5148
/*
5249
* Connection Metrics
50+
*
51+
* These all only count connections that completed the TLS handshake successfully. This means
52+
* that short-lived 0-RTT connections are potentially not included in these counts.
5353
*/
54-
/// The number of direct connections we have made to peers.
55-
pub num_direct_conns_added: Counter,
56-
/// The number of direct connections we have lost to peers.
57-
pub num_direct_conns_removed: Counter,
58-
/// The number of connections to peers we have added over relay.
59-
pub num_relay_conns_added: Counter,
60-
/// The number of connections to peers we have removed over relay.
61-
pub num_relay_conns_removed: Counter,
54+
/// Number of connections opened (only handshaked connections are counted).
55+
pub num_conns_opened: Counter,
56+
/// Number of connections closed (only handshaked connections are counted).
57+
pub num_conns_closed: Counter,
58+
/// Number of connections that had only relay paths over their lifetime.
59+
pub num_conns_transport_relay_only: Counter,
60+
/// Number of connections that had only IP paths over their lifetime.
61+
pub num_conns_transport_ip_only: Counter,
62+
/// Number of connections that had both IP and relay paths.
63+
pub num_conns_transport_ip_and_relay: Counter,
6264

6365
pub actor_tick_main: Counter,
6466
pub actor_tick_msg: Counter,
@@ -67,36 +69,25 @@ pub struct Metrics {
6769
pub actor_tick_direct_addr_heartbeat: Counter,
6870
pub actor_link_change: Counter,
6971
pub actor_tick_other: Counter,
70-
71-
/// Number of endpoints we have attempted to contact.
72-
pub endpoints_contacted: Counter,
73-
/// Number of endpoints we have managed to contact directly.
74-
pub endpoints_contacted_directly: Counter,
75-
76-
/// Number of connections with a successful handshake.
77-
pub connection_handshake_success: Counter,
78-
/// Number of connections with a successful handshake that became direct.
79-
pub connection_became_direct: Counter,
80-
/// Histogram of connection latency in milliseconds across all endpoint connections.
81-
#[default(Histogram::new(vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, f64::INFINITY]))]
82-
pub connection_latency_ms: Histogram,
83-
84-
/*
85-
* Path Congestion Metrics
86-
*/
87-
/// Number of times a path was marked as outdated due to consecutive ping failures.
88-
pub path_marked_outdated: Counter,
89-
/// Number of ping failures recorded across all paths.
90-
pub path_ping_failures: Counter,
91-
/// Number of consecutive failure resets (path recovered).
92-
pub path_failure_resets: Counter,
93-
/// Histogram of packet loss rates (0.0-1.0) observed on UDP paths.
94-
#[default(Histogram::new(vec![0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0]))]
95-
pub path_packet_loss_rate: Histogram,
96-
/// Histogram of RTT variance (in milliseconds) as a congestion indicator.
97-
#[default(Histogram::new(vec![0.0, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0]))]
98-
pub path_rtt_variance_ms: Histogram,
99-
/// Histogram of path quality scores (0.0-1.0).
100-
#[default(Histogram::new(vec![0.0, 0.3, 0.5, 0.7, 0.85, 0.95, 1.0]))]
101-
pub path_quality_score: Histogram,
72+
// /// Histogram of connection latency in milliseconds across all endpoint connections.
73+
// #[default(Histogram::new(vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, f64::INFINITY]))]
74+
// pub connection_latency_ms: Histogram,
75+
// /*
76+
// * Path Congestion Metrics
77+
// */
78+
// /// Number of times a path was marked as outdated due to consecutive ping failures.
79+
// pub path_marked_outdated: Counter,
80+
// /// Number of ping failures recorded across all paths.
81+
// pub path_ping_failures: Counter,
82+
// /// Number of consecutive failure resets (path recovered).
83+
// pub path_failure_resets: Counter,
84+
// /// Histogram of packet loss rates (0.0-1.0) observed on UDP paths.
85+
// #[default(Histogram::new(vec![0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0]))]
86+
// pub path_packet_loss_rate: Histogram,
87+
// /// Histogram of RTT variance (in milliseconds) as a congestion indicator.
88+
// #[default(Histogram::new(vec![0.0, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0]))]
89+
// pub path_rtt_variance_ms: Histogram,
90+
// /// Histogram of path quality scores (0.0-1.0).
91+
// #[default(Histogram::new(vec![0.0, 0.3, 0.5, 0.7, 0.85, 0.95, 1.0]))]
92+
// pub path_quality_score: Histogram,
10293
}

0 commit comments

Comments
 (0)