Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 35 additions & 44 deletions iroh/src/magicsock/metrics.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use iroh_metrics::{Counter, Histogram, MetricsGroup};
use iroh_metrics::{Counter, MetricsGroup};
use serde::{Deserialize, Serialize};

/// Enum of metrics for the module
Expand All @@ -14,11 +14,8 @@ pub struct Metrics {
pub send_ipv4: Counter,
pub send_ipv6: Counter,
pub send_relay: Counter,
pub send_relay_error: Counter,

// Data packets (non-disco)
pub send_data: Counter,
Copy link
Member Author

@Frando Frando Nov 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was unused, and it would be just the sum of send_ipv4, send_ipv6, send_relay, so I removed it because the sum can be calculated client-side.

pub send_data_network_down: Counter,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was unused, not sure if we'd want this back?

pub recv_data_relay: Counter,
pub recv_data_ipv4: Counter,
pub recv_data_ipv6: Counter,
Expand Down Expand Up @@ -50,15 +47,20 @@ pub struct Metrics {

/*
* Connection Metrics
*
* These all only count connections that completed the TLS handshake successfully. This means
* that short lived 0RTT connections are potentially not included in these counts.
*/
/// The number of direct connections we have made to peers.
pub num_direct_conns_added: Counter,
/// The number of direct connections we have lost to peers.
pub num_direct_conns_removed: Counter,
/// The number of connections to peers we have added over relay.
pub num_relay_conns_added: Counter,
/// The number of connections to peers we have removed over relay.
pub num_relay_conns_removed: Counter,
/// Number of connections opened (only handshaked connections are counted).
pub num_conns_opened: Counter,
/// Number of connections closed (only handshaked connections are counted).
pub num_conns_closed: Counter,
/// Number of connections that had only relay paths over their lifetime.
pub num_conns_transport_relay_only: Counter,
/// Number of connections that had only IP paths over their lifetime.
pub num_conns_transport_ip_only: Counter,
/// Number of connections that had both IP and relay paths.
pub num_conns_transport_ip_and_relay: Counter,

pub actor_tick_main: Counter,
pub actor_tick_msg: Counter,
Expand All @@ -67,36 +69,25 @@ pub struct Metrics {
pub actor_tick_direct_addr_heartbeat: Counter,
pub actor_link_change: Counter,
pub actor_tick_other: Counter,

/// Number of endpoints we have attempted to contact.
pub endpoints_contacted: Counter,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we ever could do this in a good way, because we don't keep lists of endpoint ids over restarts, so this was already a bad metric for a while - removed it.

/// Number of endpoints we have managed to contact directly.
pub endpoints_contacted_directly: Counter,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above, I don't think there's a good way to do it meaningfully.


/// Number of connections with a successful handshake.
pub connection_handshake_success: Counter,
/// Number of connections with a successful handshake that became direct.
pub connection_became_direct: Counter,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These two are better handled by the new conn_ metrics above

/// Histogram of connection latency in milliseconds across all endpoint connections.
#[default(Histogram::new(vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, f64::INFINITY]))]
pub connection_latency_ms: Histogram,

/*
* Path Congestion Metrics
*/
/// Number of times a path was marked as outdated due to consecutive ping failures.
pub path_marked_outdated: Counter,
/// Number of ping failures recorded across all paths.
pub path_ping_failures: Counter,
/// Number of consecutive failure resets (path recovered).
pub path_failure_resets: Counter,
/// Histogram of packet loss rates (0.0-1.0) observed on UDP paths.
#[default(Histogram::new(vec![0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0]))]
pub path_packet_loss_rate: Histogram,
/// Histogram of RTT variance (in milliseconds) as a congestion indicator.
#[default(Histogram::new(vec![0.0, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0]))]
pub path_rtt_variance_ms: Histogram,
/// Histogram of path quality scores (0.0-1.0).
#[default(Histogram::new(vec![0.0, 0.3, 0.5, 0.7, 0.85, 0.95, 1.0]))]
pub path_quality_score: Histogram,
// /// Histogram of connection latency in milliseconds across all endpoint connections.
// #[default(Histogram::new(vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, f64::INFINITY]))]
// pub connection_latency_ms: Histogram,
// /*
// * Path Congestion Metrics
// */
// /// Number of times a path was marked as outdated due to consecutive ping failures.
// pub path_marked_outdated: Counter,
// /// Number of ping failures recorded across all paths.
// pub path_ping_failures: Counter,
// /// Number of consecutive failure resets (path recovered).
// pub path_failure_resets: Counter,
// /// Histogram of packet loss rates (0.0-1.0) observed on UDP paths.
// #[default(Histogram::new(vec![0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0]))]
// pub path_packet_loss_rate: Histogram,
// /// Histogram of RTT variance (in milliseconds) as a congestion indicator.
// #[default(Histogram::new(vec![0.0, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0]))]
// pub path_rtt_variance_ms: Histogram,
// /// Histogram of path quality scores (0.0-1.0).
// #[default(Histogram::new(vec![0.0, 0.3, 0.5, 0.7, 0.85, 0.95, 1.0]))]
// pub path_quality_score: Histogram,
}
61 changes: 55 additions & 6 deletions iroh/src/magicsock/remote_map/remote_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -247,11 +247,7 @@ impl RemoteStateActor {
self.handle_path_event(id, evt);
}
Some(conn_id) = self.connections_close.next(), if !self.connections_close.is_empty() => {
self.connections.remove(&conn_id);
if self.connections.is_empty() {
trace!("last connection closed - clearing selected_path");
self.selected_path.set(None).ok();
}
self.handle_connection_close(conn_id);
}
_ = self.local_addrs.updated() => {
trace!("local addrs updated, triggering holepunching");
Expand Down Expand Up @@ -373,6 +369,7 @@ impl RemoteStateActor {
) {
let pub_open_paths = Watchable::default();
if let Some(conn) = handle.upgrade() {
self.metrics.num_conns_opened.inc();
// Remove any conflicting stable_ids from the local state.
let conn_id = ConnId(conn.stable_id());
self.connections.remove(&conn_id);
Expand All @@ -391,6 +388,7 @@ impl RemoteStateActor {
paths: Default::default(),
open_paths: Default::default(),
path_ids: Default::default(),
transport_summary: TransportSummary::default(),
})
.into_mut();

Expand Down Expand Up @@ -575,6 +573,17 @@ impl RemoteStateActor {
tx.send(rtt).ok();
}

/// Handles a connection that has been closed.
///
/// Drops the state tracked for `conn_id`. If state was actually tracked for it, the
/// closed-connections counter is incremented and the connection's transport summary is
/// recorded into the metrics. Once no connections remain, the selected path is cleared.
fn handle_connection_close(&mut self, conn_id: ConnId) {
    let removed = self.connections.remove(&conn_id);
    if let Some(closed) = removed {
        self.metrics.num_conns_closed.inc();
        closed.transport_summary.record(&self.metrics);
    }

    // Other connections are still alive: keep the selected path as is.
    if !self.connections.is_empty() {
        return;
    }
    trace!("last connection closed - clearing selected_path");
    self.selected_path.set(None).ok();
}

/// Triggers holepunching to the remote endpoint.
///
/// This will manage the entire process of holepunching with the remote endpoint.
Expand Down Expand Up @@ -1118,6 +1127,8 @@ struct ConnectionState {
open_paths: FxHashMap<PathId, transports::Addr>,
/// Reverse map of [`Self::paths].
path_ids: FxHashMap<transports::Addr, PathId>,
/// Summary over transports used in this connection, for metrics tracking.
transport_summary: TransportSummary,
}

impl ConnectionState {
Expand All @@ -1129,10 +1140,10 @@ impl ConnectionState {

/// Tracks an open path for the connection.
fn add_open_path(&mut self, remote: transports::Addr, path_id: PathId) {
self.transport_summary.add_path(&remote);
self.paths.insert(path_id, remote.clone());
self.open_paths.insert(path_id, remote.clone());
self.path_ids.insert(remote, path_id);

self.update_pub_path_info();
}

Expand Down Expand Up @@ -1376,3 +1387,41 @@ impl Future for OnClosed {
Poll::Ready(self.conn_id)
}
}

/// Summary of which kinds of transports a connection has used, for metrics tracking.
#[derive(Clone, Copy, Debug, Default)]
enum TransportSummary {
    /// No path has been added yet.
    #[default]
    None,
    /// Only IP paths have been added.
    IpOnly,
    /// Only relay paths have been added.
    RelayOnly,
    /// Both IP and relay paths have been added.
    IpAndRelay,
}

impl TransportSummary {
    /// Folds a newly added path into the summary.
    ///
    /// The summary becomes (or stays) `IpOnly` or `RelayOnly` as long as every path added so
    /// far was of that kind. Any other combination results in `IpAndRelay`.
    fn add_path(&mut self, addr: &transports::Addr) {
        use transports::Addr;
        let seen = *self;
        let updated = match addr {
            Addr::Ip(_) if matches!(seen, Self::None | Self::IpOnly) => Self::IpOnly,
            Addr::Relay(_, _) if matches!(seen, Self::None | Self::RelayOnly) => Self::RelayOnly,
            _ => Self::IpAndRelay,
        };
        *self = updated;
    }

    /// Increments the metrics counter that corresponds to this summary.
    ///
    /// A summary that never saw a path (`None`) does not touch any counter.
    fn record(&self, metrics: &MagicsockMetrics) {
        let counter = match self {
            Self::None => return,
            Self::IpOnly => &metrics.num_conns_transport_ip_only,
            Self::RelayOnly => &metrics.num_conns_transport_relay_only,
            Self::IpAndRelay => &metrics.num_conns_transport_ip_and_relay,
        };
        counter.inc();
    }
}
Loading