Skip to content

Commit e8a85b9

Browse files
committed
ref(instr): Lower deadline exceeded errors to warnings
1 parent 30ef200 commit e8a85b9

File tree

3 files changed

+78
-46
lines changed

3 files changed

+78
-46
lines changed

relay-server/src/services/projects/cache/service.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use crate::services::projects::cache::state::{
1313
self, CompletedFetch, Eviction, Fetch, ProjectStore, Refresh,
1414
};
1515
use crate::services::projects::project::ProjectState;
16-
use crate::services::projects::source::ProjectSource;
16+
use crate::services::projects::source::{ProjectSource, ProjectSourceError, upstream};
1717
use crate::statsd::{RelayCounters, RelayGauges, RelayTimers};
1818
use crate::utils::FuturesScheduled;
1919

@@ -119,7 +119,13 @@ impl ProjectCacheService {
119119
.await
120120
{
121121
Ok(result) => result,
122-
Err(err) => {
122+
Err(ProjectSourceError::Upstream(upstream::Error::DeadlineExceeded)) => {
123+
// Somewhat of an expected error which is already logged on the upstream side.
124+
//
125+
// -> we can just go into our usual pending handling.
126+
ProjectState::Pending.into()
127+
}
128+
Err(err @ ProjectSourceError::FatalUpstream) => {
123129
relay_log::error!(
124130
tags.project_key = fetch.project_key().as_str(),
125131
tags.has_revision = fetch.revision().as_str().is_some(),

relay-server/src/services/projects/source/mod.rs

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,8 @@ impl ProjectSource {
102102
current_revision,
103103
no_cache,
104104
})
105-
.await?;
105+
.await
106+
.map_err(|_| ProjectSourceError::FatalUpstream)??;
106107

107108
Ok(match state {
108109
SourceProjectState::New(state) => {
@@ -115,12 +116,15 @@ impl ProjectSource {
115116

116117
#[derive(Debug, thiserror::Error)]
117118
pub enum ProjectSourceError {
118-
#[error("redis permit error {0}")]
119-
RedisPermit(#[from] tokio::sync::AcquireError),
120-
#[error("redis join error {0}")]
121-
RedisJoin(#[from] tokio::task::JoinError),
122-
#[error("upstream error {0}")]
123-
Upstream(#[from] relay_system::SendError),
119+
/// Error returned from the upstream.
120+
#[error("upstream error: {0}")]
121+
Upstream(#[from] upstream::Error),
122+
/// Upstream did not return a result.
123+
///
124+
/// This happens when the upstream does not reply to the request.
125+
/// This should never happen.
126+
#[error("fatal upstream error")]
127+
FatalUpstream,
124128
}
125129

126130
impl From<Infallible> for ProjectSourceError {

relay-server/src/services/projects/source/upstream.rs

Lines changed: 59 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -94,13 +94,21 @@ impl UpstreamQuery for GetProjectStates {
9494
}
9595
}
9696

97+
/// Error returned to the requester.
98+
#[derive(Copy, Clone, Debug, thiserror::Error)]
99+
pub enum Error {
100+
/// Fetching the project state exceeded the configured deadline.
101+
#[error("deadline exceeded while fetching project state")]
102+
DeadlineExceeded,
103+
}
104+
97105
/// The wrapper struct for the incoming external requests which also keeps addition information.
98106
#[derive(Debug)]
99107
struct ProjectStateChannel {
100108
// Main broadcast channel.
101-
channel: BroadcastChannel<SourceProjectState>,
109+
channel: BroadcastChannel<Message>,
102110
// Additional broadcast channels tracked from merge operations.
103-
merged: Vec<BroadcastChannel<SourceProjectState>>,
111+
merged: Vec<BroadcastChannel<Message>>,
104112
revision: Revision,
105113
deadline: Instant,
106114
no_cache: bool,
@@ -113,7 +121,7 @@ struct ProjectStateChannel {
113121

114122
impl ProjectStateChannel {
115123
pub fn new(
116-
sender: BroadcastSender<SourceProjectState>,
124+
sender: BroadcastSender<Message>,
117125
revision: Revision,
118126
timeout: Duration,
119127
no_cache: bool,
@@ -141,22 +149,23 @@ impl ProjectStateChannel {
141149
/// If the new revision is different from the contained revision this clears the revision.
142150
/// To not have multiple fetches per revision per batch, we need to find a common denominator
143151
/// for requests with different revisions, which is always to fetch the full project config.
144-
pub fn attach(&mut self, sender: BroadcastSender<SourceProjectState>, revision: Revision) {
152+
pub fn attach(&mut self, sender: BroadcastSender<Message>, revision: Revision) {
145153
self.channel.attach(sender);
146154
if self.revision != revision {
147155
self.revision = Revision::default();
148156
}
149157
}
150158

151159
pub fn send(self, state: SourceProjectState) {
152-
for channel in self.merged {
153-
channel.send(state.clone());
154-
}
155-
self.channel.send(state)
160+
self.do_send(Ok(state));
161+
}
162+
163+
pub fn error(self, err: Error) {
164+
self.do_send(Err(err));
156165
}
157166

158-
pub fn expired(&self) -> bool {
159-
Instant::now() > self.deadline
167+
pub fn expired(&self, now: Instant) -> bool {
168+
now > self.deadline
160169
}
161170

162171
pub fn merge(&mut self, channel: ProjectStateChannel) {
@@ -182,28 +191,35 @@ impl ProjectStateChannel {
182191
self.errors += errors;
183192
self.pending += pending;
184193
}
194+
195+
fn do_send(self, message: Message) {
196+
for channel in self.merged {
197+
channel.send(message.clone());
198+
}
199+
self.channel.send(message)
200+
}
185201
}
186202

187203
/// The map of project keys with their project state channels.
188204
type ProjectStateChannels = HashMap<ProjectKey, ProjectStateChannel>;
189205

206+
/// The message used to communicate with the requester.
207+
type Message = Result<SourceProjectState, Error>;
208+
190209
/// This is the [`UpstreamProjectSourceService`] interface.
191210
///
192211
/// The service is responsible for fetching the [`ParsedProjectState`] from the upstream.
193212
/// Internally it maintains the buffer queue of the incoming requests, which got scheduled to fetch the
194213
/// state and takes care of the backoff in case there is a problem with the requests.
195214
#[derive(Debug)]
196-
pub struct UpstreamProjectSource(FetchProjectState, BroadcastSender<SourceProjectState>);
215+
pub struct UpstreamProjectSource(FetchProjectState, BroadcastSender<Message>);
197216

198217
impl Interface for UpstreamProjectSource {}
199218

200219
impl FromMessage<FetchProjectState> for UpstreamProjectSource {
201-
type Response = BroadcastResponse<SourceProjectState>;
220+
type Response = BroadcastResponse<Message>;
202221

203-
fn from_message(
204-
message: FetchProjectState,
205-
sender: BroadcastSender<SourceProjectState>,
206-
) -> Self {
222+
fn from_message(message: FetchProjectState, sender: BroadcastSender<Message>) -> Self {
207223
Self(message, sender)
208224
}
209225
}
@@ -271,6 +287,7 @@ impl UpstreamProjectSourceService {
271287
/// Prepares the batches of the cache and nocache channels which could be used to request the
272288
/// project states.
273289
fn prepare_batches(&mut self) -> ChannelsBatch {
290+
let now = Instant::now();
274291
let batch_size = self.config.query_batch_size();
275292
let num_batches = self.config.max_concurrent_queries();
276293

@@ -286,26 +303,31 @@ impl UpstreamProjectSourceService {
286303

287304
let fresh_channels = (projects.iter())
288305
.filter_map(|id| Some((*id, self.state_channels.remove(id)?)))
289-
.filter(|(id, channel)| {
290-
if channel.expired() {
291-
metric!(
292-
distribution(RelayDistributions::ProjectStateAttempts) = channel.attempts,
293-
result = "timeout",
294-
);
295-
metric!(
296-
counter(RelayCounters::ProjectUpstreamCompleted) += 1,
297-
result = "timeout",
298-
);
299-
relay_log::error!(
300-
errors = channel.errors,
301-
pending = channel.pending,
302-
tags.did_error = channel.errors > 0,
303-
tags.was_pending = channel.pending > 0,
304-
tags.project_key = id.to_string(),
305-
"error fetching project state {id}: deadline exceeded",
306-
);
306+
.filter_map(|(id, channel)| {
307+
if !channel.expired(now) {
308+
return Some((id, channel));
307309
}
308-
!channel.expired()
310+
311+
// Channel is expired, emit telemetry and notify the other end.
312+
metric!(
313+
distribution(RelayDistributions::ProjectStateAttempts) = channel.attempts,
314+
result = "timeout",
315+
);
316+
metric!(
317+
counter(RelayCounters::ProjectUpstreamCompleted) += 1,
318+
result = "timeout",
319+
);
320+
relay_log::warn!(
321+
errors = channel.errors,
322+
pending = channel.pending,
323+
tags.did_error = channel.errors > 0,
324+
tags.was_pending = channel.pending > 0,
325+
tags.project_key = %id,
326+
"error fetching project state {id}: deadline exceeded",
327+
);
328+
channel.error(Error::DeadlineExceeded);
329+
330+
None
309331
});
310332

311333
// Separate regular channels from those with the `nocache` flag. The latter go in separate
@@ -730,8 +752,8 @@ mod tests {
730752
.await;
731753

732754
let (response1, response2) = futures::future::join(response1, response2).await;
733-
assert!(matches!(response1, Ok(SourceProjectState::NotModified)));
734-
assert!(matches!(response2, Ok(SourceProjectState::NotModified)));
755+
assert!(matches!(response1, Ok(Ok(SourceProjectState::NotModified))));
756+
assert!(matches!(response2, Ok(Ok(SourceProjectState::NotModified))));
735757

736758
// No more messages to upstream expected.
737759
assert!(upstream_rx.try_recv().is_err());

0 commit comments

Comments
 (0)