@@ -49,6 +49,7 @@ type BESPipeInterceptor interface {
4949const besEventGlobalTimeoutDuration = 5 * time .Minute
5050const besEventThrottleDuration = 50 * time .Millisecond
5151const gracePeriodDuration = 2 * time .Second
52+ const sendTimeout = 10 * time .Second // Timeout for individual Send calls to detect backed-up receivers
5253
5354func NewBESPipe (buildId , invocationId string ) (BESPipeInterceptor , error ) {
5455 return & besPipe {
@@ -72,6 +73,9 @@ type besPipe struct {
7273 besInvocationId string
7374 besProxies []besproxy.BESProxy
7475
76+ // Track whether we have already unlinked the pipe due to backend failure
77+ pipeAborted sync.Once
78+
7579 wg * sync.WaitGroup
7680}
7781
@@ -101,7 +105,7 @@ func (bb *besPipe) RegisterBesProxy(ctx context.Context, p besproxy.BESProxy) {
101105 go func () {
102106 for {
103107 // If the proxy is not healthy, break out of the loop
104- if ! p .Healthy () {
108+ if ! p .IsHealthy () {
105109 break
106110 }
107111 _ , err := p .Recv ()
@@ -116,6 +120,8 @@ func (bb *besPipe) RegisterBesProxy(ctx context.Context, p besproxy.BESProxy) {
116120 break
117121 }
118122 }
123+ // When the ACK goroutine exits (usually because of error), check if we should abort the pipe
124+ bb .maybeAbortPipeBecauseNoHealthyBackends ()
119125 }()
120126}
121127
@@ -156,32 +162,35 @@ func (bb *besPipe) ServeWait(ctx context.Context) error {
156162 bb .wg .Add (1 )
157163 go func () {
158164 defer bb .wg .Done ()
165+
166+ // If the overall context is cancelled, abort the pipe immediately
167+ go func () {
168+ <- ctx .Done ()
169+ bb .maybeAbortPipeBecauseNoHealthyBackends ()
170+ }()
171+
159172 conn , err := os .OpenFile (bb .bepBinPath , os .O_RDONLY , os .ModeNamedPipe )
160173 if err != nil {
161174 bb .errorsMutex .Lock ()
162- defer bb .errorsMutex .Unlock ()
163175 bb .errors .Insert (fmt .Errorf ("failed to accept connection on BES pipe %s: %w" , bb .bepBinPath , err ))
176+ bb .errorsMutex .Unlock ()
164177 return
165178 }
166-
167- defer func () {
168- conn .Close ()
169- }()
179+ defer conn .Close ()
170180
171181 if err := bb .streamBesEvents (ctx , conn ); err != nil {
172182 bb .errorsMutex .Lock ()
173- defer bb .errorsMutex .Unlock ()
174183 bb .errors .Insert (fmt .Errorf ("failed to stream BES events: %w" , err ))
184+ bb .errorsMutex .Unlock ()
175185 return
176186 }
177187
188+ // Normal completion path
178189 for _ , p := range bb .besProxies {
179- if ! p .Healthy () {
190+ if ! p .IsHealthy () {
180191 continue
181192 }
182-
183193 bb .sendFinalLifecycleEvents (context .Background (), p )
184-
185194 if err := p .CloseSend (); err != nil {
186195 fmt .Fprintf (os .Stderr , "Error closing build event stream to %v: %s\n " , p .Host (), err .Error ())
187196 }
@@ -190,25 +199,47 @@ func (bb *besPipe) ServeWait(ctx context.Context) error {
190199 return nil
191200}
192201
202+ // maybeAbortPipeBecauseNoHealthyBackends unlinks the FIFO if all backends are unhealthy.
203+ // This gives Bazel a broken pipe → it aborts the upload and exits.
204+ func (bb * besPipe ) maybeAbortPipeBecauseNoHealthyBackends () {
205+ if len (bb .besProxies ) == 0 {
206+ return
207+ }
208+
209+ var anyHealthy bool
210+ for _ , p := range bb .besProxies {
211+ if p .IsHealthy () {
212+ anyHealthy = true
213+ break
214+ }
215+ }
216+ if anyHealthy {
217+ return
218+ }
219+
220+ bb .pipeAborted .Do (func () {
221+ fmt .Fprintf (os .Stderr , "All BES backends are unhealthy — unlinking pipe %s to unblock Bazel\n " , bb .bepBinPath )
222+ if err := syscall .Unlink (bb .bepBinPath ); err != nil && ! os .IsNotExist (err ) {
223+ fmt .Fprintf (os .Stderr , "failed to unlink BES pipe %s: %v\n " , bb .bepBinPath , err )
224+ }
225+ })
226+ }
227+
193228func (bb * besPipe ) streamBesEvents (ctx context.Context , conn * os.File ) error {
194229 reader := bufio .NewReader (conn )
195230
196231 go func () {
197232 <- ctx .Done ()
198- if err := conn .SetReadDeadline (time .Now ().Add (gracePeriodDuration )); err != nil {
199- fmt .Fprintf (os .Stderr , "failed to set read deadline after context done: %s\n " , err .Error ())
200- }
233+ conn .SetReadDeadline (time .Now ().Add (gracePeriodDuration ))
201234 }()
202235
203- // Manually manage a sequence ID for the events
204236 seqId := int64 (0 )
205-
206237 besEventGlobalTimeout := time .After (besEventGlobalTimeoutDuration )
238+
207239 for {
208240 event := buildeventstream.BuildEvent {}
209-
210241 opts := protodelim.UnmarshalOptions {
211- MaxSize : 32 * 1024 * 1024 , // 32 MB max; we have observed 17 MB BES events in the wild
242+ MaxSize : 32 * 1024 * 1024 , // 32 MB max
212243 }
213244
214245 if err := opts .UnmarshalFrom (reader , & event ); err != nil {
@@ -220,16 +251,13 @@ func (bb *besPipe) streamBesEvents(ctx context.Context, conn *os.File) error {
220251 case <- besEventGlobalTimeout :
221252 return fmt .Errorf ("timeout reached while waiting for BES events" )
222253 case <- time .After (besEventThrottleDuration ):
223- // throttle the reading of the BES file when no new data is available
224254 continue
225255 }
226256 }
227257 return fmt .Errorf ("failed to parse BES event: %w" , err )
228258 }
229259
230- // Reset the global timeout on each received event
231260 besEventGlobalTimeout = time .After (besEventGlobalTimeoutDuration )
232-
233261 seqId ++
234262
235263 if err := bb .publishBesEvent (seqId , & event ); err != nil {
@@ -241,9 +269,7 @@ func (bb *besPipe) streamBesEvents(ctx context.Context, conn *os.File) error {
241269 }
242270 }
243271
244- // Clear read deadline
245272 conn .SetReadDeadline (time.Time {})
246-
247273 return nil
248274}
249275
@@ -269,7 +295,6 @@ func (bb *besPipe) publishBesEvent(seqId int64, event *buildeventstream.BuildEve
269295 return fmt .Errorf ("failed to marshal BES event: %w" , err )
270296 }
271297
272- // Wrap the event in the gRPC message
273298 grpcEvent := & buildv1.PublishBuildToolEventStreamRequest {
274299 OrderedBuildEvent : & buildv1.OrderedBuildEvent {
275300 SequenceNumber : seqId ,
@@ -285,14 +310,36 @@ func (bb *besPipe) publishBesEvent(seqId int64, event *buildeventstream.BuildEve
285310 }
286311
287312 for _ , p := range bb .besProxies {
288- eg .Go (
289- func () error {
290- if err := p .Send (grpcEvent ); err != nil {
313+ p := p // capture
314+ eg .Go (func () error {
315+ if ! p .IsHealthy () {
316+ return nil
317+ }
318+
319+ // Channel for Send result
320+ sendCh := make (chan error , 1 )
321+
322+ // Run Send in goroutine
323+ go func () {
324+ err := p .Send (grpcEvent )
325+ sendCh <- err
326+ }()
327+
328+ // Wait for Send or timeout
329+ select {
330+ case err := <- sendCh :
331+ if err != nil {
291332 fmt .Fprintf (os .Stderr , "Error sending BES event to %v: %s\n " , p .Host (), err .Error ())
333+ bb .maybeAbortPipeBecauseNoHealthyBackends ()
292334 }
293335 return nil
294- },
295- )
336+ case <- time .After (sendTimeout ):
337+ fmt .Fprintf (os .Stderr , "Timeout sending BES event to %v: marking unhealthy\n " , p .Host ())
338+ p .MarkUnhealthy ()
339+ bb .maybeAbortPipeBecauseNoHealthyBackends ()
340+ return nil
341+ }
342+ })
296343 }
297344 }
298345
@@ -305,8 +352,6 @@ func (bb *besPipe) Args() []string {
305352 bb .bepBinPath ,
306353 }
307354
308- // Also add wait_for_upload_complete flag if the bes pipe was explicitly requested.
309- // NOTE: this is explicitly not the default behavior to avoid breaking changes in bazel6
310355 if os .Getenv ("ASPECT_BEP_USE_PIPE" ) != "" {
311356 args = append (args , "--build_event_binary_file_upload_mode=wait_for_upload_complete" )
312357 }
@@ -326,6 +371,6 @@ func (bb *besPipe) Errors() []error {
326371
327372func (bb * besPipe ) GracefulStop () {
328373 bb .wg .Wait ()
329-
374+ // Best-effort cleanup
330375 os .Remove (bb .bepBinPath )
331376}
0 commit comments