
[SPARK-52409][SDP] Only use PipelineRunEventBuffer in tests #51352

Open · wants to merge 1 commit into base: master
@@ -226,7 +226,7 @@ private[connect] object PipelinesHandler extends Logging {
val dataflowGraphId = cmd.getDataflowGraphId
val graphElementRegistry = DataflowGraphRegistry.getDataflowGraphOrThrow(dataflowGraphId)
// We will use this variable to store the run failure event if it occurs. This will be set
// by the event callback that is executed when an event is added to the PipelineRunEventBuffer.
// by the event callback.
@volatile var runFailureEvent = Option.empty[PipelineEvent]
// Define a callback which will stream logs back to the SparkConnect client when an internal
// pipeline event is emitted during pipeline execution. We choose to pass a callback rather the
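The comment in the hunk above describes the pattern this PR moves to: instead of reading a failure back out of a buffer, the event callback itself records the run-failure event while still forwarding every event. A minimal, self-contained sketch of that pattern is shown below; `Event`, `isFailure`, and the `println` forwarding are illustrative stand-ins, not the real `PipelineEvent` API or the Spark Connect streaming code.

```scala
// Illustrative sketch only: `Event` stands in for the real PipelineEvent type.
final case class Event(message: String, isFailure: Boolean)

object RunFailureCaptureSketch {
  // Set by the callback when a failure event is observed, mirroring the
  // @volatile var used in PipelinesHandler above.
  @volatile private var runFailureEvent: Option[Event] = Option.empty[Event]

  // The callback forwards every event (placeholder: println) and remembers
  // the first failure it sees so the caller can surface it later.
  val eventCallback: Event => Unit = { event =>
    if (event.isFailure && runFailureEvent.isEmpty) {
      runFailureEvent = Some(event)
    }
    println(s"pipeline event: ${event.message}")
  }

  def main(args: Array[String]): Unit = {
    Seq(
      Event("flow 'a' is RUNNING", isFailure = false),
      Event("run FAILED", isFailure = true)
    ).foreach(eventCallback)
    assert(runFailureEvent.exists(_.isFailure))
  }
}
```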
@@ -50,7 +50,7 @@ class PipelineExecution(context: PipelineUpdateContext) {
// Execute the graph.
graphExecution = Option(
new TriggeredGraphExecution(initializedGraph, context, onCompletion = terminationReason => {
context.eventBuffer.addEvent(
context.eventCallback(
ConstructPipelineEvent(
origin = PipelineEventOrigin(
flowName = None,
@@ -75,7 +75,7 @@
context.pipelineExecution.awaitCompletion()
} catch {
case e: Throwable =>
context.eventBuffer.addEvent(
context.eventCallback(
ConstructPipelineEvent(
origin = PipelineEventOrigin(
flowName = None,
@@ -18,7 +18,7 @@
package org.apache.spark.sql.pipelines.graph

import org.apache.spark.sql.classic.SparkSession
import org.apache.spark.sql.pipelines.logging.{FlowProgressEventLogger, PipelineRunEventBuffer}
import org.apache.spark.sql.pipelines.logging.{FlowProgressEventLogger, PipelineEvent}

trait PipelineUpdateContext {

@@ -50,8 +50,8 @@ trait PipelineUpdateContext {
UnionFlowFilter(flowFilterForTables, resetCheckpointFlows)
}

/** Buffer containing internal events that are emitted during a run of a pipeline. */
def eventBuffer: PipelineRunEventBuffer
/** Callback to invoke for internal events that are emitted during a run of a pipeline. */
def eventCallback: PipelineEvent => Unit

/** Emits internal flow progress events into the event buffer. */
def flowProgressEventLogger: FlowProgressEventLogger
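The change above narrows the trait's contract from a concrete `PipelineRunEventBuffer` to a plain `PipelineEvent => Unit` function. A hedged sketch of what that contract enables, using simplified stand-in types rather than the real Spark classes: a production-style context can forward events anywhere, while a test context simply collects them in memory.

```scala
// Simplified stand-ins for illustration; not the actual Spark pipeline classes.
final case class Event(message: String)

trait UpdateContextSketch {
  /** Callback to invoke for internal events emitted during a run of a pipeline. */
  def eventCallback: Event => Unit
}

// A production-style context might forward events to a log or a client stream.
class LoggingContextSketch extends UpdateContextSketch {
  override val eventCallback: Event => Unit = e => println(s"[pipeline] ${e.message}")
}

// A test context can collect events and assert on them afterwards.
class CollectingContextSketch extends UpdateContextSketch {
  val events = scala.collection.mutable.ArrayBuffer.empty[Event]
  override val eventCallback: Event => Unit = e => events += e
}

object UpdateContextSketchDemo {
  def main(args: Array[String]): Unit = {
    val ctx = new CollectingContextSketch
    ctx.eventCallback(Event("flow 'a' QUEUED"))
    assert(ctx.events.map(_.message) == Seq("flow 'a' QUEUED"))
  }
}
```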
@@ -18,11 +18,7 @@
package org.apache.spark.sql.pipelines.graph

import org.apache.spark.sql.classic.SparkSession
import org.apache.spark.sql.pipelines.logging.{
FlowProgressEventLogger,
PipelineEvent,
PipelineRunEventBuffer
}
import org.apache.spark.sql.pipelines.logging.{FlowProgressEventLogger, PipelineEvent}

/**
* An implementation of the PipelineUpdateContext trait used in production.
@@ -31,17 +27,15 @@ import org.apache.spark.sql.pipelines.logging.{
*/
class PipelineUpdateContextImpl(
override val unresolvedGraph: DataflowGraph,
eventCallback: PipelineEvent => Unit
override val eventCallback: PipelineEvent => Unit
) extends PipelineUpdateContext {

override val spark: SparkSession = SparkSession.getActiveSession.getOrElse(
throw new IllegalStateException("SparkSession is not available")
)

override val eventBuffer = new PipelineRunEventBuffer(eventCallback)

override val flowProgressEventLogger: FlowProgressEventLogger =
new FlowProgressEventLogger(eventBuffer = eventBuffer)
new FlowProgressEventLogger(eventCallback = eventCallback)

override val refreshTables: TableFilter = AllTables
override val fullRefreshTables: TableFilter = NoTables
@@ -36,9 +36,9 @@ import org.apache.spark.sql.pipelines.graph.{FlowExecution, ResolutionCompletedF
* - All flow progress events other than errors/warnings will be logged at INFO level (including
* flow progress events with metrics) and error/warning messages will be logged at their level.
*
* @param eventBuffer Event log to log the flow progress events.
* @param eventCallback Callback to invoke on the flow progress events.
*/
class FlowProgressEventLogger(eventBuffer: PipelineRunEventBuffer) extends Logging {
class FlowProgressEventLogger(eventCallback: PipelineEvent => Unit) extends Logging {

/**
* This map stores flow identifier to a boolean representing whether flow is running.
@@ -57,7 +57,7 @@ class FlowProgressEventLogger(eventBuffer: PipelineRunEventBuffer) extends Logging {
* INFO level, since flows are only queued once.
*/
def recordQueued(flow: ResolvedFlow): Unit = synchronized {
eventBuffer.addEvent(
eventCallback(
ConstructPipelineEvent(
origin = PipelineEventOrigin(
flowName = Option(flow.displayName),
@@ -76,7 +76,7 @@ class FlowProgressEventLogger(eventBuffer: PipelineRunEventBuffer) extends Logging {
*/
def recordPlanningForBatchFlow(batchFlow: ResolvedFlow): Unit = synchronized {
if (batchFlow.df.isStreaming) return
eventBuffer.addEvent(
eventCallback(
ConstructPipelineEvent(
origin = PipelineEventOrigin(
flowName = Option(batchFlow.displayName),
@@ -97,7 +97,7 @@ class FlowProgressEventLogger(eventBuffer: PipelineRunEventBuffer) extends Logging {
* logged at METRICS. All other cases will be logged at INFO.
*/
def recordStart(flowExecution: FlowExecution): Unit = synchronized {
eventBuffer.addEvent(
eventCallback(
ConstructPipelineEvent(
origin = PipelineEventOrigin(
flowName = Option(flowExecution.displayName),
@@ -114,7 +114,7 @@ class FlowProgressEventLogger(eventBuffer: PipelineRunEventBuffer) extends Logging {

/** Records flow progress events with flow status as RUNNING. */
def recordRunning(flow: ResolvedFlow): Unit = synchronized {
eventBuffer.addEvent(
eventCallback(
ConstructPipelineEvent(
origin = PipelineEventOrigin(
flowName = Option(flow.displayName),
@@ -142,7 +142,7 @@ class FlowProgressEventLogger(eventBuffer: PipelineRunEventBuffer) extends Logging {
): Unit = synchronized {
val eventLogMessage = messageOpt.getOrElse(s"Flow '${flow.displayName}' has FAILED.")

eventBuffer.addEvent(
eventCallback(
ConstructPipelineEvent(
origin = PipelineEventOrigin(
flowName = Option(flow.displayName),
@@ -165,7 +165,7 @@ class FlowProgressEventLogger(eventBuffer: PipelineRunEventBuffer) extends Logging {
* record skipped should be used when the flow is skipped because of upstream flow failures.
*/
def recordSkippedOnUpStreamFailure(flow: ResolvedFlow): Unit = synchronized {
eventBuffer.addEvent(
eventCallback(
ConstructPipelineEvent(
origin = PipelineEventOrigin(
flowName = Option(flow.displayName),
@@ -188,7 +188,7 @@ class FlowProgressEventLogger(eventBuffer: PipelineRunEventBuffer) extends Logging {
* upstream failures use [[recordSkippedOnUpStreamFailure]] function.
*/
def recordSkipped(flow: ResolvedFlow): Unit = synchronized {
eventBuffer.addEvent(
eventCallback(
ConstructPipelineEvent(
origin = PipelineEventOrigin(
flowName = Option(flow.displayName),
@@ -208,7 +208,7 @@ class FlowProgressEventLogger(eventBuffer: PipelineRunEventBuffer) extends Logging {

/** Records flow progress events with flow status as EXCLUDED at INFO level. */
def recordExcluded(flow: ResolvedFlow): Unit = synchronized {
eventBuffer.addEvent(
eventCallback(
ConstructPipelineEvent(
origin = PipelineEventOrigin(
flowName = Option(flow.displayName),
@@ -232,7 +232,7 @@ class FlowProgressEventLogger(eventBuffer: PipelineRunEventBuffer) extends Logging {
message: Option[String] = None,
cause: Option[Throwable] = None
): Unit = synchronized {
eventBuffer.addEvent(
eventCallback(
ConstructPipelineEvent(
origin = PipelineEventOrigin(
flowName = Option(flow.displayName),
@@ -252,7 +252,7 @@ class FlowProgressEventLogger(eventBuffer: PipelineRunEventBuffer) extends Logging {

/** Records flow progress events with flow status as IDLE. */
def recordIdle(flow: ResolvedFlow): Unit = synchronized {
eventBuffer.addEvent(
eventCallback(
ConstructPipelineEvent(
origin = PipelineEventOrigin(
flowName = Option(flow.displayName),
@@ -277,7 +277,7 @@ class FlowProgressEventLogger(eventBuffer: PipelineRunEventBuffer) extends Logging {
* event.
*/
def recordCompletion(flow: ResolvedFlow): Unit = synchronized {
eventBuffer.addEvent(
eventCallback(
ConstructPipelineEvent(
origin = PipelineEventOrigin(
flowName = Option(flow.displayName),
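Every `recordX` method above now funnels through the injected callback instead of a buffer: the logger decides what event to emit, and the caller decides where it goes. A rough, self-contained sketch of that dependency-injection pattern follows; `FlowEvent` and the string statuses are stand-ins, not the real `ResolvedFlow`/`PipelineEvent` classes or `ConstructPipelineEvent` machinery.

```scala
// Stand-in types for illustration; the real logger builds PipelineEvents via
// ConstructPipelineEvent from ResolvedFlow / FlowExecution instances.
final case class FlowEvent(flow: String, status: String)

class FlowEventLoggerSketch(eventCallback: FlowEvent => Unit) {
  def recordQueued(flow: String): Unit = synchronized {
    eventCallback(FlowEvent(flow, "QUEUED"))
  }
  def recordRunning(flow: String): Unit = synchronized {
    eventCallback(FlowEvent(flow, "RUNNING"))
  }
  def recordCompletion(flow: String): Unit = synchronized {
    eventCallback(FlowEvent(flow, "COMPLETED"))
  }
}

object FlowEventLoggerSketchDemo {
  def main(args: Array[String]): Unit = {
    // In a test, the callback can simply collect events for assertions.
    val collected = scala.collection.mutable.ArrayBuffer.empty[FlowEvent]
    val logger = new FlowEventLoggerSketch(e => collected += e)
    logger.recordQueued("flow_a")
    logger.recordRunning("flow_a")
    logger.recordCompletion("flow_a")
    assert(collected.map(_.status) == Seq("QUEUED", "RUNNING", "COMPLETED"))
  }
}
```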
@@ -34,7 +34,6 @@ import org.apache.spark.sql.pipelines.logging.{
FlowProgress,
FlowProgressEventLogger,
PipelineEvent,
PipelineRunEventBuffer,
RunProgress
}

@@ -60,12 +59,12 @@ trait TestPipelineUpdateContextMixin {
refreshTables: TableFilter = AllTables,
resetCheckpointFlows: FlowFilter = AllFlows
) extends PipelineUpdateContext {
val eventBuffer = new PipelineRunEventBuffer(eventCallback = _ => ())
val eventBuffer = new PipelineRunEventBuffer()

override val eventCallback: PipelineEvent => Unit = eventBuffer.addEvent

override def flowProgressEventLogger: FlowProgressEventLogger = {
new FlowProgressEventLogger(
eventBuffer = eventBuffer
)
new FlowProgressEventLogger(eventCallback = eventCallback)
}
}
}
@@ -15,33 +15,28 @@
* limitations under the License.
*/

package org.apache.spark.sql.pipelines.logging
package org.apache.spark.sql.pipelines.utils

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.internal.Logging
import org.apache.spark.sql.pipelines.logging.PipelineEvent

/**
* An in-memory buffer which contains the internal events that are emitted during a run of a
* pipeline.
*
* @param eventCallback A callback function to be called when an event is added to the buffer.
*/
class PipelineRunEventBuffer(eventCallback: PipelineEvent => Unit) extends Logging {
class PipelineRunEventBuffer extends Logging {

/**
* A buffer to hold the events emitted during a pipeline run.
* This buffer is thread-safe and can be accessed concurrently.
*
* TODO(SPARK-52409): Deprecate this class to be used in test only and use a more
* robust event logging system in production.
*/
private val events = ArrayBuffer[PipelineEvent]()

def addEvent(event: PipelineEvent): Unit = synchronized {
val eventToAdd = event
events.append(eventToAdd)
eventCallback(event)
}

def clear(): Unit = synchronized {
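With the buffer moved to the test-only `utils` package, a test can still get buffered events simply by passing the buffer's `addEvent` method as the context's callback, which is exactly what `TestPipelineUpdateContextMixin` does above. A self-contained sketch of that wiring follows; `Event` and the `snapshot` accessor are assumed simplifications, while `addEvent` and `clear` mirror the methods visible in the diff.

```scala
import scala.collection.mutable.ArrayBuffer

// `Event` replaces PipelineEvent and `snapshot` is an assumed accessor name;
// only addEvent and clear are shown on the real class in the diff above.
final case class Event(message: String)

class RunEventBufferSketch {
  private val events = ArrayBuffer[Event]()
  def addEvent(event: Event): Unit = synchronized { events.append(event) }
  def clear(): Unit = synchronized { events.clear() }
  def snapshot: Seq[Event] = synchronized { events.toSeq }
}

object RunEventBufferSketchDemo {
  def main(args: Array[String]): Unit = {
    val buffer = new RunEventBufferSketch
    // In a test context, the buffer's addEvent method serves as the event callback.
    val eventCallback: Event => Unit = buffer.addEvent
    eventCallback(Event("flow 'a' QUEUED"))
    assert(buffer.snapshot.map(_.message) == Seq("flow 'a' QUEUED"))
    buffer.clear()
    assert(buffer.snapshot.isEmpty)
  }
}
```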