@@ -36,23 +36,14 @@ const (
3636 RayJobDefaultRequeueDuration = 3 * time .Second
3737 RayJobDefaultClusterSelectorKey = "ray.io/cluster"
3838 PythonUnbufferedEnvVarName = "PYTHONUNBUFFERED"
39+ DashboardWorkerNum = 100
40+ TaskQueueSize = 500
3941)
4042
4143var jobInfoMap sync.Map
4244
4345// Simple worker pool for job info updates
44- var jobInfoChan = make (chan func (), 300 ) // Unbuffered channel with unlimited capacity
45-
46- func init () {
47- // Start 10 worker goroutines that will live for the entire program
48- for i := 0 ; i < 100 ; i ++ {
49- go func () {
50- for task := range jobInfoChan {
51- task () // Execute the function
52- }
53- }()
54- }
55- }
46+ var taskQueue = make (chan func (), TaskQueueSize ) // Buffered channel of pending tasks; capacity is TaskQueueSize
5647
5748// RayJobReconciler reconciles a RayJob object
5849type RayJobReconciler struct {
@@ -70,7 +61,14 @@ type RayJobReconcilerOptions struct {
7061
7162// NewRayJobReconciler returns a new reconcile.Reconciler
7263func NewRayJobReconciler (_ context.Context , mgr manager.Manager , options RayJobReconcilerOptions , provider utils.ClientProvider ) * RayJobReconciler {
73- dashboardClientFunc := provider .GetDashboardClient (mgr )
64+ dashboardClientFunc := provider .GetDashboardClient (mgr , & jobInfoMap , taskQueue )
65+ for i := 0 ; i < DashboardWorkerNum ; i ++ {
66+ go func () {
67+ for task := range taskQueue {
68+ task ()
69+ }
70+ }()
71+ }
7472 return & RayJobReconciler {
7573 Client : mgr .GetClient (),
7674 Scheme : mgr .GetScheme (),
@@ -274,7 +272,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
274272 }
275273
276274 // Check the current status of ray jobs
277- jobInfoFromMapInterface , exists := jobInfoMap .Load (rayJobInstance .Name )
275+ jobInfoFromMapInterface , exists := jobInfoMap .Load (rayJobInstance .Status . JobId )
278276 var jobInfoFromMap utiltypes.RayJobInfo
279277 if exists {
280278 jobInfoFromMap = jobInfoFromMapInterface .(utiltypes.RayJobInfo )
@@ -299,7 +297,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
299297 return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
300298 }
301299 jobInfoFromMap = * jobInfo
302- jobInfoMap .Store (rayJobInstance .Name , jobInfoFromMap )
300+ jobInfoMap .Store (rayJobInstance .Status . JobId , jobInfoFromMap )
303301 }
304302
305303 // If the JobStatus is in a terminal status, such as SUCCEEDED, FAILED, or STOPPED, it is impossible for the Ray job
@@ -321,26 +319,11 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
321319 reason = rayv1 .AppFailed
322320 }
323321 } else {
324- // Submit to simple worker pool instead of creating new goroutine
325- select {
326- case jobInfoChan <- func () {
327- rayDashboardClient , err := r .dashboardClientFunc (rayClusterInstance , rayJobInstance .Status .DashboardURL )
328- if err != nil {
329- logger .Error (err , "Failed to get Job client" , "JobId" , rayJobInstance .Status .JobId )
330- return
331- }
332- jobInfo , err := rayDashboardClient .GetJobInfo (ctx , rayJobInstance .Status .JobId )
333- if err != nil {
334- logger .Error (err , "Failed to get job info" , "JobId" , rayJobInstance .Status .JobId )
335- return
336- }
337- jobInfoMap .Store (rayJobInstance .Name , * jobInfo )
338- }:
339- // Task submitted successfully
340- default :
341- // Channel full, skip this update
342- logger .V (1 ).Info ("Worker pool busy, skipping job info update" )
322+ rayDashboardClient , err := r .dashboardClientFunc (rayClusterInstance , rayJobInstance .Status .DashboardURL )
323+ if err != nil {
324+ return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
343325 }
326+ rayDashboardClient .AsyncGetJobInfo (ctx , rayJobInstance .Status .JobId )
344327 }
345328
346329 // Always update RayClusterStatus along with JobStatus and JobDeploymentStatus updates.
0 commit comments