66 "os"
77 "strconv"
88 "strings"
9+ "sync"
910 "time"
1011
1112 "github.com/go-logr/logr"
@@ -29,6 +30,7 @@ import (
2930 "github.com/ray-project/kuberay/ray-operator/controllers/ray/metrics"
3031 "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
3132 "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/dashboardclient"
33+ utiltypes "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/types"
3234 "github.com/ray-project/kuberay/ray-operator/pkg/features"
3335)
3436
@@ -40,11 +42,12 @@ const (
4042// RayJobReconciler reconciles a RayJob object
// NOTE(review): this span is a rendered diff, not plain Go. Every line starts with fused old/new
// line numbers; a trailing "-" marks a line removed by the change and "+" a line added by it.
// Lines carrying two numbers are unchanged context.
4143type RayJobReconciler struct {
4244 client.Client
// Removed side of the change: Scheme, Recorder and the blank separator that followed them.
43- Scheme * runtime.Scheme
44- Recorder record.EventRecorder
45-
// Added side of the change: Scheme and Recorder are declared again and JobInfoMap is new.
45+ Scheme * runtime.Scheme
46+ Recorder record.EventRecorder
// JobInfoMap is read in Reconcile via Load(rayJobInstance.Status.JobId); hits are type-asserted to
// *utiltypes.RayJobInfo. The code that stores entries is not visible in this view — presumably the
// dashboard client's asynchronous fetch; confirm before relying on it.
47+ JobInfoMap * sync. Map
// dashboardClientFunc takes a RayCluster and a URL and returns a dashboard client or an error.
4648 dashboardClientFunc func (rayCluster * rayv1.RayCluster , url string ) (dashboardclient.RayDashboardClientInterface , error )
4749 options RayJobReconcilerOptions
// workerPool is added by this change and is set from dashboardclient.NewWorkerPool in
// NewRayJobReconciler; no other use of the field appears in this view.
50+ workerPool * dashboardclient.WorkerPool
4851}
4952
5053type RayJobReconcilerOptions struct {
@@ -53,13 +56,18 @@ type RayJobReconcilerOptions struct {
5356
5457// NewRayJobReconciler returns a new reconcile.Reconciler
// The context parameter is bound to the blank identifier and is not used. Lines marked "-" below
// are the removed side of the diff and lines marked "+" are the added side.
//
// After the change, the constructor creates a channel of func() buffered to 1000 entries and a
// fresh sync.Map, hands both (together with mgr) to provider.GetDashboardClient, and passes the same
// channel to dashboardclient.NewWorkerPool. The map and the pool are stored on the returned
// reconciler next to the manager's client, scheme and "rayjob-controller" event recorder.
//
// NOTE(review): the 1000 capacity is not explained here, and what a producer does when the queue
// is full is not visible in this view — confirm.
// NOTE(review): whether NewWorkerPool starts goroutines, and how they are stopped, is not visible
// here — confirm the pool's lifetime is tied to the manager.
5558func NewRayJobReconciler (_ context.Context , mgr manager.Manager , options RayJobReconcilerOptions , provider utils.ClientProvider ) * RayJobReconciler {
// Removed: the dashboard client factory used to be built from mgr alone.
56- dashboardClientFunc := provider .GetDashboardClient (mgr )
59+ taskQueue := make (chan func (), 1000 )
// NOTE(review): this local uses an exported-style name; Go convention would be jobInfoMap.
60+ JobInfoMap := & sync.Map {}
61+ workerPool := dashboardclient .NewWorkerPool (taskQueue )
// The factory now also receives the task queue and the shared map.
62+ dashboardClientFunc := provider .GetDashboardClient (mgr , taskQueue , JobInfoMap )
5763 return & RayJobReconciler {
5864 Client : mgr .GetClient (),
5965 Scheme : mgr .GetScheme (),
6066 Recorder : mgr .GetEventRecorderFor ("rayjob-controller" ),
// The same map instance passed to GetDashboardClient above is exposed on the reconciler.
67+ JobInfoMap : JobInfoMap ,
6168 dashboardClientFunc : dashboardClientFunc ,
6269 options : options ,
70+ workerPool : workerPool ,
6371 }
6472}
6573
@@ -263,9 +271,12 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
263271 if err != nil {
264272 return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
265273 }
266-
267- jobInfo , err := rayDashboardClient .GetJobInfo (ctx , rayJobInstance .Status .JobId )
268- if err != nil {
274+ var jobInfo * utiltypes.RayJobInfo
275+ if loadedJobInfo , ok := r .JobInfoMap .Load (rayJobInstance .Status .JobId ); ok {
276+ logger .Info ("Found jobInfo in map" , "JobId" , rayJobInstance .Status .JobId , "jobInfo" , loadedJobInfo )
277+ jobInfo = loadedJobInfo .(* utiltypes.RayJobInfo )
278+ logger .Info ("Casted jobInfo" , "JobId" , rayJobInstance .Status .JobId , "jobInfo" , jobInfo )
279+ } else {
269280 // If the Ray job was not found, GetJobInfo returns a BadRequest error.
270281 if rayJobInstance .Spec .SubmissionMode == rayv1 .HTTPMode && errors .IsBadRequest (err ) {
271282 logger .Info ("The Ray job was not found. Submit a Ray job via an HTTP request." , "JobId" , rayJobInstance .Status .JobId )
@@ -275,10 +286,16 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
275286 }
276287 return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, nil
277288 }
278- logger .Error (err , "Failed to get job info" , "JobId" , rayJobInstance .Status .JobId )
289+ logger .Info ("Job info not found in map" , "JobId" , rayJobInstance .Status .JobId )
290+ rayDashboardClient .AsyncGetJobInfo (ctx , rayJobInstance .Status .JobId )
279291 return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
280292 }
281293
294+ rayDashboardClient .AsyncGetJobInfo (ctx , rayJobInstance .Status .JobId )
295+ if jobInfo == nil {
296+ logger .Error (err , "Failed to get job info" , "JobId" , rayJobInstance .Status .JobId )
297+ return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
298+ }
282299 // If the JobStatus is in a terminal status, such as SUCCEEDED, FAILED, or STOPPED, it is impossible for the Ray job
283300 // to transition to any other. Additionally, RayJob does not currently support retries. Hence, we can mark the RayJob
284301 // as "Complete" or "Failed" to avoid unnecessary reconciliation.
0 commit comments