-
Notifications
You must be signed in to change notification settings - Fork 647
[draft] background goroutine get job info #4160
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
805369a
73b14b5
4ce2381
e184e5c
859f6a1
03ce0e9
ac275c2
0923ef5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,188 @@ | ||
| package dashboardclient | ||
|
|
||
| import ( | ||
| "context" | ||
| "errors" | ||
| "sync" | ||
| "time" | ||
|
|
||
| lru "github.com/hashicorp/golang-lru/v2" | ||
|
|
||
| rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" | ||
| utiltypes "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/types" | ||
| ) | ||
|
|
||
| // ErrAgain is the sentinel error stored in placeholder cache entries and returned by | ||
| // GetJobInfo when no fetched job info is available yet. | ||
| // NOTE(review): presumably callers should treat it as "retry later" — confirm with the callers. | ||
| var ErrAgain = errors.New("EAGAIN") | ||
|
|
||
| const ( | ||
| // TODO: make queue size and worker size configurable. | ||
| // taskQueueSize is the buffer capacity passed to the worker pool's task channel. | ||
| taskQueueSize = 128 | ||
| // workerSize is the number of worker goroutines the pool starts. | ||
| workerSize = 8 | ||
|

||
| // queryInterval is the delay before a task that asked to run again is re-queued. | ||
| // The cache cleanup ticker uses ten times this interval. | ||
| queryInterval = 3 * time.Second | ||
|

||
| // TODO: consider a proper size for accommodating all live job info. | ||
| // cacheSize is the size passed to lru.New for the job info cache. | ||
| cacheSize = 10000 | ||
| // cacheExpiry is the age, measured from an entry's UpdateAt, after which the cleanup goroutine removes it. | ||
| cacheExpiry = 10 * time.Minute | ||
| ) | ||
|
|
||
| var ( | ||
| // singleton | ||
| // initWorkPool guards the one-time initialization of pool. | ||
| initWorkPool sync.Once | ||
| // pool is the package-level worker pool shared by every RayDashboardCacheClient. | ||
| pool workerPool | ||
|

||
| // singleton | ||
| // initCacheStorage guards the one-time creation of cacheStorage and its cleanup goroutine. | ||
| initCacheStorage sync.Once | ||
| // cacheStorage is the package-level LRU cache of job info entries, keyed by the job ID string. | ||
| cacheStorage *lru.Cache[string, *JobInfoCache] | ||
| ) | ||
|
|
||
| type ( | ||
| // Task is a unit of work executed by a pool worker. Returning true asks the worker | ||
| // to queue the same task again after the query interval. | ||
| Task func() bool | ||
| // JobInfoCache is one cache entry: the job info, the error associated with it, | ||
| // and the time the entry was last written. | ||
| JobInfoCache struct { | ||
| JobInfo *utiltypes.RayJobInfo | ||
| Err error | ||
| UpdateAt *time.Time | ||
| } | ||
|

||
| // workerPool runs Tasks received on taskQueue. | ||
| workerPool struct { | ||
| taskQueue chan Task | ||
| } | ||
| ) | ||
|
|
||
| // init creates the buffered task queue and starts workerSize goroutines that consume it. | ||
| // Each worker runs a task and, when the task returns true, schedules it to be queued | ||
| // again after queryInterval. The queue is never closed here, so the workers run until | ||
| // the process exits. | ||
| func (w *workerPool) init(taskQueueSize int, workerSize int, queryInterval time.Duration) { | ||
| w.taskQueue = make(chan Task, taskQueueSize) | ||
|

||
| for i := 0; i < workerSize; i++ { | ||
| // TODO: observability for these goroutine | ||
| // TODO: should we consider the stop ? | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we consider the stop? The goroutine fetches the JobInfo and keeps the copy of JobInfo in the memory not updating something. Once the program exits, all the copy are gone. |
||
| go func() { | ||
| for task := range w.taskQueue { | ||
| again := task() | ||
|

||
| if again { | ||
| // time.AfterFunc calls the function in its own goroutine once the interval has elapsed, | ||
| // so the worker moves on to the next task immediately; the send itself may block | ||
| // that timer goroutine while the queue is full. | ||
| time.AfterFunc(queryInterval, func() { | ||
| w.taskQueue <- task | ||
| }) | ||
|
Comment on lines
+63
to
+65
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This wouldn't block the current goroutine. |
||
| } | ||
| } | ||
| }() | ||
| } | ||
| } | ||
|
|
||
| // PutTask queues task for execution by a worker. The send blocks the caller while | ||
| // the buffered queue is full. | ||
| func (w *workerPool) PutTask(task Task) { | ||
| w.taskQueue <- task | ||
| } | ||
|
|
||
| // Compile-time check that *RayDashboardCacheClient implements RayDashboardClientInterface. | ||
| var _ RayDashboardClientInterface = (*RayDashboardCacheClient)(nil) | ||
|

||
| // RayDashboardCacheClient wraps another dashboard client. GetJobInfo is served from the | ||
| // package-level cache; the other calls are forwarded to the wrapped client. | ||
| type RayDashboardCacheClient struct { | ||
| client RayDashboardClientInterface | ||
| } | ||
|
|
||
| // InitClient sets the wrapped client and, on the first call in the process, initializes | ||
| // the shared worker pool and the shared job info cache together with a goroutine that | ||
| // periodically removes expired cache entries. Later calls only replace r.client. | ||
| func (r *RayDashboardCacheClient) InitClient(client RayDashboardClientInterface) { | ||
| initWorkPool.Do(func() { | ||
| pool.init(taskQueueSize, workerSize, queryInterval) | ||
| }) | ||
|

||
| initCacheStorage.Do(func() { | ||
| if cacheStorage == nil { | ||
| // the New() returns error only if the size is less or equal than zero. | ||
| cacheStorage, _ = lru.New[string, *JobInfoCache](cacheSize) | ||
| } | ||
|

||
| // expiry cache cleanup | ||
| go func() { | ||
| ticker := time.NewTicker(queryInterval * 10) | ||
| defer ticker.Stop() | ||
|

||
| // TODO: observability ? | ||
| // TODO: should we consider the stop ? | ||
| // The loop has no exit path, so this goroutine runs until the process exits. | ||
| for range ticker.C { | ||
| keys := cacheStorage.Keys() | ||
| expiredThreshold := time.Now().Add(-cacheExpiry) | ||
| for _, key := range keys { | ||
| if cached, ok := cacheStorage.Peek(key); ok { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Peek wouldn't update the recent-ness of cache. |
||
| // Remove entries whose last update is older than cacheExpiry. | ||
| if cached.UpdateAt.Before(expiredThreshold) { | ||
| cacheStorage.Remove(key) | ||
| } | ||
| } | ||
| } | ||
| } | ||
| }() | ||
| }) | ||
|

||
| r.client = client | ||
| } | ||
|
|
||
| // UpdateDeployments forwards the call unchanged to the wrapped client; nothing is cached. | ||
| func (r *RayDashboardCacheClient) UpdateDeployments(ctx context.Context, configJson []byte) error { | ||
| return r.client.UpdateDeployments(ctx, configJson) | ||
| } | ||
|
|
||
| // GetServeDetails forwards the call unchanged to the wrapped client; nothing is cached. | ||
| func (r *RayDashboardCacheClient) GetServeDetails(ctx context.Context) (*utiltypes.ServeDetails, error) { | ||
| return r.client.GetServeDetails(ctx) | ||
| } | ||
|
|
||
| // GetMultiApplicationStatus forwards the call unchanged to the wrapped client; nothing is cached. | ||
| func (r *RayDashboardCacheClient) GetMultiApplicationStatus(ctx context.Context) (map[string]*utiltypes.ServeApplicationStatus, error) { | ||
| return r.client.GetMultiApplicationStatus(ctx) | ||
| } | ||
|
|
||
| // GetJobInfo returns the cached info for jobId without querying the dashboard on the | ||
| // caller's goroutine. On a cache miss it stores a placeholder carrying ErrAgain, queues a | ||
| // background task that fetches the job info and keeps refreshing it until the job reaches | ||
| // a terminal status, and returns (nil, ErrAgain). | ||
| // | ||
| // NOTE(review): the background task keeps using the ctx of the call that queued it; if that | ||
| // ctx is cancelled after the caller returns, every later refresh will fail — confirm its lifetime. | ||
| func (r *RayDashboardCacheClient) GetJobInfo(ctx context.Context, jobId string) (*utiltypes.RayJobInfo, error) { | ||
| if cached, ok := cacheStorage.Get(jobId); ok { | ||
| return cached.JobInfo, cached.Err | ||
| } | ||
| currentTime := time.Now() | ||
| placeholder := &JobInfoCache{Err: ErrAgain, UpdateAt: &currentTime} | ||
|

||
| // Put a placeholder in storage. The cache will be updated only if the placeholder exists. | ||
| // The placeholder will be removed when StopJob or DeleteJob. | ||
| if cached, existed, _ := cacheStorage.PeekOrAdd(jobId, placeholder); existed { | ||
| return cached.JobInfo, cached.Err | ||
| } | ||
|

||
| task := func() bool { | ||
| // Stop polling once the entry has been removed from the cache. | ||
| if _, existed := cacheStorage.Get(jobId); !existed { | ||
| return false | ||
| } | ||
|

||
| jobInfo, err := r.client.GetJobInfo(ctx, jobId) | ||
| updatedAt := time.Now() | ||
| // Publish a fresh entry instead of mutating the cached one in place: readers and the | ||
| // cleanup goroutine hold the same pointer without any lock. | ||
| refreshed := &JobInfoCache{JobInfo: jobInfo, Err: err, UpdateAt: &updatedAt} | ||
|

||
| // Refresh only while the entry still exists, so a removed entry is not added back. | ||
| // ContainsOrAdd is not used here: its second return value is "evicted", not "contained". | ||
| // Best effort: the check and the Add are not atomic. | ||
| if !cacheStorage.Contains(jobId) { | ||
| return false | ||
| } | ||
| cacheStorage.Add(jobId, refreshed) | ||
|

||
| // A failed fetch leaves no job info to inspect; keep polling instead of dereferencing nil. | ||
| if err != nil \|\| jobInfo == nil { | ||
| return true | ||
| } | ||
|

||
| return !rayv1.IsJobTerminal(jobInfo.JobStatus) | ||
| } | ||
|

||
| pool.PutTask(task) | ||
|

||
| return nil, ErrAgain | ||
| } | ||
|
|
||
| // ListJobs forwards the call unchanged to the wrapped client; the job info cache is not consulted. | ||
| func (r *RayDashboardCacheClient) ListJobs(ctx context.Context) (*[]utiltypes.RayJobInfo, error) { | ||
| return r.client.ListJobs(ctx) | ||
| } | ||
|
|
||
| // SubmitJob forwards the call unchanged to the wrapped client; nothing is cached. | ||
| func (r *RayDashboardCacheClient) SubmitJob(ctx context.Context, rayJob *rayv1.RayJob) (string, error) { | ||
| return r.client.SubmitJob(ctx, rayJob) | ||
| } | ||
|
|
||
| // SubmitJobReq forwards the call unchanged to the wrapped client; nothing is cached. | ||
| func (r *RayDashboardCacheClient) SubmitJobReq(ctx context.Context, request *utiltypes.RayJobRequest) (string, error) { | ||
| return r.client.SubmitJobReq(ctx, request) | ||
| } | ||
|
|
||
| // GetJobLog forwards the call unchanged to the wrapped client; nothing is cached. | ||
| func (r *RayDashboardCacheClient) GetJobLog(ctx context.Context, jobName string) (*string, error) { | ||
| return r.client.GetJobLog(ctx, jobName) | ||
| } | ||
|
|
||
| // StopJob removes the cache entry stored under jobName and then forwards the stop request | ||
| // to the wrapped client. The entry is removed before the request is sent, so it is gone | ||
| // even when the wrapped call returns an error. | ||
| // NOTE(review): assumes jobName is the same identifier GetJobInfo receives as jobId — confirm. | ||
| func (r *RayDashboardCacheClient) StopJob(ctx context.Context, jobName string) error { | ||
| cacheStorage.Remove(jobName) | ||
| return r.client.StopJob(ctx, jobName) | ||
| } | ||
|
|
||
| // DeleteJob removes the cache entry stored under jobName and then forwards the delete | ||
| // request to the wrapped client. The entry is removed before the request is sent, so it | ||
| // is gone even when the wrapped call returns an error. | ||
| // NOTE(review): assumes jobName is the same identifier GetJobInfo receives as jobId — confirm. | ||
| func (r *RayDashboardCacheClient) DeleteJob(ctx context.Context, jobName string) error { | ||
| cacheStorage.Remove(jobName) | ||
| return r.client.DeleteJob(ctx, jobName) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,7 +27,6 @@ var ( | |
| ) | ||
|
|
||
| type RayDashboardClientInterface interface { | ||
| InitClient(client *http.Client, dashboardURL string) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove this method from the interface because different implementations might take different input arguments. |
||
| UpdateDeployments(ctx context.Context, configJson []byte) error | ||
| // V2/multi-app Rest API | ||
| GetServeDetails(ctx context.Context) (*utiltypes.ServeDetails, error) | ||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it okay to call StopJob to remove the cache placeholder before deleting the RayCluster because the status of retry calls deleteClusterResources?