Skip to content

Commit 752d9cd

Browse files
[core] report driver stat and add test (#58045)
Signed-off-by: tianyi-ge <[email protected]>
Signed-off-by: Tianyi <[email protected]>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent d05ca1c commit 752d9cd

File tree

3 files changed: 4 additions (+4) and 37 deletions (-37)

python/ray/dashboard/modules/reporter/reporter_agent.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1312,8 +1312,8 @@ def generate_worker_stats_record(self, worker_stats: List[dict]) -> List[Record]
13121312

13131313
for stat in worker_stats:
13141314
cmdline = stat.get("cmdline")
1315-
# All ray processes start with ray::
1316-
if cmdline and len(cmdline) > 0 and cmdline[0].startswith("ray::"):
1315+
# collect both worker and driver stats
1316+
if cmdline:
13171317
proc_name = cmdline[0]
13181318
proc_name_to_stats[proc_name].append(stat)
13191319

@@ -1323,9 +1323,6 @@ def generate_worker_stats_record(self, worker_stats: List[dict]) -> List[Record]
13231323
or stat.get("gpu_utilization", 0) > 0
13241324
):
13251325
gpu_worker_proc_names.add(proc_name)
1326-
# We will lose worker stats that don't follow the ray worker proc
1327-
# naming convention. Theoretically, there should be no data loss here
1328-
# because all worker processes are renamed to ray::.
13291326

13301327
records = []
13311328

python/ray/dashboard/modules/reporter/tests/test_reporter.py

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -844,37 +844,6 @@ def verify_metrics_values(
844844
0,
845845
)
846846

847-
"""
848-
Verify worker names are only reported when they start with ray::.
849-
"""
850-
# Verify if the command doesn't start with ray::, metrics are not reported.
851-
unknown_stats = {
852-
"memory_info": Bunch(rss=55934976, vms=7026937856, pfaults=15354, pageins=0),
853-
"memory_full_info": Bunch(
854-
uss=51428381, rss=55934976, vms=7026937856, pfaults=15354, pageins=0
855-
),
856-
"cpu_percent": 6.0,
857-
"num_fds": 8,
858-
"cmdline": ["python mock", "", "", "", "", "", "", "", "", "", "", ""],
859-
"create_time": 1614826391.338613,
860-
"pid": 7175,
861-
"cpu_times": Bunch(
862-
user=0.607899328,
863-
system=0.274044032,
864-
children_user=0.0,
865-
children_system=0.0,
866-
),
867-
}
868-
test_stats["workers"] = [idle_stats, unknown_stats]
869-
870-
records = agent._to_records(test_stats, cluster_stats)
871-
uss_records, cpu_records, num_fds_records = get_uss_and_cpu_and_num_fds_records(
872-
records
873-
)
874-
assert "python mock" not in uss_records
875-
assert "python mock" not in cpu_records
876-
assert "python mock" not in num_fds_records
877-
878847
stats_payload = agent._generate_stats_payload(test_stats)
879848
assert stats_payload is not None
880849
assert isinstance(stats_payload, str)

python/ray/tests/test_metrics_agent.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -439,7 +439,7 @@ def verify_node_metrics():
439439
samples = avail_metrics[metric]
440440
for sample in samples:
441441
components.add(sample.labels["Component"])
442-
assert components == {"gcs", "raylet", "agent", "ray::IDLE"}
442+
assert components == {"gcs", "raylet", "agent", "ray::IDLE", sys.executable}
443443

444444
avail_metrics = set(avail_metrics)
445445

@@ -886,6 +886,7 @@ def verify_components():
886886
components.add(sample.labels["Component"])
887887
print(components)
888888
assert {
889+
sys.executable, # driver process
889890
"raylet",
890891
"agent",
891892
"ray::Actor",

0 commit comments

Comments
 (0)