
Commit 648c074

Qiaolin-Yu, dayshah, and stephanie-wang committed
[doc][rdt] Add the limitations of rdt (ray-project#58063)
Signed-off-by: Dhyey Shah <[email protected]>
Signed-off-by: Qiaolin-Yu <[email protected]>
Signed-off-by: Qiaolin Yu <[email protected]>
Co-authored-by: Dhyey Shah <[email protected]>
Co-authored-by: Stephanie Wang <[email protected]>
1 parent 92ad009 commit 648c074

File tree

4 files changed: +51 -0 lines changed

doc/source/ray-core/direct-transport.rst

Lines changed: 7 additions & 0 deletions
@@ -298,6 +298,13 @@ For collective-based tensor transports (Gloo and NCCL):
 * Any unexpected system bugs


+Due to a known issue, RDT currently doesn't support repeated transfers of tensors that share the same memory but belong to different objects at the same time. To use this pattern, make sure that all references to the first object are freed before storing the same tensor again in a second object.
+
+.. literalinclude:: doc_code/direct_transport_nixl.py
+    :language: python
+    :start-after: __nixl_limitations_start__
+    :end-before: __nixl_limitations_end__
+
 Advanced: RDT Internals
 =======================

doc/source/ray-core/doc_code/direct_transport_nixl.py

Lines changed: 33 additions & 0 deletions
@@ -56,3 +56,36 @@ def consume_with_nixl(self, refs):
 ref1 = receiver.consume_with_nixl.remote(refs)
 print(ray.get(ref1))
 # __nixl_put__and_get_end__
+
+
+# __nixl_limitations_start__
+@ray.remote(num_gpus=1)
+class Actor:
+    def __init__(self):
+        self.tensor1 = torch.tensor([1, 2, 3])
+        self.tensor2 = torch.tensor([4, 5, 6])
+        self.tensor3 = torch.tensor([7, 8, 9])
+
+    @ray.method(tensor_transport="nixl")
+    def send_dict1(self):
+        return {"round1-1": self.tensor1, "round1-2": self.tensor2}
+
+    @ray.method(tensor_transport="nixl")
+    def send_dict2(self):
+        return {"round2-1": self.tensor1, "round2-3": self.tensor3}
+
+    def sum_dict(self, dict):
+        return sum(v.sum().item() for v in dict.values())
+
+
+sender, receiver = Actor.remote(), Actor.remote()
+ref1 = sender.send_dict1.remote()
+result1 = receiver.sum_dict.remote(ref1)
+print(ray.get(result1))
+ref2 = sender.send_dict2.remote()
+result2 = receiver.sum_dict.remote(ref2)
+try:
+    print(ray.get(result2))
+except ValueError as e:
+    print("Error caught:", e)
+# __nixl_limitations_end__
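
For reference, here is a minimal sketch of the workaround the new docs prescribe: release every reference to the first object before storing the same tensor in a second one. This sketch is not part of the commit; it reuses the Actor class above and assumes that dropping the last ObjectRef with del lets Ray free the first RDT object before the next transfer (freeing is asynchronous, so a real program may need to wait for it to complete).

# Hypothetical workaround sketch (not part of this commit).
sender, receiver = Actor.remote(), Actor.remote()

ref1 = sender.send_dict1.remote()
print(ray.get(receiver.sum_dict.remote(ref1)))  # consume the first transfer

# Drop the last reference so Ray can free the first object; until then,
# tensor1 still belongs to ref1's object and cannot be stored again.
del ref1

ref2 = sender.send_dict2.remote()  # tensor1 is no longer held by another object
print(ray.get(receiver.sum_dict.remote(ref2)))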

python/ray/experimental/gpu_object_manager/gpu_object_store.py

Lines changed: 5 additions & 0 deletions
@@ -202,6 +202,11 @@ def add_object(
             is_primary: Whether the GPU object is the primary copy.
         """
         with self._object_present_cv:
+            for tensor in gpu_object:
+                if tensor in self._tensor_to_object_ids:
+                    raise ValueError(
+                        f"Tensor already exists in the RDT object store. Free all references to ObjectRef({obj_id}) before storing the tensor again."
+                    )
             for tensor in gpu_object:
                 self._tensor_to_object_ids[tensor].add(obj_id)
                 # Append to the queue instead of overwriting
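
To make the shape of this check concrete, below is a small, self-contained sketch of the tensor-to-object-ID bookkeeping it relies on. The TensorRegistry class and its free_object method are hypothetical illustrations, not Ray internals; the real store also synchronizes on _object_present_cv and keeps per-object queues, which this sketch omits.

from collections import defaultdict


class TensorRegistry:
    """Hypothetical sketch of RDT's tensor -> object-ID bookkeeping."""

    def __init__(self):
        # Each tensor (hashed by identity) maps to the set of object IDs
        # that currently hold it.
        self._tensor_to_object_ids = defaultdict(set)

    def add_object(self, obj_id, tensors):
        # Reject the whole object if any tensor is already registered,
        # mirroring the check added in add_object above.
        for tensor in tensors:
            if tensor in self._tensor_to_object_ids:
                raise ValueError(
                    f"Tensor already held by {self._tensor_to_object_ids[tensor]}"
                )
        for tensor in tensors:
            self._tensor_to_object_ids[tensor].add(obj_id)

    def free_object(self, obj_id, tensors):
        # A tensor leaves the registry once no object holds it; only then
        # can it be stored under a new object ID.
        for tensor in tensors:
            holders = self._tensor_to_object_ids.get(tensor, set())
            holders.discard(obj_id)
            if not holders:
                self._tensor_to_object_ids.pop(tensor, None)


registry = TensorRegistry()
tensor = object()  # stands in for a torch.Tensor
registry.add_object("obj1", [tensor])
registry.free_object("obj1", [tensor])
registry.add_object("obj2", [tensor])  # succeeds only after obj1 freed it

Note that the reject-then-record split matters: the check scans every tensor before any of them is recorded, so a failed add_object leaves the store unchanged.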

python/ray/tests/gpu_objects/test_gpu_objects_gloo.py

Lines changed: 6 additions & 0 deletions
@@ -809,6 +809,9 @@ def gc():
     assert not gpu_object_store.has_object(obj_id)


+@pytest.mark.skip(
+    reason="RDT currently doesn't support multiple objects containing the same tensor"
+)
 def test_wait_tensor_freed_double_tensor(ray_start_regular):
     """Unit test for ray.experimental.wait_tensor_freed when multiple objects
     contain the same tensor."""
@@ -848,6 +851,9 @@ def gc(obj_id):
     assert not gpu_object_store.has_object(obj_id2)


+@pytest.mark.skip(
+    reason="RDT currently doesn't support multiple objects containing the same tensor"
+)
 def test_send_back_and_dst_warning(ray_start_regular):
     # Test warning when object is sent back to the src actor and to dst actors
     world_size = 2
