Skip to content

Commit

Permalink
Adds support for jaccard_coefficient (#62)
Browse files Browse the repository at this point in the history
Adds support for `jaccard_coefficient` to nx-cugraph.

This includes a test, but relies largely on the existing test coverage provided by NetworkX. Because the included test does not cover anything unique to nx-cugraph, it could (and should) be submitted to NetworkX in a separate PR.

A benchmark is also included, with results showing a 2-4X speedup. I've seen a much larger speedup on a different graph (a large movie-review bipartite graph: 966s for NX vs. 2s for nx-cugraph, ~500X), so I need to investigate further. That investigation need not block this PR from being merged now.

![image](https://github.com/user-attachments/assets/3ceb7d62-50c4-437e-96d2-0ab452dd39d2)

Authors:
  - Rick Ratzel (https://github.com/rlratzel)

Approvers:
  - Ralph Liu (https://github.com/nv-rliu)
  - Erik Welch (https://github.com/eriknw)

URL: #62
  • Loading branch information
rlratzel authored Jan 30, 2025
1 parent 8427a4d commit 8f2ee66
Show file tree
Hide file tree
Showing 7 changed files with 168 additions and 3 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ Below is the list of algorithms that are currently supported in nx-cugraph.
│ └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_analysis.hits_alg.hits.html#networkx.algorithms.link_analysis.hits_alg.hits">hits</a>
└─ <a href="https://networkx.org/documentation/stable/reference/algorithms/link_analysis.html#module-networkx.algorithms.link_analysis.pagerank_alg">pagerank_alg</a>
└─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank">pagerank</a>
<a href="https://networkx.org/documentation/stable/reference/algorithms/link_prediction.html#module-networkx.algorithms.link_prediction">link_prediction</a>
└─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_prediction.jaccard_coefficient.html#networkx.algorithms.link_prediction.jaccard_coefficient">jaccard_coefficient</a>
<a href="https://networkx.org/documentation/stable/reference/algorithms/operators.html">operators</a>
└─ <a href="https://networkx.org/documentation/stable/reference/algorithms/operators.html#module-networkx.algorithms.operators.unary">unary</a>
├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.operators.unary.complement.html#networkx.algorithms.operators.unary.complement">complement</a>
Expand Down
1 change: 1 addition & 0 deletions _nx_cugraph/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@
"is_tree",
"is_weakly_connected",
"isolates",
"jaccard_coefficient",
"k_truss",
"karate_club_graph",
"katz_centrality",
Expand Down
35 changes: 34 additions & 1 deletion benchmarks/pytest-based/bench_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,11 @@ def get_graph_obj_for_benchmark(graph_obj, backend_wrapper):
"""
G = graph_obj
if backend_wrapper.backend_name == "cugraph-preconverted":
G = nxcg.from_networkx(G, preserve_all_attrs=True)
G = nxcg.from_networkx(
G,
preserve_all_attrs=True,
use_compat_graph=True,
)
return G


Expand Down Expand Up @@ -919,6 +923,35 @@ def bench_bipartite_BC_n1000_m3000_k100000(benchmark, backend_wrapper):
assert type(result) is dict


def bench_jaccard(benchmark, graph_obj, backend_wrapper):
    """Benchmark ``nx.jaccard_coefficient`` restricted to an explicit ebunch."""
    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)

    # ebunch is a list of node pairs to limit the jaccard run: each of the
    # first three nodes is paired with every node that follows it in G's
    # node ordering.
    node_list = list(G.nodes)
    ebunch = []
    for idx in range(3):
        anchor = node_list[idx]
        ebunch.extend((anchor, other) for other in node_list[idx + 1 :])

    # DiGraphs are not supported, so benchmark the undirected version.
    if G.is_directed():
        G = G.to_undirected()

    timed_call = backend_wrapper(nx.jaccard_coefficient, force_unlazy_eval=True)
    result = benchmark.pedantic(
        target=timed_call,
        args=(G,),
        kwargs={"ebunch": ebunch},
        rounds=rounds,
        iterations=iterations,
        warmup_rounds=warmup_rounds,
    )
    assert type(result) is list


@pytest.mark.skip(reason="benchmark not implemented")
def bench_complete_bipartite_graph(benchmark, graph_obj, backend_wrapper):
pass
Expand Down
4 changes: 3 additions & 1 deletion nx_cugraph/algorithms/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -17,6 +17,7 @@
community,
components,
link_analysis,
link_prediction,
operators,
shortest_paths,
traversal,
Expand All @@ -30,6 +31,7 @@
from .dag import *
from .isolate import *
from .link_analysis import *
from .link_prediction import *
from .operators import *
from .reciprocity import *
from .shortest_paths import *
Expand Down
68 changes: 68 additions & 0 deletions nx_cugraph/algorithms/link_prediction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import cupy as cp
import networkx as nx
import pylibcugraph as plc

from nx_cugraph.convert import _to_undirected_graph
from nx_cugraph.utils import index_dtype, networkx_algorithm, not_implemented_for

__all__ = [
"jaccard_coefficient",
]


@not_implemented_for("directed")
@not_implemented_for("multigraph")
@networkx_algorithm(version_added="25.02", _plc="jaccard_coefficients")
def jaccard_coefficient(G, ebunch=None):
    # Compute Jaccard coefficients for node pairs of an undirected graph.
    #
    # Parameters: G is the graph; ebunch is an optional iterable of (u, v)
    # node pairs. When ebunch is None, all non-edges of G are used.
    # Returns an iterator of (u, v, p) tuples, where p is the coefficient.
    # Raises nx.NodeNotFound if ebunch references a node that is not in G.
    #
    # NOTE(review): this is documented with comments rather than a docstring
    # in case the networkx_algorithm decorator consumes __doc__ - confirm.
    G = _to_undirected_graph(G)

    # If ebunch is not specified, create pairs representing all non-edges.
    # This can be an extremely large set and is not realistic for large graphs,
    # but this is required for NX compatibility.
    if ebunch is None:
        # Mask of pairs to exclude: the lower triangle including the diagonal
        # plus every existing edge. What remains unset is each non-adjacent
        # pair, listed once.
        A = cp.tri(G._N, G._N, dtype=bool)
        A[G.src_indices, G.dst_indices] = True
        u_indices, v_indices = cp.nonzero(~A)
        if u_indices.size == 0:
            return iter([])
        u_indices = u_indices.astype(index_dtype)
        v_indices = v_indices.astype(index_dtype)

    else:
        # Materialize ebunch so an empty one can be detected. Unpacking
        # zip(*[]) into (u, v) would otherwise raise ValueError, whereas an
        # empty ebunch simply has no pairs to score.
        pairs = list(ebunch)
        if not pairs:
            return iter([])
        (u, v) = zip(*pairs)
        try:
            # Convert the ebunch lists to cupy arrays for passing to PLC, possibly
            # mapping to integers if the Graph was renumbered.
            # Allow the Graph renumber lookup (if renumbering was done) to check
            # for invalid node IDs in ebunch.
            u_indices = G._list_to_nodearray(u)
            v_indices = G._list_to_nodearray(v)
        except (KeyError, ValueError) as err:
            # Chain explicitly so the original lookup failure stays attached.
            raise nx.NodeNotFound(f"Node {err} not in G.") from err

    (u, v, p) = plc.jaccard_coefficients(
        resource_handle=plc.ResourceHandle(),
        graph=G._get_plc_graph(),
        first=u_indices,
        second=v_indices,
        use_weight=False,
        do_expensive_check=False,
    )

    # Map the result indices back to node keys and move scores to the host.
    u = G._nodearray_to_list(u)
    v = G._nodearray_to_list(v)
    p = p.tolist()

    return zip(u, v, p)
16 changes: 15 additions & 1 deletion nx_cugraph/classes/graph.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand Down Expand Up @@ -1009,6 +1009,11 @@ def _get_plc_graph(
dst_indices = self.dst_indices
if switch_indices:
src_indices, dst_indices = dst_indices, src_indices

# FIXME: the SGGraph constructor arg "symmetrize" will perform all
# symmetrization steps required by libcugraph. The edge_array check
# should be kept, but all other code in this `if` block should be
# removed if possible.
if symmetrize is not None:
if edge_array is not None:
raise NotImplementedError(
Expand Down Expand Up @@ -1147,6 +1152,15 @@ def _nodearray_to_list(self, node_ids: cp.ndarray[IndexValue]) -> list[NodeKey]:
def _list_to_nodearray(self, nodes: list[NodeKey]) -> cp.ndarray[IndexValue]:
if (key_to_id := self.key_to_id) is not None:
nodes = [key_to_id[node] for node in nodes]
else:
N = self._N
for node in nodes:
try:
n = int(node)
except (TypeError, ValueError):
raise KeyError(node) from None
if n != node or n < 0 or n >= N:
raise KeyError(node)
return cp.array(nodes, dtype=index_dtype)

def _nodearray_to_set(self, node_ids: cp.ndarray[IndexValue]) -> set[NodeKey]:
Expand Down
45 changes: 45 additions & 0 deletions nx_cugraph/tests/test_link_prediction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections.abc import Iterable

import networkx as nx
import pytest

# The tests in this file cover use cases unique to nx-cugraph. If the coverage
# here is not unique to nx-cugraph, consider moving those tests to the NetworkX
# project.


def test_no_nonexistent_edges_no_ebunch():
    """Test the no-ebunch case on a fully connected graph.

    A complete graph has no nonexistent edges, so the function must return
    iter([]) or an equivalent empty iterator.
    """
    complete = nx.complete_graph(5)
    coefficients = nx.jaccard_coefficient(complete)
    assert isinstance(coefficients, Iterable)
    # The very first next() must signal exhaustion.
    with pytest.raises(StopIteration):
        next(coefficients)


def test_node_not_found_in_ebunch():
    """Test that every node in ebunch is validated.

    The function must raise NodeNotFound for each kind of invalid node in
    ebunch: a string, a tuple, and an integer that is not a node of G.
    """
    G = nx.Graph([(0, 1), (1, 2)])
    # (ebunch, expected error-message pattern), checked in order.
    cases = (
        ([("A", 1)], "Node [']*A[']* not in G."),
        ([(0, (1,))], r"Node \(1,\) not in G."),
        ([(0, 9999)], "Node 9999 not in G."),
    )
    for bad_ebunch, pattern in cases:
        with pytest.raises(nx.NodeNotFound, match=pattern):
            nx.jaccard_coefficient(G, bad_ebunch)

0 comments on commit 8f2ee66

Please sign in to comment.