Skip to content

Commit bccd615

Browse files
committed
add 3 example files
1 parent 50f4ddb commit bccd615

File tree

3 files changed

+509
-0
lines changed

3 files changed

+509
-0
lines changed

examples/clustering_adbscan.jl

Lines changed: 331 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,331 @@
1+
import ClusteringAPI
2+
import Clustering
3+
import Optim
4+
using Distances
5+
6+
#####################################################################################
7+
# Clustering API
8+
#####################################################################################
9+
# improved dbscan algorithm from Attractors.jl
10+
@kwdef struct ADBSCAN{R<:Union{Real, String}, M, F<:Function} <: ClusteringAlgorithm
11+
clust_distance_metric::M = Euclidean()
12+
min_neighbors::Int = 10
13+
rescale_features::Bool = true
14+
optimal_radius_method::R = "silhouettes_optim"
15+
num_attempts_radius::Int = 100
16+
silhouette_statistic::F = mean
17+
max_used_features::Int = 0
18+
use_mmap::Bool = false
19+
end
20+
21+
function ClusteringAPI.cluster(config::ADBSCAN, data)
22+
d, m = input_data_size(data) # dimension = rows, amount = columns
23+
if d m
24+
throw(ArgumentError("""
25+
Not enough data points. The algorithm needs more data points than data dimensions.
26+
"""))
27+
end
28+
if config.rescale_features
29+
data = _rescale_to_01(data)
30+
end
31+
ϵ_optimal, v_optimal = _extract_ϵ_optimal(data, config)
32+
distances = _distance_matrix(data, config)
33+
labels = _cluster_distances_into_labels(distances, ϵ_optimal, config.min_neighbors)
34+
return ADBSCANResult(ca, labels, ϵ_optimal, v_optimal, data)
35+
end
36+
37+
struct ADBSCANResult
38+
algorithm::ADBSCAN
39+
labels::Vector{Int}
40+
ε_optimal
41+
v_optimal
42+
data
43+
end
44+
45+
ClusteringAPI.cluster_labels(r::ADBSCANResult) = r.labels
46+
47+
#####################################################################################
48+
# Source code
49+
#####################################################################################
50+
function _rescale_to_01(features)
51+
mini, maxi = _minmaxima(features)
52+
rescaled = map(f -> (f .- mini) ./ (maxi .- mini), each_data_point(features))
53+
# TODO: This doesn't respect matrix input
54+
return rescaled # ensure it stays the same type
55+
end
56+
57+
function _minmaxima(features)
58+
points = each_data_point(features)
59+
D = length(first(points))
60+
mi = fill(Inf, D)
61+
ma = fill(-Inf, D)
62+
for point in points
63+
for i in 1:D
64+
if point[i] > ma[i]
65+
ma[i] = point[i]
66+
elseif point[i] < mi[i]
67+
mi[i] = point[i]
68+
end
69+
end
70+
end
71+
return mi, ma
72+
end
73+
74+
function _distance_matrix(data, config::ADBSCAN)
75+
metric = config.clust_distance_metric
76+
L = input_data_size(data)[2]
77+
if config.use_mmap
78+
pth, s = mktemp()
79+
dists = Mmap.mmap(s, Matrix{Float32}, (L, L))
80+
else
81+
dists = zeros(L, L)
82+
end
83+
if metric isa Metric # then the `pairwise` function is valid
84+
# ensure that we give the vector of static vectors to pairwise!
85+
pairwise!(metric, dists, data; symmetric = true)
86+
else # it is any arbitrary distance function, e.g., used in aggregating attractors
87+
points = each_data_point(data)
88+
@inbounds for i in eachindex(points)
89+
Threads.@threads for j in i:length(points)
90+
v = metric(points[i], points[j])
91+
dists[i, j] = dists[j, i] = v # utilize symmetry
92+
end
93+
end
94+
end
95+
return dists
96+
end
97+
98+
function _extract_ϵ_optimal(features, config::ADBSCAN)
99+
(; min_neighbors, clust_distance_metric, optimal_radius_method,
100+
num_attempts_radius, silhouette_statistic, max_used_features) = config
101+
m = input_data_size(features)[2]
102+
103+
if optimal_radius_method isa String
104+
if max_used_features == 0 || max_used_features > m
105+
features_for_optimal = each_data_point(features)
106+
else
107+
# subsample features to accelerate optimal radius search
108+
features_for_optimal = sample(each_data_point(features), max_used_features; replace = false)
109+
end
110+
# get optimal radius (function dispatches on the radius method)
111+
ϵ_optimal, v_optimal = optimal_radius_dbscan(
112+
features_for_optimal, min_neighbors, clust_distance_metric,
113+
optimal_radius_method, num_attempts_radius, silhouette_statistic
114+
)
115+
elseif optimal_radius_method isa Real
116+
ϵ_optimal = optimal_radius_method
117+
v_optimal = missing
118+
else
119+
error("Specified `optimal_radius_method` is incorrect. Please specify the radius
120+
directly as a `Real` number or the method to compute it as a `String`")
121+
end
122+
return ϵ_optimal, v_optimal
123+
end
124+
125+
# Already expecting the distance matrix, the output of `pairwise`
126+
function _cluster_distances_into_labels(distances, ϵ_optimal, min_neighbors)
127+
dbscanresult = Clustering.dbscan(distances, ϵ_optimal; min_neighbors, metric=nothing)
128+
cluster_labels = cluster_assignment(dbscanresult)
129+
return cluster_labels
130+
end
131+
132+
"""
133+
Util function for `classify_features`. Returns the assignment vector, in which the i-th
134+
component is the cluster index of the i-th feature.
135+
"""
136+
function cluster_assignment(clusters, data; include_boundary=true)
137+
assign = zeros(Int, size(data)[2])
138+
for (idx, cluster) in enumerate(clusters)
139+
assign[cluster.core_indices] .= idx
140+
if cluster.boundary_indices != []
141+
if include_boundary
142+
assign[cluster.boundary_indices] .= idx
143+
else
144+
assign[cluster.boundary_indices] .= -1
145+
end
146+
end
147+
end
148+
return assign
149+
end
150+
function cluster_assignment(dbscanresult::Clustering.DbscanResult)
151+
labels = dbscanresult.assignments
152+
# Attractors.jl follows the convention of `-1` to unclustered points.
153+
return replace!(labels, 0=>-1)
154+
end
155+
156+
#####################################################################################
157+
# Source code - optimal radius
158+
#####################################################################################
159+
function optimal_radius_dbscan(features, min_neighbors, metric, optimal_radius_method,
160+
num_attempts_radius, silhouette_statistic)
161+
if optimal_radius_method == "silhouettes"
162+
ϵ_optimal, v_optimal = optimal_radius_dbscan_silhouette(
163+
features, min_neighbors, metric, num_attempts_radius, silhouette_statistic
164+
)
165+
elseif optimal_radius_method == "silhouettes_optim"
166+
ϵ_optimal, v_optimal = optimal_radius_dbscan_silhouette_optim(
167+
features, min_neighbors, metric, num_attempts_radius, silhouette_statistic
168+
)
169+
elseif optimal_radius_method == "knee"
170+
ϵ_optimal, v_optimal = optimal_radius_dbscan_knee(features, min_neighbors, metric)
171+
elseif optimal_radius_method isa Real
172+
ϵ_optimal = optimal_radius_method
173+
v_optimal = NaN
174+
else
175+
error("Unknown `optimal_radius_method`.")
176+
end
177+
return ϵ_optimal, v_optimal
178+
end
179+
180+
"""
181+
Find the optimal radius ε of a point neighborhood to use in DBSCAN, the unsupervised
182+
clustering method for `AttractorsViaFeaturizing`. Iteratively search
183+
for the radius that leads to the best clustering, as characterized by quantifiers known as
184+
silhouettes. Does a linear (sequential) search.
185+
"""
186+
function optimal_radius_dbscan_silhouette(features, min_neighbors, metric,
187+
num_attempts_radius, silhouette_statistic
188+
)
189+
feat_ranges = features_ranges(features)
190+
ϵ_grid = range(
191+
minimum(feat_ranges)/num_attempts_radius, minimum(feat_ranges);
192+
length=num_attempts_radius
193+
)
194+
s_grid = zeros(size(ϵ_grid)) # silhouette statistic values (which we want to maximize)
195+
196+
# vary ϵ to find the best one (which will maximize the mean silhouette)
197+
dists = pairwise(metric, features)
198+
for i in eachindex(ϵ_grid)
199+
clusters = dbscan(dists, ϵ_grid[i]; min_neighbors, metric = nothing)
200+
sils = silhouettes_new(clusters, dists)
201+
s_grid[i] = silhouette_statistic(sils)
202+
end
203+
204+
optimal_val, idx = findmax(s_grid)
205+
ϵ_optimal = ϵ_grid[idx]
206+
return ϵ_optimal, optimal_val
207+
end
208+
209+
function features_ranges(features)
210+
d = StateSpaceSet(features) # zero cost if `features` is a `Vector{<:SVector}`
211+
mini, maxi = minmaxima(d)
212+
return maxi .- mini
213+
end
214+
215+
"""
216+
Same as `optimal_radius_dbscan_silhouette`,
217+
but find minimum via optimization with Optim.jl.
218+
"""
219+
function optimal_radius_dbscan_silhouette_optim(
220+
features, min_neighbors, metric, num_attempts_radius, silhouette_statistic
221+
)
222+
feat_ranges = features_ranges(features)
223+
# vary ϵ to find the best radius (which will maximize the mean silhouette)
224+
dists = pairwise(metric, features)
225+
f = (ϵ) -> silhouettes_from_distances(
226+
ϵ, dists, min_neighbors, silhouette_statistic
227+
)
228+
opt = Optim.optimize(
229+
f, minimum(feat_ranges)/100, minimum(feat_ranges); iterations=num_attempts_radius
230+
)
231+
ϵ_optimal = Optim.minimizer(opt)
232+
optimal_val = -Optim.minimum(opt) # we minimize using `-`
233+
return ϵ_optimal, optimal_val
234+
end
235+
236+
function silhouettes_from_distances(ϵ, dists, min_neighbors, silhouette_statistic=mean)
237+
clusters = Clustering.dbscan(dists, ϵ; min_neighbors, metric = nothing)
238+
sils = silhouettes_new(clusters, dists)
239+
# We return minus here because Optim finds minimum; we want maximum
240+
return -silhouette_statistic(sils)
241+
end
242+
243+
"""
244+
Find the optimal radius ϵ of a point neighborhood for use in DBSCAN through the elbow method
245+
(knee method, highest derivative method).
246+
"""
247+
function optimal_radius_dbscan_knee(_features::Vector, min_neighbors, metric)
248+
features = StateSpaceSet(_features)
249+
tree = searchstructure(KDTree, features, metric)
250+
# Get distances, excluding distance to self (hence the Theiler window)
251+
d, n = size(features)
252+
features_vec = [features[:,j] for j=1:n]
253+
_, distances = bulksearch(tree, features_vec, NeighborNumber(min_neighbors), Theiler(0))
254+
meandistances = map(mean, distances)
255+
sort!(meandistances)
256+
maxdiff, idx = findmax(diff(meandistances))
257+
ϵ_optimal = meandistances[idx]
258+
return ϵ_optimal, maxdiff
259+
end
260+
261+
262+
# The following function is left here for reference. It is not used anywhere in the code.
263+
# It is the original implementation we have written based on the bSTAB paper.
264+
"""
265+
Find the optimal radius ε of a point neighborhood to use in DBSCAN, the unsupervised clustering
266+
method for `AttractorsViaFeaturizing`. The basic idea is to iteratively search for the radius that
267+
leads to the best clustering, as characterized by quantifiers known as silhouettes.
268+
"""
269+
function optimal_radius_dbscan_silhouette_original(features, min_neighbors, metric; num_attempts_radius=200)
270+
d,n = size(features)
271+
feat_ranges = maximum(features, dims = d)[:,1] .- minimum(features, dims = d)[:,1];
272+
ϵ_grid = range(minimum(feat_ranges)/num_attempts_radius, minimum(feat_ranges), length=num_attempts_radius)
273+
s_grid = zeros(size(ϵ_grid)) # average silhouette values (which we want to maximize)
274+
275+
# vary ϵ to find the best one (which will maximize the minimum silhouette)
276+
for i in eachindex(ϵ_grid)
277+
clusters = Clustering.dbscan(features, ϵ_grid[i]; min_neighbors)
278+
dists = pairwise(metric, features)
279+
class_labels = cluster_assignment(clusters, features)
280+
if length(clusters) 1 # silhouette undefined if only one cluster
281+
sils = Clustering.silhouettes(class_labels, dists)
282+
s_grid[i] = minimum(sils)
283+
else
284+
s_grid[i] = 0; # considers single-cluster solution on the midpoint (following Wikipedia)
285+
end
286+
end
287+
288+
max, idx = findmax(s_grid)
289+
ϵ_optimal = ϵ_grid[idx]
290+
end
291+
292+
293+
#####################################################################################
294+
# Calculate Silhouettes
295+
#####################################################################################
296+
"""
297+
Calculate silhouettes. A bit slower than the implementation in `Clustering.jl` but seems
298+
to be more robust. The latter seems to be incorrect in some cases.
299+
"""
300+
function silhouettes_new(dbscanresult::Clustering.DbscanResult, dists::AbstractMatrix)
301+
labels = dbscanresult.assignments
302+
clusters = [findall(x->x==i, labels) for i=1:maximum(labels)] #all clusters
303+
if length(clusters) == 1 return zeros(length(clusters[1])) end #all points in the same cluster -> sil = 0
304+
sils = zeros(length(labels))
305+
outsideclusters = findall(x->x==0, labels)
306+
for (idx_c, cluster) in enumerate(clusters)
307+
@inbounds for i in cluster
308+
a = sum(@view dists[i, cluster])/(length(cluster)-1) #dists should be organized s.t. dist[i, cluster] i= dist from i to idxs in cluster
309+
b = _calcb!(i, idx_c, dists, clusters, outsideclusters)
310+
sils[i] = (b-a)/(max(a,b))
311+
end
312+
end
313+
return sils
314+
end
315+
316+
function _calcb!(i, idx_c_i, dists, clusters, outsideclusters)
317+
min_dist_to_clstr = typemax(eltype(dists))
318+
for (idx_c, cluster) in enumerate(clusters)
319+
idx_c == idx_c_i && continue
320+
dist_to_clstr = mean(@view dists[cluster,i]) #mean distance to other clusters
321+
if dist_to_clstr < min_dist_to_clstr min_dist_to_clstr = dist_to_clstr end
322+
end
323+
min_dist_to_pts = typemax(eltype(dists))
324+
for point in outsideclusters
325+
dist_to_pts = dists[point, i] # distance to points outside clusters
326+
if dist_to_pts < min_dist_to_pts
327+
min_dist_to_pts = dist_to_pts
328+
end
329+
end
330+
return min(min_dist_to_clstr, min_dist_to_pts)
331+
end

examples/clustering_hclust_auto.jl

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import ClusteringAPI
2+
import Clustering
3+
4+
5+
# The automatic H-clustering which decides the number of clusters
6+
# by splitting at the largest gap in the dendrogram
7+
struct HClustAuto <: ClusteringAlgorithm end
8+
9+
struct HClustAutoResults
10+
labels::Vector{Int}
11+
hclust
12+
splitidx
13+
end
14+
15+
function ClusteringAPI.cluster(::HClustAuto, X)
16+
input_data_size(X)[2] < 3000 || throw(ArgumentError("size more than 3000"))
17+
hc = Clustering.hclust(pairwise(Euclidean(), X))
18+
# Split at the largest gap in the dendrogram
19+
idx = argmax(diff(hc.heights))
20+
labels = Clustering.cutree(hc; h=mean(hc.heights[idx:idx+1]))
21+
return HClustAutoResults(labels, hc, idx)
22+
end

0 commit comments

Comments
 (0)