Skip to content

Commit

Permalink
Code refactoring
Browse files: browse the repository at this point in the history
  • Loading branch information
phil85 committed Oct 18, 2024
1 parent fb4368b commit d9dcf1c
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 63 deletions.
69 changes: 11 additions & 58 deletions pccc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@


def choose_initial_cluster_centers(data, n_clusters, **kwargs):

# Get parameters
n_datapoints = data.shape[0]
init = kwargs.get('init', 'k-means++')
Expand All @@ -39,7 +38,6 @@ def choose_initial_cluster_centers(data, n_clusters, **kwargs):

def reposition_cluster_centers(data, n_clusters, labels, current_centers, weights, scl, confidence,
cluster_repositioning):

if cluster_repositioning == 'inertia':

# Get inertia per cluster
Expand Down Expand Up @@ -83,7 +81,6 @@ def reposition_cluster_centers(data, n_clusters, labels, current_centers, weight


def get_weighted_violations_per_cluster(n_clusters, labels, scl, confidence):

# Initialize violations per cluster
weighted_violations_per_cluster = np.zeros(n_clusters)

Expand Down Expand Up @@ -111,7 +108,6 @@ def get_weighted_violations_per_cluster(n_clusters, labels, scl, confidence):


def get_inertia_per_cluster(n_clusters, data, centers, labels, weights):

# Compute inertia for each cluster
inertia_per_object = ((data - centers[labels, :]) ** 2).sum(axis=1)

Expand All @@ -123,7 +119,6 @@ def get_inertia_per_cluster(n_clusters, data, centers, labels, weights):


def update_centers(data, centers, n_clusters, labels, weights, scl, confidence):

# Identify empty clusters
non_empty_clusters = np.unique(labels)
empty_clusters = np.setdiff1d(np.arange(n_clusters), non_empty_clusters)
Expand All @@ -143,7 +138,6 @@ def update_centers(data, centers, n_clusters, labels, weights, scl, confidence):
# Reposition empty clusters
position = -1
for i in empty_clusters:

# Get object from last cluster
candidates = np.where(labels == rank[position])[0]

Expand All @@ -165,7 +159,6 @@ def update_centers(data, centers, n_clusters, labels, weights, scl, confidence):


def preprocessing(data, ml, cl, sml, scl, sml_weights, scl_weights):

# If there are no hard must-link constraints, most of the preprocessing is skipped
if len(ml) == 0:
weights = np.ones(data.shape[0])
Expand Down Expand Up @@ -227,10 +220,7 @@ def preprocessing(data, ml, cl, sml, scl, sml_weights, scl_weights):
# Merge all weights associated with soft constraints into one pandas series named confidence
confidence, sml, scl = aggregate_confidence_values(scl, scl_weights, sml, sml_weights)

# Construct kdtree
kdt = KDTree(data)

return data, weights, mapping, cl, sml, scl, confidence, kdt
return data, weights, mapping, cl, sml, scl, confidence


def aggregate_confidence_values(scl, scl_weights, sml, sml_weights):
Expand Down Expand Up @@ -285,7 +275,6 @@ def my_callback(model, where):

def get_relevant_constraints_dynamic(constraints, n_clusters, n_neighbors, df_nearest_centers_unselected,
df_nearest_centers_selected, n_neighbors_selected, selected_idx, unselected_idx):

# Get number of cannot-link constraints
n_constraints = constraints.shape[0]
constraint_ids = np.arange(n_constraints)
Expand Down Expand Up @@ -346,8 +335,7 @@ def get_relevant_constraints_dynamic(constraints, n_clusters, n_neighbors, df_ne
return idx_constraints, idx_clusters, relevant_constraints


def assign_objects(data, centers, weights, ml, cl, sml, scl, confidence, kdt, log, labels=None, **kwargs):

def assign_objects(data, centers, weights, ml, cl, sml, scl, confidence, log, labels=None, **kwargs):
# Get parameters
n_representatives = data.shape[0]
n_clusters = centers.shape[0]
Expand Down Expand Up @@ -486,7 +474,6 @@ def assign_objects(data, centers, weights, ml, cl, sml, scl, confidence, kdt, lo
df_distances_unselected = pd.DataFrame(distances_unselected, index=unselected)
df_nearest_centers_unselected = pd.DataFrame(nearest_centers_unselected, index=unselected)


if increase_strategy == 'n_clusters':
n_neighbors_selected = n_clusters
elif increase_strategy.isnumeric():
Expand Down Expand Up @@ -553,47 +540,19 @@ def assign_objects(data, centers, weights, ml, cl, sml, scl, confidence, kdt, lo

# Add variables from unselected representatives
distances_unselected = {(i, df_nearest_centers_unselected.at[i, j]):
df_distances_unselected.at[i, j] * weights[i]
for i in unselected for j in range(n_neighbors)}
df_distances_unselected.at[i, j] * weights[i]
for i in unselected for j in range(n_neighbors)}

# Add variables from selected representatives
distances_selected = {(i, df_nearest_centers_selected.at[i, j]):
df_distances_selected.at[i, j] * weights[i]
for i in selected for j in range(n_neighbors_selected)}
df_distances_selected.at[i, j] * weights[i]
for i in selected for j in range(n_neighbors_selected)}

distances = {**distances_unselected, **distances_selected}
else:
distances = {(i, nearest_centers[i, j]): distances[i, j] * weights[i]
for i in range(n_representatives) for j in range(n_neighbors)}

# # Include empty clusters
# if dynamic_n_neighbors is not None and len(selected) > 0:
# covered_clusters = np.unique(np.concatenate((nearest_centers_selected.ravel(),
# nearest_centers_unselected.ravel())))
# else:
# covered_clusters = np.unique(nearest_centers)

# # Get empty clusters
# empty_clusters = np.setdiff1d(range(n_clusters), covered_clusters)
#
# if log_flag:
# iteration_log['empty_clusters'] = len(empty_clusters)
#
# if len(empty_clusters) > 0:
# # Determine the closest representatives for each empty cluster
# distances_to_representatives, nearest_representatives = kdt.query(centers[empty_clusters, :])
#
# if metric == 'squared_euclidean':
# distances_to_representatives = distances_to_representatives ** 2
#
# # Create dictionary
# keys = zip(nearest_representatives[:, 0], empty_clusters)
# values = distances_to_representatives[:, 0] * weights[nearest_representatives[:, 0]]
# distances_to_representatives = dict(zip(keys, values))
#
# # Merge dictionaries
# distances = {**distances, **distances_to_representatives}

# Add decision variables to model
x = m.addVars(distances.keys(), vtype=gb.GRB.BINARY, obj=distances)

Expand Down Expand Up @@ -685,9 +644,6 @@ def assign_objects(data, centers, weights, ml, cl, sml, scl, confidence, kdt, lo
# Each representative must be assigned to a cluster
m.addConstrs(x.sum(i, '*') == 1 for i in range(n_representatives))

# # Each cluster must have at least one representative
# m.addConstrs(x.sum('*', j) >= 1 for j in range(n_clusters))

# Set solver parameters
if verbose < 2:
m.setParam('OutputFlag', 0)
Expand Down Expand Up @@ -772,7 +728,6 @@ def get_relevant_constraints(constraints, n_clusters, n_neighbors, nearest_cente


def get_total_distance(data, centers, labels, weights, **kwargs):

metric = kwargs.get('metric', 'euclidean')

if metric == 'euclidean':
Expand Down Expand Up @@ -878,7 +833,6 @@ def initialize_log(log, X, cl, ml, scl, sml):


def pccc(X, n_clusters, ml=None, cl=None, sml=None, scl=None, sml_weights=None, scl_weights=None, **kwargs):

log_flag = kwargs.get('log_flag', False)
max_iter = kwargs.get('max_iter', 1e6)
time_limit = kwargs.get('time_limit', 1e6)
Expand Down Expand Up @@ -912,13 +866,12 @@ def pccc(X, n_clusters, ml=None, cl=None, sml=None, scl=None, sml_weights=None,

# Perform preprocessing
if perform_preprocessing:
data, weights, mapping, cl, sml, scl, confidence, kdt = preprocessing(data, ml, cl, sml, scl, sml_weights,
scl_weights)
data, weights, mapping, cl, sml, scl, confidence = preprocessing(data, ml, cl, sml, scl, sml_weights,
scl_weights)
else:
weights = np.ones(data.shape[0])
mapping = np.arange(data.shape[0])
confidence, sml, scl = aggregate_confidence_values(scl, scl_weights, sml, sml_weights)
kdt = KDTree(data)

# Perform feasibility check
if data.shape[0] < n_clusters:
Expand All @@ -944,7 +897,7 @@ def pccc(X, n_clusters, ml=None, cl=None, sml=None, scl=None, sml_weights=None,

# Assign objects
initial_labels = None
labels, total_penalty, log = assign_objects(data, centers, weights, ml, cl, sml, scl, confidence, kdt, log,
labels, total_penalty, log = assign_objects(data, centers, weights, ml, cl, sml, scl, confidence, log,
labels=initial_labels, **kwargs)

# Perform feasibility check
Expand Down Expand Up @@ -972,8 +925,8 @@ def pccc(X, n_clusters, ml=None, cl=None, sml=None, scl=None, sml_weights=None,
while (n_iter < max_iter) and (elapsed_time < time_limit):

# Assign objects
labels, total_penalty, log = assign_objects(data, centers, weights, ml, cl, sml, scl, confidence,
kdt, log, labels=labels, **kwargs)
labels, total_penalty, log = assign_objects(data, centers, weights, ml, cl, sml, scl, confidence, log,
labels=labels, **kwargs)

# Perform feasibility check
if labels is None:
Expand Down
5 changes: 0 additions & 5 deletions run_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,6 @@
sml_weights = constraints['sml_proba']
scl_weights = constraints['scl_proba']

if not sml_weights:
sml_weights = None
if not scl_weights:
scl_weights = None

# %% Apply PCCC algorithm
output = pccc(X, n_clusters,
ml=ml,
Expand Down

0 comments on commit d9dcf1c

Please sign in to comment.