# Learning to Cluster Faces by Infomap

## Infomap Introduction
[Infomap Website](https://www.mapequation.org/publications.html#Rosvall-Axelsson-Bergstrom-2009-Map-equation)
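
Infomap assigns nodes to communities by minimizing the map equation over random-walk dynamics on a graph. A minimal sketch of how the `infomap` Python package is typically driven (the toy edge list below stands in for the kNN similarity graph built from face features; API names follow infomap >= 1.0 and may differ in other versions):

```python
# Minimal sketch: cluster a small weighted graph with the infomap package.
# The (u, v, w) edges are a toy stand-in for a kNN graph of similarities
# between face features; illustrative only, not this repo's full pipeline.
from infomap import Infomap

im = Infomap("--two-level --silent")
for u, v, w in [(0, 1, 0.9), (1, 2, 0.8), (2, 0, 0.7), (3, 4, 0.9)]:
    im.add_link(u, v, w)
im.run()

# node_id -> module_id: the module id serves as the predicted cluster label.
labels = {node.node_id: node.module_id for node in im.tree if node.is_leaf}
print(labels)
```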

## Requirements
* Python >= 3.6
* sklearn
* infomap
* numpy

## Datasets
MS-Celeb-1M: part1_test (584K)
[download](https://github.com/yl-1993/learn-to-cluster/blob/master/DATASET.md)

## Run
```bash
python face-cluster-by-infomap.py
```

## Results on part1_test (584K)

| Method | Precision | Recall | F-score |
| ------ |:---------:|:------:|:-------:|
| Chinese Whispers (k=80, th=0.6, iters=20) | 55.49 | 52.46 | 53.93 |
| Approx Rank Order (k=80, th=0) | 99.77 | 7.20 | 13.42 |
| MiniBatchKmeans (ncluster=5000, bs=100) | 45.48 | 80.98 | 58.25 |
| KNN DBSCAN (k=80, th=0.7, eps=0.25, min=1) | 95.25 | 52.79 | 67.93 |
| FastHAC (dist=0.72, single) | 92.07 | 57.28 | 70.63 |
| [DaskSpectral](https://ml.dask.org/clustering.html#spectral-clustering) (ncluster=8573, affinity='rbf') | 78.75 | 66.59 | 72.16 |
| [CDP](https://github.com/XiaohangZhan/cdp) (single model, th=0.7) | 80.19 | 70.47 | 75.02 |
| [L-GCN](https://github.com/yl-1993/learn-to-cluster/tree/master/lgcn) (k_at_hop=[200, 10], active_conn=10, step=0.6, maxsz=300) | 74.38 | 83.51 | 78.68 |
| GCN-D (2 prpsls) | 95.41 | 67.77 | 79.25 |
| GCN-D (5 prpsls) | 94.62 | 72.59 | 82.15 |
| GCN-D (8 prpsls) | 94.23 | 79.69 | 86.35 |
| GCN-D (20 prpsls) | 94.54 | 81.62 | 87.61 |
| GCN-D + GCN-S (2 prpsls) | 99.07 | 67.22 | 80.10 |
| GCN-D + GCN-S (5 prpsls) | 98.84 | 72.01 | 83.31 |
| GCN-D + GCN-S (8 prpsls) | 97.93 | 78.98 | 87.44 |
| GCN-D + GCN-S (20 prpsls) | 97.91 | 80.86 | 88.57 |
| GCN-V | 92.45 | 82.42 | 87.14 |
| GCN-V + GCN-E | 92.56 | 83.74 | 87.93 |
| Infomap (ours) | 95.50 | 92.51 | 93.98 |
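
The Precision, Recall and F-score columns are the pairwise metrics implemented in `evaluation/metrics.py` below (the evaluation default). Counting over all pairs of faces,

$$
\mathrm{Pre} = \frac{\#\{\text{pairs correctly placed in the same cluster}\}}{\#\{\text{pairs placed in the same cluster}\}},\qquad
\mathrm{Rec} = \frac{\#\{\text{pairs correctly placed in the same cluster}\}}{\#\{\text{pairs sharing an identity}\}},\qquad
F = \frac{2\,\mathrm{Pre}\,\mathrm{Rec}}{\mathrm{Pre}+\mathrm{Rec}}
$$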

![avatar](./image/evaluate.png)

## References
[The Minimum Entropy Principle (V): Community Detection and Clustering by "Layer-by-Layer Progression"](https://spaces.ac.cn/archives/7006)
[Mainstream face clustering approaches](https://github.com/yl-1993/learn-to-cluster)

**`evaluation/__init__.py`** (filenames below are inferred from the package-relative imports):

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from .metrics import *
from .evaluate import evaluate
```

**`evaluation/evaluate.py`**:

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import inspect
import argparse
import numpy as np

from evaluation import metrics
from utils import Timer, TextColors


def _read_meta(fn):
    # Read one integer label per line; return the label array and label set.
    labels = list()
    lb_set = set()
    with open(fn) as f:
        for lb in f.readlines():
            lb = int(lb.strip())
            labels.append(lb)
            lb_set.add(lb)
    return np.array(labels), lb_set


def evaluate(gt_labels, pred_labels, metric='pairwise'):
    # Accept either label arrays or paths to meta files (one label per line).
    if isinstance(gt_labels, str) and isinstance(pred_labels, str):
        print('[gt_labels] {}'.format(gt_labels))
        print('[pred_labels] {}'.format(pred_labels))
        gt_labels, gt_lb_set = _read_meta(gt_labels)
        pred_labels, pred_lb_set = _read_meta(pred_labels)

        print('#inst: gt({}) vs pred({})'.format(len(gt_labels),
                                                 len(pred_labels)))
        print('#cls: gt({}) vs pred({})'.format(len(gt_lb_set),
                                                len(pred_lb_set)))

    # Look up the metric function by name from evaluation.metrics.
    metric_func = metrics.__dict__[metric]

    with Timer('evaluate with {}{}{}'.format(TextColors.FATAL, metric,
                                             TextColors.ENDC)):
        result = metric_func(gt_labels, pred_labels)
    # `np.float` was removed in NumPy 1.24; plain `float` also matches
    # np.float64, which subclasses it.
    if isinstance(result, float):
        print('{}{}: {:.4f}{}'.format(TextColors.OKGREEN, metric, result,
                                      TextColors.ENDC))
    else:
        ave_pre, ave_rec, fscore = result
        print('{}ave_pre: {:.4f}, ave_rec: {:.4f}, fscore: {:.4f}{}'.format(
            TextColors.OKGREEN, ave_pre, ave_rec, fscore, TextColors.ENDC))


if __name__ == '__main__':
    # Expose every function defined in evaluation.metrics as a metric choice.
    metric_funcs = inspect.getmembers(metrics, inspect.isfunction)
    metric_names = [n for n, _ in metric_funcs]

    parser = argparse.ArgumentParser(description='Evaluate Cluster')
    parser.add_argument('--gt_labels', type=str, required=True)
    parser.add_argument('--pred_labels', type=str, required=True)
    parser.add_argument('--metric', default='pairwise', choices=metric_names)
    args = parser.parse_args()

    evaluate(args.gt_labels, args.pred_labels, args.metric)
```
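
A hedged usage sketch for `evaluate` (assumes the repo root is the working directory so that `evaluation` and the repo's `utils` module are importable; the meta-file paths are placeholders):

```python
# Programmatic use: compare predicted cluster labels against ground truth.
import numpy as np
from evaluation import evaluate

gt = np.array([0, 0, 0, 1, 1])    # ground-truth identity per face
pred = np.array([5, 5, 7, 7, 7])  # predicted cluster id per face
evaluate(gt, pred, metric='pairwise')  # prints ave_pre, ave_rec, fscore

# Equivalent CLI, reading one integer label per line from meta files:
#   python evaluation/evaluate.py --gt_labels gt.meta --pred_labels pred.meta \
#       --metric pairwise
```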

**`evaluation/metrics.py`**:

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division

import numpy as np
import json
from sklearn.metrics.cluster import (contingency_matrix,
                                     normalized_mutual_info_score)
from sklearn.metrics import (precision_score, recall_score)

__all__ = ['pairwise', 'bcubed', 'nmi', 'precision', 'recall', 'accuracy']


class NpEncoder(json.JSONEncoder):
    """JSON encoder that copes with NumPy scalar and array types."""
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NpEncoder, self).default(obj)


def _check(gt_labels, pred_labels):
    if gt_labels.ndim != 1:
        raise ValueError("gt_labels must be 1D: shape is %r" %
                         (gt_labels.shape,))
    if pred_labels.ndim != 1:
        raise ValueError("pred_labels must be 1D: shape is %r" %
                         (pred_labels.shape,))
    if gt_labels.shape != pred_labels.shape:
        raise ValueError(
            "gt_labels and pred_labels must have same size, got %d and %d" %
            (gt_labels.shape[0], pred_labels.shape[0]))
    return gt_labels, pred_labels


def _get_lb2idxs(labels):
    # Map each label to the list of sample indices carrying it.
    lb2idxs = {}
    for idx, lb in enumerate(labels):
        if lb not in lb2idxs:
            lb2idxs[lb] = []
        lb2idxs[lb].append(idx)
    return lb2idxs


def _compute_fscore(pre, rec):
    return 2. * pre * rec / (pre + rec)


def fowlkes_mallows_score(gt_labels, pred_labels, sparse=True):
    ''' The original function is from `sklearn.metrics.fowlkes_mallows_score`.
        We output the pairwise precision, pairwise recall and F-measure,
        instead of calculating the geometric mean of precision and recall.
    '''
    n_samples, = gt_labels.shape

    c = contingency_matrix(gt_labels, pred_labels, sparse=sparse)
    tk = np.dot(c.data, c.data) - n_samples  # pairs correctly placed together
    pk = np.sum(np.asarray(c.sum(axis=0)).ravel() ** 2) - n_samples  # pairs predicted together
    qk = np.sum(np.asarray(c.sum(axis=1)).ravel() ** 2) - n_samples  # pairs sharing a gt label

    avg_pre = tk / pk
    avg_rec = tk / qk
    fscore = _compute_fscore(avg_pre, avg_rec)

    return avg_pre, avg_rec, fscore


def pairwise(gt_labels, pred_labels, sparse=True):
    _check(gt_labels, pred_labels)
    return fowlkes_mallows_score(gt_labels, pred_labels, sparse)


def bcubed0(gt_labels, pred_labels):
    """
    Compute the BCubed precision, recall and F-score, plus "expanding":
    the average number of predicted clusters that each ground-truth
    identity is spread across.
    :param gt_labels:
    :param pred_labels:
    :return:
    """
    gt_lb2idxs = _get_lb2idxs(gt_labels)
    pred_lb2idxs = _get_lb2idxs(pred_labels)

    num_lbs = len(gt_lb2idxs)
    pre = np.zeros(num_lbs)
    rec = np.zeros(num_lbs)
    gt_num = np.zeros(num_lbs)

    expand = np.zeros(num_lbs)
    for i, gt_idxs in enumerate(gt_lb2idxs.values()):
        all_pred_lbs = np.unique(pred_labels[gt_idxs])
        gt_num[i] = len(gt_idxs)
        expand[i] = all_pred_lbs.shape[0]
        for pred_lb in all_pred_lbs:
            pred_idxs = pred_lb2idxs[pred_lb]
            n = 1. * np.intersect1d(gt_idxs, pred_idxs).size
            pre[i] += n ** 2 / len(pred_idxs)
            rec[i] += n ** 2 / gt_num[i]

    gt_num = gt_num.sum()
    avg_pre = pre.sum() / gt_num
    avg_rec = rec.sum() / gt_num
    fscore = _compute_fscore(avg_pre, avg_rec)

    return avg_pre, avg_rec, fscore, expand.mean()
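

# Note on bcubed0 (added remark): although the loops run over clusters, the
# quantity computed is the element-wise BCubed average. For an element e with
# predicted cluster C(e) and ground-truth cluster L(e):
#     Pre = mean over e of |C(e) & L(e)| / |C(e)|
#     Rec = mean over e of |C(e) & L(e)| / |L(e)|
# Each intersection of size n contributes n * (n / |C|) to the precision sum,
# which is exactly the n**2 / len(pred_idxs) term accumulated above.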


def bcubed(gt_labels, pred_labels):
    """
    Report the BCubed metrics from bcubed0. The commented-out block below
    additionally re-evaluated them after dropping predicted clusters with
    at most n members, along with the remaining image and label counts.
    :param gt_labels:
    :param pred_labels:
    :return:
    """
    # `ind` collects indices of samples whose predicted cluster has more than
    # n members; it is only used by the commented-out diagnostics below.
    pred_lb2idxs = _get_lb2idxs(pred_labels)
    n = 1
    ind = []
    for i in pred_lb2idxs.values():
        if len(i) > n:
            for m in i:
                ind.append(m)

    avg_pre, avg_rec, fscore, expand = bcubed0(gt_labels, pred_labels)
    # print('avg_pre:{}, avg_rec:{}, fscore:{}, expanding:{}, rest images:{}, rest_gt_labels:{} '.
    #       format(avg_pre, avg_rec, fscore, expand, len(gt_labels), len(list(set(gt_labels)))))
    #
    # avg_pre1, avg_rec1, fscore1, expand1 = bcubed0(gt_labels[ind], pred_labels[ind])
    # print('avg_pre:{}, avg_rec:{}, fscore:{}, expanding:{}, rest images:{}, rest_gt_labels:{} '.
    #       format(avg_pre1, avg_rec1, fscore1, expand1, len(ind), len(list(set(gt_labels[ind])))))

    return avg_pre, avg_rec, fscore


def nmi(gt_labels, pred_labels):
    # NMI is symmetric, so the argument order does not matter.
    return normalized_mutual_info_score(pred_labels, gt_labels)


def precision(gt_labels, pred_labels):
    # Classification-style metric: only meaningful when predicted cluster ids
    # are aligned with the ground-truth label ids.
    return precision_score(gt_labels, pred_labels)


def recall(gt_labels, pred_labels):
    return recall_score(gt_labels, pred_labels)


def accuracy(gt_labels, pred_labels):
    return np.mean(gt_labels == pred_labels)
```
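
A quick sanity check of these metrics on toy labels; a sketch assuming the package layout above, run from the repo root:

```python
# Toy sanity check for evaluation.metrics; cluster ids are arbitrary names.
import numpy as np
from evaluation.metrics import pairwise, bcubed, nmi, accuracy

gt = np.array([0, 0, 0, 1, 1])    # two ground-truth identities
pred = np.array([0, 0, 1, 1, 1])  # one face assigned to the wrong cluster

print(pairwise(gt, pred))  # (0.5, 0.5, 0.5): 2 of 4 pairs on each side agree
print(bcubed(gt, pred))    # BCubed precision, recall, F-score
print(nmi(gt, pred))
print(accuracy(gt, pred))  # 0.8 here, but only because cluster ids align with gt
```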