Commit 8cadc08 (v1)
xiaoxiong74 committed Nov 23, 2020 · 0 parents

Showing 16 changed files with 627 additions and 0 deletions.
15 changes: 15 additions & 0 deletions .idea/deployment.xml


12 changes: 12 additions & 0 deletions .idea/face-cluster-by-infomap.iml


7 changes: 7 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml


4 changes: 4 additions & 0 deletions .idea/misc.xml


8 changes: 8 additions & 0 deletions .idea/modules.xml


16 changes: 16 additions & 0 deletions .idea/remote-mappings.xml


6 changes: 6 additions & 0 deletions .idea/vcs.xml


12 changes: 12 additions & 0 deletions .idea/webServers.xml


48 changes: 48 additions & 0 deletions README.md
@@ -0,0 +1,48 @@
# Learning to Cluster Faces by Infomap

## Infomap Introduction
[Infomap Website](https://www.mapequation.org/publications.html#Rosvall-Axelsson-Bergstrom-2009-Map-equation)
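
For intuition, here is a minimal sketch of how the `infomap` Python package partitions a weighted graph (v1.x API assumed; the toy graph below is illustrative). In this repo, nodes would be face indices and edge weights the kNN feature similarities:

```python
import infomap

# Build a toy weighted similarity graph.
im = infomap.Infomap("--two-level --silent")
im.add_link(0, 1, 0.9)   # (source, target, weight)
im.add_link(1, 2, 0.8)
im.add_link(3, 4, 0.95)
im.run()

# Map each node id to the community (cluster) it was assigned.
print(im.get_modules())  # e.g. {0: 1, 1: 1, 2: 1, 3: 2, 4: 2}
```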

## Requirements
* Python >= 3.6
* sklearn
* infomap
* numpy
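
All of these are available from PyPI, e.g. `pip install infomap numpy scikit-learn` (note that `sklearn` is installed as the `scikit-learn` package).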

## Datasets
MS-Celeb-1M: part1_test (584K)
[download](https://github.com/yl-1993/learn-to-cluster/blob/master/DATASET.md)

## Run
```bash
python face-cluster-by-infomap.py
```
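
To score a clustering result against ground truth, the bundled evaluation package can be called directly (a minimal sketch; per `evaluation/evaluate.py`, each label file holds one integer label per line, and the paths below are placeholders):

```python
from evaluation import evaluate

# 'pairwise' is the default metric; 'bcubed' and 'nmi' are also available.
evaluate('data/gt_labels.txt', 'data/pred_labels.txt', metric='pairwise')
```

The same script also works from the shell: `python evaluation/evaluate.py --gt_labels <gt_file> --pred_labels <pred_file> --metric pairwise`.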

## Results on part1_test (584K)
| Method | Precision | Recall | F-score |
| ------ |:---------:|:------:|:-------:|
| Chinese Whispers (k=80, th=0.6, iters=20) | 55.49 | 52.46 | 53.93 |
| Approx Rank Order (k=80, th=0) | 99.77 | 7.20 | 13.42 |
| MiniBatchKmeans (ncluster=5000, bs=100) | 45.48 | 80.98 | 58.25 |
| KNN DBSCAN (k=80, th=0.7, eps=0.25, min=1) | 95.25 | 52.79 | 67.93 |
| FastHAC (dist=0.72, single) | 92.07 | 57.28 | 70.63 |
| [DaskSpectral](https://ml.dask.org/clustering.html#spectral-clustering) (ncluster=8573, affinity='rbf') | 78.75 | 66.59 | 72.16 |
| [CDP](https://github.com/XiaohangZhan/cdp) (single model, th=0.7) | 80.19 | 70.47 | 75.02 |
| [L-GCN](https://github.com/yl-1993/learn-to-cluster/tree/master/lgcn) (k_at_hop=[200, 10], active_conn=10, step=0.6, maxsz=300) | 74.38 | 83.51 | 78.68 |
| GCN-D (2 prpsls) | 95.41 | 67.77 | 79.25 |
| GCN-D (5 prpsls) | 94.62 | 72.59 | 82.15 |
| GCN-D (8 prpsls) | 94.23 | 79.69 | 86.35 |
| GCN-D (20 prpsls) | 94.54 | 81.62 | 87.61 |
| GCN-D + GCN-S (2 prpsls) | 99.07 | 67.22 | 80.10 |
| GCN-D + GCN-S (5 prpsls) | 98.84 | 72.01 | 83.31 |
| GCN-D + GCN-S (8 prpsls) | 97.93 | 78.98 | 87.44 |
| GCN-D + GCN-S (20 prpsls) | 97.91 | 80.86 | 88.57 |
| GCN-V | 92.45 | 82.42 | 87.14 |
| GCN-V + GCN-E | 92.56 | 83.74 | 87.93 |
| Infomap (ours) | 95.50 | 92.51 | 93.98 |
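
The pairwise numbers above are precision and recall over same-cluster pairs, as computed by `fowlkes_mallows_score` in `evaluation/metrics.py` (F-score instead of the geometric mean). A toy sanity check, assuming the repo root is on `PYTHONPATH`:

```python
import numpy as np
from evaluation.metrics import pairwise

gt = np.array([0, 0, 0, 1, 1])
pred = np.array([0, 0, 1, 1, 1])
# 2 of the 4 same-cluster pairs on each side agree -> (0.5, 0.5, 0.5)
print(pairwise(gt, pred))
```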

![evaluation results](./image/evaluate.png)

## References
* [Minimum Entropy Principle (Part 5): Community Detection and Clustering, Layer by Layer](https://spaces.ac.cn/archives/7006)
* [Mainstream face clustering approaches](https://github.com/yl-1993/learn-to-cluster)
5 changes: 5 additions & 0 deletions evaluation/__init__.py
@@ -0,0 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from .metrics import *
from .evaluate import evaluate
59 changes: 59 additions & 0 deletions evaluation/evaluate.py
@@ -0,0 +1,59 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import inspect
import argparse
import numpy as np

from evaluation import metrics
from utils import Timer, TextColors


def _read_meta(fn):
    # Each line of the meta file holds one integer cluster label.
    labels = list()
    lb_set = set()
    with open(fn) as f:
        for lb in f.readlines():
            lb = int(lb.strip())
            labels.append(lb)
            lb_set.add(lb)
    return np.array(labels), lb_set


def evaluate(gt_labels, pred_labels, metric='pairwise'):
    if isinstance(gt_labels, str) and isinstance(pred_labels, str):
        print('[gt_labels] {}'.format(gt_labels))
        print('[pred_labels] {}'.format(pred_labels))
        gt_labels, gt_lb_set = _read_meta(gt_labels)
        pred_labels, pred_lb_set = _read_meta(pred_labels)

        print('#inst: gt({}) vs pred({})'.format(len(gt_labels),
                                                 len(pred_labels)))
        print('#cls: gt({}) vs pred({})'.format(len(gt_lb_set),
                                                len(pred_lb_set)))

    metric_func = metrics.__dict__[metric]

    with Timer('evaluate with {}{}{}'.format(TextColors.FATAL, metric,
                                             TextColors.ENDC)):
        result = metric_func(gt_labels, pred_labels)
    # `np.float` is deprecated; plain `float` covers scalar metrics such as nmi.
    if isinstance(result, float):
        print('{}{}: {:.4f}{}'.format(TextColors.OKGREEN, metric, result,
                                      TextColors.ENDC))
    else:
        ave_pre, ave_rec, fscore = result
        print('{}ave_pre: {:.4f}, ave_rec: {:.4f}, fscore: {:.4f}{}'.format(
            TextColors.OKGREEN, ave_pre, ave_rec, fscore, TextColors.ENDC))


if __name__ == '__main__':
    metric_funcs = inspect.getmembers(metrics, inspect.isfunction)
    metric_names = [n for n, _ in metric_funcs]

    parser = argparse.ArgumentParser(description='Evaluate Cluster')
    parser.add_argument('--gt_labels', type=str, required=True)
    parser.add_argument('--pred_labels', type=str, required=True)
    parser.add_argument('--metric', default='pairwise', choices=metric_names)
    args = parser.parse_args()

    evaluate(args.gt_labels, args.pred_labels, args.metric)
153 changes: 153 additions & 0 deletions evaluation/metrics.py
@@ -0,0 +1,153 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division

import numpy as np
import json
from sklearn.metrics.cluster import (contingency_matrix,
                                     normalized_mutual_info_score)
from sklearn.metrics import (precision_score, recall_score)

__all__ = ['pairwise', 'bcubed', 'nmi', 'precision', 'recall', 'accuracy']


class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts numpy scalars/arrays to native types."""
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NpEncoder, self).default(obj)


def _check(gt_labels, pred_labels):
    if gt_labels.ndim != 1:
        raise ValueError("gt_labels must be 1D: shape is %r" %
                         (gt_labels.shape,))
    if pred_labels.ndim != 1:
        raise ValueError("pred_labels must be 1D: shape is %r" %
                         (pred_labels.shape,))
    if gt_labels.shape != pred_labels.shape:
        raise ValueError(
            "gt_labels and pred_labels must have same size, got %d and %d" %
            (gt_labels.shape[0], pred_labels.shape[0]))
    return gt_labels, pred_labels


def _get_lb2idxs(labels):
    # Map each label to the list of sample indices carrying it.
    lb2idxs = {}
    for idx, lb in enumerate(labels):
        if lb not in lb2idxs:
            lb2idxs[lb] = []
        lb2idxs[lb].append(idx)
    return lb2idxs


def _compute_fscore(pre, rec):
    return 2. * pre * rec / (pre + rec)


def fowlkes_mallows_score(gt_labels, pred_labels, sparse=True):
    '''The original function is from `sklearn.metrics.fowlkes_mallows_score`.
    We output the pairwise precision, pairwise recall and F-measure,
    instead of calculating the geometric mean of precision and recall.
    '''
    n_samples, = gt_labels.shape

    c = contingency_matrix(gt_labels, pred_labels, sparse=sparse)
    # tk: pairs co-clustered in both gt and pred (pairwise true positives);
    # pk: pairs co-clustered in pred; qk: pairs co-clustered in gt.
    tk = np.dot(c.data, c.data) - n_samples
    pk = np.sum(np.asarray(c.sum(axis=0)).ravel() ** 2) - n_samples
    qk = np.sum(np.asarray(c.sum(axis=1)).ravel() ** 2) - n_samples

    avg_pre = tk / pk
    avg_rec = tk / qk
    fscore = _compute_fscore(avg_pre, avg_rec)

    return avg_pre, avg_rec, fscore


def pairwise(gt_labels, pred_labels, sparse=True):
    _check(gt_labels, pred_labels)
    return fowlkes_mallows_score(gt_labels, pred_labels, sparse)


def bcubed0(gt_labels, pred_labels):
    """
    Compute B-cubed precision, recall, F-score and expansion (the average
    number of predicted clusters each ground-truth identity is spread over).
    :param gt_labels:
    :param pred_labels:
    :return:
    """
    gt_lb2idxs = _get_lb2idxs(gt_labels)
    pred_lb2idxs = _get_lb2idxs(pred_labels)

    num_lbs = len(gt_lb2idxs)
    pre = np.zeros(num_lbs)
    rec = np.zeros(num_lbs)
    gt_num = np.zeros(num_lbs)

    expand = np.zeros(num_lbs)
    for i, gt_idxs in enumerate(gt_lb2idxs.values()):
        all_pred_lbs = np.unique(pred_labels[gt_idxs])
        gt_num[i] = len(gt_idxs)
        expand[i] = all_pred_lbs.shape[0]
        for pred_lb in all_pred_lbs:
            pred_idxs = pred_lb2idxs[pred_lb]
            n = 1. * np.intersect1d(gt_idxs, pred_idxs).size
            pre[i] += n ** 2 / len(pred_idxs)
            rec[i] += n ** 2 / gt_num[i]

    gt_num = gt_num.sum()
    avg_pre = pre.sum() / gt_num
    avg_rec = rec.sum() / gt_num
    fscore = _compute_fscore(avg_pre, avg_rec)

    return avg_pre, avg_rec, fscore, expand.mean()


def bcubed(gt_labels, pred_labels):
    """
    Report the B-cubed metrics; the commented-out block below additionally
    re-evaluates after dropping predicted clusters of size <= n and reports
    the remaining image and label counts.
    :param gt_labels:
    :param pred_labels:
    :return:
    """
    # `ind` collects samples in predicted clusters larger than n; it is
    # only consumed by the commented-out re-evaluation below.
    pred_lb2idxs = _get_lb2idxs(pred_labels)
    n = 1
    ind = []
    for i in pred_lb2idxs.values():
        if len(i) > n:
            for m in i:
                ind.append(m)

    avg_pre, avg_rec, fscore, expand = bcubed0(gt_labels, pred_labels)
    # print('avg_pre:{}, avg_rec:{}, fscore:{}, expanding:{}, rest images:{}, rest_gt_labels:{} '.
    #       format(avg_pre, avg_rec, fscore, expand, len(gt_labels), len(list(set(gt_labels)))))
    #
    # avg_pre1, avg_rec1, fscore1, expand1 = bcubed0(gt_labels[ind], pred_labels[ind])
    # print('avg_pre:{}, avg_rec:{}, fscore:{}, expanding:{}, rest images:{}, rest_gt_labels:{} '.
    #       format(avg_pre1, avg_rec1, fscore1, expand1, len(ind), len(list(set(gt_labels[ind])))))

    return avg_pre, avg_rec, fscore


def nmi(gt_labels, pred_labels):
    # NMI is symmetric in its arguments.
    return normalized_mutual_info_score(pred_labels, gt_labels)


def precision(gt_labels, pred_labels):
    return precision_score(gt_labels, pred_labels)


def recall(gt_labels, pred_labels):
    return recall_score(gt_labels, pred_labels)


def accuracy(gt_labels, pred_labels):
    return np.mean(gt_labels == pred_labels)