-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalternative_methods.py
49 lines (35 loc) · 1.75 KB
/
alternative_methods.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
__author__ = 'lene'
from random import randint
from sys import maxsize
from operator import itemgetter
from typing import Set, Callable, Sequence
from typedefs import Item
HashFunction = Callable[[Item], int]
def asymmetric_similarity(set1: Set[Item], set2: Set[Item]) -> float:
"""
Here's an idea for a different metric for similarity: We are not interested in the length of
set2, only of set1. This implies that this function is not commutative.
In terms of the problem that means, roughly, a user is only interested if another user shares
their preferences - not if they share the other user's preferences as well.
"""
return len(set1 & set2) / len(set1)
def minhash_similarity(set1: Set[Item], set2: Set[Item], length: int=100) -> float:
"""
An attempt at approximating the jaccard similarity with MinHash.
Algorithm adapted from
http://robertheaton.com/2014/05/02/jaccard-similarity-and-minhash-for-winners/
Probably buggy, but passes my feeble attempts at testing.
"""
(hashed1, hashed2) = (minhashed(set1, length), minhashed(set2, length))
x = minhashed(hashed1 | hashed2, length)
y = x & hashed1 & hashed2
return len(y) / min(max(len(set1), len(set2)), length)
def random_hash(item: Item) -> int:
return (randint(1, maxsize) + item) % maxsize
def minhashed(data: Set[Item], length: int=100, hash_func: HashFunction=random_hash):
indexable = list(data)
min_item_indices = [min_item_index(hash_func, indexable)[0] for _ in range(length)]
return {indexable[i] for i in min_item_indices}
def min_item_index(hash_func: HashFunction, indexable: Sequence[Item]):
index_and_hash = [(i, hash_func(indexable[i])) for i in range(len(indexable))]
return min(index_and_hash, key=itemgetter(1))