###################################################################################################
##
## Data Mining: Learning from Large Datasets
## Lecture Project - Task 3 (Large Scale Clustering)
##
## Team: Datasloths
## Authors: Raphael S. (rasuter@student.ethz.ch)
##          Hwang S.   (hwangse@student.ethz.ch)
##          ....       (wch@student.ethz.ch)
##
## Approach:
##   1) Parallel computation of Coresets in the mapper,
##      following 'Practical coreset construction' from the lecture
##   2) Merge all Coresets into a large one in the reducer
##   3) (optionally: compress the large Coreset to a smaller one)
##   4) Run Lloyd's algorithm on the final Coreset
##
###################################################################################################
import numpy as np
## SETTINGS #######################################################################################
#TODO tune parameters (especially coreset_size and size_B)
k = 200 # number of clusters, given in task description to be 200
runs = 10 # number of runs of k-means (choose lowest error)
coreset_size = 800 # size of coreset in each mapper
size_B = 200 # number of clusters considered in the Coreset construction (for D^2 sampling)
alpha = np.log2(size_B) + 1 # parameter for importance sampling (value suggestion from old slides)
## For runs driven by the gridsearch file, uncomment these lines to read the parameters from .npy files:
# runs = int(np.load('runs.npy'))
# coreset_size = int(np.load('coreset_size.npy'))
# size_B = int(np.load('size_B.npy'))
# alpha = float(np.load('alpha.npy'))

## HELPER FUNCTIONS ###############################################################################

def d_squared_sampling(vectors, size_B):
    """
    @brief  Perform D^2 Sampling (as described in lecture 11, slide 29)
    @param  vectors  original vectors, 2D numpy array of dimension (num_vecs, feature_dim)
    @param  size_B   number of points to sample into B
    @return pair of
            - B: indices of the chosen points
            - sq_distance_to_B: 2D numpy array where entry (i, j) is the squared distance
              of vector_i to b_j
    """
    (num_vecs, dim) = vectors.shape
    # sq_distance_to_B[i, j] == squared L2-norm from point x_i to cluster center b_j
    sq_distance_to_B = np.inf * np.ones([num_vecs, size_B])
    B = size_B * [None]
    B[0] = np.random.randint(0, num_vecs)  # start with a random sample, augment list up to size_B
    sq_distance_to_B[:, 0] = np.sum(np.square(vectors - vectors[B[0], :]), axis=1)
    for i in range(1, size_B):
        # sample according to squared distance to B
        d_squared = np.amin(sq_distance_to_B, axis=1)
        d_squared /= np.sum(d_squared)
        B[i] = np.random.choice(num_vecs, p=d_squared)  # draw sample with weighting
        sq_distance_to_B[:, i] = np.sum((vectors - vectors[B[i], :])**2, axis=1)  # update distances
    return B, sq_distance_to_B
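
## Illustrative usage of d_squared_sampling (sketch only, with assumed shapes; not called by the
## MapReduce framework directly):
##   X = np.random.rand(1000, 250)
##   B, sq_d = d_squared_sampling(X, 20)   # B: list of 20 row indices into X
##   np.amin(sq_d, axis=1)                 # squared distance of every point to its closest b in B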

def weighted_kmeans(weights, vectors, k):
    """
    @brief  Applies Lloyd's algorithm to weighted vectors
    @param  weights  weights, numpy array of dimension num_vecs
    @param  vectors  2D numpy array of dimension (num_vecs, feature_dim)
    @param  k        number of clusters
    @return pair of
            - k cluster centers in a 2D numpy array of dim (k, feature_dim)
            - quantization error
    """
    (num_vecs, dim) = vectors.shape
    # Initialize from randomly chosen points
    # mus = vectors[np.random.choice(num_vecs, size=k, replace=False), :]  # initialize smarter (e.g. D^2 sampling)
    sample_idx, _ = d_squared_sampling(vectors, k)
    mus = vectors[sample_idx, :]
    sq_distances_to_clusters = np.empty([num_vecs, k])
    cluster = np.zeros(num_vecs)
    while True:
        # Assign points to the closest cluster
        for i in range(k):
            sq_distances_to_clusters[:, i] = np.sum(np.square(vectors - mus[i, :]), axis=1)
        cluster_old = cluster
        cluster = np.argmin(sq_distances_to_clusters, axis=1)
        # Compute new means
        for i in range(k):
            if len(weights[cluster == i]) > 0:  # potentially start a new k-means run instead
                # compute weighted average
                mus[i, :] = np.average(vectors[cluster == i, :], axis=0, weights=weights[cluster == i])
            else:
                return (mus, np.inf)  # lonely cluster center (no corresponding vectors) -> end run
        # Check for convergence
        if np.sum(cluster_old != cluster) == 0:
            break
    sq_distance = np.amin(sq_distances_to_clusters, axis=1)
    quantization_error = np.dot(weights, sq_distance)
    return (mus, quantization_error)
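
## Illustrative usage of weighted_kmeans (sketch only; coreset_weights and coreset_points are
## placeholder names):
##   centers, err = weighted_kmeans(coreset_weights, coreset_points, k)
## With all weights equal to one this reduces to plain (unweighted) Lloyd's k-means.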
## MAPPER #########################################################################################

def mapper(key, value):
    """
    @brief  Computes Coreset of vectors given in value
    @param  key    None
    @param  value  2D numpy array (3000 x 250)
    @yield  dummy key, value = (weight, vector in Coreset)
    """
    global k, coreset_size, size_B, alpha
    (num_vecs, dim) = value.shape

    ## D^2 Sampling (lecture 11, slide 26)
    B, sq_distance_to_B = d_squared_sampling(value, size_B)

    ## Importance Sampling (lecture 11, slide 29)
    d_squared = np.amin(sq_distance_to_B, axis=1)
    d_squared /= np.sum(d_squared)
    c_phi = np.mean(d_squared)
    q = d_squared / c_phi  # individual contribution to weighting q
    # compute clusterwise contribution
    cluster = np.argmin(sq_distance_to_B, axis=1)  # which point in B is the closest
    for b in range(len(B)):
        size_Bi = np.sum(cluster == b)
        q[cluster == b] += 2*alpha * np.sum(d_squared[cluster == b]) / size_Bi / c_phi + 4*num_vecs/size_Bi
    q /= np.sum(q)  # normalize
    # draw samples according to q
    sample_indizes = np.random.choice(num_vecs, size=coreset_size, p=q)  # draw samples with weighting
    samples = value[sample_indizes, :]
    weights = np.reciprocal(q[sample_indizes])
    yield 0, np.c_[weights, samples]  # in the first column we have the weights
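
## Note on the emitted value (derived from the code above): each mapper emits one
## (coreset_size x (1 + feature_dim)) array; column 0 holds the weight 1/q of a sampled point,
## the remaining columns hold the point itself. The reducer splits the array back accordingly.
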
## REDUCER ########################################################################################

def reducer(key, values):
    """
    @brief  Merges all collected Coresets from the mappers and runs Lloyd's algorithm on the
            final Coreset (weighted vectors!)
            TODO: potentially compress the large Coreset again before running Lloyd's algorithm
    @param  key     dummy
    @param  values  2D numpy array with one row per vector in the Coreset;
                    the first column contains the weight of each vector
    @yield  200x250 numpy array containing the k-means solution vectors as rows (no key)
    """
    global k, runs
    weights = values[:, 0]
    vectors = values[:, 1:]
    all_mus = []
    all_errors = []
    for _ in range(runs):
        mus, quantization_error = weighted_kmeans(weights, vectors, k)
        all_mus.append(mus)
        all_errors.append(quantization_error)
    print('Quantization Errors:', all_errors)
    yield all_mus[np.argmin(all_errors)]
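
## Minimal local smoke test (illustrative sketch only): on the submission system the MapReduce
## framework itself feeds the data chunks to mapper() and the merged, stacked values to reducer().
## The 3000 x 250 chunk shape is taken from the mapper docstring; the random data and the reduced
## number of runs are assumptions made purely for a quick local check.
if __name__ == '__main__':
    np.random.seed(0)
    runs = 1  # a single k-means run keeps the local check fast; the real setting is defined above
    chunks = [np.random.rand(3000, 250) for _ in range(2)]   # two fake mapper inputs
    coresets = [coreset for chunk in chunks
                for _, coreset in mapper(None, chunk)]       # one coreset per chunk
    merged = np.vstack(coresets)                             # rows: [weight, vector]
    for centers in reducer(0, merged):
        print('k-means centers shape:', centers.shape)       # expected: (200, 250)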