-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSkeletonCode.py
More file actions
116 lines (89 loc) · 4.84 KB
/
SkeletonCode.py
File metadata and controls
116 lines (89 loc) · 4.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from sklearn.datasets import fetch_kddcup99
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
import wkpp as wkpp
import numpy as np
import random
# ----- Real data input: fetch KDD Cup 99 and standardize it -----
dataset = fetch_kddcup99()                      # Fetch kddcup99 (downloads on first use)
raw = dataset.data                              # raw feature matrix
# Drop the first four columns so the remainder casts cleanly to float
# (presumably the non-numeric fields — TODO confirm against the dataset docs),
# then z-score every feature.
numeric = np.delete(raw, [0, 1, 2, 3], axis=1).astype(float)
data = StandardScaler().fit_transform(numeric)
n, d = data.shape       # n: number of points, d: number of features
k = 17                  # Number of clusters (say k = 17)
Sample_size = 100       # coreset size drawn later by Sampling()
def dist_to_B(x, B, return_closest_index=False):
    """Return the Euclidean distance from point ``x`` to its nearest point in ``B``.

    Parameters
    ----------
    x : array-like, shape (d,)
        Query point.
    B : sequence of points, each shape (d,)
        Current center set (may be empty).
    return_closest_index : bool, optional
        When True, also return the index in ``B`` of the nearest point.

    Returns
    -------
    float, or (float, int) when ``return_closest_index`` is True.
    An empty ``B`` yields ``np.inf`` (and index ``-1``), matching the
    original loop's fall-through behaviour.
    """
    if len(B) == 0:
        # Preserve the original sentinels for an empty center set.
        return (np.inf, -1) if return_closest_index else np.inf
    # Vectorized: one (len(B), d) broadcast instead of a per-center Python loop.
    dists = np.linalg.norm(np.asarray(B, dtype=float) - np.asarray(x, dtype=float), axis=1)
    closest_index = int(np.argmin(dists))  # ties -> first index, same as the strict '<' loop
    min_dist = dists[closest_index]
    if return_closest_index:
        return min_dist, closest_index
    return min_dist
def D2(data, k):
    """D^2 (k-means++-style) seeding: select ``k`` initial centers (Algorithm 1).

    The first center is chosen uniformly at random; each subsequent center is
    drawn with probability proportional to the squared distance from each
    point to its nearest already-chosen center.

    Parameters
    ----------
    data : array-like, shape (n, d)
    k : int
        Number of centers to select.

    Returns
    -------
    list of ndarray
        k rows of ``data``, as in the original implementation.
    """
    pts = np.asarray(data, dtype=float)
    n = len(pts)
    centers = [pts[np.random.choice(n)]]
    # Running squared distance to the nearest chosen center. Maintaining it
    # incrementally costs O(n*d) per round instead of the original O(n*|centers|*d)
    # Python loop that recomputed every distance each round.
    sq_d = ((pts - centers[0]) ** 2).sum(axis=1)
    for _ in range(k - 1):
        p = sq_d / sq_d.sum()
        j = np.random.choice(n, p=p)
        centers.append(pts[j])
        # Only the distance to the NEW center can lower the running minimum.
        sq_d = np.minimum(sq_d, ((pts - pts[j]) ** 2).sum(axis=1))
    return centers
centers = D2(data,k)  # Algorithm 1: pick k initial centers by D^2 sampling over the full dataset
def Sampling(data, k, centers, Sample_size):
    """Importance-sampling coreset construction (Algorithm 2).

    Builds a sampling distribution ``p`` over the points mixing three terms:
    each point's squared distance to its nearest center, its cluster's total
    squared distance, and (inversely) its cluster's size. Then draws
    ``Sample_size`` points with replacement, weighting each by
    ``1 / (Sample_size * p[i])``.

    Parameters
    ----------
    data : array-like, shape (n, d)
    k : int
        Enters only through the oversampling factor ``alpha``.
    centers : sequence of center points (e.g. the output of D2).
    Sample_size : int
        Number of coreset points to draw.

    Returns
    -------
    (coresets, weights) : (list of ndarray, list of float)
        Same types as the original implementation.
    """
    pts = np.asarray(data, dtype=float)
    ctr = np.asarray(centers, dtype=float)
    n = len(pts)
    alpha = 16 * (np.log2(k) + 2)

    # Nearest-center assignment and squared distances, computed ONCE.
    # (The original recomputed dist_to_B up to four times per point and
    # allocated len(centers) full np.empty_like(data) buffers -- B_i --
    # that were filled but never read.)
    sq = cdist(pts, ctr) ** 2
    closest = np.argmin(sq, axis=1)               # ties -> first index, as before
    d2 = sq[np.arange(n), closest]

    c_phi = d2.sum() / n                          # mean quantization cost over the data
    counts = np.bincount(closest, minlength=len(ctr))               # |B_i|: cluster sizes
    sums = np.bincount(closest, weights=d2, minlength=len(ctr))     # per-cluster sum of d^2

    # NOTE(review): a center with zero assigned points divides by counts == 0
    # here, exactly as the original code did.
    p = 2 * alpha * d2 / c_phi
    p += 4 * alpha * sums[closest] / (counts[closest] * c_phi)
    p += 4 * n / counts[closest]
    p = p / p.sum()

    chosen_indices = np.random.choice(n, size=Sample_size, p=p)
    weights = [1 / (Sample_size * p[i]) for i in chosen_indices]
    coresets = [pts[i] for i in chosen_indices]
    return coresets, weights
# ----- Practical (importance-sampling based) coreset performance -----
coreset, weight = Sampling(data,k,centers,Sample_size)  # Algorithm 2: coreset points + importance weights
# Reference clustering on the FULL data; fkmeans.inertia_ is the baseline cost below.
fkmeans = KMeans(n_clusters=k,init='k-means++')
fkmeans.fit_predict(data)
coreset=np.array(coreset)  # list -> ndarray for the wkpp / sklearn calls below
weight=np.array(weight)
Coreset_centers, _ = wkpp.kmeans_plusplus_w(coreset, k, w=weight, n_local_trials=100) # Run weighted kMeans++ on coreset points (wkpp is a project-local helper)
wt_kmeansclus = KMeans(n_clusters=k, init=Coreset_centers, max_iter=10).fit(coreset,sample_weight = weight) # Run weighted KMeans on the coreset, using the initial centers from the above line.
Coreset_centers = wt_kmeansclus.cluster_centers_ # Compute cluster centers
coreset_cost = np.sum(np.min(cdist(data,Coreset_centers)**2,axis=1)) # Cost of the coreset-derived centers, evaluated on the FULL data
reative_error_practicalCoreset = abs(coreset_cost - fkmeans.inertia_)/fkmeans.inertia_ # Relative error vs. full-data k-means cost (fkmeans.inertia_); variable name typo 'reative' kept as-is
#-----Uniform Sampling based Coreset-----#
tmp = np.random.choice(range(n),size=Sample_size,replace=False)  # indices drawn uniformly WITHOUT replacement
sample = data[tmp][:] # Uniform sampling
sweight = n*np.ones(Sample_size)/Sample_size # Maintain appropriate weight (n/Sample_size each)
sweight = sweight/np.sum(sweight) # Normalize weight to define a distribution (uniform rescaling of all weights)
#-----Uniform Sampling based Coreset performance-----#
wt_kmeansclus = KMeans(n_clusters=k, init='k-means++', max_iter=10).fit(sample,sample_weight = sweight) # Run KMeans on the random coreset
Uniform_centers = wt_kmeansclus.cluster_centers_ # Compute cluster centers
uniform_cost = np.sum(np.min(cdist(data,Uniform_centers)**2,axis=1)) # Cost of the uniform-coreset centers, evaluated on the FULL data
reative_error_unifromCoreset = abs(uniform_cost - fkmeans.inertia_)/fkmeans.inertia_ # Relative error vs. full-data k-means cost; name typos kept as-is
print("Relative error from Practical Coreset is",reative_error_practicalCoreset)
print("Relative error from Uniformly random Coreset is",reative_error_unifromCoreset)
## Example output from one run:
##Relative error from Practical Coreset is 0.8495299208348667
##Relative error from Uniformly random Coreset is 1.5071939069065823