-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSkeletonCode.py
More file actions
116 lines (89 loc) · 4.84 KB
/
SkeletonCode.py
File metadata and controls
116 lines (89 loc) · 4.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from sklearn.datasets import fetch_kddcup99
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
import wkpp as wkpp
import numpy as np
import random
# ----- Real data input: fetch KDD Cup 99 and standardize it -----
dataset = fetch_kddcup99()                      # Fetch kddcup99 (downloads on first use)
raw = dataset.data                              # raw feature matrix
# Drop the first four columns so the remainder casts cleanly to float
# (presumably the non-numeric fields — TODO confirm against the dataset docs),
# then z-score every feature.
numeric = np.delete(raw, [0, 1, 2, 3], axis=1).astype(float)
data = StandardScaler().fit_transform(numeric)
n, d = data.shape       # n: number of points, d: number of features
k = 17                  # Number of clusters (say k = 17)
Sample_size = 100       # coreset size drawn later by Sampling()
def dist_to_B(x, B, return_closest_index=False):
    """Return the Euclidean distance from point ``x`` to its nearest point in ``B``.

    Parameters
    ----------
    x : array-like, shape (d,)
        Query point.
    B : sequence of points, each shape (d,)
        Current center set (may be empty).
    return_closest_index : bool, optional
        When True, also return the index in ``B`` of the nearest point.

    Returns
    -------
    float, or (float, int) when ``return_closest_index`` is True.
    An empty ``B`` yields ``np.inf`` (and index ``-1``), matching the
    original loop's fall-through behaviour.
    """
    if len(B) == 0:
        # Preserve the original sentinels for an empty center set.
        return (np.inf, -1) if return_closest_index else np.inf
    # Vectorized: one (len(B), d) broadcast instead of a per-center Python loop.
    dists = np.linalg.norm(np.asarray(B, dtype=float) - np.asarray(x, dtype=float), axis=1)
    closest_index = int(np.argmin(dists))  # ties -> first index, same as the strict '<' loop
    min_dist = dists[closest_index]
    if return_closest_index:
        return min_dist, closest_index
    return min_dist
def D2(data, k):
    """D^2 (k-means++-style) seeding: select ``k`` initial centers (Algorithm 1).

    The first center is chosen uniformly at random; each subsequent center is
    drawn with probability proportional to the squared distance from each
    point to its nearest already-chosen center.

    Parameters
    ----------
    data : array-like, shape (n, d)
    k : int
        Number of centers to select.

    Returns
    -------
    list of ndarray
        k rows of ``data``, as in the original implementation.
    """
    pts = np.asarray(data, dtype=float)
    n = len(pts)
    centers = [pts[np.random.choice(n)]]
    # Running squared distance to the nearest chosen center. Maintaining it
    # incrementally costs O(n*d) per round instead of the original O(n*|centers|*d)
    # Python loop that recomputed every distance each round.
    sq_d = ((pts - centers[0]) ** 2).sum(axis=1)
    for _ in range(k - 1):
        p = sq_d / sq_d.sum()
        j = np.random.choice(n, p=p)
        centers.append(pts[j])
        # Only the distance to the NEW center can lower the running minimum.
        sq_d = np.minimum(sq_d, ((pts - pts[j]) ** 2).sum(axis=1))
    return centers
centers = D2(data,k)  # Algorithm 1: pick k initial centers by D^2 sampling over the full dataset
def Sampling(data, k, centers, Sample_size):
    """Importance-sampling coreset construction (Algorithm 2).

    Builds a sampling distribution ``p`` over the points mixing three terms:
    each point's squared distance to its nearest center, its cluster's total
    squared distance, and (inversely) its cluster's size. Then draws
    ``Sample_size`` points with replacement, weighting each by
    ``1 / (Sample_size * p[i])``.

    Parameters
    ----------
    data : array-like, shape (n, d)
    k : int
        Enters only through the oversampling factor ``alpha``.
    centers : sequence of center points (e.g. the output of D2).
    Sample_size : int
        Number of coreset points to draw.

    Returns
    -------
    (coresets, weights) : (list of ndarray, list of float)
        Same types as the original implementation.
    """
    pts = np.asarray(data, dtype=float)
    ctr = np.asarray(centers, dtype=float)
    n = len(pts)
    alpha = 16 * (np.log2(k) + 2)

    # Nearest-center assignment and squared distances, computed ONCE.
    # (The original recomputed dist_to_B up to four times per point and
    # allocated len(centers) full np.empty_like(data) buffers -- B_i --
    # that were filled but never read.)
    sq = cdist(pts, ctr) ** 2
    closest = np.argmin(sq, axis=1)               # ties -> first index, as before
    d2 = sq[np.arange(n), closest]

    c_phi = d2.sum() / n                          # mean quantization cost over the data
    counts = np.bincount(closest, minlength=len(ctr))               # |B_i|: cluster sizes
    sums = np.bincount(closest, weights=d2, minlength=len(ctr))     # per-cluster sum of d^2

    # NOTE(review): a center with zero assigned points divides by counts == 0
    # here, exactly as the original code did.
    p = 2 * alpha * d2 / c_phi
    p += 4 * alpha * sums[closest] / (counts[closest] * c_phi)
    p += 4 * n / counts[closest]
    p = p / p.sum()

    chosen_indices = np.random.choice(n, size=Sample_size, p=p)
    weights = [1 / (Sample_size * p[i]) for i in chosen_indices]
    coresets = [pts[i] for i in chosen_indices]
    return coresets, weights
# ----- Practical (importance-sampling based) coreset performance -----
coreset, weight = Sampling(data,k,centers,Sample_size)  # Algorithm 2: coreset points + importance weights
# Reference clustering on the FULL data; fkmeans.inertia_ is the baseline cost below.
fkmeans = KMeans(n_clusters=k,init='k-means++')
fkmeans.fit_predict(data)
coreset=np.array(coreset)  # list -> ndarray for the wkpp / sklearn calls below
weight=np.array(weight)
Coreset_centers, _ = wkpp.kmeans_plusplus_w(coreset, k, w=weight, n_local_trials=100) # Run weighted kMeans++ on coreset points (wkpp is a project-local helper)
wt_kmeansclus = KMeans(n_clusters=k, init=Coreset_centers, max_iter=10).fit(coreset,sample_weight = weight) # Run weighted KMeans on the coreset, using the initial centers from the above line.
Coreset_centers = wt_kmeansclus.cluster_centers_ # Compute cluster centers
coreset_cost = np.sum(np.min(cdist(data,Coreset_centers)**2,axis=1)) # Cost of the coreset-derived centers, evaluated on the FULL data
reative_error_practicalCoreset = abs(coreset_cost - fkmeans.inertia_)/fkmeans.inertia_ # Relative error vs. full-data k-means cost (fkmeans.inertia_); variable name typo 'reative' kept as-is
#-----Uniform Sampling based Coreset-----#
tmp = np.random.choice(range(n),size=Sample_size,replace=False)  # indices drawn uniformly WITHOUT replacement
sample = data[tmp][:] # Uniform sampling
sweight = n*np.ones(Sample_size)/Sample_size # Maintain appropriate weight (n/Sample_size each)
sweight = sweight/np.sum(sweight) # Normalize weight to define a distribution (uniform rescaling of all weights)
#-----Uniform Sampling based Coreset performance-----#
wt_kmeansclus = KMeans(n_clusters=k, init='k-means++', max_iter=10).fit(sample,sample_weight = sweight) # Run KMeans on the random coreset
Uniform_centers = wt_kmeansclus.cluster_centers_ # Compute cluster centers
uniform_cost = np.sum(np.min(cdist(data,Uniform_centers)**2,axis=1)) # Cost of the uniform-coreset centers, evaluated on the FULL data
reative_error_unifromCoreset = abs(uniform_cost - fkmeans.inertia_)/fkmeans.inertia_ # Relative error vs. full-data k-means cost; name typos kept as-is
print("Relative error from Practical Coreset is",reative_error_practicalCoreset)
print("Relative error from Uniformly random Coreset is",reative_error_unifromCoreset)
## Example output from one run:
##Relative error from Practical Coreset is 0.8495299208348667
##Relative error from Uniformly random Coreset is 1.5071939069065823