I implemented an algorithm called NaiveKmeans in Python, but it is very slow. I converted the Python code to Cython and also typed the variables in order to speed up the algorithm, but it remains just as slow: Cython does not accelerate my algorithm at all.
Cython code implementing the NaiveKmeans algorithm (nv.pyx):
from scipy.spatial.distance import euclidean
import numpy as np
cimport numpy as np
import time

# np.int was removed from recent NumPy releases; use an explicit width instead
DTYPE = np.int64
ctypedef np.int64_t DTYPE_t
cdef class NaiveKmeans:
    cdef public np.ndarray data
    cdef public np.ndarray assignment
    cdef public np.ndarray clusterSize
    cdef public np.ndarray centerMovement
    cdef public np.ndarray centers
    cdef public np.ndarray sumNewCenters
    cdef public int n
    cdef public int k
    cdef public int d

    def __init__(self, DTYPE_t kk, DTYPE_t nn, np.ndarray X, np.ndarray cent):
        self.n = nn
        self.k = kk
        self.d = X.shape[1]
        self.data = X
        self.centers = cent
        # For each point in X, keep which cluster it is assigned to. Using a
        # short assumes a limited number of clusters (fewer than 2^16).
        self.assignment = np.full(self.n, -1, dtype=np.int16)
        # Keep track of how many points are in each cluster.
        self.clusterSize = np.zeros(self.k)
        # centerMovement is computed in move_centers() and used to detect
        # convergence (if max(centerMovement) == 0.0) and to update
        # point-center distance bounds (in subclasses that use them).
        #
        # sumNewCenters provides sufficient statistics to quickly recompute
        # the changing locations of the centers. Whenever a point changes
        # cluster membership, we subtract (add) it from (to) the row of
        # sumNewCenters associated with its old (new) cluster, and decrement
        # (increment) clusterSize for the old (new) cluster.
        self.sumNewCenters = np.zeros((self.k, self.d), dtype=float)
    def changeAssignment(self, Py_ssize_t xIndex, Py_ssize_t closestCluster):
        oldAssignment = self.assignment[xIndex]
        xp = self.data[xIndex]
        # oldAssignment is -1 for a point that has never been assigned;
        # in that case there is nothing to subtract from the old cluster
        if oldAssignment >= 0:
            self.clusterSize[oldAssignment] -= 1
            self.sumNewCenters[oldAssignment] -= xp
        self.clusterSize[closestCluster] += 1
        self.assignment[xIndex] = closestCluster
        self.sumNewCenters[closestCluster] += xp
    def move_centers(self):
        cdef int furthestMovingCenter = 0
        cdef Py_ssize_t j
        self.centerMovement = np.zeros(self.k, dtype=float)
        for j in range(self.k):
            if self.clusterSize[j] > 0:
                # new center = mean of the points assigned to cluster j
                z = self.sumNewCenters[j] / self.clusterSize[j]
                self.centerMovement[j] = euclidean(z, self.centers[j])
                self.centers[j] = z
            if self.centerMovement[furthestMovingCenter] < self.centerMovement[j]:
                furthestMovingCenter = j
        return furthestMovingCenter
    def runThread(self, int maxIterations):
        # track the number of iterations the algorithm performs
        cdef int iterations = 0
        cdef Py_ssize_t i, j
        converged = False
        while iterations < maxIterations and not converged:
            st = time.time()
            iterations += 1
            # loop over all examples
            for i in range(self.n):
                # look for the closest center to this example
                closest = 0
                closestDist2 = np.inf
                for j in range(self.k):
                    d2 = euclidean(self.data[i], self.centers[j])
                    if d2 < closestDist2:
                        closest = j
                        closestDist2 = d2
                if self.assignment[i] != closest:
                    self.changeAssignment(i, closest)
            print("naive k-means iteration time (Cython)", time.time() - st)
            furthestMovingCenter = self.move_centers()
            converged = (0.0 == self.centerMovement[furthestMovingCenter])
        return iterations
setup.py used to build the extension:
from setuptools import setup, Extension
from Cython.Build import cythonize
import numpy

setup(
    ext_modules=cythonize(
        Extension('nv', ["nv.pyx"], include_dirs=[numpy.get_include()])
    )
)
Main Python program calling NaiveKmeans:
import time
import numpy as np
from sklearn.datasets import make_classification
from nv import NaiveKmeans

k = 5
n = 40000
data, y = make_classification(n_samples=n, n_features=2, n_redundant=0,
                              n_clusters_per_class=1)
# define initial centroids: k distinct points drawn from the data
# (stand-in for the k-means++ _k_init helper, which is not shown here)
centers = data[np.random.choice(n, k, replace=False)].copy()

NV = NaiveKmeans(k, n, data, centers)
start = time.time()
print(NV.runThread(100))
print("time NV", time.time() - start)
Each iteration of the Cython version takes about 2.5 seconds, which is far too long for a dataset of only 40,000 points in 2 dimensions. It takes the same time as the pure Python version, and I don't understand why Cython fails to speed up the program.