I implemented an algorithm called naiveKmeans in python. But it's so long.

I tried to convert the python code into cython code. I typed also variables in order to speed up the algortihm. But it remains so long again. Cython doesn't accelerate my algorithm.

cython code implementing naiveKmeans algorithm

from scipy.spatial.distance import euclidean
import numpy as np
cimport numpy as np
DTYPE = np.int
ctypedef np.int_t DTYPE_t


import time

cdef class NaiveKmeans:
    cdef public np.ndarray data
    cdef public np.ndarray assignment
    cdef public np.ndarray clusterSize
    cdef public np.ndarray centerMovement
    cdef public np.ndarray centers
    cdef public np.ndarray sumNewCenters
    cdef public int n
    cdef public int k
    cdef public int d
    def __init__(self,DTYPE_t kk,DTYPE_t nn, np.ndarray X, np.ndarray cent):
        self.n = nn
        self.k = kk
        self.d = X.shape[1]
        self.data = X
        self.centers = cent
        #For each point in x, keep which cluster it is assigned to. By using a
        #short, we assume a limited number of clusters (fewer than 2^16).
        self.assignment = np.full((self.n),-1,dtype=float)
        # To communicate (to all threads) that we have converged.
        #bool converged;
        #Keep track of how many points are in each cluster, divided over each
        #thread.
        self.clusterSize = np.zeros(self.k)
        #centerMovement is computed in move_centers() and used to detect
        #convergence (if max(centerMovement) == 0.0) and update point-center
        #distance bounds (in subclasses that use them).
        #self.centerMovement = None

        """ sumNewCenters and centerCount provide sufficient statistics to
        // quickly calculate the changing locations of the centers. Whenever a
        // point changes cluster membership, we subtract (add) it from (to) the
        // row in sumNewCenters associated with its old (new) cluster. We also
        // decrement (increment) centerCount for the old (new) cluster."""
        self.sumNewCenters = np.zeros((self.k,self.d),dtype=float)

    def changeAssignment(self,xIndex,closestCluster):
        oldAssignment = self.assignment[xIndex]
        self.clusterSize[self.assignment[xIndex]] -= 1
        self.clusterSize[closestCluster] += 1
        self.assignment[xIndex] = closestCluster
        xp = self.data[xIndex]
        self.sumNewCenters[oldAssignment] = np.subtract(self.sumNewCenters[oldAssignment],xp) 
        self.sumNewCenters[closestCluster] = np.add(self.sumNewCenters[closestCluster],xp)

    def move_centers(self):
        cdef int furthestMovingCenter = 0
        self.centerMovement = np.zeros(self.k,dtype=float)
        for j in range(self.k):
            if (self.clusterSize[j] > 0):
                    z = np.divide(self.sumNewCenters[j],self.clusterSize[j])
                    #self.old_centers[j] = z
                    #self.centerMovement[j] = np.sqrt(np.square( np.subtract(z,self.centers[j]) ))
                    self.centerMovement[j] = euclidean(z,self.centers[j]) 
                    self.centers[j] = z
                    #self.centerMovement[j] = np.sqrt(self.centerMovement[j])
            #print("shape fur naiv",self.centerMovement[furthestMovingCenter].shape,self.centerMovement[furthestMovingCenter])
            print("cem",self.centerMovement,self.centerMovement[2])
            if (self.centerMovement[furthestMovingCenter] < self.centerMovement[j]):
                furthestMovingCenter = j
        return furthestMovingCenter

    def runThread(self,maxIterations):
        #track the number of iterations the algorithm performs
        iterations = 0
        converged = False
        while ((iterations < maxIterations) and (not converged)):
            #print("ce nv",self.centers)
            st = time.time()
            iterations+=1
            #loop over all examples
            for i in range(self.n):
                # look for the closest center to this example
                closest = 0
                closestDist2 = np.inf
                for j in range(self.k):
                    d2 = euclidean(self.data[i], self.centers[j]);
                    if (d2 < closestDist2):
                        closest = j
                        closestDist2 = d2
                if (self.assignment[i] != closest):
                    self.changeAssignment(i, closest)
            print("temps iter cython nv",time.time()-st)
            furthestMovingCenter = self.move_centers()
            converged = (0.0 == self.centerMovement[furthestMovingCenter])
            #print(converged,converged.all())


        return iterations

setup

from distutils.core import setup, Extension
from Cython.Build import cythonize
import numpy 

setup(
        ext_modules=cythonize(Extension('nv', ["nv.pyx"], include_dirs=[numpy.get_include()]))

)

main program python calling NaiveKmeans

from sklearn.datasets.samples_generator import make_classification
from nv import NaiveKmeans
k=5
n=40000

(data,y) = make_classification(n_samples=n,n_features=2, n_redundant=0,n_clusters_per_class=1)


# define initial centroids (points obtained from data) 
centers = _k_init(X=data, n_clusters=k)


NV=NaiveKmeans(k,n,data,centers)
start = time.time()
print(NV.runThread(100))
print("time NV",time.time()-start)

Each iteration of the algorithm in cython takes about 2.5 seconds, that is too long for a dataset containing only 40 000 points of 2 dimensions. It takes the same time that in pure python. I don't why cython fails to speed up the program.

0 Answers