Why doesn't code acceleration work with Cython?

265 views

I need to speed up this code to 4 milliseconds.

import numpy as np



def return_call(data):
    """Unpack interleaved 16-bit frames into spectrum/detect arrays.

    The input is treated as ``num`` frames of 4096 int16 samples. Each
    frame is viewed as 2048 (a, b) pairs; a packed value is built from
    each pair, and for the upper 1024 bins a truncated log10 of the
    packed value (and a detect flag) is stored in the output arrays.

    Parameters
    ----------
    data : 1-D array, length a multiple of 4096 (presumably int16 —
        TODO confirm against the caller).

    Returns
    -------
    (end_spetrum, end_detect) : two uint64 arrays of length num*1024.

    NOTE(review): the loops below only ever write indices 0..1023, so
    for num > 1 each frame overwrites the same slots and entries
    1024..num*1024-1 stay uninitialized (np.empty). Also, storing
    np.log10 results into a uint64 array truncates them to integers —
    both look unintentional, preserved here for compatibility.
    """
    n_frames = int(data.shape[0] / 4096)
    spec_buf = np.empty(2048, dtype=np.uint64)
    det_buf = np.empty(2048, dtype=np.uint64)
    out_spec = np.empty(n_frames * 1024, dtype=np.uint64)
    out_det = np.empty(n_frames * 1024, dtype=np.uint64)
    frames = np.reshape(data, (n_frames, 4096))

    for frame in frames:
        pairs = np.reshape(frame, (2048, 2))
        for i in range(2048):
            # Bit-packing kept exactly as in the original; note that
            # >> binds tighter than |, so only the second term is
            # shifted back right by one.
            spec_buf[i] = (np.int16(pairs[i][0]) << 17) | (np.int16(pairs[i][1] << 1)) >> 1
            # Sign bit of the first sample of the pair.
            det_buf[i] = (np.int16(pairs[i][0]) >> 15)
        # The original used two backward half-range loops (511..0 and
        # 1023..512) with identical bodies; each iteration touches only
        # index i, so a single forward loop over 0..1023 is equivalent.
        for i in range(1024):
            packed = spec_buf[i + 1024]
            if packed != 0:
                out_spec[i] = (np.log10(packed))
                out_det[i] = det_buf[i + 1024]
            else:
                out_spec[i] = 0
                out_det[i] = 0

    return out_spec, out_det

I decided to use Cython for this task. But I didn’t get any acceleration.

import numpy as np
cimport numpy


# Element type of the incoming buffer: signed 16-bit samples.
ctypedef signed short DTYPE_t
cpdef return_call(numpy.ndarray[DTYPE_t, ndim=1] data):
    """Cython port of the NumPy return_call.

    NOTE(review): this runs at Python speed because the hot loop still
    goes through untyped objects and Python-level NumPy calls:
    raw_data_spec is untyped (every raw_data_spec[i][0] is a Python
    indexing call), np.uint16(...) and np.log10(...) are per-element
    Python calls, and boundscheck/wraparound are still on.
    """
    cdef int i
    # C integer division of the sample count into 4096-sample frames.
    cdef int num = data.shape[0]/4096
    cdef numpy.ndarray _data

    # Per-frame scratch buffers for the packed values and detect bits.
    cdef numpy.ndarray[unsigned long long, ndim=1] buff_spectrum  = np.empty(2048,dtype= np.uint64)
    cdef numpy.ndarray[ unsigned long long, ndim=1] buff_detect =  np.empty(2048,dtype= np.uint64)
    # Outputs are double here (the pure-Python original used uint64).
    cdef numpy.ndarray[double , ndim=1] end_spetrum = np.empty(num*1024,dtype= np.double)
    cdef numpy.ndarray[double , ndim=1] end_detect = np.empty(num*1024,dtype= np.double)
    _data = np.reshape(data,(num,4096))

    for _raw_data_spec in _data:
        # View the frame as 2048 (a, b) sample pairs.
        # NOTE(review): raw_data_spec is untyped — a typed memoryview
        # (cdef DTYPE_t[:, :]) would let this loop compile to C.
        raw_data_spec = np.reshape(_raw_data_spec,(2048,2))
        for i in range(2048):
            # Pack the pair; >> binds tighter than |, so only the
            # second term is shifted back right by one.
            buff_spectrum[i] = (np.uint16(raw_data_spec[i][0])<<17)|(np.uint16(raw_data_spec[i][1] <<1))>>1
            # Top bit of the first sample of the pair.
            buff_detect[i] = (np.uint16(raw_data_spec[i][0])>>15)
        # Fill outputs for bins 0..511 from scratch slots 1024..1535.
        for i in range (511,-1,-1):
            if buff_spectrum[i+1024] != 0:
                # np.log10 on a scalar is a Python call — libc log10
                # would avoid the per-element overhead.
                end_spetrum[i]=(np.log10(buff_spectrum[i+1024]))
                end_detect[i]=buff_detect[i+1024]
            else:
                end_spetrum[i] =0
                end_detect[i] = 0
        # Same body for bins 512..1023; only indices 0..1023 are ever
        # written, so for num > 1 each frame overwrites the same slots.
        for i in range(1023, 511, -1):
            if buff_spectrum[i+1024] != 0:
                end_spetrum[i] = (np.log10(buff_spectrum[i + 1024]))
                end_detect[i] = buff_detect[i + 1024]
            else:
                end_spetrum[i] = 0
                end_detect[i] = 0

    return end_spetrum, end_detect

The fastest I achieved is 80 milliseconds, but I need it much faster, since I need to process data coming from the hardware in near real time. Can you tell me the reason, and is it realistic to achieve the desired result? I also enclose the code for the test file.


import numpy as np
import time

import example_original
import example_cython

# Deterministic, branch-exercising input. The original used np.empty,
# which benchmarks on uninitialized memory: the zero/non-zero branch
# mix (and hence the timing) changed from run to run.
data = (np.arange(8192 * 2) % 97).astype(np.int16)

# time.perf_counter() is the right clock for benchmarking: monotonic
# and high resolution. time.time() is wall-clock time and can jump
# when the system clock is adjusted.
startpy = time.perf_counter()
example_original.return_call(data)
finpy = time.perf_counter() - startpy

startcy = time.perf_counter()
k, r = example_cython.return_call(data)
fincy = time.perf_counter() - startcy

print(fincy, finpy)
print('Cython is {}x faster'.format(finpy / fincy))
(the printed output reported roughly a 3x speedup)

There are 3 answers

1
DavidW On BEST ANSWER
raw_data_spec = np.reshape(_raw_data_spec,(2048,2))

raw_data_spec isn't typed. At the start of the function add a definition for it. I recommend the newer memoryview syntax (but use the old numpy syntax if you want):

cdef DTYPE_t[:,:] raw_data_spec

This line (which you have identified as a bottle-neck) is a mess:

buff_spectrum[i] = (np.int16(raw_data_spec[i][0])<<17)|(np.int16(raw_data_spec[i][1] <<1))>>1
  1. Do indexing in one step, not two: raw_data_spec[i, 0] (note one set of brackets and a comma).

  2. Reconsider the cast to a 16 bit integer. Does it really make sense to be shifting a 16 bit integer by 17 bits?

  3. You probably don't need a cast at all since the data will be known to be DTYPE_t, but if you do want a cast then use angle brackets: <numpy.uint16_t>(raw_data_spec[i, 0])


Consider turning off boundscheck and wraparound. Verify to yourself that it's safe to do so and that you aren't relying to exceptions to tell you when you're indexing beyond the end of an array, or using negative indexing. Only do this after thought - not automatically in a "cargo cult" way.

cimport cython    

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef return_call(numpy.ndarray[DTYPE_t, ndim=1] data):

Ditch the calls to np.log10. This is a whole Python call on a single element, which ends up being inefficient. You can use the C standard library math functions instead:

from libc.math cimport log10

then replace np.log10 with log10.

3
abhinonymous On

I think a major reason for this might be because your python code had almost no python operations and all of it was numpy operations. A large portion of the numpy code is written in C. Some of it is written in Fortran. A lot of it is written in Python. Well-written numpy code is comparable in speed to C code.

1
max9111 On

I have not much experience with Cython, so this is only an example what timings should also be possible with Cython.

Example

import numpy as np
import numba as nb

@nb.njit(cache=True)
def return_call(data_in):
    """Numba-compiled version of return_call.

    Same algorithm as the original; np.zeros replaces np.empty and the
    outputs are float64, so log10 results are kept as floats instead of
    being truncated into uint64.
    """
    # If the input is not contiguous the reshape will fail
    # -> make a C-contiguous copy if the array isn't C-contiguous.
    data=np.ascontiguousarray(data_in)

    # Number of 4096-sample frames in the input.
    num = int(data.shape[0] / 4096)
    buff_spectrum  = np.zeros(2048,dtype= np.uint64)
    buff_detect =  np.zeros(2048,dtype= np.uint64)
    end_spetrum = np.zeros(num*1024,dtype=np.float64)
    end_detect = np.zeros(num*1024,dtype= np.float64)
    _data = np.reshape(data,(num,4096))

    # "for _raw_data_spec in _data:" is not supported in nopython mode,
    # but indexing by row works.
    for x in range(_data.shape[0]):
        # View the frame as 2048 (a, b) sample pairs.
        raw_data_spec = np.reshape(_data[x],(2048,2))
        for i in range(2048):
            # Pack each pair; >> binds tighter than |, so only the
            # second term is shifted back right by one.
            buff_spectrum[i] = (np.int16(raw_data_spec[i][0])<<17)|(np.int16(raw_data_spec[i][1] <<1))>>1
            # Sign bit of the first sample of the pair.
            buff_detect[i] = (np.int16(raw_data_spec[i][0])>>15)
        # Fill bins 0..511 from scratch slots 1024..1535.
        for i in range (511,-1,-1):
            if buff_spectrum[i+1024] != 0:
                end_spetrum[i]=(np.log10(buff_spectrum[i+1024]))
                end_detect[i]=buff_detect[i+1024]
            else:
                end_spetrum[i] =0
                end_detect[i] = 0
        # Same body for bins 512..1023 (slots 1536..2047).
        for i in range(1023, 511, -1):
            if buff_spectrum[i+1024] != 0:
                end_spetrum[i] = (np.log10(buff_spectrum[i + 1024]))
                end_detect[i] = buff_detect[i + 1024]
            else:
                end_spetrum[i] = 0
                end_detect[i] = 0

    return end_spetrum, end_detect

Timings

data = np.random.rand(8192*2)*20
data=data.astype(np.int16)

#with compilation
%timeit end_spetrum, end_detect=return_call(data)
#32.7 µs ± 5.61 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

#without compilation
%timeit end_spetrum, end_detect=return_call_orig(data)
#106 ms ± 448 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)