I want to implement a cumulative (prefix) sum with NEON. The following is my pseudocode. uint32_t pDst[16]; unsigned char buf[16] = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]; uint8x16_t eles = vld1q_u8(buf); uint8x8_t lowEles8 = vget_low_u8(eles); uint8x8_t highEles8 = vget_high_u8(eles);
//do add with 3 pass for lowEles
uint16x8_t lowEles16 = vaddl_u8(lowEles8, vext_u8(zeroVec8, lowEles8, 7));
lowEles16 = vaddq_u16(lowEles16, vextq_u16(zeroVec, lowEles16, 6));
lowEles16 = vaddq_u16(lowEles16, vextq_u16(zeroVec, lowEles16, 4));
//do add with 3 pass for highEles
uint16x8_t highEles16 = vaddl_u8(highEles8, vext_u8(zeroVec8, highEles8, 7));
highEles16 = vaddq_u16(highEles16, vextq_u16(zeroVec, highEles16, 6));
highEles16 = vaddq_u16(highEles16, vextq_u16(zeroVec, highEles16, 4));
// save for the first 8 elements
uint32x4_t res32x4 = vmovl_u16(vget_low_u16(lowEles16));
vst1q_u32(pDst, res32x4);
res32x4 = vmovl_u16(vget_high_u16(lowEles16));
vst1q_u32(pDst+4, res32x4);
// save for the last 8 elements
int preSum = vgetq_lane_u32(res32x4, 3);
uint32x4_t preSum32x4 = vdupq_n_u32(preSum);
res32x4 = vaddq_u32(vmovl_u16(vget_low_u16(lowEles16)), preSum32x4);
vst1q_u32(pDst+8, res32x4);
res32x4 = vaddq_u32(vmovl_u16(vget_high_u16(lowEles16)), preSum32x4);
vst1q_u32(pDst+12, res32x4);
I found that my implementation is slower than the native C++ implementation. Are there any suggestions?