I've been trying to speed up some neural network computations using AVX instructions. However, I keep running into the following error "Unhandled exception at [...]: Access violation reading location [...]".
I tried to isolate the issue but as the memory seems to be corrupted somewhere, the error doesn't show up at the same place each time, and i suspect it be misleading. Does anyone know what might cause the issue?
Here is some reproducible code:
#include <immintrin.h>
#include <cmath>
#include <iostream>
#include <array>
#include <vector>
inline const int num_avx_registers = 16;
inline const int floats_per_reg = 4;
inline const int HKP_size = 100;
inline constexpr int acc_size = 256;
class NNLayer {
public:
alignas(32) float* weight;
alignas(32) float* bias;
NNLayer(){
weight = new float[HKP_size * acc_size]; // flattened 2D array.
bias = new float[acc_size];
// initialize the weights and bias with test values
for (uint32_t i=0; i<HKP_size * acc_size; i++){
weight[i] = 1.F;
}
for (int i=0; i<acc_size; i++){
bias[i] = static_cast<float>(i);
}
}
~NNLayer(){
delete[] weight;
delete[] bias;
}
};
class Accumulator {
public:
alignas(32) std::array<float, acc_size> accumulator_w;
alignas(32) std::array<float, acc_size> accumulator_b;
std::array<float, acc_size>& Accumulator::operator[](bool color){
return color ? accumulator_w : accumulator_b;
}
};
class NNUE {
public:
Accumulator accumulator;
NNLayer first_layer = NNLayer();
void compute_accumulator(const std::vector<int> active_features, bool color){
// we have 256 floats to process.
// there are 16 avx registers, and each can hold 4 floats.
// therefore we need to do 256/64 = 4 passes to the registers.
constexpr int c_size = num_avx_registers * floats_per_reg; //chunk size
constexpr int num_chunks = acc_size / c_size;
static_assert(acc_size % c_size == 0);
__m256 avx_regs[num_avx_registers];
// we process 1/4th of the whole data at each loop.
// we add c_idx to the indexes pick up where we left off at the last chunk.
for (int c_idx = 0; c_idx < num_chunks*c_size; c_idx += c_size){ // chunk index
// load the bias from memory
for (int i = 0; i < num_avx_registers; i++){
avx_regs[i] = _mm256_load_ps(&first_layer.bias[c_idx + i*floats_per_reg]);
}
// add the active weights
for (const int &a: active_features){
for (int i = 0; i < num_avx_registers; i++){
// a*acc_size is to get the a-th row of the flattened 2D array.
avx_regs[i] = _mm256_add_ps(
avx_regs[i],
_mm256_load_ps(&first_layer.weight[a*acc_size + c_idx + i*floats_per_reg])
);
}
}
//store the result in the accumulator
for (int i = 0; i < num_avx_registers; i++){
_mm256_store_ps(&accumulator[color][c_idx + i*floats_per_reg], avx_regs[i]);
}
}
}
};
int main(){
NNUE nnue;
std::vector<int> act_f = {2, 1, 70, 62};
nnue.compute_accumulator(act_f, true);
std::cout << "still alive\n";
return 0;
}
alignas(32) float* weight;only aligns the address where the pointer is stored, not the pointed-to-memory. If you want to create aligned memory withnewin C++17 you can write: