CUDA float matrix multiplication gives the wrong answer

49 views Asked by At

I recently started to program with CUDA. I wrote this code in which two matrices are initialized. Then these matrices are multiplied by each other by passing them in the GPU_setup_MaMul function. But if I do this, the answer is wrong.

I checked the code multiple times, but I can't find the mistake. I tried to print out the values from the matrix multiplication and sometimes it seems that the kernel accesses the wrong data, and if it accesses the right data, the answer to the multiplication is sometimes 0 (by two non-0 floats).

Furthermore, I also wrote the code for integer matrix multiplication; with integers, I had no problems.

Does anyone know what the problem is?

#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

__global__ void matrixMulGPU(float* a, float* b, float* c, int M_, int N_, int K_)
{
    //calcukeate row and colum
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if ((row < M_ && col < K_))
    {
        int temp = 0;


        for (int i = 0; i < N_; i++) {
            
            temp += a[row * N_ + i] * b[i * K_ + col];
            printf("Thread [%f, %f]: temp = %f\n", a[row * N_ + i], b[i * K_ + col], temp);
        }
        c[row * K_ + col] = temp;

    }
}

void GPU_setup_MaMul(float* h_a, float* h_b, float* h_c, int M, int N, int K) {
    float* d_a, * d_b, * d_c;
    cudaMalloc(&d_a, M * N * sizeof(float));
    cudaMalloc(&d_b, N * K * sizeof(float));
    cudaMalloc(&d_c, M * K * sizeof(float));

    cudaMemcpy(d_a, h_a, N * M * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, N * K * sizeof(float), cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((M + 16 - 1) /16, (K + 16 - 1) / 16);
    matrixMulGPU << <numBlocks, threadsPerBlock >> > (d_a, d_b, d_c, M, N, K);
    
    cudaMemcpy(h_c, d_c, M * K * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}

void init_Scale(float x,float y,float z,float w,float* zu_verschieben,int lenght, int height) {

    zu_verschieben[0] = x;
    zu_verschieben[1 * lenght + 1] = y;
    zu_verschieben[2 * lenght + 2] = z;
    zu_verschieben[3 * lenght + 3] = w;


    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 4; j++) {
            std::cout << " " << zu_verschieben[i * 4 + j];
        }
        std::cout << "\n";
    }


}

#define LENGHT 50
#define HEIGHT 3
#define COL_TRI 4
#define ROW_TRI 50
int main()
{
    size_t bytes_2 = (COL_TRI)*ROW_TRI * sizeof(float);
    
    float * Scale, * Scale2, * answer;
    Scale = (float*)calloc(COL_TRI, COL_TRI * sizeof(float));
    Scale2 = (float*)calloc(COL_TRI, COL_TRI * sizeof(float));
    answer = (float*)calloc(COL_TRI, COL_TRI * sizeof(float));

    init_Scale(3.0f, 3.0f, 2.0f, 2.0f, Scale2, COL_TRI, COL_TRI);
    init_Scale(3.0f, 3.0f, 2.0f, 2.0f, Scale, COL_TRI, COL_TRI);
    GPU_setup_MaMul(Scale, Scale2, answer, COL_TRI, COL_TRI, ROW_TRI);


    
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 4; j++) {
            std::cout << " " << answer[i * 4 + j];
        }
        std::cout << "\n";
    }



    free(triMa);
    triMa = NULL;

    free(Scale);
    Scale = NULL;
    

    return 0;
}
 

When I multiply the matrix,
[[3,0,0,0]
[0,3,0,0]
[0,0,2,0]
[0,0,0,2]]
with itself, the answer that I get is
[[9,0,0,0]
[0,9,0,0]
[0,0,6,0]
[0,0,0,6]]
but it should be 
[[9,0,0,0]
[0,9,0,0]
[0,0,4,0]
[0,0,0,4]]
Also, when I multiply the matrix:
[[2,0,0,0]
[0,3,0,0]
[0,0,2,0]
[0,0,0,2]] 
with itself, I get:

[[4,0,0,0]
[0,6,0,0]
[0,0,4,0]
[0,0,0,4]]
instead of
[[4,0,0,0]
[0,9,0,0]
[0,0,4,0]
[0,0,0,4]]
0

There are 0 answers