Here is the code, matrix multiplication in C, the matrix's fill is integer. Optimization for loop nest and loop unrolling. Run well till N is 7168. For N 8196, insufficient memory.If you run it, just screen, wait till get time result. This is an alarm from my computer (Asus A456UQ) or code is still wrong?
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include<time.h>
#include <sys/time.h>
#include<assert.h>
#define mat(m, col, row) \
m.data[row*m.cols + col]
#define N 8192
typedef struct
{
int cols, rows;
int *data;
}matrix_t;
void matrix_init(matrix_t *matrix, int cols, int rows)
{
int i,j;
matrix->cols = cols;
matrix->rows = rows;
matrix->data = malloc(sizeof(int)*cols*rows);
if (!matrix->data)
{
fprintf(stderr, "Insufficient memory for a %dx%d matrix", cols, rows);
exit(EXIT_FAILURE);
}
// 0 fill
memset(matrix->data, 0, cols*rows);
}
void matrix_print(matrix_t *matrix)
{
int i, n;
n = matrix->cols * matrix->rows;
printf("--M%dx%d--\n", matrix->rows, matrix->cols);
for (i=0; i<n; i++)
{
printf("%d ", matrix->data[i]);
if ((i+1)%matrix->cols == 0) printf("\n");
}
}
void free_array(matrix_t *matrix) {
free(matrix->data);
matrix->data = NULL;
matrix->cols = matrix->rows = 0;
}
void mul_matrix(matrix_t A, matrix_t B, matrix_t C){
int ii, kk,ib,kb,j,i,k;
int acc00, acc01, acc10,acc11;
int size = A.cols;
ib=16;
kb=16;
for (ii = 0; ii < size; ii += ib)
{
for (kk = 0; kk < size; kk += kb)
{
for (j=0; j < size; j += 2)
{
for(i = ii; i < ii + ib; i += 2 )
{
if (kk == 0)
acc00 = acc01 = acc10 = acc11 = 0;
else
{
acc00 = mat(C,i,j);
acc01 = mat(C,i,(j+1));
acc10 = mat(C,(i+1),j);
acc11 = mat(C,(i+1),(j+1));
}
}
for (k = kk; k < kk + kb; k++)
{
acc00 += mat(A,i,k) * mat(B,k,j);
acc01 += mat(A,i,k) * mat(B,k,j+1);
acc10 += mat(A,(i+1),k) * mat(B,k,j);
acc11 += mat(A,(i+1),k) * mat(B,k,(j+1));
}
mat(C,i,j) = acc00;
mat(C,i,(j+1)) = acc01;
mat(C,(i+1),j) = acc10;
mat(C,(i+1),(j+1)) = acc11;
}
}
}
}
/*
void mul_matrix(matrix_t A, matrix_t B, matrix_t C){
int i,j,k;
int t;
int size = A.cols;
for (int j=0; j<size; j++) {
// iterates the j-th row of c
for (int i=0; i<size; i++) {
mat(C,i,j)=0;
}
// iterates the j-th row of b
for (int k=0; k<size; k++) {
t = mat(B,k,j);
// iterates the j-th row of c
// iterates the k-th row of a
for (int i=0; i<size; i++) {
mat(C,i,j) += mat(A,i,k) * t;
}
}
}
*/
/*
for (i = 0; i < size; i++){
for (j = 0; j < size; j++) {
mat(C,i,j) = 0;
for (k = 0; k < size; k++)
mat(C,i,j) += mat(A,i,k) * mat(B, k,j);
}
}
}
*/
void sum_matrix (matrix_t A, matrix_t B, matrix_t C){
int i,j;
int size = A.cols;
for(i=0; i< size; i++) {
for (j=0; j<size; j++)
mat(C,i,j) = mat(A,i,j) + mat(B, i,j);
}
}
void sub_matrix (matrix_t A, matrix_t B, matrix_t C){
int i,j;
int size = A.cols;
for(i=0; i< size; i++) {
for (j=0; j<size; j++)
mat(C,i,j) = mat(A,i,j) - mat(B, i,j);
}
}
int main(int argc, char **argv)
{
matrix_t x;
matrix_t y;
matrix_t z;
matrix_t m1,m2,m3,m4,m5,m6,m7;
matrix_t a11,a12,a21,a22;
matrix_t b11,b12,b21,b22;
matrix_t aresult, bresult;
matrix_t c11,c12,c21,c22;
int i,j,k =0;
int half = N/2;
matrix_init(&x, N, N);
matrix_init(&y, N, N);
matrix_init(&z, N, N);
//for(i = 0; i<N; i++)
// for(j = 0; j<N; j++)
// mat(z,i,j) =0;
//matrix_print(&z);
matrix_init(&a11, half, half);
matrix_init(&a12, half, half);
matrix_init(&a21, half, half);
matrix_init(&a22, half, half);
matrix_init(&b11, half, half);
matrix_init(&b12, half, half);
matrix_init(&b21, half, half);
matrix_init(&b22, half, half);
matrix_init(&aresult,half,half);
matrix_init(&bresult,half,half);
matrix_init(&m1, half, half);
matrix_init(&m2, half, half);
matrix_init(&m3, half, half);
matrix_init(&m4, half, half);
matrix_init(&m5, half, half);
matrix_init(&m6, half, half);
matrix_init(&m7, half, half);
matrix_init(&c11, half, half);
matrix_init(&c12, half, half);
matrix_init(&c21, half, half);
matrix_init(&c22, half, half);
struct timeval tv1, tv2;
//srand((unsigned)time(NULL));
for(i = 0; i<N; i++)
for(j = 0; j<N; j++)
mat(x,i,j) =1;// rand() % 10;
for(i = 0; i<N; i++)
for(j = 0; j<N; j++)
mat(y,i,j) =1;// rand() % 10;
gettimeofday(&tv1, NULL);
//dividing the matrices in 4 sub-matrices:
for(i=0; i<half; i++){
for (j=0; j<half;j++){
mat(a11,i,j)= mat(x,i,j);
mat(a12,i,j)= mat(x,i,j+half);
mat(a21,i,j)= mat(x,i+half,j);
mat(a22,i,j)= mat(x,i+half,j+half);
mat(b11,i,j)= mat(y,i,j);
mat(b12,i,j)= mat(y,i,j+half);
mat(b21,i,j)= mat(y,i+half, j);
mat(b22,i,j)= mat(y, i+half, j+half);
}
}
//matrix_print(&a11);
//matrix_print(&a12);
//matrix_print(&a21);
//matrix_print(&a22);
// Calculating m1 to m7:
sum_matrix(a11,a22,aresult);
sum_matrix(b11,b22,bresult);
mul_matrix(aresult,bresult,m1);
sum_matrix(a21,a22,aresult);
mul_matrix(aresult,b11,m2);
sub_matrix(b12,b22,aresult);
mul_matrix(a11,aresult,m3);
sub_matrix(b21,b11, aresult);
mul_matrix(a22,aresult,m4);
sum_matrix(a11,a12,aresult);
mul_matrix(aresult,b22,m5);
sub_matrix(a21,a11,aresult);
sum_matrix(b11,b12,bresult);
mul_matrix(aresult,bresult,m6);
sub_matrix(a12,a22,aresult);
sum_matrix(b21,b22,bresult);
mul_matrix(aresult,bresult,m7);
sum_matrix(m1,m4,aresult);
sub_matrix(aresult,m5,bresult);
sum_matrix(bresult,m7,c11);
sum_matrix(m3,m5,c12);
sum_matrix(m2,m4,c21);
sub_matrix(m1,m2, aresult);
sum_matrix(aresult,m3,bresult);
sum_matrix(bresult,m6,c22);
//matrix_print(&c11);
//matrix_print(&c12);
//matrix_print(&c21);
//matrix_print(&c22);
//combine matrix
for(i=0; i < half; i++){
for (j=0; j<half;j++){
mat(z,i,j)= mat(c11,i,j);
mat(z,(i+half),j)= mat(c12,i,j);
mat(z,i,(j+half))= mat(c21,i,j);
mat(z,(i+half),(j+half))= mat(c22,i,j);
}
}
gettimeofday(&tv2, NULL);
//matrix_print(&x);
//matrix_print(&y);
//matrix_print(&z);
printf ("\n \nTotal time = %f seconds\n",(double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + (double) (tv2.tv_sec - tv1.tv_sec));
free_array(&x);
free_array(&y);
free_array(&z);
free_array(&m1);
free_array(&m2);
free_array(&m3);
free_array(&m4);
free_array(&m5);
free_array(&m6);
free_array(&m7);
free_array(&a11);
free_array(&a12);
free_array(&a21);
free_array(&a22);
free_array(&b11);
free_array(&b12);
free_array(&b21);
free_array(&b22);
free_array(&aresult);
free_array(&bresult);
free_array(&c11);
free_array(&c12);
free_array(&c21);
free_array(&c22);
return 0;
}