I'm trying to use a Gaussian filter on a generated Mandelbrot set, and I have a solution that will work sequentially. However, I do want to have a solution that works using GPU processing.
I'm using C++ AMP for the GPU processing. One issue with storing the individual colour channels of each pixel is that AMP doesn't support unsigned 8-bit integers (uint8_t), so I've had to store the channels in unsigned 32-bit integers (uint32_t) instead. Apart from that, the AMP code uses the same logic as the sequential version; however, the image generated with AMP has the wrong colours, although the shape of the Mandelbrot set can still be seen.
So, I believe it's the way that I'm recombining the channels into a pixel that's giving the wrong colours. If any more details are needed, I can provide them.
Sequential Output
Sequential Code:
void applyFilter(){
int kernelCentreX, kernelCentreY;// center index of kernel
int kernelRadius = 5 / 2;
uint8_t r, g, b;
float kernelTotal = 0.0;
float redTotal = 0.0, blueTotal = 0.0, greenTotal = 0.0;
float sum;
the_clock::time_point start = the_clock::now();
for (int y = 0; y < HEIGHT; y++) { //loop through image height
for (int x = 0; x < WIDTH; x++) {//loop through image width
float redTotal = 0.0, blueTotal = 0.0, greenTotal = 0.0;
float kernelTotal = 0.0;
for (int v = 0; v < 5; v++) { //loop through for kernel height
for (int u = 0; u < 5; u++) { //loop through for kernel width
// Current position
int cX = x + u - kernelRadius;
int cY = y + v - kernelRadius;
//Make sure we stay in boundries
if (cX < 0 || cX > WIDTH - 1 || cY < 0 || cY > HEIGHT - 1)
{
continue;
}
//Get colour channels of current pixel
r = (image[cY][cX] >> 16);
g = (image[cY][cX] >> 8);
b = (image[cY][cX]);
//Get colour channels of current pixel
/*r = image[cY][cX] >> 16;
g = image[cY][cX] >> 8;
b = image[cY][cX];*/
//Calculate Totals
redTotal += r *kernel[v][u];
greenTotal += g *kernel[v][u];
blueTotal += b *kernel[v][u];
kernelTotal += kernel[v][u];
}
}
//Calculate new pixel values
r = (redTotal / kernelTotal);
g = (greenTotal / kernelTotal);
b = (blueTotal / kernelTotal);
image[y][x] = (r << 16 | g << 8 | b);
}
}
the_clock::time_point end = the_clock::now();
auto time_taken = duration_cast<milliseconds>(end - start).count();
cout << "Time taken to apply kernel(sequential) " << time_taken << "ms" << endl;
}
Parallel Output
Parallel Code:
void applyFilterAMP() {
the_clock::time_point start = the_clock::now();
uint32_t *pImage = &(image[0][0]);
uint32_t *pFilteredImage = &(filteredImage[0][0]);
float *pKernel = &(kernel[0][0]);
uint32_t pi[HEIGHT/3][WIDTH/3][3];
uint32_t* pPi = &(pi[0][0][0]);
array_view<float, 2>k(5, 5, pKernel);
array_view<uint32_t, 2> m(HEIGHT, WIDTH, pImage);
array_view<uint32_t, 2> fi(HEIGHT, WIDTH, pFilteredImage);
//array_view<uint32_t, 3> piColor(HEIGHT/3, WIDTH/3, 3, pPi);
fi.discard_data();
static const int TileSize = 16;
parallel_for_each(fi.extent, [=](concurrency::index<2> idx) restrict(amp) {
//parallel_for_each(fi.extent.tile<TileSize, TileSize>(), [=](tiled_index<TileSize, TileSize> tidx) restrict(amp) {
int kernelCentreX, kernelCentreY;// center index of kernel
int kernelRadius = 5/2;
int y = idx[0];
int x = idx[1];
//int y = tidx.global[0];
//int x = tidx.global[1];
float kernelTotal = 0.0;
float redTotal = 0.0, blueTotal = 0.0, greenTotal = 0.0;
float sum=0.0;
uint32_t r, g, b, newPixel;
for (int v = 0; v < 5; v++) { //loop through for kernel height
for (int u = 0; u < 5; u++) { //loop through for kernel width
// Current position
int cX = x + u - kernelRadius;
int cY = y + v - kernelRadius;
//int cX = tidx.global[1] + u - kernelRadius;
//int cY = tidx.global[0] + v - kernelRadius;
//Make sure we stay in boundries
if ((cX < 0 || cX > WIDTH - 1 || cY < 0 || cY > HEIGHT - 1))
{
continue;
}
//Get colour channels of pixel
r = m[cY][cX] >> 16;
g = m[cY][cX] >> 8;
b = m[cY][cX];
//Calculate Totals
redTotal += r *k[v][u];
greenTotal += g *k[v][u];
blueTotal += b *k[v][u];
kernelTotal += k[v][u];
}
}
//Calculate new pixel values
r = (redTotal / kernelTotal);
g = (greenTotal / kernelTotal);
b = (blueTotal / kernelTotal);
fi[y][x] = (r << 16 | g << 8 | b);
});
fi.synchronize();
//m.synchronize();
the_clock::time_point end = the_clock::now();
auto time_taken = duration_cast<milliseconds>(end - start).count();
cout << "Time taken to apply kernel(parallel) " << time_taken << "ms" << endl;
errno_t err = memcpy_s(image, sizeof(uint32_t)*(HEIGHT * WIDTH), filteredImage, sizeof(uint32_t)*(HEIGHT * WIDTH));
if (err)
{
printf("Error executing memcpy_s.\n");
}
/*for (int j = 0; j < HEIGHT; j++) {
for (int i = 0; i < WIDTH; i++) {
image[j][i] = filteredImage[j][i];
}
}*/
}