I am writing C++ code to compare the performance of OpenCV's morphologyEx method between its CPU and GPU versions. Here is my code:
#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <sys/time.h>       
#include <ctime>
using namespace cv;
using namespace std;
// Captures the current OpenCV tick counter as a double, to be handed to
// end_timer() later as the start of the measured interval.
double start_timer()
{
    return static_cast<double>(getTickCount());
}
// Stops a measurement begun with start_timer().
//
// start_time: tick count previously returned by start_timer().
// num_tests:  number of iterations that ran during the interval; used only
//             to print the per-iteration average.
//
// Prints the average time per iteration in milliseconds and returns the
// total elapsed time in milliseconds.
double end_timer(double start_time,int num_tests)
{
    const double now_ticks = (double) getTickCount();
    // Ticks -> milliseconds: scale by 1000, then divide by ticks-per-second.
    const double elapsed_ms = 1000 * (now_ticks - start_time) / getTickFrequency();
    const double per_frame_ms = elapsed_ms / num_tests;

    std::cout << "Average time of " << num_tests << " frames is: " << per_frame_ms << " ms" << std::endl;
    return elapsed_ms;
}
// Benchmarks the morphological "open" operation on one image, first with
// cv::morphologyEx (CPU) and then with cv::gpu::morphologyEx (GPU), running
// each 100 times and printing the average time per iteration plus the
// CPU/GPU total-time ratio.
//
// Returns -1 if "images.jpeg" cannot be read, 0 otherwise.
int main()
{
    Mat cpuSrc;
    cv::gpu::GpuMat src_gpu, dst_gpu;
    Mat dst;
    Mat element;
    int element_shape = MORPH_RECT;
    // 10x10 rectangular structuring element shared by the CPU and GPU runs.
    element = getStructuringElement(element_shape, Size(10, 10 ), Point(-1, -1) );
    cpuSrc = imread("images.jpeg",CV_LOAD_IMAGE_ANYDEPTH);
    if (!cpuSrc.data)
    {
        cerr << "Cannot read the data" << endl;
        return -1;
    }
    cout << "Starting calculating time for CPU ....." << endl;
    double start_time = start_timer();
    int d = 0;
    while(d<100)
    {
        cv::morphologyEx(cpuSrc, dst, CV_MOP_OPEN, element,Point(-1,-1),1);
        ++d;  // advance the frame counter; without this the loop never ends
    }
    double total_time_cpu = end_timer(start_time,d);
//--------------------------------------------------------------
    cout << "Starting calculating time for GPU ....." << endl;
    d = 0;
    cv::gpu::GpuMat buf1, buf2;
    gpu::Stream stream;
    double start_time_1 = start_timer();
    while(d<100)
    {
        // Each iteration queues upload -> morphology -> download on the stream,
        // so the measured GPU time includes the host<->device transfers.
        // NOTE(review): the asynchronous enqueueUpload/enqueueDownload calls
        // may require page-locked host memory (CudaMem) -- confirm against
        // the OpenCV gpu::Stream documentation.
        stream.enqueueUpload(cpuSrc, src_gpu);
        cv::gpu::morphologyEx(src_gpu,dst_gpu,CV_MOP_OPEN,element,
                   buf1,buf2,Point(-1,-1),1,stream);
        stream.enqueueDownload(dst_gpu, dst);
        ++d;  // advance the frame counter; without this the loop never ends
    }
    // Block until all queued stream work has finished before stopping the timer.
    stream.waitForCompletion();
    double total_time_gpu = end_timer(start_time_1,d);
    cout << "Gain is: " << total_time_cpu / total_time_gpu << endl;
    return 0;
}
I am using a loop to simulate a video that contains 100 frames. My hardware is an NVIDIA Corporation GF110 [GeForce GTX 570] and an Intel Corporation Xeon E5/Core i7 DMI2. Moreover, I measured the time for uploading and downloading separately: it is very large for the first frame, but after that it is negligible (approximately 0.02 ms per frame for uploading and 0.1 ms for downloading), so the main time consumption is in the morphologyEx operation.
The timing results for this simulation are as follows:
For the CPU morphology version, the average time over 100 frames is 0.027349 ms, and for the GPU version it is 18.0128 ms.
Could you please help me figure out what might be the reasons for such unexpected performance?
Thank you so much in advance.
 
                        
In the initialization you should call:
It will speed up initialization.