PCL GPU Octree implementation is slow

813 views Asked by At

I want to use pcl::gpu::Octree for storing the obstacle map and retrieving the closest distance to a given point. I've modified an official test example as follows:

  // Benchmark: 1-nearest-neighbour lookup on a generated point cloud, once
  // through pcl::gpu::Octree (batched) and once through the host-side
  // pcl::octree::OctreePointCloudSearch, then print both answers per query.
  int main(int argc, char** argv){

        using namespace pcl::gpu;

        // Synthetic data set: number of points, number of queries and the
        // bounding cube size are configured here.
        pcl::gpu::DataGenerator data;
        data.data_size = 871000;
        data.tests_num = 2;    
        data.cube_size = 1024.f;
        data.max_radius    = data.cube_size/30.f;
        data.shared_radius = data.cube_size/30.f;
        data.printParams();

        const float host_octree_resolution = 25.f;
        const int k = 1; // only this is supported

        //generate
        data();
        //prepare device cloud
        pcl::gpu::Octree::PointCloud cloud_device;
        cloud_device.upload(data.points);

        //prepare host cloud (same points, converted to pcl::PointXYZ)
        pcl::PointCloud<pcl::PointXYZ>::Ptr cloud_host(new pcl::PointCloud<pcl::PointXYZ>); 
        cloud_host->width = data.points.size();
        cloud_host->height = 1;
        cloud_host->points.resize (cloud_host->width * cloud_host->height);    
        std::transform(data.points.cbegin(), data.points.cend(), cloud_host->begin(), DataGenerator::ConvPoint<pcl::PointXYZ>());

        //gpu build 
        pcl::gpu::Octree octree_device;                
        octree_device.setCloud(cloud_device);       
        octree_device.build();
        
        //build host octree
        pcl::octree::OctreePointCloudSearch<pcl::PointXYZ> octree_host(host_octree_resolution);
        octree_host.setInputCloud (cloud_host);    
        octree_host.addPointsFromInputCloud();
               
        //upload queries
        pcl::gpu::Octree::Queries queries_device;
        queries_device.upload(data.queries);

        //prepare output buffers on host
        std::vector<pcl::Indices > result_host(data.tests_num);
        std::vector<std::vector<float> >  dists_host(data.tests_num);
        for(std::size_t i = 0; i < data.tests_num; ++i)
        {
            result_host[i].reserve(k);
            dists_host[i].reserve(k);
        }

        //prepare output buffers on device
        pcl::gpu::NeighborIndices result_device;
        pcl::gpu::Octree::ResultSqrDists result_sqr_distances;

        //search GPU shared: all queries are submitted in a single batch call
        {
            pcl::ScopeTime time("1nn-gpu");
            octree_device.nearestKSearchBatch(queries_device, k, result_device, result_sqr_distances);
        }

        // Copy the GPU answers back to the host (not part of either timed scope).
        std::vector<int> downloaded;
        result_device.data.download(downloaded);

        std::vector<float> downloaded_sqr_dists;
        result_sqr_distances.download(downloaded_sqr_dists);

        // Time ONLY the host searches. Console output is kept out of this
        // scope so that the "1nn-cpu" figure measures the same kind of work
        // as the "1nn-gpu" figure above (search only, no I/O).
        {
            pcl::ScopeTime time("1nn-cpu");
            for(std::size_t i = 0; i < data.tests_num; ++i){
                octree_host.nearestKSearch(data.queries[i], k, result_host[i], dists_host[i]);
            }
        }

        // Report the CPU and GPU answers side by side for every query.
        // downloaded[i] is read as the single GPU neighbour of query i, which
        // relies on k == 1.
        for(std::size_t i = 0; i < data.tests_num; ++i){
            for (std::size_t j = 0; j < result_host[i].size(); ++j){
                const auto& cpu_point = (*cloud_host)[ result_host[i].at(j) ];
                const auto& gpu_point = (*cloud_host)[downloaded[i]];
                std::cout << "  cpu  "  <<   cpu_point.x  << " " << cpu_point.y << " " << cpu_point.z << " (squared distance: " << dists_host[i].at(j) << ")" << '\n';
                std::cout << "  gpu  "  <<   gpu_point.x  << " " << gpu_point.y << " " << gpu_point.z << " (squared distance: " << downloaded_sqr_dists[i] << ")" << '\n';
            }
        }
    }

Output:

1nn-gpu took 0.103565ms.

1nn-cpu took 0.057062ms.

Problem: Since the map is huge, my objective is to store it as a kd-tree on the GPU, so that whenever the host needs the free distance to a given position, the GPU returns it. However, the CPU is always faster than the GPU in my measurements. Am I using the wrong approach, or is the GPU simply not suitable for this use case?

0

There are 0 answers