Caffe: net forward pass inside a for loop only seems to run once

I am currently trying to write a C++ wrapper for PSPNet's prediction code (originally written in MATLAB). PSPNet runs on Caffe.

Situation: I have a trained Caffe model and would like to implement this wrapper to produce the segmentation result for a given input image. In this case, my crop_size is smaller than the image's original size, so the image is manually cropped into multiple 425x425 "frames"; each frame is pre-processed and then fed forward through the Caffe net inside a for loop.

Problem: The net seems to run forward only once, despite the forward call being inside a for loop. This is supported by the processing time and the output; see below.

This is the incomplete code I am currently trying to work on:

#define USE_OPENCV 1
#define trimapSize 1
#define Debug 0
#include <caffe/caffe.hpp>

#include "Header.h"
#include "caffe/data_reader.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/blob.hpp"

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif  // USE_OPENCV

#include <algorithm>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <chrono> //Just for time measurement
#include <cmath>
#include <array>

#include <iostream>
#include <fstream>

using namespace caffe;  // NOLINT(build/namespaces)
using std::string;

/* Wraps a Caffe network and the OpenCV pre-/post-processing needed to run
 * semantic segmentation on a single image. */
class Classifier {
 public:
  /* model_file: network definition (deploy prototxt);
   * trained_file: trained model (.caffemodel). */
  Classifier(const string& model_file,
             const string& trained_file);

  /* Runs the segmentation pipeline on `img` and returns the result image. */
  cv::Mat Predict(const cv::Mat& img);

 private:
  /* Fills mean_ with the constant per-channel mean for the given size. */
  void SetMean(int weight, int heigh);

  /* Makes `input_channels` alias the planes of the network's input blob. */
  void WrapInputLayer(std::vector<cv::Mat>* input_channels);

  /* Converts an output score blob into a colour-coded label image. */
  cv::Mat Visualization(Blob<float>* output_layer);

  /* Tiles `img_scale` into crop-sized frames and pushes each through the net. */
  cv::Mat Preprocess(const cv::Mat& img_scale, int ori_rows, int ori_cols, std::vector<cv::Mat>* input_channels);

 private:
  shared_ptr<Net<float> > net_;  // the loaded network
  cv::Size input_geometry_;      // width/height of the input blob
  int num_channels_;             // channel count of the input blob (1 or 3)
  cv::Mat mean_;                 // mean image subtracted from every frame
};

/* Builds the network from `model_file`, loads the trained weights from
 * `trained_file`, validates the blob counts and caches the input geometry. */
Classifier::Classifier(const string& model_file,
                       const string& trained_file) {
  /* Load the network definition, then the learned parameters. Without the
   * second call `trained_file` is ignored and the net keeps its initial
   * (untrained) weights. */
  net_.reset(new Net<float>(model_file, TEST));
  net_->CopyTrainedLayersFrom(trained_file);

  CHECK_EQ(net_->num_inputs(), 1) << "Network should have exactly one input.";
  /* The check requires two output blobs, so the message says so too. */
  CHECK_EQ(net_->num_outputs(), 2) << "Network should have exactly two outputs.";

  Blob<float>* input_layer = net_->input_blobs()[0];
  num_channels_ = input_layer->channels();
  CHECK(num_channels_ == 3 || num_channels_ == 1)
    << "Input layer should have 1 or 3 channels.";
  input_geometry_ = cv::Size(input_layer->width(), input_layer->height());
}

/* Fill mean_ with a constant per-channel mean image.
 *
 * No mean file is read: every pixel of the (heigh x weight) CV_32FC3 image
 * is set to the same hard-coded triple.
 *
 * weight: number of columns of the mean image.
 * heigh:  number of rows of the mean image.
 */
void Classifier::SetMean(int weight, int heigh) {
  mean_ = cv::Mat(heigh, weight, CV_32FC3,
                  cv::Scalar(94.6744, 88.8887, 100.5404));  // RGB
}


/* Runs the (multi-scale) prediction pipeline on `img`.
 *
 * Reshapes the first input blob to 1 x num_channels_ x input_geometry_, then,
 * for every entry of scale_array, resizes `img` and adds the cv::Mat returned
 * by Preprocess() to an accumulator, which is returned.
 *
 * NOTE(review): this listing appears to have lost lines (closing braces,
 * `else` keywords and possibly whole statements). The spots are flagged
 * below; confirm against the original source before compiling.
 */
cv::Mat Classifier::Predict(const cv::Mat& img) {

  // NOTE(review): originalTmp is not used anywhere in the visible code.
  cv::Mat originalTmp = img.clone();
  Blob<float>* input_layer = net_->input_blobs()[0];
  input_layer->Reshape(1, num_channels_,
                       input_geometry_.height, input_geometry_.width);

  std::cout << "input_geometry_.height = " << input_geometry_.height << "input_geometry_.width = "<< input_geometry_.width << std::endl;

  /* Forward dimension change to all layers. */
  // NOTE(review): no statement follows the comment above; presumably a
  // net_->Reshape() call belongs here — confirm.

  // NOTE(review): input_channels is handed to Preprocess() without being
  // wrapped around the input blob in the visible code.
  std::vector<cv::Mat> input_channels;

   /*-----------------------------FOR MULTI-SCALE PROCESSING--------------------------*/
  int base_size = 0;
  int ori_rows = img.rows;
  int ori_cols = img.cols;
  // Only scale 1 is active; the multi-scale list is kept below for reference.
  float scale_array [1] = {1};
  //  float scale_array = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
  std::cout << "ori_rows = " << ori_rows << "\t ori_cols = " << ori_cols << std::endl;
  // Accumulator for the per-scale results (fixed 425x425, 3-channel float).
  cv::Mat data_all = cv::Mat::zeros(cv::Size(425, 425), CV_32FC3);

  // base_size = longer side of the input image.
  // NOTE(review): the `}` closing this `if` before `else` is missing.
  if (ori_rows > ori_cols) {
       base_size = ori_rows;
  else base_size =  ori_cols;

  std::cout << "base_size = " << base_size << std::endl;
  std::cout << "size of array = " << (sizeof(scale_array)/sizeof(*scale_array)) << std::endl;

  for (int i=0; i < (sizeof(scale_array)/sizeof(*scale_array)); i++){
    // The float product is truncated when stored into the int.
    int long_size = base_size * scale_array[i] + 1;
    int new_rows = long_size;
    int new_cols = long_size;

     std::cout << "BEFORE new rows = " << new_rows << "\t new cols = " << new_cols << std::endl;

    // Shrink the shorter side to keep the aspect ratio.
    // NOTE(review): all operands are int, so long_size/ori_rows (and
    // long_size/ori_cols) is truncated before the multiplication and round()
    // has nothing left to round. Also, the `}` before `else` is missing.
    if (ori_rows > ori_cols){
      new_cols = round(long_size/ori_rows*ori_cols);
    else {new_rows = round(long_size/ori_cols*ori_rows);}

    std::cout << "AFTER new rows = " << new_rows << "\t new cols = " << new_cols << std::endl;

    cv::Mat img_scale;
    cv::resize(img, img_scale, cv::Size(new_cols, new_rows), 0, 0, CV_INTER_LINEAR);

    std::cout << "img_scale height: " << img_scale.rows << "\t width = " << img_scale.cols << std::endl;

    // NOTE(review): cv::Mat addition needs operands of equal size and type;
    // confirm that Preprocess() returns a 425x425 CV_32FC3 image.
    data_all = data_all + Preprocess(img_scale, ori_rows, ori_cols, &input_channels);
    std::cout << "ok! DONE PREPROCESS!" << std::endl;

  // NOTE(review): the braces closing the for loop and the function are missing.
  return data_all;

/* Splits `img_scale` into crop_size x crop_size frames on a strided grid,
 * normalises each frame, writes it into *input_channels and visualises the
 * network output for it.
 *
 * img_scale:       the (already rescaled) image to process.
 * ori_rows/cols:   size of the original image; only used for data_output.
 * input_channels:  destination of cv::split for each normalised frame.
 * Returns img_processed, the last normalised frame assigned in the loop.
 *
 * NOTE(review): this listing appears to have lost lines (opening/closing
 * braces, `else` keywords, the crop itself, the CHECK head and the active
 * forward call). The spots are flagged below; confirm against the original.
 */
cv::Mat Classifier::Preprocess(const cv::Mat& img_scale, int ori_rows, int ori_cols, std::vector<cv::Mat>* input_channels)
  int crop_size = 425;
  int new_rows = img_scale.rows;
  int new_cols = img_scale.cols;
  // NOTE(review): data_output is not used anywhere in the visible code.
  cv::Mat data_output = cv::Mat::zeros(cv::Size(ori_cols, ori_rows), CV_32FC3);
  // long_size = longer side of img_scale.
  int long_size = new_rows;
  cv::Mat img_processed;

  if (new_cols > new_rows){
    long_size = new_cols;

  // Image fits in a single crop: this branch is a placeholder only.
  if (long_size <= crop_size){
    // img_processed = Preprocess(img_scale, &input_channels);
    //RUN CAFFE --- NOT YET DONE ---
    std::cout << "OK!" << std::endl;
  else {
    // Neighbouring frames are offset by 2/3 of the crop size.
    float stride_rate = 2.0/3.0;
    std::cout << "stride_rate = " << stride_rate << std::endl;
    int stride = ceil(crop_size*stride_rate);
    std::cout << "stride = " << stride << std::endl;

    // No padding is applied here; img_pad shares data with img_scale.
    cv::Mat img_pad = img_scale;

    int pad_rows = img_pad.rows;
    int pad_cols = img_pad.cols;
    // Number of frames per column / per row.
    // NOTE(review): (pad_* - crop_size)/stride is an int division, so it is
    // truncated before ceil() sees it and the last partial step is dropped.
    int h_grid = ceil((pad_rows - crop_size)/stride) + 1;
    int w_grid = ceil((pad_cols - crop_size)/stride) + 1;
    cv::Mat img_sub;

    // NOTE(review): the size is (pad_cols, pad_cols); presumably
    // (pad_cols, pad_rows) was intended — confirm. data_scale is not used in
    // the visible code.
    cv::Mat data_scale = cv::Mat::zeros(cv::Size(pad_cols, pad_cols), CV_32FC3);

    // Grid indices and pixel coordinates below are 1-based.
    for(int grid_yidx = 1; grid_yidx <= h_grid; grid_yidx++){
      for (int grid_xidx = 1; grid_xidx <= w_grid; grid_xidx++){
        int s_x = (grid_xidx-1)*stride+1;
        int s_y = (grid_yidx-1)*stride+1;
        int e_x = std::min(s_x + crop_size -1, pad_cols);
        int e_y = std::min(s_y + crop_size -1, pad_rows);
        // Shift the start back so frames at the border keep the full crop size.
        s_x = e_x - crop_size + 1;
        s_y = e_y - crop_size + 1;

    /* Cropping image */
        // NOTE(review): no crop statement is visible; img_sub is never
        // assigned from img_pad in this listing.

        // Convert the frame to the channel count the network expects.
        cv::Mat sample;
       if (img_sub.channels() == 3 && num_channels_ == 1)
          cv::cvtColor(img_sub, sample, cv::COLOR_BGR2GRAY);
        else if (img_sub.channels() == 4 && num_channels_ == 1)
          cv::cvtColor(img_sub, sample, cv::COLOR_BGRA2GRAY);
        else if (img_sub.channels() == 4 && num_channels_ == 3)
          cv::cvtColor(img_sub, sample, cv::COLOR_BGRA2BGR);
        else if (img_sub.channels() == 1 && num_channels_ == 3)
          cv::cvtColor(img_sub, sample, cv::COLOR_GRAY2BGR);
          // NOTE(review): without a preceding `else`, this assignment runs
          // unconditionally and overwrites any conversion done above.
          sample = img_sub;

        cv::Mat sample_float;

        if (num_channels_ == 3)
          sample.convertTo(sample_float, CV_32FC3);
          // NOTE(review): without a preceding `else`, this call always runs.
          sample.convertTo(sample_float, CV_32FC1);

        // rows is passed as SetMean's first parameter (`weight`) and cols as
        // its second (`heigh`).
        SetMean(sample.rows, sample.cols);

        cv::imshow("sample_float", sample_float);

        // NOTE(review): COLOR_BGRA2RGB is a 4-channel-source conversion code;
        // for a 3-channel frame COLOR_BGR2RGB is presumably intended — confirm.
        cv::cvtColor(sample_float, sample_float, cv::COLOR_BGRA2RGB);
        sample_float =  sample_float.t();

        cv::Mat sample_normalized(sample_float.size(),sample_float.type());

        // Mean subtraction is done on the transposed frame.
        cv::subtract(sample_float.clone(), mean_, sample_normalized); 

  // Debug dumps of the normalised frame.
  cv::Mat sample_temp;
  sample_normalized.convertTo(sample_temp, CV_32FC3, 255);
  cv::imwrite("/home/sgp1053c/Desktop/PSPNET-cudnn5_wrapper/wrapper/sample_normalized.png", sample_temp);
  cv::imshow("sample_normalized", sample_normalized);

        /* This operation will write the separate BGR planes directly to the
         * input layer of the network because it is wrapped by the cv::Mat
         * objects in input_channels. */
        // NOTE(review): the resolution posted with this code is that the
        // input layer has to be wrapped again for every frame, i.e.
        // WrapInputLayer(input_channels) must be called inside this double
        // loop (presumably before cv::split) so each frame reaches the net.
        img_processed = sample_normalized.t();

        cv::split(img_processed, *input_channels);

        // NOTE(review): the beginning of this CHECK(...) expression is missing.
              == net_->input_blobs()[0]->cpu_data())
          << "Input channels are not wrapping the input layer of the network.";

        img_processed.convertTo(sample_temp, CV_32FC3, 255);
        cv::imwrite("/home/sgp1053c/Desktop/PSPNET-cudnn5_wrapper/wrapper/img_processed.png", sample_temp);

        std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); //Just for time measurement

        // NOTE(review): no active forward call is visible between the two
        // time stamps; only the commented-out variant below remains.
        // float loss = 0.0;
        // net_->Forward(&loss);

        std::chrono::steady_clock::time_point end= std::chrono::steady_clock::now();
        std::cout << "Processing time = " << (std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count())/1000000.0 << " sec" <<std::endl; //Just for time measurement

        /* Visualise the first output blob and save it; the file is
         * overwritten on every iteration. */
        Blob<float>* output_layer = net_->output_blobs()[0];

        cv::Mat segment = Visualization(output_layer);
        cv::imwrite("/home/sgp1053c/Desktop/PSPNET-cudnn5_wrapper/wrapper/segment.png", segment);
  // NOTE(review): the braces closing both loops, the else branch and the
  // function are missing.
  return (img_processed);

/* One palette colour; components are stored as plain ints. */
struct RGB {
  int R;
  int G;
  int B;
};

/* Build the colour palette used to paint class labels.
 *
 * Entry 0 is black; entry i (1 <= i < nClass) is
 * (i*50, i*50 + i, 255 - i*20). Components are NOT clamped to [0, 255], so
 * they wrap if a caller narrows them to an 8-bit channel.
 *
 * nClass: number of classes; the result holds max(nClass, 1) entries, so
 *         index 0 is always valid.
 */
std::vector<RGB> get_palette(int nClass)
{
  std::vector<RGB> palette;
  if (nClass > 1)
    palette.reserve(nClass);

  /* Class 0 is always black. */
  RGB rgb0;
  rgb0.R = 0;
  rgb0.G = 0;
  rgb0.B = 0;
  palette.push_back(rgb0);

  for (int i = 1; i < nClass; i++)
  {
    RGB rgb;
    rgb.R = i*50;
    rgb.G = i*50 + i;
    rgb.B = 255-i*20;
    /* Store the colour; otherwise the palette stays empty and any lookup by
     * label index is out of range. */
    palette.push_back(rgb);
  }

  return palette;
}

/* Turn the per-class score blob of the network into a colour-coded label
 * image, show it in a window and return it.
 *
 * For every pixel the class with the highest score is selected (the first
 * one wins on ties) and painted with the matching palette colour in B, G, R
 * order. Only the first item of the batch is read.
 *
 * output_layer: score blob; its channels() is taken as the number of classes
 *               and its height()/width() as the size of the label image.
 * Returns an H x W CV_8UC3 image.
 */
cv::Mat Classifier::Visualization(Blob<float>* output_layer) {
  const int H = output_layer->height();
  const int W = output_layer->width();
  const int C = output_layer->channels(); //Number of classes
  const int num_pixels = H * W;

  /* The score of class c at pixel p is read from c * num_pixels + p. */
  const float* output_data = output_layer->cpu_data();

  /* Size the palette from the blob so no label can index past its end. */
  const vector<RGB> listPalette = get_palette(C);

  /* rows = H, cols = W, so the image matches the blob for non-square output. */
  cv::Mat labelTmp(H, W, CV_8UC3);

  for (int row = 0; row < H; ++row) {
    cv::Vec3b* out_row = labelTmp.ptr<cv::Vec3b>(row);
    for (int col = 0; col < W; ++col) {
      const int pixel = row * W + col;

      /* Arg-max over the class axis. */
      int best_class = 0;
      for (int k = 1; k < C; ++k) {
        if (output_data[k * num_pixels + pixel] >
            output_data[best_class * num_pixels + pixel]) {
          best_class = k;
        }
      }

      /* Palette components are ints; narrowing wraps modulo 256. */
      const RGB& rgb = listPalette[best_class];
      out_row[col] = cv::Vec3b(static_cast<uchar>(rgb.B),
                               static_cast<uchar>(rgb.G),
                               static_cast<uchar>(rgb.R));
    }
  }

  cv::imshow( "Display window", labelTmp);

  return labelTmp;
}

/* Wrap the input layer of the network in separate cv::Mat objects
 * (one per channel). This way we save one memcpy operation and we
 * don't need to rely on cudaMemcpy2D. The last preprocessing
 * operation will write the separate channels directly to the input
 * layer.
 *
 * input_channels: receives one height x width CV_32FC1 header per input
 *                 channel, each pointing into the blob's CPU buffer. Any
 *                 previous contents are discarded, so the function can be
 *                 called again before every frame that is fed to the net. */
void Classifier::WrapInputLayer(std::vector<cv::Mat>* input_channels) {
  Blob<float>* input_layer = net_->input_blobs()[0];

  const int width = input_layer->width();
  const int height = input_layer->height();
  float* input_data = input_layer->mutable_cpu_data();

  /* Drop stale wrappers so repeated calls do not pile up channels. */
  input_channels->clear();
  for (int i = 0; i < input_layer->channels(); ++i) {
    /* Header only: no pixel data is copied. */
    cv::Mat channel(height, width, CV_32FC1, input_data);
    input_channels->push_back(channel);
    input_data += width * height;
  }
}

/* Command-line entry point: builds a Classifier from the model files given
 * in argv[1] and argv[2] and loads the image given in argv[3].
 *
 * NOTE(review): the listing is truncated — closing braces are missing and
 * `prediction` is declared but never assigned or used in the visible code
 * (presumably a classifier.Predict(img) call followed) — confirm.
 */
int main(int argc, char** argv) {
  // NOTE(review): the usage text lists four files, but argc != 4 accepts
  // exactly three arguments after the program name.
  if (argc != 4) {
    std::cerr << "Usage: " << argv[0]
              << " \ndeploy.prototxt \nnetwork.caffemodel"
              << " \nimg.jpg" << " \ncamvid12.png (for example: /SegNet-Tutorial/Scripts/camvid12.png)" << std::endl;
    return 1;


  string model_file   = argv[1];
  // Second argument; the usage text names it network.caffemodel.
  string trained_file = argv[2]; //for visualization

  Classifier classifier(model_file, trained_file);

  string file = argv[3];

  std::cout << "---------- Semantic Segmentation for "
            << file << " ----------" << std::endl;

  // Flag 1 loads the image as 3-channel colour.
  cv::Mat img = cv::imread(file, 1);
  CHECK(!img.empty()) << "Unable to decode image " << file;
  cv::Mat prediction;

/* Fallback entry point that only logs a fatal error asking for a build with
 * USE_OPENCV.
 * NOTE(review): presumably this sits in the `#else` branch of the USE_OPENCV
 * guard closed below; the `#else` line and the closing brace are not visible
 * in this listing — confirm. */
int main(int argc, char** argv) {
  LOG(FATAL) << "This example requires OpenCV; compile with USE_OPENCV.";
#endif //USE_OPENCV

To clarify: the for loop in question is the double loop inside `Preprocess`, specifically this portion:

 for(int grid_yidx = 1; grid_yidx <= h_grid; grid_yidx++){
  for (int grid_xidx = 1; grid_xidx <= w_grid; grid_xidx++){
    int s_x = (grid_xidx-1)*stride+1;
    int s_y = (grid_yidx-1)*stride+1;
    int e_x = std::min(s_x + crop_size -1, pad_cols);
    int e_y = std::min(s_y + crop_size -1, pad_rows);
    s_x = e_x - crop_size + 1;
    s_y = e_y - crop_size + 1;

/* Cropping image */

    cv::Mat sample;
   if (img_sub.channels() == 3 && num_channels_ == 1)
      cv::cvtColor(img_sub, sample, cv::COLOR_BGR2GRAY);
    else if (img_sub.channels() == 4 && num_channels_ == 1)
      cv::cvtColor(img_sub, sample, cv::COLOR_BGRA2GRAY);
    else if (img_sub.channels() == 4 && num_channels_ == 3)
      cv::cvtColor(img_sub, sample, cv::COLOR_BGRA2BGR);
    else if (img_sub.channels() == 1 && num_channels_ == 3)
      cv::cvtColor(img_sub, sample, cv::COLOR_GRAY2BGR);
      sample = img_sub;

    cv::Mat sample_float;

    if (num_channels_ == 3)
      sample.convertTo(sample_float, CV_32FC3);
      sample.convertTo(sample_float, CV_32FC1);

    SetMean(sample.rows, sample.cols);

    cv::imshow("sample_float", sample_float);

    cv::cvtColor(sample_float, sample_float, cv::COLOR_BGRA2RGB);
    sample_float =  sample_float.t();

    cv::Mat sample_normalized(sample_float.size(),sample_float.type());

    cv::subtract(sample_float.clone(), mean_, sample_normalized); 

  cv::Mat sample_temp;
  sample_normalized.convertTo(sample_temp, CV_32FC3, 255);
  cv::imwrite("/home/sgp1053c/Desktop/PSPNET-cudnn5_wrapper/wrapper/sample_normalized.png", sample_temp);
  cv::imshow("sample_normalized", sample_normalized);

    /* This operation will write the separate BGR planes directly to the
     * input layer of the network because it is wrapped by the cv::Mat
     * objects in input_channels. */
    img_processed = sample_normalized.t();

    cv::split(img_processed, *input_channels);

          == net_->input_blobs()[0]->cpu_data())
      << "Input channels are not wrapping the input layer of the network.";

    img_processed.convertTo(sample_temp, CV_32FC3, 255);
    cv::imwrite("/home/sgp1053c/Desktop/PSPNET-cudnn5_wrapper/wrapper/img_processed.png", sample_temp);

    std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); //Just for time measurement

    // float loss = 0.0;
    // net_->Forward(&loss);

    std::chrono::steady_clock::time_point end= std::chrono::steady_clock::now();
    std::cout << "Processing time = " << (std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count())/1000000.0 << " sec" <<std::endl; //Just for time measurement

    /* Copy the output layer to a std::vector */
    Blob<float>* output_layer = net_->output_blobs()[0];

    cv::Mat segment = Visualization(output_layer);
    cv::imwrite("/home/sgp1053c/Desktop/PSPNET-cudnn5_wrapper/wrapper/segment.png", segment);

Original Image:Original Image (Without pre-processing)

Input: Input (first cropped frame)

Output: Output of the first cropped frame

Time taken for forwarding: Time taken

All of the following cropped frames give the same output as the first one.

P.S.: If I move the code below to the end of the Predict function and return `segment` instead, it works well, but only the last cropped frame is segmented.

 std::chrono::steady_clock::time_point begin = 
 std::chrono::steady_clock::now(); //Just for time measurement

 // float loss = 0.0;
 // net_->Forward(&loss);

 std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
 std::cout << "Processing time = " << (std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count())/1000000.0 << " sec" <<std::endl; //Just for time measurement

 /* Copy the output layer to a std::vector */
 Blob<float>* output_layer = net_->output_blobs()[0];

 cv::Mat segment = Visualization(output_layer);
 cv::imwrite("/home/sgp1053c/Desktop/PSPNET-cudnn5_wrapper/wrapper/segment.png", segment);`

input: Input (Last cropped frame of pre-processed image)

output: Output of the last cropped frame

Any help will be appreciated, thank you!


This issue is solved by re-wrapping the input channels around the network's input layer every time a new frame is written, so that each frame is actually fed forward.

Thus the function:

    WrapInputLayer(input_channels);

should be called inside the double for loop, once for every cropped frame.