I'm going over the code of haar.cpp to understand the sliding window approach. Here is the code:
for( factor = 1; ; factor *= scaleFactor )
        {
            CvSize winSize = { cvRound(winSize0.width*factor),
                                cvRound(winSize0.height*factor) };
            CvSize sz = { cvRound( img->cols/factor ), cvRound( img->rows/factor ) };
            CvSize sz1 = { sz.width - winSize0.width + 1, sz.height - winSize0.height + 1 };
            CvRect equRect = { icv_object_win_border, icv_object_win_border,
                winSize0.width - icv_object_win_border*2,
                winSize0.height - icv_object_win_border*2 };
            CvMat img1, sum1, sqsum1, norm1, tilted1, mask1;
            CvMat* _tilted = 0;
            if( sz1.width <= 0 || sz1.height <= 0 )
                break;
            if( winSize.width > maxSize.width || winSize.height > maxSize.height )
                break;
            if( winSize.width < minSize.width || winSize.height < minSize.height )
                continue;
            img1 = cvMat( sz.height, sz.width, CV_8UC1, imgSmall->data.ptr );
            sum1 = cvMat( sz.height+1, sz.width+1, CV_32SC1, sum->data.ptr );
            sqsum1 = cvMat( sz.height+1, sz.width+1, CV_64FC1, sqsum->data.ptr );
            if( tilted )
            {
                tilted1 = cvMat( sz.height+1, sz.width+1, CV_32SC1, tilted->data.ptr );
                _tilted = &tilted1;
            }
            norm1 = cvMat( sz1.height, sz1.width, CV_32FC1, normImg ? normImg->data.ptr : 0 );
            mask1 = cvMat( sz1.height, sz1.width, CV_8UC1, temp->data.ptr );
            cvResize( img, &img1, CV_INTER_LINEAR );
            cvIntegral( &img1, &sum1, &sqsum1, _tilted );
            int ystep = factor > 2 ? 1 : 2;
            const int LOCS_PER_THREAD = 1000;
            int stripCount = ((sz1.width/ystep)*(sz1.height + ystep-1)/ystep + LOCS_PER_THREAD/2)/LOCS_PER_THREAD;
            stripCount = std::min(std::max(stripCount, 1), 100);
#ifdef HAVE_IPP
            if( use_ipp )
            {
                cv::Mat fsum(sum1.rows, sum1.cols, CV_32F, sum1.data.ptr, sum1.step);
                cv::Mat(&sum1).convertTo(fsum, CV_32F, 1, -(1<<24));
            }
            else
#endif
                cvSetImagesForHaarClassifierCascade( cascade, &sum1, &sqsum1, _tilted, 1. );
            cv::Mat _norm1(&norm1), _mask1(&mask1);
            cv::parallel_for_(cv::Range(0, stripCount),
                         cv::HaarDetectObjects_ScaleImage_Invoker(cascade,
                                (((sz1.height + stripCount - 1)/stripCount + ystep-1)/ystep)*ystep,
                                factor, cv::Mat(&sum1), cv::Mat(&sqsum1), &_norm1, &_mask1,
                                cv::Rect(equRect), allCandidates, rejectLevels, levelWeights, outputRejectLevels, &mtx));
        }
    }
Now, I want to make sure I got everything right. As I understand, we loop over the scales and in each scale we subsample the image and try to find objects at a fixed size (20X20 for faces), going over all the x and y locations.
The pseudo- code is:
for scale=1:ScaleMax
 for X=1:width
      for Y=1:height 
            Try do detect a face at position (x,y) and of a fixedsize of 20X20.
Is that precise or did I get something wrong?
Thanks,
Gil.
 
                        
While the understanding is accurate, it is not precise. For better precision, you should read the original paper from Viola and Jones, since all the magic is in the step "Try do detect a face at position (x,y) and of a fixedsize of 20X20"