I wanted to benchmark the difference in execution speed between an unrolled loop and a for loop applied on a triangle
object. The entire example is available here.
Here is the complete code:
#include <iostream>
#include <vector>
#include <array>
#include <random>
#include <functional>
#include <chrono>
#include <fstream>
template<typename RT>
class Point
{
std::array<RT,3> data;
public:
Point() = default;
Point(std::initializer_list<RT>& ilist)
:
data(ilist)
{}
Point(RT x, RT y, RT z)
:
data({x,y,z})
{};
RT& operator[](int i)
{
return data[i];
}
RT operator[](int i) const
{
return data[i];
}
const Point& operator += (Point const& other)
{
data[0] += other.data[0];
data[1] += other.data[1];
data[2] += other.data[2];
return *this;
}
const Point& operator /= (RT const& s)
{
data[0] /= s;
data[1] /= s;
data[2] /= s;
return *this;
}
};
template<typename RT>
Point<RT> operator-(const Point<RT>& p1, const Point<RT>& p2)
{
return Point<RT>(p1[0] - p2[0], p1[1] - p2[1], p1[2] - p2[2]);
}
template<typename RT>
std::ostream& operator<<(std::ostream& os , Point<RT> const& p)
{
os << p[0] << " " << p[1] << " " << p[2];
return os;
}
template<typename Point>
class Triangle
{
std::array<Point, 3> points;
public:
typedef typename std::array<Point, 3>::value_type value_type;
typedef Point PointType;
Triangle() = default;
Triangle(std::initializer_list<Point>& ilist)
:
points(ilist)
{}
Triangle(Point const& p1, Point const& p2, Point const& p3)
:
points({p1, p2, p3})
{}
Point& operator[](int i)
{
return points[i];
}
Point operator[](int i) const
{
return points[i];
}
auto begin()
{
return points.begin();
}
const auto begin() const
{
return points.begin();
}
auto end()
{
return points.end();
}
const auto end() const
{
return points.end();
}
};
template<typename Triangle>
typename Triangle::PointType barycenter_for(Triangle const& triangle)
{
typename Triangle::value_type barycenter;
for (const auto& point : triangle)
{
barycenter += point;
}
barycenter /= 3.;
return barycenter;
}
template<typename Triangle>
typename Triangle::PointType barycenter_unrolled(Triangle const& triangle)
{
typename Triangle::PointType barycenter;
barycenter += triangle[0];
barycenter += triangle[1];
barycenter += triangle[2];
barycenter /= 3.;
return barycenter;
}
template<typename TriangleSequence>
typename TriangleSequence::value_type::value_type
barycenter(
TriangleSequence const& triangles,
std::function
<
typename TriangleSequence::value_type::value_type (
typename TriangleSequence::value_type const &
)
> triangle_barycenter
)
{
typename TriangleSequence::value_type::value_type barycenter;
for(const auto& triangle : triangles)
{
barycenter += triangle_barycenter(triangle);
}
barycenter /= double(triangles.size());
return barycenter;
}
using namespace std;
int main(int argc, const char *argv[])
{
typedef Point<double> point;
typedef Triangle<point> triangle;
const int EXP = (atoi(argv[1]));
ofstream outFile;
outFile.open("results.dat",std::ios_base::app);
const unsigned int MAX_TRIANGLES = pow(10, EXP);
typedef std::vector<triangle> triangleVector;
triangleVector triangles;
std::random_device rd;
std::default_random_engine e(rd());
std::uniform_real_distribution<double> dist(-10,10);
for (unsigned int i = 0; i < MAX_TRIANGLES; ++i)
{
triangles.push_back(
triangle(
point(dist(e), dist(e), dist(e)),
point(dist(e), dist(e), dist(e)),
point(dist(e), dist(e), dist(e))
)
);
}
typedef std::chrono::high_resolution_clock Clock;
auto start = Clock::now();
auto trianglesBarycenter = barycenter(triangles, [](const triangle& tri){return barycenter_for(tri);});
auto end = Clock::now();
auto forLoop = end - start;
start = Clock::now();
auto trianglesBarycenterUnrolled = barycenter(triangles, [](const triangle& tri){return barycenter_unrolled(tri);});
end = Clock::now();
auto unrolledLoop = end - start;
cout << "Barycenter difference (should be a zero vector): " << trianglesBarycenter - trianglesBarycenterUnrolled << endl;
outFile << MAX_TRIANGLES << " " << forLoop.count() << " " << unrolledLoop.count() << "\n";
outFile.close();
return 0;
}
The example consists of a Point type, and a Triangle type. The benchmarked calculation is the calculation of the triangle barycenter. It can be done with a for loop:
for (const auto& point : triangle)
{
barycenter += point;
}
barycenter /= 3.;
return barycenter;
or it can be unrolled since a triangle has three points:
barycenter += triangle[0];
barycenter += triangle[1];
barycenter += triangle[2];
barycenter /= 3.;
return barycenter;
So I wanted to test which function that computes a barycenter will be faster, for a set of triangles. To make the most of the test, I made the number of triangles being operated on variable by executing the main program with an integer exponent argument:
./main 6
resulting in 10^6 triangles. The number of triangles is ranging from 100 to 1e06. The main program creates "results.dat" file. To analyze the results, I've coded a small python script:
#!/usr/bin/python
from matplotlib import pyplot as plt
import numpy as np
import os
results = np.loadtxt("results.dat")
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax2 = ax1.twinx()
ax1.loglog();
ax2.loglog();
ratio = results[:,1] / results[:,2]
print("Speedup factors unrolled loop / for loop: ")
print(ratio)
l1 = ax1.plot(results[:,0], results[:,1], label="for loop", color='red')
l2 = ax1.plot(results[:,0], results[:,2], label="unrolled loop", color='black')
l3 = ax2.plot(results[:,0], ratio, label="speedup ratio", color='green')
lines = [l1, l2, l3];
ax1.set_ylabel("CPU count")
ax2.set_ylabel("relative speedup: unrolled loop / for loop")
ax1.legend(loc="center right")
ax2.legend(loc="center left")
plt.savefig("results.png")
And to make use of all that on your computer, copy the example code, compile it with:
g++ -std=c++1y -O3 -Wall -pedantic -pthread main.cpp -o main
To plot the measured CPU time for different barycenter functions, execute the python script (I've called it plotResults.py
):
for i in {1..6}; do ./main $i; done
./plotResults.py
What I have expected to see is that the relative speedup between the unrolled loop and the for
loop (for loop time / unrolled loop time) will increase with the size of the triangle set. This conclusion would follow from a logic: if an unrolled loop is faster than a for loop, executing a lot of unrolled loops should be faster than executing a lot of for loops. Here is a diagram of the results that is generated by the above python script:
The impact of loop unrolling dies of fast. As soon as I am working with more than 100 triangles, there seems to be no difference. Looking at the speedup computed by the python script:
[ 3.13502399 2.40828402 1.15045831 1.0197221 1.1042312 1.26175165
0.99736715]
the speedup when 100 triangles are used (3d place in the list corresponds to 10^2) is 1.15.
I came here to find out what I did wrong, because there must be something wrong here, IMHO. :) Thanks in advance.
Edit: plotting cachegrind cache miss ratios
If the program is run like this:
for input in {2..6}; do valgrind --tool=cachegrind ./main $input; done
cachegrind
generates a bunch of output files, that can be parsed with grep
for PROGRAM TOTALS
, a list of numbers representing the following data, taken from the cachegrind manual:
Cachegrind gathers the following statistics (abbreviations used for each statistic is given in parentheses):
I cache reads (Ir, which equals the number of instructions executed), I1 cache read misses (I1mr) and LL cache instruction read
misses (ILmr).
D cache reads (Dr, which equals the number of memory reads), D1 cache read misses (D1mr), and LL cache data read misses (DLmr). D cache writes (Dw, which equals the number of memory writes), D1 cache write misses (D1mw), and LL cache data write misses (DLmw). Conditional branches executed (Bc) and conditional branches mispredicted (Bcm). Indirect branches executed (Bi) and indirect branches mispredicted (Bim).
And the "combined" cache miss ratio is defined as: (ILmr + DLmr + DLmw) / (Ir + Dr + Dw)
The output files can be parsed like this:
for file in cache*; do cg_annotate $file | grep TOTALS >> program-totals.dat; done
sed -i 's/PROGRAM TOTALS//'g program-totals.dat
and the resulting data can be then visualized using this python script:
#!/usr/bin/python
from matplotlib import pyplot as plt
import numpy as np
import os
import locale
totalInput = [totalInput.strip().split(' ') for totalInput in open('program-totals.dat','r')]
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8' )
totals = []
for line in totalInput:
totals.append([locale.atoi(item) for item in line])
totals = np.array(totals)
# Assumed default output format
# Ir I1mr ILmr Dr Dmr DLmr Dw D1mw DLmw
# 0 1 2 3 4 5 6 7 8
cacheMissRatios = (totals[:,2] + totals[:,5] + totals[:,8]) / (totals[:,0] + totals[:,3] + totals[:,6])
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.loglog()
results = np.loadtxt("results.dat")
l1 = ax1.plot(results[:,0], cacheMissRatios, label="Cachegrind combined cache miss ratio", color='black', marker='x')
l1 = ax1.plot(results[:,0], results[:,1] / results[:,2], label="Execution speedup", color='blue', marker='o')
ax1.set_ylabel("Cachegrind combined cache miss ratio")
ax1.set_xlabel("Number of triangles")
ax1.legend(loc="center left")
plt.savefig("cacheMisses.png")
So, ploting the combined LL miss rate against the program speedup when the triangle access loop is unrolled, results in the following diagram:
And there seems to be a dependence in the LL mis rate : as it goes up, the speedup of the program goes down. But still, I can't see a clear reason for the bottleneck.
Is the combined LL miss rate the right thing to analyze? Looking at the valgrind output, all miss rates are reported to be less than 5%, this should be quite OK, right?
Even with your unrolling, calculation of
barycenter
is done one at a time. Furthermore, each step of the calculation (for a single barycenter) depends on the previous one, which means that they cannot be parallelized. You could certainly achieve a better throughput by computingn
barycenters at once, instead of just one, and benchmark on various values forn
to determine which amount will saturate the CPU pipelines.Another aspect which might help speeding up the computation is the data layout: instead of storing triangle points together in a single struct, you could try splitting them in 3 different arrays (one for each point), and again benchmark with different values for
n
.Regarding your main question, unless the code transformation reduces the degree of complexity of the underlying algorithm (which is totally possible), the gained speed should be at most linear on the data set size, but with a sufficiently large one, it is likely to hit different limits (for instance, what happens when one level of memory - cache level 1, level 2, main memory - becomes saturated?).