I have the first version of a math library completed, and for the next step I'd like to turn to expression templates to improve the performance of the code. However, my initial results are different than I expected. I am compiling in MSVC 2010, in vanilla Release mode (and am okay with C++0x).
Apologies in advance for the large amount of code I'll be showing you; it's as minimal as I can make it while still letting people see what I'm doing. Profiling framework:
#include <algorithm>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <iterator>
#include <limits>
#include <type_traits>
#include <vector>
namespace math
{
    class vector; // to be determined

    // Streams the four components of `vec` in index order, each followed by
    // a single space, and returns the stream so calls can be chained.
    std::ostream& operator<<(std::ostream& stream, const vector& vec)
    {
        std::size_t component = 0;
        while (component != 4)
        {
            stream << vec[component] << " ";
            ++component;
        }
        return stream;
    }
}
// test framework
// The three operand arrays of the benchmark: source[0], source[1] and
// source[2] are each indexed by iteration number inside test().
typedef std::vector<math::vector> array_type[3];
// A growable sequence of vectors; used for each operand array and for the
// collected results.
typedef std::vector<math::vector> vector_type;
// Produces one benchmark input value: the next std::rand() result,
// converted to float.
float generate_float()
{
    const int raw = std::rand();
    return static_cast<float>(raw);
}
// Builds a vector whose x, y, z and w components are four consecutive
// generate_float() values.
//
// The calls are sequenced through named locals: the evaluation order of
// function arguments is unspecified, so passing the four calls directly as
// constructor arguments would let the component order vary between compilers
// and make the generated input data non-reproducible.
math::vector generate_vector()
{
    const float x = generate_float();
    const float y = generate_float();
    const float z = generate_float();
    const float w = generate_float();
    return math::vector(x, y, z, w);
}
// Returns `count` freshly generated vectors. Storage is reserved up front so
// filling the container never reallocates.
vector_type generate_source(std::size_t count)
{
    vector_type generated;
    generated.reserve(count);

    for (std::size_t produced = 0; produced != count; ++produced)
        generated.push_back(generate_vector());

    return generated;
}
// Runs one timed benchmark pass.
//
// For every i in [0, iterations) this evaluates
// source[0][i] + source[1][i] + source[2][i] into a math::vector and appends
// it to `results`. The source arrays are indexed without bounds checks, so
// each must hold at least `iterations` elements.
//
// Returns the elapsed std::clock() time in seconds, and also prints it to
// std::cout.
double test(const array_type& source,
vector_type& results, std::size_t iterations)
{
// time
std::clock_t begin = std::clock();
for (std::size_t i = 0; i < iterations; ++i)
{
const math::vector& v0 = source[0][i];
const math::vector& v1 = source[1][i];
const math::vector& v2 = source[2][i];
// The expression under test: the sum is evaluated when `result` is constructed.
math::vector result(v0 + v1 + v2);
// Note: push_back sits inside the timed region, so its cost is part of
// the measurement.
results.push_back(result);
}
std::clock_t end = std::clock();
// print time
// Convert clock ticks to seconds.
double elapsed = static_cast<double>(end - begin) / CLOCKS_PER_SEC;
std::cout << "time: " << elapsed << "\n";
return elapsed;
}
// Benchmark driver.
//
// Generates the three operand arrays, runs a few warm-up passes whose timings
// are discarded, then records `time_count` timed passes. It reports the sum
// and average of those timings with one minimum and one maximum sample thrown
// out, together with the minimum and maximum themselves.
int main()
{
    // prepare tests
    const std::size_t time_count = 50; // number of times to get time count
    const std::size_t test_count = 10000000; // number of iterations in a test

    std::cout << "allocating..." << std::endl;
    std::vector<double> timeResults; timeResults.reserve(time_count);

    array_type source;
    for (std::size_t i = 0; i < 3; ++i)
        source[i] = generate_source(test_count);

    // Reserved once; test() appends test_count results per pass and the
    // vector is cleared (capacity kept) between passes.
    vector_type results;
    results.reserve(test_count);

    // pre tests
    std::cout << "pre-testing..." << std::endl;
    for (std::size_t i = 0; i < time_count / 10; ++i)
    {
        timeResults.push_back(test(source, results, test_count));
        results.clear();
    }
    timeResults.clear(); // warm-up timings are not part of the statistics

    // begin tests
    std::cout << "testing..." << std::endl;
    for (std::size_t i = 0; i < time_count; ++i)
    {
        timeResults.push_back(test(source, results, test_count));
        results.clear();
    }

    // can be turned into functors for non-C++0x, for testing in C++03
    double min = std::numeric_limits<double>::max();
    // numeric_limits<double>::min() is the smallest *positive* normalized
    // value, not the most negative one, so it is the wrong seed for a running
    // maximum; start from the most negative finite double instead.
    double max = -std::numeric_limits<double>::max();
    std::for_each(timeResults.begin(), timeResults.end(),
        [&min, &max](double x)
        {
            min = std::min(x, min);
            max = std::max(x, max);
        });

    double sum = 0; // throws out max and min results
    bool minFlag = false, maxFlag = false;
    std::for_each(timeResults.begin(), timeResults.end(),
        [min, max, &sum, &minFlag, &maxFlag](double x)
        {
            // The flags make sure only the first occurrence of the minimum
            // and of the maximum is dropped; later duplicates are summed.
            if (!minFlag && x <= min)
                minFlag = true; // skip
            else if (!maxFlag && x >= max)
                maxFlag = true; // skip
            else
                sum += x; // add
        });

    // print results
    // Two samples (one min, one max) were excluded from the sum.
    double average = sum / (timeResults.size() - 2);
    std::cout << "\ntotal time: " << sum << " average time: " << average
        << "\n" << "min: " << min << " max: " << max << std::endl;
}
Expression template vector:
namespace math
{
    // core expression template
    //
    // CRTP base shared by every node of an expression tree. E is the concrete
    // node type; the base only forwards component access to it through a
    // static_cast, so no virtual dispatch is involved.
    template <typename E>
    class vector_expression
    {
    public:
        // Returns component I of the concrete expression E.
        template <std::size_t I>
        float get() const
        {
            // E is a dependent type, so the `template` keyword is required
            // for get<I> to be parsed as a member template call.
            return static_cast<const E&>(*this).template get<I>();
        }

    protected:
        // not a public base
        // NOTE(review): an empty user-provided destructor like this one has
        // been reported to stop MSVC 2010 from inlining operator+ (even with
        // __forceinline); re-check inlining if that compiler matters.
        ~vector_expression() {}
    };

    // vector class
    //
    // Four-component float vector; the terminal node of an expression tree.
    class vector : public vector_expression<vector>
    {
    public:
        // Zero-initializes all four components.
        vector()
        {
            data[0] = data[1] = data[2] = data[3] = 0;
        }

        // Initializes the components to (x, y, z, w).
        vector(float x, float y, float z, float w)
        {
            data[0] = x; data[1] = y; data[2] = z; data[3] = w;
        }

        // Evaluates an arbitrary expression tree component by component.
        // Non-explicit, so an expression converts implicitly to a vector.
        template <typename E>
        vector(const vector_expression<E>& e)
        {
            evaluate<0>(e);
        }

        // Compile-time indexed component access.
        template <std::size_t I>
        float get() const
        {
            return data[I];
        }

        // Run-time indexed component access; `index` is not range-checked.
        float operator[](std::size_t index) const
        {
            return data[index];
        }

    private:
        // Recursive step: stores component I of `e`, then continues with
        // component I + 1. Selected while I < 4.
        template <std::size_t I, typename E>
        void evaluate(const vector_expression<E>& e,
            typename std::enable_if<(I < 4)>::type* = nullptr)
        {
            data[I] = e.template get<I>();
            evaluate<I + 1>(e);
        }

        // Terminating step: selected once I reaches 4; nothing left to do.
        template <std::size_t I, typename E>
        void evaluate(const vector_expression<E>&,
            typename std::enable_if<(I >= 4)>::type* = nullptr)
        {
            // done
        }

        float data[4];
    };

    // Expression node representing `first + second`; nothing is computed
    // until a component is requested through get<I>().
    //
    // The operands are held by reference, so a sum object must not outlive
    // the expressions it was built from.
    template <typename E1, typename E2>
    class vector_expression_sum :
        public vector_expression<vector_expression_sum<E1, E2>>
    {
    public:
        vector_expression_sum(const vector_expression<E1>& first,
            const vector_expression<E2>& second) :
            mFirst(first),
            mSecond(second)
        {}

        // Returns component I of the sum of both operands.
        template <std::size_t I>
        float get() const
        {
            // `template` is required here as well: both operand types are
            // dependent.
            return mFirst.template get<I>() + mSecond.template get<I>();
        }

    private:
        const vector_expression<E1>& mFirst;
        const vector_expression<E2>& mSecond;
    };

    // Builds a lazy sum node from any two vector expressions.
    template <typename E1, typename E2>
    vector_expression_sum<E1, E2>
    operator+(const vector_expression<E1>& first,
        const vector_expression<E2>& second)
    {
        return vector_expression_sum<E1, E2>(first, second);
    }
}
Manually inlined:
namespace math
{
// same definition (unchanged from the expression template listing; the
// manually inlined test only relies on math::vector's constructor and get<I>())
}
// ...
// Manually inlined variant of test(): `result` is built by summing each
// component explicitly through get<I>() instead of going through operator+
// and the expression-evaluating constructor. The parts elided with "// ..."
// are presumably identical to the full test() of the profiling framework.
double test(const array_type& source,
vector_type& results, std::size_t iterations)
{
// ...
{
// ...
math::vector result(v0.get<0>() + v1.get<0>() + v2.get<0>(),
v0.get<1>() + v1.get<1>() + v2.get<1>(),
v0.get<2>() + v1.get<2>() + v2.get<2>(),
v0.get<3>() + v1.get<3>() + v2.get<3>());
// ...
}
// ...
}
// ...
Results:
Expression templates:
total time: 14.172 average time: 0.29525
min: 0.281 max: 0.422
Manually inlined:
total time: 8.438 average time: 0.175792
min: 0.171 max: 0.188
As you can see, the expression templates (apparently) aren't turning into the fully inlined code. Here's the disassembly of `test()`, up to the last call to `std::clock()`:
Expression templates assembly:
test:
00401110 push ebp
00401111 mov ebp,esp
00401113 sub esp,38h
00401116 mov eax,dword ptr [___security_cookie (404018h)]
0040111B xor eax,ebp
0040111D mov dword ptr [ebp-4],eax
00401120 push ebx
00401121 push esi
00401122 mov esi,ecx
00401124 mov dword ptr [ebp-28h],esi
00401127 call dword ptr [__imp__clock (4030DCh)]
0040112D xor ebx,ebx
0040112F mov dword ptr [ebp-1Ch],eax
00401132 mov dword ptr [ebp-24h],ebx
00401135 jmp test+2Ah (40113Ah)
00401137 mov esi,dword ptr [ebp-28h]
0040113A mov eax,dword ptr [esi+20h]
0040113D mov edx,dword ptr [esi+10h]
00401140 mov ecx,dword ptr [esi]
00401142 add eax,ebx
00401144 mov dword ptr [ebp-18h],eax
00401147 add edx,ebx
00401149 add ecx,ebx
0040114B lea eax,[ebp-30h]
0040114E call math::operator+<math::vector,math::vector> (401E60h)
00401153 mov edx,dword ptr [ebp-18h]
00401156 mov ecx,eax
00401158 lea eax,[ebp-38h]
0040115B call math::operator+<math::vector,math::vector> (401E60h)
00401160 mov ecx,dword ptr [eax]
00401162 mov edx,dword ptr [ecx+4]
00401165 fld dword ptr [edx]
00401167 mov edx,dword ptr [ecx]
00401169 fadd dword ptr [edx]
0040116B mov eax,dword ptr [eax+4]
0040116E mov edx,dword ptr [ecx+4]
00401171 fstp dword ptr [ebp-18h]
00401174 fld dword ptr [ebp-18h]
00401177 fadd dword ptr [eax]
00401179 fstp dword ptr [ebp-14h]
0040117C fld dword ptr [edx+4]
0040117F mov edx,dword ptr [ecx]
00401181 fadd dword ptr [edx+4]
00401184 mov edx,dword ptr [ecx+4]
00401187 fstp dword ptr [ebp-18h]
0040118A fld dword ptr [ebp-18h]
0040118D fadd dword ptr [eax+4]
00401190 fstp dword ptr [ebp-10h]
00401193 fld dword ptr [edx+8]
00401196 mov edx,dword ptr [ecx]
00401198 fadd dword ptr [edx+8]
0040119B mov edx,dword ptr [ecx+4]
0040119E mov ecx,dword ptr [ecx]
004011A0 fstp dword ptr [ebp-18h]
004011A3 fld dword ptr [ebp-18h]
004011A6 fadd dword ptr [eax+8]
004011A9 fstp dword ptr [ebp-0Ch]
004011AC fld dword ptr [edx+0Ch]
004011AF lea edx,[ebp-14h]
004011B2 fadd dword ptr [ecx+0Ch]
004011B5 fstp dword ptr [ebp-18h]
004011B8 fld dword ptr [ebp-18h]
004011BB fadd dword ptr [eax+0Ch]
004011BE mov eax,dword ptr [edi+4]
004011C1 fstp dword ptr [ebp-8]
004011C4 cmp edx,eax
004011C6 jae test+12Ch (40123Ch)
004011C8 mov edx,dword ptr [edi]
004011CA lea ecx,[ebp-14h]
004011CD cmp edx,ecx
004011CF ja test+12Ch (40123Ch)
004011D1 mov esi,ecx
004011D3 mov ecx,dword ptr [edi+8]
004011D6 sub esi,edx
004011D8 cmp eax,ecx
004011DA jne test+10Bh (40121Bh)
004011DC sub eax,edx
004011DE sar eax,4
004011E1 cmp eax,0FFFFFFEh
004011E6 ja test+201h (401311h)
004011EC sub ecx,edx
004011EE inc eax
004011EF sar ecx,4
004011F2 cmp eax,ecx
004011F4 jbe test+10Bh (40121Bh)
004011F6 mov edx,ecx
004011F8 shr edx,1
004011FA mov ebx,0FFFFFFFh
004011FF sub ebx,edx
00401201 cmp ebx,ecx
00401203 jae test+0F9h (401209h)
00401205 xor ecx,ecx
00401207 jmp test+0FBh (40120Bh)
00401209 add ecx,edx
0040120B cmp ecx,eax
0040120D jae test+101h (401211h)
0040120F mov ecx,eax
00401211 mov edx,edi
00401213 call std::vector<math::vector,std::allocator<math::vector> >::reserve (401930h)
00401218 mov ebx,dword ptr [ebp-24h]
0040121B mov eax,dword ptr [edi+4]
0040121E and esi,0FFFFFFF0h
00401221 add esi,dword ptr [edi]
00401223 test eax,eax
00401225 je test+18Fh (40129Fh)
00401227 mov edx,dword ptr [esi]
00401229 mov dword ptr [eax],edx
0040122B mov ecx,dword ptr [esi+4]
0040122E mov dword ptr [eax+4],ecx
00401231 mov edx,dword ptr [esi+8]
00401234 mov dword ptr [eax+8],edx
00401237 mov ecx,dword ptr [esi+0Ch]
0040123A jmp test+18Ch (40129Ch)
0040123C mov ecx,dword ptr [edi+8]
0040123F cmp eax,ecx
00401241 jne test+171h (401281h)
00401243 mov edx,dword ptr [edi]
00401245 sub eax,edx
00401247 sar eax,4
0040124A cmp eax,0FFFFFFEh
0040124F ja test+201h (401311h)
00401255 sub ecx,edx
00401257 inc eax
00401258 sar ecx,4
0040125B cmp eax,ecx
0040125D jbe test+171h (401281h)
0040125F mov edx,ecx
00401261 shr edx,1
00401263 mov esi,0FFFFFFFh
00401268 sub esi,edx
0040126A cmp esi,ecx
0040126C jae test+162h (401272h)
0040126E xor ecx,ecx
00401270 jmp test+164h (401274h)
00401272 add ecx,edx
00401274 cmp ecx,eax
00401276 jae test+16Ah (40127Ah)
00401278 mov ecx,eax
0040127A mov edx,edi
0040127C call std::vector<math::vector,std::allocator<math::vector> >::reserve (401930h)
00401281 mov eax,dword ptr [edi+4]
00401284 test eax,eax
00401286 je test+18Fh (40129Fh)
00401288 mov edx,dword ptr [ebp-14h]
0040128B mov ecx,dword ptr [ebp-10h]
0040128E mov dword ptr [eax],edx
00401290 mov edx,dword ptr [ebp-0Ch]
00401293 mov dword ptr [eax+4],ecx
00401296 mov ecx,dword ptr [ebp-8]
00401299 mov dword ptr [eax+8],edx
0040129C mov dword ptr [eax+0Ch],ecx
0040129F add dword ptr [edi+4],10h
004012A3 add ebx,10h
004012A6 mov dword ptr [ebp-24h],ebx
004012A9 cmp ebx,9896800h
004012AF jb test+27h (401137h)
004012B5 call dword ptr [__imp__clock (4030DCh)]
Manual inline assembly:
test:
004010B0 push ebp
004010B1 mov ebp,esp
004010B3 sub esp,28h
004010B6 mov eax,dword ptr [___security_cookie (404018h)]
004010BB xor eax,ebp
004010BD mov dword ptr [ebp-4],eax
004010C0 push ebx
004010C1 push esi
004010C2 mov esi,ecx
004010C4 mov dword ptr [ebp-24h],esi
004010C7 call dword ptr [__imp__clock (4030DCh)]
004010CD xor ebx,ebx
004010CF mov dword ptr [ebp-1Ch],eax
004010D2 mov dword ptr [ebp-18h],ebx
004010D5 mov eax,dword ptr [esi]
004010D7 mov ecx,dword ptr [esi+10h]
004010DA fld dword ptr [eax+ebx]
004010DD fadd dword ptr [ecx+ebx]
004010E0 mov edx,dword ptr [esi+20h]
004010E3 add eax,ebx
004010E5 add ecx,ebx
004010E7 fadd dword ptr [edx+ebx]
004010EA add edx,ebx
004010EC fstp dword ptr [ebp-14h]
004010EF fld dword ptr [ecx+4]
004010F2 fadd dword ptr [eax+4]
004010F5 fadd dword ptr [edx+4]
004010F8 fstp dword ptr [ebp-10h]
004010FB fld dword ptr [ecx+8]
004010FE fadd dword ptr [eax+8]
00401101 fadd dword ptr [edx+8]
00401104 fstp dword ptr [ebp-0Ch]
00401107 fld dword ptr [ecx+0Ch]
0040110A lea ecx,[ebp-14h]
0040110D fadd dword ptr [eax+0Ch]
00401110 mov eax,dword ptr [edi+4]
00401113 fadd dword ptr [edx+0Ch]
00401116 fstp dword ptr [ebp-8]
00401119 cmp ecx,eax
0040111B jae test+0E4h (401194h)
0040111D mov edx,dword ptr [edi]
0040111F cmp edx,ecx
00401121 ja test+0E4h (401194h)
00401123 mov esi,ecx
00401125 mov ecx,dword ptr [edi+8]
00401128 sub esi,edx
0040112A cmp eax,ecx
0040112C jne test+0BDh (40116Dh)
0040112E sub eax,edx
00401130 sar eax,4
00401133 cmp eax,0FFFFFFEh
00401138 ja test+1BCh (40126Ch)
0040113E sub ecx,edx
00401140 inc eax
00401141 sar ecx,4
00401144 cmp eax,ecx
00401146 jbe test+0BDh (40116Dh)
00401148 mov edx,ecx
0040114A shr edx,1
0040114C mov ebx,0FFFFFFFh
00401151 sub ebx,edx
00401153 cmp ebx,ecx
00401155 jae test+0ABh (40115Bh)
00401157 xor ecx,ecx
00401159 jmp test+0ADh (40115Dh)
0040115B add ecx,edx
0040115D cmp ecx,eax
0040115F jae test+0B3h (401163h)
00401161 mov ecx,eax
00401163 mov edx,edi
00401165 call std::vector<math::vector,std::allocator<math::vector> >::reserve (401890h)
0040116A mov ebx,dword ptr [ebp-18h]
0040116D mov eax,dword ptr [edi+4]
00401170 and esi,0FFFFFFF0h
00401173 add esi,dword ptr [edi]
00401175 test eax,eax
00401177 je test+0DFh (40118Fh)
00401179 mov edx,dword ptr [esi]
0040117B mov dword ptr [eax],edx
0040117D mov ecx,dword ptr [esi+4]
00401180 mov dword ptr [eax+4],ecx
00401183 mov edx,dword ptr [esi+8]
00401186 mov dword ptr [eax+8],edx
00401189 mov ecx,dword ptr [esi+0Ch]
0040118C mov dword ptr [eax+0Ch],ecx
0040118F mov esi,dword ptr [ebp-24h]
00401192 jmp test+14Ah (4011FAh)
00401194 mov ecx,dword ptr [edi+8]
00401197 cmp eax,ecx
00401199 jne test+12Ch (4011DCh)
0040119B mov edx,dword ptr [edi]
0040119D sub eax,edx
0040119F sar eax,4
004011A2 cmp eax,0FFFFFFEh
004011A7 ja test+1BCh (40126Ch)
004011AD sub ecx,edx
004011AF inc eax
004011B0 sar ecx,4
004011B3 cmp eax,ecx
004011B5 jbe test+12Ch (4011DCh)
004011B7 mov edx,ecx
004011B9 shr edx,1
004011BB mov esi,0FFFFFFFh
004011C0 sub esi,edx
004011C2 cmp esi,ecx
004011C4 jae test+11Ah (4011CAh)
004011C6 xor ecx,ecx
004011C8 jmp test+11Ch (4011CCh)
004011CA add ecx,edx
004011CC cmp ecx,eax
004011CE jae test+122h (4011D2h)
004011D0 mov ecx,eax
004011D2 mov edx,edi
004011D4 call std::vector<math::vector,std::allocator<math::vector> >::reserve (401890h)
004011D9 mov esi,dword ptr [ebp-24h]
004011DC mov eax,dword ptr [edi+4]
004011DF test eax,eax
004011E1 je test+14Ah (4011FAh)
004011E3 mov edx,dword ptr [ebp-14h]
004011E6 mov ecx,dword ptr [ebp-10h]
004011E9 mov dword ptr [eax],edx
004011EB mov edx,dword ptr [ebp-0Ch]
004011EE mov dword ptr [eax+4],ecx
004011F1 mov ecx,dword ptr [ebp-8]
004011F4 mov dword ptr [eax+8],edx
004011F7 mov dword ptr [eax+0Ch],ecx
004011FA add dword ptr [edi+4],10h
004011FE add ebx,10h
00401201 mov dword ptr [ebp-18h],ebx
00401204 cmp ebx,9896800h
0040120A jb test+25h (4010D5h)
00401210 call dword ptr [__imp__clock (4030DCh)]
Conclusion: For whatever reason, MSVC 2010 does not inline the calls to `operator+`. Does anybody know why this is? Even adding `__forceinline` (which I'd like to avoid) doesn't make it inline.
Update: As jdv-Jan de Vaan mentioned, when I remove the destructor:
// ~vector_expression() {} // not a public base
it inlines `operator+`. The strange thing is that it inlines to different assembly, and my tests indicate that this output, while performing better than my original, still doesn't match the performance of the manually inlined version. Any ideas why that is?
00A710B0 push ebp
00A710B1 mov ebp,esp
00A710B3 sub esp,28h
00A710B6 mov eax,dword ptr [___security_cookie (0A74018h)]
00A710BB xor eax,ebp
00A710BD mov dword ptr [ebp-4],eax
00A710C0 push ebx
00A710C1 push esi
00A710C2 mov esi,ecx
00A710C4 mov dword ptr [ebp-24h],esi
00A710C7 call dword ptr [__imp__clock (0A730DCh)]
00A710CD xor ebx,ebx
00A710CF mov dword ptr [ebp-1Ch],eax
00A710D2 mov dword ptr [ebp-28h],ebx
00A710D5 mov eax,dword ptr [esi]
00A710D7 mov ecx,dword ptr [esi+10h]
00A710DA fld dword ptr [eax+ebx]
00A710DD fadd dword ptr [ecx+ebx]
00A710E0 mov edx,dword ptr [esi+20h]
00A710E3 add eax,ebx
00A710E5 add ecx,ebx
00A710E7 fstp dword ptr [ebp-18h]
00A710EA add edx,ebx
00A710EC fld dword ptr [ebp-18h]
00A710EF fadd dword ptr [edx]
00A710F1 fstp dword ptr [ebp-14h]
00A710F4 fld dword ptr [eax+4]
00A710F7 fadd dword ptr [ecx+4]
00A710FA fstp dword ptr [ebp-18h]
00A710FD fld dword ptr [ebp-18h]
00A71100 fadd dword ptr [edx+4]
00A71103 fstp dword ptr [ebp-10h]
00A71106 fld dword ptr [eax+8]
00A71109 fadd dword ptr [ecx+8]
00A7110C fstp dword ptr [ebp-18h]
00A7110F fld dword ptr [ebp-18h]
00A71112 fadd dword ptr [edx+8]
00A71115 fstp dword ptr [ebp-0Ch]
00A71118 fld dword ptr [eax+0Ch]
00A7111B mov eax,dword ptr [edi+4]
00A7111E fadd dword ptr [ecx+0Ch]
00A71121 lea ecx,[ebp-14h]
00A71124 fstp dword ptr [ebp-18h]
00A71127 fld dword ptr [ebp-18h]
00A7112A fadd dword ptr [edx+0Ch]
00A7112D fstp dword ptr [ebp-8]
00A71130 cmp ecx,eax
00A71132 jae test+0FBh (0A711ABh)
00A71134 mov edx,dword ptr [edi]
00A71136 cmp edx,ecx
00A71138 ja test+0FBh (0A711ABh)
00A7113A mov esi,ecx
00A7113C mov ecx,dword ptr [edi+8]
00A7113F sub esi,edx
00A71141 cmp eax,ecx
00A71143 jne test+0D4h (0A71184h)
00A71145 sub eax,edx
00A71147 sar eax,4
00A7114A cmp eax,0FFFFFFEh
00A7114F ja test+1D3h (0A71283h)
00A71155 sub ecx,edx
00A71157 inc eax
00A71158 sar ecx,4
00A7115B cmp eax,ecx
00A7115D jbe test+0D4h (0A71184h)
00A7115F mov edx,ecx
00A71161 shr edx,1
00A71163 mov ebx,0FFFFFFFh
00A71168 sub ebx,edx
00A7116A cmp ebx,ecx
00A7116C jae test+0C2h (0A71172h)
00A7116E xor ecx,ecx
00A71170 jmp test+0C4h (0A71174h)
00A71172 add ecx,edx
00A71174 cmp ecx,eax
00A71176 jae test+0CAh (0A7117Ah)
00A71178 mov ecx,eax
00A7117A mov edx,edi
00A7117C call std::vector<math::vector,std::allocator<math::vector> >::reserve (0A718A0h)
00A71181 mov ebx,dword ptr [ebp-28h]
00A71184 mov eax,dword ptr [edi+4]
00A71187 and esi,0FFFFFFF0h
00A7118A add esi,dword ptr [edi]
00A7118C test eax,eax
00A7118E je test+0F6h (0A711A6h)
00A71190 mov edx,dword ptr [esi]
00A71192 mov dword ptr [eax],edx
00A71194 mov ecx,dword ptr [esi+4]
00A71197 mov dword ptr [eax+4],ecx
00A7119A mov edx,dword ptr [esi+8]
00A7119D mov dword ptr [eax+8],edx
00A711A0 mov ecx,dword ptr [esi+0Ch]
00A711A3 mov dword ptr [eax+0Ch],ecx
00A711A6 mov esi,dword ptr [ebp-24h]
00A711A9 jmp test+161h (0A71211h)
00A711AB mov ecx,dword ptr [edi+8]
00A711AE cmp eax,ecx
00A711B0 jne test+143h (0A711F3h)
00A711B2 mov edx,dword ptr [edi]
00A711B4 sub eax,edx
00A711B6 sar eax,4
00A711B9 cmp eax,0FFFFFFEh
00A711BE ja test+1D3h (0A71283h)
00A711C4 sub ecx,edx
00A711C6 inc eax
00A711C7 sar ecx,4
00A711CA cmp eax,ecx
00A711CC jbe test+143h (0A711F3h)
00A711CE mov edx,ecx
00A711D0 shr edx,1
00A711D2 mov esi,0FFFFFFFh
00A711D7 sub esi,edx
00A711D9 cmp esi,ecx
00A711DB jae test+131h (0A711E1h)
00A711DD xor ecx,ecx
00A711DF jmp test+133h (0A711E3h)
00A711E1 add ecx,edx
00A711E3 cmp ecx,eax
00A711E5 jae test+139h (0A711E9h)
00A711E7 mov ecx,eax
00A711E9 mov edx,edi
00A711EB call std::vector<math::vector,std::allocator<math::vector> >::reserve (0A718A0h)
00A711F0 mov esi,dword ptr [ebp-24h]
00A711F3 mov eax,dword ptr [edi+4]
00A711F6 test eax,eax
00A711F8 je test+161h (0A71211h)
00A711FA mov edx,dword ptr [ebp-14h]
00A711FD mov ecx,dword ptr [ebp-10h]
00A71200 mov dword ptr [eax],edx
00A71202 mov edx,dword ptr [ebp-0Ch]
00A71205 mov dword ptr [eax+4],ecx
00A71208 mov ecx,dword ptr [ebp-8]
00A7120B mov dword ptr [eax+8],edx
00A7120E mov dword ptr [eax+0Ch],ecx
00A71211 add dword ptr [edi+4],10h
00A71215 add ebx,10h
00A71218 mov dword ptr [ebp-28h],ebx
00A7121B cmp ebx,9896800h
00A71221 jb test+25h (0A710D5h)
00A71227 call dword ptr [__imp__clock (0A730DCh)]
I already commented on this question earlier. I was concerned about the presence of an empty user-defined destructor, which could disable inlining. After some googling around, I feel more confident that this might actually be the answer.
This answer describes a situation that is eerily close to what you describe in your question. Here, a user-defined destructor prevents inlining of an `operator+` even if `__forceinline` is set. There are also useful debugging tips to be found here. There is also a bug report on Microsoft Connect. I first heard about it in a discussion of the SafeInt library on Channel 9.