My goal is to measure how many clock cycles it takes to allocate one million integers with the "malloc" function in C++.
I have two programs. The first is a Visual Studio project that calls RDTSC from inline assembly, using the Microsoft compiler's asm syntax. Here I also use the CPUID instruction as a serializing instruction, so that all the instructions preceding it have finished executing before the counter is read. Here is the code:
#include <iostream>
using namespace std;
#define rdtsc __asm __emit 0fh __asm __emit 031h
#define cpuid __asm __emit 0fh __asm __emit 0a2h
// Times a single malloc() of one million ints using CPUID + RDTSC from MSVC
// inline assembly, subtracts a measured estimate of the CPUID/RDTSC overhead,
// and prints the resulting cycle count to stdout.
// NOTE(review): the __asm blocks and pushad/popad presumably restrict this to
// 32-bit x86 MSVC builds — confirm against the project settings.
int main()
{
// High/low 32-bit halves of the first (1) and second (2) timestamp readings;
// cpuid_time receives the measured cost of one CPUID+RDTSC pair.
unsigned cycles_high1 = 0, cycles_low1 = 0, cpuid_time = 0;
unsigned cycles_high2 = 0, cycles_low2 = 0;
unsigned __int64 temp_cycles1 = 0, temp_cycles2 = 0;
__int64 total_cycles = 0;
//compute the CPUID overhead
// Six CPUID+RDTSC pairs run back to back. Each pair is wrapped in
// pushad/popad, which save and restore the general-purpose registers that
// CPUID and RDTSC overwrite. The 1st, 3rd and 5th pairs store their reading,
// each overwriting the previous one, so only the 5th reading is used; the
// earlier pairs presumably act as warm-up.
// NOTE(review): EAX is never loaded before CPUID, so the leaf that is queried
// is whatever EAX happens to hold at that point.
__asm {
pushad
CPUID
RDTSC
mov cycles_high1, edx
mov cycles_low1, eax
popad
pushad
CPUID
RDTSC
popad
pushad
CPUID
RDTSC
mov cycles_high1, edx
mov cycles_low1, eax
popad
pushad
CPUID
RDTSC
popad
pushad
CPUID
RDTSC
mov cycles_high1, edx
mov cycles_low1, eax
popad
pushad
CPUID
RDTSC
// Overhead = low 32 bits of this reading minus low 32 bits of the 5th reading.
// NOTE(review): EDX (the high half) is ignored here, so the difference is
// computed modulo 2^32.
sub eax, cycles_low1
mov cpuid_time, eax
popad
}
// Discard the readings left over from the overhead measurement.
cycles_high1 = 0;
cycles_low1 = 0;
//Measure the code sequence
// First timestamp: taken immediately before the allocation.
__asm {
pushad
CPUID
RDTSC
mov cycles_high1, edx
mov cycles_low1, eax
popad
}
// The operation being timed.
// NOTE(review): p is never read, written through, or freed after this line.
int* p = (int*)malloc(sizeof(int) * 1000000);
// Second timestamp: taken immediately after the allocation.
__asm {
pushad
CPUID
RDTSC
mov cycles_high2, edx
mov cycles_low2, eax
popad
}
// Reassemble each 64-bit counter value from its EDX (high) and EAX (low) halves.
temp_cycles1 = ((unsigned __int64)cycles_high1 << 32) | cycles_low1;
temp_cycles2 = ((unsigned __int64)cycles_high2 << 32) | cycles_low2;
// Elapsed cycles minus the overhead estimate measured above.
total_cycles = temp_cycles2 - temp_cycles1 - cpuid_time;
// Printed without a trailing newline.
cout << total_cycles;
return 0;
}
On the other hand, I have a second project in Visual Studio Code, which reads the time-stamp counter much more simply, through the `__rdtsc()` intrinsic, and is compiled with the G++ compiler. Here is the code:
#include <cstdlib>
#include <iostream>
#include <intrin.h>
using namespace std;
// Measures how many time-stamp-counter ticks elapse across a single malloc()
// of one million ints and prints the difference between the two readings.
// NOTE(review): nothing serializes the instruction stream around __rdtsc(),
// so the readings may be reordered relative to the surrounding work.
int main()
{
    const unsigned long long start = __rdtsc();
    // The pointer is stored in a volatile object so an optimizing build cannot
    // discard the otherwise-unused allocation and leave nothing to time.
    int* volatile p = static_cast<int*>(malloc(sizeof(int) * 1000000));
    const unsigned long long stop = __rdtsc();
    cout << stop - start;
    // Release the buffer after the second reading so that freeing it is not
    // part of the measured interval; free() accepts a null pointer.
    free(p);
    return 0;
}
The second project (the one in VS Code) prints a value ten times smaller than the first project does, even though I did not use the CPUID instruction in the second project. What am I doing wrong?
EDIT: I'm a beginner in benchmarking. My TA told me I should use RDTSC in combination with CPUID, and gave me a short description of how they work.