My goal is to measure how many clock cycles it takes to allocate one million integers with the "malloc" function in C++.

I have two programs. The first is a Visual Studio project that calls RDTSC from inline assembly, using the Microsoft compiler's asm syntax. Here I also use the CPUID instruction as a serializing instruction, so that all the instructions preceding it have finished executing before the timestamp is read. Here is the code:

#include <iostream>

using namespace std;

#define rdtsc __asm __emit 0fh __asm __emit 031h
#define cpuid __asm __emit 0fh __asm __emit 0a2h

// Times a single malloc() of one million ints with the time-stamp counter and
// prints the elapsed count after subtracting a measured CPUID + RDTSC overhead.
// Written with Microsoft-compiler __asm blocks (32-bit x86 register names).
int main()
{
    // 32-bit halves of the start timestamp, plus the measured overhead.
    unsigned cycles_high1 = 0, cycles_low1 = 0, cpuid_time = 0;
    // 32-bit halves of the stop timestamp.
    unsigned cycles_high2 = 0, cycles_low2 = 0;
    // Start / stop timestamps reassembled into 64-bit values.
    unsigned __int64 temp_cycles1 = 0, temp_cycles2 = 0;
    // Final result: stop - start - overhead.
    __int64 total_cycles = 0;

    // Estimate the cost of one CPUID + RDTSC pair.
    // The pair runs six times: runs 1, 3 and 5 store the timestamp halves,
    // runs 2 and 4 discard the result (presumably warm-up), and run 6
    // subtracts the low word stored by run 5 from its own low word.
    // Each pair is bracketed by pushad/popad so the registers written by
    // CPUID and RDTSC are restored afterwards.
    // NOTE(review): only EAX (the low 32 bits) takes part in the subtraction;
    // EDX from the last RDTSC is ignored.
    // NOTE(review): EAX is not set before CPUID, so the leaf being queried is
    // whatever the register happens to hold - confirm this is intended.
    __asm {
        pushad
        CPUID
        RDTSC
        mov cycles_high1, edx
        mov cycles_low1, eax
        popad
        pushad
        CPUID
        RDTSC
        popad
        pushad
        CPUID
        RDTSC
        mov cycles_high1, edx
        mov cycles_low1, eax
        popad
        pushad
        CPUID
        RDTSC
        popad
        pushad
        CPUID
        RDTSC
        mov cycles_high1, edx
        mov cycles_low1, eax
        popad
        pushad
        CPUID
        RDTSC
        // Overhead = low word now - low word stored by the previous pair.
        sub eax, cycles_low1
        mov cpuid_time, eax
        popad
    }
    // Clear the scratch values used above before taking the real start stamp.
    cycles_high1 = 0;
    cycles_low1 = 0;

    // Start timestamp: CPUID, then RDTSC, result (EDX:EAX) saved to
    // cycles_high1 / cycles_low1.
    __asm {
        pushad
        CPUID
        RDTSC
        mov cycles_high1, edx
        mov cycles_low1, eax
        popad
    }

    // The operation being timed.
    // NOTE(review): p is never read, checked for null, or freed.
    int* p = (int*)malloc(sizeof(int) * 1000000);

    // Stop timestamp, saved to cycles_high2 / cycles_low2. The pushad and
    // CPUID here execute before RDTSC, so they fall inside the timed span;
    // presumably that is what cpuid_time is meant to compensate for.
    __asm {
        pushad
        CPUID
        RDTSC
        mov cycles_high2, edx
        mov cycles_low2, eax
        popad
    }
    // Reassemble each 64-bit timestamp from its high and low halves.
    temp_cycles1 = ((unsigned __int64)cycles_high1 << 32) | cycles_low1;
    temp_cycles2 = ((unsigned __int64)cycles_high2 << 32) | cycles_low2;
    // Elapsed count minus the estimated measurement overhead.
    total_cycles = temp_cycles2 - temp_cycles1 - cpuid_time;

    cout << total_cycles;
    return 0;
}

The second program is a project in Visual Studio Code, which invokes the RDTSC instruction much more simply and is compiled with the G++ compiler. Here is the code:

#include <cstdlib>
#include <iostream>
#include <intrin.h>


using namespace std;

/// Measures, with the time-stamp counter, how many ticks a single malloc()
/// of one million ints takes, and prints that tick count to standard output.
///
/// @return 0 on success, 1 if the allocation fails.
int main()
{
    constexpr std::size_t kElementCount = 1000000;

    const unsigned long long start = __rdtsc();

    int* const p = static_cast<int*>(std::malloc(sizeof(int) * kElementCount));

    const unsigned long long end = __rdtsc();

    // A failed allocation would make the measurement meaningless.
    if (p == nullptr) {
        std::cerr << "malloc failed\n";
        return 1;
    }

    // Write through a volatile lvalue (outside the timed span) so the
    // optimizer cannot treat the allocation as dead and remove the
    // malloc/free pair, which would leave nothing between the timestamps.
    *static_cast<volatile int*>(p) = 0;

    std::cout << end - start;

    // Release the buffer; the original leaked it.
    std::free(p);
    return 0;
}

The 2nd project (the one in VSCode) returns a value ten times smaller than the 1st project, even though I did not use the CPUID instruction in the 2nd project. What am I doing wrong?

EDIT: I'm a beginner at benchmarking. My TA told me I should use RDTSC in combination with CPUID, and gave me a short description of how they work.

0

There are 0 answers