I have been trying to measure cache hits and cache misses.
I have been working on a quad-core Cortex-A72 (ARMv8) 64-bit SoC @ 1.5 GHz.
My C code to measure a cache hit is:
#define _GNU_SOURCE
#include <assert.h>
#include <sched.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <sys/random.h>
#include <stdbool.h>
#include <fcntl.h>
#include <sys/time.h>
char *chunk;
const size_t chunk_size = 1<<30;
/* FUNCTIONS */
struct timeval start_time, start_time1;
double get_diff(){
    struct timeval end_time;
    int rc = gettimeofday(&end_time, NULL);
    assert(rc == 0);
    return (end_time.tv_sec - start_time.tv_sec + (double) (end_time.tv_usec - start_time.tv_usec) / 1e6);
}
void print_affinity(){
    cpu_set_t mask;
    long nproc, i;
    if (sched_getaffinity(0, sizeof(cpu_set_t), &mask) == -1){
        perror("sched_getaffinity");
        assert(false);
    }
    nproc = sysconf(_SC_NPROCESSORS_ONLN);
    printf("sched_getaffinity = ");
    for (i = 0; i < nproc; i++)
        printf("%d ", CPU_ISSET(i, &mask));
}
void bind_to_cpu (){
    cpu_set_t mask;
    print_affinity();
    printf("\n");
    printf("sched_getcpu = %d\n", sched_getcpu());
    CPU_ZERO(&mask);
    CPU_SET(0, &mask);
    if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) {
        perror("sched_setaffinity");
        assert(false);
    }
    print_affinity();
    printf("\nsched_getcpu = %d\n", sched_getcpu());
}
void reset_mem(){
    memset(chunk, -1, chunk_size);
}
void initialize(size_t chunk_size){
    chunk = (char *) mmap(NULL, chunk_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_POPULATE, -1, 0);
    assert(chunk != MAP_FAILED);
    //initialize all bits to INIT_BIT value
    printf("Initializing memory...\n\n");
    reset_mem();
}
int main(int argc, char** argv){
    bind_to_cpu(); // pinning/binding cpu
    initialize(chunk_size);
    uint64_t temp = 0;
    size_t offset1 = ((size_t)rand() << 12) % chunk_size;
    size_t offset2 = ((size_t)rand() << 12) % chunk_size;
    uint64_t *addr1 = (uint64_t*) (chunk + offset1);
    uint64_t *addr2 = (uint64_t*) (chunk + offset2);
    double time_result = 0;
    sched_yield();
    __asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr1) : "memory");
    __asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr2) : "memory");
    for(int i = 0; i < 5000; i++){
        gettimeofday(&start_time, NULL);
        volatile uint64_t value;
        asm volatile ("LDR %0, [%1]\n\t"
                      : "=r" (value)
                      : "r" (addr1)
        );
        asm volatile ("LDR %0, [%1]\n\t"
                      : "=r" (value)
                      : "r" (addr2)
        );
        time_result += get_diff();
        //__asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr1) : "memory");
        //__asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr2) : "memory");
    }
    sched_yield();
    printf("Total Time: %f\n\n", time_result);
    return 0;
}
The code to measure a cache miss is the same, but with the two flush instructions inside the loop uncommented:
__asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr1) :"memory");
__asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr2) :"memory");
When I use the LDR instruction, everything seems to be OK, and I get the following outputs:
Cache hit:   Cache miss:
0.000522     0.001503
0.000558     0.001696
0.000584     0.001977
0.000712     0.002032
0.000683     0.001137
When I use the STR instruction:
for(int i = 0; i < 5000; i++){
    gettimeofday(&start_time, NULL);
    asm volatile("str %x1, %x0" : "=m"(*addr1) : "r"(temp));
    asm volatile("str %x1, %x0" : "=m"(*addr2) : "r"(temp));
    time_result += get_diff();
    __asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr1) : "memory");
    __asm__ __volatile__("dc civac, %0\n\t" : : "r" (addr2) : "memory");
}
I got these outputs:
Cache hit:   Cache miss:
0.000603     0.000299
0.000287     0.000311
0.000376     0.000290
0.000311     0.000305
0.000518     0.000297
Here the differences between the cache-hit and cache-miss timings are very small.
Why? Am I not flushing the cache in the right way?
High-performance CPUs have a store buffer that decouples execution of store instructions from commit to cache, making the actual store instruction itself able to execute quickly (and speculatively) regardless of cache hit or miss. (See this, this, this, also Does processor stall during cache coherence operation)
In your second version, you're timing just the stores, not the dc civac instructions that have to do the actual work of making sure dirty data is written back to RAM. The str instruction itself only has to write the store data and address into the store buffer, not wait for it to commit to L1d cache.

But in your LDR version, you're timing the actual cache-miss loads, which can't complete until they actually read the data from cache (or memory).
If you time the whole loop, including the dc civac instructions, you'll maybe see something meaningful. Instead of commenting out the dc civac instructions for the "hit" case, give them the address of a different cache line, not the ones you're loading or storing to.

This is especially true on cores that allow out-of-order execution. Timing a couple of instructions is not very meaningful when they can be reordered with the instructions that read the clock, and when the normal state of affairs is that many instructions are "in flight". Draining out-of-order execution with barriers so you can time something creates pretty artificial conditions, and timing overhead means you can never time the real cost of something that way.
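For example (a sketch only, not code from the question; the dummy_line and loop_time names are introduced here just for illustration), the load test could keep a flush inside the timed region but aim it at an unrelated line, with the timing wrapped around the whole loop:

static uint64_t dummy_line;                       /* separate object, so its cache line is unrelated to addr1/addr2 */
uint64_t value;

gettimeofday(&start_time, NULL);                  /* time the whole loop, not individual instructions */
for (int i = 0; i < 5000; i++) {
    asm volatile ("ldr %0, [%1]" : "=r" (value) : "r" (addr1));
    asm volatile ("ldr %0, [%1]" : "=r" (value) : "r" (addr2));
    /* "hit" run: flush an unrelated line each iteration.  */
    /* "miss" run: flush addr1 and addr2 here instead.     */
    asm volatile ("dc civac, %0" : : "r" (&dummy_line) : "memory");
}
double loop_time = get_diff();                    /* same global-start_time helper as in the question */
(void)value;                                      /* result intentionally unused */

That way both runs pay for the same number of dc civac instructions, and the measured difference comes from whether the loads themselves hit or miss.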
gettimeofday is pretty high overhead compared to a cache miss anyway; it's almost always better to construct a test that can run for many cycles, with the timing outside it. Also, don't use global variables; there's no reason not to just pass an arg to get_diff.
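A minimal sketch of what that could look like, based on the get_diff from the question (only the parameter is new):

double get_diff(struct timeval start){
    struct timeval end_time;
    int rc = gettimeofday(&end_time, NULL);
    assert(rc == 0);
    /* seconds plus fractional microseconds elapsed since 'start' */
    return (end_time.tv_sec - start.tv_sec) + (double) (end_time.tv_usec - start.tv_usec) / 1e6;
}

/* caller */
struct timeval start;
gettimeofday(&start, NULL);
/* ... the timed work ... */
double elapsed = get_diff(start);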
Also, your load test used volatile uint64_t value; for the asm output for no apparent reason, forcing the compiler to emit two stores to the stack for each load. The asm volatile statements already make sure the loads happen. Unless the CPU aggressively optimizes to discard unused load results (and not wait for them if a later instruction overwrites the register), you can just let the loads go unused.
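So the loop body could just be (a sketch; the (void) cast only silences an unused-variable warning):

uint64_t value;                                   /* plain local, no volatile */
asm volatile ("ldr %0, [%1]" : "=r" (value) : "r" (addr1));
asm volatile ("ldr %0, [%1]" : "=r" (value) : "r" (addr2));
(void)value;                                      /* results intentionally unused; asm volatile keeps the loads */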