I have the following code in C++ (explained later):
#include <stdio.h>
#include <string>
#include <vector>
using namespace std;
/* Per-thread state bundle: one instance is meant to be touched by one thread only. */
struct th_private
{
    double mean_tau;   /* starts at zero */

    /* Zero-initialise the only field. */
    th_private() : mean_tau( 0 ) {}
};
/* A named resistor carrying per-thread storage for the approaches under comparison. */
class resistor
{
public:
    string name;

    /*
     * Approach 0: inside each resistor, keep one array per 'thread private'
     * variable. Thread 0 uses mean_tau[0], thread 1 uses mean_tau[1], and so on.
     * Neighbouring slots are contiguous doubles, so this layout is expected to
     * cause a lot of false sharing.
     */
    vector<double> mean_tau;

    /*
     * Approach 1: inside each resistor, keep a 1D array of th_private, where
     * state[0] is used ONLY by thread 0, state[1] ONLY by thread 1, and so on.
     * Could potentially eliminate false sharing, but nothing here controls how
     * the elements line up with cache lines.
     */
    vector<th_private> state;

    /* Start with an empty name and no per-thread slots. */
    resistor() : name( "" ) {}

    /* Size both per-thread containers so each thread index has its own slot. */
    void prepare_for_threads( int num_threads )
    {
        /* Approach 1 storage */
        state.resize( num_threads );
        /* Approach 0 storage */
        mean_tau.resize( num_threads );
    }

    ~resistor() {}
};
class mesh
{
public:
vector<resistor*> R;
mesh( int num_resistors, int num_threads )
{
for( int i = 0; i < num_resistors; i++ )
{
resistor *r = new resistor();
r->prepare_for_threads( num_threads );
R.push_back(r);
}
}
~mesh(){}
};
/*****************************************************************************
Approach 2: Declare a global 2D matrix, where each row belongs to a
thread and each column belongs to a resistor. Seems to be the best approach.
R[0] R[1] R[2] R[3] R[4] R[9]
thread0: [0][0] [0][1] [0][2] [0][3] [0][4] .. [0][9]
...
thread3: [3][0] [3][1] [3][2] [3][3] [3][4] .. [3][9]
*****************************************************************************/
/* The aligned attribute applies to the array object as a whole (its starting
   address); consecutive elements remain sizeof(th_private) bytes apart. */
th_private global_state[4][10] __attribute__((aligned(0x1000)));
/*
 * Demo driver: builds a mesh of 10 resistors prepared for 4 threads, then
 * prints the size of th_private and the addresses of the first four per-thread
 * slots for Approach 1 (resistor 0) and of the first four elements of row 0
 * of global_state for Approach 2.
 */
int main( int argc, char** argv )
{
    // Assume that there are 4 threads declared.
    mesh grid(10, 4);

    // sizeof yields size_t; passing it for %d is undefined behaviour on LP64,
    // so convert explicitly to the type the format specifier expects.
    printf("sizeof(th_private): %lu\n",
           static_cast<unsigned long>(sizeof(th_private)));

    // %p requires a void* argument, hence the explicit casts.
    printf("Approach 1: %p %p %p %p\n",
           static_cast<void*>(&grid.R[0]->state[0]),
           static_cast<void*>(&grid.R[0]->state[1]),
           static_cast<void*>(&grid.R[0]->state[2]),
           static_cast<void*>(&grid.R[0]->state[3]));
    printf("Approach 2: %p %p %p %p\n",
           static_cast<void*>(&global_state[0][0]),
           static_cast<void*>(&global_state[0][1]),
           static_cast<void*>(&global_state[0][2]),
           static_cast<void*>(&global_state[0][3]));
}
And the output on a 64-bit linux machine is:
sizeof(th_private): 8
Approach 1: 0x658080 0x658088 0x658090 0x658098
Approach 2: 0x608000 0x608008 0x608010 0x608018
Each resistor has a set of attributes which are to be modified (read and write) by threads. Ideally, they can be treated as thread private variables. But, due to some restrictions imposed by the old code base, I can only go for one of three approaches:
- Approach 0: Within each resistor structure, declare arrays of 'thread private' variables. Thread 0 will use mean_tau[0], offset[0].., Thread 1 will use mean_tau[1], offset[1]... and so on. As I understand, this is not a good approach, would lead to a lot of false sharing.
- Approach 1: 1D array of struct th_private in each instance of the resistor, where state[0] is used ONLY by thread[0], state[1] is used ONLY by thread[1] and so on. Could potentially eliminate false sharing, but how to ensure it will align in the cache?
- Approach 2: Declare a global 2D matrix, where each row belongs to a thread and each column belongs to a resistor (More details in the code).
Now, which approach is the best for i) avoiding false sharing and ii) cache alignment?