mbind: how to uniformly interleave an existing segment across all nodes?

414 views Asked by At

Using mbind, one can set the memory policy for a given mapped memory segment.

Q: How can I tell mbind to interleave a segment on all nodes?

If done after allocation but before usage, MPOL_INTERLEAVE on all nodes will do what we expect -- memory will be allocated uniformly on all nodes.

However, if the segment has already been written to and its pages are already allocated on, e.g., node zero, there appears to be no way to tell the kernel to uniformly interleave it across all NUMA nodes.

The operation simply becomes a no-op, as the kernel interprets it as "please place this segment on this set of nodes". Since we are passing the set of all NUMA nodes, no allocated page lies outside that set, so nothing needs to be moved.

Minimal, Complete, and Verifiable example

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>
#include <sys/syscall.h>
#include <numaif.h>
#include <numa.h>

/* Number of ints in the test array: 2^29 bytes' worth. */
#define N ((1<<29) / sizeof(int))

/*
 * PAGE_SIZE and PAGE_MASK are names that some system headers also define;
 * drop any earlier definition so the ones below cannot trigger a
 * macro-redefinition diagnostic.
 */
#undef PAGE_SIZE
#undef PAGE_MASK

/* Page size queried at run time; every use of the macro calls sysconf(). */
#define PAGE_SIZE (sysconf(_SC_PAGESIZE))
/* Mask that clears the offset-within-page bits of an address or length. */
#define PAGE_MASK (~(PAGE_SIZE - 1))

/*
 * Run `cmd` through popen() and copy everything the command writes to its
 * standard output onto this process's standard output.
 *
 * cmd: command line handed to popen().
 *
 * Exits the process with status -1 if the pipe cannot be opened, if pclose()
 * itself fails, or if the command finishes with a non-zero status.
 */
void print_command(const char *cmd) {
  FILE *fp = popen(cmd, "r");
  if (fp == NULL) {
    perror("popen");
    exit(-1);
  }

  char buf[1024];
  while (fgets(buf, sizeof(buf), fp) != NULL) {
    fputs(buf, stdout);
  }

  /*
   * pclose() returns -1 (with errno set) only when pclose() itself fails.
   * Any other non-zero value is the command's wait status; errno carries no
   * information in that case, so perror() would print a misleading reason.
   */
  int status = pclose(fp);
  if (status == -1) {
    perror("pclose");
    exit(-1);
  }
  if (status != 0) {
    fprintf(stderr, "command failed (wait status %d): %s\n", status, cmd);
    exit(-1);
  }
}

/*
 * Print the per-node memory usage of the current process by running
 * `numastat -c <pid>`; the report is wrapped in ANSI colour escapes.
 */
void print_node_allocations() {
  char numastat_cmd[1024];

  snprintf(numastat_cmd, sizeof numastat_cmd, "numastat -c %d", getpid());

  fputs("\x1B[32m", stdout);  /* colour on (SGR 32) */
  print_command(numastat_cmd);
  fputs("\x1B[0m", stdout);   /* reset attributes */
}

int main(int argc, char **argv) {
  int *a = numa_alloc_local(N * sizeof(int));
  size_t len = (N * sizeof(int)) & PAGE_MASK;
  unsigned long mymask = *numa_get_mems_allowed()->maskp;
  unsigned long maxnode = numa_get_mems_allowed()->size;

  // pin thread to core zero
  cpu_set_t mask;
  CPU_ZERO(&mask);
  CPU_SET(0, &mask);
  if (sched_setaffinity(syscall(SYS_gettid), sizeof(mask), &mask) < 0) {
    perror("sched_setaffinity");
    exit(-1);
  }

  // initialize array
  printf("\n\n(1) array allocated on local node\n");
  a[0] = 997;
  for(size_t i=1; i < N; i++) {
    a[i] = a[i-1] * a[i-1] % 1000000000;
  }
  print_node_allocations();

  // attempt to get it to be uniformly interleaved on all nodes
  printf("\n\n(2) array interleaved on all nodes\n");
  if (mbind(a, len, MPOL_INTERLEAVE, &mymask, maxnode, MPOL_MF_MOVE_ALL | MPOL_MF_STRICT) == -1) {
    perror("mbind failed");
    exit(-1);
  }
  print_node_allocations();

  // what if we interleave on all but the local node?
  printf("\n\n(3) array interleaved on all nodes (except local node)\n");
  mymask -= 0x01;
  if (mbind(a, len, MPOL_INTERLEAVE, &mymask, maxnode, MPOL_MF_MOVE_ALL | MPOL_MF_STRICT) == -1) {
    perror("mbind failed");
    exit(-1);
  }
  print_node_allocations();

  return 0;
}

Compiling and running with `gcc -o interleave_all interleave_all.c -lnuma && sudo ./interleave_all` yields:

(1) array allocated on local node

Per-node process memory usage (in MBs) for PID 20636 (interleave_all)
         Node 0 Node 1 Node 2 Node 3 Total
         ------ ------ ------ ------ -----
Huge          0      0      0      0     0
Heap          0      0      0      0     0
Stack         0      0      0      0     0
Private     514      0      0      0   514
-------  ------ ------ ------ ------ -----
Total       514      0      0      0   514


(2) array interleaved on all nodes

Per-node process memory usage (in MBs) for PID 20636 (interleave_all)
         Node 0 Node 1 Node 2 Node 3 Total
         ------ ------ ------ ------ -----
Huge          0      0      0      0     0
Heap          0      0      0      0     0
Stack         0      0      0      0     0
Private     514      0      0      0   514
-------  ------ ------ ------ ------ -----
Total       514      0      0      0   514


(3) array interleaved on all nodes (except local node)

Per-node process memory usage (in MBs) for PID 20636 (interleave_all)
         Node 0 Node 1 Node 2 Node 3 Total
         ------ ------ ------ ------ -----
Huge          0      0      0      0     0
Heap          0      0      0      0     0
Stack         0      0      0      0     0
Private       2    171    171    171   514
-------  ------ ------ ------ ------ -----
Total         2    171    171    171   514
0

There are 0 answers