Global pointer appearing null to signal handler

76 views Asked by At

I have some C code that launches processes. For each process it registers some metadata in a linked list of structs. Then, the SIGCHLD handler updates said metadata. The structure is something like this:

// block/unblock utilities
static sigset_t block () {
  sigset_t sigchld_mask, old_mask;
  sigemptyset(&sigchld_mask);
  sigaddset(&sigchld_mask, SIGCHLD);

  if(sigprocmask(SIG_BLOCK, &sigchld_mask, &old_mask))
    exit_with_error();
  
  return old_mask;
}

static void unblock (sigset_t* old_mask)  {
  if(sigprocmask(SIG_SETMASK, old_mask, NULL)) exit_with_error();
}


typedef struct ChildProcess {
  pid_t pid;
  int* status;
} ChildProcess;

// ProcessNode is a linked list node containing process metadata
typedef struct ProcessNode {
  ChildProcess* proc;
  volatile struct ProcessNode* next;
} ProcessNode;

volatile ProcessNode * head = NULL;

int launch_process ( ... ) {
  block();            // block SIGCHLD
  ...                 // launch process with posix_spawn()
  register_proc(...); // register all metadata
  unblock();          // unblock SIGCHLD
}

void register_proc (pid_t pid, int* status) {
  ChildProcess* child = (ChildProcess*)malloc(sizeof(ChildProcess));
  child->pid = pid;
  child->status = status;
  volatile ProcessNode* node = (ProcessNode*)malloc(sizeof(ProcessNode));
  new_node->proc = child;
  new_node->next = head;
  head = new_node;
  return 0;
}

int update_proc (pid_t pid, int status) {
  volatile ProcessNode* cursor = head;
  while(cursor != NULL && curr->proc->pid != pid) {
    cursor = cursor->next;
  }
  if(cursor != NULL) {
    *(cursor->proc->status) = status;
  }
}

void sigchld_handler (int sig) {
  int status;
  pid_t pid;
  while((pid = waitpid(-1, &status, WNOHANG))) {
    update_proc(pid, status);
  }
}

// API function for checking the status pointer
// if we are to wait for the process to terminate, we pselect until sigchld eventually updates the status pointer
int get_state (int* status, bool wait_for_termination) {
  sigset_t old_mask = block(); // block SIGCHLD, store old mask
  bool state_unknown = true;
  bool done = false;
  while(state_unknown) {
    if(WIFEXITED(*status) || WIFSIGNALED(*status)) {
      done = true;
    }
    state_unknown = false;
    if(wait_for_termination && done) {
      if(pselect(0, NULL, NULL, NULL, NULL, &old_mask)) {
        if(errno == EINTR) {
          state_unknown = true; // check status again on interrupt
        }
      }
    }
    else if(!wait_for_termination) {
      state_unknown = false;
    }
  }
  unblock(); // unblock SIGCHLD
  return 0;
}

int main () {
  // ... allocate some memory for other tasks

  //Setup SIGCHLD handler
  sigset_t sigchld_mask;
  sigemptyset(&sigchld_mask);
  sigaddset(&sigchld_mask, SIGCHLD);
  struct sigaction sa;
  sa.sa_handler = sigchld_handler;
  sa.sa_mask = sigchld_mask;
  sa.sa_flags = SA_RESTART;
  sigaction(SIGCHLD, &sa, NULL);
}

Sometimes this works, but it can sometimes fails because update_status fails to find a matching ProcessNode. This happens because head is NULL when sigchld_handler executes -- even if it is non-null before an after. I've elided a lot of detail, but I block SIGCHLD every time head is written or read, and it's volatile. I thought those two things would be sufficient to deal with any race conditions.

Any other ideas for things that could go wrong?

1

There are 1 answers

6
Craig Estey On

With the correct initialization, you should get one SIGCHLD for each terminated process.

However ... Consider the following:

You fire off two children: child A and child B

  1. child A terminates
  2. handler entered (SIGCHLD for child A)
  3. handler does successful waitpid for child A
  4. new child B terminates
  5. handler loops and does successful waitpid for child B
  6. handler exits.
  7. handler entered (SIGCHLD for child B)
  8. waitpid returns -1 (for ECHILD)

In the signal handler, the fix is to change:

while ((pid = waitpid(-1, &status, WNOHANG)))
    update_proc(pid, status);

Into:

while ((pid = waitpid(-1, &status, WNOHANG)) > 0)
    update_proc(pid, status);

Note that volatile for head is not necessary because of the block/unblock calls.

Here is the modified/fixed code. Because I was trying to produce a full diagnostic and get this out quickly, I hacked it up somewhat severely [Sorry]:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <stdbool.h>
#include <time.h>
#include <sys/wait.h>

typedef struct ChildProcess {
    pid_t pid;
    int status;
    double reaptime;
} ChildProcess;

// ProcessNode is a linked list node containing process metadata
typedef struct ProcessNode {
    ChildProcess *proc;
    struct ProcessNode *next;
} ProcessNode;

ProcessNode *head = NULL;

#define MAXPROC     10
ProcessNode nodelist[MAXPROC];
ChildProcess childlist[MAXPROC];

volatile int trap_idx;
double trap_times[MAXPROC * 2];

int forkcnt;
volatile int reapcnt;
volatile int misscnt;

#define prt(_fmt...) \
    do { \
        char buf[100]; \
        size_t len = sprintf(buf,_fmt); \
        write(1,buf,len); \
    } while (0)

#define sysfault(_fmt...) \
    do { \
        prt(_fmt); \
        exit(99); \
    } while (0)

double zerotime;

double
tscgetf(void)
{
    struct timespec ts;
    double sec;

    clock_gettime(CLOCK_MONOTONIC,&ts);

    sec = ts.tv_nsec;
    sec /= 1e9;
    sec += ts.tv_sec;

    sec -= zerotime;

    return sec;
}

void
exit_with_error(void)
{
    int sverr = errno;

    sysfault("exit_with_error: sverr=%d (%s)\n",sverr,strerror(sverr));
}

#if 1
static void
block(sigset_t *old_mask)
{
  sigset_t sigchld_mask;
  sigemptyset(&sigchld_mask);
  sigaddset(&sigchld_mask, SIGCHLD);

  if(sigprocmask(SIG_BLOCK, &sigchld_mask, old_mask))
    exit_with_error();
}

static void
unblock(sigset_t* old_mask)
{
  if(sigprocmask(SIG_SETMASK, old_mask, NULL)) exit_with_error();
}

#else
void
block(void)
{
    sigset_t set;

    sigemptyset(&set);
    sigaddset(&set,SIGCHLD);
    sigprocmask(SIG_BLOCK,&set,NULL);
}

void
unblock(void)
{
    sigset_t set;

    sigemptyset(&set);
    sigaddset(&set,SIGCHLD);
    sigprocmask(SIG_UNBLOCK,&set,NULL);
}
#endif

void
register_proc(int pididx,pid_t pid, int *status)
{
    ChildProcess *child = &childlist[pididx];

    child->pid = pid;
    //child->status = status;
    ProcessNode *new_node = &nodelist[pididx];

    new_node->proc = child;

    new_node->next = head;
    head = new_node;
}

void
launch_process(int pididx)
{
    sigset_t old_mask;
    block(&old_mask);                           // block SIGCHLD

    pid_t pid = fork();

    if (pid != 0) {
        forkcnt += 1;
        register_proc(pididx,pid,NULL);         // register all metadata
    }

    unblock(&old_mask);                         // unblock SIGCHLD

    if (pid == 0) {
#if ADDSLEEP
        struct timespec ts;
        ts.tv_sec = 0;
        ts.tv_nsec = 1000000 * pididx;
        nanosleep(&ts,NULL);
#endif
        exit(0x10 + pididx);
    }
}

void
update_proc(pid_t pid, int status, double curtime)
{
    ProcessNode *cursor = head;

    if (pid <= 0) {
        int sverr = errno;
        sysfault("update_proc: fault pid=%d sverr=%d (%s)\n",
            pid,sverr,strerror(sverr));
    }

    while (cursor != NULL && cursor->proc->pid != pid) {
        cursor = cursor->next;
    }
    if (cursor != NULL) {
        cursor->proc->status = status;
        cursor->proc->reaptime = curtime;
        reapcnt += 1;
    }
    else
        misscnt += 1;
}

void
sigchld_handler(int sig)
{
    int status;
    pid_t pid;

    double curtime = tscgetf();
    trap_times[trap_idx++] = curtime;

#if BUG
    while ((pid = waitpid(-1, &status, WNOHANG)))
        update_proc(pid, status);
#else
    while ((pid = waitpid(-1, &status, WNOHANG)) > 0)
        update_proc(pid, status);
#endif
}

// API function for checking the status pointer
// if we are to wait for the process to terminate, we pselect until sigchld eventually updates the status pointer
int
get_state(int *status, bool wait_for_termination)
{
    sigset_t old_mask;
    block(&old_mask);       // block SIGCHLD, store old mask
    bool state_unknown = true;
    bool done = false;

    while (state_unknown) {
        if (WIFEXITED(*status) || WIFSIGNALED(*status)) {
            done = true;
        }
        state_unknown = false;
        if (wait_for_termination && done) {
            if (pselect(0, NULL, NULL, NULL, NULL, &old_mask)) {
                if (errno == EINTR) {
                    state_unknown = true;   // check status again on interrupt
                }
            }
        }
        else if (!wait_for_termination) {
            state_unknown = false;
        }
    }
    unblock(&old_mask);                         // unblock SIGCHLD
    return 0;
}

int
main(void)
{
    // ... allocate some memory for other tasks

    fflush(stdout);
    zerotime = tscgetf();

    // Setup SIGCHLD handler
    sigset_t sigchld_mask;

    sigemptyset(&sigchld_mask);
    sigaddset(&sigchld_mask, SIGCHLD);
#if 0
    struct sigaction sa;
#else
    struct sigaction sa = { 0 };
#endif

    sa.sa_handler = sigchld_handler;
    sa.sa_mask = sigchld_mask;
    sa.sa_flags = SA_RESTART;
//#if NODEFER
    sa.sa_flags |= SA_NODEFER;
//#endif
    sigaction(SIGCHLD, &sa, NULL);

    for (int pididx = 0;  pididx < MAXPROC;  ++pididx)
        launch_process(pididx);

    time_t timebeg = time(NULL);

    int doneflg = 0;
    while (1) {
        time_t timenow = time(NULL);
        if ((timenow - timebeg) > 4)
            break;

        if (reapcnt >= forkcnt) {
            doneflg = 1;
            break;
        }
    }

    block(NULL);
    prt("main: forkcnt=%d reapcnt=%d misscnt=%d\n",forkcnt,reapcnt,misscnt);
    for (int pididx = 0;  pididx < MAXPROC;  ++pididx) {
        ProcessNode *cursor = &nodelist[pididx];
        ChildProcess *proc = cursor->proc;
        prt("main: pididx=%d status=%8.8X reaptime=%.9f\n",
            pididx,proc->status,proc->reaptime);
    }
    if (! doneflg)
        sysfault("main: timeout!!!\n");

    return 0;
}

In the code above, I've used cpp conditionals to denote old vs. new code:

#if 0
// old code
#else
// new code
#endif

#if 1
// new code
#endif

Note: this can be cleaned up by running the file through unifdef -k


Here is the program output with the original bug (compiled with -DBUG=1):

update_proc: fault pid=-1 sverr=10 (No child processes)

Before I arrived at the above fix, I went through several rounds of experimentation.

Some of the attempts I tried.

There appears to be a race condition in the original above code. The child processes terminate so fast that some signals can be lost.

I added timestamping to show when the handler is invoked.

I added SA_NODEFER (which did nothing). And a nanosleep in the child (which was an "okay" workaround but not the ultimate fix).

My best guess is:

  1. SIGCHLD is not a realtime signal.
  2. Ordinary signals are not queued.
  3. If multiple children terminate within a short time of one another, some signals may be lost because the kernel only keeps a "signal pending" flag (and not count of pending signals of the same type).

Here is the program output with the [simplified] fix:

main: forkcnt=10 reapcnt=10 misscnt=0
main: pididx=0 status=00001000 reaptime=0.000245451
main: pididx=1 status=00001100 reaptime=0.000313690
main: pididx=2 status=00001200 reaptime=0.000368583
main: pididx=3 status=00001300 reaptime=0.000368583
main: pididx=4 status=00001400 reaptime=0.000368583
main: pididx=5 status=00001500 reaptime=0.000517055
main: pididx=6 status=00001600 reaptime=0.000517055
main: pididx=7 status=00001700 reaptime=0.000637672
main: pididx=8 status=00001800 reaptime=0.000637672
main: pididx=9 status=00001900 reaptime=0.000608328