We have a bug in a production system, where a process segfaults while holding a shared memory mutex. We'd like it to release the lock when dying. We use sem_wait()/sem_post(), but doing my homework, I've found that this API does not allow for such a behavior:
http://www.usenetmessages.com/view.php?c=computer&g=1074&id=78029&p=0
The answer, the article says, is using the robust pthreads API. I've found the following article about this topic:
http://www.embedded-linux.co.uk/tutorial/mutex_mutandis
But, having implemented the following code, I'm having an unreliable behavior, that is, should I tell process 3, for instance, to segfault, the code works just fine. The other processes wake up, recognize that a process died while holding the mutex, and recover. However, should I tell process 0 to die, or should I remove the sleep call on line 63, the other processes do not wake up once the failling process kills itself. Am I doing something wrong?
#include <stdio.h>
#include <stdlib.h>
#include <features.h>
#define __USE_POSIX
#include <signal.h>
#include <sys/types.h>
#include <unistd.h>
#define __USE_MISC
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#define __USE_GNU /* Necessario para usar a API PTHREAD_MUTEX_ROBUST_NP */
#define __USE_UNIX98 /* Necessario para usar a funcao pthread_mutexattr_settype */
#include <pthread.h>
#include <sys/wait.h>
static void *shrd;
static int child_main(int slot, int segfault) {
pthread_mutex_t *lock = (pthread_mutex_t *) shrd;
int err;
if ( 0 != (err=pthread_mutex_lock(lock)) ) {
switch(err) {
case EINVAL:
printf("Lock invalido no filho [%d]\n", slot);
goto excecao;
case EDEADLK:
printf("O filho [%d] tentou travar um lock que jah possui.\n", slot);
break;
case EOWNERDEAD:
printf("Filho [%d] foi informado que o processo que estava com o lock morreu.\n", slot);
if ( 0 == pthread_mutex_consistent_np(lock) ) {
printf("Filho [%d] retornou o lock para um estado consistente.\n", slot);
} else {
fprintf(stderr, "Nao foi possivel retornar o lock a um estado consistente.\n");
goto desistir;
}
if ( 0 != (err=pthread_mutex_lock(lock)) ) {
fprintf(stderr, "Apos recuperar o estado do lock, nao foi possivel trava-lo: %d\n", err);
goto desistir;
}
case ENOTRECOVERABLE:
printf("O filho [%d] foi informado de que o lock estah permanentemente em estado inconsistente.\n", slot);
goto desistir;
default:
printf("Erro desconhecido ao tentar travar o lock no filho [%d]: [%d]\n", slot, err);
goto excecao;
}
}
printf("Filho [%d] adquiriu o lock.\n", slot);
if ( segfault == slot ) {
printf("Matando o PID [%d] com SIGSEGV.\n", getpid());
kill(getpid(), SIGSEGV);
} else {
sleep(1);
}
if ( 0 != (err = pthread_mutex_unlock(lock)) ) {
switch (err) {
case EPERM:
printf("O filho [%d] tentou liberar o lock, mas nao o possui.\n", slot);
break;
default:
fprintf(stderr, "Erro inesperado ao liberar o lock do filho [%d]: [%d]\n", slot, err);
}
} else {
printf("Filho [%d] retornou o lock.\n", slot);
}
return 0;
excecao:
fprintf(stderr, "Programa terminado devido excecao.\n");
return 1;
desistir:
fprintf(stderr, "A execucao do sistema nao deve prosseguir. Abortando todos os processos.\n");
kill(0, SIGTERM);
/* unreachable */
return 1;
}
int main(int argc, const char * const argv[]) {
pid_t filhos[10];
int status;
pid_t p;
int segfault = -1;
pthread_mutexattr_t attrs;
if ( argc > 1 ) {
segfault = atoi(argv[1]);
if ( segfault < 0 || segfault > 9 )
segfault = -1;
}
if ( (shrd = mmap(NULL, sizeof(pthread_mutex_t), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0)) == MAP_FAILED ) {
perror("Erro ao criar shrd mem:\n");
exit(1);
}
pthread_mutexattr_init (&attrs);
pthread_mutexattr_settype (&attrs, PTHREAD_MUTEX_RECURSIVE_NP);
pthread_mutexattr_setrobust_np (&attrs, PTHREAD_MUTEX_ROBUST_NP);
pthread_mutexattr_setpshared (&attrs, PTHREAD_PROCESS_SHARED);
/*
Devido a um BUG na glibc 2.5 (que eh a usada pelo CentOS 5,
a unica forma de fazer os mutexes robustos funcionarem eh
setando o protocolo para PTHREAD_PRIO_INHERIT:
http://sourceware.org/ml/libc-help/2010-04/msg00028.html
*/
pthread_mutexattr_setprotocol (&attrs, PTHREAD_PRIO_INHERIT);
pthread_mutex_init ((pthread_mutex_t*) shrd, &attrs);
pthread_mutexattr_destroy (&attrs);
for (size_t i=0; i<sizeof(filhos)/sizeof(pid_t); ++i) {
if ( (filhos[i]=fork()) == 0 ) {
return child_main((int) i, segfault);
} else {
if ( filhos[i] < 0 ) {
fprintf(stderr, "Erro ao criar o filho [%zu]. Abortando.\n", i);
exit(1);
}
}
}
for (size_t i=0; i<sizeof(filhos)/sizeof(pid_t); ++i) {
do {
p = waitpid(filhos[i], &status, 0);
} while (p != -1);
}
printf("Pai encerrou a sua execucao.\n");
return 0;
}
BTW: I'm compiling on CentOS 5, 64 bits:
$ uname -rm
2.6.18-194.el5 x86_64
glibc-2.5-49
gcc-4.1.2-48.el5
(Sorry, the sentences and comments on the code are in portuguese, my native language.)
Your
EOWNERDEAD
block misses abreak
beforeENOTRECOVERABLE
block. Also, according topthread_mutex_lock
manpage, after the first call topthread_mutex_lock()
, the lock is held by caller even whenEOWNERDEAD
is returned. Thus, you should not call it again insideEOWNERDEAD
's block.