signal handler priority issue
Sean McNeil
sean at mcneil.com
Fri Jun 11 06:10:15 GMT 2004
On Thu, 2004-06-10 at 23:02, Daniel Eischen wrote:
> On Thu, 10 Jun 2004, Sean McNeil wrote:
>
> > On Thu, 2004-06-10 at 22:04, David Xu wrote:
> > > Can you provide some code to demostrate the problem ? I have interest.
> > >
> > > David Xu
> >
> > Yes, here is some code snippets from boehm-gc:
> >
> > master thread doing:
> >
> > int GC_suspend_all()
> > {
> > int n_live_threads = 0;
> > int i;
> > GC_thread p;
> > int result;
> > pthread_t my_thread = pthread_self();
> >
> > GC_stopping_thread = my_thread; /* debugging only. */
> > GC_stopping_pid = getpid(); /* debugging only. */
> > for (i = 0; i < THREAD_TABLE_SZ; i++) {
> > for (p = GC_threads[i]; p != 0; p = p -> next) {
> > if (p -> id != my_thread) {
> > if (p -> flags & FINISHED) continue;
> > if (p -> stop_info.last_stop_count == GC_stop_count) continue;
> > if (p -> thread_blocked) /* Will wait */ continue;
> > n_live_threads++;
> > #if DEBUG_THREADS
> > GC_printf1("Sending suspend signal to 0x%lx\n", p -> id);
> > #endif
> >
> > result = pthread_kill(p -> id, SIG_SUSPEND);
> > switch(result) {
> > case ESRCH:
> > /* Not really there anymore. Possible? */
> > n_live_threads--;
> > break;
> > case 0:
> > break;
> > default:
> > ABORT("pthread_kill failed");
> > }
> > }
> > }
> > }
> > return n_live_threads;
> > }
>
> Am I blind, or is there no sem_wait() above after pthread_kill().
No, you aren't blind. It is done by the calling routine of above:
n_live_threads = GC_suspend_all();
if (GC_retry_signals) {
unsigned long wait_usecs = 0; /* Total wait since retry. */
# define WAIT_UNIT 3000
# define RETRY_INTERVAL 100000
for (;;) {
int ack_count;
sem_getvalue(&GC_suspend_ack_sem, &ack_count);
if (ack_count == n_live_threads) break;
if (wait_usecs > RETRY_INTERVAL) {
int newly_sent = GC_suspend_all();
# ifdef CONDPRINT
if (GC_print_stats) {
GC_printf1("Resent %ld signals after timeout\n",
newly_sent);
}
# endif
sem_getvalue(&GC_suspend_ack_sem, &ack_count);
if (newly_sent < n_live_threads - ack_count) {
WARN("Lost some threads during GC_stop_world?!\n",0);
n_live_threads = ack_count + newly_sent;
}
wait_usecs = 0;
}
usleep(WAIT_UNIT);
wait_usecs += WAIT_UNIT;
}
}
for (i = 0; i < n_live_threads; i++) {
if (0 != (code = sem_wait(&GC_suspend_ack_sem))) {
GC_err_printf1("Sem_wait returned %ld\n", (unsigned long)code);
ABORT("sem_wait for handler failed");
}
}
> > and slave threads doing:
> >
> > void GC_suspend_handler(int sig)
> > {
> > int dummy;
> > pthread_t my_thread = pthread_self();
> > GC_thread me;
> > sigset_t mask;
> > # ifdef PARALLEL_MARK
> > word my_mark_no = GC_mark_no;
> > /* Marker can't proceed until we acknowledge. Thus this is */
> > /* guaranteed to be the mark_no correspending to our */
> > /* suspension, i.e. the marker can't have incremented it yet. */
> > # endif
> > word my_stop_count = GC_stop_count;
> >
> > if (sig != SIG_SUSPEND) ABORT("Bad signal in suspend_handler");
> >
> > #if DEBUG_THREADS
> > GC_printf1("Suspending 0x%lx\n", my_thread);
> > #endif
> >
> > me = GC_lookup_thread(my_thread);
> > /* The lookup here is safe, since I'm doing this on behalf */
> > /* of a thread which holds the allocation lock in order */
> > /* to stop the world. Thus concurrent modification of the */
> > /* data structure is impossible. */
> > if (me -> stop_info.last_stop_count == my_stop_count) {
> > /* Duplicate signal. OK if we are retrying. */
> > if (!GC_retry_signals) {
> > WARN("Duplicate suspend signal in thread %lx\n",
> > pthread_self());
> > }
> > return;
> > }
> > # ifdef SPARC
> > me -> stop_info.stack_ptr = (ptr_t)GC_save_regs_in_stack();
> > # else
> > me -> stop_info.stack_ptr = (ptr_t)(&dummy);
> > # endif
> > # ifdef IA64
> > me -> backing_store_ptr = (ptr_t)GC_save_regs_in_stack();
> > # endif
> >
> > /* Tell the thread that wants to stop the world that this */
> > /* thread has been stopped. Note that sem_post() is */
> > /* the only async-signal-safe primitive in LinuxThreads. */
> > sem_post(&GC_suspend_ack_sem);
> > me -> stop_info.last_stop_count = my_stop_count;
> >
> > #if DEBUG_THREADS
> > GC_printf2("Waiting for restart #%d of 0x%lx\n", my_stop_count, my_thread);
> > #endif
> >
> > /* Wait until that thread tells us to restart by sending */
> > /* this thread a SIG_THR_RESTART signal. */
> > /* SIG_THR_RESTART should be masked at this point. Thus there */
> > /* is no race. */
>
> You also need to make sure SIGUSR1 is masked while in this
> function. It could possibly get another one after the sem_post()
> and before the sigsuspend() below.
>
> > if (sigfillset(&mask) != 0) ABORT("sigfillset() failed");
> > if (sigdelset(&mask, SIG_THR_RESTART) != 0) ABORT("sigdelset() failed");
> > # ifdef NO_SIGNALS
> > if (sigdelset(&mask, SIGINT) != 0) ABORT("sigdelset() failed");
> > if (sigdelset(&mask, SIGQUIT) != 0) ABORT("sigdelset() failed");
> > if (sigdelset(&mask, SIGTERM) != 0) ABORT("sigdelset() failed");
> > if (sigdelset(&mask, SIGABRT) != 0) ABORT("sigdelset() failed");
> > # endif
> > do {
> > me->stop_info.signal = 0;
> > sigsuspend(&mask); /* Wait for signal */
> > } while (me->stop_info.signal != SIG_THR_RESTART);
> > /* If the RESTART signal gets lost, we can still lose. That should be */
> > /* less likely than losing the SUSPEND signal, since we don't do much */
> > /* between the sem_post and sigsuspend. */
> > /* We'd need more handshaking to work around that, since we don't want */
> > /* to accidentally leave a RESTART signal pending, thus causing us to */
> > /* continue prematurely in a future round. */
> >
> > #if DEBUG_THREADS
> > GC_printf1("Continuing 0x%lx\n", my_thread);
> > #endif
> > }
> >
> > and here is the output with debug messages:
> >
> > Stopping the world from 0x50d000
> > Sending suspend signal to 0x9d1400
> > Suspending 0x9d1400
> > World stopped from 0x50d000
> > Pushing stacks from thread 0x50d000
> > Stack for thread 0x9d1400 = [7fffffeed95c,7fffffeee000)
> > Stack for thread 0x50d000 = [7fffffffcf00,800000000000)
> > World starting
> > Sending restart signal to 0x9d1400
> > World started
> > Buildfile: build.xml
> > Stopping the world from 0x50d000
> > Sending suspend signal to 0x9d1400
> > Waiting for restart #2 of 0x9d1400
> >
> > There are other things causing output, but you can see that the slave
> > isn't getting to the sigsuspend before it is evoked again from the
> > master thread.
>
> Something is weird. It's not getting the first restart, but
> yet it's past the sem_post() in the second restart. Try displaying
> the return from sem_post(). And where's the missing sem_wait()?
More information about the freebsd-threads
mailing list