hastd: parent got stuck in waitpid()

Mikolaj Golub to.my.trociny at gmail.com
Sun Sep 19 09:57:17 UTC 2010


Hi,

When trying to reproduce the scenario described in another thread (hastd: possible
race when a worker is starting) I ran into another issue. I was running the
following script:

#!/bin/sh
#
# Reproducer for the hastd parent hang: query the status of the "storage"
# resource 1000 times in the background while the foreground switches its
# role between init and primary 1000 times.

# Background job: status queries, output discarded.
for pass in $(jot 1000); do
        hastctl status storage > /dev/null
done &

# Foreground: each pass sets the role to init, then back to primary.
for pass in $(jot 1000); do
        hastctl role init storage
        hastctl role primary storage
done

The parent hastd got stuck, but this time while changing the role to init and
terminating the worker: in waitpid() after sending kill() to the worker. It
looked like the signal was lost. I have no idea how this might happen, but
it is quite easily reproducible in my environment with the script above.

After the hang:

[root at lolek /usr/src/sbin/hastctl]# ps auxww |grep hast
root   3334   0.0  0.5  11244   2372  ??  Is   12:13PM   0:00.10 /sbin/hastd -ddd
root   3473   0.0  7.0  44908  35664  ??  I    12:13PM   0:00.43 hastd: storage (primary) (hastd)
root   3474   0.0  0.3  10924   1764   1  I+   12:13PM   0:00.01 hastctl role init storage
root   3475   0.0  0.3  10924   1764   1  I+   12:13PM   0:00.01 hastctl status storage

[root at lolek /usr/src/sbin/hastctl]# gdb /usr/obj/usr/src/sbin/hastd/hastd 3334

[Switching to Thread 28404140 (LWP 100070)]
0x282b9689 in wait4 () from /lib/libc.so.7
(gdb) bt
#0  0x282b9689 in wait4 () from /lib/libc.so.7
#1  0x282902a3 in waitpid () from /lib/libc.so.7
#2  0x280de272 in waitpid () from /lib/libthr.so.3
#3  0x0804c664 in control_set_role_common (cfg=0x28419600, nvout=0x2850e0d0, role=1 '\001', 
    res=0x284eb500, name=0x284a4442 "storage", no=0) at /usr/src/sbin/hastd/control.c:103
#4  0x0804cf91 in control_handle (cfg=0x28419600) at /usr/src/sbin/hastd/control.c:344
#5  0x08050357 in main_loop () at /usr/src/sbin/hastd/hastd.c:682
#6  0x0805076d in main (argc=0, argv=0xbfbfecd0) at /usr/src/sbin/hastd/hastd.c:792
(gdb) fr 3
#3  0x0804c664 in control_set_role_common (cfg=0x28419600, nvout=0x2850e0d0, role=1 '\001', 
    res=0x284eb500, name=0x284a4442 "storage", no=0) at /usr/src/sbin/hastd/control.c:103
103                     } else if (waitpid(res->hr_workerpid, NULL, 0) !=
(gdb) list
98              if (res->hr_workerpid != 0) {
99                      if (kill(res->hr_workerpid, SIGTERM) < 0) {
100                             pjdlog_errno(LOG_WARNING,
101                                 "Unable to kill worker process %u",
102                                 (unsigned int)res->hr_workerpid);
103                     } else if (waitpid(res->hr_workerpid, NULL, 0) !=
104                         res->hr_workerpid) {
105                             pjdlog_errno(LOG_WARNING,
106                                 "Error while waiting for worker process %u",
107                                 (unsigned int)res->hr_workerpid);

[root at lolek /usr/src/sbin/hastctl]# gdb /usr/obj/usr/src/sbin/hastd/hastd 3473

Thread 8 (Thread 28404140 (LWP 100079)):
#0  0x282a14bb in sigtimedwait () from /lib/libc.so.7
#1  0x280dff3b in sigtimedwait () from /lib/libthr.so.3
#2  0x0805e5ec in guard_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1986
#3  0x0805aa47 in hastd_primary (res=0x284eb500) at /usr/src/sbin/hastd/primary.c:828
#4  0x0804c6d5 in control_set_role_common (cfg=0x28419600, nvout=0x2850e0d0, role=2 '\002', 
    res=0x284eb500, name=0x284a4442 "storage", no=0) at /usr/src/sbin/hastd/control.c:117
#5  0x0804cf91 in control_handle (cfg=0x28419600) at /usr/src/sbin/hastd/control.c:344
#6  0x08050357 in main_loop () at /usr/src/sbin/hastd/hastd.c:682
#7  0x0805076d in main (argc=0, argv=0xbfbfecd0) at /usr/src/sbin/hastd/hastd.c:792

Thread 7 (Thread 28404280 (LWP 100089)):
#0  0x2834b233 in ioctl () from /lib/libc.so.7
#1  0x0805b0f4 in ggate_recv_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:943
#2  0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#3  0x00000000 in ?? ()

Thread 6 (Thread 284043c0 (LWP 100090)):
#0  0x280e6ea7 in __error () from /lib/libthr.so.3
#1  0x280e6a88 in __error () from /lib/libthr.so.3
#2  0x284a14e0 in ?? ()
#3  0x00000008 in ?? ()
#4  0x00000001 in ?? ()
#5  0x284a14c0 in ?? ()
#6  0x00000000 in ?? ()
#7  0x00000000 in ?? ()
#8  0x00000000 in ?? ()
#9  0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3
#10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3
#11 0x0805b6a8 in cv_wait (cv=0x284ea110, lock=0x284ea108) at synch.h:149
#12 0x0805b767 in local_send_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1081
#13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#14 0x00000000 in ?? ()

Thread 5 (Thread 28404500 (LWP 100091)):
#0  0x280e6ea7 in __error () from /lib/libthr.so.3
#1  0x280e6a88 in __error () from /lib/libthr.so.3
#2  0x284a15e0 in ?? ()
#3  0x00000008 in ?? ()
#4  0x00000001 in ?? ()
#5  0x284a15c0 in ?? ()
#6  0x00000000 in ?? ()
#7  0x00000000 in ?? ()
#8  0x00000000 in ?? ()
#9  0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3
#10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3
#11 0x0805b6a8 in cv_wait (cv=0x284ea114, lock=0x284ea10c) at synch.h:149
#12 0x0805bd3f in remote_send_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1166
#13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#14 0x00000000 in ?? ()

Thread 4 (Thread 28404640 (LWP 100093)):
#0  0x280e6ea7 in __error () from /lib/libthr.so.3
#1  0x280e6a88 in __error () from /lib/libthr.so.3
#2  0x284a1660 in ?? ()
#3  0x00000008 in ?? ()
#4  0x00000001 in ?? ()
#5  0x284a1640 in ?? ()
#6  0x00000000 in ?? ()
#7  0x00000000 in ?? ()
#8  0x00000000 in ?? ()
#9  0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3
#10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3
#11 0x0805b6a8 in cv_wait (cv=0x284ea124, lock=0x284ea11c) at synch.h:149
#12 0x0805c58e in remote_recv_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1312
#13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#14 0x00000000 in ?? ()

Thread 3 (Thread 28404780 (LWP 100094)):
#0  0x280e6ea7 in __error () from /lib/libthr.so.3
#1  0x280e6a88 in __error () from /lib/libthr.so.3
#2  0x284a16e0 in ?? ()
#3  0x00000008 in ?? ()
#4  0x00000001 in ?? ()
#5  0x284a16c0 in ?? ()
#6  0x00000000 in ?? ()
#7  0x00000000 in ?? ()
#8  0x00000000 in ?? ()
#9  0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3
#10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3
#11 0x0805b6a8 in cv_wait (cv=0x806ba54, lock=0x806ba50) at synch.h:149
#12 0x0805cc3b in ggate_send_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1432
#13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#14 0x00000000 in ?? ()

Thread 2 (Thread 284048c0 (LWP 100095)):
#0  0x280e6ea7 in __error () from /lib/libthr.so.3
#1  0x280e6a88 in __error () from /lib/libthr.so.3
#2  0x28bf48e0 in ?? ()
#3  0x00000008 in ?? ()
#4  0x00000001 in ?? ()
#5  0x284a16c0 in ?? ()
#6  0x00000000 in ?? ()
#7  0x00000000 in ?? ()
#8  0x00000000 in ?? ()
#9  0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3
#10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3
#11 0x0805b6a8 in cv_wait (cv=0x806ba54, lock=0x806ba50) at synch.h:149
#12 0x0805cc3b in ggate_send_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1432
#13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#14 0x00000000 in ?? ()

Thread 2 (Thread 284048c0 (LWP 100095)):
#0  0x280e6ea7 in __error () from /lib/libthr.so.3
#1  0x280e6a88 in __error () from /lib/libthr.so.3
#2  0x28bf48e0 in ?? ()
---Type <return> to continue, or q <return> to quit---
#3  0x00000008 in ?? ()
#4  0x00000001 in ?? ()
#5  0x28bf48c0 in ?? ()
#6  0x00000000 in ?? ()
#7  0xbf4f9e84 in ?? ()
#8  0x00000000 in ?? ()
#9  0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3
#10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3
#11 0x0805b6a8 in cv_wait (cv=0x806ba60, lock=0x806ba5c) at synch.h:149
#12 0x0805d053 in sync_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1528
#13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#14 0x00000000 in ?? ()

Thread 1 (Thread 28404a00 (LWP 100096)):
#0  0x28300ed5 in recvfrom () from /lib/libc.so.7
#1  0x28286f52 in recv () from /lib/libc.so.7
#2  0x0805f237 in proto_common_recv (fd=33, data=0xbf3f8f47 "", size=5)
    at /usr/src/sbin/hastd/proto_common.c:77
#3  0x0805f68d in sp_recv (ctx=0x2850e3f0, data=0xbf3f8f47 "", size=5)
    at /usr/src/sbin/hastd/proto_socketpair.c:185
#4  0x0805ec61 in proto_recv (conn=0x2850e3e0, data=0xbf3f8f47, size=5)
    at /usr/src/sbin/hastd/proto.c:207
#5  0x0804e42e in hast_proto_recv_hdr (conn=0x2850e3e0, nvp=0xbf3f8f80)
    at /usr/src/sbin/hastd/hast_proto.c:308
#6  0x0804d0b7 in ctrl_thread (arg=0x284eb500) at /usr/src/sbin/hastd/control.c:385
#7  0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#8  0x00000000 in ?? ()

-- 
Mikolaj Golub


More information about the freebsd-fs mailing list