hastd: parent got stuck in waitpid()
Mikolaj Golub
to.my.trociny at gmail.com
Sun Sep 19 09:57:17 UTC 2010
Hi,
When trying to reproduce the scenario described in another thread (hastd: possible
race when a worker is starting), I ran into another issue. I was running the
following script:
#!/bin/sh
for i in `jot 1000`; do
hastctl status storage > /dev/null
done &
for i in `jot 1000`; do
hastctl role init storage
hastctl role primary storage
done
The parent hastd got stuck again, but this time while changing the role to init
and terminating the worker: it hung in waitpid() after sending SIGTERM to the
worker with kill(). It looks as if the signal was lost. I don't have a clue how
this might happen, but it is rather easily reproducible in my environment with
the script above.
After the hang:
[root@lolek /usr/src/sbin/hastctl]# ps auxww |grep hast
root 3334 0.0 0.5 11244 2372 ?? Is 12:13PM 0:00.10 /sbin/hastd -ddd
root 3473 0.0 7.0 44908 35664 ?? I 12:13PM 0:00.43 hastd: storage (primary) (hastd)
root 3474 0.0 0.3 10924 1764 1 I+ 12:13PM 0:00.01 hastctl role init storage
root 3475 0.0 0.3 10924 1764 1 I+ 12:13PM 0:00.01 hastctl status storage
[root@lolek /usr/src/sbin/hastctl]# gdb /usr/obj/usr/src/sbin/hastd/hastd 3334
[Switching to Thread 28404140 (LWP 100070)]
0x282b9689 in wait4 () from /lib/libc.so.7
(gdb) bt
#0 0x282b9689 in wait4 () from /lib/libc.so.7
#1 0x282902a3 in waitpid () from /lib/libc.so.7
#2 0x280de272 in waitpid () from /lib/libthr.so.3
#3 0x0804c664 in control_set_role_common (cfg=0x28419600, nvout=0x2850e0d0, role=1 '\001',
res=0x284eb500, name=0x284a4442 "storage", no=0) at /usr/src/sbin/hastd/control.c:103
#4 0x0804cf91 in control_handle (cfg=0x28419600) at /usr/src/sbin/hastd/control.c:344
#5 0x08050357 in main_loop () at /usr/src/sbin/hastd/hastd.c:682
#6 0x0805076d in main (argc=0, argv=0xbfbfecd0) at /usr/src/sbin/hastd/hastd.c:792
(gdb) fr 3
#3 0x0804c664 in control_set_role_common (cfg=0x28419600, nvout=0x2850e0d0, role=1 '\001',
res=0x284eb500, name=0x284a4442 "storage", no=0) at /usr/src/sbin/hastd/control.c:103
103 } else if (waitpid(res->hr_workerpid, NULL, 0) !=
(gdb) list
98 if (res->hr_workerpid != 0) {
99 if (kill(res->hr_workerpid, SIGTERM) < 0) {
100 pjdlog_errno(LOG_WARNING,
101 "Unable to kill worker process %u",
102 (unsigned int)res->hr_workerpid);
103 } else if (waitpid(res->hr_workerpid, NULL, 0) !=
104 res->hr_workerpid) {
105 pjdlog_errno(LOG_WARNING,
106 "Error while waiting for worker process %u",
107 (unsigned int)res->hr_workerpid);
[root@lolek /usr/src/sbin/hastctl]# gdb /usr/obj/usr/src/sbin/hastd/hastd 3473
Thread 8 (Thread 28404140 (LWP 100079)):
#0 0x282a14bb in sigtimedwait () from /lib/libc.so.7
#1 0x280dff3b in sigtimedwait () from /lib/libthr.so.3
#2 0x0805e5ec in guard_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1986
#3 0x0805aa47 in hastd_primary (res=0x284eb500) at /usr/src/sbin/hastd/primary.c:828
#4 0x0804c6d5 in control_set_role_common (cfg=0x28419600, nvout=0x2850e0d0, role=2 '\002',
res=0x284eb500, name=0x284a4442 "storage", no=0) at /usr/src/sbin/hastd/control.c:117
#5 0x0804cf91 in control_handle (cfg=0x28419600) at /usr/src/sbin/hastd/control.c:344
#6 0x08050357 in main_loop () at /usr/src/sbin/hastd/hastd.c:682
#7 0x0805076d in main (argc=0, argv=0xbfbfecd0) at /usr/src/sbin/hastd/hastd.c:792
Thread 7 (Thread 28404280 (LWP 100089)):
#0 0x2834b233 in ioctl () from /lib/libc.so.7
#1 0x0805b0f4 in ggate_recv_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:943
#2 0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#3 0x00000000 in ?? ()
Thread 6 (Thread 284043c0 (LWP 100090)):
#0 0x280e6ea7 in __error () from /lib/libthr.so.3
#1 0x280e6a88 in __error () from /lib/libthr.so.3
#2 0x284a14e0 in ?? ()
#3 0x00000008 in ?? ()
#4 0x00000001 in ?? ()
#5 0x284a14c0 in ?? ()
#6 0x00000000 in ?? ()
#7 0x00000000 in ?? ()
#8 0x00000000 in ?? ()
#9 0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3
#10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3
#11 0x0805b6a8 in cv_wait (cv=0x284ea110, lock=0x284ea108) at synch.h:149
#12 0x0805b767 in local_send_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1081
#13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#14 0x00000000 in ?? ()
Thread 5 (Thread 28404500 (LWP 100091)):
#0 0x280e6ea7 in __error () from /lib/libthr.so.3
#1 0x280e6a88 in __error () from /lib/libthr.so.3
#2 0x284a15e0 in ?? ()
#3 0x00000008 in ?? ()
#4 0x00000001 in ?? ()
#5 0x284a15c0 in ?? ()
#6 0x00000000 in ?? ()
#7 0x00000000 in ?? ()
#8 0x00000000 in ?? ()
#9 0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3
#10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3
#11 0x0805b6a8 in cv_wait (cv=0x284ea114, lock=0x284ea10c) at synch.h:149
#12 0x0805bd3f in remote_send_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1166
#13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#14 0x00000000 in ?? ()
Thread 4 (Thread 28404640 (LWP 100093)):
#0 0x280e6ea7 in __error () from /lib/libthr.so.3
#1 0x280e6a88 in __error () from /lib/libthr.so.3
#2 0x284a1660 in ?? ()
#3 0x00000008 in ?? ()
#4 0x00000001 in ?? ()
#5 0x284a1640 in ?? ()
#6 0x00000000 in ?? ()
#7 0x00000000 in ?? ()
#8 0x00000000 in ?? ()
#9 0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3
#10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3
#11 0x0805b6a8 in cv_wait (cv=0x284ea124, lock=0x284ea11c) at synch.h:149
#12 0x0805c58e in remote_recv_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1312
#13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#14 0x00000000 in ?? ()
Thread 3 (Thread 28404780 (LWP 100094)):
#0 0x280e6ea7 in __error () from /lib/libthr.so.3
#1 0x280e6a88 in __error () from /lib/libthr.so.3
#2 0x284a16e0 in ?? ()
#3 0x00000008 in ?? ()
#4 0x00000001 in ?? ()
#5 0x284a16c0 in ?? ()
#6 0x00000000 in ?? ()
#7 0x00000000 in ?? ()
#8 0x00000000 in ?? ()
#9 0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3
#10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3
#11 0x0805b6a8 in cv_wait (cv=0x806ba54, lock=0x806ba50) at synch.h:149
#12 0x0805cc3b in ggate_send_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1432
#13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#14 0x00000000 in ?? ()
Thread 2 (Thread 284048c0 (LWP 100095)):
#0 0x280e6ea7 in __error () from /lib/libthr.so.3
#1 0x280e6a88 in __error () from /lib/libthr.so.3
#2 0x28bf48e0 in ?? ()
#3 0x00000008 in ?? ()
#4 0x00000001 in ?? ()
#5 0x284a16c0 in ?? ()
#6 0x00000000 in ?? ()
#7 0x00000000 in ?? ()
#8 0x00000000 in ?? ()
#9 0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3
#10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3
#11 0x0805b6a8 in cv_wait (cv=0x806ba54, lock=0x806ba50) at synch.h:149
#12 0x0805cc3b in ggate_send_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1432
#13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#14 0x00000000 in ?? ()
Thread 2 (Thread 284048c0 (LWP 100095)):
#0 0x280e6ea7 in __error () from /lib/libthr.so.3
#1 0x280e6a88 in __error () from /lib/libthr.so.3
#2 0x28bf48e0 in ?? ()
---Type <return> to continue, or q <return> to quit---
#3 0x00000008 in ?? ()
#4 0x00000001 in ?? ()
#5 0x28bf48c0 in ?? ()
#6 0x00000000 in ?? ()
#7 0xbf4f9e84 in ?? ()
#8 0x00000000 in ?? ()
#9 0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3
#10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3
#11 0x0805b6a8 in cv_wait (cv=0x806ba60, lock=0x806ba5c) at synch.h:149
#12 0x0805d053 in sync_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1528
#13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#14 0x00000000 in ?? ()
Thread 1 (Thread 28404a00 (LWP 100096)):
#0 0x28300ed5 in recvfrom () from /lib/libc.so.7
#1 0x28286f52 in recv () from /lib/libc.so.7
#2 0x0805f237 in proto_common_recv (fd=33, data=0xbf3f8f47 "", size=5)
at /usr/src/sbin/hastd/proto_common.c:77
#3 0x0805f68d in sp_recv (ctx=0x2850e3f0, data=0xbf3f8f47 "", size=5)
at /usr/src/sbin/hastd/proto_socketpair.c:185
#4 0x0805ec61 in proto_recv (conn=0x2850e3e0, data=0xbf3f8f47, size=5)
at /usr/src/sbin/hastd/proto.c:207
#5 0x0804e42e in hast_proto_recv_hdr (conn=0x2850e3e0, nvp=0xbf3f8f80)
at /usr/src/sbin/hastd/hast_proto.c:308
#6 0x0804d0b7 in ctrl_thread (arg=0x284eb500) at /usr/src/sbin/hastd/control.c:385
#7 0x280dc35f in pthread_getprio () from /lib/libthr.so.3
#8 0x00000000 in ?? ()
--
Mikolaj Golub
More information about the freebsd-fs
mailing list