Global destructor order problems (was: Re: Are ports supposed to build and run on 10-CURRENT?)
Dimitry Andric
dim at FreeBSD.org
Wed Jun 26 19:26:17 UTC 2013
On Jun 26, 2013, at 13:31, Michael Gmelin <freebsd at grem.de> wrote:
> On Wed, 26 Jun 2013 11:00:40 +0200
> Dimitry Andric <dim at FreeBSD.org> wrote:
>> On 2013-06-26 01:55, Michael Gmelin wrote:
>> ...
>>> The problem is that static initialization happens in the expected
>>> order (same translation unit), but termination does *not* happen in
>>> the reverse order of initialization,
...
> Yep, strange indeed - my test cases didn't use fPIC at first, so it
> took a while to figure it out. It seems to be some sort of
> combined link/runtime problem, since the same executable built on 10
> runs fine on 9.1-RELEASE when copied over. I tried replacing various
> system libraries with their versions from 9.1 in a jail to see if I
> could make it run on 10, but to no success.
>
> By the way, the same code built on 9.1 using clang 3.1 or clang 3.3
> runs fine on 10 as well, so the only case that does NOT work is build
> on 10 and run on 10 using clang. Also, when I take copies of main.o and
> libout.so that were built on 10 and link them on 9.1 using clang33, the
> problem doesn't happen either. So it appears that the problem happens
> when linking the executable when one of the objects is position
> independent and then only surfaces on 10.
So I did a bit of investigation, and the root cause is that both clang
and newer versions of gcc emit direct calls to the destructors of global
objects, while older gcc's, such as the one in base, generate anonymous
wrapper functions, which in turn call the destructors.
The direct destructor calls will not work correctly, if the destructors
are located in shared objects, while the global objects themselves are
in the main program, and if the main program is compiled with -fPIC.
This problem happens after the following revision, which changed the
behavior of __cxa_finalize():
http://svnweb.freebsd.org/base?view=revision&revision=211706
This revision is not in 9.1-RELEASE, but it is in 9-STABLE, so the
problem can also be reproduced there.
To illustrate: if you compile your test program's main.cpp with gcc
-fPIC, it produces (excerpted the assembly a bit for readability):
.section .ctors,"aw",@progbits
.align 4
.long _GLOBAL__I_main
[...]
__tcf_1:
pushl %ebp
movl %esp, %ebp
pushl %ebx
call __i686.get_pc_thunk.bx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
subl $16, %esp
leal innerInstance@GOTOFF(%ebx), %eax
pushl %eax
call _ZN5InnerD1Ev@PLT
addl $16, %esp
movl -4(%ebp), %ebx
leave
ret
[...]
_Z41__static_initialization_and_destruction_0ii:
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %ebx
call __i686.get_pc_thunk.bx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
decl %eax
jne .L14
cmpl $65535, %edx
jne .L14
subl $12, %esp
leal outerInstance@GOTOFF(%ebx), %eax
pushl %eax
call _ZN5OuterC1Ev@PLT
movl __dso_handle@GOT(%ebx), %esi
addl $12, %esp
leal __tcf_0@GOTOFF(%ebx), %eax
pushl %esi
pushl $0
pushl %eax
call __cxa_atexit@PLT
leal innerInstance@GOTOFF(%ebx), %eax
movl %eax, (%esp)
call _ZN5InnerC1Ev@PLT
addl $12, %esp
pushl %esi
pushl $0
leal __tcf_1@GOTOFF(%ebx), %eax
pushl %eax
call __cxa_atexit@PLT
addl $16, %esp
.L14:
leal -8(%ebp), %esp
popl %ebx
popl %esi
popl %ebp
ret
[...]
_GLOBAL__I_main:
pushl %ebp
movl $65535, %edx
movl %esp, %ebp
movl $1, %eax
popl %ebp
jmp _Z41__static_initialization_and_destruction_0ii
[...]
__tcf_0:
pushl %ebp
movl %esp, %ebp
pushl %ebx
call __i686.get_pc_thunk.bx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
subl $16, %esp
leal outerInstance@GOTOFF(%ebx), %eax
pushl %eax
call _ZN5OuterD1Ev@PLT
addl $16, %esp
movl -4(%ebp), %ebx
leave
ret
[...]
Summarizing:
- the startup code calls _GLOBAL__I_main, a.k.a. "global constructors keyed to main"
- jumps to _Z41__static_initialization_and_destruction_0ii, a.k.a.
__static_initialization_and_destruction_0(int, int)
- calls _ZN5OuterC1Ev, a.k.a. Outer::Outer(), to construct the
outerInstance object
- calls __cxa_atexit(), registering a generated wrapper function
__tcf_0(), which actually calls _ZN5OuterD1Ev, a.k.a. Outer::~Outer()
- similar for the innerInstance object
In contrast, clang produces the following:
_GLOBAL__I_a: # @_GLOBAL__I_a
pushl %ebp
movl %esp, %ebp
pushl %ebx
pushl %edi
pushl %esi
subl $12, %esp
calll .L2$pb
.L2$pb:
popl %ebx
addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp13-.L2$pb), %ebx
leal _ZL13outerInstance@GOTOFF(%ebx), %edi
movl %edi, (%esp)
calll _ZN5OuterC1Ev@PLT
movl __dso_handle@GOT(%ebx), %esi
movl %esi, 8(%esp)
movl %edi, 4(%esp)
movl _ZN5OuterD1Ev@GOT(%ebx), %eax
movl %eax, (%esp)
calll __cxa_atexit@PLT
leal .Lstr5@GOTOFF(%ebx), %eax
movl %eax, (%esp)
calll puts@PLT
movl %esi, 8(%esp)
leal _ZL13innerInstance@GOTOFF(%ebx), %eax
movl %eax, 4(%esp)
movl _ZN5InnerD1Ev@GOT(%ebx), %eax
movl %eax, (%esp)
calll __cxa_atexit@PLT
addl $12, %esp
popl %esi
popl %edi
popl %ebx
popl %ebp
ret
[...]
.section .ctors,"aw",@progbits
.align 4
.long _GLOBAL__I_a
Summarizing:
- the startup code calls _GLOBAL__I_a, a.k.a. "global constructors
keyed to a"
- calls _ZN5OuterC1Ev, a.k.a. Outer::Outer(), to construct the
outerInstance object
- calls __cxa_atexit(), directly registering _ZN5OuterD1Ev, a.k.a.
Outer::~Outer()
- similar for the innerInstance object (though the constructor is
inlined)
The crucial difference is that clang *directly* registers the
destructor's function pointer, instead of using a locally generated
wrapper. Newer versions of gcc behave the same way, since this upstream
revision:
http://gcc.gnu.org/viewcvs/gcc?view=revision&revision=125253
This is roughly gcc 4.3.0 and later. For example, gcc 4.8 generates:
_GLOBAL__sub_I_main.cpp:
pushl %ebp
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
call __x86.get_pc_thunk.bx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
subl $24, %esp
leal _ZL13outerInstance@GOTOFF(%ebx), %edi
pushl %edi
call _ZN5OuterC1Ev@PLT
leal __dso_handle@GOTOFF(%ebx), %esi
addl $12, %esp
pushl %esi
pushl %edi
pushl _ZN5OuterD1Ev@GOT(%ebx)
call __cxa_atexit@PLT
leal .LC2@GOTOFF(%ebx), %eax
movl %eax, (%esp)
call puts@PLT
addl $12, %esp
pushl %esi
leal _ZL13innerInstance@GOTOFF(%ebx), %eax
pushl %eax
pushl _ZN5InnerD1Ev@GOT(%ebx)
call __cxa_atexit@PLT
addl $16, %esp
leal -12(%ebp), %esp
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
[...]
.section .ctors,"aw",@progbits
.align 4
.long _GLOBAL__sub_I_main.cpp
In each case, __cxa_atexit() is called with the following three arguments:
the address of the destructor, the pointer to the object ('this'), and
the dso handle, which in this case belongs to main.
Now, when the program exits, it will repeatedly call __cxa_finalize() to
actually call the registered exit functions, each time passing a pointer
to the dso being unloaded (or NULL for main):
void
__cxa_finalize(void *dso)
{
struct dl_phdr_info phdr_info;
struct atexit *p;
struct atexit_fn fn;
int n, has_phdr;
if (dso != NULL)
has_phdr = _rtld_addr_phdr(dso, &phdr_info);
else
has_phdr = 0;
_MUTEX_LOCK(&atexit_mutex);
for (p = __atexit; p; p = p->next) {
for (n = p->ind; --n >= 0;) {
if (p->fns[n].fn_type == ATEXIT_FN_EMPTY)
continue; /* already been called */
fn = p->fns[n];
if (dso != NULL && dso != fn.fn_dso) {
/* wrong DSO ? */
if (!has_phdr || !__elf_phdr_match_addr(
&phdr_info, fn.fn_ptr.cxa_func))
continue;
}
/*
Mark entry to indicate that this particular handler
has already been called.
*/
p->fns[n].fn_type = ATEXIT_FN_EMPTY;
_MUTEX_UNLOCK(&atexit_mutex);
/* Call the function of correct type. */
if (fn.fn_type == ATEXIT_FN_CXA)
fn.fn_ptr.cxa_func(fn.fn_arg);
else if (fn.fn_type == ATEXIT_FN_STD)
fn.fn_ptr.std_func();
[...]
The problem is in the part with the comment "wrong DSO?". When the main
program is compiled with -fPIC, and __cxa_finalize() is called for
libout.so (which is the first dso to be processed), it will encounter
the entry for Outer::~Outer().
Then, the "wrong DSO?" part will be entered, and because has_phdr is
true, it will call __elf_phdr_match_addr() with the address of the
destructor. Since the destructor is registered with _ZN5OuterD1Ev@GOT,
i.e. with its actual address inside libout.so, that address falls within
libout.so's program headers, so it will match, and it will be called.
In contrast, if the main program is not compiled with -fPIC, the
destructor will be registered with _ZN5OuterD1Ev (i.e. without @GOT),
and __elf_phdr_match_addr() will not match, and the loop continues
without calling the destructor.
Finally, if the main program is compiled with gcc and -fPIC, the
destructor itself is never considered in the __cxa_finalize() loop, only
the locally generated wrapper function. That function will only be
called in the __cxa_finalize() call for the main program, and so the
destructor will be called at the right time.
I am not entirely sure what can be done to remedy this scenario, and I
also do not know the exact reasons for r211706 changing the behavior.
E.g., before r211706, if the atexit_fn's fn_dso did not match the dso
being unloaded, the loop would unconditionally continue without calling
the handler. On the other hand, r211706 seems to make sure functions
from dso's will be called before they are unloaded, as calling them
afterwards would not always make sense... :-)
> Based on this I would *speculate* that the problem first appeared when
> r232832 was committed [1] and there's something wrong with the order of
> how fini_array is filled by the linker (or traversed later).
At this point, I do not think r232832 is the culprit.
-Dimitry
More information about the freebsd-ports
mailing list