catrig[fl].c and inexact

Sat May 13 13:08:37 UTC 2017

On 13 May 2017, at 08:08, Steve Kargl <sgk at troutmask.apl.washington.edu> wrote:
> 
> On Sat, May 13, 2017 at 11:35:49AM +1000, Bruce Evans wrote:
>> On Fri, 12 May 2017, Steve Kargl wrote:
...
>> required for the standard magic.  I planned to fix all this magic using
>> macros like raise_inexact().
> 
> If you plan to fix the magic with raise_inexact, then please
> test with a suite of compilers.  AFAICT, clang is optimizing
> out the code.  I haven't written a testcase to demonstrate this
> as I have other irons in the fire.

Using the full catrig.c and -O3, I tried gcc 4.2.1, 4.7.4, 4.8.5, 4.9.4,
5.4.0, 6.3.0 and 7.0.1, in addition to clang 3.4.1, 3.8.0, 3.9.1, 4.0.0
and 5.0.0.  All versions of gcc produced something similar to the
following for i386:

# /usr/src/lib/msun/src/catrig.c:314:   if (x == 0 && y == 0)
        .loc 1 314 0
        fldz
        fucom   %st(3)  #
        fnstsw  %ax     # tmp262
        sahf
        setne   %al     #, tmp270
        setnp   %dl     #, tmp259
        subl    $1, %eax        #, tmp270
        testb   %al, %dl        # tmp270, tmp259
        je      .L176   #,
        fucomp  %st(1)  #
        fnstsw  %ax     # tmp281
        sahf
        setne   %al     #, tmp289
        setnp   %dl     #, tmp278
        subl    $1, %eax        #, tmp289
        testb   %al, %dl        # tmp289, tmp278
        je      .L37    #,
        fstp    %st(3)  #
        fstp    %st(0)  #
        jmp     .L153   #
[...]
.L176:
        fstp    %st(0)  #
.L37:
.LBB25:
# /usr/src/lib/msun/src/catrig.c:318:   raise_inexact();
        flds    tiny    # tiny
        fadds   .LC2    #
        fstps   120(%esp)       # junk

and for amd64:

# /usr/src/lib/msun/src/catrig.c:314:   if (x == 0 && y == 0)
        .loc 1 314 0
        pxor    %xmm7, %xmm7    # tmp386
        ucomisd %xmm7, %xmm3    # tmp386, z
        setnp   %dl     #, tmp258
        cmovne  %eax, %edx      # tmp258,, tmp207, tmp254
        testb   %dl, %dl        # tmp254
        je      .L34    #,
        ucomisd %xmm7, %xmm1    # tmp386, z
        setnp   %dl     #, tmp266
        cmove   %edx, %eax      # tmp266,, tmp262
        testb   %al, %al        # tmp262
        je      .L34    #,
[...]
.L34:
.LBB33:
# /usr/src/lib/msun/src/catrig.c:318:   raise_inexact();
        movss   tiny(%rip), %xmm0       # tiny, tiny.0_28
        addss   .LC13(%rip), %xmm0      #, _29
        movss   %xmm0, 188(%rsp)        # _29, junk

All versions of clang produced something similar to the following for
i386:

        .loc    1 314 8 is_stmt 1       # /usr/src/lib/msun/src/catrig.c:314:8
        fldz
        .loc    1 314 13 is_stmt 0      # /usr/src/lib/msun/src/catrig.c:314:13
        fxch    %st(1)
        fucom   %st(1)
        fnstsw  %ax
        sahf
        jne     .LBB0_19
        jp      .LBB0_19
        .loc    1 0 13                  # /usr/src/lib/msun/src/catrig.c:0:13
        fxch    %st(3)
        fucom   %st(1)
        fstp    %st(1)
        fnstsw  %ax
        sahf
        fldz
        fxch    %st(1)
        fxch    %st(3)
        jne     .LBB0_19
        jp      .LBB0_19
[...]
.LBB0_19:                               # %do.body
        .loc    1 0 8 is_stmt 0         # /usr/src/lib/msun/src/catrig.c:0:8
        fstp    %st(1)
        .loc    1 318 2 is_stmt 1       # /usr/src/lib/msun/src/catrig.c:318:2
        fld1
        fadds   tiny
        fstps   168(%esp)

and for amd64:

        .loc    1 314 8 is_stmt 1       # /usr/src/lib/msun/src/catrig.c:314:8
        pxor    %xmm2, %xmm2
        .loc    1 314 13 is_stmt 0      # /usr/src/lib/msun/src/catrig.c:314:13
        ucomisd %xmm2, %xmm4
        jne     .LBB0_15
        jp      .LBB0_15
        .loc    1 0 13                  # /usr/src/lib/msun/src/catrig.c:0:13
        ucomisd %xmm2, %xmm3
        jne     .LBB0_15
        jnp     .LBB0_21
.LBB0_15:                               # %do.body
        .loc    1 318 2 is_stmt 1       # /usr/src/lib/msun/src/catrig.c:318:2
        movss   tiny(%rip), %xmm2       # xmm2 = mem[0],zero,zero,zero
        addss   .LCPI0_2(%rip), %xmm2
.Ltmp11:
        movss   %xmm2, -16(%rbp)

I.e., these all look good, at least with regard to not optimizing out
the desired addition.

The only compiler I could find that optimizes everything away (at
least in the simplified test case) is the Intel compiler:

https://godbolt.org/g/g1UT2m

-Dimitry

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 194 bytes
Desc: Message signed with OpenPGP
URL: <http://lists.freebsd.org/pipermail/freebsd-hackers/attachments/20170513/6ccea7c2/attachment.sig>