git: 35f6b83049da - stable/14 - Update the Arm Optimized Routine library to v24.01
Date: Mon, 08 Apr 2024 13:15:43 UTC
The branch stable/14 has been updated by andrew:
URL: https://cgit.FreeBSD.org/src/commit/?id=35f6b83049dabe18277ac0fcc73ede6ed7f3a1a6
commit 35f6b83049dabe18277ac0fcc73ede6ed7f3a1a6
Author: Andrew Turner <andrew@FreeBSD.org>
AuthorDate: 2024-02-29 11:39:12 +0000
Commit: Andrew Turner <andrew@FreeBSD.org>
CommitDate: 2024-04-08 13:15:18 +0000
Update the Arm Optimized Routine library to v24.01
Sponsored by: Arm Ltd
(cherry picked from commit 5a02ffc32e777041dd2dad4e651ed2a0865a0a5d)
---
contrib/arm-optimized-routines/README | 2 +-
contrib/arm-optimized-routines/config.mk.dist | 13 +-
contrib/arm-optimized-routines/math/Dir.mk | 6 +-
.../arm-optimized-routines/math/aarch64/v_cos.c | 87 +
.../arm-optimized-routines/math/aarch64/v_cosf.c | 82 +
.../arm-optimized-routines/math/aarch64/v_exp.c | 125 +
.../arm-optimized-routines/math/aarch64/v_exp2f.c | 113 +
.../math/aarch64/v_exp2f_1u.c | 72 +
.../math/aarch64/v_exp_data.c | 146 +
.../arm-optimized-routines/math/aarch64/v_expf.c | 122 +
.../math/aarch64/v_expf_1u.c | 77 +
.../arm-optimized-routines/math/aarch64/v_log.c | 100 +
.../math/aarch64/v_log_data.c | 156 +
.../arm-optimized-routines/math/aarch64/v_logf.c | 74 +
.../arm-optimized-routines/math/aarch64/v_math.h | 135 +
.../arm-optimized-routines/math/aarch64/v_pow.c | 22 +
.../arm-optimized-routines/math/aarch64/v_powf.c | 148 +
.../arm-optimized-routines/math/aarch64/v_sin.c | 97 +
.../arm-optimized-routines/math/aarch64/v_sinf.c | 82 +
contrib/arm-optimized-routines/math/exp10.c | 129 +
contrib/arm-optimized-routines/math/exp_data.c | 23 +-
.../arm-optimized-routines/math/include/mathlib.h | 67 +-
contrib/arm-optimized-routines/math/math_config.h | 61 +-
contrib/arm-optimized-routines/math/s_cos.c | 6 -
contrib/arm-optimized-routines/math/s_cosf.c | 6 -
contrib/arm-optimized-routines/math/s_exp.c | 6 -
contrib/arm-optimized-routines/math/s_exp2f.c | 6 -
contrib/arm-optimized-routines/math/s_exp2f_1u.c | 6 -
contrib/arm-optimized-routines/math/s_expf.c | 6 -
contrib/arm-optimized-routines/math/s_expf_1u.c | 6 -
contrib/arm-optimized-routines/math/s_log.c | 6 -
contrib/arm-optimized-routines/math/s_logf.c | 6 -
contrib/arm-optimized-routines/math/s_pow.c | 6 -
contrib/arm-optimized-routines/math/s_powf.c | 6 -
contrib/arm-optimized-routines/math/s_sin.c | 6 -
contrib/arm-optimized-routines/math/s_sinf.c | 6 -
.../arm-optimized-routines/math/test/mathbench.c | 152 +-
.../math/test/mathbench_funcs.h | 50 +-
.../math/test/mathbench_wrappers.h | 42 +-
.../arm-optimized-routines/math/test/mathtest.c | 9 +-
contrib/arm-optimized-routines/math/test/runulp.sh | 112 +-
.../math/test/testcases/directed/exp10.tst | 15 +
contrib/arm-optimized-routines/math/test/ulp.c | 81 +-
contrib/arm-optimized-routines/math/test/ulp.h | 29 +-
.../arm-optimized-routines/math/test/ulp_funcs.h | 50 +-
.../math/test/ulp_wrappers.h | 36 +-
contrib/arm-optimized-routines/math/tgamma128.c | 356 ++
contrib/arm-optimized-routines/math/tgamma128.h | 141 +
.../math/tools/tgamma128_gen.jl | 212 ++
contrib/arm-optimized-routines/math/v_cos.c | 95 -
contrib/arm-optimized-routines/math/v_cosf.c | 84 -
contrib/arm-optimized-routines/math/v_exp.c | 128 -
contrib/arm-optimized-routines/math/v_exp.h | 14 -
contrib/arm-optimized-routines/math/v_exp2f.c | 117 -
contrib/arm-optimized-routines/math/v_exp2f_1u.c | 75 -
contrib/arm-optimized-routines/math/v_expf.c | 122 -
contrib/arm-optimized-routines/math/v_expf_1u.c | 80 -
contrib/arm-optimized-routines/math/v_log.c | 104 -
contrib/arm-optimized-routines/math/v_log.h | 18 -
contrib/arm-optimized-routines/math/v_log_data.c | 158 -
contrib/arm-optimized-routines/math/v_logf.c | 73 -
contrib/arm-optimized-routines/math/v_math.h | 661 ----
contrib/arm-optimized-routines/math/v_pow.c | 27 -
contrib/arm-optimized-routines/math/v_powf.c | 235 --
contrib/arm-optimized-routines/math/v_sin.c | 103 -
contrib/arm-optimized-routines/math/v_sinf.c | 88 -
contrib/arm-optimized-routines/math/vn_cos.c | 12 -
contrib/arm-optimized-routines/math/vn_cosf.c | 12 -
contrib/arm-optimized-routines/math/vn_exp.c | 12 -
contrib/arm-optimized-routines/math/vn_exp2f.c | 12 -
contrib/arm-optimized-routines/math/vn_exp2f_1u.c | 11 -
contrib/arm-optimized-routines/math/vn_expf.c | 12 -
contrib/arm-optimized-routines/math/vn_expf_1u.c | 11 -
contrib/arm-optimized-routines/math/vn_log.c | 12 -
contrib/arm-optimized-routines/math/vn_logf.c | 12 -
contrib/arm-optimized-routines/math/vn_pow.c | 12 -
contrib/arm-optimized-routines/math/vn_powf.c | 12 -
contrib/arm-optimized-routines/math/vn_sin.c | 12 -
contrib/arm-optimized-routines/math/vn_sinf.c | 12 -
contrib/arm-optimized-routines/pl/math/Dir.mk | 89 +-
contrib/arm-optimized-routines/pl/math/acos_2u.c | 100 +
contrib/arm-optimized-routines/pl/math/acosf_1u4.c | 99 +
contrib/arm-optimized-routines/pl/math/asin_3u.c | 106 +
contrib/arm-optimized-routines/pl/math/asin_data.c | 19 +
contrib/arm-optimized-routines/pl/math/asinf_2u5.c | 100 +
.../arm-optimized-routines/pl/math/asinf_data.c | 16 +
contrib/arm-optimized-routines/pl/math/asinh_2u5.c | 5 +-
.../arm-optimized-routines/pl/math/asinhf_3u5.c | 6 +-
.../arm-optimized-routines/pl/math/atan_common.h | 40 +-
contrib/arm-optimized-routines/pl/math/atanf_2u9.c | 12 +-
.../arm-optimized-routines/pl/math/atanf_common.h | 33 +-
contrib/arm-optimized-routines/pl/math/atanh_3u.c | 15 +-
.../arm-optimized-routines/pl/math/atanhf_3u1.c | 12 +-
contrib/arm-optimized-routines/pl/math/cbrt_2u.c | 5 +-
contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c | 9 +-
contrib/arm-optimized-routines/pl/math/cosh_2u.c | 9 +-
contrib/arm-optimized-routines/pl/math/coshf_1u9.c | 9 +-
contrib/arm-optimized-routines/pl/math/cospi_3u1.c | 89 +
.../arm-optimized-routines/pl/math/cospif_2u6.c | 84 +
contrib/arm-optimized-routines/pl/math/erf_2u5.c | 102 +
contrib/arm-optimized-routines/pl/math/erf_data.c | 788 +++++
contrib/arm-optimized-routines/pl/math/erfc_1u8.c | 153 +
contrib/arm-optimized-routines/pl/math/erfc_4u5.c | 155 -
contrib/arm-optimized-routines/pl/math/erfc_data.c | 3628 +++++++++++++++++++-
contrib/arm-optimized-routines/pl/math/erfcf.h | 38 -
contrib/arm-optimized-routines/pl/math/erfcf_1u7.c | 103 +
contrib/arm-optimized-routines/pl/math/erfcf_2u.c | 133 -
.../arm-optimized-routines/pl/math/erfcf_data.c | 703 +++-
contrib/arm-optimized-routines/pl/math/erff_1u5.c | 108 -
contrib/arm-optimized-routines/pl/math/erff_2u.c | 82 +
contrib/arm-optimized-routines/pl/math/erff_data.c | 532 ++-
.../arm-optimized-routines/pl/math/erfinv_24u5.c | 81 +
.../arm-optimized-routines/pl/math/erfinvf_4u7.c | 74 +
contrib/arm-optimized-routines/pl/math/erfinvl.c | 114 +
contrib/arm-optimized-routines/pl/math/estrin.h | 16 -
.../arm-optimized-routines/pl/math/estrin_wrap.h | 48 -
contrib/arm-optimized-routines/pl/math/estrinf.h | 14 -
contrib/arm-optimized-routines/pl/math/expf.c | 4 +-
contrib/arm-optimized-routines/pl/math/expm1_2u5.c | 19 +-
.../arm-optimized-routines/pl/math/expm1f_1u6.c | 11 +-
.../arm-optimized-routines/pl/math/finite_pow.h | 365 ++
contrib/arm-optimized-routines/pl/math/horner.h | 14 -
.../arm-optimized-routines/pl/math/horner_wrap.h | 34 -
contrib/arm-optimized-routines/pl/math/hornerf.h | 14 -
.../pl/math/include/mathlib.h | 238 +-
.../pl/math/include/pl_test.h | 8 +-
contrib/arm-optimized-routines/pl/math/log1p_2u.c | 17 +-
.../arm-optimized-routines/pl/math/log1pf_2u1.c | 16 +-
.../arm-optimized-routines/pl/math/math_config.h | 252 +-
contrib/arm-optimized-routines/pl/math/math_err.c | 4 +-
contrib/arm-optimized-routines/pl/math/math_errf.c | 4 +-
.../pl/math/pairwise_horner.h | 14 -
.../pl/math/pairwise_horner_wrap.h | 48 -
.../pl/math/pairwise_hornerf.h | 14 -
contrib/arm-optimized-routines/pl/math/pl_sig.h | 56 +-
.../pl/math/poly_advsimd_f32.h | 24 +
.../pl/math/poly_advsimd_f64.h | 24 +
.../arm-optimized-routines/pl/math/poly_generic.h | 277 ++
.../pl/math/poly_scalar_f32.h | 24 +
.../pl/math/poly_scalar_f64.h | 24 +
.../arm-optimized-routines/pl/math/poly_sve_f32.h | 26 +
.../arm-optimized-routines/pl/math/poly_sve_f64.h | 26 +
.../pl/math/poly_sve_generic.h | 301 ++
.../arm-optimized-routines/pl/math/s_acosh_3u5.c | 6 -
.../arm-optimized-routines/pl/math/s_acoshf_3u1.c | 6 -
.../arm-optimized-routines/pl/math/s_asinh_3u5.c | 6 -
.../arm-optimized-routines/pl/math/s_asinhf_2u7.c | 6 -
.../arm-optimized-routines/pl/math/s_atan2_3u.c | 6 -
.../arm-optimized-routines/pl/math/s_atan2f_3u.c | 6 -
.../arm-optimized-routines/pl/math/s_atan_2u5.c | 6 -
.../arm-optimized-routines/pl/math/s_atanf_3u.c | 6 -
.../arm-optimized-routines/pl/math/s_atanh_3u5.c | 6 -
.../arm-optimized-routines/pl/math/s_atanhf_3u1.c | 6 -
contrib/arm-optimized-routines/pl/math/s_cbrt_2u.c | 6 -
.../arm-optimized-routines/pl/math/s_cbrtf_1u5.c | 6 -
contrib/arm-optimized-routines/pl/math/s_cosh_2u.c | 6 -
.../arm-optimized-routines/pl/math/s_coshf_2u4.c | 6 -
contrib/arm-optimized-routines/pl/math/s_erf_2u.c | 6 -
contrib/arm-optimized-routines/pl/math/s_erfc_4u.c | 6 -
.../arm-optimized-routines/pl/math/s_erfcf_1u.c | 6 -
.../arm-optimized-routines/pl/math/s_erff_1u5.c | 6 -
.../arm-optimized-routines/pl/math/s_exp_tail.c | 6 -
contrib/arm-optimized-routines/pl/math/s_expf.c | 6 -
.../arm-optimized-routines/pl/math/s_expm1_2u5.c | 6 -
.../arm-optimized-routines/pl/math/s_expm1f_1u6.c | 6 -
.../arm-optimized-routines/pl/math/s_log10_2u5.c | 6 -
.../arm-optimized-routines/pl/math/s_log10f_3u5.c | 6 -
.../arm-optimized-routines/pl/math/s_log1p_2u5.c | 6 -
.../arm-optimized-routines/pl/math/s_log1pf_2u1.c | 6 -
contrib/arm-optimized-routines/pl/math/s_log2_3u.c | 6 -
.../arm-optimized-routines/pl/math/s_log2f_2u5.c | 6 -
contrib/arm-optimized-routines/pl/math/s_sinh_3u.c | 6 -
.../arm-optimized-routines/pl/math/s_sinhf_2u3.c | 6 -
contrib/arm-optimized-routines/pl/math/s_tan_3u5.c | 6 -
.../arm-optimized-routines/pl/math/s_tanf_3u5.c | 6 -
contrib/arm-optimized-routines/pl/math/s_tanh_3u.c | 6 -
.../arm-optimized-routines/pl/math/s_tanhf_2u6.c | 6 -
contrib/arm-optimized-routines/pl/math/sinh_3u.c | 9 +-
contrib/arm-optimized-routines/pl/math/sinhf_2u3.c | 9 +-
contrib/arm-optimized-routines/pl/math/sinpi_3u.c | 90 +
.../arm-optimized-routines/pl/math/sinpif_2u5.c | 83 +
.../arm-optimized-routines/pl/math/sv_acos_2u.c | 91 +
.../arm-optimized-routines/pl/math/sv_acosf_1u4.c | 84 +
.../arm-optimized-routines/pl/math/sv_acosh_3u5.c | 50 +
.../arm-optimized-routines/pl/math/sv_acoshf_2u8.c | 47 +
.../arm-optimized-routines/pl/math/sv_asin_3u.c | 84 +
.../arm-optimized-routines/pl/math/sv_asinf_2u5.c | 76 +
.../arm-optimized-routines/pl/math/sv_asinh_3u0.c | 129 +
.../arm-optimized-routines/pl/math/sv_asinhf_2u5.c | 55 +
.../arm-optimized-routines/pl/math/sv_atan2_2u5.c | 111 +-
.../arm-optimized-routines/pl/math/sv_atan2f_3u.c | 112 +-
.../arm-optimized-routines/pl/math/sv_atan_2u5.c | 77 +-
.../pl/math/sv_atan_common.h | 61 -
.../arm-optimized-routines/pl/math/sv_atanf_2u9.c | 69 +-
.../pl/math/sv_atanf_common.h | 47 -
.../arm-optimized-routines/pl/math/sv_atanh_3u3.c | 60 +
.../arm-optimized-routines/pl/math/sv_atanhf_2u8.c | 56 +
.../arm-optimized-routines/pl/math/sv_cbrt_2u.c | 122 +
.../arm-optimized-routines/pl/math/sv_cbrtf_1u7.c | 116 +
.../arm-optimized-routines/pl/math/sv_cexpi_3u5.c | 45 +
.../arm-optimized-routines/pl/math/sv_cexpif_1u8.c | 47 +
.../arm-optimized-routines/pl/math/sv_cos_2u5.c | 104 +-
.../arm-optimized-routines/pl/math/sv_cosf_2u1.c | 94 +-
.../arm-optimized-routines/pl/math/sv_cosh_2u.c | 100 +
.../arm-optimized-routines/pl/math/sv_coshf_2u.c | 56 +
.../arm-optimized-routines/pl/math/sv_cospi_3u2.c | 63 +
.../arm-optimized-routines/pl/math/sv_cospif_2u6.c | 59 +
.../arm-optimized-routines/pl/math/sv_erf_2u5.c | 111 +
contrib/arm-optimized-routines/pl/math/sv_erf_3u.c | 103 -
.../arm-optimized-routines/pl/math/sv_erf_data.c | 1558 +++++++++
.../arm-optimized-routines/pl/math/sv_erfc_1u8.c | 164 +
.../arm-optimized-routines/pl/math/sv_erfc_4u.c | 146 -
.../arm-optimized-routines/pl/math/sv_erfcf_1u7.c | 111 +
.../arm-optimized-routines/pl/math/sv_erff_1u3.c | 104 -
.../arm-optimized-routines/pl/math/sv_erff_2u.c | 90 +
.../arm-optimized-routines/pl/math/sv_erff_data.c | 1046 ++++++
.../arm-optimized-routines/pl/math/sv_exp10_1u5.c | 122 +
.../arm-optimized-routines/pl/math/sv_exp10f_1u5.c | 87 +
.../arm-optimized-routines/pl/math/sv_exp2_2u.c | 107 +
.../arm-optimized-routines/pl/math/sv_exp2f_1u6.c | 80 +
.../arm-optimized-routines/pl/math/sv_exp_1u5.c | 137 +
.../arm-optimized-routines/pl/math/sv_exp_tail.h | 79 -
.../arm-optimized-routines/pl/math/sv_expf_2u.c | 180 +-
.../arm-optimized-routines/pl/math/sv_expf_data.c | 12 -
.../pl/math/sv_expf_inline.h | 66 +
.../arm-optimized-routines/pl/math/sv_expm1_2u5.c | 95 +
.../arm-optimized-routines/pl/math/sv_expm1f_1u6.c | 93 +
.../pl/math/sv_expm1f_inline.h | 73 +
.../arm-optimized-routines/pl/math/sv_hypot_1u5.c | 51 +
.../arm-optimized-routines/pl/math/sv_hypotf_1u5.c | 45 +
.../arm-optimized-routines/pl/math/sv_log10_2u5.c | 94 +-
.../arm-optimized-routines/pl/math/sv_log10f_3u5.c | 119 +-
.../arm-optimized-routines/pl/math/sv_log1p_2u5.c | 116 +
.../pl/math/sv_log1p_inline.h | 96 +
.../arm-optimized-routines/pl/math/sv_log1pf_1u3.c | 97 +
.../pl/math/sv_log1pf_inline.h | 65 +
.../arm-optimized-routines/pl/math/sv_log2_3u.c | 94 +-
.../arm-optimized-routines/pl/math/sv_log2f_2u5.c | 99 +-
.../arm-optimized-routines/pl/math/sv_log_2u5.c | 101 +-
.../arm-optimized-routines/pl/math/sv_log_data.c | 146 -
.../arm-optimized-routines/pl/math/sv_logf_3u4.c | 99 +-
.../arm-optimized-routines/pl/math/sv_logf_data.c | 12 -
contrib/arm-optimized-routines/pl/math/sv_math.h | 220 +-
.../arm-optimized-routines/pl/math/sv_pow_1u5.c | 444 +++
.../arm-optimized-routines/pl/math/sv_powf_2u6.c | 360 ++
contrib/arm-optimized-routines/pl/math/sv_powi.c | 25 +-
contrib/arm-optimized-routines/pl/math/sv_powif.c | 26 +-
contrib/arm-optimized-routines/pl/math/sv_sin_3u.c | 89 -
.../arm-optimized-routines/pl/math/sv_sin_3u5.c | 96 +
.../arm-optimized-routines/pl/math/sv_sincos_3u5.c | 61 +
.../pl/math/sv_sincos_common.h | 85 +
.../pl/math/sv_sincosf_1u8.c | 62 +
.../pl/math/sv_sincosf_common.h | 81 +
.../arm-optimized-routines/pl/math/sv_sinf_1u9.c | 103 +-
.../pl/math/sv_sinf_poly_data.c | 19 -
.../arm-optimized-routines/pl/math/sv_sinh_3u.c | 103 +
.../arm-optimized-routines/pl/math/sv_sinhf_2u3.c | 64 +
.../arm-optimized-routines/pl/math/sv_sinpi_3u1.c | 57 +
.../arm-optimized-routines/pl/math/sv_sinpif_2u5.c | 53 +
.../arm-optimized-routines/pl/math/sv_tan_3u5.c | 99 +
.../arm-optimized-routines/pl/math/sv_tanf_3u5.c | 141 +-
.../arm-optimized-routines/pl/math/sv_tanh_3u.c | 96 +
.../arm-optimized-routines/pl/math/sv_tanhf_2u6.c | 59 +
contrib/arm-optimized-routines/pl/math/tanf_3u3.c | 27 +-
contrib/arm-optimized-routines/pl/math/tanh_3u.c | 22 +-
contrib/arm-optimized-routines/pl/math/tanhf_2u6.c | 9 +-
.../pl/math/test/mathbench_funcs.h | 55 +-
.../pl/math/test/mathbench_wrappers.h | 159 +-
.../arm-optimized-routines/pl/math/test/pl_test.h | 24 +-
.../arm-optimized-routines/pl/math/test/runulp.sh | 56 +-
.../pl/math/test/testcases/directed/acos.tst | 17 +
.../pl/math/test/testcases/directed/acosf.tst | 21 +
.../pl/math/test/testcases/directed/asin.tst | 24 +
.../pl/math/test/testcases/directed/asinf.tst | 24 +
.../pl/math/test/ulp_funcs.h | 54 +-
.../pl/math/test/ulp_wrappers.h | 78 +-
.../pl/math/tools/asin.sollya | 29 +
.../pl/math/tools/asinf.sollya | 36 +
.../pl/math/tools/erf.sollya | 25 +
.../pl/math/tools/erfc.sollya | 60 +-
.../pl/math/tools/erfcf.sollya | 41 +-
.../pl/math/tools/erff.sollya | 20 +
.../pl/math/tools/exp10.sollya | 55 +
.../pl/math/tools/sincos.sollya | 33 +
.../pl/math/tools/sincosf.sollya | 33 +
.../pl/math/tools/sinpi.sollya | 33 +
.../pl/math/trigpi_references.c | 57 +
contrib/arm-optimized-routines/pl/math/v_acos_2u.c | 122 +
.../arm-optimized-routines/pl/math/v_acosf_1u4.c | 113 +
.../arm-optimized-routines/pl/math/v_acosh_3u5.c | 63 +-
.../arm-optimized-routines/pl/math/v_acoshf_3u1.c | 70 +-
contrib/arm-optimized-routines/pl/math/v_asin_3u.c | 113 +
.../arm-optimized-routines/pl/math/v_asinf_2u5.c | 104 +
.../arm-optimized-routines/pl/math/v_asinh_3u5.c | 176 +-
.../arm-optimized-routines/pl/math/v_asinhf_2u7.c | 78 +-
.../arm-optimized-routines/pl/math/v_atan2_3u.c | 117 +-
.../arm-optimized-routines/pl/math/v_atan2f_3u.c | 112 +-
.../arm-optimized-routines/pl/math/v_atan_2u5.c | 98 +-
.../arm-optimized-routines/pl/math/v_atanf_3u.c | 96 +-
.../arm-optimized-routines/pl/math/v_atanh_3u5.c | 69 +-
.../arm-optimized-routines/pl/math/v_atanhf_3u1.c | 73 +-
contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c | 100 +-
.../arm-optimized-routines/pl/math/v_cbrtf_1u5.c | 96 -
.../arm-optimized-routines/pl/math/v_cbrtf_1u7.c | 116 +
.../arm-optimized-routines/pl/math/v_cexpi_3u5.c | 45 +
.../arm-optimized-routines/pl/math/v_cexpif_1u8.c | 47 +
contrib/arm-optimized-routines/pl/math/v_cosh_2u.c | 130 +-
.../arm-optimized-routines/pl/math/v_coshf_2u4.c | 76 +-
.../arm-optimized-routines/pl/math/v_cospi_3u1.c | 86 +
.../arm-optimized-routines/pl/math/v_cospif_3u2.c | 83 +
contrib/arm-optimized-routines/pl/math/v_erf_2u.c | 116 -
contrib/arm-optimized-routines/pl/math/v_erf_2u5.c | 158 +
.../arm-optimized-routines/pl/math/v_erf_data.c | 119 -
.../arm-optimized-routines/pl/math/v_erfc_1u8.c | 198 ++
contrib/arm-optimized-routines/pl/math/v_erfc_4u.c | 168 -
.../arm-optimized-routines/pl/math/v_erfc_data.c | 96 -
.../arm-optimized-routines/pl/math/v_erfcf_1u.c | 183 -
.../arm-optimized-routines/pl/math/v_erfcf_1u7.c | 166 +
.../arm-optimized-routines/pl/math/v_erff_1u5.c | 116 -
contrib/arm-optimized-routines/pl/math/v_erff_2u.c | 118 +
.../arm-optimized-routines/pl/math/v_erff_data.c | 18 -
.../arm-optimized-routines/pl/math/v_erfinv_25u.c | 161 +
.../arm-optimized-routines/pl/math/v_erfinvf_5u.c | 163 +
.../arm-optimized-routines/pl/math/v_exp10_2u.c | 144 +
.../arm-optimized-routines/pl/math/v_exp10f_2u4.c | 138 +
contrib/arm-optimized-routines/pl/math/v_exp2_2u.c | 128 +
.../arm-optimized-routines/pl/math/v_exp_data.c | 55 +
.../arm-optimized-routines/pl/math/v_exp_tail.c | 75 -
.../pl/math/v_exp_tail_data.c | 179 +-
.../pl/math/v_exp_tail_inline.h | 102 +
contrib/arm-optimized-routines/pl/math/v_expf.c | 83 -
.../arm-optimized-routines/pl/math/v_expf_inline.h | 60 +
.../arm-optimized-routines/pl/math/v_expm1_2u5.c | 139 +-
.../arm-optimized-routines/pl/math/v_expm1f_1u6.c | 123 +-
.../pl/math/v_expm1f_inline.h | 56 +-
.../arm-optimized-routines/pl/math/v_hypot_1u5.c | 95 +
.../arm-optimized-routines/pl/math/v_hypotf_1u5.c | 94 +
.../arm-optimized-routines/pl/math/v_log10_2u5.c | 140 +-
.../arm-optimized-routines/pl/math/v_log10_data.c | 298 +-
.../arm-optimized-routines/pl/math/v_log10f_3u5.c | 114 +-
.../arm-optimized-routines/pl/math/v_log10f_data.c | 13 -
.../arm-optimized-routines/pl/math/v_log1p_2u5.c | 144 +-
.../pl/math/v_log1p_inline.h | 82 +-
.../arm-optimized-routines/pl/math/v_log1pf_2u1.c | 174 +-
.../pl/math/v_log1pf_inline.h | 74 +-
contrib/arm-optimized-routines/pl/math/v_log2_3u.c | 133 +-
.../arm-optimized-routines/pl/math/v_log2_data.c | 278 +-
.../arm-optimized-routines/pl/math/v_log2f_2u5.c | 93 +-
.../arm-optimized-routines/pl/math/v_log2f_data.c | 15 -
.../arm-optimized-routines/pl/math/v_log_data.c | 161 +
.../arm-optimized-routines/pl/math/v_log_inline.h | 104 +
.../arm-optimized-routines/pl/math/v_logf_inline.h | 59 +
contrib/arm-optimized-routines/pl/math/v_math.h | 874 +----
contrib/arm-optimized-routines/pl/math/v_pow_1u5.c | 259 ++
.../v_exp_data.c => pl/math/v_pow_exp_data.c} | 164 +-
.../pl/math/v_pow_log_data.c | 174 +
.../arm-optimized-routines/pl/math/v_powf_data.c | 89 +
.../arm-optimized-routines/pl/math/v_sincos_3u5.c | 57 +
.../pl/math/v_sincos_common.h | 86 +
.../arm-optimized-routines/pl/math/v_sincosf_1u8.c | 58 +
.../pl/math/v_sincosf_common.h | 84 +
contrib/arm-optimized-routines/pl/math/v_sinh_3u.c | 120 +-
.../arm-optimized-routines/pl/math/v_sinhf_2u3.c | 91 +-
.../arm-optimized-routines/pl/math/v_sinpi_3u1.c | 86 +
.../arm-optimized-routines/pl/math/v_sinpif_3u.c | 81 +
contrib/arm-optimized-routines/pl/math/v_tan_3u5.c | 124 +-
.../arm-optimized-routines/pl/math/v_tan_data.c | 15 -
.../arm-optimized-routines/pl/math/v_tanf_3u5.c | 134 +-
contrib/arm-optimized-routines/pl/math/v_tanh_3u.c | 112 +-
.../arm-optimized-routines/pl/math/v_tanhf_2u6.c | 80 +-
.../arm-optimized-routines/pl/math/vn_acosh_3u5.c | 12 -
.../arm-optimized-routines/pl/math/vn_acoshf_3u1.c | 12 -
.../arm-optimized-routines/pl/math/vn_asinh_3u5.c | 12 -
.../arm-optimized-routines/pl/math/vn_asinhf_2u7.c | 12 -
.../arm-optimized-routines/pl/math/vn_atan2_3u.c | 12 -
.../arm-optimized-routines/pl/math/vn_atan2f_3u.c | 12 -
.../arm-optimized-routines/pl/math/vn_atan_2u5.c | 12 -
.../arm-optimized-routines/pl/math/vn_atanf_3u.c | 12 -
.../arm-optimized-routines/pl/math/vn_atanh_3u5.c | 12 -
.../arm-optimized-routines/pl/math/vn_atanhf_3u1.c | 12 -
.../arm-optimized-routines/pl/math/vn_cbrt_2u.c | 12 -
.../arm-optimized-routines/pl/math/vn_cbrtf_1u5.c | 12 -
.../arm-optimized-routines/pl/math/vn_cosh_2u.c | 12 -
.../arm-optimized-routines/pl/math/vn_coshf_2u4.c | 12 -
contrib/arm-optimized-routines/pl/math/vn_erf_2u.c | 12 -
.../arm-optimized-routines/pl/math/vn_erfc_4u.c | 12 -
.../arm-optimized-routines/pl/math/vn_erfcf_1u.c | 12 -
.../arm-optimized-routines/pl/math/vn_erff_1u5.c | 12 -
.../arm-optimized-routines/pl/math/vn_exp_tail.c | 11 -
contrib/arm-optimized-routines/pl/math/vn_expf.c | 12 -
.../arm-optimized-routines/pl/math/vn_expm1_2u5.c | 12 -
.../arm-optimized-routines/pl/math/vn_expm1f_1u6.c | 12 -
.../arm-optimized-routines/pl/math/vn_log10_2u5.c | 12 -
.../arm-optimized-routines/pl/math/vn_log10f_3u5.c | 12 -
.../arm-optimized-routines/pl/math/vn_log1p_2u5.c | 12 -
.../arm-optimized-routines/pl/math/vn_log1pf_2u1.c | 12 -
.../arm-optimized-routines/pl/math/vn_log2_3u.c | 12 -
.../arm-optimized-routines/pl/math/vn_log2f_2u5.c | 12 -
.../arm-optimized-routines/pl/math/vn_sinh_3u.c | 12 -
.../arm-optimized-routines/pl/math/vn_sinhf_2u3.c | 12 -
.../arm-optimized-routines/pl/math/vn_tan_3u5.c | 12 -
.../arm-optimized-routines/pl/math/vn_tanf_3u5.c | 12 -
.../arm-optimized-routines/pl/math/vn_tanh_3u.c | 12 -
.../arm-optimized-routines/pl/math/vn_tanhf_2u6.c | 12 -
.../string/aarch64/asmdefs.h | 14 +
.../string/aarch64/memcpy-advsimd.S | 62 +-
.../string/aarch64/memcpy-mops.S | 21 +
.../string/aarch64/memmove-mops.S | 21 +
.../string/aarch64/memset-mops.S | 20 +
.../arm-optimized-routines/string/bench/memcpy.c | 5 +-
.../string/include/stringlib.h | 7 +-
.../arm-optimized-routines/string/test/memcpy.c | 5 +-
.../arm-optimized-routines/string/test/memmove.c | 5 +-
.../arm-optimized-routines/string/test/memset.c | 5 +-
414 files changed, 26613 insertions(+), 10731 deletions(-)
diff --git a/contrib/arm-optimized-routines/README b/contrib/arm-optimized-routines/README
index a2143a28488a..651ebdc84bc8 100644
--- a/contrib/arm-optimized-routines/README
+++ b/contrib/arm-optimized-routines/README
@@ -12,7 +12,7 @@ contribution requirements are documented in README.contributors of
the appropriate subdirectory.
Regular quarterly releases are tagged as vYY.MM, the latest
-release is v23.01.
+release is v24.01.
Source code layout:
diff --git a/contrib/arm-optimized-routines/config.mk.dist b/contrib/arm-optimized-routines/config.mk.dist
index 7a8497507a81..03fb54db52fa 100644
--- a/contrib/arm-optimized-routines/config.mk.dist
+++ b/contrib/arm-optimized-routines/config.mk.dist
@@ -1,6 +1,6 @@
# Example config.mk
#
-# Copyright (c) 2018-2022, Arm Limited.
+# Copyright (c) 2018-2023, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
# Subprojects to build
@@ -59,13 +59,14 @@ math-cflags += -ffp-contract=fast -fno-math-errno
# Use with clang.
#math-cflags += -ffp-contract=fast
-# Disable vector math code
-#math-cflags += -DWANT_VMATH=0
-
-# Disable/enable SVE vector math code and tests
+# Disable/enable SVE vector math code and tests.
+# If WANT_SVE_MATH is enabled, math-sve-cflags is added for SVE
+# routines only so that SVE code does not leak into scalar
+# routines. It is also necessary to add it for tools (e.g. ulp,
+# mathbench)
WANT_SVE_MATH = 0
ifeq ($(WANT_SVE_MATH), 1)
- math-cflags += -march=armv8.2-a+sve
+ math-sve-cflags = -march=armv8-a+sve
endif
math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH)
diff --git a/contrib/arm-optimized-routines/math/Dir.mk b/contrib/arm-optimized-routines/math/Dir.mk
index 2a9cad10d96a..5e9494a7bd3c 100644
--- a/contrib/arm-optimized-routines/math/Dir.mk
+++ b/contrib/arm-optimized-routines/math/Dir.mk
@@ -1,12 +1,14 @@
# Makefile fragment - requires GNU make
#
-# Copyright (c) 2019-2022, Arm Limited.
+# Copyright (c) 2019-2023, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
S := $(srcdir)/math
B := build/math
math-lib-srcs := $(wildcard $(S)/*.[cS])
+math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS])
+
math-test-srcs := \
$(S)/test/mathtest.c \
$(S)/test/mathbench.c \
@@ -65,6 +67,8 @@ build/lib/libmathlib.a: $(math-lib-objs)
$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
$(math-tools): LDLIBS += $(math-ldlibs) -lm
+# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled
+$(math-tools): CFLAGS_ALL += $(math-sve-cflags)
build/bin/rtest: $(math-host-objs)
$(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS)
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_cos.c b/contrib/arm-optimized-routines/math/aarch64/v_cos.c
new file mode 100644
index 000000000000..9a73575bce89
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_cos.c
@@ -0,0 +1,87 @@
+/*
+ * Double-precision vector cos function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ float64x2_t poly[7];
+ float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
+} data = {
+ /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
+ .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+ V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+ V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+ V2 (-0x1.9e9540300a1p-41) },
+ .inv_pi = V2 (0x1.45f306dc9c883p-2),
+ .half_pi = V2 (0x1.921fb54442d18p+0),
+ .pi_1 = V2 (0x1.921fb54442d18p+1),
+ .pi_2 = V2 (0x1.1a62633145c06p-53),
+ .pi_3 = V2 (0x1.c1cd129024e09p-106),
+ .shift = V2 (0x1.8p52),
+ .range_val = V2 (0x1p23)
+};
+
+#define C(i) d->poly[i]
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+ return v_call_f64 (cos, x, y, cmp);
+}
+
+float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t n, r, r2, r3, r4, t1, t2, t3, y;
+ uint64x2_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ r = vabsq_f64 (x);
+ cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r),
+ vreinterpretq_u64_f64 (d->range_val));
+ if (unlikely (v_any_u64 (cmp)))
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+ special-case handler later. */
+ r = vbslq_f64 (cmp, v_f64 (1.0), r);
+#else
+ cmp = vcageq_f64 (x, d->range_val);
+ r = x;
+#endif
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
+ n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
+ odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
+ n = vsubq_f64 (n, d->shift);
+ n = vsubq_f64 (n, v_f64 (0.5));
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f64 (r, d->pi_1, n);
+ r = vfmsq_f64 (r, d->pi_2, n);
+ r = vfmsq_f64 (r, d->pi_3, n);
+
+ /* sin(r) poly approx. */
+ r2 = vmulq_f64 (r, r);
+ r3 = vmulq_f64 (r2, r);
+ r4 = vmulq_f64 (r2, r2);
+
+ t1 = vfmaq_f64 (C (4), C (5), r2);
+ t2 = vfmaq_f64 (C (2), C (3), r2);
+ t3 = vfmaq_f64 (C (0), C (1), r2);
+
+ y = vfmaq_f64 (t1, C (6), r4);
+ y = vfmaq_f64 (t2, y, r4);
+ y = vfmaq_f64 (t3, y, r4);
+ y = vfmaq_f64 (r, y, r3);
+
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_cosf.c b/contrib/arm-optimized-routines/math/aarch64/v_cosf.c
new file mode 100644
index 000000000000..b9890b2998ad
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_cosf.c
@@ -0,0 +1,82 @@
+/*
+ * Single-precision vector cos function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ float32x4_t poly[4];
+ float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
+} data = {
+ /* 1.886 ulp error. */
+ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
+ V4 (0x1.5b2e76p-19f) },
+
+ .pi_1 = V4 (0x1.921fb6p+1f),
+ .pi_2 = V4 (-0x1.777a5cp-24f),
+ .pi_3 = V4 (-0x1.ee59dap-49f),
+
+ .inv_pi = V4 (0x1.45f306p-2f),
+ .shift = V4 (0x1.8p+23f),
+ .half_pi = V4 (0x1.921fb6p0f),
+ .range_val = V4 (0x1p20f)
+};
+
+#define C(i) d->poly[i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+ return v_call_f32 (cosf, x, y, cmp);
+}
+
+float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, r, r2, r3, y;
+ uint32x4_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ r = vabsq_f32 (x);
+ cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r),
+ vreinterpretq_u32_f32 (d->range_val));
+ if (unlikely (v_any_u32 (cmp)))
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+ special-case handler later. */
+ r = vbslq_f32 (cmp, v_f32 (1.0f), r);
+#else
+ cmp = vcageq_f32 (x, d->range_val);
+ r = x;
+#endif
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
+ n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
+ odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
+ n = vsubq_f32 (n, d->shift);
+ n = vsubq_f32 (n, v_f32 (0.5f));
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f32 (r, d->pi_1, n);
+ r = vfmsq_f32 (r, d->pi_2, n);
+ r = vfmsq_f32 (r, d->pi_3, n);
+
+ /* y = sin(r). */
+ r2 = vmulq_f32 (r, r);
+ r3 = vmulq_f32 (r2, r);
+ y = vfmaq_f32 (C (2), C (3), r2);
+ y = vfmaq_f32 (C (1), y, r2);
+ y = vfmaq_f32 (C (0), y, r2);
+ y = vfmaq_f32 (r, y, r3);
+
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp.c b/contrib/arm-optimized-routines/math/aarch64/v_exp.c
new file mode 100644
index 000000000000..bc5609faf4fc
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_exp.c
@@ -0,0 +1,125 @@
+/*
+ * Double-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+
+const static volatile struct
+{
+ float64x2_t poly[3];
+ float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
+#if !WANT_SIMD_EXCEPT
+ float64x2_t special_bound, scale_thresh;
+#endif
+} data = {
+ /* maxerr: 1.88 +0.5 ulp
+ rel error: 1.4337*2^-53
+ abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
+ .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3),
+ V2 (0x1.55555da646206p-5) },
+#if !WANT_SIMD_EXCEPT
+ .scale_thresh = V2 (163840.0), /* 1280.0 * N. */
+ .special_bound = V2 (704.0),
+#endif
+ .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */
+ .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */
+ .ln2_lo = V2 (0x1.abc9e3b39803f3p-63),
+ .shift = V2 (0x1.8p+52)
+};
+
+#define C(i) data.poly[i]
+#define Tab __v_exp_data
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
+# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
+# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine to special lanes. */
+ return v_call_f64 (exp, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
+/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
+# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
+# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n)
+{
+ /* 2^(n/N) may overflow, break it up into s1*s2. */
+ uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
+ float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
+ float64x2_t s2 = vreinterpretq_f64_u64 (
+ vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
+ uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh);
+ float64x2_t r1 = vmulq_f64 (s1, s1);
+ float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
+ return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
+{
+ float64x2_t n, r, r2, s, y, z;
+ uint64x2_t cmp, u, e;
+
+#if WANT_SIMD_EXCEPT
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special_case to fix special lanes later. This is only necessary if fenv
+ exceptions are to be triggered correctly. */
+ float64x2_t xm = x;
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound);
+ if (unlikely (v_any_u64 (cmp)))
+ x = vbslq_f64 (cmp, v_f64 (1), x);
+#else
+ cmp = vcagtq_f64 (x, data.special_bound);
+#endif
+
+ /* n = round(x/(ln2/N)). */
+ z = vfmaq_f64 (data.shift, x, data.inv_ln2);
+ u = vreinterpretq_u64_f64 (z);
+ n = vsubq_f64 (z, data.shift);
+
+ /* r = x - n*ln2/N. */
+ r = x;
+ r = vfmsq_f64 (r, data.ln2_hi, n);
+ r = vfmsq_f64 (r, data.ln2_lo, n);
+
+ e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */
+ r2 = vmulq_f64 (r, r);
+ y = vfmaq_f64 (C (0), C (1), r);
+ y = vfmaq_f64 (y, C (2), r2);
+ y = vfmaq_f64 (r, y, r2);
+
+ /* s = 2^(n/N). */
+ u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] };
+ s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ if (unlikely (v_any_u64 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f64 (s, y, s), cmp);
+#else
+ return special_case (s, y, n);
+#endif
+
+ return vfmaq_f64 (s, y, s);
+}
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c b/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c
new file mode 100644
index 000000000000..e402205e98e6
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c
@@ -0,0 +1,113 @@
+/*
+ * Single-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ float32x4_t poly[5];
+ uint32x4_t exponent_bias;
+#if !WANT_SIMD_EXCEPT
+ float32x4_t special_bound, scale_thresh;
+#endif
+} data = {
+ /* maxerr: 1.962 ulp. */
+ .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
+ V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
+ .exponent_bias = V4 (0x3f800000),
+#if !WANT_SIMD_EXCEPT
+ .special_bound = V4 (126.0f),
+ .scale_thresh = V4 (192.0f),
+#endif
+};
+
+#define C(i) d->poly[i]
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
+# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
+# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f32 (exp2f, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u32 (0x82000000)
+# define SpecialBias v_u32 (0x7f000000)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+ float32x4_t scale, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r2 = vmulq_f32 (s1, s1);
+ float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+ float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+ return vbslq_f32 (cmp2, r2, r);
+}
+
+#endif
+
+float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, r, r2, scale, p, q, poly;
+ uint32x4_t cmp, e;
+
+#if WANT_SIMD_EXCEPT
+ /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
+ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special_case to fix special lanes later. This is only necessary if fenv
+ exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = vbslq_f32 (cmp, v_f32 (1), x);
+#endif
+
+ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ n = vrndaq_f32 (x);
+ r = vsubq_f32 (x, n);
+ e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
+ scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+
+#if !WANT_SIMD_EXCEPT
+ cmp = vcagtq_f32 (n, d->special_bound);
+#endif
+
+ r2 = vmulq_f32 (r, r);
+ p = vfmaq_f32 (C (1), C (0), r);
+ q = vfmaq_f32 (C (3), C (2), r);
+ q = vfmaq_f32 (q, p, r2);
+ p = vmulq_f32 (C (4), r);
+ poly = vfmaq_f32 (p, q, r2);
+
+ if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
+#else
+ return special_case (poly, n, e, cmp, scale, d);
+#endif
+
+ return vfmaq_f32 (scale, poly, scale);
+}
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c b/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c
new file mode 100644
index 000000000000..ba6b02fbb4bc
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c
@@ -0,0 +1,72 @@
+/*
+ * Single-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const float Poly[] = {
+ /* maxerr: 0.878 ulp. */
+ 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f
+};
+#define C0 v_f32 (Poly[0])
+#define C1 v_f32 (Poly[1])
+#define C2 v_f32 (Poly[2])
+#define C3 v_f32 (Poly[3])
+#define C4 v_f32 (Poly[4])
+#define C5 v_f32 (Poly[5])
+
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define Ln2hi v_f32 (0x1.62e4p-1f)
+#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
+
+static float32x4_t VPCS_ATTR NOINLINE
+specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
+ float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b);
+ float32x4_t s2 = vreinterpretq_f32_u32 (e - b);
+ uint32x4_t cmp = absn > v_f32 (192.0f);
+ float32x4_t r1 = s1 * s1;
+ float32x4_t r0 = poly * s1 * s2;
+ return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
+ | (~cmp & vreinterpretq_u32_f32 (r0)));
+}
+
+float32x4_t VPCS_ATTR
+_ZGVnN4v_exp2f_1u (float32x4_t x)
+{
*** 42574 LINES SKIPPED ***