
x86: Improve svml_s_atanhf16_core_avx512.S

Improvements are:
    1. Reduce code size (-64 bytes).
    2. Remove redundant move instructions.
    3. Slightly improve instruction selection/scheduling where
       possible.
    4. Reduce rodata size ([-128, -188] bytes).

The throughput improvement is not significant as the port 0 bottleneck
is unavoidable.

        Function, New Time, Old Time, New / Old
_ZGVeN16v_atanhf,     1.39,    1.408,     0.987
Noah Goldstein, 4 years ago
parent commit 73bae395cf
sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S | 474 lines changed
@@ -31,53 +31,50 @@
*
*/
/* Offsets for data table __svml_satanh_data_internal_avx512
*/
#define Log_tbl_H 0
#define Log_tbl_L 128
#define One 256
#define AbsMask 320
#define AddB5 384
#define RcpBitMask 448
#define poly_coeff3 512
#define poly_coeff2 576
#define poly_coeff1 640
#define poly_coeff0 704
#define Half 768
#define L2H 832
#define L2L 896
/* Offsets for data table __svml_satanh_data_internal_avx512 and
__svml_satanh_data_internal_avx512_al64. Ordered by use in the
function. On cold-starts this might help the prefetcher. Possibly
a better idea is to interleave start/end so that the prefetcher is
less likely to detect a stream and pull irrelevant lines into
cache. */
/* Offset into __svml_satanh_data_internal_avx512. 4-byte aligned as
the memory is broadcast to {1to16}. */
#define AbsMask 0
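(Storing AbsMask as a single 4-byte scalar and applying it with the {1to16}
embedded broadcast is one source of the rodata savings: the old layout spent a
full 64-byte row on it. A minimal C sketch of what the broadcast vandps
computes per lane; the helper name is illustrative, not from the source.

#include <stdint.h>
#include <string.h>

/* Per-lane model of `vandps AbsMask(%rip){1to16}`: clearing the sign
   bit yields |x|.  */
static float absmask_lane(float x)
{
    uint32_t u;
    memcpy(&u, &x, sizeof(u));
    u &= 0x7fffffffu;           /* AbsMask */
    memcpy(&x, &u, sizeof(x));
    return x;
}
)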
/* Offset into __svml_satanh_data_internal_avx512_al64. The full value
is used here. */
#define One 0
#define AddB5 64
#define RcpBitMask 128
#define Log_tbl_L_lo 192
#define Log_tbl_L_hi 256
#define Log_tbl_H_lo 320
#define Log_tbl_H_hi 384
#define L2H 448
#define L2L 512
#define poly_coeff3 576
#define poly_coeff2 640
#define poly_coeff1 704
#include <sysdep.h>
#define ATANHF_DATA(x) ((x)+__svml_satanh_data_internal_avx512_al64)
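(For example, `ATANHF_DATA(One)(%rip)` expands to
`(One)+__svml_satanh_data_internal_avx512_al64(%rip)`, a RIP-relative
reference into the 64-byte-aligned table.)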
.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_atanhf_skx)
pushq %rbp
cfi_def_cfa_offset(16)
movq %rsp, %rbp
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
andq $-64, %rsp
subq $192, %rsp
vmovups One+__svml_satanh_data_internal_avx512(%rip), %zmm4
/* round reciprocals to 1+5b mantissas */
vmovups AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
vmovups RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
vmovaps %zmm0, %zmm11
vandps AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
vandps AbsMask+__svml_satanh_data_internal_avx512(%rip){1to16}, %zmm0, %zmm6
vmovups ATANHF_DATA(One)(%rip), %zmm4
/* 1+y */
vaddps {rn-sae}, %zmm4, %zmm6, %zmm9
/* 1-y */
vsubps {rn-sae}, %zmm6, %zmm4, %zmm8
vxorps %zmm6, %zmm11, %zmm10
/* Yp_high */
vsubps {rn-sae}, %zmm4, %zmm9, %zmm2
/* -Ym_high */
vsubps {rn-sae}, %zmm4, %zmm8, %zmm5
/* round reciprocals to 1+5b mantissas */
vmovups ATANHF_DATA(AddB5)(%rip), %zmm14
vmovups ATANHF_DATA(RcpBitMask)(%rip), %zmm1
/* RcpP ~ 1/Yp */
vrcp14ps %zmm9, %zmm12
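(For orientation: the kernel evaluates the identity
atanh(y) = 0.5 * (log(1+y) - log(1-y)), computing each log from a
5-bit-rounded reciprocal, a table lookup, and a short polynomial. A scalar C
reference under that reading; purely illustrative, the function name is not
from the source.

#include <math.h>

/* Scalar shape of the vector flow: both logs of
   0.5 * (log(1+|x|) - log(1-|x|)), sign reattached at the end.  The
   kernel replaces logf with rcp14 + table + polynomial.  */
static float atanhf_ref(float x)
{
    float y = fabsf(x);
    return copysignf(0.5f * (logf(1.0f + y) - logf(1.0f - y)), x);
}
)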
@@ -85,15 +82,21 @@ ENTRY(_ZGVeN16v_atanhf_skx)
/* RcpM ~ 1/Ym */
vrcp14ps %zmm8, %zmm13
/* Yp_high */
vsubps {rn-sae}, %zmm4, %zmm9, %zmm2
/* -Ym_high */
vsubps {rn-sae}, %zmm4, %zmm8, %zmm5
/* input outside (-1, 1) ? */
vcmpps $21, {sae}, %zmm4, %zmm6, %k0
vpaddd %zmm14, %zmm12, %zmm15
vpaddd %zmm14, %zmm13, %zmm0
vpaddd %zmm14, %zmm13, %zmm12
/* Yp_low */
vsubps {rn-sae}, %zmm2, %zmm6, %zmm3
vandps %zmm1, %zmm15, %zmm7
vandps %zmm1, %zmm0, %zmm12
vandps %zmm1, %zmm12, %zmm12
/* Ym_low */
vaddps {rn-sae}, %zmm5, %zmm6, %zmm5
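(AddB5 (0x00020000) and RcpBitMask (0xfffc0000) round the rcp14 estimate to a
1+5-bit mantissa: add a rounding bit at position 17, then keep only the sign,
exponent, and top 5 mantissa bits. That keeps the reduced argument cheap to
form and yields the table index directly. A hedged scalar sketch of the bit
trick:

#include <stdint.h>
#include <string.h>

/* Round a reciprocal estimate so its mantissa keeps only 5 significant
   bits: vpaddd with AddB5 rounds at bit 18, vandps with RcpBitMask
   truncates everything below it.  */
static float round_rcp_1p5b(float rcp)
{
    uint32_t u;
    memcpy(&u, &rcp, sizeof(u));
    u = (u + 0x00020000u) & 0xfffc0000u;
    memcpy(&rcp, &u, sizeof(rcp));
    return rcp;
}
)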
@@ -102,225 +105,199 @@ ENTRY(_ZGVeN16v_atanhf_skx)
vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
vmovups Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
vmovups Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
vmovups ATANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
vmovups ATANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13
/* exponents */
vgetexpps {sae}, %zmm7, %zmm15
vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
vgetexpps {sae}, %zmm7, %zmm15
/* Table lookups */
vmovups __svml_satanh_data_internal_avx512(%rip), %zmm6
vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
vgetexpps {sae}, %zmm12, %zmm14
vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
/* Prepare table index */
vpsrld $18, %zmm7, %zmm3
vpsrld $18, %zmm12, %zmm2
vmovups Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
vmovups poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
vmovups ATANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
vmovups ATANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7
/* Km-Kp */
vmovaps %zmm3, %zmm5
vpermi2ps %zmm13, %zmm10, %zmm3
vpermt2ps %zmm13, %zmm2, %zmm10
vpermi2ps %zmm7, %zmm11, %zmm5
vpermt2ps %zmm7, %zmm2, %zmm11
vsubps {rn-sae}, %zmm15, %zmm14, %zmm1
kmovw %k0, %edx
vmovaps %zmm3, %zmm0
vpermi2ps %zmm13, %zmm8, %zmm3
vpermt2ps %zmm13, %zmm2, %zmm8
vpermi2ps %zmm7, %zmm6, %zmm0
vpermt2ps %zmm7, %zmm2, %zmm6
vsubps {rn-sae}, %zmm3, %zmm8, %zmm5
vsubps {rn-sae}, %zmm3, %zmm10, %zmm7
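(vpsrld $18 drops everything below the 5 rounded mantissa bits, and the
vpermi2ps/vpermt2ps pair then selects among 32 table entries split across two
64-byte rows. A scalar model; the array names mirror the rodata labels below:

#include <stdint.h>

/* The 5-bit index from the rounded reciprocal selects one of 32
   log-table entries; the two-source permute spans the _lo and _hi
   rows.  */
static float log_tbl_lookup(uint32_t rcp_bits, const float lo[16],
                            const float hi[16])
{
    uint32_t idx = (rcp_bits >> 18) & 0x1f;     /* vpsrld $18 */
    return idx < 16 ? lo[idx] : hi[idx - 16];
}
)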
/* K*L2H + Th */
vmovups L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
vmovups ATANHF_DATA(L2H)(%rip), %zmm2
/* K*L2L + Tl */
vmovups L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
/* polynomials */
vmovups poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
vmovups poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
vmovups ATANHF_DATA(L2L)(%rip), %zmm3
/* table values */
vsubps {rn-sae}, %zmm0, %zmm6, %zmm0
vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
vmovups poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
vmovaps %zmm3, %zmm2
vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
vsubps {rn-sae}, %zmm5, %zmm11, %zmm5
vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
/* polynomials */
vmovups ATANHF_DATA(poly_coeff3)(%rip), %zmm7
vmovups ATANHF_DATA(poly_coeff2)(%rip), %zmm10
vmovaps %zmm10, %zmm14
vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10
vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
vmovups ATANHF_DATA(poly_coeff1)(%rip), %zmm12
vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
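(Both sides share one coefficient set and each FMA chain is plain Horner; the
new layout drops poly_coeff0 and Half from rodata entirely. A scalar model of
one side with the +1 term folded in, purely illustrative (the vector code
interleaves the P and M chains so the two dependency chains overlap):

#include <stdint.h>
#include <string.h>

static float from_bits(uint32_t u)
{
    float f;
    memcpy(&f, &u, sizeof(f));
    return f;
}

/* Degree-4 log(1+r) approximation via Horner FMAs, for small r:
   r * (1 + r*(c1 + r*(c2 + r*c3))) ~= r - r^2/2 + r^3/3 - r^4/4.  */
static float log1p_poly(float r)
{
    float p = from_bits(0xbe800810);    /* poly_coeff3 ~ -1/4 */
    p = p * r + from_bits(0x3eaab11e);  /* poly_coeff2 ~  1/3 */
    p = p * r + from_bits(0xbf000000);  /* poly_coeff1 = -1/2 */
    return r + r * r * p;
}
)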
/* (K*L2L + Tl) + Rp*PolyP */
vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
vorps Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10
/* zmm12 = zmm12 & (zmm4 | zmm0). */
vpternlogq $0xe0, %zmm0, %zmm4, %zmm12
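(The vpternlog immediate is simply the truth table of the desired boolean
function over the three sources; for dst & (src1 | src2) that is 0xe0. A small
derivation sketch; the helper names are illustrative:

#include <stdint.h>
#include <stdio.h>

/* Bit i of a vpternlog imm8 is f(a, b, c) where a is bit 2 of i
   (dst), b is bit 1 (src1), and c is bit 0 (src2).  */
static uint8_t ternlog_imm(int (*f)(int, int, int))
{
    uint8_t imm = 0;
    for (int i = 0; i < 8; i++)
        imm |= (uint8_t)(f((i >> 2) & 1, (i >> 1) & 1, i & 1) << i);
    return imm;
}

static int and_or(int a, int b, int c) { return a & (b | c); }

int main(void)
{
    printf("0x%02x\n", ternlog_imm(and_or));    /* prints 0xe0 */
    return 0;
}
)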
/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
vaddps {rn-sae}, %zmm3, %zmm0, %zmm4
vmulps {rn-sae}, %zmm9, %zmm4, %zmm0
vfnmadd213ps {rn-sae}, %zmm5, %zmm8, %zmm14
vaddps {rn-sae}, %zmm14, %zmm10, %zmm8
vcmpps $21, {sae}, %zmm4, %zmm6, %k0
kmovw %k0, %edx
testl %edx, %edx
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
# LOE rbx r12 r13 r14 r15 zmm0 zmm8 zmm12
vmulps {rn-sae}, %zmm12, %zmm8, %zmm0
/* Restore registers
* and exit the function
*/
L(EXIT):
movq %rbp, %rsp
popq %rbp
cfi_def_cfa(7, 8)
cfi_restore(6)
/* No registers to restore on the fast path. */
ret
cfi_def_cfa(6, 16)
cfi_offset(6, -16)
/* Branch to process
* special inputs
*/
/* Cold case. edx has 1s where there was a special value that
needs to be handled by an atanhf call. Optimize for code size
more so than speed here. */
L(SPECIAL_VALUES_BRANCH):
vmovups %zmm11, 64(%rsp)
vmovups %zmm0, 128(%rsp)
# LOE rbx r12 r13 r14 r15 edx zmm0
xorl %eax, %eax
# LOE rbx r12 r13 r14 r15 eax edx
vzeroupper
movq %r12, 16(%rsp)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
movl %eax, %r12d
movq %r13, 8(%rsp)
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
movl %edx, %r13d
movq %r14, (%rsp)
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r15 r12d r13d
/* Range mask
 * bits check
 */
# LOE rbx rdx r12 r13 r14 r15 zmm0 zmm8 zmm12
/* Use r13 to save/restore the stack. This allows us to use rbp as a
callee-saved register, saving code size. */
pushq %r13
cfi_adjust_cfa_offset(8)
cfi_offset(r13, -16)
/* Need to save callee-saved registers to preserve state across atanhf
calls. */
pushq %rbx
cfi_adjust_cfa_offset(8)
cfi_offset(rbx, -24)
pushq %rbp
cfi_adjust_cfa_offset(8)
cfi_offset(rbp, -32)
movq %rsp, %r13
cfi_def_cfa_register(r13)
L(RANGEMASK_CHECK):
btl %r12d, %r13d
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE rbx r15 r12d r13d
/* Special inputs
* processing loop
*/
/* Align stack and make room for 2x zmm vectors. */
andq $-64, %rsp
addq $-128, %rsp
vmulps {rn-sae}, %zmm12, %zmm8, %zmm1
vmovaps %zmm1, (%rsp)
vmovaps %zmm0, 64(%rsp)
vzeroupper
/* edx has 1s where there was a special value that needs to be handled
by an atanhf call. */
movl %edx, %ebx
L(SPECIAL_VALUES_LOOP):
incl %r12d
cmpl $16, %r12d
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE rbx r15 r12d r13d
movq 16(%rsp), %r12
cfi_restore(12)
movq 8(%rsp), %r13
cfi_restore(13)
movq (%rsp), %r14
cfi_restore(14)
vmovups 128(%rsp), %zmm0
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15 zmm0
/* Scalar math function call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %r12d, %r14d
movss 64(%rsp, %r14, 4), %xmm0
# LOE rbx rbp r12 r13 r14 r15
/* Use rbp as the index for the special value that is saved across calls
to atanhf. We technically don't need a callee-saved register here as the
offset from rsp is always in [0, 56], so we could restore rsp by
realigning to 64. Essentially the tradeoff is 1 extra save/restore vs.
2 extra instructions in the loop. Realigning also costs more code size. */
xorl %ebp, %ebp
tzcntl %ebx, %ebp
/* Scalar math function call to process special input. */
movss 64(%rsp, %rbp, 4), %xmm0
call atanhf@PLT
# LOE rbx r14 r15 r12d r13d xmm0
movss %xmm0, 128(%rsp, %r14, 4)
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
# LOE rbx r15 r12d r13d
/* No good way to avoid the store-forwarding fault this will cause on
return. `lfence` avoids the SF fault but at greater cost as it
serializes the stack/callee-save restoration. */
movss %xmm0, (%rsp, %rbp, 4)
blsrl %ebx, %ebx
jnz L(SPECIAL_VALUES_LOOP)
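(The rewritten loop visits only the set bits of the mask: tzcnt finds the
lowest special lane, blsr clears it, and the loop exits when the mask reaches
zero, replacing the old 16-iteration btl scan. A scalar C equivalent of that
control flow; the names are illustrative and __builtin_ctz assumes GCC/Clang:

#include <stdint.h>

/* Mirror of the tzcntl/blsrl loop: invoke the scalar routine only for
   the lanes flagged in mask.  */
static void fixup_special_lanes(float *lanes, uint32_t mask,
                                float (*scalar_fn)(float))
{
    while (mask != 0) {
        unsigned i = (unsigned)__builtin_ctz(mask);     /* tzcntl */
        lanes[i] = scalar_fn(lanes[i]);                 /* call atanhf */
        mask &= mask - 1;                               /* blsrl */
    }
}
)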
# LOE r12 r13 r14 r15
/* All results have been written to (%rsp). */
vmovaps (%rsp), %zmm0
/* Restore rsp. */
movq %r13, %rsp
cfi_def_cfa_register(rsp)
/* Restore callee save registers. */
popq %rbp
cfi_adjust_cfa_offset(-8)
cfi_restore(rbp)
popq %rbx
cfi_adjust_cfa_offset(-8)
cfi_restore(rbx)
popq %r13
cfi_adjust_cfa_offset(-8)
cfi_restore(r13)
ret
END(_ZGVeN16v_atanhf_skx)
.section .rodata, "a"
.align 64
.align 4
#ifdef __svml_satanh_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
__declspec(align(64)) VUINT32 Log_tbl_H[32][1];
__declspec(align(64)) VUINT32 Log_tbl_L[32][1];
typedef struct {
__declspec(align(4)) VUINT32 AbsMask[1][1];
__declspec(align(64)) VUINT32 One[16][1];
__declspec(align(64)) VUINT32 AbsMask[16][1];
__declspec(align(64)) VUINT32 AddB5[16][1];
__declspec(align(64)) VUINT32 RcpBitMask[16][1];
__declspec(align(64)) VUINT32 Log_tbl_L_lo[16][1];
__declspec(align(64)) VUINT32 Log_tbl_L_hi[16][1];
__declspec(align(64)) VUINT32 Log_tbl_H_lo[16][1];
__declspec(align(64)) VUINT32 Log_tbl_H_hi[16][1];
__declspec(align(64)) VUINT32 L2H[16][1];
__declspec(align(64)) VUINT32 L2L[16][1];
__declspec(align(64)) VUINT32 poly_coeff3[16][1];
__declspec(align(64)) VUINT32 poly_coeff2[16][1];
__declspec(align(64)) VUINT32 poly_coeff1[16][1];
__declspec(align(64)) VUINT32 poly_coeff0[16][1];
__declspec(align(64)) VUINT32 Half[16][1];
__declspec(align(64)) VUINT32 L2H[16][1];
__declspec(align(64)) VUINT32 L2L[16][1];
} __svml_satanh_data_internal_avx512;
#endif
__svml_satanh_data_internal_avx512:
/* Log_tbl_H */
.long 0x00000000
.long 0x3cfc0000
.long 0x3d780000
.long 0x3db78000
.long 0x3df10000
.long 0x3e14c000
.long 0x3e300000
.long 0x3e4a8000
.long 0x3e648000
.long 0x3e7dc000
.long 0x3e8b4000
.long 0x3e974000
.long 0x3ea30000
.long 0x3eae8000
.long 0x3eb9c000
.long 0x3ec4e000
.long 0x3ecfa000
.long 0x3eda2000
.long 0x3ee48000
.long 0x3eeea000
.long 0x3ef8a000
.long 0x3f013000
.long 0x3f05f000
.long 0x3f0aa000
.long 0x3f0f4000
.long 0x3f13d000
.long 0x3f184000
.long 0x3f1ca000
.long 0x3f20f000
.long 0x3f252000
.long 0x3f295000
.long 0x3f2d7000
/* Log_tbl_L */
/* Leave this at the front so we can potentially save space due to the
smaller alignment constraint. */
.align 4
/* AbsMask */
.long 0x7fffffff
.align 64
__svml_satanh_data_internal_avx512_al64:
/* One */
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/* AddB5 */
.align 64
.long 0x00020000, 0x00020000, 0x00020000, 0x00020000
.long 0x00020000, 0x00020000, 0x00020000, 0x00020000
.long 0x00020000, 0x00020000, 0x00020000, 0x00020000
.long 0x00020000, 0x00020000, 0x00020000, 0x00020000
/* RcpBitMask */
.align 64
.long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
.long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
.long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
.long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
/* Log_tbl_L_lo */
.align 64
.long 0x00000000
.long 0x3726c39e
@@ -338,6 +315,8 @@ __svml_satanh_data_internal_avx512:
.long 0x38dedfac
.long 0x38ebfb5e
.long 0xb8e63c9f
/* Log_tbl_L_hi */
.align 64
.long 0xb85c1340
.long 0x38777bcd
.long 0xb6038656
@@ -354,39 +333,74 @@ __svml_satanh_data_internal_avx512:
.long 0x38f85db0
.long 0x37b4996f
.long 0xb8bfb3ca
/* One */
/* Log_tbl_H_lo */
.align 64
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/* AbsMask */
.long 0x00000000
.long 0x3cfc0000
.long 0x3d780000
.long 0x3db78000
.long 0x3df10000
.long 0x3e14c000
.long 0x3e300000
.long 0x3e4a8000
.long 0x3e648000
.long 0x3e7dc000
.long 0x3e8b4000
.long 0x3e974000
.long 0x3ea30000
.long 0x3eae8000
.long 0x3eb9c000
.long 0x3ec4e000
/* Log_tbl_H_hi */
.align 64
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/* AddB5 */
.long 0x3ecfa000
.long 0x3eda2000
.long 0x3ee48000
.long 0x3eeea000
.long 0x3ef8a000
.long 0x3f013000
.long 0x3f05f000
.long 0x3f0aa000
.long 0x3f0f4000
.long 0x3f13d000
.long 0x3f184000
.long 0x3f1ca000
.long 0x3f20f000
.long 0x3f252000
.long 0x3f295000
.long 0x3f2d7000
/* L2H = log(2)_high */
.align 64
.long 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
/* RcpBitMask */
.long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
.long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
.long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
.long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
/* L2L = log(2)_low */
.align 64
.long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
.long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
.long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
.long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
.long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
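(L2H/L2L are a two-float split of log(2): L2H (0x3f317000) keeps only the top
mantissa bits so K*L2H is exact for the exponents K in play, and L2L
(0x3805fdf4) carries the residual. A sketch of how such a pair can be
reproduced; standard C only, purely illustrative:

#include <stdio.h>

int main(void)
{
    /* High part: leading bits of log(2) with trailing mantissa zeros;
       low part: the rounding residual.  */
    double ln2 = 0.69314718055994530942;
    float l2h = 0x1.62ep-1f;                    /* 0x3f317000 */
    float l2l = (float)(ln2 - (double)l2h);     /* ~0x3805fdf4 */
    printf("%a %a\n", l2h, l2l);
    return 0;
}
)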
/* poly_coeff3 */
.align 64
.long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
.long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
.long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
.long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
.long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
/* poly_coeff2 */
.align 64
.long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
.long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
.long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
.long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
.long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
/* poly_coeff1 */
.align 64
.long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
/* poly_coeff0 */
.align 64
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/* Half */
.align 64
.long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
/* L2H = log(2)_high */
.align 64
.long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
/* L2L = log(2)_low */
.align 64
.long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
.long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
.long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
.long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
.long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
.align 64
.type __svml_satanh_data_internal_avx512_al64, @object
.size __svml_satanh_data_internal_avx512_al64, .-__svml_satanh_data_internal_avx512_al64
.type __svml_satanh_data_internal_avx512, @object
.size __svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
