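// Source path: aosp12/bionic/libm/x86/s_cos.S
// (code-viewer metadata: 893 lines, 19 KiB, highlighted as "ArmAsm";
// the content below is 32-bit x86 assembly in AT&T syntax)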

/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/******************************************************************************/
// ALGORITHM DESCRIPTION
// ---------------------
//
// 1. RANGE REDUCTION
//
// We perform an initial range reduction from X to r with
//
// X =~= N * pi/32 + r
//
// so that |r| <= pi/64 + epsilon. We restrict inputs to those
// where |N| <= 932560. Beyond this, the range reduction is
// insufficiently accurate. For extremely small inputs,
// denormalization can occur internally, impacting performance.
// This means that the main path is actually only taken for
// 2^-252 <= |X| < 90112.
//
// To avoid branches, we perform the range reduction to full
// accuracy each time.
//
// X - N * (P_1 + P_2 + P_3)
//
// where P_1 and P_2 are 32-bit numbers (so multiplication by N
// is exact) and P_3 is a 53-bit number. Together, these
// approximate pi/32 well enough for all cases in the restricted
// range.
//
// The main reduction sequence is:
//
// y = 32/pi * x
// N = integer(y)
// (computed by adding 0.5 carrying the sign of x, truncating to a
// 32-bit integer and converting the integer back to double)
//
// m_1 = N * P_1
// m_2 = N * P_2
// r_1 = x - m_1
// r = r_1 - m_2
// (this r can be used for most of the calculation)
//
// c_1 = r_1 - r
// m_3 = N * P_3
// c_2 = c_1 - m_2
// c = c_2 - m_3
//
// 2. MAIN ALGORITHM
//
// The algorithm uses a table lookup based on B = M * pi / 32
// where M = (N + 16) mod 64; the offset of 16 table steps (pi/2)
// makes the sine expansion below evaluate cos(X). The stored values are:
// sigma closest power of 2 to cos(B)
// C_hl 53-bit cos(B) - sigma
// S_hi + S_lo 2 * 53-bit sin(B)
//
// The computation is organized as follows:
//
// sin(B + r + c) = [sin(B) + sigma * r] +
// r * (cos(B) - sigma) +
// sin(B) * [cos(r + c) - 1] +
// cos(B) * [sin(r + c) - r]
//
// which is approximately:
//
// [S_hi + sigma * r] +
// C_hl * r +
// S_lo + S_hi * [(cos(r) - 1) - r * c] +
// (C_hl + sigma) * [(sin(r) - r) + c]
//
// and this is what is actually computed. We separate this sum
// into four parts:
//
// hi + med + pols + corr
//
// where
//
// hi = S_hi + sigma r
// med = C_hl * r
// pols = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r)
// corr = S_lo + c * ((C_hl + sigma) - S_hi * r)
//
// 3. POLYNOMIAL
//
// The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) *
// (sin(r) - r) can be rearranged freely, since it is quite
// small, so we exploit parallelism to the fullest.
//
// psc4 = SC_4 * r_1
// msc4 = psc4 * r
// r2 = r * r
// msc2 = SC_2 * r2
// r4 = r2 * r2
// psc3 = SC_3 + msc4
// psc1 = SC_1 + msc2
// msc3 = r4 * psc3
// sincospols = psc1 + msc3
// pols = sincospols *
// <S_hi * r^2 | (C_hl + sigma) * r^3>
//
// 4. CORRECTION TERM
//
// This is where the "c" component of the range reduction is
// taken into account; recall that just "r" is used for most of
// the calculation.
//
// -c = m_3 - c_2
// -d = S_hi * r - (C_hl + sigma)
// corr = -c * -d + S_lo
//
// 5. COMPENSATED SUMMATIONS
//
// The two successive compensated summations add up the high
// and medium parts, leaving just the low parts to add up at
// the end.
//
// rs = sigma * r
// res_int = S_hi + rs
// k_0 = S_hi - res_int
// k_2 = k_0 + rs
// med = C_hl * r
// res_hi = res_int + med
// k_1 = res_int - res_hi
// k_3 = k_1 + med
//
// 6. FINAL SUMMATION
//
// We now add up all the small parts:
//
// res_lo = pols(hi) + pols(lo) + corr + k_2 + k_3
//
// Now the overall result is just:
//
// res_hi + res_lo
//
// 7. SMALL ARGUMENTS
//
// Inputs with |X| < 2^-252 are treated specially as
// 1 - |x|.
//
// Special cases:
// cos(NaN) = quiet NaN, and raise invalid exception
// cos(INF) = NaN and raise invalid exception
// cos(0) = 1
//
/******************************************************************************/
#include <private/bionic_asm.h>
# -- Begin static_func
// static_func: position-independent helper.
// Out:   %eax = run-time address of static_const_table
// Notes: %eax is the only register written; the word pushed by the inner
//        call is popped again, so the stack is balanced on return.
        .text
        .align  __bionic_asm_align
        .type   static_func, @function
static_func:
        // call/pop pair: %eax = run-time address of .Lstatic_func_pc
        call    .Lstatic_func_pc
.Lstatic_func_pc:
        popl    %eax
        // Add the link-time distance from the anchor to the GOT:
        // %eax = address of _GLOBAL_OFFSET_TABLE_
        lea     _GLOBAL_OFFSET_TABLE_+[. - .Lstatic_func_pc](%eax), %eax
        // Add the GOT-relative offset of the table: %eax = &static_const_table
        lea     static_const_table@GOTOFF(%eax), %eax
        ret
        .size   static_func, .-static_func
# -- End static_func
# -- Begin cos
ENTRY(cos)
// cos(x) for a double argument. The argument is read from 8(%ebp) and the
// result is left on the x87 stack in st(0).
//
// Paths:
//   main     2^-252 <= |x| < 90112 : table lookup + polynomial in SSE2
//   tiny     |x| < 2^-252          : 1 - |x|
//   huge     finite |x| >= 90112   : delegated to __libm_sincos_huge
//   inf/NaN                        : x * (-0.0) on the x87 unit
//
// Register roles:
//   %ebx  address of static_const_table (caller value restored on exit)
//   %edx  N, then the table index
//   %eax  scratch, then pointer to the selected 32-byte table entry
//
// Frame, relative to %esp after the prologue:
//   0(%esp)    8-byte slot used to move the result from SSE to x87
//   8(%esp)    8-byte slot that receives the huge-path result
//   56(%esp)   saved %ebx
//   128(%esp)  x (same location as 8(%ebp))
//
// Names such as r_1, m_2, k_3 follow the algorithm description at the top
// of the file.
# parameter 1: 8 + %ebp
..B2.1:
..B2.2:
// Prologue: frame pointer, 120 bytes of locals, save %ebx.
pushl %ebp
movl %esp, %ebp
subl $120, %esp
movl %ebx, 56(%esp)
// %ebx = address of static_const_table (static_func returns it in %eax).
call static_func
movl %eax, %ebx
// xmm0 = x
movsd 128(%esp), %xmm0
// %eax = top 16 bits of x with the sign cleared
// (11 exponent bits followed by the 4 leading mantissa bits).
pextrw $3, %xmm0, %eax
andl $32767, %eax
// Rebase so that 0 corresponds to 2^-252 (12336 = 0x3030). A single
// unsigned compare then rejects both too-small inputs (the subtraction
// wraps) and too-large inputs.
subl $12336, %eax
cmpl $4293, %eax
ja .L_2TAG_PACKET_0.0.2
// ---- main path: range reduction ----
// xmm1 = y = x * (32/pi)
movsd 2160(%ebx), %xmm1
mulsd %xmm0, %xmm1
// xmm5 = (0.5, 0.5)
movapd 2240(%ebx), %xmm5
// xmm4 = sign bit of x
movsd 2224(%ebx), %xmm4
andpd %xmm0, %xmm4
// xmm5 low lane = 0.5 with the sign of x
orps %xmm4, %xmm5
// xmm3 = P_1
movsd 2128(%ebx), %xmm3
// xmm2 = (P_2, P_2)
movapd 2112(%ebx), %xmm2
// y +/- 0.5, then truncate: %edx = N = y rounded to the nearest integer
addpd %xmm5, %xmm1
cvttsd2si %xmm1, %edx
// xmm1 = (double)N
cvtsi2sdl %edx, %xmm1
// xmm3 = m_1 = N * P_1
mulsd %xmm1, %xmm3
// xmm1 = (N, N)
unpcklpd %xmm1, %xmm1
// 1865232 is congruent to 16 modulo 64: the add shifts the table index by
// 16 entries (pi/2) and makes the value non-negative before masking.
addl $1865232, %edx
// xmm4 = x
movapd %xmm0, %xmm4
// %edx = M = table index in 0..63
andl $63, %edx
// xmm5 = SC_4 coefficient pair
movapd 2096(%ebx), %xmm5
// %eax = table base + 32 * M (each entry is 32 bytes)
lea (%ebx), %eax
shll $5, %edx
addl %edx, %eax
// xmm2 = (m_2, m_2) = N * P_2
mulpd %xmm1, %xmm2
// xmm0 low = r_1 = x - m_1
subsd %xmm3, %xmm0
// xmm1 low = m_3 = N * P_3
mulsd 2144(%ebx), %xmm1
// xmm4 = r_1
subsd %xmm3, %xmm4
// xmm7 = S_hi
movsd 8(%eax), %xmm7
// xmm0 = (r_1, r_1)
unpcklpd %xmm0, %xmm0
// xmm3 = r_1 (kept for the correction term)
movapd %xmm4, %xmm3
// xmm4 = r = r_1 - m_2
subsd %xmm2, %xmm4
// xmm5 = psc4 = SC_4 * r_1
mulpd %xmm0, %xmm5
// xmm0 = (r, r)
subpd %xmm2, %xmm0
// xmm6 = SC_2 coefficient pair
movapd 2064(%ebx), %xmm6
// xmm7 = S_hi * r
mulsd %xmm4, %xmm7
// xmm3 = c_1 = r_1 - r
subsd %xmm4, %xmm3
// xmm5 = msc4 = psc4 * r
mulpd %xmm0, %xmm5
// xmm0 = r2 = r * r (both lanes)
mulpd %xmm0, %xmm0
// xmm3 = c_2 = c_1 - m_2
subsd %xmm2, %xmm3
// xmm2 = (C_hl, S_hi) from the table entry
movapd (%eax), %xmm2
// xmm1 low = -c = m_3 - c_2
subsd %xmm3, %xmm1
// xmm3 = sigma
movsd 24(%eax), %xmm3
// xmm2 = (C_hl + sigma, S_hi)
addsd %xmm3, %xmm2
// xmm7 = -d = S_hi * r - (C_hl + sigma)
subsd %xmm2, %xmm7
// xmm2 = ((C_hl + sigma) * r, S_hi)
mulsd %xmm4, %xmm2
// xmm6 = msc2 = SC_2 * r2
mulpd %xmm0, %xmm6
// xmm3 = rs = sigma * r
mulsd %xmm4, %xmm3
// xmm2 = ((C_hl + sigma) * r^3, S_hi * r^2)
mulpd %xmm0, %xmm2
// xmm0 = r4 = r2 * r2
mulpd %xmm0, %xmm0
// xmm5 = psc3 = SC_3 + msc4
addpd 2080(%ebx), %xmm5
// xmm4 = med = C_hl * r
mulsd (%eax), %xmm4
// xmm6 = psc1 = SC_1 + msc2
addpd 2048(%ebx), %xmm6
// xmm5 = msc3 = r4 * psc3
mulpd %xmm0, %xmm5
// xmm0 = rs (copy kept for k_2)
movapd %xmm3, %xmm0
// xmm3 = res_int = S_hi + rs
addsd 8(%eax), %xmm3
// xmm1 low = (-c) * (-d); only the low lane of xmm1 is used afterwards
mulpd %xmm7, %xmm1
// xmm7 = med (copy kept for k_3)
movapd %xmm4, %xmm7
// xmm4 = res_hi = res_int + med
addsd %xmm3, %xmm4
// xmm6 = sincospols = psc1 + msc3
addpd %xmm5, %xmm6
// xmm5 = k_0 = S_hi - res_int
movsd 8(%eax), %xmm5
subsd %xmm3, %xmm5
// xmm3 = k_1 = res_int - res_hi
subsd %xmm4, %xmm3
// xmm1 = corr = (-c) * (-d) + S_lo
addsd 16(%eax), %xmm1
// xmm6 = pols = sincospols * ((C_hl + sigma) * r^3, S_hi * r^2)
mulpd %xmm2, %xmm6
// xmm5 = k_2 = k_0 + rs
addsd %xmm0, %xmm5
// xmm3 = k_3 = k_1 + med
addsd %xmm7, %xmm3
// ---- final summation: res_lo = corr + k_2 + k_3 + both lanes of pols ----
addsd %xmm5, %xmm1
addsd %xmm3, %xmm1
addsd %xmm6, %xmm1
unpckhpd %xmm6, %xmm6
addsd %xmm6, %xmm1
// xmm4 = res_hi + res_lo
addsd %xmm1, %xmm4
// Move the result to st(0) through the scratch slot.
movsd %xmm4, (%esp)
fldl (%esp)
jmp .L_2TAG_PACKET_1.0.2
.L_2TAG_PACKET_0.0.2:
// Out of the main range. The flags are still those of the cmpl above:
// signed "greater" means the input is too large; otherwise the rebased
// exponent was negative, i.e. |x| < 2^-252.
jg .L_2TAG_PACKET_2.0.2
// ---- tiny path: result = 1 - |x| ----
// Clear the sign bit of x in place.
pextrw $3, %xmm0, %eax
andl $32767, %eax
pinsrw $3, %eax, %xmm0
// xmm1 = 1.0 - |x|
movsd 2192(%ebx), %xmm1
subsd %xmm0, %xmm1
movsd %xmm1, (%esp)
fldl (%esp)
jmp .L_2TAG_PACKET_1.0.2
.L_2TAG_PACKET_2.0.2:
// ---- large inputs ----
// High word of x: an all-ones exponent field means infinity or NaN.
movl 132(%esp), %eax
andl $2146435072, %eax
cmpl $2146435072, %eax
je .L_2TAG_PACKET_3.0.2
// Finite huge argument: call the out-of-line helper with
//   0(%esp)  = x
//   8(%esp)  = address of the result slot (8(%esp) of the outer frame)
//   12(%esp) = 1
// NOTE(review): the helper is not visible in this file; the value 1
// presumably selects the cosine result -- confirm against its definition.
subl $32, %esp
movsd %xmm0, (%esp)
lea 40(%esp), %eax
movl %eax, 8(%esp)
movl $1, %eax
movl %eax, 12(%esp)
call __libm_sincos_huge
addl $32, %esp
// Load the double found in the result slot into st(0).
fldl 8(%esp)
jmp .L_2TAG_PACKET_1.0.2
.L_2TAG_PACKET_3.0.2:
// ---- infinity or NaN: st(0) = x * (-0.0) ----
fldl 128(%esp)
fmull 2208(%ebx)
.L_2TAG_PACKET_1.0.2:
// Common epilogue: restore %ebx and the caller frame; result stays in st(0).
movl 56(%esp), %ebx
movl %ebp, %esp
popl %ebp
ret
..B2.3:
END(cos)
# -- End cos
# Start file scope ASM
// ALIAS_SYMBOL is not defined in this file (presumably it comes from
// <private/bionic_asm.h>); it presumably exports cosl as a second name
// for cos -- TODO confirm against the macro definition.
ALIAS_SYMBOL(cosl, cos);
# End file scope ASM
// static_const_table: read-only constants for cos, addressed by byte offset
// from the table base. Every double is written as two .long words, low word
// first. Total size: 2256 bytes.
//
//   offset     contents
//   0..2047    64 entries of 32 bytes, selected by the index M computed in cos:
//                +0  C_hl    +8  S_hi    +16  S_lo    +24  sigma
//   2048       SC_1 pair, approximately (-1/6, -1/2)
//   2064       SC_2 pair, approximately (1/120, 1/24)
//   2080       SC_3 pair, approximately (-1/5040, -1/720)
//   2096       SC_4 pair, approximately (1/362880, 1/40320)
//   2112       P_2 in both lanes
//   2128       P_1 (leading bits of pi/32), then 8 zero bytes
//   2144       P_3, then 8 zero bytes
//   2160       32/pi, then 8 zero bytes
//   2176       0x4338000000000000 (1.5 * 2^52), then 8 zero bytes;
//              not referenced by the code in this file
//   2192       1.0, then 8 zero bytes
//   2208       -0.0, then 8 zero bytes (multiplier for the inf/NaN path)
//   2224       0x8000000000000000 sign-bit mask, then 8 zero bytes
//   2240       (0.5, 0.5)
//
// In each coefficient pair the low lane multiplies the sine-series term and
// the high lane the cosine-series term of the polynomial in cos.
.section .rodata, "a"
.align 16
.align 16
static_const_table:
// M = 0 (offset 0)
.long 0
.long 0
.long 0
.long 0
.long 0
.long 0
.long 0
.long 1072693248
// M = 1 (offset 32)
.long 393047345
.long 3212032302
.long 3156849708
.long 1069094822
.long 3758096384
.long 3158189848
.long 0
.long 1072693248
// M = 2 (offset 64)
.long 18115067
.long 3214126342
.long 1013556747
.long 1070135480
.long 3221225472
.long 3160567065
.long 0
.long 1072693248
// M = 3 (offset 96)
.long 2476548698
.long 3215330282
.long 785751814
.long 1070765062
.long 2684354560
.long 3161838221
.long 0
.long 1072693248
// M = 4 (offset 128)
.long 2255197647
.long 3216211105
.long 2796464483
.long 1071152610
.long 3758096384
.long 3160878317
.long 0
.long 1072693248
// M = 5 (offset 160)
.long 1945768569
.long 3216915048
.long 939980347
.long 1071524701
.long 536870912
.long 1012796809
.long 0
.long 1072693248
// M = 6 (offset 192)
.long 1539668340
.long 3217396327
.long 967731400
.long 1071761211
.long 536870912
.long 1015752157
.long 0
.long 1072693248
// M = 7 (offset 224)
.long 1403757309
.long 3217886718
.long 621354454
.long 1071926515
.long 536870912
.long 1013450602
.long 0
.long 1072693248
// M = 8 (offset 256)
.long 2583490354
.long 1070236281
.long 1719614413
.long 1072079006
.long 536870912
.long 3163282740
.long 0
.long 1071644672
// M = 9 (offset 288)
.long 2485417816
.long 1069626316
.long 1796544321
.long 1072217216
.long 536870912
.long 3162686945
.long 0
.long 1071644672
// M = 10 (offset 320)
.long 2598800519
.long 1068266419
.long 688824739
.long 1072339814
.long 3758096384
.long 1010431536
.long 0
.long 1071644672
// M = 11 (offset 352)
.long 2140183630
.long 3214756396
.long 4051746225
.long 1072445618
.long 2147483648
.long 3161907377
.long 0
.long 1071644672
// M = 12 (offset 384)
.long 1699043957
.long 3216902261
.long 3476196678
.long 1072533611
.long 536870912
.long 1014257638
.long 0
.long 1071644672
// M = 13 (offset 416)
.long 1991047213
.long 1067753521
.long 1455828442
.long 1072602945
.long 3758096384
.long 1015505073
.long 0
.long 1070596096
// M = 14 (offset 448)
.long 240740309
.long 3215727903
.long 3489094832
.long 1072652951
.long 536870912
.long 1014325783
.long 0
.long 1070596096
// M = 15 (offset 480)
.long 257503056
.long 3214647653
.long 2748392742
.long 1072683149
.long 1073741824
.long 3163061750
.long 0
.long 1069547520
// M = 16 (offset 512)
.long 0
.long 0
.long 0
.long 1072693248
.long 0
.long 0
.long 0
.long 0
// M = 17 (offset 544)
.long 257503056
.long 1067164005
.long 2748392742
.long 1072683149
.long 1073741824
.long 3163061750
.long 0
.long 3217031168
// M = 18 (offset 576)
.long 240740309
.long 1068244255
.long 3489094832
.long 1072652951
.long 536870912
.long 1014325783
.long 0
.long 3218079744
// M = 19 (offset 608)
.long 1991047213
.long 3215237169
.long 1455828442
.long 1072602945
.long 3758096384
.long 1015505073
.long 0
.long 3218079744
// M = 20 (offset 640)
.long 1699043957
.long 1069418613
.long 3476196678
.long 1072533611
.long 536870912
.long 1014257638
.long 0
.long 3219128320
// M = 21 (offset 672)
.long 2140183630
.long 1067272748
.long 4051746225
.long 1072445618
.long 2147483648
.long 3161907377
.long 0
.long 3219128320
// M = 22 (offset 704)
.long 2598800519
.long 3215750067
.long 688824739
.long 1072339814
.long 3758096384
.long 1010431536
.long 0
.long 3219128320
// M = 23 (offset 736)
.long 2485417816
.long 3217109964
.long 1796544321
.long 1072217216
.long 536870912
.long 3162686945
.long 0
.long 3219128320
// M = 24 (offset 768)
.long 2583490354
.long 3217719929
.long 1719614413
.long 1072079006
.long 536870912
.long 3163282740
.long 0
.long 3219128320
// M = 25 (offset 800)
.long 1403757309
.long 1070403070
.long 621354454
.long 1071926515
.long 536870912
.long 1013450602
.long 0
.long 3220176896
// M = 26 (offset 832)
.long 1539668340
.long 1069912679
.long 967731400
.long 1071761211
.long 536870912
.long 1015752157
.long 0
.long 3220176896
// M = 27 (offset 864)
.long 1945768569
.long 1069431400
.long 939980347
.long 1071524701
.long 536870912
.long 1012796809
.long 0
.long 3220176896
// M = 28 (offset 896)
.long 2255197647
.long 1068727457
.long 2796464483
.long 1071152610
.long 3758096384
.long 3160878317
.long 0
.long 3220176896
// M = 29 (offset 928)
.long 2476548698
.long 1067846634
.long 785751814
.long 1070765062
.long 2684354560
.long 3161838221
.long 0
.long 3220176896
// M = 30 (offset 960)
.long 18115067
.long 1066642694
.long 1013556747
.long 1070135480
.long 3221225472
.long 3160567065
.long 0
.long 3220176896
// M = 31 (offset 992)
.long 393047345
.long 1064548654
.long 3156849708
.long 1069094822
.long 3758096384
.long 3158189848
.long 0
.long 3220176896
// M = 32 (offset 1024)
.long 0
.long 0
.long 0
.long 0
.long 0
.long 0
.long 0
.long 3220176896
// M = 33 (offset 1056)
.long 393047345
.long 1064548654
.long 3156849708
.long 3216578470
.long 3758096384
.long 1010706200
.long 0
.long 3220176896
// M = 34 (offset 1088)
.long 18115067
.long 1066642694
.long 1013556747
.long 3217619128
.long 3221225472
.long 1013083417
.long 0
.long 3220176896
// M = 35 (offset 1120)
.long 2476548698
.long 1067846634
.long 785751814
.long 3218248710
.long 2684354560
.long 1014354573
.long 0
.long 3220176896
// M = 36 (offset 1152)
.long 2255197647
.long 1068727457
.long 2796464483
.long 3218636258
.long 3758096384
.long 1013394669
.long 0
.long 3220176896
// M = 37 (offset 1184)
.long 1945768569
.long 1069431400
.long 939980347
.long 3219008349
.long 536870912
.long 3160280457
.long 0
.long 3220176896
// M = 38 (offset 1216)
.long 1539668340
.long 1069912679
.long 967731400
.long 3219244859
.long 536870912
.long 3163235805
.long 0
.long 3220176896
// M = 39 (offset 1248)
.long 1403757309
.long 1070403070
.long 621354454
.long 3219410163
.long 536870912
.long 3160934250
.long 0
.long 3220176896
// M = 40 (offset 1280)
.long 2583490354
.long 3217719929
.long 1719614413
.long 3219562654
.long 536870912
.long 1015799092
.long 0
.long 3219128320
// M = 41 (offset 1312)
.long 2485417816
.long 3217109964
.long 1796544321
.long 3219700864
.long 536870912
.long 1015203297
.long 0
.long 3219128320
// M = 42 (offset 1344)
.long 2598800519
.long 3215750067
.long 688824739
.long 3219823462
.long 3758096384
.long 3157915184
.long 0
.long 3219128320
// M = 43 (offset 1376)
.long 2140183630
.long 1067272748
.long 4051746225
.long 3219929266
.long 2147483648
.long 1014423729
.long 0
.long 3219128320
// M = 44 (offset 1408)
.long 1699043957
.long 1069418613
.long 3476196678
.long 3220017259
.long 536870912
.long 3161741286
.long 0
.long 3219128320
// M = 45 (offset 1440)
.long 1991047213
.long 3215237169
.long 1455828442
.long 3220086593
.long 3758096384
.long 3162988721
.long 0
.long 3218079744
// M = 46 (offset 1472)
.long 240740309
.long 1068244255
.long 3489094832
.long 3220136599
.long 536870912
.long 3161809431
.long 0
.long 3218079744
// M = 47 (offset 1504)
.long 257503056
.long 1067164005
.long 2748392742
.long 3220166797
.long 1073741824
.long 1015578102
.long 0
.long 3217031168
// M = 48 (offset 1536)
.long 0
.long 0
.long 0
.long 3220176896
.long 0
.long 0
.long 0
.long 0
// M = 49 (offset 1568)
.long 257503056
.long 3214647653
.long 2748392742
.long 3220166797
.long 1073741824
.long 1015578102
.long 0
.long 1069547520
// M = 50 (offset 1600)
.long 240740309
.long 3215727903
.long 3489094832
.long 3220136599
.long 536870912
.long 3161809431
.long 0
.long 1070596096
// M = 51 (offset 1632)
.long 1991047213
.long 1067753521
.long 1455828442
.long 3220086593
.long 3758096384
.long 3162988721
.long 0
.long 1070596096
// M = 52 (offset 1664)
.long 1699043957
.long 3216902261
.long 3476196678
.long 3220017259
.long 536870912
.long 3161741286
.long 0
.long 1071644672
// M = 53 (offset 1696)
.long 2140183630
.long 3214756396
.long 4051746225
.long 3219929266
.long 2147483648
.long 1014423729
.long 0
.long 1071644672
// M = 54 (offset 1728)
.long 2598800519
.long 1068266419
.long 688824739
.long 3219823462
.long 3758096384
.long 3157915184
.long 0
.long 1071644672
// M = 55 (offset 1760)
.long 2485417816
.long 1069626316
.long 1796544321
.long 3219700864
.long 536870912
.long 1015203297
.long 0
.long 1071644672
// M = 56 (offset 1792)
.long 2583490354
.long 1070236281
.long 1719614413
.long 3219562654
.long 536870912
.long 1015799092
.long 0
.long 1071644672
// M = 57 (offset 1824)
.long 1403757309
.long 3217886718
.long 621354454
.long 3219410163
.long 536870912
.long 3160934250
.long 0
.long 1072693248
// M = 58 (offset 1856)
.long 1539668340
.long 3217396327
.long 967731400
.long 3219244859
.long 536870912
.long 3163235805
.long 0
.long 1072693248
// M = 59 (offset 1888)
.long 1945768569
.long 3216915048
.long 939980347
.long 3219008349
.long 536870912
.long 3160280457
.long 0
.long 1072693248
// M = 60 (offset 1920)
.long 2255197647
.long 3216211105
.long 2796464483
.long 3218636258
.long 3758096384
.long 1013394669
.long 0
.long 1072693248
// M = 61 (offset 1952)
.long 2476548698
.long 3215330282
.long 785751814
.long 3218248710
.long 2684354560
.long 1014354573
.long 0
.long 1072693248
// M = 62 (offset 1984)
.long 18115067
.long 3214126342
.long 1013556747
.long 3217619128
.long 3221225472
.long 1013083417
.long 0
.long 1072693248
// M = 63 (offset 2016)
.long 393047345
.long 3212032302
.long 3156849708
.long 3216578470
.long 3758096384
.long 1010706200
.long 0
.long 1072693248
// offset 2048: SC_1 pair, approximately (-1/6, -1/2)
.long 1431655765
.long 3217380693
.long 0
.long 3219128320
// offset 2064: SC_2 pair, approximately (1/120, 1/24)
.long 286331153
.long 1065423121
.long 1431655765
.long 1067799893
// offset 2080: SC_3 pair, approximately (-1/5040, -1/720)
.long 436314138
.long 3207201184
.long 381774871
.long 3210133868
// offset 2096: SC_4 pair, approximately (1/362880, 1/40320)
.long 2773927732
.long 1053236707
.long 436314138
.long 1056571808
// offset 2112: P_2 in both lanes
.long 442499072
.long 1032893537
.long 442499072
.long 1032893537
// offset 2128: P_1 (leading bits of pi/32), upper 8 bytes zero
.long 1413480448
.long 1069097467
.long 0
.long 0
// offset 2144: P_3, upper 8 bytes zero
.long 771977331
.long 996350346
.long 0
.long 0
// offset 2160: 32/pi, upper 8 bytes zero
.long 1841940611
.long 1076125488
.long 0
.long 0
// offset 2176: 1.5 * 2^52, upper 8 bytes zero (not referenced in this file)
.long 0
.long 1127743488
.long 0
.long 0
// offset 2192: 1.0, upper 8 bytes zero
.long 0
.long 1072693248
.long 0
.long 0
// offset 2208: -0.0, upper 8 bytes zero
.long 0
.long 2147483648
.long 0
.long 0
// offset 2224: sign-bit mask, upper 8 bytes zero
.long 0
.long 2147483648
.long 0
.long 0
// offset 2240: (0.5, 0.5)
.long 0
.long 1071644672
.long 0
.long 1071644672
.type static_const_table,@object
.size static_const_table,2256
// Switch to the .data section; nothing is emitted into it in this file.
.data
// Give the external helper called from cos hidden visibility.
.hidden __libm_sincos_huge
// Empty .note.GNU-stack section; by GNU toolchain convention this marks the
// object as not requiring an executable stack.
.section .note.GNU-stack, "",@progbits
# End