|
1
|
+/* Correctly-rounded hyperbolic cosine function for binary32 value.
|
|
2
|
+
|
|
3
|
+Copyright (c) 2022-2023 Alexei Sibidanov.
|
|
4
|
+
|
|
5
|
+This file is part of the CORE-MATH project
|
|
6
|
+(https://core-math.gitlabpages.inria.fr/).
|
|
7
|
+
|
|
8
|
+Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
9
|
+of this software and associated documentation files (the "Software"), to deal
|
|
10
|
+in the Software without restriction, including without limitation the rights
|
|
11
|
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
12
|
+copies of the Software, and to permit persons to whom the Software is
|
|
13
|
+furnished to do so, subject to the following conditions:
|
|
14
|
+
|
|
15
|
+The above copyright notice and this permission notice shall be included in all
|
|
16
|
+copies or substantial portions of the Software.
|
|
17
|
+
|
|
18
|
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
19
|
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
20
|
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
21
|
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
22
|
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
23
|
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
24
|
+SOFTWARE.
|
|
25
|
+*/
|
|
26
|
+
|
|
27
|
+#include <stdint.h>
|
|
28
|
+#include <errno.h>
|
|
29
|
+
|
|
30
|
+// Warning: clang also defines __GNUC__
|
|
31
|
+#if defined(__GNUC__) && !defined(__clang__)
|
|
32
|
+#pragma GCC diagnostic ignored "-Wunknown-pragmas"
|
|
33
|
+#endif
|
|
34
|
+
|
|
35
|
+#pragma STDC FENV_ACCESS ON
|
|
36
|
+
|
|
37
|
+/* __builtin_roundeven was introduced in gcc 10:
|
|
38
|
+ https://gcc.gnu.org/gcc-10/changes.html,
|
|
39
|
+ and in clang 17 */
|
|
40
|
+#if ((defined(__GNUC__) && __GNUC__ >= 10) || (defined(__clang__) && __clang_major__ >= 17)) && (defined(__aarch64__) || defined(__x86_64__) || defined(__i386__))
|
|
41
|
+# define roundeven_finite(x) __builtin_roundeven (x)
|
|
42
|
+#else
|
|
43
|
+/* round x to nearest integer, breaking ties to even */
|
|
44
|
+static double
|
|
45
|
+roundeven_finite (double x)
|
|
46
|
+{
|
|
47
|
+ double ix;
|
|
48
|
+# if (defined(__GNUC__) || defined(__clang__)) && (defined(__AVX__) || defined(__SSE4_1__) || (__ARM_ARCH >= 8))
|
|
49
|
+# if defined __AVX__
|
|
50
|
+ __asm__("vroundsd $0x8,%1,%1,%0":"=x"(ix):"x"(x));
|
|
51
|
+# elif __ARM_ARCH >= 8
|
|
52
|
+ __asm__ ("frintn %d0, %d1":"=w"(ix):"w"(x));
|
|
53
|
+# else /* __SSE4_1__ */
|
|
54
|
+ __asm__("roundsd $0x8,%1,%0":"=x"(ix):"x"(x));
|
|
55
|
+# endif
|
|
56
|
+# else
|
|
57
|
+ ix = __builtin_round (x); /* nearest, away from 0 */
|
|
58
|
+ if (__builtin_fabs (ix - x) == 0.5)
|
|
59
|
+ {
|
|
60
|
+ /* if ix is odd, we should return ix-1 if x>0, and ix+1 if x<0 */
|
|
61
|
+ union { double f; uint64_t n; } u, v;
|
|
62
|
+ u.f = ix;
|
|
63
|
+ v.f = ix - __builtin_copysign (1.0, x);
|
|
64
|
+ if (__builtin_ctz (v.n) > __builtin_ctz (u.n))
|
|
65
|
+ ix = v.f;
|
|
66
|
+ }
|
|
67
|
+# endif
|
|
68
|
+ return ix;
|
|
69
|
+}
|
|
70
|
+#endif
|
|
71
|
+
|
|
72
|
+typedef union {float f; uint32_t u;} b32u32_u;
|
|
73
|
+typedef union {double f; uint64_t u;} b64u64_u;
|
|
74
|
+
|
|
75
|
+float cr_coshf(float x){
|
|
76
|
+ static const double c[] =
|
|
77
|
+ {1, 0x1.62e42fef4c4e7p-6, 0x1.ebfd1b232f475p-13, 0x1.c6b19384ecd93p-20};
|
|
78
|
+ static const double ch[] =
|
|
79
|
+ {1, 0x1.62e42fefa39efp-6, 0x1.ebfbdff82c58fp-13, 0x1.c6b08d702e0edp-20, 0x1.3b2ab6fb92e5ep-27,
|
|
80
|
+ 0x1.5d886e6d54203p-35, 0x1.430976b8ce6efp-43};
|
|
81
|
+ static const uint64_t tb[] =
|
|
82
|
+ {0x3fe0000000000000, 0x3fe059b0d3158574, 0x3fe0b5586cf9890f, 0x3fe11301d0125b51,
|
|
83
|
+ 0x3fe172b83c7d517b, 0x3fe1d4873168b9aa, 0x3fe2387a6e756238, 0x3fe29e9df51fdee1,
|
|
84
|
+ 0x3fe306fe0a31b715, 0x3fe371a7373aa9cb, 0x3fe3dea64c123422, 0x3fe44e086061892d,
|
|
85
|
+ 0x3fe4bfdad5362a27, 0x3fe5342b569d4f82, 0x3fe5ab07dd485429, 0x3fe6247eb03a5585,
|
|
86
|
+ 0x3fe6a09e667f3bcd, 0x3fe71f75e8ec5f74, 0x3fe7a11473eb0187, 0x3fe82589994cce13,
|
|
87
|
+ 0x3fe8ace5422aa0db, 0x3fe93737b0cdc5e5, 0x3fe9c49182a3f090, 0x3fea5503b23e255d,
|
|
88
|
+ 0x3feae89f995ad3ad, 0x3feb7f76f2fb5e47, 0x3fec199bdd85529c, 0x3fecb720dcef9069,
|
|
89
|
+ 0x3fed5818dcfba487, 0x3fedfc97337b9b5f, 0x3feea4afa2a490da, 0x3fef50765b6e4540};
|
|
90
|
+ const double iln2 = 0x1.71547652b82fep+5;
|
|
91
|
+ b32u32_u t = {.f = x};
|
|
92
|
+ double z = x;
|
|
93
|
+ uint32_t ax = t.u<<1;
|
|
94
|
+ if(__builtin_expect(ax>0x8565a9f8u, 0)){ // |x| >~ 89.4
|
|
95
|
+ if(ax>=0xff000000u) {
|
|
96
|
+ if(ax<<8) return x + x; // nan
|
|
97
|
+ return 1.0f/0.0f; // +-inf
|
|
98
|
+ }
|
|
99
|
+ float r = 2.0f*0x1.fffffep127f;
|
|
100
|
+#ifdef CORE_MATH_SUPPORT_ERRNO
|
|
101
|
+ errno = ERANGE;
|
|
102
|
+#endif
|
|
103
|
+ return r;
|
|
104
|
+ }
|
|
105
|
+ if(__builtin_expect(ax<0x7c000000u, 0)){ // |x| < 0.125
|
|
106
|
+ if(__builtin_expect(ax<0x74000000u, 0)){ // |x| < 0x1p-11
|
|
107
|
+ if(__builtin_expect(ax<0x66000000u, 0)) // |x| < 0x1p-24
|
|
108
|
+ return __builtin_fmaf(__builtin_fabsf(x), 0x1p-25, 1.0f);
|
|
109
|
+ return (0.5f*x)*x + 1.0f;
|
|
110
|
+ }
|
|
111
|
+ static const double cp[] =
|
|
112
|
+ {0x1.fffffffffffe3p-2, 0x1.55555555723cfp-5, 0x1.6c16bee4a5986p-10, 0x1.a0483fc0328f7p-16};
|
|
113
|
+ double z2 = z*z, z4 = z2*z2;
|
|
114
|
+ return 1.0 + z2*((cp[0] + z2*cp[1]) + z4*(cp[2] + z2*(cp[3])));
|
|
115
|
+ }
|
|
116
|
+ double a = iln2*z, ia = roundeven_finite(a), h = a - ia, h2 = h*h;
|
|
117
|
+ b64u64_u ja = {.f = ia + 0x1.8p52};
|
|
118
|
+ int64_t jp = ja.u, jm = -jp;
|
|
119
|
+ b64u64_u sp = {.u = tb[jp&31] + ((uint64_t)(jp>>5)<<52)}, sm = {.u = tb[jm&31] + ((uint64_t)(jm>>5)<<52)};
|
|
120
|
+ double te = c[0] + h2*c[2], to = (c[1] + h2*c[3]);
|
|
121
|
+ double rp = sp.f*(te + h*to), rm = sm.f*(te - h*to), r = rp + rm;
|
|
122
|
+ float ub = r, lb = r - 1.45e-10*r;
|
|
123
|
+ if(__builtin_expect(ub != lb, 0)){
|
|
124
|
+ const double iln2h = 0x1.7154765p+5, iln2l = 0x1.5c17f0bbbe88p-26;
|
|
125
|
+ h = (iln2h*z - ia) + iln2l*z;
|
|
126
|
+ h2 = h*h;
|
|
127
|
+ te = ch[0] + h2*ch[2] + (h2*h2)*(ch[4] + h2*ch[6]);
|
|
128
|
+ to = ch[1] + h2*(ch[3] + h2*ch[5]);
|
|
129
|
+ r = sp.f*(te + h*to) + sm.f*(te - h*to);
|
|
130
|
+ ub = r;
|
|
131
|
+ }
|
|
132
|
+ return ub;
|
|
133
|
+} |