|
|
1
|
+/* Correctly-rounded biased argument natural logarithm function for binary32 value.
|
|
|
2
|
+
|
|
|
3
|
+Copyright (c) 2023-2025 Alexei Sibidanov.
|
|
|
4
|
+
|
|
|
5
|
+This file is part of the CORE-MATH project
|
|
|
6
|
+(https://core-math.gitlabpages.inria.fr/).
|
|
|
7
|
+
|
|
|
8
|
+Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
9
|
+of this software and associated documentation files (the "Software"), to deal
|
|
|
10
|
+in the Software without restriction, including without limitation the rights
|
|
|
11
|
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
12
|
+copies of the Software, and to permit persons to whom the Software is
|
|
|
13
|
+furnished to do so, subject to the following conditions:
|
|
|
14
|
+
|
|
|
15
|
+The above copyright notice and this permission notice shall be included in all
|
|
|
16
|
+copies or substantial portions of the Software.
|
|
|
17
|
+
|
|
|
18
|
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
19
|
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
20
|
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
21
|
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
22
|
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
23
|
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
|
24
|
+SOFTWARE.
|
|
|
25
|
+*/
|
|
|
26
|
+
|
|
|
27
|
+#include <stdint.h>
|
|
|
28
|
+#include <errno.h>
|
|
|
29
|
+
|
|
|
30
|
+// Warning: clang also defines __GNUC__
|
|
|
31
|
+#if defined(__GNUC__) && !defined(__clang__)
|
|
|
32
|
+#pragma GCC diagnostic ignored "-Wunknown-pragmas"
|
|
|
33
|
+#endif
|
|
|
34
|
+
|
|
|
35
|
+#pragma STDC FENV_ACCESS ON
|
|
|
36
|
+
|
|
|
37
|
+typedef union {float f; uint32_t u;} b32u32_u;
|
|
|
38
|
+typedef union {double f; uint64_t u;} b64u64_u;
|
|
|
39
|
+
|
|
|
40
|
+static __attribute__((noinline)) float as_special(float x){
|
|
|
41
|
+ b32u32_u t = {.f = x};
|
|
|
42
|
+ if(t.u==0xbf800000u){// +0.0
|
|
|
43
|
+#ifdef CORE_MATH_SUPPORT_ERRNO
|
|
|
44
|
+ errno = ERANGE;
|
|
|
45
|
+#endif
|
|
|
46
|
+ return -1.0f/0.0f; // to raise FE_DIVBYZERO
|
|
|
47
|
+ }
|
|
|
48
|
+ if(t.u == 0x7f800000u) return x; // +inf
|
|
|
49
|
+ uint32_t ax = t.u<<1;
|
|
|
50
|
+ if(ax > 0xff000000u) return x + x; // nan
|
|
|
51
|
+#ifdef CORE_MATH_SUPPORT_ERRNO
|
|
|
52
|
+ errno = EDOM;
|
|
|
53
|
+#endif
|
|
|
54
|
+ return 0.0f/0.0f; // to raise FE_INVALID
|
|
|
55
|
+}
|
|
|
56
|
+
|
|
|
57
|
+float cr_log1pf(float x) {
|
|
|
58
|
+ static const double x0[] = {
|
|
|
59
|
+ 0x1.f81f82p-1, 0x1.e9131acp-1, 0x1.dae6077p-1, 0x1.cd85689p-1, 0x1.c0e0704p-1, 0x1.b4e81b5p-1,
|
|
|
60
|
+ 0x1.a98ef6p-1, 0x1.9ec8e95p-1, 0x1.948b0fdp-1, 0x1.8acb90fp-1, 0x1.8181818p-1, 0x1.78a4c81p-1,
|
|
|
61
|
+ 0x1.702e05cp-1, 0x1.6816817p-1, 0x1.605816p-1, 0x1.58ed231p-1, 0x1.51d07ebp-1, 0x1.4afd6ap-1,
|
|
|
62
|
+ 0x1.446f865p-1, 0x1.3e22cbdp-1, 0x1.3813814p-1, 0x1.323e34ap-1, 0x1.2c9fb4ep-1, 0x1.27350b9p-1,
|
|
|
63
|
+ 0x1.21fb781p-1, 0x1.1cf06aep-1, 0x1.1811812p-1, 0x1.135c811p-1, 0x1.0ecf56cp-1, 0x1.0a6810ap-1,
|
|
|
64
|
+ 0x1.0624dd3p-1, 0x1.0204081p-1};
|
|
|
65
|
+ static const double lixb[] = {
|
|
|
66
|
+ 0x1.fc0a8909b4218p-7, 0x1.77458f51aac89p-5, 0x1.341d793afb997p-4, 0x1.a926d3a5ebd2ap-4,
|
|
|
67
|
+ 0x1.0d77e7a8a823dp-3, 0x1.44d2b6c557102p-3, 0x1.7ab89040accecp-3, 0x1.af3c94ecab3d6p-3,
|
|
|
68
|
+ 0x1.e27076d54e6c9p-3, 0x1.0a324e3888ad5p-2, 0x1.22941fc0c7357p-2, 0x1.3a64c56ae3fdbp-2,
|
|
|
69
|
+ 0x1.51aad874af21fp-2, 0x1.686c81d300eap-2, 0x1.7eaf83c7fa9b5p-2, 0x1.947941aa610ecp-2,
|
|
|
70
|
+ 0x1.a9cec9a3f023bp-2, 0x1.beb4d9ea4156ep-2, 0x1.d32fe7f35e5c7p-2, 0x1.e7442617b817ap-2,
|
|
|
71
|
+ 0x1.faf588dd5ed1p-2, 0x1.0723e5c635c39p-1, 0x1.109f39d53c99p-1, 0x1.19ee6b38a4668p-1,
|
|
|
72
|
+ 0x1.23130d7f93c3bp-1, 0x1.2c0e9ec9b0b85p-1, 0x1.34e289cb35eccp-1, 0x1.3d9026ad3d3f3p-1,
|
|
|
73
|
+ 0x1.4618bc1eadbbbp-1, 0x1.4e7d8127dd8a9p-1, 0x1.56bf9d5967092p-1, 0x1.5ee02a926936ep-1};
|
|
|
74
|
+ static const double lix[] = {
|
|
|
75
|
+ 0x1.fc0a890fc03e4p-7, 0x1.77458f532dcfcp-5, 0x1.341d793bbd1d1p-4, 0x1.a926d3a6ad563p-4,
|
|
|
76
|
+ 0x1.0d77e7a908e59p-3, 0x1.44d2b6c5b7d1ep-3, 0x1.7ab890410d909p-3, 0x1.af3c94ed0bff3p-3,
|
|
|
77
|
+ 0x1.e27076d5af2e6p-3, 0x1.0a324e38b90e3p-2, 0x1.22941fc0f7966p-2, 0x1.3a64c56b145eap-2,
|
|
|
78
|
+ 0x1.51aad874df82dp-2, 0x1.686c81d3314afp-2, 0x1.7eaf83c82afc3p-2, 0x1.947941aa916fbp-2,
|
|
|
79
|
+ 0x1.a9cec9a42084ap-2, 0x1.beb4d9ea71b7cp-2, 0x1.d32fe7f38ebd5p-2, 0x1.e7442617e8788p-2,
|
|
|
80
|
+ 0x1.faf588dd8f31fp-2, 0x1.0723e5c64df4p-1, 0x1.109f39d554c97p-1, 0x1.19ee6b38bc96fp-1,
|
|
|
81
|
+ 0x1.23130d7fabf43p-1, 0x1.2c0e9ec9c8e8cp-1, 0x1.34e289cb4e1d3p-1, 0x1.3d9026ad556fbp-1,
|
|
|
82
|
+ 0x1.4618bc1ec5ec2p-1, 0x1.4e7d8127f5bb1p-1, 0x1.56bf9d597f399p-1, 0x1.5ee02a9281675p-1};
|
|
|
83
|
+ static const double b[] =
|
|
|
84
|
+ {0x1p+0, -0x1p-1, 0x1.5555555556f6bp-2, -0x1.00000000029b9p-2,
|
|
|
85
|
+ 0x1.9999988d176e4p-3, -0x1.55555418889a7p-3, 0x1.24adeca50e2bcp-3, -0x1.001ba33bf57cfp-3};
|
|
|
86
|
+
|
|
|
87
|
+ double z = x;
|
|
|
88
|
+ b32u32_u t = {.f = x};
|
|
|
89
|
+ uint32_t ux = t.u, ax = ux&(~0u>>1);
|
|
|
90
|
+ if(__builtin_expect(ax<0x3c880000u, 1)){ // |x| < 0x1.1p-6
|
|
|
91
|
+ if(__builtin_expect(ax<0x33000000u, 0)){ // |x| < 0x1p-25
|
|
|
92
|
+ if(!ax) return x;
|
|
|
93
|
+ float res = __builtin_fmaf(x,-x,x);
|
|
|
94
|
+#ifdef CORE_MATH_SUPPORT_ERRNO
|
|
|
95
|
+ /* log1p(x) underflows exactly when |res| < 2^-126
|
|
|
96
|
+ and for x=-0x1.fffffcp-127 and rounding downwards */
|
|
|
97
|
+ if (__builtin_fabsf (res) < 0x1p-126f ||
|
|
|
98
|
+ (x == -0x1.fffffcp-127f && res == -0x1p-126f))
|
|
|
99
|
+ errno = ERANGE; // underflow
|
|
|
100
|
+#endif
|
|
|
101
|
+ return res;
|
|
|
102
|
+ }
|
|
|
103
|
+ double z2 = z*z, z4 = z2*z2;
|
|
|
104
|
+ double f = z2*((b[1] + z*b[2]) + z2*(b[3] + z*b[4]) + z4*((b[5] + z*b[6]) + z2*b[7]));
|
|
|
105
|
+ b64u64_u r = {.f = z + f};
|
|
|
106
|
+ if(__builtin_expect((r.u&0xfffffffll) == 0, 0)) r.f += 0x1p14*(f + (z - r.f));
|
|
|
107
|
+ return r.f;
|
|
|
108
|
+ } else {
|
|
|
109
|
+ if(__builtin_expect(ux>=0xbf800000u||ax>=0x7f800000u, 0)) return as_special(x);
|
|
|
110
|
+ b64u64_u tp = {.f = z + 1};
|
|
|
111
|
+ int e = tp.u>>52;
|
|
|
112
|
+ uint64_t m52 = tp.u&(~(uint64_t)0>>12);
|
|
|
113
|
+ unsigned j = (tp.u >> (52-5))&31;
|
|
|
114
|
+ e -= 0x3ff;
|
|
|
115
|
+ b64u64_u xd = {.u = m52 | ((uint64_t)0x3ff<<52)};
|
|
|
116
|
+ z = xd.f*x0[j] - 1;
|
|
|
117
|
+ static const double c[] =
|
|
|
118
|
+ {-0x1.3902c33434e7fp-43, 0x1.ffffffe1cbed5p-1, -0x1.ffffff7d1b014p-2, 0x1.5564e0ed3613ap-2, -0x1.0012232a00d4ap-2};
|
|
|
119
|
+ const double ln2 = 0x1.62e42fefa39efp-1;
|
|
|
120
|
+ double z2 = z*z, r = (ln2*e + lixb[j]) + z*((c[1] + z*c[2]) + z2*(c[3] + z*c[4]));
|
|
|
121
|
+ float ub = r, lb = r + 2.2e-11;
|
|
|
122
|
+ if(__builtin_expect(ub != lb, 0)){
|
|
|
123
|
+ double z4 = z2*z2, f = z*((b[0] + z*b[1]) + z2*(b[2] + z*b[3]) + z4*((b[4] + z*b[5]) + z2*(b[6] + z*b[7])));
|
|
|
124
|
+ const double ln2l = 0x1.7f7d1cf79abcap-20, ln2h = 0x1.62e4p-1;
|
|
|
125
|
+ double Lh = ln2h * e, Ll = ln2l * e, rl = f + Ll + lix[j];
|
|
|
126
|
+ b64u64_u tr = {.f = rl + Lh};
|
|
|
127
|
+ if(__builtin_expect((tr.u&0xfffffffll) == 0 , 0)){
|
|
|
128
|
+ if(x==-0x1.247ab0p-6) return -0x1.271f0ep-6f - 0x1p-31f;
|
|
|
129
|
+ if(x==-0x1.3a415ep-5) return -0x1.407112p-5f + 0x1p-30f;
|
|
|
130
|
+ if(x== 0x1.fb035ap-2) return 0x1.9bddc2p-2f + 0x1p-27f;
|
|
|
131
|
+ tr.f += 64*(rl + (Lh - tr.f));
|
|
|
132
|
+ } else if(rl+(Lh-tr.f)==0.0){
|
|
|
133
|
+ if(x== 0x1.b7fd86p-4) return 0x1.a1ece2p-4f + 0x1p-29f;
|
|
|
134
|
+ if(x==-0x1.3a415ep-5) return -0x1.407112p-5f + 0x1p-30f;
|
|
|
135
|
+ if(x== 0x1.43c7e2p-6) return 0x1.409f80p-6f + 0x1p-31f;
|
|
|
136
|
+ }
|
|
|
137
|
+ ub = tr.f;
|
|
|
138
|
+ }
|
|
|
139
|
+ return ub;
|
|
|
140
|
+ }
|
|
|
141
|
+} |