Raymond Toy pushed to branch issue-425-correctly-rounded-math-functions-single-float at cmucl / cmucl

Commits:

1 changed file:

Changes:

  • src/lisp/core-math/src/binary32/cosh/coshf.c
    1
    +/* Correctly-rounded hyperbolic cosine function for binary32 value.
    
    2
    +
    
    3
    +Copyright (c) 2022-2023 Alexei Sibidanov.
    
    4
    +
    
    5
    +This file is part of the CORE-MATH project
    
    6
    +(https://core-math.gitlabpages.inria.fr/).
    
    7
    +
    
    8
    +Permission is hereby granted, free of charge, to any person obtaining a copy
    
    9
    +of this software and associated documentation files (the "Software"), to deal
    
    10
    +in the Software without restriction, including without limitation the rights
    
    11
    +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    
    12
    +copies of the Software, and to permit persons to whom the Software is
    
    13
    +furnished to do so, subject to the following conditions:
    
    14
    +
    
    15
    +The above copyright notice and this permission notice shall be included in all
    
    16
    +copies or substantial portions of the Software.
    
    17
    +
    
    18
    +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    
    19
    +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    
    20
    +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    
    21
    +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    
    22
    +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    
    23
    +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    
    24
    +SOFTWARE.
    
    25
    +*/
    
    26
    +
    
    27
    +#include <stdint.h>
    
    28
    +#include <errno.h>
    
    29
    +
    
    30
    +// Warning: clang also defines __GNUC__
    
    31
    +#if defined(__GNUC__) && !defined(__clang__)
    
    32
    +#pragma GCC diagnostic ignored "-Wunknown-pragmas"
    
    33
    +#endif
    
    34
    +
    
    35
    +#pragma STDC FENV_ACCESS ON
    
    36
    +
    
    37
    +/* __builtin_roundeven was introduced in gcc 10:
    
    38
    +   https://gcc.gnu.org/gcc-10/changes.html,
    
    39
    +   and in clang 17 */
    
    40
    +#if ((defined(__GNUC__) && __GNUC__ >= 10) || (defined(__clang__) && __clang_major__ >= 17)) && (defined(__aarch64__) || defined(__x86_64__) || defined(__i386__))
    
    41
    +# define roundeven_finite(x) __builtin_roundeven (x)
    
    42
    +#else
    
    43
    +/* round x to nearest integer, breaking ties to even */
    
    44
    +static double
    
    45
    +roundeven_finite (double x)
    
    46
    +{
    
    47
    +  double ix;
    
    48
    +# if (defined(__GNUC__) || defined(__clang__)) && (defined(__AVX__) || defined(__SSE4_1__) || (__ARM_ARCH >= 8))
    
    49
    +#  if defined __AVX__
    
    50
    +   __asm__("vroundsd $0x8,%1,%1,%0":"=x"(ix):"x"(x));
    
    51
    +#  elif __ARM_ARCH >= 8
    
    52
    +   __asm__ ("frintn %d0, %d1":"=w"(ix):"w"(x));
    
    53
    +#  else /* __SSE4_1__ */
    
    54
    +   __asm__("roundsd $0x8,%1,%0":"=x"(ix):"x"(x));
    
    55
    +#  endif
    
    56
    +# else
    
    57
    +  ix = __builtin_round (x); /* nearest, away from 0 */
    
    58
    +  if (__builtin_fabs (ix - x) == 0.5)
    
    59
    +  {
    
    60
    +    /* if ix is odd, we should return ix-1 if x>0, and ix+1 if x<0 */
    
    61
    +    union { double f; uint64_t n; } u, v;
    
    62
    +    u.f = ix;
    
    63
    +    v.f = ix - __builtin_copysign (1.0, x);
    
    64
    +    if (__builtin_ctz (v.n) > __builtin_ctz (u.n))
    
    65
    +      ix = v.f;
    
    66
    +  }
    
    67
    +# endif
    
    68
    +  return ix;
    
    69
    +}
    
    70
    +#endif
    
    71
    +
    
    72
    +typedef union {float f; uint32_t u;} b32u32_u;
    
    73
    +typedef union {double f; uint64_t u;} b64u64_u;
    
    74
    +
    
    75
    +float cr_coshf(float x){
    
    76
    +  static const double c[] =
    
    77
    +    {1, 0x1.62e42fef4c4e7p-6, 0x1.ebfd1b232f475p-13, 0x1.c6b19384ecd93p-20};
    
    78
    +  static const double ch[] =
    
    79
    +    {1, 0x1.62e42fefa39efp-6, 0x1.ebfbdff82c58fp-13, 0x1.c6b08d702e0edp-20, 0x1.3b2ab6fb92e5ep-27,
    
    80
    +     0x1.5d886e6d54203p-35, 0x1.430976b8ce6efp-43};
    
    81
    +  static const uint64_t tb[] =
    
    82
    +    {0x3fe0000000000000, 0x3fe059b0d3158574, 0x3fe0b5586cf9890f, 0x3fe11301d0125b51,
    
    83
    +     0x3fe172b83c7d517b, 0x3fe1d4873168b9aa, 0x3fe2387a6e756238, 0x3fe29e9df51fdee1,
    
    84
    +     0x3fe306fe0a31b715, 0x3fe371a7373aa9cb, 0x3fe3dea64c123422, 0x3fe44e086061892d,
    
    85
    +     0x3fe4bfdad5362a27, 0x3fe5342b569d4f82, 0x3fe5ab07dd485429, 0x3fe6247eb03a5585,
    
    86
    +     0x3fe6a09e667f3bcd, 0x3fe71f75e8ec5f74, 0x3fe7a11473eb0187, 0x3fe82589994cce13,
    
    87
    +     0x3fe8ace5422aa0db, 0x3fe93737b0cdc5e5, 0x3fe9c49182a3f090, 0x3fea5503b23e255d,
    
    88
    +     0x3feae89f995ad3ad, 0x3feb7f76f2fb5e47, 0x3fec199bdd85529c, 0x3fecb720dcef9069,
    
    89
    +     0x3fed5818dcfba487, 0x3fedfc97337b9b5f, 0x3feea4afa2a490da, 0x3fef50765b6e4540};
    
    90
    +  const double iln2 = 0x1.71547652b82fep+5;
    
    91
    +  b32u32_u t = {.f = x};
    
    92
    +  double z = x;
    
    93
    +  uint32_t ax = t.u<<1;
    
    94
    +  if(__builtin_expect(ax>0x8565a9f8u, 0)){ // |x| >~ 89.4
    
    95
    +    if(ax>=0xff000000u) {
    
    96
    +      if(ax<<8) return x + x; // nan
    
    97
    +      return 1.0f/0.0f; // +-inf
    
    98
    +    }
    
    99
    +    float r = 2.0f*0x1.fffffep127f;
    
    100
    +#ifdef CORE_MATH_SUPPORT_ERRNO
    
    101
    +    errno = ERANGE;
    
    102
    +#endif
    
    103
    +    return r;
    
    104
    +  }
    
    105
    +  if(__builtin_expect(ax<0x7c000000u, 0)){ // |x| < 0.125
    
    106
    +    if(__builtin_expect(ax<0x74000000u, 0)){ // |x| < 0x1p-11
    
    107
    +      if(__builtin_expect(ax<0x66000000u, 0)) // |x| < 0x1p-24
    
    108
    +	return __builtin_fmaf(__builtin_fabsf(x), 0x1p-25, 1.0f);
    
    109
    +      return (0.5f*x)*x + 1.0f;
    
    110
    +    }
    
    111
    +    static const double cp[] =
    
    112
    +      {0x1.fffffffffffe3p-2, 0x1.55555555723cfp-5, 0x1.6c16bee4a5986p-10, 0x1.a0483fc0328f7p-16};
    
    113
    +    double z2 = z*z, z4 = z2*z2;
    
    114
    +    return 1.0 + z2*((cp[0] + z2*cp[1]) + z4*(cp[2] + z2*(cp[3])));
    
    115
    +  }
    
    116
    +  double a = iln2*z, ia = roundeven_finite(a), h = a - ia, h2 = h*h;
    
    117
    +  b64u64_u ja = {.f = ia + 0x1.8p52};
    
    118
    +  int64_t jp = ja.u, jm = -jp;
    
    119
    +  b64u64_u sp = {.u = tb[jp&31] + ((uint64_t)(jp>>5)<<52)}, sm = {.u = tb[jm&31] + ((uint64_t)(jm>>5)<<52)};
    
    120
    +  double te = c[0] + h2*c[2], to = (c[1] + h2*c[3]);
    
    121
    +  double rp = sp.f*(te + h*to), rm = sm.f*(te - h*to), r = rp + rm;
    
    122
    +  float ub = r, lb = r  - 1.45e-10*r;
    
    123
    +  if(__builtin_expect(ub != lb, 0)){
    
    124
    +    const double iln2h = 0x1.7154765p+5, iln2l = 0x1.5c17f0bbbe88p-26;
    
    125
    +    h = (iln2h*z - ia) + iln2l*z;
    
    126
    +    h2 = h*h;
    
    127
    +    te = ch[0] + h2*ch[2] + (h2*h2)*(ch[4] + h2*ch[6]);
    
    128
    +    to = ch[1] + h2*(ch[3] + h2*ch[5]);
    
    129
    +    r = sp.f*(te + h*to) + sm.f*(te - h*to);
    
    130
    +    ub = r;
    
    131
    +  }
    
    132
    +  return ub;
    
    133
    +}