forked from OSchip/llvm-project
[libc] Change sinf/cosf range reduction to mod pi/32 to be shared with tanf.
Change sinf/cosf range reduction to mod pi/32 to be shared with tanf, since polynomial approximations for tanf on subintervals of length pi/16 do not provide enough accuracy. Reviewed By: orex Differential Revision: https://reviews.llvm.org/D131652
This commit is contained in:
parent
13a784f368
commit
42f183792c
|
@ -200,7 +200,7 @@ Performance
|
||||||
| +-----------+-------------------+-----------+-------------------+ +------------+-------------------------+--------------+---------------+
|
| +-----------+-------------------+-----------+-------------------+ +------------+-------------------------+--------------+---------------+
|
||||||
| | LLVM libc | Reference (glibc) | LLVM libc | Reference (glibc) | | CPU | OS | Compiler | Special flags |
|
| | LLVM libc | Reference (glibc) | LLVM libc | Reference (glibc) | | CPU | OS | Compiler | Special flags |
|
||||||
+==============+===========+===================+===========+===================+=====================================+============+=========================+==============+===============+
|
+==============+===========+===================+===========+===================+=====================================+============+=========================+==============+===============+
|
||||||
| cosf | 14 | 32 | 56 | 59 | :math:`[0, 2\pi]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
|
| cosf | 13 | 32 | 53 | 59 | :math:`[0, 2\pi]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
|
||||||
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
||||||
| coshf | 23 | 20 | 73 | 49 | :math:`[-10, 10]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
|
| coshf | 23 | 20 | 73 | 49 | :math:`[-10, 10]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
|
||||||
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
||||||
|
@ -230,9 +230,9 @@ Performance
|
||||||
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
||||||
| log2f | 13 | 10 | 57 | 46 | :math:`[e^{-1}, e]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
|
| log2f | 13 | 10 | 57 | 46 | :math:`[e^{-1}, e]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
|
||||||
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
||||||
| sinf | 13 | 25 | 54 | 57 | :math:`[-\pi, \pi]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
|
| sinf | 12 | 25 | 51 | 57 | :math:`[-\pi, \pi]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
|
||||||
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
||||||
| sincosf | 20 | 30 | 62 | 68 | :math:`[-\pi, \pi]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
|
| sincosf | 19 | 30 | 57 | 68 | :math:`[-\pi, \pi]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
|
||||||
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
||||||
| sinhf | 23 | 64 | 73 | 141 | :math:`[-10, 10]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
|
| sinhf | 23 | 64 | 73 | 141 | :math:`[-10, 10]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
|
||||||
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
|
||||||
|
|
|
@ -53,21 +53,21 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) {
|
||||||
// Range reduction:
|
// Range reduction:
|
||||||
// For |x| > pi/16, we perform range reduction as follows:
|
// For |x| > pi/16, we perform range reduction as follows:
|
||||||
// Find k and y such that:
|
// Find k and y such that:
|
||||||
// x = (k + y) * pi/16
|
// x = (k + y) * pi/32
|
||||||
// k is an integer
|
// k is an integer
|
||||||
// |y| < 0.5
|
// |y| < 0.5
|
||||||
// For small range (|x| < 2^46 when FMA instructions are available, 2^22
|
// For small range (|x| < 2^45 when FMA instructions are available, 2^22
|
||||||
// otherwise), this is done by performing:
|
// otherwise), this is done by performing:
|
||||||
// k = round(x * 16/pi)
|
// k = round(x * 32/pi)
|
||||||
// y = x * 16/pi - k
|
// y = x * 32/pi - k
|
||||||
// For large range, we will omit all the higher parts of 16/pi such that the
|
// For large range, we will omit all the higher parts of 16/pi such that the
|
||||||
// least significant bits of their full products with x are larger than 31,
|
// least significant bits of their full products with x are larger than 63,
|
||||||
// since cos((k + y + 32*i) * pi/16) = cos(x + i * 2pi) = cos(x).
|
// since cos((k + y + 64*i) * pi/32) = cos(x + i * 2pi) = cos(x).
|
||||||
//
|
//
|
||||||
// When FMA instructions are not available, we store the digits of 16/pi in
|
// When FMA instructions are not available, we store the digits of 32/pi in
|
||||||
// chunks of 28-bit precision. This will make sure that the products:
|
// chunks of 28-bit precision. This will make sure that the products:
|
||||||
// x * SIXTEEN_OVER_PI_28[i] are all exact.
|
// x * THIRTYTWO_OVER_PI_28[i] are all exact.
|
||||||
// When FMA instructions are available, we simply store the digits of 16/pi in
|
// When FMA instructions are available, we simply store the digits of 32/pi in
|
||||||
// chunks of doubles (53-bit of precision).
|
// chunks of doubles (53-bit of precision).
|
||||||
// So when multiplying by the largest values of single precision, the
|
// So when multiplying by the largest values of single precision, the
|
||||||
// resulting output should be correct up to 2^(-208 + 128) ~ 2^-80. By the
|
// resulting output should be correct up to 2^(-208 + 128) ~ 2^-80. By the
|
||||||
|
@ -80,11 +80,11 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) {
|
||||||
//
|
//
|
||||||
// Once k and y are computed, we then deduce the answer by the cosine of sum
|
// Once k and y are computed, we then deduce the answer by the cosine of sum
|
||||||
// formula:
|
// formula:
|
||||||
// cos(x) = cos((k + y)*pi/16)
|
// cos(x) = cos((k + y)*pi/32)
|
||||||
// = cos(y*pi/16) * cos(k*pi/16) - sin(y*pi/16) * sin(k*pi/16)
|
// = cos(y*pi/32) * cos(k*pi/32) - sin(y*pi/32) * sin(k*pi/32)
|
||||||
// The values of sin(k*pi/16) and cos(k*pi/16) for k = 0..31 are precomputed
|
// The values of sin(k*pi/32) and cos(k*pi/32) for k = 0..63 are precomputed
|
||||||
// and stored using a vector of 32 doubles. Sin(y*pi/16) and cos(y*pi/16) are
|
// and stored using a vector of 32 doubles. Sin(y*pi/32) and cos(y*pi/32) are
|
||||||
// computed using degree-7 and degree-8 minimax polynomials generated by
|
// computed using degree-7 and degree-6 minimax polynomials generated by
|
||||||
// Sollya respectively.
|
// Sollya respectively.
|
||||||
|
|
||||||
// |x| < 0x1.0p-12f
|
// |x| < 0x1.0p-12f
|
||||||
|
@ -128,8 +128,8 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Combine the results with the sine of sum formula:
|
// Combine the results with the sine of sum formula:
|
||||||
// cos(x) = cos((k + y)*pi/16)
|
// cos(x) = cos((k + y)*pi/32)
|
||||||
// = cos(y*pi/16) * cos(k*pi/16) - sin(y*pi/16) * sin(k*pi/16)
|
// = cos(y*pi/32) * cos(k*pi/32) - sin(y*pi/32) * sin(k*pi/32)
|
||||||
// = cosm1_y * cos_k + sin_y * sin_k
|
// = cosm1_y * cos_k + sin_y * sin_k
|
||||||
// = (cosm1_y * cos_k + cos_k) + sin_y * sin_k
|
// = (cosm1_y * cos_k + cos_k) + sin_y * sin_k
|
||||||
double sin_k, cos_k, sin_y, cosm1_y;
|
double sin_k, cos_k, sin_y, cosm1_y;
|
||||||
|
|
|
@ -10,7 +10,6 @@
|
||||||
#define LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_H
|
#define LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_H
|
||||||
|
|
||||||
#include "src/__support/FPUtil/FPBits.h"
|
#include "src/__support/FPUtil/FPBits.h"
|
||||||
#include "src/__support/FPUtil/except_value_utils.h"
|
|
||||||
#include "src/__support/FPUtil/multiply_add.h"
|
#include "src/__support/FPUtil/multiply_add.h"
|
||||||
#include "src/__support/FPUtil/nearest_integer.h"
|
#include "src/__support/FPUtil/nearest_integer.h"
|
||||||
|
|
||||||
|
@ -22,83 +21,66 @@ static constexpr uint32_t FAST_PASS_BOUND = 0x4a80'0000U; // 2^22
|
||||||
|
|
||||||
static constexpr int N_ENTRIES = 8;
|
static constexpr int N_ENTRIES = 8;
|
||||||
|
|
||||||
// We choose to split bits of 16/pi into 28-bit precision pieces, so that the
|
// We choose to split bits of 32/pi into 28-bit precision pieces, so that the
|
||||||
// product of x * SIXTEEN_OVER_PI_28[i] is exact.
|
// product of x * THIRTYTWO_OVER_PI_28[i] is exact.
|
||||||
// These are generated by Sollya with:
|
// These are generated by Sollya with:
|
||||||
// > a1 = D(round(16/pi, 28, RN)); a1;
|
// > a1 = D(round(32/pi, 28, RN)); a1;
|
||||||
// > a2 = D(round(16/pi - a1, 28, RN)); a2;
|
// > a2 = D(round(32/pi - a1, 28, RN)); a2;
|
||||||
// > a3 = D(round(16/pi - a1 - a2, 28, RN)); a3;
|
// > a3 = D(round(32/pi - a1 - a2, 28, RN)); a3;
|
||||||
// > a4 = D(round(16/pi - a1 - a2 - a3, 28, RN)); a4;
|
// > a4 = D(round(32/pi - a1 - a2 - a3, 28, RN)); a4;
|
||||||
// ...
|
// ...
|
||||||
static constexpr double SIXTEEN_OVER_PI_28[N_ENTRIES] = {
|
static constexpr double THIRTYTWO_OVER_PI_28[N_ENTRIES] = {
|
||||||
0x1.45f306ep+2, -0x1.b1bbeaep-29, 0x1.3f84ebp-58, -0x1.7056592p-88,
|
0x1.45f306ep+3, -0x1.b1bbeaep-28, 0x1.3f84ebp-57, -0x1.7056592p-87,
|
||||||
0x1.c0db62ap-117, -0x1.4cd8778p-146, -0x1.bef806cp-175, 0x1.63abdecp-205};
|
0x1.c0db62ap-116, -0x1.4cd8778p-145, -0x1.bef806cp-174, 0x1.63abdecp-204};
|
||||||
|
|
||||||
// Exponents of the least significant bits of the corresponding entries in
|
// Exponents of the least significant bits of the corresponding entries in
|
||||||
// SIXTEEN_OVER_PI_28.
|
// THIRTYTWO_OVER_PI_28.
|
||||||
static constexpr int SIXTEEN_OVER_PI_28_LSB_EXP[N_ENTRIES] = {
|
static constexpr int THIRTYTWO_OVER_PI_28_LSB_EXP[N_ENTRIES] = {
|
||||||
-25, -56, -82, -115, -144, -171, -201, -231};
|
-24, -55, -81, -114, -143, -170, -200, -230};
|
||||||
|
|
||||||
// Return k and y, where
|
// Return k and y, where
|
||||||
// k = round(x * 16 / pi) and y = (x * 16 / pi) - k.
|
// k = round(x * 16 / pi) and y = (x * 16 / pi) - k.
|
||||||
static inline int64_t small_range_reduction(double x, double &y) {
|
static inline int64_t small_range_reduction(double x, double &y) {
|
||||||
double prod = x * SIXTEEN_OVER_PI_28[0];
|
double prod = x * THIRTYTWO_OVER_PI_28[0];
|
||||||
double kd = fputil::nearest_integer(prod);
|
double kd = fputil::nearest_integer(prod);
|
||||||
y = prod - kd;
|
y = prod - kd;
|
||||||
y = fputil::multiply_add(x, SIXTEEN_OVER_PI_28[1], y);
|
y = fputil::multiply_add(x, THIRTYTWO_OVER_PI_28[1], y);
|
||||||
y = fputil::multiply_add(x, SIXTEEN_OVER_PI_28[2], y);
|
y = fputil::multiply_add(x, THIRTYTWO_OVER_PI_28[2], y);
|
||||||
return static_cast<int64_t>(kd);
|
return static_cast<int64_t>(kd);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return k and y, where
|
// Return k and y, where
|
||||||
// k = round(x * 16 / pi) and y = (x * 16 / pi) - k.
|
// k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
|
||||||
// For large range, there are at most 2 parts of SIXTEEN_OVER_PI_28 contributing
|
// For large range, there are at most 2 parts of THIRTYTWO_OVER_PI_28
|
||||||
// to the lowest 5 binary digits (k & 31). If the least significant bit of
|
// contributing to the lowest 6 binary digits (k & 63). If the least
|
||||||
// x * the least significant bit of SIXTEEN_OVER_PI_28[i] >= 32, we can
|
// significant bit of x * the least significant bit of THIRTYTWO_OVER_PI_28[i]
|
||||||
// completely ignore SIXTEEN_OVER_PI_28[i].
|
// >= 64, we can completely ignore THIRTYTWO_OVER_PI_28[i].
|
||||||
static inline int64_t large_range_reduction(double x, int x_exp, double &y) {
|
static inline int64_t large_range_reduction(double x, int x_exp, double &y) {
|
||||||
int idx = 0;
|
int idx = 0;
|
||||||
y = 0;
|
y = 0;
|
||||||
int x_lsb_exp_m4 = x_exp - fputil::FloatProperties<float>::MANTISSA_WIDTH;
|
int x_lsb_exp_m4 = x_exp - fputil::FloatProperties<float>::MANTISSA_WIDTH;
|
||||||
|
|
||||||
// Skipping the first parts of 16/pi such that:
|
// Skipping the first parts of 32/pi such that:
|
||||||
// LSB of x * LSB of SIXTEEN_OVER_PI_28[i] >= 32.
|
// LSB of x * LSB of THIRTYTWO_OVER_PI_28[i] >= 32.
|
||||||
while (x_lsb_exp_m4 + SIXTEEN_OVER_PI_28_LSB_EXP[idx] > 4)
|
while (x_lsb_exp_m4 + THIRTYTWO_OVER_PI_28_LSB_EXP[idx] > 5)
|
||||||
++idx;
|
++idx;
|
||||||
|
|
||||||
double prod_hi = x * SIXTEEN_OVER_PI_28[idx];
|
double prod_hi = x * THIRTYTWO_OVER_PI_28[idx];
|
||||||
// Get the integral part of x * SIXTEEN_OVER_PI_28[idx]
|
// Get the integral part of x * THIRTYTWO_OVER_PI_28[idx]
|
||||||
double k_hi = fputil::nearest_integer(prod_hi);
|
double k_hi = fputil::nearest_integer(prod_hi);
|
||||||
// Get the fractional part of x * SIXTEEN_OVER_PI_28[idx]
|
// Get the fractional part of x * THIRTYTWO_OVER_PI_28[idx]
|
||||||
double frac = prod_hi - k_hi;
|
double frac = prod_hi - k_hi;
|
||||||
double prod_lo = fputil::multiply_add(x, SIXTEEN_OVER_PI_28[idx + 1], frac);
|
double prod_lo = fputil::multiply_add(x, THIRTYTWO_OVER_PI_28[idx + 1], frac);
|
||||||
double k_lo = fputil::nearest_integer(prod_lo);
|
double k_lo = fputil::nearest_integer(prod_lo);
|
||||||
|
|
||||||
// Now y is the fractional parts.
|
// Now y is the fractional parts.
|
||||||
y = prod_lo - k_lo;
|
y = prod_lo - k_lo;
|
||||||
y = fputil::multiply_add(x, SIXTEEN_OVER_PI_28[idx + 2], y);
|
y = fputil::multiply_add(x, THIRTYTWO_OVER_PI_28[idx + 2], y);
|
||||||
y = fputil::multiply_add(x, SIXTEEN_OVER_PI_28[idx + 3], y);
|
y = fputil::multiply_add(x, THIRTYTWO_OVER_PI_28[idx + 3], y);
|
||||||
|
|
||||||
return static_cast<int64_t>(k_hi) + static_cast<int64_t>(k_lo);
|
return static_cast<int64_t>(k_hi) + static_cast<int64_t>(k_lo);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Exceptional cases.
|
|
||||||
static constexpr int N_EXCEPTS = 3;
|
|
||||||
|
|
||||||
static constexpr fputil::ExceptionalValues<float, N_EXCEPTS> SinfExcepts{
|
|
||||||
/* inputs */ {
|
|
||||||
0x3fa7832a, // x = 0x1.4f0654p0
|
|
||||||
0x46199998, // x = 0x1.33333p13
|
|
||||||
0x55cafb2a, // x = 0x1.95f654p44
|
|
||||||
},
|
|
||||||
/* outputs (RZ, RU offset, RD offset, RN offset) */
|
|
||||||
{
|
|
||||||
{0x3f7741b5, 1, 0, 1}, // x = 0x1.4f0654p0, sin(x) = 0x1.ee836ap-1 (RZ)
|
|
||||||
{0xbeb1fa5d, 0, 1, 0}, // x = 0x1.33333p13, sin(x) = -0x1.63f4bap-2 (RZ)
|
|
||||||
{0xbf7e7a16, 0, 1,
|
|
||||||
1}, // x = 0x1.95f654p44, sin(x) = -0x1.fcf42cp-1 (RZ)
|
|
||||||
}};
|
|
||||||
|
|
||||||
} // namespace generic
|
} // namespace generic
|
||||||
|
|
||||||
} // namespace __llvm_libc
|
} // namespace __llvm_libc
|
||||||
|
|
|
@ -11,92 +11,76 @@
|
||||||
|
|
||||||
#include "src/__support/FPUtil/FMA.h"
|
#include "src/__support/FPUtil/FMA.h"
|
||||||
#include "src/__support/FPUtil/FPBits.h"
|
#include "src/__support/FPUtil/FPBits.h"
|
||||||
#include "src/__support/FPUtil/except_value_utils.h"
|
|
||||||
#include "src/__support/FPUtil/nearest_integer.h"
|
#include "src/__support/FPUtil/nearest_integer.h"
|
||||||
|
|
||||||
namespace __llvm_libc {
|
namespace __llvm_libc {
|
||||||
|
|
||||||
namespace fma {
|
namespace fma {
|
||||||
|
|
||||||
static constexpr uint32_t FAST_PASS_BOUND = 0x5680'0000U; // 2^46
|
static constexpr uint32_t FAST_PASS_BOUND = 0x5600'0000U; // 2^45
|
||||||
|
|
||||||
// Digits of 1/pi, generated by Sollya with:
|
// Digits of 32/pi, generated by Sollya with:
|
||||||
// > a0 = D(16/pi);
|
// > a0 = D(32/pi);
|
||||||
// > a1 = D(16/pi - a0);
|
// > a1 = D(32/pi - a0);
|
||||||
// > a2 = D(16/pi - a0 - a1);
|
// > a2 = D(32/pi - a0 - a1);
|
||||||
// > a3 = D(16/pi - a0 - a1 - a2);
|
// > a3 = D(32/pi - a0 - a1 - a2);
|
||||||
static constexpr double SIXTEEN_OVER_PI[5] = {
|
static constexpr double THIRTYTWO_OVER_PI[5] = {
|
||||||
0x1.45f306dc9c883p+2, -0x1.6b01ec5417056p-52, -0x1.6447e493ad4cep-106,
|
0x1.45f306dc9c883p+3, -0x1.6b01ec5417056p-51, -0x1.6447e493ad4cep-105,
|
||||||
0x1.e21c820ff28b2p-160, -0x1.508510ea79237p-215};
|
0x1.e21c820ff28b2p-159, -0x1.508510ea79237p-214};
|
||||||
|
|
||||||
// Return k and y, where
|
// Return k and y, where
|
||||||
// k = round(x * 16 / pi) and y = (x * 16 / pi) - k.
|
// k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
|
||||||
// Assume x is non-negative.
|
|
||||||
static inline int64_t small_range_reduction(double x, double &y) {
|
static inline int64_t small_range_reduction(double x, double &y) {
|
||||||
double kd = fputil::nearest_integer(x * SIXTEEN_OVER_PI[0]);
|
double kd = fputil::nearest_integer(x * THIRTYTWO_OVER_PI[0]);
|
||||||
y = fputil::fma(x, SIXTEEN_OVER_PI[0], -kd);
|
y = fputil::fma(x, THIRTYTWO_OVER_PI[0], -kd);
|
||||||
y = fputil::fma(x, SIXTEEN_OVER_PI[1], y);
|
y = fputil::fma(x, THIRTYTWO_OVER_PI[1], y);
|
||||||
return static_cast<int64_t>(kd);
|
return static_cast<int64_t>(kd);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return k and y, where
|
// Return k and y, where
|
||||||
// k = round(x * 16 / pi) and y = (x * 16 / pi) - k.
|
// k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
|
||||||
|
// This is used for sinf, cosf, sincosf.
|
||||||
static inline int64_t large_range_reduction(double x, int x_exp, double &y) {
|
static inline int64_t large_range_reduction(double x, int x_exp, double &y) {
|
||||||
// 2^46 <= |x| < 2^99
|
// 2^45 <= |x| < 2^99
|
||||||
if (x_exp < 99) {
|
if (x_exp < 99) {
|
||||||
// - When x < 2^99, the full exact product of x * SIXTEEN_OVER_PI[0]
|
// - When x < 2^99, the full exact product of x * THIRTYTWO_OVER_PI[0]
|
||||||
// contains at least one integral bit <= 2^4.
|
// contains at least one integral bit <= 2^5.
|
||||||
// - When 2^46 <= |x| < 2^56, the lowest 5 unit bits are contained
|
// - When 2^45 <= |x| < 2^55, the lowest 6 unit bits are contained
|
||||||
// in the last 10 bits of double(x * SIXTEEN_OVER_PI[0]).
|
// in the last 12 bits of double(x * THIRTYTWO_OVER_PI[0]).
|
||||||
// - When |x| >= 2^56, the LSB of double(x * SIXTEEN_OVER_PI[0]) is at least
|
// - When |x| >= 2^55, the LSB of double(x * THIRTYTWO_OVER_PI[0]) is at
|
||||||
// 32.
|
// least 2^6.
|
||||||
fputil::FPBits<double> prod_hi(x * SIXTEEN_OVER_PI[0]);
|
fputil::FPBits<double> prod_hi(x * THIRTYTWO_OVER_PI[0]);
|
||||||
prod_hi.bits &= (x_exp < 56) ? (~0xfffULL) : (~0ULL); // |x| < 2^56
|
prod_hi.bits &= (x_exp < 55) ? (~0xfffULL) : (~0ULL); // |x| < 2^55
|
||||||
double k_hi = fputil::nearest_integer(static_cast<double>(prod_hi));
|
double k_hi = fputil::nearest_integer(static_cast<double>(prod_hi));
|
||||||
double truncated_prod = fputil::fma(x, SIXTEEN_OVER_PI[0], -k_hi);
|
double truncated_prod = fputil::fma(x, THIRTYTWO_OVER_PI[0], -k_hi);
|
||||||
double prod_lo = fputil::fma(x, SIXTEEN_OVER_PI[1], truncated_prod);
|
double prod_lo = fputil::fma(x, THIRTYTWO_OVER_PI[1], truncated_prod);
|
||||||
double k_lo = fputil::nearest_integer(prod_lo);
|
double k_lo = fputil::nearest_integer(prod_lo);
|
||||||
y = fputil::fma(x, SIXTEEN_OVER_PI[1], truncated_prod - k_lo);
|
y = fputil::fma(x, THIRTYTWO_OVER_PI[1], truncated_prod - k_lo);
|
||||||
y = fputil::fma(x, SIXTEEN_OVER_PI[2], y);
|
y = fputil::fma(x, THIRTYTWO_OVER_PI[2], y);
|
||||||
y = fputil::fma(x, SIXTEEN_OVER_PI[3], y);
|
y = fputil::fma(x, THIRTYTWO_OVER_PI[3], y);
|
||||||
|
|
||||||
return static_cast<int64_t>(k_lo);
|
return static_cast<int64_t>(k_lo);
|
||||||
}
|
}
|
||||||
|
|
||||||
// - When x >= 2^110, the full exact product of x * SIXTEEN_OVER_PI[0] does
|
// - When x >= 2^110, the full exact product of x * THIRTYTWO_OVER_PI[0] does
|
||||||
// not contain any of the lowest 5 unit bits, so we can ignore it completely.
|
// not contain any of the lowest 6 unit bits, so we can ignore it completely.
|
||||||
// - When 2^99 <= |x| < 2^110, the lowest 5 unit bits are contained
|
// - When 2^99 <= |x| < 2^110, the lowest 6 unit bits are contained
|
||||||
// in the last 12 bits of double(x * SIXTEEN_OVER_PI[1]).
|
// in the last 12 bits of double(x * THIRTYTWO_OVER_PI[1]).
|
||||||
// - When |x| >= 2^110, the LSB of double(x * SIXTEEN_OVER_PI[1]) is at
|
// - When |x| >= 2^110, the LSB of double(x * THIRTYTWO_OVER_PI[1]) is at
|
||||||
// least 32.
|
// least 64.
|
||||||
fputil::FPBits<double> prod_hi(x * SIXTEEN_OVER_PI[1]);
|
fputil::FPBits<double> prod_hi(x * THIRTYTWO_OVER_PI[1]);
|
||||||
prod_hi.bits &= (x_exp < 110) ? (~0xfffULL) : (~0ULL); // |x| < 2^110
|
prod_hi.bits &= (x_exp < 110) ? (~0xfffULL) : (~0ULL); // |x| < 2^110
|
||||||
double k_hi = fputil::nearest_integer(static_cast<double>(prod_hi));
|
double k_hi = fputil::nearest_integer(static_cast<double>(prod_hi));
|
||||||
double truncated_prod = fputil::fma(x, SIXTEEN_OVER_PI[1], -k_hi);
|
double truncated_prod = fputil::fma(x, THIRTYTWO_OVER_PI[1], -k_hi);
|
||||||
double prod_lo = fputil::fma(x, SIXTEEN_OVER_PI[2], truncated_prod);
|
double prod_lo = fputil::fma(x, THIRTYTWO_OVER_PI[2], truncated_prod);
|
||||||
double k_lo = fputil::nearest_integer(prod_lo);
|
double k_lo = fputil::nearest_integer(prod_lo);
|
||||||
y = fputil::fma(x, SIXTEEN_OVER_PI[2], truncated_prod - k_lo);
|
y = fputil::fma(x, THIRTYTWO_OVER_PI[2], truncated_prod - k_lo);
|
||||||
y = fputil::fma(x, SIXTEEN_OVER_PI[3], y);
|
y = fputil::fma(x, THIRTYTWO_OVER_PI[3], y);
|
||||||
y = fputil::fma(x, SIXTEEN_OVER_PI[4], y);
|
y = fputil::fma(x, THIRTYTWO_OVER_PI[4], y);
|
||||||
|
|
||||||
return static_cast<int64_t>(k_lo);
|
return static_cast<int64_t>(k_lo);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Exceptional cases.
|
|
||||||
static constexpr int N_EXCEPTS = 2;
|
|
||||||
|
|
||||||
static constexpr fputil::ExceptionalValues<float, N_EXCEPTS> SinfExcepts{
|
|
||||||
/* inputs */ {
|
|
||||||
0x3fa7832a, // x = 0x1.4f0654p0
|
|
||||||
0x55cafb2a, // x = 0x1.95f654p44
|
|
||||||
},
|
|
||||||
/* outputs (RZ, RU offset, RD offset, RN offset) */
|
|
||||||
{
|
|
||||||
{0x3f7741b5, 1, 0, 1}, // x = 0x1.4f0654p0, sin(x) = 0x1.ee836ap-1 (RZ)
|
|
||||||
{0xbf7e7a16, 0, 1,
|
|
||||||
1}, // x = 0x1.95f654p44, sin(x) = -0x1.fcf42cp-1 (RZ)
|
|
||||||
}};
|
|
||||||
|
|
||||||
} // namespace fma
|
} // namespace fma
|
||||||
|
|
||||||
} // namespace __llvm_libc
|
} // namespace __llvm_libc
|
||||||
|
|
|
@ -18,51 +18,35 @@
|
||||||
namespace __llvm_libc {
|
namespace __llvm_libc {
|
||||||
|
|
||||||
// Exceptional values
|
// Exceptional values
|
||||||
static constexpr int N_EXCEPTS = 10;
|
static constexpr int N_EXCEPTS = 6;
|
||||||
|
|
||||||
static constexpr uint32_t EXCEPT_INPUTS[N_EXCEPTS] = {
|
static constexpr uint32_t EXCEPT_INPUTS[N_EXCEPTS] = {
|
||||||
0x3b5637f5, // x = 0x1.ac6feap-9
|
0x46199998, // x = 0x1.33333p13 x
|
||||||
0x3fa7832a, // x = 0x1.4f0654p0
|
0x55325019, // x = 0x1.64a032p43 x
|
||||||
0x46199998, // x = 0x1.33333p13
|
0x5922aa80, // x = 0x1.4555p51 x
|
||||||
0x55325019, // x = 0x1.64a032p43
|
0x5f18b878, // x = 0x1.3170fp63 x
|
||||||
0x55cafb2a, // x = 0x1.95f654p44
|
0x6115cb11, // x = 0x1.2b9622p67 x
|
||||||
0x5922aa80, // x = 0x1.4555p51
|
0x7beef5ef, // x = 0x1.ddebdep120 x
|
||||||
0x5aa4542c, // x = 0x1.48a858p54
|
|
||||||
0x5f18b878, // x = 0x1.3170fp63
|
|
||||||
0x6115cb11, // x = 0x1.2b9622p67
|
|
||||||
0x7beef5ef, // x = 0x1.ddebdep120
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static constexpr uint32_t EXCEPT_OUTPUTS_SIN[N_EXCEPTS][4] = {
|
static constexpr uint32_t EXCEPT_OUTPUTS_SIN[N_EXCEPTS][4] = {
|
||||||
{0x3b5637dc, 1, 0, 0}, // x = 0x1.ac6feap-9, sin(x) = 0x1.ac6fb8p-9 (RZ)
|
|
||||||
{0x3f7741b5, 1, 0, 1}, // x = 0x1.4f0654p0, sin(x) = 0x1.ee836ap-1 (RZ)
|
|
||||||
{0xbeb1fa5d, 0, 1, 0}, // x = 0x1.33333p13, sin(x) = -0x1.63f4bap-2 (RZ)
|
{0xbeb1fa5d, 0, 1, 0}, // x = 0x1.33333p13, sin(x) = -0x1.63f4bap-2 (RZ)
|
||||||
{0xbf171adf, 0, 1, 1}, // x = 0x1.64a032p43, sin(x) = -0x1.2e35bep-1 (RZ)
|
{0xbf171adf, 0, 1, 1}, // x = 0x1.64a032p43, sin(x) = -0x1.2e35bep-1 (RZ)
|
||||||
{0xbf7e7a16, 0, 1, 1}, // x = 0x1.95f654p44, sin(x) = -0x1.fcf42cp-1 (RZ)
|
|
||||||
{0xbf587521, 0, 1, 1}, // x = 0x1.4555p51, sin(x) = -0x1.b0ea42p-1 (RZ)
|
{0xbf587521, 0, 1, 1}, // x = 0x1.4555p51, sin(x) = -0x1.b0ea42p-1 (RZ)
|
||||||
{0x3f5f5646, 1, 0, 0}, // x = 0x1.48a858p54, sin(x) = 0x1.beac8cp-1 (RZ)
|
|
||||||
{0x3dad60f6, 1, 0, 1}, // x = 0x1.3170fp63, sin(x) = 0x1.5ac1ecp-4 (RZ)
|
{0x3dad60f6, 1, 0, 1}, // x = 0x1.3170fp63, sin(x) = 0x1.5ac1ecp-4 (RZ)
|
||||||
{0xbe7cc1e0, 0, 1, 1}, // x = 0x1.2b9622p67, sin(x) = -0x1.f983cp-3 (RZ)
|
{0xbe7cc1e0, 0, 1, 1}, // x = 0x1.2b9622p67, sin(x) = -0x1.f983cp-3 (RZ)
|
||||||
{0xbf587d1b, 0, 1, 1}, // x = 0x1.ddebdep120, sin(x) = -0x1.b0fa36p-1 (RZ)
|
{0xbf587d1b, 0, 1, 1}, // x = 0x1.ddebdep120, sin(x) = -0x1.b0fa36p-1 (RZ)
|
||||||
};
|
};
|
||||||
|
|
||||||
static constexpr uint32_t EXCEPT_OUTPUTS_COS[N_EXCEPTS][4] = {
|
static constexpr uint32_t EXCEPT_OUTPUTS_COS[N_EXCEPTS][4] = {
|
||||||
{0x3f7fffa6, 1, 0, 0}, // x = 0x1.ac6feap-9, cos(x) = 0x1.ffff4cp-1 (RZ)
|
|
||||||
{0x3e84aabf, 1, 0, 1}, // x = 0x1.4f0654p0, cos(x) = 0x1.09557ep-2 (RZ)
|
|
||||||
{0xbf70090b, 0, 1, 0}, // x = 0x1.33333p13, cos(x) = -0x1.e01216p-1 (RZ)
|
{0xbf70090b, 0, 1, 0}, // x = 0x1.33333p13, cos(x) = -0x1.e01216p-1 (RZ)
|
||||||
{0x3f4ea5d2, 1, 0, 0}, // x = 0x1.64a032p43, cos(x) = 0x1.9d4ba4p-1 (RZ)
|
{0x3f4ea5d2, 1, 0, 0}, // x = 0x1.64a032p43, cos(x) = 0x1.9d4ba4p-1 (RZ)
|
||||||
{0x3ddf11f3, 1, 0, 1}, // x = 0x1.95f654p44, cos(x) = 0x1.be23e6p-4 (RZ)
|
|
||||||
{0x3f08aebe, 1, 0, 1}, // x = 0x1.4555p51, cos(x) = 0x1.115d7cp-1 (RZ)
|
{0x3f08aebe, 1, 0, 1}, // x = 0x1.4555p51, cos(x) = 0x1.115d7cp-1 (RZ)
|
||||||
{0x3efa40a4, 1, 0, 0}, // x = 0x1.48a858p54, cos(x) = 0x1.f48148p-2 (RZ)
|
|
||||||
{0x3f7f14bb, 1, 0, 0}, // x = 0x1.3170fp63, cos(x) = 0x1.fe2976p-1 (RZ)
|
{0x3f7f14bb, 1, 0, 0}, // x = 0x1.3170fp63, cos(x) = 0x1.fe2976p-1 (RZ)
|
||||||
{0x3f78142e, 1, 0, 1}, // x = 0x1.2b9622p67, cos(x) = 0x1.f0285cp-1 (RZ)
|
{0x3f78142e, 1, 0, 1}, // x = 0x1.2b9622p67, cos(x) = 0x1.f0285cp-1 (RZ)
|
||||||
{0x3f08a21c, 1, 0, 0}, // x = 0x1.ddebdep120, cos(x) = 0x1.114438p-1 (RZ)
|
{0x3f08a21c, 1, 0, 0}, // x = 0x1.ddebdep120, cos(x) = 0x1.114438p-1 (RZ)
|
||||||
};
|
};
|
||||||
|
|
||||||
// Fast sincosf implementation. Worst-case ULP is 0.5607, maximum relative
|
|
||||||
// error is 0.5303 * 2^-23. A single-step range reduction is used for
|
|
||||||
// small values. Large inputs have their range reduced using fast integer
|
|
||||||
// arithmetic.
|
|
||||||
LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinp, float *cosp)) {
|
LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinp, float *cosp)) {
|
||||||
using FPBits = typename fputil::FPBits<float>;
|
using FPBits = typename fputil::FPBits<float>;
|
||||||
FPBits xbits(x);
|
FPBits xbits(x);
|
||||||
|
@ -71,25 +55,25 @@ LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinp, float *cosp)) {
|
||||||
double xd = static_cast<double>(x);
|
double xd = static_cast<double>(x);
|
||||||
|
|
||||||
// Range reduction:
|
// Range reduction:
|
||||||
// For |x| > pi/16, we perform range reduction as follows:
|
// For |x| >= 2^-12, we perform range reduction as follows:
|
||||||
// Find k and y such that:
|
// Find k and y such that:
|
||||||
// x = (k + y) * pi/16
|
// x = (k + y) * pi/32
|
||||||
// k is an integer
|
// k is an integer
|
||||||
// |y| < 0.5
|
// |y| < 0.5
|
||||||
// For small range (|x| < 2^46 when FMA instructions are available, 2^22
|
// For small range (|x| < 2^45 when FMA instructions are available, 2^22
|
||||||
// otherwise), this is done by performing:
|
// otherwise), this is done by performing:
|
||||||
// k = round(x * 16/pi)
|
// k = round(x * 32/pi)
|
||||||
// y = x * 16/pi - k
|
// y = x * 32/pi - k
|
||||||
// For large range, we will omit all the higher parts of 16/pi such that the
|
// For large range, we will omit all the higher parts of 32/pi such that the
|
||||||
// least significant bits of their full products with x are larger than 31,
|
// least significant bits of their full products with x are larger than 63,
|
||||||
// since:
|
// since:
|
||||||
// sin((k + y + 32*i) * pi/16) = sin(x + i * 2pi) = sin(x), and
|
// sin((k + y + 64*i) * pi/32) = sin(x + i * 2pi) = sin(x), and
|
||||||
// cos((k + y + 32*i) * pi/16) = cos(x + i * 2pi) = cos(x).
|
// cos((k + y + 64*i) * pi/32) = cos(x + i * 2pi) = cos(x).
|
||||||
//
|
//
|
||||||
// When FMA instructions are not available, we store the digits of 16/pi in
|
// When FMA instructions are not available, we store the digits of 32/pi in
|
||||||
// chunks of 28-bit precision. This will make sure that the products:
|
// chunks of 28-bit precision. This will make sure that the products:
|
||||||
// x * SIXTEEN_OVER_PI_28[i] are all exact.
|
// x * THIRTYTWO_OVER_PI_28[i] are all exact.
|
||||||
// When FMA instructions are available, we simply store the digits of 16/pi in
|
// When FMA instructions are available, we simply store the digits of326/pi in
|
||||||
// chunks of doubles (53-bit of precision).
|
// chunks of doubles (53-bit of precision).
|
||||||
// So when multiplying by the largest values of single precision, the
|
// So when multiplying by the largest values of single precision, the
|
||||||
// resulting output should be correct up to 2^(-208 + 128) ~ 2^-80. By the
|
// resulting output should be correct up to 2^(-208 + 128) ~ 2^-80. By the
|
||||||
|
@ -102,13 +86,13 @@ LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinp, float *cosp)) {
|
||||||
//
|
//
|
||||||
// Once k and y are computed, we then deduce the answer by the sine and cosine
|
// Once k and y are computed, we then deduce the answer by the sine and cosine
|
||||||
// of sum formulas:
|
// of sum formulas:
|
||||||
// sin(x) = sin((k + y)*pi/16)
|
// sin(x) = sin((k + y)*pi/32)
|
||||||
// = sin(y*pi/16) * cos(k*pi/16) + cos(y*pi/16) * sin(k*pi/16)
|
// = sin(y*pi/32) * cos(k*pi/32) + cos(y*pi/32) * sin(k*pi/32)
|
||||||
// cos(x) = cos((k + y)*pi/16)
|
// cos(x) = cos((k + y)*pi/32)
|
||||||
// = cos(y*pi/16) * cos(k*pi/16) - sin(y*pi/16) * sin(k*pi/16)
|
// = cos(y*pi/32) * cos(k*pi/32) - sin(y*pi/32) * sin(k*pi/32)
|
||||||
// The values of sin(k*pi/16) and cos(k*pi/16) for k = 0..31 are precomputed
|
// The values of sin(k*pi/32) and cos(k*pi/32) for k = 0..63 are precomputed
|
||||||
// and stored using a vector of 32 doubles. Sin(y*pi/16) and cos(y*pi/16) are
|
// and stored using a vector of 32 doubles. Sin(y*pi/32) and cos(y*pi/32) are
|
||||||
// computed using degree-7 and degree-8 minimax polynomials generated by
|
// computed using degree-7 and degree-6 minimax polynomials generated by
|
||||||
// Sollya respectively.
|
// Sollya respectively.
|
||||||
|
|
||||||
// |x| < 0x1.0p-12f
|
// |x| < 0x1.0p-12f
|
||||||
|
@ -195,12 +179,12 @@ LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinp, float *cosp)) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Combine the results with the sine and cosine of sum formulas:
|
// Combine the results with the sine and cosine of sum formulas:
|
||||||
// sin(x) = sin((k + y)*pi/16)
|
// sin(x) = sin((k + y)*pi/32)
|
||||||
// = sin(y*pi/16) * cos(k*pi/16) + cos(y*pi/16) * sin(k*pi/16)
|
// = sin(y*pi/32) * cos(k*pi/32) + cos(y*pi/32) * sin(k*pi/32)
|
||||||
// = sin_y * cos_k + (1 + cosm1_y) * sin_k
|
// = sin_y * cos_k + (1 + cosm1_y) * sin_k
|
||||||
// = sin_y * cos_k + (cosm1_y * sin_k + sin_k)
|
// = sin_y * cos_k + (cosm1_y * sin_k + sin_k)
|
||||||
// cos(x) = cos((k + y)*pi/16)
|
// cos(x) = cos((k + y)*pi/32)
|
||||||
// = cos(y*pi/16) * cos(k*pi/16) - sin(y*pi/16) * sin(k*pi/16)
|
// = cos(y*pi/32) * cos(k*pi/32) - sin(y*pi/32) * sin(k*pi/32)
|
||||||
// = cosm1_y * cos_k + sin_y * sin_k
|
// = cosm1_y * cos_k + sin_y * sin_k
|
||||||
// = (cosm1_y * cos_k + cos_k) + sin_y * sin_k
|
// = (cosm1_y * cos_k + cos_k) + sin_y * sin_k
|
||||||
double sin_k, cos_k, sin_y, cosm1_y;
|
double sin_k, cos_k, sin_y, cosm1_y;
|
||||||
|
|
|
@ -29,22 +29,34 @@ using __llvm_libc::generic::small_range_reduction;
|
||||||
|
|
||||||
namespace __llvm_libc {
|
namespace __llvm_libc {
|
||||||
|
|
||||||
// Lookup table for sin(k * pi / 16) with k = 0, ..., 31.
|
// Lookup table for sin(k * pi / 32) with k = 0, ..., 63.
|
||||||
// Table is generated with Sollya as follow:
|
// Table is generated with Sollya as follow:
|
||||||
// > display = hexadecimal;
|
// > display = hexadecimal;
|
||||||
// > for k from 0 to 31 do { D(sin(k * pi/16)); };
|
// > for k from 0 to 63 do { D(sin(k * pi/32)); };
|
||||||
const double SIN_K_PI_OVER_16[32] = {
|
const double SIN_K_PI_OVER_32[64] = {
|
||||||
0x0.0000000000000p+0, 0x1.8f8b83c69a60bp-3, 0x1.87de2a6aea963p-2,
|
0x0.0000000000000p+0, 0x1.917a6bc29b42cp-4, 0x1.8f8b83c69a60bp-3,
|
||||||
0x1.1c73b39ae68c8p-1, 0x1.6a09e667f3bcdp-1, 0x1.a9b66290ea1a3p-1,
|
0x1.294062ed59f06p-2, 0x1.87de2a6aea963p-2, 0x1.e2b5d3806f63bp-2,
|
||||||
0x1.d906bcf328d46p-1, 0x1.f6297cff75cb0p-1, 0x1.0000000000000p+0,
|
0x1.1c73b39ae68c8p-1, 0x1.44cf325091dd6p-1, 0x1.6a09e667f3bcdp-1,
|
||||||
0x1.f6297cff75cb0p-1, 0x1.d906bcf328d46p-1, 0x1.a9b66290ea1a3p-1,
|
0x1.8bc806b151741p-1, 0x1.a9b66290ea1a3p-1, 0x1.c38b2f180bdb1p-1,
|
||||||
0x1.6a09e667f3bcdp-1, 0x1.1c73b39ae68c8p-1, 0x1.87de2a6aea963p-2,
|
0x1.d906bcf328d46p-1, 0x1.e9f4156c62ddap-1, 0x1.f6297cff75cbp-1,
|
||||||
0x1.8f8b83c69a60bp-3, 0x0.0000000000000p+0, -0x1.8f8b83c69a60bp-3,
|
0x1.fd88da3d12526p-1, 0x1.0000000000000p+0, 0x1.fd88da3d12526p-1,
|
||||||
-0x1.87de2a6aea963p-2, -0x1.1c73b39ae68c8p-1, -0x1.6a09e667f3bcdp-1,
|
0x1.f6297cff75cbp-1, 0x1.e9f4156c62ddap-1, 0x1.d906bcf328d46p-1,
|
||||||
-0x1.a9b66290ea1a3p-1, -0x1.d906bcf328d46p-1, -0x1.f6297cff75cb0p-1,
|
0x1.c38b2f180bdb1p-1, 0x1.a9b66290ea1a3p-1, 0x1.8bc806b151741p-1,
|
||||||
-0x1.0000000000000p+0, -0x1.f6297cff75cb0p-1, -0x1.d906bcf328d46p-1,
|
0x1.6a09e667f3bcdp-1, 0x1.44cf325091dd6p-1, 0x1.1c73b39ae68c8p-1,
|
||||||
-0x1.a9b66290ea1a3p-1, -0x1.6a09e667f3bcdp-1, -0x1.1c73b39ae68c8p-1,
|
0x1.e2b5d3806f63bp-2, 0x1.87de2a6aea963p-2, 0x1.294062ed59f06p-2,
|
||||||
-0x1.87de2a6aea963p-2, -0x1.8f8b83c69a60bp-3};
|
0x1.8f8b83c69a60bp-3, 0x1.917a6bc29b42cp-4, 0x0.0000000000000p+0,
|
||||||
|
-0x1.917a6bc29b42cp-4, -0x1.8f8b83c69a60bp-3, -0x1.294062ed59f06p-2,
|
||||||
|
-0x1.87de2a6aea963p-2, -0x1.e2b5d3806f63bp-2, -0x1.1c73b39ae68c8p-1,
|
||||||
|
-0x1.44cf325091dd6p-1, -0x1.6a09e667f3bcdp-1, -0x1.8bc806b151741p-1,
|
||||||
|
-0x1.a9b66290ea1a3p-1, -0x1.c38b2f180bdb1p-1, -0x1.d906bcf328d46p-1,
|
||||||
|
-0x1.e9f4156c62ddap-1, -0x1.f6297cff75cbp-1, -0x1.fd88da3d12526p-1,
|
||||||
|
-0x1.0000000000000p+0, -0x1.fd88da3d12526p-1, -0x1.f6297cff75cbp-1,
|
||||||
|
-0x1.e9f4156c62ddap-1, -0x1.d906bcf328d46p-1, -0x1.c38b2f180bdb1p-1,
|
||||||
|
-0x1.a9b66290ea1a3p-1, -0x1.8bc806b151741p-1, -0x1.6a09e667f3bcdp-1,
|
||||||
|
-0x1.44cf325091dd6p-1, -0x1.1c73b39ae68c8p-1, -0x1.e2b5d3806f63bp-2,
|
||||||
|
-0x1.87de2a6aea963p-2, -0x1.294062ed59f06p-2, -0x1.8f8b83c69a60bp-3,
|
||||||
|
-0x1.917a6bc29b42cp-4,
|
||||||
|
};
|
||||||
|
|
||||||
static inline void sincosf_eval(double xd, uint32_t x_abs, double &sin_k,
|
static inline void sincosf_eval(double xd, uint32_t x_abs, double &sin_k,
|
||||||
double &cos_k, double &sin_y, double &cosm1_y) {
|
double &cos_k, double &sin_y, double &cosm1_y) {
|
||||||
|
@ -58,29 +70,29 @@ static inline void sincosf_eval(double xd, uint32_t x_abs, double &sin_k,
|
||||||
k = large_range_reduction(xd, x_bits.get_exponent(), y);
|
k = large_range_reduction(xd, x_bits.get_exponent(), y);
|
||||||
}
|
}
|
||||||
|
|
||||||
// After range reduction, k = round(x * 16 / pi) and y = (x * 16 / pi) - k.
|
// After range reduction, k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
|
||||||
// So k is an integer and -0.5 <= y <= 0.5.
|
// So k is an integer and -0.5 <= y <= 0.5.
|
||||||
// Then sin(x) = sin((k + y)*pi/16)
|
// Then sin(x) = sin((k + y)*pi/32)
|
||||||
// = sin(y*pi/16) * cos(k*pi/16) + cos(y*pi/16) * sin(k*pi/16)
|
// = sin(y*pi/32) * cos(k*pi/32) + cos(y*pi/32) * sin(k*pi/32)
|
||||||
|
|
||||||
sin_k = SIN_K_PI_OVER_16[k & 31];
|
sin_k = SIN_K_PI_OVER_32[k & 63];
|
||||||
// cos(k * pi/16) = sin(k * pi/16 + pi/2) = sin((k + 8) * pi/16).
|
// cos(k * pi/32) = sin(k * pi/32 + pi/2) = sin((k + 16) * pi/32).
|
||||||
// cos_k = y * cos(k * pi/16)
|
// cos_k = cos(k * pi/32)
|
||||||
cos_k = SIN_K_PI_OVER_16[(k + 8) & 31];
|
cos_k = SIN_K_PI_OVER_32[(k + 16) & 63];
|
||||||
|
|
||||||
double ysq = y * y;
|
double ysq = y * y;
|
||||||
|
|
||||||
// Degree-6 minimax even polynomial for sin(y*pi/16)/y generated by Sollya
|
// Degree-6 minimax even polynomial for sin(y*pi/32)/y generated by Sollya
|
||||||
// with:
|
// with:
|
||||||
// > Q = fpminimax(sin(y*pi/16)/y, [|0, 2, 4, 6|], [|D...|], [0, 0.5]);
|
// > Q = fpminimax(sin(y*pi/32)/y, [|0, 2, 4, 6|], [|D...|], [0, 0.5]);
|
||||||
sin_y = y * fputil::polyeval(ysq, 0x1.921fb54442d17p-3, -0x1.4abbce6256adp-10,
|
sin_y =
|
||||||
0x1.466bc5a5ac6b3p-19, -0x1.32bdcb4207562p-29);
|
y * fputil::polyeval(ysq, 0x1.921fb54442d18p-4, -0x1.4abbce625abb1p-13,
|
||||||
// Degree-8 minimax even polynomial for cos(y*pi/16) generated by Sollya with:
|
0x1.466bc624f2776p-24, -0x1.32c3a619d4a7ep-36);
|
||||||
// > P = fpminimax(cos(x*pi/16), [|0, 2, 4, 6, 8|], [|1, D...|], [0, 0.5]);
|
// Degree-8 minimax even polynomial for cos(y*pi/32) generated by Sollya with:
|
||||||
// Note that cosm1_y = cos(y*pi/16) - 1.
|
// > P = fpminimax(cos(x*pi/32), [|0, 2, 4, 6|], [|1, D...|], [0, 0.5]);
|
||||||
cosm1_y =
|
// Note that cosm1_y = cos(y*pi/32) - 1.
|
||||||
ysq * fputil::polyeval(ysq, -0x1.3bd3cc9be45dcp-6, 0x1.03c1f081b08ap-14,
|
cosm1_y = ysq * fputil::polyeval(ysq, -0x1.3bd3cc9be430bp-8,
|
||||||
-0x1.55d3c6fb0fb6ep-24, 0x1.e1d3d60f58873p-35);
|
0x1.03c1f070c2e27p-18, -0x1.55cc84bd942p-30);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace __llvm_libc
|
} // namespace __llvm_libc
|
||||||
|
|
|
@ -12,7 +12,6 @@
|
||||||
#include "src/__support/FPUtil/FEnvImpl.h"
|
#include "src/__support/FPUtil/FEnvImpl.h"
|
||||||
#include "src/__support/FPUtil/FPBits.h"
|
#include "src/__support/FPUtil/FPBits.h"
|
||||||
#include "src/__support/FPUtil/PolyEval.h"
|
#include "src/__support/FPUtil/PolyEval.h"
|
||||||
#include "src/__support/FPUtil/except_value_utils.h"
|
|
||||||
#include "src/__support/FPUtil/multiply_add.h"
|
#include "src/__support/FPUtil/multiply_add.h"
|
||||||
#include "src/__support/common.h"
|
#include "src/__support/common.h"
|
||||||
|
|
||||||
|
@ -20,14 +19,8 @@
|
||||||
|
|
||||||
#if defined(LIBC_TARGET_HAS_FMA)
|
#if defined(LIBC_TARGET_HAS_FMA)
|
||||||
#include "range_reduction_fma.h"
|
#include "range_reduction_fma.h"
|
||||||
// using namespace __llvm_libc::fma;
|
|
||||||
using __llvm_libc::fma::N_EXCEPTS;
|
|
||||||
using __llvm_libc::fma::SinfExcepts;
|
|
||||||
#else
|
#else
|
||||||
#include "range_reduction.h"
|
#include "range_reduction.h"
|
||||||
// using namespace __llvm_libc::generic;
|
|
||||||
using __llvm_libc::generic::N_EXCEPTS;
|
|
||||||
using __llvm_libc::generic::SinfExcepts;
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
namespace __llvm_libc {
|
namespace __llvm_libc {
|
||||||
|
@ -41,23 +34,23 @@ LLVM_LIBC_FUNCTION(float, sinf, (float x)) {
|
||||||
double xd = static_cast<double>(x);
|
double xd = static_cast<double>(x);
|
||||||
|
|
||||||
// Range reduction:
|
// Range reduction:
|
||||||
// For |x| > pi/16, we perform range reduction as follows:
|
// For |x| > pi/32, we perform range reduction as follows:
|
||||||
// Find k and y such that:
|
// Find k and y such that:
|
||||||
// x = (k + y) * pi/16
|
// x = (k + y) * pi/32
|
||||||
// k is an integer
|
// k is an integer
|
||||||
// |y| < 0.5
|
// |y| < 0.5
|
||||||
// For small range (|x| < 2^46 when FMA instructions are available, 2^22
|
// For small range (|x| < 2^45 when FMA instructions are available, 2^22
|
||||||
// otherwise), this is done by performing:
|
// otherwise), this is done by performing:
|
||||||
// k = round(x * 16/pi)
|
// k = round(x * 32/pi)
|
||||||
// y = x * 16/pi - k
|
// y = x * 32/pi - k
|
||||||
// For large range, we will omit all the higher parts of 16/pi such that the
|
// For large range, we will omit all the higher parts of 32/pi such that the
|
||||||
// least significant bits of their full products with x are larger than 31,
|
// least significant bits of their full products with x are larger than 63,
|
||||||
// since sin((k + y + 32*i) * pi/16) = sin(x + i * 2pi) = sin(x).
|
// since sin((k + y + 64*i) * pi/32) = sin(x + i * 2pi) = sin(x).
|
||||||
//
|
//
|
||||||
// When FMA instructions are not available, we store the digits of 16/pi in
|
// When FMA instructions are not available, we store the digits of 32/pi in
|
||||||
// chunks of 28-bit precision. This will make sure that the products:
|
// chunks of 28-bit precision. This will make sure that the products:
|
||||||
// x * SIXTEEN_OVER_PI_28[i] are all exact.
|
// x * THIRTYTWO_OVER_PI_28[i] are all exact.
|
||||||
// When FMA instructions are available, we simply store the digits of 16/pi in
|
// When FMA instructions are available, we simply store the digits of 32/pi in
|
||||||
// chunks of doubles (53-bit of precision).
|
// chunks of doubles (53-bit of precision).
|
||||||
// So when multiplying by the largest values of single precision, the
|
// So when multiplying by the largest values of single precision, the
|
||||||
// resulting output should be correct up to 2^(-208 + 128) ~ 2^-80. By the
|
// resulting output should be correct up to 2^(-208 + 128) ~ 2^-80. By the
|
||||||
|
@ -70,11 +63,11 @@ LLVM_LIBC_FUNCTION(float, sinf, (float x)) {
|
||||||
//
|
//
|
||||||
// Once k and y are computed, we then deduce the answer by the sine of sum
|
// Once k and y are computed, we then deduce the answer by the sine of sum
|
||||||
// formula:
|
// formula:
|
||||||
// sin(x) = sin((k + y)*pi/16)
|
// sin(x) = sin((k + y)*pi/32)
|
||||||
// = sin(y*pi/16) * cos(k*pi/16) + cos(y*pi/16) * sin(k*pi/16)
|
// = sin(y*pi/32) * cos(k*pi/32) + cos(y*pi/32) * sin(k*pi/32)
|
||||||
// The values of sin(k*pi/16) and cos(k*pi/16) for k = 0..31 are precomputed
|
// The values of sin(k*pi/32) and cos(k*pi/32) for k = 0..31 are precomputed
|
||||||
// and stored using a vector of 32 doubles. Sin(y*pi/16) and cos(y*pi/16) are
|
// and stored using a vector of 32 doubles. Sin(y*pi/32) and cos(y*pi/32) are
|
||||||
// computed using degree-7 and degree-8 minimax polynomials generated by
|
// computed using degree-7 and degree-6 minimax polynomials generated by
|
||||||
// Sollya respectively.
|
// Sollya respectively.
|
||||||
|
|
||||||
// |x| <= pi/16
|
// |x| <= pi/16
|
||||||
|
@ -129,12 +122,13 @@ LLVM_LIBC_FUNCTION(float, sinf, (float x)) {
|
||||||
return xd * result;
|
return xd * result;
|
||||||
}
|
}
|
||||||
|
|
||||||
using ExceptChecker = typename fputil::ExceptionChecker<float, N_EXCEPTS>;
|
if (unlikely(x_abs == 0x4619'9998U)) { // x = 0x1.33333p13
|
||||||
{
|
float r = -0x1.63f4bap-2f;
|
||||||
float result;
|
int rounding = fputil::get_round();
|
||||||
if (ExceptChecker::check_odd_func(SinfExcepts, x_abs, xbits.get_sign(),
|
bool sign = xbits.get_sign();
|
||||||
result))
|
if ((rounding == FE_DOWNWARD && !sign) || (rounding == FE_UPWARD && sign))
|
||||||
return result;
|
r = -0x1.63f4bcp-2f;
|
||||||
|
return xbits.get_sign() ? -r : r;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (unlikely(x_abs >= 0x7f80'0000U)) {
|
if (unlikely(x_abs >= 0x7f80'0000U)) {
|
||||||
|
@ -147,8 +141,8 @@ LLVM_LIBC_FUNCTION(float, sinf, (float x)) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Combine the results with the sine of sum formula:
|
// Combine the results with the sine of sum formula:
|
||||||
// sin(x) = sin((k + y)*pi/16)
|
// sin(x) = sin((k + y)*pi/32)
|
||||||
// = sin(y*pi/16) * cos(k*pi/16) + cos(y*pi/16) * sin(k*pi/16)
|
// = sin(y*pi/32) * cos(k*pi/32) + cos(y*pi/32) * sin(k*pi/32)
|
||||||
// = sin_y * cos_k + (1 + cosm1_y) * sin_k
|
// = sin_y * cos_k + (1 + cosm1_y) * sin_k
|
||||||
// = sin_y * cos_k + (cosm1_y * sin_k + sin_k)
|
// = sin_y * cos_k + (cosm1_y * sin_k + sin_k)
|
||||||
double sin_k, cos_k, sin_y, cosm1_y;
|
double sin_k, cos_k, sin_y, cosm1_y;
|
||||||
|
|
Loading…
Reference in New Issue