softfloat: Move the binary point to the msb

Rather than point the binary point at msb-1, put it at the msb.
Use uadd64_overflow to detect when addition overflows instead
of DECOMPOSED_OVERFLOW_BIT.

This reduces the number of special cases within the code, such
as shifting an int64_t either left or right during conversion.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
Richard Henderson 2020-11-07 11:19:32 -08:00
parent f2b84b9edb
commit e99c43735a

View File

@ -503,9 +503,8 @@ typedef struct {
bool sign; bool sign;
} FloatParts; } FloatParts;
#define DECOMPOSED_BINARY_POINT (64 - 2) #define DECOMPOSED_BINARY_POINT 63
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
/* Structure holding all of the relevant parameters for a format. /* Structure holding all of the relevant parameters for a format.
* exp_size: the size of the exponent field * exp_size: the size of the exponent field
@ -657,7 +656,7 @@ static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
part.cls = float_class_zero; part.cls = float_class_zero;
part.frac = 0; part.frac = 0;
} else { } else {
int shift = clz64(part.frac) - 1; int shift = clz64(part.frac);
part.cls = float_class_normal; part.cls = float_class_normal;
part.exp = parm->frac_shift - parm->exp_bias - shift + 1; part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
part.frac <<= shift; part.frac <<= shift;
@ -727,9 +726,8 @@ static FloatParts round_canonical(FloatParts p, float_status *s,
if (likely(exp > 0)) { if (likely(exp > 0)) {
if (frac & round_mask) { if (frac & round_mask) {
flags |= float_flag_inexact; flags |= float_flag_inexact;
frac += inc; if (uadd64_overflow(frac, inc, &frac)) {
if (frac & DECOMPOSED_OVERFLOW_BIT) { frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
frac >>= 1;
exp++; exp++;
} }
} }
@ -758,9 +756,12 @@ static FloatParts round_canonical(FloatParts p, float_status *s,
p.cls = float_class_zero; p.cls = float_class_zero;
goto do_zero; goto do_zero;
} else { } else {
bool is_tiny = s->tininess_before_rounding bool is_tiny = s->tininess_before_rounding || (exp < 0);
|| (exp < 0)
|| !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); if (!is_tiny) {
uint64_t discard;
is_tiny = !uadd64_overflow(frac, inc, &discard);
}
shift64RightJamming(frac, 1 - exp, &frac); shift64RightJamming(frac, 1 - exp, &frac);
if (frac & round_mask) { if (frac & round_mask) {
@ -985,7 +986,7 @@ static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
a.cls = float_class_zero; a.cls = float_class_zero;
a.sign = s->float_rounding_mode == float_round_down; a.sign = s->float_rounding_mode == float_round_down;
} else { } else {
int shift = clz64(a.frac) - 1; int shift = clz64(a.frac);
a.frac = a.frac << shift; a.frac = a.frac << shift;
a.exp = a.exp - shift; a.exp = a.exp - shift;
a.sign = a_sign; a.sign = a_sign;
@ -1022,9 +1023,10 @@ static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
a.exp = b.exp; a.exp = b.exp;
} }
a.frac += b.frac;
if (a.frac & DECOMPOSED_OVERFLOW_BIT) { if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
shift64RightJamming(a.frac, 1, &a.frac); shift64RightJamming(a.frac, 1, &a.frac);
a.frac |= DECOMPOSED_IMPLICIT_BIT;
a.exp += 1; a.exp += 1;
} }
return a; return a;
@ -1219,16 +1221,17 @@ static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
int exp = a.exp + b.exp; int exp = a.exp + b.exp;
mul64To128(a.frac, b.frac, &hi, &lo); mul64To128(a.frac, b.frac, &hi, &lo);
shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); if (hi & DECOMPOSED_IMPLICIT_BIT) {
if (lo & DECOMPOSED_OVERFLOW_BIT) {
shift64RightJamming(lo, 1, &lo);
exp += 1; exp += 1;
} else {
hi <<= 1;
} }
hi |= (lo != 0);
/* Re-use a */ /* Re-use a */
a.exp = exp; a.exp = exp;
a.sign = sign; a.sign = sign;
a.frac = lo; a.frac = hi;
return a; return a;
} }
/* handle all the NaN cases */ /* handle all the NaN cases */
@ -1411,56 +1414,41 @@ static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
p_exp = a.exp + b.exp; p_exp = a.exp + b.exp;
/* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
* result.
*/
mul64To128(a.frac, b.frac, &hi, &lo); mul64To128(a.frac, b.frac, &hi, &lo);
/* binary point now at bit 124 */
/* check for overflow */ /* Renormalize to the msb. */
if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { if (hi & DECOMPOSED_IMPLICIT_BIT) {
shift128RightJamming(hi, lo, 1, &hi, &lo);
p_exp += 1; p_exp += 1;
} else {
shortShift128Left(hi, lo, 1, &hi, &lo);
} }
/* + add/sub */ /* + add/sub */
if (c.cls == float_class_zero) { if (c.cls != float_class_zero) {
/* move binary point back to 62 */
shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
} else {
int exp_diff = p_exp - c.exp; int exp_diff = p_exp - c.exp;
if (p_sign == c.sign) { if (p_sign == c.sign) {
/* Addition */ /* Addition */
if (exp_diff <= 0) { if (exp_diff <= 0) {
shift128RightJamming(hi, lo, shift64RightJamming(hi, -exp_diff, &hi);
DECOMPOSED_BINARY_POINT - exp_diff,
&hi, &lo);
lo += c.frac;
p_exp = c.exp; p_exp = c.exp;
if (uadd64_overflow(hi, c.frac, &hi)) {
shift64RightJamming(hi, 1, &hi);
hi |= DECOMPOSED_IMPLICIT_BIT;
p_exp += 1;
}
} else { } else {
uint64_t c_hi, c_lo; uint64_t c_hi, c_lo, over;
/* shift c to the same binary point as the product (124) */ shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
c_hi = c.frac >> 2; add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
c_lo = 0; if (over) {
shift128RightJamming(c_hi, c_lo, shift64RightJamming(hi, 1, &hi);
exp_diff, hi |= DECOMPOSED_IMPLICIT_BIT;
&c_hi, &c_lo); p_exp += 1;
add128(hi, lo, c_hi, c_lo, &hi, &lo); }
/* move binary point back to 62 */
shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
} }
if (lo & DECOMPOSED_OVERFLOW_BIT) {
shift64RightJamming(lo, 1, &lo);
p_exp += 1;
}
} else { } else {
/* Subtraction */ /* Subtraction */
uint64_t c_hi, c_lo; uint64_t c_hi = c.frac, c_lo = 0;
/* make C binary point match product at bit 124 */
c_hi = c.frac >> 2;
c_lo = 0;
if (exp_diff <= 0) { if (exp_diff <= 0) {
shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
@ -1495,20 +1483,15 @@ static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
/* Normalizing to a binary point of 124 is the /* Normalizing to a binary point of 124 is the
correct adjust for the exponent. However since we're correct adjust for the exponent. However since we're
shifting, we might as well put the binary point back shifting, we might as well put the binary point back
at 62 where we really want it. Therefore shift as at 63 where we really want it. Therefore shift as
if we're leaving 1 bit at the top of the word, but if we're leaving 1 bit at the top of the word, but
adjust the exponent as if we're leaving 3 bits. */ adjust the exponent as if we're leaving 3 bits. */
shift -= 1; shift128Left(hi, lo, shift, &hi, &lo);
if (shift >= 64) { p_exp -= shift;
lo = lo << (shift - 64);
} else {
hi = (hi << shift) | (lo >> (64 - shift));
lo = hi | ((lo << shift) != 0);
}
p_exp -= shift - 2;
} }
} }
} }
hi |= (lo != 0);
if (flags & float_muladd_halve_result) { if (flags & float_muladd_halve_result) {
p_exp -= 1; p_exp -= 1;
@ -1518,7 +1501,7 @@ static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
a.cls = float_class_normal; a.cls = float_class_normal;
a.sign = p_sign ^ sign_flip; a.sign = p_sign ^ sign_flip;
a.exp = p_exp; a.exp = p_exp;
a.frac = lo; a.frac = hi;
return a; return a;
} }
@ -1742,25 +1725,17 @@ static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
* exponent to match. * exponent to match.
* *
* The udiv_qrnnd algorithm that we're using requires normalization, * The udiv_qrnnd algorithm that we're using requires normalization,
* i.e. the msb of the denominator must be set. Since we know that * i.e. the msb of the denominator must be set, which is already true.
* DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
* by one (more), and the remainder must be shifted right by one.
*/ */
if (a.frac < b.frac) { if (a.frac < b.frac) {
exp -= 1; exp -= 1;
shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
} else {
shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0); shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
} else {
shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
} }
q = udiv_qrnnd(&r, n1, n0, b.frac << 1); q = udiv_qrnnd(&r, n1, n0, b.frac);
/* /* Set lsb if there is a remainder, to set inexact. */
* Set lsb if there is a remainder, to set inexact.
* As mentioned above, to find the actual value of the remainder we
* would need to shift right, but (1) we are only concerned about
* non-zero-ness, and (2) the remainder will always be even because
* both inputs to the division primitive are even.
*/
a.frac = q | (r != 0); a.frac = q | (r != 0);
a.sign = sign; a.sign = sign;
a.exp = exp; a.exp = exp;
@ -2135,12 +2110,12 @@ static FloatParts round_to_int(FloatParts a, FloatRoundMode rmode,
if (a.frac & rnd_mask) { if (a.frac & rnd_mask) {
s->float_exception_flags |= float_flag_inexact; s->float_exception_flags |= float_flag_inexact;
a.frac += inc; if (uadd64_overflow(a.frac, inc, &a.frac)) {
a.frac &= ~rnd_mask;
if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
a.frac >>= 1; a.frac >>= 1;
a.frac |= DECOMPOSED_IMPLICIT_BIT;
a.exp++; a.exp++;
} }
a.frac &= ~rnd_mask;
} }
} }
break; break;
@ -2213,10 +2188,8 @@ static int64_t round_to_int_and_pack(FloatParts in, FloatRoundMode rmode,
case float_class_zero: case float_class_zero:
return 0; return 0;
case float_class_normal: case float_class_normal:
if (p.exp < DECOMPOSED_BINARY_POINT) { if (p.exp <= DECOMPOSED_BINARY_POINT) {
r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
} else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
} else { } else {
r = UINT64_MAX; r = UINT64_MAX;
} }
@ -2498,10 +2471,8 @@ static uint64_t round_to_uint_and_pack(FloatParts in, FloatRoundMode rmode,
return 0; return 0;
} }
if (p.exp < DECOMPOSED_BINARY_POINT) { if (p.exp <= DECOMPOSED_BINARY_POINT) {
r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
} else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
} else { } else {
s->float_exception_flags = orig_flags | float_flag_invalid; s->float_exception_flags = orig_flags | float_flag_invalid;
return max; return max;
@ -2765,11 +2736,11 @@ static FloatParts int_to_float(int64_t a, int scale, float_status *status)
f = -f; f = -f;
r.sign = true; r.sign = true;
} }
shift = clz64(f) - 1; shift = clz64(f);
scale = MIN(MAX(scale, -0x10000), 0x10000); scale = MIN(MAX(scale, -0x10000), 0x10000);
r.exp = DECOMPOSED_BINARY_POINT - shift + scale; r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift); r.frac = f << shift;
} }
return r; return r;
@ -2920,21 +2891,16 @@ bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
{ {
FloatParts r = { .sign = false }; FloatParts r = { .sign = false };
int shift;
if (a == 0) { if (a == 0) {
r.cls = float_class_zero; r.cls = float_class_zero;
} else { } else {
scale = MIN(MAX(scale, -0x10000), 0x10000); scale = MIN(MAX(scale, -0x10000), 0x10000);
shift = clz64(a);
r.cls = float_class_normal; r.cls = float_class_normal;
if ((int64_t)a < 0) { r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; r.frac = a << shift;
shift64RightJamming(a, 1, &a);
r.frac = a;
} else {
int shift = clz64(a) - 1;
r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
r.frac = a << shift;
}
} }
return r; return r;
@ -3475,12 +3441,9 @@ static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
/* We need two overflow bits at the top. Adding room for that is a /* We need two overflow bits at the top. Adding room for that is a
* right shift. If the exponent is odd, we can discard the low bit * right shift. If the exponent is odd, we can discard the low bit
* by multiplying the fraction by 2; that's a left shift. Combine * by multiplying the fraction by 2; that's a left shift. Combine
* those and we shift right if the exponent is even. * those and we shift right by 1 if the exponent is odd, otherwise 2.
*/ */
a_frac = a.frac; a_frac = a.frac >> (2 - (a.exp & 1));
if (!(a.exp & 1)) {
a_frac >>= 1;
}
a.exp >>= 1; a.exp >>= 1;
/* Bit-by-bit computation of sqrt. */ /* Bit-by-bit computation of sqrt. */
@ -3488,10 +3451,10 @@ static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
s_frac = 0; s_frac = 0;
/* Iterate from implicit bit down to the 3 extra bits to compute a /* Iterate from implicit bit down to the 3 extra bits to compute a
* properly rounded result. Remember we've inserted one more bit * properly rounded result. Remember we've inserted two more bits
* at the top, so these positions are one less. * at the top, so these positions are two less.
*/ */
bit = DECOMPOSED_BINARY_POINT - 1; bit = DECOMPOSED_BINARY_POINT - 2;
last_bit = MAX(p->frac_shift - 4, 0); last_bit = MAX(p->frac_shift - 4, 0);
do { do {
uint64_t q = 1ULL << bit; uint64_t q = 1ULL << bit;
@ -3507,7 +3470,7 @@ static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
/* Undo the right shift done above. If there is any remaining /* Undo the right shift done above. If there is any remaining
* fraction, the result is inexact. Set the sticky bit. * fraction, the result is inexact. Set the sticky bit.
*/ */
a.frac = (r_frac << 1) + (a_frac != 0); a.frac = (r_frac << 2) + (a_frac != 0);
return a; return a;
} }