Skip to content

Commit 9be96a4

Browse files
allight authored and copybara-github committed
Fix incorrect rounding in apfloat::downcast
When downcasting to a subnormal, we previously downcast just the fractional part first and then downcast the exponent, which is the step where subnormals are generated. This led to the fraction being rounded twice: once for the downcast of the fractional part alone and again when forming the subnormal fraction. As a result, some values could be rounded up twice, ending up 1 ulp larger than intended. PiperOrigin-RevId: 751558828
1 parent 4bd3c88 commit 9be96a4

File tree

4 files changed

+206
-18
lines changed

4 files changed

+206
-18
lines changed

xls/dslx/stdlib/apfloat.x

Lines changed: 86 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1107,14 +1107,15 @@ fn downcast_fractional_rne_fp32_to_bf16_test() {
11071107
minus_inf_bf16);
11081108
}
11091109

1110-
// Perform downcasting that converts a normal number into a subnormal with the same number of
1111-
// fraction bits. f_cast must have an unbiased exponent less than the minimum normal exponent of the
1112-
// target float type (checked via assert!).
1113-
fn downcast_to_subnormal<TO_EXP_SZ: u32, FRACTION_SZ: u32, FROM_EXP_SZ: u32>
1114-
(f_cast: APFloat<FROM_EXP_SZ, FRACTION_SZ>, round_style: RoundStyle)
1115-
-> APFloat<TO_EXP_SZ, FRACTION_SZ> {
1110+
// Perform downcasting that converts a normal number into a subnormal. f must
1111+
// have an unbiased exponent less than the minimum normal exponent of the target
1112+
// float type (checked via assert!).
1113+
fn downcast_to_subnormal
1114+
<TO_FRACTION_SZ: u32, TO_EXP_SZ: u32, FROM_FRACTION_SZ: u32, FROM_EXP_SZ: u32>
1115+
(f: APFloat<FROM_EXP_SZ, FROM_FRACTION_SZ>, round_style: RoundStyle)
1116+
-> APFloat<TO_EXP_SZ, TO_FRACTION_SZ> {
11161117
const TO_BIAS = std::signed_max_value<TO_EXP_SZ>() as sN[FROM_EXP_SZ];
1117-
let uexp = unbiased_exponent(f_cast);
1118+
let uexp = unbiased_exponent(f);
11181119
// Check for over- and underflow of the exponent in the target type.
11191120
assert!(
11201121
uexp < (min_normal_exp<TO_EXP_SZ>() as sN[FROM_EXP_SZ]),
@@ -1124,30 +1125,35 @@ fn downcast_to_subnormal<TO_EXP_SZ: u32, FRACTION_SZ: u32, FROM_EXP_SZ: u32>
11241125
// 32 bits is more than large enough for any reasonable
11251126
// floating point number's exponent. Narrowing should crush it down.
11261127
let right_shift_cnt = -(TO_BIAS + uexp) as u32 + u32:1;
1127-
if right_shift_cnt > (FRACTION_SZ + u32:1) {
1128+
if right_shift_cnt > (TO_FRACTION_SZ + u32:1) {
11281129
// actually underflows
1129-
zero<TO_EXP_SZ, FRACTION_SZ>(f_cast.sign)
1130+
zero<TO_EXP_SZ, TO_FRACTION_SZ>(f.sign)
11301131
} else {
1131-
// Add the implied leading 1.
1132-
let full_frac = u1:0b1 ++ f_cast.fraction;
1133-
let unrounded_subnormal_frac = (full_frac >> right_shift_cnt) as uN[FRACTION_SZ];
1132+
const SMALL_FRAC_OFF = FROM_FRACTION_SZ - TO_FRACTION_SZ;
1133+
// Truncate the trailing bits of the fraction.
1134+
let truncated_frac = f.fraction[SMALL_FRAC_OFF+:uN[TO_FRACTION_SZ]];
1135+
// Add the implied leading 1
1136+
let full_frac = u1:0b1 ++ truncated_frac;
1137+
// Shift the bits over.
1138+
let unrounded_subnormal_frac = full_frac[right_shift_cnt+:uN[TO_FRACTION_SZ]];
11341139

1135-
let round_up = does_lsb_round_up(right_shift_cnt as u32, full_frac, round_style);
1140+
let round_up = does_lsb_round_up(
1141+
right_shift_cnt as u32 + SMALL_FRAC_OFF, u1:1 ++ f.fraction, round_style);
11361142

11371143
let subnormal_frac = if round_up {
1138-
unrounded_subnormal_frac + uN[FRACTION_SZ]:1
1144+
unrounded_subnormal_frac + uN[TO_FRACTION_SZ]:1
11391145
} else {
11401146
unrounded_subnormal_frac
11411147
};
11421148

11431149
// Technically the subnormal frac is good enough but this is
11441150
// easier to see through.
1145-
let rounds_to_normal = right_shift_cnt == u32:1 && subnormal_frac == uN[FRACTION_SZ]:0;
1151+
let rounds_to_normal = right_shift_cnt == u32:1 && subnormal_frac == uN[TO_FRACTION_SZ]:0;
11461152

11471153
if rounds_to_normal {
1148-
APFloat { sign: f_cast.sign, bexp: uN[TO_EXP_SZ]:1, fraction: uN[FRACTION_SZ]:0 }
1154+
APFloat { sign: f.sign, bexp: uN[TO_EXP_SZ]:1, fraction: uN[TO_FRACTION_SZ]:0 }
11491155
} else {
1150-
APFloat { sign: f_cast.sign, bexp: uN[TO_EXP_SZ]:0, fraction: subnormal_frac }
1156+
APFloat { sign: f.sign, bexp: uN[TO_EXP_SZ]:0, fraction: subnormal_frac }
11511157
}
11521158
}
11531159
}
@@ -1181,7 +1187,7 @@ fn downcast_normal<TO_FRACTION_SZ: u32, TO_EXP_SZ: u32, FROM_FRACTION_SZ: u32, F
11811187
// NB In the no-subnormals case the fraction/bias is already inf.
11821188
inf<TO_EXP_SZ, TO_FRACTION_SZ>(f.sign)
11831189
} else if CAN_GENERATE_SUBNORMALS && uexp <= -TO_BIAS {
1184-
downcast_to_subnormal<TO_EXP_SZ>(f_cast, round_style)
1190+
downcast_to_subnormal<TO_FRACTION_SZ, TO_EXP_SZ>(f, round_style)
11851191
} else {
11861192
APFloat {
11871193
sign: f_cast.sign,
@@ -1432,6 +1438,68 @@ fn downcast_generates_subnormal() {
14321438
let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_0000_0000 };
14331439
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
14341440
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
1441+
1442+
// Rounds up
1443+
let not_subnormal = F32 {
1444+
sign: false,
1445+
bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
1446+
fraction: u10:0b00_0000_0110 ++ u13:0,
1447+
};
1448+
let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0001 };
1449+
let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0001 };
1450+
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
1451+
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
1452+
1453+
// Rounds down
1454+
let not_subnormal = F32 {
1455+
sign: false,
1456+
bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
1457+
fraction: u10:0b00_0000_0011 ++ u13:0,
1458+
};
1459+
let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0000 };
1460+
let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0000 };
1461+
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
1462+
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
1463+
1464+
let not_subnormal = F32 {
1465+
sign: false,
1466+
bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
1467+
fraction: u10:0b00_0000_0100 ++ u13:0,
1468+
};
1469+
let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0001 };
1470+
let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0000 };
1471+
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
1472+
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
1473+
1474+
let not_subnormal = F32 {
1475+
sign: false,
1476+
bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
1477+
fraction: u10:0b00_0000_1100 ++ u13:0,
1478+
};
1479+
let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0010 };
1480+
let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0010 };
1481+
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
1482+
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
1483+
1484+
let not_subnormal = F32 {
1485+
sign: false,
1486+
bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
1487+
fraction: u23:0b100_1001_0111_1011_1110_1001,
1488+
};
1489+
let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1100_1001 };
1490+
let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1100_1001 };
1491+
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
1492+
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
1493+
1494+
type F64 = APFloat<u32:11, u32:52>;
1495+
let not_subnormal = F64 {
1496+
sign: false,
1497+
bexp: bias<F64::EXP_SIZE>(min_normal_exp<F32::EXP_SIZE>() as s11 - s11:3),
1498+
fraction: u52:0b1000_1000_0110_0001_0010_1001_1010_0000_0001_0010_1100_1010_0001,
1499+
};
1500+
let expected = F32 { sign: false, bexp: u8:0, fraction: u23:0b001_1000_1000_0110_0001_0011 };
1501+
assert_eq(downcast<u32:23, u32:8>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected);
1502+
assert_eq(downcast<u32:23, u32:8>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected);
14351503
}
14361504

14371505
#[test]

xls/dslx/stdlib/tests/BUILD

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,41 @@ cc_test(
149149
],
150150
)
151151

152+
xls_dslx_test(
153+
name = "float32_downcast_test",
154+
srcs = ["float32_downcast_test.x"],
155+
)
156+
157+
xls_dslx_opt_ir(
158+
name = "float32_downcast",
159+
srcs = ["float32_downcast_test.x"],
160+
dslx_top = "f64_to_f32",
161+
ir_file = "float32_downcast.ir",
162+
opt_ir_file = "float32_downcast.opt.ir",
163+
)
164+
165+
cc_xls_ir_jit_wrapper(
166+
name = "float32_downcast_jit_wrapper",
167+
src = ":float32_downcast",
168+
jit_wrapper_args = {
169+
"class_name": "F64ToF32",
170+
"namespace": "xls::fp",
171+
},
172+
)
173+
174+
cc_test(
175+
name = "float32_downcast_test_cc",
176+
srcs = ["float32_downcast_test.cc"],
177+
tags = ["optonly"],
178+
deps = [
179+
":float32_downcast_jit_wrapper",
180+
"//xls/common:xls_gunit_main",
181+
"//xls/common/fuzzing:fuzztest",
182+
"@com_google_absl//absl/base",
183+
"@googletest//:gtest",
184+
],
185+
)
186+
152187
xls_dslx_test(
153188
name = "float32_upcast_test",
154189
srcs = ["float32_upcast_test.x"],

xls/dslx/stdlib/tests/float32_downcast_test.cc

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// Copyright 2025 The XLS Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include <fenv.h> // NOLINT - Allow fenv header.
16+
17+
#include <cmath>
18+
#include <cstdint>
19+
#include <ios>
20+
#include <utility>
21+
22+
#include "gmock/gmock.h"
23+
#include "gtest/gtest.h"
24+
#include "xls/common/fuzzing/fuzztest.h"
25+
#include "absl/base/casts.h"
26+
#include "xls/dslx/stdlib/tests/float32_downcast_jit_wrapper.h"
27+
28+
namespace xls {
29+
namespace {
30+
31+
static_assert(sizeof(double) == 8, "8 byte double required");
32+
static_assert(sizeof(float) == 4, "4 byte float required");
33+
34+
class F64ToF32 {
35+
public:
36+
F64ToF32() { jit_ = std::move(fp::F64ToF32::Create()).value(); }
37+
void F64ToF32Test(uint64_t v) {
38+
if (fegetround() != FE_TONEAREST) {
39+
GTEST_SKIP() << "Unexpected rounding mode";
40+
}
41+
double d = absl::bit_cast<double>(v);
42+
float f = (float)d;
43+
float j = jit_->Run(d).value();
44+
if (std::isnan(f)) {
45+
ASSERT_THAT(j, testing::IsNan());
46+
} else {
47+
ASSERT_EQ(f, j) << std::boolalpha
48+
<< "is subnormal: " << (fpclassify(f) == FP_SUBNORMAL)
49+
<< " inp: " << std::hex << "0x" << v << " "
50+
<< std::hexfloat << d << " f=" << f << " j=" << j;
51+
}
52+
}
53+
54+
private:
55+
std::unique_ptr<fp::F64ToF32> jit_;
56+
};
57+
FUZZ_TEST_F(F64ToF32, F64ToF32Test).WithDomains(fuzztest::Arbitrary<int64_t>());
58+
59+
} // namespace
60+
} // namespace xls

xls/dslx/stdlib/tests/float32_downcast_test.x

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// Copyright 2025 The XLS Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
import apfloat;
16+
import float32;
17+
import float64;
18+
19+
fn f64_to_f32(f: float64::F64) -> float32::F32 {
20+
apfloat::downcast<float32::F32::FRACTION_SIZE, float32::F32::EXP_SIZE>(
21+
f, apfloat::RoundStyle::TIES_TO_EVEN)
22+
}
23+
24+
#[test]
25+
fn f64_to_f32_test() { assert_eq(f64_to_f32(float64::one(u1:0)), float32::one(u1:0)); }

0 commit comments

Comments
 (0)