Skip to content

Fix incorrect rounding in apfloat::downcast #2051

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 86 additions & 18 deletions xls/dslx/stdlib/apfloat.x
Original file line number Diff line number Diff line change
Expand Up @@ -1107,14 +1107,15 @@ fn downcast_fractional_rne_fp32_to_bf16_test() {
minus_inf_bf16);
}

// Perform downcasting that converts a normal number into a subnormal with the same number of
// fraction bits. f_cast must have an unbiased exponent less than the minimum normal exponent of the
// target float type (checked via assert!).
fn downcast_to_subnormal<TO_EXP_SZ: u32, FRACTION_SZ: u32, FROM_EXP_SZ: u32>
(f_cast: APFloat<FROM_EXP_SZ, FRACTION_SZ>, round_style: RoundStyle)
-> APFloat<TO_EXP_SZ, FRACTION_SZ> {
// Perform downcasting that converts a normal number into a subnormal. f must
// have an unbiased exponent less than the minimum normal exponent of the target
// float type (checked via assert!).
fn downcast_to_subnormal
<TO_FRACTION_SZ: u32, TO_EXP_SZ: u32, FROM_FRACTION_SZ: u32, FROM_EXP_SZ: u32>
(f: APFloat<FROM_EXP_SZ, FROM_FRACTION_SZ>, round_style: RoundStyle)
-> APFloat<TO_EXP_SZ, TO_FRACTION_SZ> {
const TO_BIAS = std::signed_max_value<TO_EXP_SZ>() as sN[FROM_EXP_SZ];
let uexp = unbiased_exponent(f_cast);
let uexp = unbiased_exponent(f);
// Check for over- and underflow of the exponent in the target type.
assert!(
uexp < (min_normal_exp<TO_EXP_SZ>() as sN[FROM_EXP_SZ]),
Expand All @@ -1124,30 +1125,35 @@ fn downcast_to_subnormal<TO_EXP_SZ: u32, FRACTION_SZ: u32, FROM_EXP_SZ: u32>
// 32 bits is more than large enough for any reasonable
// floating-point number's exponent. Narrowing should crush it down.
let right_shift_cnt = -(TO_BIAS + uexp) as u32 + u32:1;
if right_shift_cnt > (FRACTION_SZ + u32:1) {
if right_shift_cnt > (TO_FRACTION_SZ + u32:1) {
// actually underflows
zero<TO_EXP_SZ, FRACTION_SZ>(f_cast.sign)
zero<TO_EXP_SZ, TO_FRACTION_SZ>(f.sign)
} else {
// Add the implied leading 1.
let full_frac = u1:0b1 ++ f_cast.fraction;
let unrounded_subnormal_frac = (full_frac >> right_shift_cnt) as uN[FRACTION_SZ];
const SMALL_FRAC_OFF = FROM_FRACTION_SZ - TO_FRACTION_SZ;
// Truncate the trailing bits of the fraction.
let truncated_frac = f.fraction[SMALL_FRAC_OFF+:uN[TO_FRACTION_SZ]];
// Add the implied leading 1
let full_frac = u1:0b1 ++ truncated_frac;
// Shift the bits over.
let unrounded_subnormal_frac = full_frac[right_shift_cnt+:uN[TO_FRACTION_SZ]];

let round_up = does_lsb_round_up(right_shift_cnt as u32, full_frac, round_style);
let round_up = does_lsb_round_up(
right_shift_cnt as u32 + SMALL_FRAC_OFF, u1:1 ++ f.fraction, round_style);

let subnormal_frac = if round_up {
unrounded_subnormal_frac + uN[FRACTION_SZ]:1
unrounded_subnormal_frac + uN[TO_FRACTION_SZ]:1
} else {
unrounded_subnormal_frac
};

// Technically the subnormal frac is good enough but this is
// easier to see through.
let rounds_to_normal = right_shift_cnt == u32:1 && subnormal_frac == uN[FRACTION_SZ]:0;
let rounds_to_normal = right_shift_cnt == u32:1 && subnormal_frac == uN[TO_FRACTION_SZ]:0;

if rounds_to_normal {
APFloat { sign: f_cast.sign, bexp: uN[TO_EXP_SZ]:1, fraction: uN[FRACTION_SZ]:0 }
APFloat { sign: f.sign, bexp: uN[TO_EXP_SZ]:1, fraction: uN[TO_FRACTION_SZ]:0 }
} else {
APFloat { sign: f_cast.sign, bexp: uN[TO_EXP_SZ]:0, fraction: subnormal_frac }
APFloat { sign: f.sign, bexp: uN[TO_EXP_SZ]:0, fraction: subnormal_frac }
}
}
}
Expand Down Expand Up @@ -1181,7 +1187,7 @@ fn downcast_normal<TO_FRACTION_SZ: u32, TO_EXP_SZ: u32, FROM_FRACTION_SZ: u32, F
// NB In the no-subnormals case the fraction/bias is already inf.
inf<TO_EXP_SZ, TO_FRACTION_SZ>(f.sign)
} else if CAN_GENERATE_SUBNORMALS && uexp <= -TO_BIAS {
downcast_to_subnormal<TO_EXP_SZ>(f_cast, round_style)
downcast_to_subnormal<TO_FRACTION_SZ, TO_EXP_SZ>(f, round_style)
} else {
APFloat {
sign: f_cast.sign,
Expand Down Expand Up @@ -1432,6 +1438,68 @@ fn downcast_generates_subnormal() {
let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_0000_0000 };
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);

// Rounds up
let not_subnormal = F32 {
sign: false,
bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
fraction: u10:0b00_0000_0110 ++ u13:0,
};
let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0001 };
let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0001 };
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);

// Rounds down
let not_subnormal = F32 {
sign: false,
bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
fraction: u10:0b00_0000_0011 ++ u13:0,
};
let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0000 };
let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0000 };
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);

let not_subnormal = F32 {
sign: false,
bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
fraction: u10:0b00_0000_0100 ++ u13:0,
};
let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0001 };
let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0000 };
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);

let not_subnormal = F32 {
sign: false,
bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
fraction: u10:0b00_0000_1100 ++ u13:0,
};
let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0010 };
let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0010 };
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);

let not_subnormal = F32 {
sign: false,
bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
fraction: u23:0b100_1001_0111_1011_1110_1001,
};
let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1100_1001 };
let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1100_1001 };
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);

type F64 = APFloat<u32:11, u32:52>;
let not_subnormal = F64 {
sign: false,
bexp: bias<F64::EXP_SIZE>(min_normal_exp<F32::EXP_SIZE>() as s11 - s11:3),
fraction: u52:0b1000_1000_0110_0001_0010_1001_1010_0000_0001_0010_1100_1010_0001,
};
let expected = F32 { sign: false, bexp: u8:0, fraction: u23:0b001_1000_1000_0110_0001_0011 };
assert_eq(downcast<u32:23, u32:8>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected);
assert_eq(downcast<u32:23, u32:8>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected);
}

#[test]
Expand Down
35 changes: 35 additions & 0 deletions xls/dslx/stdlib/tests/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,41 @@ cc_test(
],
)

# Runs the DSLX #[test] functions in float32_downcast_test.x.
xls_dslx_test(
name = "float32_downcast_test",
srcs = ["float32_downcast_test.x"],
)

# Converts the DSLX function `f64_to_f32` (the top) to IR and emits both the
# unoptimized and optimized IR files.
xls_dslx_opt_ir(
name = "float32_downcast",
srcs = ["float32_downcast_test.x"],
dslx_top = "f64_to_f32",
ir_file = "float32_downcast.ir",
opt_ir_file = "float32_downcast.opt.ir",
)

# Generates a C++ JIT wrapper class `xls::fp::F64ToF32` for the IR above; the
# C++ test below includes the generated header.
cc_xls_ir_jit_wrapper(
name = "float32_downcast_jit_wrapper",
src = ":float32_downcast",
jit_wrapper_args = {
"class_name": "F64ToF32",
"namespace": "xls::fp",
},
)

# C++ fuzz test that drives the JIT wrapper. Tagged "optonly", so it is
# presumably meant to run only in optimized builds -- confirm against the
# repository's test configuration.
cc_test(
name = "float32_downcast_test_cc",
srcs = ["float32_downcast_test.cc"],
tags = ["optonly"],
deps = [
":float32_downcast_jit_wrapper",
"//xls/common:xls_gunit_main",
"//xls/common/fuzzing:fuzztest",
"@com_google_absl//absl/base",
"@googletest//:gtest",
],
)

xls_dslx_test(
name = "float32_upcast_test",
srcs = ["float32_upcast_test.x"],
Expand Down
60 changes: 60 additions & 0 deletions xls/dslx/stdlib/tests/float32_downcast_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Copyright 2025 The XLS Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <fenv.h> // NOLINT - Allow fenv header.

#include <cmath>
#include <cstdint>
#include <ios>
#include <memory>
#include <utility>

#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "xls/common/fuzzing/fuzztest.h"
#include "absl/base/casts.h"
#include "xls/dslx/stdlib/tests/float32_downcast_jit_wrapper.h"

namespace xls {
namespace {

static_assert(sizeof(double) == 8, "8 byte double required");
static_assert(sizeof(float) == 4, "4 byte float required");

class F64ToF32 {
public:
F64ToF32() { jit_ = std::move(fp::F64ToF32::Create()).value(); }
void F64ToF32Test(uint64_t v) {
if (fegetround() != FE_TONEAREST) {
GTEST_SKIP() << "Unexpected rounding mode";
}
double d = absl::bit_cast<double>(v);
float f = (float)d;
float j = jit_->Run(d).value();
if (std::isnan(f)) {
ASSERT_THAT(j, testing::IsNan());
} else {
ASSERT_EQ(f, j) << std::boolalpha
<< "is subnormal: " << (fpclassify(f) == FP_SUBNORMAL)
<< " inp: " << std::hex << "0x" << v << " "
<< std::hexfloat << d << " f=" << f << " j=" << j;
}
}

private:
std::unique_ptr<fp::F64ToF32> jit_;
};
FUZZ_TEST_F(F64ToF32, F64ToF32Test).WithDomains(fuzztest::Arbitrary<int64_t>());

} // namespace
} // namespace xls
25 changes: 25 additions & 0 deletions xls/dslx/stdlib/tests/float32_downcast_test.x
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Copyright 2025 The XLS Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import apfloat;
import float32;
import float64;

// Narrows a float64 value to float32 by calling apfloat::downcast with the
// float32 fraction/exponent widths and round-ties-to-even. This function is
// the `dslx_top` used to generate the JIT wrapper exercised by the C++ test.
fn f64_to_f32(f: float64::F64) -> float32::F32 {
    const TO_FRACTION_SZ = float32::F32::FRACTION_SIZE;
    const TO_EXP_SZ = float32::F32::EXP_SIZE;
    apfloat::downcast<TO_FRACTION_SZ, TO_EXP_SZ>(f, apfloat::RoundStyle::TIES_TO_EVEN)
}

#[test]
fn f64_to_f32_test() {
    // 1.0 is exactly representable in both formats, so the downcast of
    // float64 one must be float32 one.
    let input = float64::one(u1:0);
    let want = float32::one(u1:0);
    assert_eq(f64_to_f32(input), want);
}