Fix incorrect rounding in apfloat::downcast

allight · copybara-github · commit 9be96a44f01a · 2025-04-25T14:28:10.000-07:00
When downcasting to a subnormal previously we first did a downcast of just the fractional part followed by the downcast of the exponent, where subnormals are generated. This led to the fraction being rounded twice, once for the downcast of just the fractional part and again for the subnormal fractional. This could lead to some values being rounded up twice, ending up 1 ulp larger than intended.

PiperOrigin-RevId: 751558828
diff --git a/xls/dslx/stdlib/apfloat.x b/xls/dslx/stdlib/apfloat.x
@@ -1107,14 +1107,15 @@ fn downcast_fractional_rne_fp32_to_bf16_test() {
         minus_inf_bf16);
 }
 
-// Perform downcasting that converts a normal number into a subnormal with the same number of
-// fraction bits. f_cast must have an unbiased exponent less than the minumum normal exponent of the
-// target float type (checked via assert!).
-fn downcast_to_subnormal<TO_EXP_SZ: u32, FRACTION_SZ: u32, FROM_EXP_SZ: u32>
-    (f_cast: APFloat<FROM_EXP_SZ, FRACTION_SZ>, round_style: RoundStyle)
-    -> APFloat<TO_EXP_SZ, FRACTION_SZ> {
+// Perform downcasting that converts a normal number into a subnormal. f must
+// have an unbiased exponent less than the minumum normal exponent of the target
+// float type (checked via assert!).
+fn downcast_to_subnormal
+    <TO_FRACTION_SZ: u32, TO_EXP_SZ: u32, FROM_FRACTION_SZ: u32, FROM_EXP_SZ: u32>
+    (f: APFloat<FROM_EXP_SZ, FROM_FRACTION_SZ>, round_style: RoundStyle)
+    -> APFloat<TO_EXP_SZ, TO_FRACTION_SZ> {
     const TO_BIAS = std::signed_max_value<TO_EXP_SZ>() as sN[FROM_EXP_SZ];
-    let uexp = unbiased_exponent(f_cast);
+    let uexp = unbiased_exponent(f);
     // Check for over- and underflow of the exponent in the target type.
     assert!(
         uexp < (min_normal_exp<TO_EXP_SZ>() as sN[FROM_EXP_SZ]),
@@ -1124,30 +1125,35 @@ fn downcast_to_subnormal<TO_EXP_SZ: u32, FRACTION_SZ: u32, FROM_EXP_SZ: u32>
     // 32 bits is more than large enough for any reasonable
     // floating point numbers exponent. Narrowing should crush it down.
     let right_shift_cnt = -(TO_BIAS + uexp) as u32 + u32:1;
-    if right_shift_cnt > (FRACTION_SZ + u32:1) {
+    if right_shift_cnt > (TO_FRACTION_SZ + u32:1) {
         // actually underflows
-        zero<TO_EXP_SZ, FRACTION_SZ>(f_cast.sign)
+        zero<TO_EXP_SZ, TO_FRACTION_SZ>(f.sign)
     } else {
-        // Add the implied leading 1.
-        let full_frac = u1:0b1 ++ f_cast.fraction;
-        let unrounded_subnormal_frac = (full_frac >> right_shift_cnt) as uN[FRACTION_SZ];
+        const SMALL_FRAC_OFF = FROM_FRACTION_SZ - TO_FRACTION_SZ;
+        // Truncate the trailing bits of the fraction.
+        let truncated_frac = f.fraction[SMALL_FRAC_OFF+:uN[TO_FRACTION_SZ]];
+        // Add the implied leading 1
+        let full_frac = u1:0b1 ++ truncated_frac;
+        // Shift the bits over.
+        let unrounded_subnormal_frac = full_frac[right_shift_cnt+:uN[TO_FRACTION_SZ]];
 
-        let round_up = does_lsb_round_up(right_shift_cnt as u32, full_frac, round_style);
+        let round_up = does_lsb_round_up(
+            right_shift_cnt as u32 + SMALL_FRAC_OFF, u1:1 ++ f.fraction, round_style);
 
         let subnormal_frac = if round_up {
-            unrounded_subnormal_frac + uN[FRACTION_SZ]:1
+            unrounded_subnormal_frac + uN[TO_FRACTION_SZ]:1
         } else {
             unrounded_subnormal_frac
         };
 
         // Technically the subnormal frac is good enough but this is
         // easier to see through.
-        let rounds_to_normal = right_shift_cnt == u32:1 && subnormal_frac == uN[FRACTION_SZ]:0;
+        let rounds_to_normal = right_shift_cnt == u32:1 && subnormal_frac == uN[TO_FRACTION_SZ]:0;
 
         if rounds_to_normal {
-            APFloat { sign: f_cast.sign, bexp: uN[TO_EXP_SZ]:1, fraction: uN[FRACTION_SZ]:0 }
+            APFloat { sign: f.sign, bexp: uN[TO_EXP_SZ]:1, fraction: uN[TO_FRACTION_SZ]:0 }
         } else {
-            APFloat { sign: f_cast.sign, bexp: uN[TO_EXP_SZ]:0, fraction: subnormal_frac }
+            APFloat { sign: f.sign, bexp: uN[TO_EXP_SZ]:0, fraction: subnormal_frac }
         }
     }
 }
@@ -1181,7 +1187,7 @@ fn downcast_normal<TO_FRACTION_SZ: u32, TO_EXP_SZ: u32, FROM_FRACTION_SZ: u32, F
             // NB In the no-subnormals case the fraction/bias is already inf.
             inf<TO_EXP_SZ, TO_FRACTION_SZ>(f.sign)
         } else if CAN_GENERATE_SUBNORMALS && uexp <= -TO_BIAS {
-            downcast_to_subnormal<TO_EXP_SZ>(f_cast, round_style)
+            downcast_to_subnormal<TO_FRACTION_SZ, TO_EXP_SZ>(f, round_style)
         } else {
             APFloat {
                 sign: f_cast.sign,
@@ -1432,6 +1438,68 @@ fn downcast_generates_subnormal() {
     let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_0000_0000 };
     assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
     assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
+
+    // Rounds up
+    let not_subnormal = F32 {
+        sign: false,
+        bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
+        fraction: u10:0b00_0000_0110 ++ u13:0,
+    };
+    let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0001 };
+    let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0001 };
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
+
+    // Rounds down
+    let not_subnormal = F32 {
+        sign: false,
+        bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
+        fraction: u10:0b00_0000_0011 ++ u13:0,
+    };
+    let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0000 };
+    let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0000 };
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
+
+    let not_subnormal = F32 {
+        sign: false,
+        bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
+        fraction: u10:0b00_0000_0100 ++ u13:0,
+    };
+    let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0001 };
+    let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0000 };
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
+
+    let not_subnormal = F32 {
+        sign: false,
+        bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
+        fraction: u10:0b00_0000_1100 ++ u13:0,
+    };
+    let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0010 };
+    let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0010 };
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
+
+    let not_subnormal = F32 {
+        sign: false,
+        bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
+        fraction: u23:0b100_1001_0111_1011_1110_1001,
+    };
+    let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1100_1001 };
+    let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1100_1001 };
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
+
+    type F64 = APFloat<u32:11, u32:52>;
+    let not_subnormal = F64 {
+        sign: false,
+        bexp: bias<F64::EXP_SIZE>(min_normal_exp<F32::EXP_SIZE>() as s11 - s11:3),
+        fraction: u52:0b1000_1000_0110_0001_0010_1001_1010_0000_0001_0010_1100_1010_0001,
+    };
+    let expected = F32 { sign: false, bexp: u8:0, fraction: u23:0b001_1000_1000_0110_0001_0011 };
+    assert_eq(downcast<u32:23, u32:8>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected);
+    assert_eq(downcast<u32:23, u32:8>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected);
 }
 
 #[test]
diff --git a/xls/dslx/stdlib/tests/BUILD b/xls/dslx/stdlib/tests/BUILD
@@ -149,6 +149,41 @@ cc_test(
     ],
 )
 
+xls_dslx_test(
+    name = "float32_downcast_test",
+    srcs = ["float32_downcast_test.x"],
+)
+
+xls_dslx_opt_ir(
+    name = "float32_downcast",
+    srcs = ["float32_downcast_test.x"],
+    dslx_top = "f64_to_f32",
+    ir_file = "float32_downcast.ir",
+    opt_ir_file = "float32_downcast.opt.ir",
+)
+
+cc_xls_ir_jit_wrapper(
+    name = "float32_downcast_jit_wrapper",
+    src = ":float32_downcast",
+    jit_wrapper_args = {
+        "class_name": "F64ToF32",
+        "namespace": "xls::fp",
+    },
+)
+
+cc_test(
+    name = "float32_downcast_test_cc",
+    srcs = ["float32_downcast_test.cc"],
+    tags = ["optonly"],
+    deps = [
+        ":float32_downcast_jit_wrapper",
+        "//xls/common:xls_gunit_main",
+        "//xls/common/fuzzing:fuzztest",
+        "@com_google_absl//absl/base",
+        "@googletest//:gtest",
+    ],
+)
+
 xls_dslx_test(
     name = "float32_upcast_test",
     srcs = ["float32_upcast_test.x"],
diff --git a/xls/dslx/stdlib/tests/float32_downcast_test.cc b/xls/dslx/stdlib/tests/float32_downcast_test.cc
@@ -0,0 +1,60 @@
+// Copyright 2025 The XLS Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fenv.h>  // NOLINT - Allow fenv header.
+
+#include <cmath>
+#include <cstdint>
+#include <ios>
+#include <utility>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "xls/common/fuzzing/fuzztest.h"
+#include "absl/base/casts.h"
+#include "xls/dslx/stdlib/tests/float32_downcast_jit_wrapper.h"
+
+namespace xls {
+namespace {
+
+static_assert(sizeof(double) == 8, "8 byte double required");
+static_assert(sizeof(float) == 4, "4 byte float required");
+
+class F64ToF32 {
+ public:
+  F64ToF32() { jit_ = std::move(fp::F64ToF32::Create()).value(); }
+  void F64ToF32Test(uint64_t v) {
+    if (fegetround() != FE_TONEAREST) {
+      GTEST_SKIP() << "Unexpected rounding mode";
+    }
+    double d = absl::bit_cast<double>(v);
+    float f = (float)d;
+    float j = jit_->Run(d).value();
+    if (std::isnan(f)) {
+      ASSERT_THAT(j, testing::IsNan());
+    } else {
+      ASSERT_EQ(f, j) << std::boolalpha
+                      << "is subnormal: " << (fpclassify(f) == FP_SUBNORMAL)
+                      << " inp: " << std::hex << "0x" << v << " "
+                      << std::hexfloat << d << " f=" << f << " j=" << j;
+    }
+  }
+
+ private:
+  std::unique_ptr<fp::F64ToF32> jit_;
+};
+FUZZ_TEST_F(F64ToF32, F64ToF32Test).WithDomains(fuzztest::Arbitrary<int64_t>());
+
+}  // namespace
+}  // namespace xls
diff --git a/xls/dslx/stdlib/tests/float32_downcast_test.x b/xls/dslx/stdlib/tests/float32_downcast_test.x
@@ -0,0 +1,25 @@
+// Copyright 2025 The XLS Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import apfloat;
+import float32;
+import float64;
+
+fn f64_to_f32(f: float64::F64) -> float32::F32 {
+    apfloat::downcast<float32::F32::FRACTION_SIZE, float32::F32::EXP_SIZE>(
+        f, apfloat::RoundStyle::TIES_TO_EVEN)
+}
+
+#[test]
+fn f64_to_f32_test() { assert_eq(f64_to_f32(float64::one(u1:0)), float32::one(u1:0)); }