From 8eef6b27565fb42a3bb3db7b6b951b66f35d82db Mon Sep 17 00:00:00 2001
From: Alex Light <allight@google.com>
Date: Tue, 22 Apr 2025 16:09:52 -0700
Subject: [PATCH] Fix incorrect rounding in apfloat::downcast

When downcasting to a subnormal previously we first did a downcast of just the fractional part followed by the downcast of the exponent, where subnormals are generated. This led to the fraction being rounded twice, once for the downcast of just the fractional part and again for the subnormal fractional. This could lead to some values being rounded up twice, ending up 1 ulp larger than intended.
---
 xls/dslx/stdlib/apfloat.x                     | 104 +++++++++++++++---
 xls/dslx/stdlib/tests/BUILD                   |  35 ++++++
 .../stdlib/tests/float32_downcast_test.cc     |  60 ++++++++++
 xls/dslx/stdlib/tests/float32_downcast_test.x |  25 +++++
 4 files changed, 206 insertions(+), 18 deletions(-)
 create mode 100644 xls/dslx/stdlib/tests/float32_downcast_test.cc
 create mode 100644 xls/dslx/stdlib/tests/float32_downcast_test.x

diff --git a/xls/dslx/stdlib/apfloat.x b/xls/dslx/stdlib/apfloat.x
index d433220f7b..db28983037 100644
--- a/xls/dslx/stdlib/apfloat.x
+++ b/xls/dslx/stdlib/apfloat.x
@@ -1107,14 +1107,15 @@ fn downcast_fractional_rne_fp32_to_bf16_test() {
         minus_inf_bf16);
 }
 
-// Perform downcasting that converts a normal number into a subnormal with the same number of
-// fraction bits. f_cast must have an unbiased exponent less than the minumum normal exponent of the
-// target float type (checked via assert!).
-fn downcast_to_subnormal<TO_EXP_SZ: u32, FRACTION_SZ: u32, FROM_EXP_SZ: u32>
-    (f_cast: APFloat<FROM_EXP_SZ, FRACTION_SZ>, round_style: RoundStyle)
-    -> APFloat<TO_EXP_SZ, FRACTION_SZ> {
+// Perform downcasting that converts a normal number into a subnormal. f must
+// have an unbiased exponent less than the minumum normal exponent of the target
+// float type (checked via assert!).
+fn downcast_to_subnormal
+    <TO_FRACTION_SZ: u32, TO_EXP_SZ: u32, FROM_FRACTION_SZ: u32, FROM_EXP_SZ: u32>
+    (f: APFloat<FROM_EXP_SZ, FROM_FRACTION_SZ>, round_style: RoundStyle)
+    -> APFloat<TO_EXP_SZ, TO_FRACTION_SZ> {
     const TO_BIAS = std::signed_max_value<TO_EXP_SZ>() as sN[FROM_EXP_SZ];
-    let uexp = unbiased_exponent(f_cast);
+    let uexp = unbiased_exponent(f);
     // Check for over- and underflow of the exponent in the target type.
     assert!(
         uexp < (min_normal_exp<TO_EXP_SZ>() as sN[FROM_EXP_SZ]),
@@ -1124,30 +1125,35 @@ fn downcast_to_subnormal<TO_EXP_SZ: u32, FRACTION_SZ: u32, FROM_EXP_SZ: u32>
     // 32 bits is more than large enough for any reasonable
     // floating point numbers exponent. Narrowing should crush it down.
     let right_shift_cnt = -(TO_BIAS + uexp) as u32 + u32:1;
-    if right_shift_cnt > (FRACTION_SZ + u32:1) {
+    if right_shift_cnt > (TO_FRACTION_SZ + u32:1) {
         // actually underflows
-        zero<TO_EXP_SZ, FRACTION_SZ>(f_cast.sign)
+        zero<TO_EXP_SZ, TO_FRACTION_SZ>(f.sign)
     } else {
-        // Add the implied leading 1.
-        let full_frac = u1:0b1 ++ f_cast.fraction;
-        let unrounded_subnormal_frac = (full_frac >> right_shift_cnt) as uN[FRACTION_SZ];
+        const SMALL_FRAC_OFF = FROM_FRACTION_SZ - TO_FRACTION_SZ;
+        // Truncate the trailing bits of the fraction.
+        let truncated_frac = f.fraction[SMALL_FRAC_OFF+:uN[TO_FRACTION_SZ]];
+        // Add the implied leading 1
+        let full_frac = u1:0b1 ++ truncated_frac;
+        // Shift the bits over.
+        let unrounded_subnormal_frac = full_frac[right_shift_cnt+:uN[TO_FRACTION_SZ]];
 
-        let round_up = does_lsb_round_up(right_shift_cnt as u32, full_frac, round_style);
+        let round_up = does_lsb_round_up(
+            right_shift_cnt as u32 + SMALL_FRAC_OFF, u1:1 ++ f.fraction, round_style);
 
         let subnormal_frac = if round_up {
-            unrounded_subnormal_frac + uN[FRACTION_SZ]:1
+            unrounded_subnormal_frac + uN[TO_FRACTION_SZ]:1
         } else {
             unrounded_subnormal_frac
         };
 
         // Technically the subnormal frac is good enough but this is
         // easier to see through.
-        let rounds_to_normal = right_shift_cnt == u32:1 && subnormal_frac == uN[FRACTION_SZ]:0;
+        let rounds_to_normal = right_shift_cnt == u32:1 && subnormal_frac == uN[TO_FRACTION_SZ]:0;
 
         if rounds_to_normal {
-            APFloat { sign: f_cast.sign, bexp: uN[TO_EXP_SZ]:1, fraction: uN[FRACTION_SZ]:0 }
+            APFloat { sign: f.sign, bexp: uN[TO_EXP_SZ]:1, fraction: uN[TO_FRACTION_SZ]:0 }
         } else {
-            APFloat { sign: f_cast.sign, bexp: uN[TO_EXP_SZ]:0, fraction: subnormal_frac }
+            APFloat { sign: f.sign, bexp: uN[TO_EXP_SZ]:0, fraction: subnormal_frac }
         }
     }
 }
@@ -1181,7 +1187,7 @@ fn downcast_normal<TO_FRACTION_SZ: u32, TO_EXP_SZ: u32, FROM_FRACTION_SZ: u32, F
             // NB In the no-subnormals case the fraction/bias is already inf.
             inf<TO_EXP_SZ, TO_FRACTION_SZ>(f.sign)
         } else if CAN_GENERATE_SUBNORMALS && uexp <= -TO_BIAS {
-            downcast_to_subnormal<TO_EXP_SZ>(f_cast, round_style)
+            downcast_to_subnormal<TO_FRACTION_SZ, TO_EXP_SZ>(f, round_style)
         } else {
             APFloat {
                 sign: f_cast.sign,
@@ -1432,6 +1438,68 @@ fn downcast_generates_subnormal() {
     let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_0000_0000 };
     assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
     assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
+
+    // Rounds up
+    let not_subnormal = F32 {
+        sign: false,
+        bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
+        fraction: u10:0b00_0000_0110 ++ u13:0,
+    };
+    let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0001 };
+    let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0001 };
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
+
+    // Rounds down
+    let not_subnormal = F32 {
+        sign: false,
+        bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
+        fraction: u10:0b00_0000_0011 ++ u13:0,
+    };
+    let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0000 };
+    let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0000 };
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
+
+    let not_subnormal = F32 {
+        sign: false,
+        bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
+        fraction: u10:0b00_0000_0100 ++ u13:0,
+    };
+    let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0001 };
+    let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0000 };
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
+
+    let not_subnormal = F32 {
+        sign: false,
+        bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
+        fraction: u10:0b00_0000_1100 ++ u13:0,
+    };
+    let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0010 };
+    let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1000_0010 };
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
+
+    let not_subnormal = F32 {
+        sign: false,
+        bexp: bias<u32:8>(min_normal_exp<HF16::EXP_SIZE>() as s8 - s8:3),
+        fraction: u23:0b100_1001_0111_1011_1110_1001,
+    };
+    let expected_away = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1100_1001 };
+    let expected_even = HF16 { sign: false, bexp: u5:0, fraction: u10:0b00_1100_1001 };
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected_even);
+    assert_eq(downcast<u32:10, u32:5>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected_away);
+
+    type F64 = APFloat<u32:11, u32:52>;
+    let not_subnormal = F64 {
+        sign: false,
+        bexp: bias<F64::EXP_SIZE>(min_normal_exp<F32::EXP_SIZE>() as s11 - s11:3),
+        fraction: u52:0b1000_1000_0110_0001_0010_1001_1010_0000_0001_0010_1100_1010_0001,
+    };
+    let expected = F32 { sign: false, bexp: u8:0, fraction: u23:0b001_1000_1000_0110_0001_0011 };
+    assert_eq(downcast<u32:23, u32:8>(not_subnormal, RoundStyle::TIES_TO_EVEN), expected);
+    assert_eq(downcast<u32:23, u32:8>(not_subnormal, RoundStyle::TIES_TO_AWAY), expected);
 }
 
 #[test]
diff --git a/xls/dslx/stdlib/tests/BUILD b/xls/dslx/stdlib/tests/BUILD
index c151842a1c..0d7ed1f5b0 100644
--- a/xls/dslx/stdlib/tests/BUILD
+++ b/xls/dslx/stdlib/tests/BUILD
@@ -149,6 +149,41 @@ cc_test(
     ],
 )
 
+xls_dslx_test(
+    name = "float32_downcast_test",
+    srcs = ["float32_downcast_test.x"],
+)
+
+xls_dslx_opt_ir(
+    name = "float32_downcast",
+    srcs = ["float32_downcast_test.x"],
+    dslx_top = "f64_to_f32",
+    ir_file = "float32_downcast.ir",
+    opt_ir_file = "float32_downcast.opt.ir",
+)
+
+cc_xls_ir_jit_wrapper(
+    name = "float32_downcast_jit_wrapper",
+    src = ":float32_downcast",
+    jit_wrapper_args = {
+        "class_name": "F64ToF32",
+        "namespace": "xls::fp",
+    },
+)
+
+cc_test(
+    name = "float32_downcast_test_cc",
+    srcs = ["float32_downcast_test.cc"],
+    tags = ["optonly"],
+    deps = [
+        ":float32_downcast_jit_wrapper",
+        "//xls/common:xls_gunit_main",
+        "//xls/common/fuzzing:fuzztest",
+        "@com_google_absl//absl/base",
+        "@googletest//:gtest",
+    ],
+)
+
 xls_dslx_test(
     name = "float32_upcast_test",
     srcs = ["float32_upcast_test.x"],
diff --git a/xls/dslx/stdlib/tests/float32_downcast_test.cc b/xls/dslx/stdlib/tests/float32_downcast_test.cc
new file mode 100644
index 0000000000..bcaafd180d
--- /dev/null
+++ b/xls/dslx/stdlib/tests/float32_downcast_test.cc
@@ -0,0 +1,60 @@
+// Copyright 2025 The XLS Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fenv.h>  // NOLINT - Allow fenv header.
+
+#include <cmath>
+#include <cstdint>
+#include <ios>
+#include <utility>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "xls/common/fuzzing/fuzztest.h"
+#include "absl/base/casts.h"
+#include "xls/dslx/stdlib/tests/float32_downcast_jit_wrapper.h"
+
+namespace xls {
+namespace {
+
+static_assert(sizeof(double) == 8, "8 byte double required");
+static_assert(sizeof(float) == 4, "4 byte float required");
+
+class F64ToF32 {
+ public:
+  F64ToF32() { jit_ = std::move(fp::F64ToF32::Create()).value(); }
+  void F64ToF32Test(uint64_t v) {
+    if (fegetround() != FE_TONEAREST) {
+      GTEST_SKIP() << "Unexpected rounding mode";
+    }
+    double d = absl::bit_cast<double>(v);
+    float f = (float)d;
+    float j = jit_->Run(d).value();
+    if (std::isnan(f)) {
+      ASSERT_THAT(j, testing::IsNan());
+    } else {
+      ASSERT_EQ(f, j) << std::boolalpha
+                      << "is subnormal: " << (fpclassify(f) == FP_SUBNORMAL)
+                      << " inp: " << std::hex << "0x" << v << " "
+                      << std::hexfloat << d << " f=" << f << " j=" << j;
+    }
+  }
+
+ private:
+  std::unique_ptr<fp::F64ToF32> jit_;
+};
+FUZZ_TEST_F(F64ToF32, F64ToF32Test).WithDomains(fuzztest::Arbitrary<int64_t>());
+
+}  // namespace
+}  // namespace xls
diff --git a/xls/dslx/stdlib/tests/float32_downcast_test.x b/xls/dslx/stdlib/tests/float32_downcast_test.x
new file mode 100644
index 0000000000..2e064771dc
--- /dev/null
+++ b/xls/dslx/stdlib/tests/float32_downcast_test.x
@@ -0,0 +1,25 @@
+// Copyright 2025 The XLS Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import apfloat;
+import float32;
+import float64;
+
+fn f64_to_f32(f: float64::F64) -> float32::F32 {
+    apfloat::downcast<float32::F32::FRACTION_SIZE, float32::F32::EXP_SIZE>(
+        f, apfloat::RoundStyle::TIES_TO_EVEN)
+}
+
+#[test]
+fn f64_to_f32_test() { assert_eq(f64_to_f32(float64::one(u1:0)), float32::one(u1:0)); }