@@ -1107,14 +1107,15 @@ fn downcast_fractional_rne_fp32_to_bf16_test() {
1107
1107
minus_inf_bf16 );
1108
1108
}
1109
1109
1110
- // Perform downcasting that converts a normal number into a subnormal with the same number of
1111
- // fraction bits. f_cast must have an unbiased exponent less than the minumum normal exponent of the
1112
- // target float type (checked via assert!).
1113
- fn downcast_to_subnormal < TO_EXP_SZ : u32 , FRACTION_SZ : u32 , FROM_EXP_SZ : u32 >
1114
- (f_cast : APFloat < FROM_EXP_SZ , FRACTION_SZ > , round_style : RoundStyle )
1115
- -> APFloat < TO_EXP_SZ , FRACTION_SZ > {
1110
+ // Perform downcasting that converts a normal number into a subnormal. f must
1111
+ // have an unbiased exponent less than the minimum normal exponent of the target
1112
+ // float type (checked via assert!).
1113
+ fn downcast_to_subnormal
1114
+ < TO_FRACTION_SZ : u32 , TO_EXP_SZ : u32 , FROM_FRACTION_SZ : u32 , FROM_EXP_SZ : u32 >
1115
+ (f : APFloat < FROM_EXP_SZ , FROM_FRACTION_SZ > , round_style : RoundStyle )
1116
+ -> APFloat < TO_EXP_SZ , TO_FRACTION_SZ > {
1116
1117
const TO_BIAS = std ::signed_max_value < TO_EXP_SZ > () as sN [FROM_EXP_SZ ];
1117
- let uexp = unbiased_exponent (f_cast );
1118
+ let uexp = unbiased_exponent (f );
1118
1119
// Check for over- and underflow of the exponent in the target type.
1119
1120
assert !(
1120
1121
uexp < (min_normal_exp < TO_EXP_SZ > () as sN [FROM_EXP_SZ ]),
@@ -1124,30 +1125,35 @@ fn downcast_to_subnormal<TO_EXP_SZ: u32, FRACTION_SZ: u32, FROM_EXP_SZ: u32>
1124
1125
// 32 bits is more than large enough for any reasonable
1125
1126
// floating point number's exponent. Narrowing should crush it down.
1126
1127
let right_shift_cnt = - (TO_BIAS + uexp ) as u32 + u32 :1 ;
1127
- if right_shift_cnt > (FRACTION_SZ + u32 :1 ) {
1128
+ if right_shift_cnt > (TO_FRACTION_SZ + u32 :1 ) {
1128
1129
// actually underflows
1129
- zero < TO_EXP_SZ , FRACTION_SZ > ( f_cast .sign )
1130
+ zero < TO_EXP_SZ , TO_FRACTION_SZ > ( f .sign )
1130
1131
} else {
1131
- // Add the implied leading 1.
1132
- let full_frac = u1 :0b1 ++ f_cast .fraction ;
1133
- let unrounded_subnormal_frac = (full_frac >> right_shift_cnt ) as uN [FRACTION_SZ ];
1132
+ const SMALL_FRAC_OFF = FROM_FRACTION_SZ - TO_FRACTION_SZ ;
1133
+ // Truncate the trailing bits of the fraction.
1134
+ let truncated_frac = f .fraction [SMALL_FRAC_OFF + :uN [TO_FRACTION_SZ ]];
1135
+ // Add the implied leading 1
1136
+ let full_frac = u1 :0b1 ++ truncated_frac ;
1137
+ // Shift the bits over.
1138
+ let unrounded_subnormal_frac = full_frac [right_shift_cnt + :uN [TO_FRACTION_SZ ]];
1134
1139
1135
- let round_up = does_lsb_round_up (right_shift_cnt as u32 , full_frac , round_style );
1140
+ let round_up = does_lsb_round_up (
1141
+ right_shift_cnt as u32 + SMALL_FRAC_OFF , u1 :1 ++ f .fraction , round_style );
1136
1142
1137
1143
let subnormal_frac = if round_up {
1138
- unrounded_subnormal_frac + uN [FRACTION_SZ ]:1
1144
+ unrounded_subnormal_frac + uN [TO_FRACTION_SZ ]:1
1139
1145
} else {
1140
1146
unrounded_subnormal_frac
1141
1147
};
1142
1148
1143
1149
// Technically the subnormal frac is good enough but this is
1144
1150
// easier to see through.
1145
- let rounds_to_normal = right_shift_cnt == u32 :1 && subnormal_frac == uN [FRACTION_SZ ]:0 ;
1151
+ let rounds_to_normal = right_shift_cnt == u32 :1 && subnormal_frac == uN [TO_FRACTION_SZ ]:0 ;
1146
1152
1147
1153
if rounds_to_normal {
1148
- APFloat { sign : f_cast .sign , bexp : uN [TO_EXP_SZ ]:1 , fraction : uN [FRACTION_SZ ]:0 }
1154
+ APFloat { sign : f .sign , bexp : uN [TO_EXP_SZ ]:1 , fraction : uN [TO_FRACTION_SZ ]:0 }
1149
1155
} else {
1150
- APFloat { sign : f_cast .sign , bexp : uN [TO_EXP_SZ ]:0 , fraction : subnormal_frac }
1156
+ APFloat { sign : f .sign , bexp : uN [TO_EXP_SZ ]:0 , fraction : subnormal_frac }
1151
1157
}
1152
1158
}
1153
1159
}
@@ -1181,7 +1187,7 @@ fn downcast_normal<TO_FRACTION_SZ: u32, TO_EXP_SZ: u32, FROM_FRACTION_SZ: u32, F
1181
1187
// NB In the no-subnormals case the fraction/bias is already inf.
1182
1188
inf < TO_EXP_SZ , TO_FRACTION_SZ > (f .sign )
1183
1189
} else if CAN_GENERATE_SUBNORMALS && uexp <= - TO_BIAS {
1184
- downcast_to_subnormal < TO_EXP_SZ > (f_cast , round_style )
1190
+ downcast_to_subnormal < TO_FRACTION_SZ , TO_EXP_SZ > (f , round_style )
1185
1191
} else {
1186
1192
APFloat {
1187
1193
sign : f_cast .sign ,
@@ -1432,6 +1438,68 @@ fn downcast_generates_subnormal() {
1432
1438
let expected_even = HF16 { sign : false, bexp : u5 :0 , fraction : u10 :0b00 _0000_0000 };
1433
1439
assert_eq (downcast < u32 :10 , u32 :5 > (not_subnormal , RoundStyle ::TIES_TO_EVEN ), expected_even );
1434
1440
assert_eq (downcast < u32 :10 , u32 :5 > (not_subnormal , RoundStyle ::TIES_TO_AWAY ), expected_away );
1441
+
1442
+ // Rounds up
1443
+ let not_subnormal = F32 {
1444
+ sign : false,
1445
+ bexp : bias < u32 :8 > (min_normal_exp < HF16 ::EXP_SIZE > () as s8 - s8 :3 ),
1446
+ fraction : u10 :0b00 _0000_0110 ++ u13 :0 ,
1447
+ };
1448
+ let expected_away = HF16 { sign : false, bexp : u5 :0 , fraction : u10 :0b00 _1000_0001 };
1449
+ let expected_even = HF16 { sign : false, bexp : u5 :0 , fraction : u10 :0b00 _1000_0001 };
1450
+ assert_eq (downcast < u32 :10 , u32 :5 > (not_subnormal , RoundStyle ::TIES_TO_EVEN ), expected_even );
1451
+ assert_eq (downcast < u32 :10 , u32 :5 > (not_subnormal , RoundStyle ::TIES_TO_AWAY ), expected_away );
1452
+
1453
+ // Rounds down
1454
+ let not_subnormal = F32 {
1455
+ sign : false,
1456
+ bexp : bias < u32 :8 > (min_normal_exp < HF16 ::EXP_SIZE > () as s8 - s8 :3 ),
1457
+ fraction : u10 :0b00 _0000_0011 ++ u13 :0 ,
1458
+ };
1459
+ let expected_away = HF16 { sign : false, bexp : u5 :0 , fraction : u10 :0b00 _1000_0000 };
1460
+ let expected_even = HF16 { sign : false, bexp : u5 :0 , fraction : u10 :0b00 _1000_0000 };
1461
+ assert_eq (downcast < u32 :10 , u32 :5 > (not_subnormal , RoundStyle ::TIES_TO_EVEN ), expected_even );
1462
+ assert_eq (downcast < u32 :10 , u32 :5 > (not_subnormal , RoundStyle ::TIES_TO_AWAY ), expected_away );
1463
+
1464
+ let not_subnormal = F32 {
1465
+ sign : false,
1466
+ bexp : bias < u32 :8 > (min_normal_exp < HF16 ::EXP_SIZE > () as s8 - s8 :3 ),
1467
+ fraction : u10 :0b00 _0000_0100 ++ u13 :0 ,
1468
+ };
1469
+ let expected_away = HF16 { sign : false, bexp : u5 :0 , fraction : u10 :0b00 _1000_0001 };
1470
+ let expected_even = HF16 { sign : false, bexp : u5 :0 , fraction : u10 :0b00 _1000_0000 };
1471
+ assert_eq (downcast < u32 :10 , u32 :5 > (not_subnormal , RoundStyle ::TIES_TO_EVEN ), expected_even );
1472
+ assert_eq (downcast < u32 :10 , u32 :5 > (not_subnormal , RoundStyle ::TIES_TO_AWAY ), expected_away );
1473
+
1474
+ let not_subnormal = F32 {
1475
+ sign : false,
1476
+ bexp : bias < u32 :8 > (min_normal_exp < HF16 ::EXP_SIZE > () as s8 - s8 :3 ),
1477
+ fraction : u10 :0b00 _0000_1100 ++ u13 :0 ,
1478
+ };
1479
+ let expected_away = HF16 { sign : false, bexp : u5 :0 , fraction : u10 :0b00 _1000_0010 };
1480
+ let expected_even = HF16 { sign : false, bexp : u5 :0 , fraction : u10 :0b00 _1000_0010 };
1481
+ assert_eq (downcast < u32 :10 , u32 :5 > (not_subnormal , RoundStyle ::TIES_TO_EVEN ), expected_even );
1482
+ assert_eq (downcast < u32 :10 , u32 :5 > (not_subnormal , RoundStyle ::TIES_TO_AWAY ), expected_away );
1483
+
1484
+ let not_subnormal = F32 {
1485
+ sign : false,
1486
+ bexp : bias < u32 :8 > (min_normal_exp < HF16 ::EXP_SIZE > () as s8 - s8 :3 ),
1487
+ fraction : u23 :0b100 _1001_0111_1011_1110_1001 ,
1488
+ };
1489
+ let expected_away = HF16 { sign : false, bexp : u5 :0 , fraction : u10 :0b00 _1100_1001 };
1490
+ let expected_even = HF16 { sign : false, bexp : u5 :0 , fraction : u10 :0b00 _1100_1001 };
1491
+ assert_eq (downcast < u32 :10 , u32 :5 > (not_subnormal , RoundStyle ::TIES_TO_EVEN ), expected_even );
1492
+ assert_eq (downcast < u32 :10 , u32 :5 > (not_subnormal , RoundStyle ::TIES_TO_AWAY ), expected_away );
1493
+
1494
+ type F64 = APFloat < u32 :11 , u32 :52 > ;
1495
+ let not_subnormal = F64 {
1496
+ sign : false,
1497
+ bexp : bias < F64 ::EXP_SIZE > (min_normal_exp < F32 ::EXP_SIZE > () as s11 - s11 :3 ),
1498
+ fraction : u52 :0b1000 _1000_0110_0001_0010_1001_1010_0000_0001_0010_1100_1010_0001 ,
1499
+ };
1500
+ let expected = F32 { sign : false, bexp : u8 :0 , fraction : u23 :0b001 _1000_1000_0110_0001_0011 };
1501
+ assert_eq (downcast < u32 :23 , u32 :8 > (not_subnormal , RoundStyle ::TIES_TO_EVEN ), expected );
1502
+ assert_eq (downcast < u32 :23 , u32 :8 > (not_subnormal , RoundStyle ::TIES_TO_AWAY ), expected );
1435
1503
}
1436
1504
1437
1505
#[test ]
0 commit comments