11/* SPDX-License-Identifier: MIT OR Apache-2.0 */
2- use crate :: support:: { CastFrom , Float , Int , MinInt } ;
2+ use crate :: support:: { CastFrom , CastInto , Float , HInt , Int , MinInt , NarrowingDiv } ;
33
44#[ inline]
5- pub fn fmod < F : Float > ( x : F , y : F ) -> F {
5+ pub fn fmod < F : Float > ( x : F , y : F ) -> F
6+ where
7+ F :: Int : HInt ,
8+ <F :: Int as HInt >:: D : NarrowingDiv ,
9+ {
610 let _1 = F :: Int :: ONE ;
711 let sx = x. to_bits ( ) & F :: SIGN_MASK ;
812 let ux = x. to_bits ( ) & !F :: SIGN_MASK ;
@@ -29,7 +33,7 @@ pub fn fmod<F: Float>(x: F, y: F) -> F {
2933
3034 // To compute `(num << ex) % (div << ey)`, first
3135 // evaluate `rem = (num << (ex - ey)) % div` ...
32- let rem = reduction ( num, ex - ey, div) ;
36+ let rem = reduction :: < F > ( num, ex - ey, div) ;
3337 // ... so the result will be `rem << ey`
3438
3539 if rem. is_zero ( ) {
@@ -58,11 +62,55 @@ fn into_sig_exp<F: Float>(mut bits: F::Int) -> (F::Int, u32) {
5862}
5963
6064/// Compute the remainder `(x * 2.pow(e)) % y` without overflow.
61- fn reduction < I : Int > ( mut x : I , e : u32 , y : I ) -> I {
62- x %= y;
63- for _ in 0 ..e {
64- x <<= 1 ;
65- x = x. checked_sub ( y) . unwrap_or ( x) ;
65+ fn reduction < F > ( mut x : F :: Int , e : u32 , y : F :: Int ) -> F :: Int
66+ where
67+ F : Float ,
68+ F :: Int : HInt ,
69+ <<F as Float >:: Int as HInt >:: D : NarrowingDiv ,
70+ {
71+ // `f16` only has 5 exponent bits, so even `f16::MAX = 65504.0` is only
72+ // a 40-bit integer multiple of the smallest subnormal.
73+ if F :: BITS == 16 {
74+ debug_assert ! ( F :: EXP_MAX - F :: EXP_MIN == 29 ) ;
75+ debug_assert ! ( e <= 29 ) ;
76+ let u: u16 = x. cast ( ) ;
77+ let v: u16 = y. cast ( ) ;
78+ let u = ( u as u64 ) << e;
79+ let v = v as u64 ;
80+ return F :: Int :: cast_from ( ( u % v) as u16 ) ;
6681 }
67- x
82+
83+ // Ensure `x < 2y` for later steps
84+ if x >= ( y << 1 ) {
85+ // This case is only reached with subnormal divisors,
86+ // but it might be better to just normalize all significands
87+ // to make this unnecessary. The further calls could potentially
88+ // benefit from assuming a specific fixed leading bit position.
89+ x %= y;
90+ }
91+
92+ // The simple implementation seems to be fastest for a short reduction
93+ // at this size. The limit here was chosen empirically on an Intel Nehalem.
94+ // Less old CPUs that have faster `u64 * u64 -> u128` might not benefit,
95+ // and 32-bit systems or architectures without hardware multipliers might
96+ // want to do this in more cases.
97+ if F :: BITS == 64 && e < 32 {
98+ // Assumes `x < 2y`
99+ for _ in 0 ..e {
100+ x = x. checked_sub ( y) . unwrap_or ( x) ;
101+ x <<= 1 ;
102+ }
103+ return x. checked_sub ( y) . unwrap_or ( x) ;
104+ }
105+
106+ // Fast path for short reductions
107+ if e < F :: BITS {
108+ let w = x. widen ( ) << e;
109+ if let Some ( ( _, r) ) = w. checked_narrowing_div_rem ( y) {
110+ return r;
111+ }
112+ }
113+
114+ // Assumes `x < 2y`
115+ crate :: support:: linear_mul_reduction ( x, e, y)
68116}
0 commit comments