3232#endif
3333
3434#ifndef __BMI2__
35- # error "Target architecture must support BMI2 (for MULX)"
35+ # error "Target architecture must support BMI2 (for MULX)"
3636#endif
3737
3838// GCC's overeager SLP vectorizer sometimes pessimizes code. For functions that
@@ -732,84 +732,89 @@ namespace monad::vm::runtime
732732 }
733733
734734 [[gnu::always_inline]]
735- inline constexpr std::pair<uint64_t , uint64_t >
736- mulx_constexpr (uint64_t const x, uint64_t const y) noexcept
735+ inline constexpr void mulx_constexpr (
736+ uint64_t const x, uint64_t const y, uint64_t &r_hi,
737+ uint64_t &r_lo) noexcept
737738 {
738- uint128_t const prod = static_cast < uint128_t >(x) * static_cast < uint128_t >(y);
739- uint64_t const hi = static_cast <uint64_t >(prod >> uint128_t { 64 } );
740- uint64_t const lo = static_cast <uint64_t >(prod);
741- return {hi, lo} ;
739+ uint128_t const prod =
740+ static_cast <uint128_t >(x) * static_cast < uint128_t >(y );
741+ r_hi = static_cast <uint64_t >(prod >> uint128_t { 64 } );
742+ r_lo = static_cast < uint64_t >(prod) ;
742743 }
743744
744745 [[gnu::always_inline]]
745- inline std::pair<uint64_t , uint64_t >
746- mulx_intrinsic (uint64_t const x, uint64_t const y) noexcept
746+ inline void mulx_intrinsic (
747+ uint64_t const x, uint64_t const y, uint64_t &r_hi,
748+ uint64_t &r_lo) noexcept
747749 {
750+ /*
748751 uint64_t hi;
749752 uint64_t lo;
753+ */
750754 asm (" mulx %[x], %[lo], %[hi]"
751- : [hi] " =r" (hi ), [lo] " =r" (lo )
755+ : [hi] " =r" (r_hi ), [lo] " =r" (r_lo )
752756 : [x] " r" (x), [y] " d" (y));
753- return {hi, lo};
754757 }
755758
756759 [[gnu::always_inline]]
757- inline constexpr std::pair<uint64_t , uint64_t >
758- mulx (uint64_t const x, uint64_t const y) noexcept
760+ inline constexpr void mulx (
761+ uint64_t const x, uint64_t const y, uint64_t &r_hi,
762+ uint64_t &r_lo) noexcept
759763 {
760764 if consteval {
761- return mulx_constexpr (x, y);
765+ return mulx_constexpr (x, y, r_hi, r_lo );
762766 }
763767 else {
764- return mulx_intrinsic (x, y);
768+ return mulx_intrinsic (x, y, r_hi, r_lo );
765769 }
766770 }
767771
768772 template <size_t M>
769773 using words_t = std::array<uint64_t , M>;
770774
771775 [[gnu::always_inline]]
772- inline std::tuple<uint64_t , uint64_t , uint64_t > adc_3 (
773- std::tuple<uint64_t , uint64_t , uint64_t > const x,
774- std::tuple<uint64_t , uint64_t > const y) noexcept
776+ inline void adc_3 (
777+ uint64_t x_2, uint64_t x_1, uint64_t x_0, uint64_t const y_1,
778+ uint64_t const y_0, uint64_t &r_2, uint64_t &r_1,
779+ uint64_t &r_0) noexcept
775780 {
776- auto [x_2, x_1, x_0] = x;
777- auto [y_1, y_0] = y;
778781 asm (" addq %[y_0], %[x_0]\n "
779782 " adcq %[y_1], %[x_1]\n "
780783 " adcq $0, %[x_2]"
781784 : [x_0] " +r" (x_0), [x_1] " +r" (x_1), [x_2] " +r" (x_2)
782785 : [y_0] " r" (y_0), [y_1] " r" (y_1)
783786 : " cc" );
784- return {x_2, x_1, x_0};
787+ r_2 = x_2;
788+ r_1 = x_1;
789+ r_0 = x_0;
785790 }
786791
787792 [[gnu::always_inline]]
788- inline std::pair<uint64_t , uint64_t >
789- adc_2 (std::pair<uint64_t , uint64_t > const x, uint64_t const y_0) noexcept
793+ inline void adc_2 (
794+ uint64_t x_1, uint64_t x_0, uint64_t const y_0, uint64_t &r_1,
795+ uint64_t &r_0) noexcept
790796 {
791- auto [x_1, x_0] = x;
792797 asm (" addq %[y_0], %[x_0]\n "
793798 " adcq $0, %[x_1]"
794799 : [x_0] " +r" (x_0), [x_1] " +r" (x_1)
795800 : [y_0] " r" (y_0)
796801 : " cc" );
797- return {x_1, x_0};
802+ r_1 = x_1;
803+ r_0 = x_0;
798804 }
799805
800806 [[gnu::always_inline]]
801- inline std::pair< uint64_t , uint64_t > adc_2 (
802- std::pair< uint64_t , uint64_t > const x ,
803- std::pair< uint64_t , uint64_t > const y ) noexcept
807+ inline void adc_2 (
808+ uint64_t x_1 , uint64_t x_0, uint64_t const y_1, uint64_t const y_0 ,
809+ uint64_t &r_1 , uint64_t &r_0 ) noexcept
804810 {
805- auto [x_1, x_0] = x;
806- auto [y_1, y_0] = y;
807811 asm (" addq %[y_0], %[x_0]\n "
808812 " adcq %[y_1], %[x_1]"
809813 : [x_0] " +r" (x_0), [x_1] " +r" (x_1)
810814 : [y_0] " r" (y_0), [y_1] " r" (y_1)
811815 : " cc" );
812- return {x_1, x_0};
816+ r_1 = x_1;
817+ r_0 = x_0;
813818 }
814819
815820 template <size_t I, size_t R, size_t M>
@@ -820,8 +825,18 @@ namespace monad::vm::runtime
820825 {
821826 if constexpr (I < std::min (R, M)) {
822827 if constexpr (I + 1 < R) {
823- auto const [hi, lo] = mulx (x[I], y);
824- std::tie (carry, result[I]) = adc_2 ({hi, lo}, carry);
828+ uint64_t hi;
829+ uint64_t lo;
830+ mulx (x[I], y, hi, lo);
831+ adc_2 (
832+ // Input 1
833+ hi,
834+ lo,
835+ // Input 2
836+ carry,
837+ // Output
838+ carry,
839+ result[I]);
825840 mul_line_recur<I + 1 , R, M>(x, y, result, carry);
826841 }
827842 else {
@@ -841,7 +856,7 @@ namespace monad::vm::runtime
841856 words_t <R> &__restrict__ result) noexcept
842857 {
843858 uint64_t carry;
844- std::tie (carry, result [0 ]) = mulx (y, x [0 ]);
859+ mulx (y, x [0 ], carry, result [0 ]);
845860
846861 mul_line_recur<1 , R, M>(x, y, result, carry);
847862 }
@@ -855,15 +870,35 @@ namespace monad::vm::runtime
855870 if constexpr (J + 1 < M && I + J < R) {
856871 if constexpr (I + J + 2 < R) {
857872 // We need c_lo, c_hi
858- auto const [hi, lo] = mulx (x[J + 1 ], y_i);
859- std::tie (c_hi, c_lo, result[I + J]) =
860- adc_3 ({hi, lo, result[I + J]}, {c_hi, c_lo});
873+ uint64_t hi;
874+ uint64_t lo;
875+ mulx (x[J + 1 ], y_i, hi, lo);
876+ adc_3 (
877+ // Input 1
878+ hi,
879+ lo,
880+ result[I + J],
881+ // Input 2
882+ c_hi,
883+ c_lo,
884+ // Result
885+ c_hi,
886+ c_lo,
887+ result[I + J]);
861888 }
862889 else if constexpr (I + J + 1 < R) {
863890 // We only need c_lo
864891 uint64_t const lo = x[J + 1 ] * y_i;
865- std::tie (c_lo, result[I + J]) =
866- adc_2 ({lo, result[I + J]}, {c_hi, c_lo});
892+ adc_2 (
893+ // Input 1
894+ lo,
895+ result[I + J],
896+ // Input 2
897+ c_hi,
898+ c_lo,
899+ // Output
900+ c_lo,
901+ result[I + J]);
867902 }
868903 else {
869904 // We're done, we don't need subsequent results
@@ -873,9 +908,16 @@ namespace monad::vm::runtime
873908 }
874909 else {
875910 if constexpr (I + M < R) {
876- auto [hi, lo] = adc_2 ({c_hi, c_lo}, result[I + M - 1 ]);
877- result[I + M - 1 ] = lo;
878- result[I + M] = hi;
911+
912+ adc_2 (
913+ // Input 1
914+ c_hi,
915+ c_lo,
916+ // Input 2
917+ result[I + M - 1 ],
918+ // Output
919+ result[I + M],
920+ result[I + M - 1 ]);
879921 }
880922 else if constexpr (I + M < R + 1 ) {
881923 result[I + M - 1 ] += c_lo;
@@ -904,7 +946,7 @@ namespace monad::vm::runtime
904946 uint64_t c_lo;
905947
906948 if constexpr (I + 1 < R) {
907- std::tie (c_hi, c_lo) = mulx (x[0 ], y_i);
949+ mulx (x[0 ], y_i, c_hi, c_lo );
908950 }
909951 else {
910952 c_hi = 0 ;
@@ -947,7 +989,9 @@ namespace monad::vm::runtime
947989 for (size_t j = 0 ; j < N; j++) {
948990 uint64_t carry = 0 ;
949991 for (size_t i = 0 ; i < M && i + j < R; i++) {
950- auto const [hi, lo] = mulx (x[i], y[j]);
992+ uint64_t hi;
993+ uint64_t lo;
994+ mulx (x[i], y[j], hi, lo);
951995
952996 auto const [s0, c0] = addc (lo, result[i + j], false );
953997 auto const [s1, c1] = addc (s0, carry, false );
0 commit comments