From d09c98ed7c5ecae20c60e19dcb79e7cf6797a8d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Wed, 27 Nov 2024 15:07:21 +0000 Subject: [PATCH 01/13] Add hardcoded flint_mpn_aors_n for ARM and x86 These are generated from `dev/gen_ARCH_aors.jl`. Also add tests for it. --- dev/gen_arm_aors.jl | 94 +++ dev/gen_x86_aors.jl | 83 +++ src/mpn_extras.h | 41 ++ src/mpn_extras/aors_n.c | 88 +++ src/mpn_extras/arm64/aors_hard.asm | 492 +++++++++++++++ src/mpn_extras/test/main.c | 2 + src/mpn_extras/test/t-aors_n.c | 85 +++ src/mpn_extras/x86_64/broadwell/aors_hard.asm | 565 ++++++++++++++++++ 8 files changed, 1450 insertions(+) create mode 100644 dev/gen_arm_aors.jl create mode 100644 dev/gen_x86_aors.jl create mode 100644 src/mpn_extras/aors_n.c create mode 100644 src/mpn_extras/arm64/aors_hard.asm create mode 100644 src/mpn_extras/test/t-aors_n.c create mode 100644 src/mpn_extras/x86_64/broadwell/aors_hard.asm diff --git a/dev/gen_arm_aors.jl b/dev/gen_arm_aors.jl new file mode 100644 index 0000000000..465b90e64a --- /dev/null +++ b/dev/gen_arm_aors.jl @@ -0,0 +1,94 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. See . +# + +# Generating routines for r <- a OP b, where OP is either + or -. +# +# This generation was constructed with processors with Apple silicon in mind. +# Processors decoding less than 6 operations per cycle, or few store and load +# units may have worse performance. + +r = "rp" +a = "ap" +b = "bp" +rp(ix::Int) = "[$r,#$ix*8]" +ap(ix::Int) = "[$a,#$ix*8]" +bp(ix::Int) = "[$b,#$ix*8]" + +sx = "sx" # Return value for carry or borrow +CC = "CC" + +sp = ["s$ix" for ix in 0:14] # Scrap registers + +# Writes assembly that should be preprocessed by M4. 
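+#
+# As a rough illustration, `aors(2)` emits the body of the n = 2 routine
+# found in `src/mpn_extras/arm64/aors_hard.asm` (before M4 expands the
+# OP/OPC/CC placeholders into adds/adcs/cs or subs/sbcs/cc):
+#
+#   ldp   s0, s2, [ap,#0*8]
+#   ldp   s1, s3, [bp,#0*8]
+#   OP    s0, s0, s1
+#   OPC   s2, s2, s3
+#   stp   s0, s2, [rp,#0*8]
+#   cset  sx, CC
+#   ret
+#
+# Limbs are handled two at a time via ldp/stp, and `shift` below rotates the
+# scrap registers four steps per iteration so the loads for the next pair do
+# not clobber the registers still holding the previous pair.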
+function aors(n::Int) + _str = "PROLOGUE(flint_mpn_aors($n))\n" + function ldr(s0::String, s1::String) + _str *= "\tldr\t$s0, $s1\n" + end + function ldp(s0::String, s1::String, s2::String) + _str *= "\tldp\t$s0, $s1, $s2\n" + end + function str(s0::String, s1::String) + _str *= "\tstr\t$s0, $s1\n" + end + function stp(s0::String, s1::String, s2::String) + _str *= "\tstp\t$s0, $s1, $s2\n" + end + function OP(s0::String, s1::String, s2::String) + _str *= "\tOP\t$s0, $s1, $s2\n" + end + function OPC(s0::String, s1::String, s2::String) + _str *= "\tOPC\t$s0, $s1, $s2\n" + end + function cset(s0::String, s1::String) + _str *= "\tcset\t$s0, $s1\n" + end + + sv = deepcopy(sp) + s(ix::Int) = sv[ix + 1] + function shift(sv::Vector{String}) + sv[(end - 3):end], sv[1:(end - 4)] = sv[1:4], sv[5:end] + end + + ldp( s(0), s(2), ap(0)) + ldp( s(1), s(3), bp(0)) + OP( s(0), s(0), s(1)) + OPC( s(2), s(2), s(3)) + stp( s(0), s(2), rp(0)) + + for ix in 1:(n ÷ 2 - 1) + shift(sv) + ldp( s(0), s(2), ap(2 * ix)) + ldp( s(1), s(3), bp(2 * ix)) + OPC( s(0), s(0), s(1)) + OPC( s(2), s(2), s(3)) + stp( s(0), s(2), rp(2 * ix)) + end + + if n % 2 == 1 + ldr( s(4), ap(n - 1)) + ldr( s(5), bp(n - 1)) + OPC( s(4), s(4), s(5)) + str( s(4), rp(n - 1)) + end + + cset( sx, CC) + + _str *= "\tret\nEPILOGUE()\n" + + return _str +end + +function print_all_aors(nmax::Int = 16) + for n in 2:nmax + println(aors(n)) + end +end diff --git a/dev/gen_x86_aors.jl b/dev/gen_x86_aors.jl new file mode 100644 index 0000000000..0db9110cbd --- /dev/null +++ b/dev/gen_x86_aors.jl @@ -0,0 +1,83 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. See . +# + +# Generating routines for r <- a OP b, where OP is either + or -. +# +# This generation was constructed with processors with descent schedulers in +# mind. + +r = "rp" +a = "ap" +b = "bp" +rp(ix::Int) = "$ix*8($r)" +ap(ix::Int) = "$ix*8($a)" +bp(ix::Int) = "$ix*8($b)" + +sx = "sx" # Return value for carry or borrow, i.e. %rax + +R32(sx::String) = "R32($sx)" +R8(sx::String) = "R8($sx)" + +sp = ["s$ix" for ix in 0:4] # Scrap registers + +# Writes assembly that should be preprocessed by M4. 
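+#
+# As a rough illustration, `aors(2)` emits the body of the n = 2 routine
+# found in `src/mpn_extras/x86_64/broadwell/aors_hard.asm` (before M4
+# expands OP/OPC into add/adc or sub/sbb):
+#
+#   mov   0*8(ap), s0
+#   mov   1*8(ap), s1
+#   xor   R32(sx), R32(sx)
+#   OP    0*8(bp), s0
+#   mov   s0, 0*8(rp)
+#   OPC   1*8(bp), s1
+#   mov   s1, 1*8(rp)
+#   setc  R8(sx)
+#   ret
+#
+# The xor zeroes the return register up front; for larger n the load of
+# ap[i + 1] is issued before the OPC on limb i, and `shift` below rotates
+# the five scrap registers one step per limb.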
+function aors(n::Int) + str = "\tALIGN(16)\nPROLOGUE(flint_mpn_aors($n))\n" + function mov(s0::String, s1::String) + str *= "\tmov\t$s0, $s1\n" + end + function xor(s0::String, s1::String) + str *= "\txor\t$s0, $s1\n" + end + function OP(s0::String, s1::String) + str *= "\tOP\t$s0, $s1\n" + end + function OPC(s0::String, s1::String) + str *= "\tOPC\t$s0, $s1\n" + end + function setc(s0::String) + str *= "\tsetc\t$s0\n" + end + + sv = deepcopy(sp) + s(ix::Int) = sv[ix + 1] + function shift(sv::Vector{String}) + sv[end], sv[1:end - 1] = sv[1], sv[2:end] + end + + mov( ap(0), s(0)) + + mov( ap(1), s(1)) + xor( R32(sx), R32(sx)) + OP( bp(0), s(0)) + mov( s(0), rp(0)) + + for ix in 1:(n - 2) + shift(sv) + mov( ap(ix + 1), s(1)) + OPC( bp(ix), s(0)) + mov( s(0), rp(ix)) + end + + OPC( bp(n - 1), s(1)) + mov( s(1), rp(n - 1)) + setc( R8(sx)) + + str *= "\tret\nEPILOGUE()\n" + + return str +end + +function print_all_aors(nmax::Int = 16) + for n in 2:nmax + println(aors(n)) + end +end diff --git a/src/mpn_extras.h b/src/mpn_extras.h index 90fc8e6436..9d4ba63c0c 100644 --- a/src/mpn_extras.h +++ b/src/mpn_extras.h @@ -462,25 +462,34 @@ mp_limb_t mpn_rsh1sub_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); /* multiplication (general) **************************************************/ +/* NOTE: This is getting a bit messy. How can we clean this up? */ #if FLINT_HAVE_ASSEMBLY_x86_64_adx +# define FLINT_MPN_AORS_FUNC_TAB_WIDTH 17 # define FLINT_MPN_MUL_FUNC_TAB_WIDTH 17 # define FLINT_MPN_SQR_FUNC_TAB_WIDTH 14 +# define FLINT_HAVE_AORS_FUNC(n) ((n) < FLINT_MPN_AORS_FUNC_TAB_WIDTH) # define FLINT_HAVE_MUL_FUNC(n, m) ((n) <= 16) # define FLINT_HAVE_MUL_N_FUNC(n) ((n) <= 16) # define FLINT_HAVE_SQR_FUNC(n) ((n) <= FLINT_MPN_SQR_FUNC_TAB_WIDTH) +# define FLINT_MPN_ADD_HARD(rp, xp, yp, n) (flint_mpn_add_func_tab[n](rp, xp, yp)) +# define FLINT_MPN_SUB_HARD(rp, xp, yp, n) (flint_mpn_sub_func_tab[n](rp, xp, yp)) # define FLINT_MPN_MUL_HARD(rp, xp, xn, yp, yn) (flint_mpn_mul_func_tab[xn][yn](rp, xp, yp)) # define FLINT_MPN_MUL_N_HARD(rp, xp, yp, n) (flint_mpn_mul_n_func_tab[n](rp, xp, yp)) # define FLINT_MPN_SQR_HARD(rp, xp, n) (flint_mpn_sqr_func_tab[n](rp, xp)) #elif FLINT_HAVE_ASSEMBLY_armv8 +# define FLINT_MPN_AORS_FUNC_TAB_WIDTH 17 # define FLINT_MPN_MUL_FUNC_N_TAB_WIDTH 15 # define FLINT_MPN_SQR_FUNC_TAB_WIDTH 9 +# define FLINT_HAVE_AORS_FUNC(n) ((n) < FLINT_MPN_AORS_FUNC_TAB_WIDTH) # define FLINT_HAVE_MUL_FUNC(n, m) FLINT_HAVE_MUL_N_FUNC(n) # define FLINT_HAVE_MUL_N_FUNC(n) ((n) <= FLINT_MPN_MUL_FUNC_N_TAB_WIDTH) # define FLINT_HAVE_SQR_FUNC(n) ((n) <= FLINT_MPN_SQR_FUNC_TAB_WIDTH) +# define FLINT_MPN_ADD_HARD(rp, xp, yp, n) (flint_mpn_add_func_tab[n](rp, xp, yp)) +# define FLINT_MPN_SUB_HARD(rp, xp, yp, n) (flint_mpn_sub_func_tab[n](rp, xp, yp)) # define FLINT_MPN_MUL_HARD(rp, xp, xn, yp, yn) (flint_mpn_mul_func_n_tab[xn](rp, xp, yp, yn)) # define FLINT_MPN_MUL_N_HARD(rp, xp, yp, n) (flint_mpn_mul_func_n_tab[n](rp, xp, yp, n)) # define FLINT_MPN_SQR_HARD(rp, xp, n) (flint_mpn_sqr_func_tab[n](rp, xp)) @@ -506,6 +515,16 @@ typedef mp_limb_t (* flint_mpn_mul_func_t)(mp_ptr, mp_srcptr, mp_srcptr); typedef mp_limb_t (* flint_mpn_mul_func_n_t)(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); typedef mp_limb_t (* flint_mpn_sqr_func_t)(mp_ptr, mp_srcptr); +#ifdef FLINT_MPN_AORS_FUNC_TAB_WIDTH +# define FLINT_USE_AORS_FUNC_TAB 1 +FLINT_DLL extern const flint_mpn_mul_func_t flint_mpn_add_func_tab[]; +FLINT_DLL extern const flint_mpn_mul_func_t flint_mpn_sub_func_tab[]; +#else +# define FLINT_HAVE_AORS_FUNC(n) 0 +# define 
FLINT_MPN_ADD_HARD(rp, xp, yp, n) 0 +# define FLINT_MPN_SUB_HARD(rp, xp, yp, n) 0 +#endif + #ifdef FLINT_MPN_MUL_FUNC_N_TAB_WIDTH FLINT_DLL extern const flint_mpn_mul_func_n_t flint_mpn_mul_func_n_tab[]; #else @@ -522,6 +541,28 @@ mp_limb_t _flint_mpn_mul(mp_ptr r, mp_srcptr x, mp_size_t xn, mp_srcptr y, mp_si void _flint_mpn_mul_n(mp_ptr r, mp_srcptr x, mp_srcptr y, mp_size_t n); mp_limb_t _flint_mpn_sqr(mp_ptr r, mp_srcptr x, mp_size_t n); +MPN_EXTRAS_INLINE +mp_limb_t flint_mpn_add_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) +{ + FLINT_ASSERT(n >= 1); + + if (FLINT_HAVE_AORS_FUNC(n)) + return FLINT_MPN_ADD_HARD(rp, xp, yp, n); + else + return mpn_add_n(rp, xp, yp, n); +} + +MPN_EXTRAS_INLINE +mp_limb_t flint_mpn_sub_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) +{ + FLINT_ASSERT(n >= 1); + + if (FLINT_HAVE_AORS_FUNC(n)) + return FLINT_MPN_SUB_HARD(rp, xp, yp, n); + else + return mpn_sub_n(rp, xp, yp, n); +} + MPN_EXTRAS_INLINE mp_limb_t flint_mpn_mul(mp_ptr r, mp_srcptr x, mp_size_t xn, mp_srcptr y, mp_size_t yn) { diff --git a/src/mpn_extras/aors_n.c b/src/mpn_extras/aors_n.c new file mode 100644 index 0000000000..ee9231aecd --- /dev/null +++ b/src/mpn_extras/aors_n.c @@ -0,0 +1,88 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "mpn_extras.h" + +#define DECL_AORS(n) _DECL_AORS(n) +#define _DECL_AORS(n) \ +mp_limb_t flint_mpn_add_##n(mp_ptr, mp_srcptr, mp_srcptr); \ +mp_limb_t flint_mpn_sub_##n(mp_ptr, mp_srcptr, mp_srcptr) + +#define ADD(n) _ADD(n) +#define _ADD(n) flint_mpn_add_##n +#define SUB(n) _SUB(n) +#define _SUB(n) flint_mpn_sub_##n + +/* Herein we assume that x86 and ARM are equivalent. */ +#if FLINT_HAVE_ASSEMBLY_x86_64_adx || FLINT_HAVE_ASSEMBLY_armv8 +DECL_AORS(1); +DECL_AORS(2); +DECL_AORS(3); +DECL_AORS(4); +DECL_AORS(5); +DECL_AORS(6); +DECL_AORS(7); +DECL_AORS(8); +DECL_AORS(9); +DECL_AORS(10); +DECL_AORS(11); +DECL_AORS(12); +DECL_AORS(13); +DECL_AORS(14); +DECL_AORS(15); +DECL_AORS(16); + +/* TODO: Should probably rename these types so to not have two different types. + * Probably something like `mpn_binary_h_func`, where `h` is for hardcoded. */ +const flint_mpn_mul_func_t flint_mpn_add_func_tab[] = +{ + NULL, + ADD(1), + ADD(2), + ADD(3), + ADD(4), + ADD(5), + ADD(6), + ADD(7), + ADD(8), + ADD(9), + ADD(10), + ADD(11), + ADD(12), + ADD(13), + ADD(14), + ADD(15), + ADD(16) +}; + +const flint_mpn_mul_func_t flint_mpn_sub_func_tab[] = +{ + NULL, + SUB(1), + SUB(2), + SUB(3), + SUB(4), + SUB(5), + SUB(6), + SUB(7), + SUB(8), + SUB(9), + SUB(10), + SUB(11), + SUB(12), + SUB(13), + SUB(14), + SUB(15), + SUB(16) +}; +#else +typedef int this_file_is_empty; +#endif diff --git a/src/mpn_extras/arm64/aors_hard.asm b/src/mpn_extras/arm64/aors_hard.asm new file mode 100644 index 0000000000..ed9cc2a0e0 --- /dev/null +++ b/src/mpn_extras/arm64/aors_hard.asm @@ -0,0 +1,492 @@ +dnl +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl This file is part of FLINT. +dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . 
+dnl + +include(`config.m4') + +dnl Everything from n = 2 and onwards is generated by +dnl $topdir/dev/gen_arm_aors.jl. +dnl +dnl This generation was constructed with processors with Apple silicon in mind. +dnl Processors decoding less than 6 operations per cycle, or few store and load +dnl units may have worse performance. + +define(`rp', `x0') +define(`ap', `x1') +define(`bp', `x2') + +define(`sx', `x0') C Beware that this is synonymous with rp +define(`s0', `x3') +define(`s1', `x4') +define(`s2', `x5') +define(`s3', `x6') +define(`s4', `x7') +define(`s5', `x8') +define(`s6', `x9') +define(`s7', `x10') +define(`s8', `x11') +define(`s9', `x12') +define(`s10', `x13') +define(`s11', `x14') +define(`s12', `x15') +define(`s13', `x16') +define(`s14', `x17') + +define(ALL_AORS,` +PROLOGUE(flint_mpn_aors(1)) + ldr s0, [ap,#0*8] + ldr s1, [bp,#0*8] + OP s0, s0, s1 + str s0, [rp,#0*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(2)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(3)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldr s4, [ap,#2*8] + ldr s5, [bp,#2*8] + OPC s4, s4, s5 + str s4, [rp,#2*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(4)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(5)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldr s8, [ap,#4*8] + ldr s9, [bp,#4*8] + OPC s8, s8, s9 + str s8, [rp,#4*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(6)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(7)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldr s12, [ap,#6*8] + ldr s13, [bp,#6*8] + OPC s12, s12, s13 + str s12, [rp,#6*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(8)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(9)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, 
[rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldr s1, [ap,#8*8] + ldr s2, [bp,#8*8] + OPC s1, s1, s2 + str s1, [rp,#8*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(10)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(11)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldr s5, [ap,#10*8] + ldr s6, [bp,#10*8] + OPC s5, s5, s6 + str s5, [rp,#10*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(12)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldp s5, s7, [ap,#10*8] + ldp s6, s8, [bp,#10*8] + OPC s5, s5, s6 + OPC s7, s7, s8 + stp s5, s7, [rp,#10*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(13)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldp s5, s7, [ap,#10*8] + ldp s6, s8, [bp,#10*8] + OPC s5, s5, s6 + OPC s7, s7, s8 + stp s5, s7, [rp,#10*8] + ldr s9, [ap,#12*8] + ldr s10, [bp,#12*8] + OPC s9, s9, s10 + str s9, [rp,#12*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(14)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, 
s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldp s5, s7, [ap,#10*8] + ldp s6, s8, [bp,#10*8] + OPC s5, s5, s6 + OPC s7, s7, s8 + stp s5, s7, [rp,#10*8] + ldp s9, s11, [ap,#12*8] + ldp s10, s12, [bp,#12*8] + OPC s9, s9, s10 + OPC s11, s11, s12 + stp s9, s11, [rp,#12*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(15)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldp s5, s7, [ap,#10*8] + ldp s6, s8, [bp,#10*8] + OPC s5, s5, s6 + OPC s7, s7, s8 + stp s5, s7, [rp,#10*8] + ldp s9, s11, [ap,#12*8] + ldp s10, s12, [bp,#12*8] + OPC s9, s9, s10 + OPC s11, s11, s12 + stp s9, s11, [rp,#12*8] + ldr s13, [ap,#14*8] + ldr s14, [bp,#14*8] + OPC s13, s13, s14 + str s13, [rp,#14*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(16)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldp s5, s7, [ap,#10*8] + ldp s6, s8, [bp,#10*8] + OPC s5, s5, s6 + OPC s7, s7, s8 + stp s5, s7, [rp,#10*8] + ldp s9, s11, [ap,#12*8] + ldp s10, s12, [bp,#12*8] + OPC s9, s9, s10 + OPC s11, s11, s12 + stp s9, s11, [rp,#12*8] + ldp s13, s0, [ap,#14*8] + ldp s14, s1, [bp,#14*8] + OPC s13, s13, s14 + OPC s0, s0, s1 + stp s13, s0, [rp,#14*8] + cset sx, CC + ret +EPILOGUE() +') + +define(`flint_mpn_aors',`flint_mpn_add_$1') +define(`OP',`adds') +define(`OPC',`adcs') +define(`CC',`cs') +ALL_AORS +undefine(`flint_mpn_aors') +undefine(`OP') +undefine(`OPC') + +define(`flint_mpn_aors',`flint_mpn_sub_$1') +define(`OP',`subs') +define(`OPC',`sbcs') +define(`CC',`cc') +ALL_AORS +undefine(`flint_mpn_aors') +undefine(`OP') +undefine(`OPC') diff --git a/src/mpn_extras/test/main.c b/src/mpn_extras/test/main.c index f39d688fdd..171a9b7342 100644 --- a/src/mpn_extras/test/main.c +++ b/src/mpn_extras/test/main.c @@ -12,6 +12,7 @@ /* Include functions *********************************************************/ #include "t-2add_n_inplace.c" +#include "t-aors_n.c" #include "t-divides.c" #include "t-divrem_preinv1.c" #include "t-divrem_preinvn.c" @@ -38,6 +39,7 @@ test_struct tests[] = { TEST_FUNCTION(flint_mpn_2add_n_inplace), + TEST_FUNCTION(flint_mpn_aors_n), TEST_FUNCTION(flint_mpn_divides), TEST_FUNCTION(flint_mpn_divrem_preinv1), TEST_FUNCTION(flint_mpn_divrem_preinvn), diff --git a/src/mpn_extras/test/t-aors_n.c b/src/mpn_extras/test/t-aors_n.c new file mode 100644 index 0000000000..0af210d94c --- /dev/null +++ b/src/mpn_extras/test/t-aors_n.c @@ -0,0 +1,85 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + 
Copyright (C) 2024 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "test_helpers.h" +#include "mpn_extras.h" + +#define N_MIN 1 +#define N_MAX (FLINT_MPN_AORS_FUNC_TAB_WIDTH - 1) +#define N_STOR (FLINT_MPN_AORS_FUNC_TAB_WIDTH + 10) + +TEST_FUNCTION_START(flint_mpn_aors_n, state) +{ +#if FLINT_USE_AORS_FUNC_TAB + slong ix; + + for (ix = 0; ix < 10000 * flint_test_multiplier(); ix++) + { + int result; + int type; + mp_limb_t cf, cg; + mp_size_t n; + mp_ptr fp, gp, xp, yp; + + n = N_MIN + n_randint(state, N_MAX - N_MIN + 1); + if (n_randint(state, 1 << 10) == UWORD(0)) + n += N_STOR; + + fp = flint_malloc(sizeof(mp_limb_t) * n); + gp = flint_malloc(sizeof(mp_limb_t) * n); + xp = flint_malloc(sizeof(mp_limb_t) * n); + yp = flint_malloc(sizeof(mp_limb_t) * n); + + flint_mpn_rrandom(xp, state, n); + flint_mpn_rrandom(yp, state, n); + + type = n_randint(state, 2); + + if (type == 0) + { + cf = flint_mpn_add_n(fp, xp, yp, n); + cg = mpn_add_n(gp, xp, yp, n); + } + else + { + cf = flint_mpn_sub_n(fp, xp, yp, n); + cg = mpn_sub_n(gp, xp, yp, n); + } + + result = (cf == cg && mpn_cmp(fp, gp, n) == 0); + if (!result) + TEST_FUNCTION_FAIL( + "%s:\n" + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "yp = %{ulong*}\n" + "FLINT (cy = %wu): %{ulong*}\n" + "GMP (cy = %wu): %{ulong*}\n", + type == 0 ? "flint_mpn_add_n" : "flint_mpn_sub_n", + ix, n, xp, n, yp, n, cf, fp, n, cg, gp, n + 1); + + flint_free(fp); + flint_free(gp); + flint_free(xp); + flint_free(yp); + } + + TEST_FUNCTION_END(state); +#else + TEST_FUNCTION_END_SKIPPED(state); +#endif +} + +#undef N_MIN +#undef N_MAX +#undef N_STOR diff --git a/src/mpn_extras/x86_64/broadwell/aors_hard.asm b/src/mpn_extras/x86_64/broadwell/aors_hard.asm new file mode 100644 index 0000000000..390ee036ec --- /dev/null +++ b/src/mpn_extras/x86_64/broadwell/aors_hard.asm @@ -0,0 +1,565 @@ +dnl +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl This file is part of FLINT. +dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . +dnl + +include(`config.m4') + +dnl Everything from n = 2 and onwards is generated by +dnl $topdir/dev/gen_x86_aors.jl. 
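+dnl
+dnl The ALL_AORS macro below is expanded twice at the bottom of this file:
+dnl once with OP/OPC defined as add/adc, giving flint_mpn_add_1 through
+dnl flint_mpn_add_16, and once with sub/sbb, giving flint_mpn_sub_1 through
+dnl flint_mpn_sub_16.  In every body the carry/borrow out is returned in
+dnl sx (%rax) via setc.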
+ +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`bp', `%rdx') + +define(`sx', `%rax') +define(`s0', `%rcx') +define(`s1', `%r8') +define(`s2', `%r9') +define(`s3', `%r10') +define(`s4', `%r11') + +define(ALL_AORS,` + ALIGN(16) +PROLOGUE(flint_mpn_aors(1)) + mov 0*8(ap), s0 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(2)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(3)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(4)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(5)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(6)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(7)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(8)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(9)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) 
+PROLOGUE(flint_mpn_aors(10)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(11)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(12)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + mov 11*8(ap), s1 + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + OPC 11*8(bp), s1 + mov s1, 11*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(13)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + mov 11*8(ap), s1 + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + mov 12*8(ap), s2 + OPC 11*8(bp), s1 + mov s1, 11*8(rp) + OPC 12*8(bp), s2 + mov s2, 12*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(14)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + mov 11*8(ap), s1 + OPC 10*8(bp), s0 + mov s0, 
10*8(rp) + mov 12*8(ap), s2 + OPC 11*8(bp), s1 + mov s1, 11*8(rp) + mov 13*8(ap), s3 + OPC 12*8(bp), s2 + mov s2, 12*8(rp) + OPC 13*8(bp), s3 + mov s3, 13*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(15)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + mov 11*8(ap), s1 + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + mov 12*8(ap), s2 + OPC 11*8(bp), s1 + mov s1, 11*8(rp) + mov 13*8(ap), s3 + OPC 12*8(bp), s2 + mov s2, 12*8(rp) + mov 14*8(ap), s4 + OPC 13*8(bp), s3 + mov s3, 13*8(rp) + OPC 14*8(bp), s4 + mov s4, 14*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(16)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + mov 11*8(ap), s1 + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + mov 12*8(ap), s2 + OPC 11*8(bp), s1 + mov s1, 11*8(rp) + mov 13*8(ap), s3 + OPC 12*8(bp), s2 + mov s2, 12*8(rp) + mov 14*8(ap), s4 + OPC 13*8(bp), s3 + mov s3, 13*8(rp) + mov 15*8(ap), s0 + OPC 14*8(bp), s4 + mov s4, 14*8(rp) + OPC 15*8(bp), s0 + mov s0, 15*8(rp) + setc R8(sx) + ret +EPILOGUE() +') + + TEXT +define(`flint_mpn_aors',`flint_mpn_add_$1') +define(`OP',`add') +define(`OPC',`adc') +ALL_AORS +undefine(`flint_mpn_aors') +undefine(`OP') +undefine(`OPC') + +define(`flint_mpn_aors',`flint_mpn_sub_$1') +define(`OP',`sub') +define(`OPC',`sbb') +ALL_AORS +undefine(`flint_mpn_aors') +undefine(`OP') +undefine(`OPC') From 00ecdcbc6d6b1e6322494f27a9ae3b02d4ec8f32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Sat, 30 Nov 2024 14:47:06 +0000 Subject: [PATCH 02/13] Start on aorsrsh for x86 --- .../x86_64/broadwell/aorsrsh_hard.asm | 220 ++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm new file mode 100644 index 0000000000..4fefb0143b --- /dev/null +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -0,0 +1,220 @@ +dnl +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl This file is part of FLINT. +dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . 
+dnl + +include(`config.m4') + +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`bp', `%rdx') +define(`cnt', `%rcx') + +define(`tnc', `%r8') +define(`sx', `%rax') + +define(`s0', `%r9') +define(`s1', `%r10') +define(`s2', `%r11') + +dnl r <- a +/- 2^n b +dnl +dnl For 0 <= i < n - 1, we have +dnl +dnl r_{i} = a_{i} +/- (b_{i} >> n + b_{i + 1} << (64 - n)), +dnl +dnl and +dnl +dnl r_{n - 1} = a_{n - 1} +/- (b_{n - 1} >> n). + +dnl The idea is the following: +dnl +dnl Assume that bp[i] is loaded in a register b0. +dnl +dnl t = b0 >> n C shrx +dnl b1 = bp[i + 1] C mov, and fullfills assumption for next iteration +dnl s = b1 << (64 - n) C shlx +dnl s = s + t C lea, carry-less +dnl if OP = add, then +dnl s += ap[i] C adc +dnl rp[i] = s C mov +dnl else +dnl u = ap[i] C mov +dnl u -= s C sbb +dnl rp[i] = u C mov +dnl fi + +define(ALL_AORS,` + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_1) + shrx cnt, 0*8(bp), s0 + xor R32(sx), R32(sx) + add 0*8(ap), s0 + mov s0, 0*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_2) + xor R32(tnc), R32(tnc) + sub cnt, tnc + + xor R32(sx), R32(sx) + + mov 1*8(bp), s1 +C + shrx cnt, 0*8(bp), s0 + shlx tnc, s1, s2 + shrx cnt, s1, s1 + C (0, 2), 1 + + adox s2, s0 +C + adcx 0*8(ap), s0 + mov s0, 0*8(rp) + adox sx, s1 C cannot overflow + adcx 1*8(ap), s1 +C + mov s1, 1*8(rp) + + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_3) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 +ifelse(OP,`add',` + add 0*8(ap), s2 + mov s2, 0*8(rp) +',` + mov 0*8(ap), s0 + sub s2, s0 + mov s0, 0*8(rp) +') + C Used: s1 + + shrx cnt, s1, s1 + mov 2*8(bp), s2 + shlx tnc, s2, s0 + lea (s1, s0), s0 +ifelse(OP,`add',` + adc 1*8(ap), s0 + mov s0, 1*8(rp) +',` + mov 1*8(ap), s1 + sbb s0, s1 + mov s1, 1*8(rp) +') + C Used: s2 + + shrx cnt, s2, s2 +ifelse(OP,`add',` + adc 2*8(ap), s2 + mov s2, 2*8(rp) +',` + mov 2*8(ap), s0 + sbb s2, s0 + mov s0, 1*8(rp) +') + + setc R8(sx) + ret +EPILOGUE() +') + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_4) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 +ifelse(OP,`add',` + add 0*8(ap), s2 + mov s2, 0*8(rp) +',` + mov 0*8(ap), s0 + sub s2, s0 + mov s0, 0*8(rp) +') + C Used: s1 + + shrx cnt, s1, s1 + mov 2*8(bp), s2 + shlx tnc, s2, s0 + lea (s1, s0), s0 +ifelse(OP,`add',` + adc 1*8(ap), s0 + mov s0, 1*8(rp) +',` + mov 1*8(ap), s1 + sbb s0, s1 + mov s1, 1*8(rp) +') + C Used: s2 + +C + shrx cnt, s1, s1 + mov 3*8(bp), s2 + shlx tnc, s2, s0 + lea (s1, s0), s0 +ifelse(OP,`add',` + adc 2*8(ap), s0 + mov s0, 2*8(rp) +',` + mov 2*8(ap), s1 + sbb s0, s1 + mov s1, 2*8(rp) +') + C Used: s2 +C + + shrx cnt, s2, s2 +ifelse(OP,`add',` + adc 2*8(ap), s2 + mov s2, 2*8(rp) +',` + mov 2*8(ap), s0 + sbb s2, s0 + mov s0, 1*8(rp) +') + + setc R8(sx) + ret +EPILOGUE() +') + + TEXT +define(`flint_mpn_aorsrsh',`flint_mpn_addrsh_$1') +define(`OP',`add') +define(`OPC',`adc') +ALL_AORSRSH +undefine(`flint_mpn_aorsrsh') +undefine(`OP') +undefine(`OPC') + +define(`flint_mpn_aorsrsh',`flint_mpn_subrsh_$1') +define(`OP',`sub') +define(`OPC',`sbb') +ALL_AORSRSH +undefine(`flint_mpn_aorsrsh') +undefine(`OP') +undefine(`OPC') From 9ce9168e4e6a99ad0e40d279b7fba949a3933401 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 14:07:56 +0000 Subject: [PATCH 03/13] Add generation for hardcoded flint_mpn_addrsh --- 
dev/gen_x86_aorsrsh.jl | 125 ++ .../x86_64/broadwell/aorsrsh_hard.asm | 1026 +++++++++++++++-- 2 files changed, 1048 insertions(+), 103 deletions(-) create mode 100644 dev/gen_x86_aorsrsh.jl diff --git a/dev/gen_x86_aorsrsh.jl b/dev/gen_x86_aorsrsh.jl new file mode 100644 index 0000000000..401471d706 --- /dev/null +++ b/dev/gen_x86_aorsrsh.jl @@ -0,0 +1,125 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. See . +# + +# Generating routines for r <- a OP 2^(cnt) * b, where OP is either + or -. + +r = "rp" +a = "ap" +b = "bp" +cnt = "cnt" +rp(ix::Int) = "$ix*8($r)" +ap(ix::Int) = "$ix*8($a)" +bp(ix::Int) = "$ix*8($b)" + +tnc = "tnc" +sx = "sx" # Return value for carry or borrow, i.e. %rax + +R32(sx::String) = "R32($sx)" +R8(sx::String) = "R8($sx)" + +s0 = "s0" +s1 = "s1" +s2 = "s2" +sp = ["s$ix" for ix in 0:2] # Scrap registers +s(ix::Int) = s[ix + 1] + +# Writes assembly that should be preprocessed by M4. +function addrsh(n::Int) + str = "\tALIGN(16)\nPROLOGUE(flint_mpn_addrsh_$n)\n" + function mov(s0::String, s1::String) + str *= "\tmov\t$s0, $s1\n" + end + function xor(s0::String, s1::String) + str *= "\txor\t$s0, $s1\n" + end + function add(s0::String, s1::String) + str *= "\tadd\t$s0, $s1\n" + end + function adc(s0::String, s1::String) + str *= "\tadc\t$s0, $s1\n" + end + function sub(s0::String, s1::String) + str *= "\tsub\t$s0, $s1\n" + end + function sbb(s0::String, s1::String) + str *= "\tsbb\t$s0, $s1\n" + end + function shrx(s0::String, s1::String, s2::String) + str *= "\tshrx\t$s0, $s1, $s2\n" + end + function shlx(s0::String, s1::String, s2::String) + str *= "\tshlx\t$s0, $s1, $s2\n" + end + function lea(t::Tuple{String, String}, s1::String) + str *= "\tlea\t($(t[1]), $(t[2])), $s1\n" + end + function setc(s0::String) + str *= "\tsetc\t$s0\n" + end + + # Initialize variables + xor( R32(tnc), R32(tnc)) + sub( cnt, tnc) # This is modulo 64, so -n = 64 - n. + xor( R32(sx), R32(sx)) + + # f_a assumes s1 contains ix*8(bp) + function f_a(ix::Int) + if ix == 0 + shrx( cnt, bp(0), s0) + mov( bp(ix + 1), s1) + elseif ix == n - 1 + shrx( cnt, s1, s0) + else + shrx( cnt, s1, s0) + mov( bp(ix + 1), s1) + end + end # s0, s1 used + function f_b(ix::Int) + if ix != n - 1 + shlx( tnc, s1, s2) + lea( (s0, s2), s2) + end + end # s1, s2 used + function f_c(ix::Int) + if ix == 0 + add( ap(ix), s2) + mov( s2, rp(ix)) + elseif ix == n - 1 + adc( ap(ix), s2) + mov( s2, rp(ix)) + else + adc( ap(ix), s0) + mov( s0, rp(ix)) + end + end # nothing used + + # We interleave as follows: + f_a(0) + f_b(0) + for ix in 1:(n - 1) + f_a(ix + 0) + f_c(ix - 1) + f_b(ix + 0) + end + f_c(n - 1) + + setc( R8(sx)) + + str *= "\tret\nEPILOGUE()\n" + + return str +end + +function print_all_addrsh(nmax::Int = 16) + for n in 2:nmax + println(addrsh(n)) + end +end diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm index 4fefb0143b..ee3583ccab 100644 --- a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -23,6 +23,8 @@ define(`s0', `%r9') define(`s1', `%r10') define(`s2', `%r11') +dnl From n = 2 onwards, these are generated by `dev/gen_x86_aorsrsh.jl'. 
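+dnl
+dnl Note that shrx/shlx take their count from a register and do not touch
+dnl the flags, so they can be interleaved freely with the adc/sbb chain.
+dnl The complementary count tnc = 64 - cnt is computed as (0 - cnt) via
+dnl xor/sub, which is equivalent since shift counts are reduced mod 64.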
+ dnl r <- a +/- 2^n b dnl dnl For 0 <= i < n - 1, we have @@ -50,7 +52,83 @@ dnl u -= s C sbb dnl rp[i] = u C mov dnl fi -define(ALL_AORS,` +dnl Non-optimized. We probably should optimize add and sub differently. We +dnl probably need to use more registers to interleave more. +ifdef(blablablabla,` + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_5) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 +ifelse(OP,`add',` + add 0*8(ap), s2 + mov s2, 0*8(rp) +',` + mov 0*8(ap), s0 + sub s2, s0 + mov s0, 0*8(rp) +') + + shrx cnt, s1, s0 + mov 2*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 +ifelse(OP,`add',` + adc 1*8(ap), s2 + mov s2, 1*8(rp) +',` + mov 1*8(ap), s0 + sbb s2, s0 + mov s0, 1*8(rp) +') + + shrx cnt, s1, s0 + mov 3*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 +ifelse(OP,`add',` + adc 2*8(ap), s2 + mov s2, 2*8(rp) +',` + mov 2*8(ap), s0 + sbb s2, s0 + mov s0, 2*8(rp) +') + + shrx cnt, s1, s0 + mov 4*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 +ifelse(OP,`add',` + adc 3*8(ap), s2 + mov s2, 3*8(rp) +',` + mov 3*8(ap), s0 + sbb s2, s0 + mov s0, 3*8(rp) +') + + shrx cnt, s1, s0 +ifelse(OP,`add',` + adc 4*8(ap), s0 + mov s0, 4*8(rp) +',` + mov 4*8(ap), s2 + sbb s0, s2 + mov s2, 4*8(rp) +') + + setc R8(sx) + ret +EPILOGUE()',`') + + TEXT + ALIGN(16) PROLOGUE(flint_mpn_addrsh_1) shrx cnt, 0*8(bp), s0 @@ -65,25 +143,16 @@ EPILOGUE() PROLOGUE(flint_mpn_addrsh_2) xor R32(tnc), R32(tnc) sub cnt, tnc - xor R32(sx), R32(sx) - - mov 1*8(bp), s1 -C shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 shlx tnc, s1, s2 - shrx cnt, s1, s1 - C (0, 2), 1 - - adox s2, s0 -C - adcx 0*8(ap), s0 - mov s0, 0*8(rp) - adox sx, s1 C cannot overflow - adcx 1*8(ap), s1 -C - mov s1, 1*8(rp) - + lea (s0, s2), s2 + shrx cnt, s1, s0 + add 0*8(ap), s2 + mov s2, 0*8(rp) + adc 1*8(ap), s2 + mov s2, 1*8(rp) setc R8(sx) ret EPILOGUE() @@ -93,128 +162,879 @@ PROLOGUE(flint_mpn_addrsh_3) xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) - shrx cnt, 0*8(bp), s0 mov 1*8(bp), s1 shlx tnc, s1, s2 lea (s0, s2), s2 -ifelse(OP,`add',` + shrx cnt, s1, s0 + mov 2*8(bp), s1 add 0*8(ap), s2 mov s2, 0*8(rp) -',` - mov 0*8(ap), s0 - sub s2, s0 - mov s0, 0*8(rp) -') - C Used: s1 - - shrx cnt, s1, s1 - mov 2*8(bp), s2 - shlx tnc, s2, s0 - lea (s1, s0), s0 -ifelse(OP,`add',` + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 adc 1*8(ap), s0 mov s0, 1*8(rp) -',` - mov 1*8(ap), s1 - sbb s0, s1 - mov s1, 1*8(rp) -') - C Used: s2 - - shrx cnt, s2, s2 -ifelse(OP,`add',` adc 2*8(ap), s2 mov s2, 2*8(rp) -',` - mov 2*8(ap), s0 - sbb s2, s0 - mov s0, 1*8(rp) -') - setc R8(sx) ret EPILOGUE() -') ALIGN(16) PROLOGUE(flint_mpn_addrsh_4) xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + adc 3*8(ap), s2 + mov s2, 3*8(rp) + setc R8(sx) + ret +EPILOGUE() + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_5) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 mov 1*8(bp), s1 shlx tnc, s1, s2 lea (s0, s2), s2 -ifelse(OP,`add',` + shrx cnt, s1, s0 + mov 2*8(bp), s1 add 0*8(ap), s2 mov s2, 0*8(rp) -',` - mov 0*8(ap), s0 - sub s2, s0 - mov s0, 0*8(rp) -') - C Used: s1 + shlx tnc, s1, s2 + 
lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + adc 4*8(ap), s2 + mov s2, 4*8(rp) + setc R8(sx) + ret +EPILOGUE() - shrx cnt, s1, s1 - mov 2*8(bp), s2 - shlx tnc, s2, s0 - lea (s1, s0), s0 -ifelse(OP,`add',` + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_6) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 adc 1*8(ap), s0 mov s0, 1*8(rp) -',` - mov 1*8(ap), s1 - sbb s0, s1 - mov s1, 1*8(rp) -') - C Used: s2 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + adc 5*8(ap), s2 + mov s2, 5*8(rp) + setc R8(sx) + ret +EPILOGUE() -C - shrx cnt, s1, s1 - mov 3*8(bp), s2 - shlx tnc, s2, s0 - lea (s1, s0), s0 -ifelse(OP,`add',` + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_7) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 adc 2*8(ap), s0 mov s0, 2*8(rp) -',` - mov 2*8(ap), s1 - sbb s0, s1 - mov s1, 2*8(rp) -') - C Used: s2 -C + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + adc 6*8(ap), s2 + mov s2, 6*8(rp) + setc R8(sx) + ret +EPILOGUE() - shrx cnt, s2, s2 -ifelse(OP,`add',` - adc 2*8(ap), s2 - mov s2, 2*8(rp) -',` - mov 2*8(ap), s0 - sbb s2, s0 + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_8) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 mov s0, 1*8(rp) -') + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + adc 7*8(ap), s2 + mov s2, 7*8(rp) + setc R8(sx) + ret +EPILOGUE() + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_9) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, 
s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + adc 8*8(ap), s2 + mov s2, 8*8(rp) setc R8(sx) ret EPILOGUE() -') - TEXT -define(`flint_mpn_aorsrsh',`flint_mpn_addrsh_$1') -define(`OP',`add') -define(`OPC',`adc') -ALL_AORSRSH -undefine(`flint_mpn_aorsrsh') -undefine(`OP') -undefine(`OPC') - -define(`flint_mpn_aorsrsh',`flint_mpn_subrsh_$1') -define(`OP',`sub') -define(`OPC',`sbb') -ALL_AORSRSH -undefine(`flint_mpn_aorsrsh') -undefine(`OP') -undefine(`OPC') + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_10) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 8*8(ap), s0 + mov s0, 8*8(rp) + adc 9*8(ap), s2 + mov s2, 9*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_11) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, 
s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s0 + mov s0, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 9*8(ap), s0 + mov s0, 9*8(rp) + adc 10*8(ap), s2 + mov s2, 10*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_12) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s0 + mov s0, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s0 + mov s0, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 10*8(ap), s0 + mov s0, 10*8(rp) + adc 11*8(ap), s2 + mov s2, 11*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_13) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s0 + mov s0, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s0 + mov s0, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s0 + mov s0, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 11*8(ap), s0 + mov s0, 11*8(rp) + adc 12*8(ap), s2 + mov s2, 12*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_14) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov 
s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s0 + mov s0, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s0 + mov s0, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s0 + mov s0, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 13*8(bp), s1 + adc 11*8(ap), s0 + mov s0, 11*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 12*8(ap), s0 + mov s0, 12*8(rp) + adc 13*8(ap), s2 + mov s2, 13*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_15) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s0 + mov s0, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s0 + mov s0, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s0 + mov s0, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 13*8(bp), s1 + adc 11*8(ap), s0 + mov s0, 11*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 14*8(bp), s1 + adc 12*8(ap), s0 + mov s0, 12*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 13*8(ap), s0 + mov s0, 13*8(rp) + adc 14*8(ap), s2 + mov s2, 14*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_16) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx 
tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s0 + mov s0, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s0 + mov s0, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s0 + mov s0, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 13*8(bp), s1 + adc 11*8(ap), s0 + mov s0, 11*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 14*8(bp), s1 + adc 12*8(ap), s0 + mov s0, 12*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 15*8(bp), s1 + adc 13*8(ap), s0 + mov s0, 13*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 14*8(ap), s0 + mov s0, 14*8(rp) + adc 15*8(ap), s2 + mov s2, 15*8(rp) + setc R8(sx) + ret +EPILOGUE() From c8069306b6f523c8013923eee639463d0612245e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 15:38:01 +0000 Subject: [PATCH 04/13] Also subrsh --- dev/gen_x86_aorsrsh.jl | 62 +- .../x86_64/broadwell/aorsrsh_hard.asm | 1692 ++++++++++++++--- 2 files changed, 1432 insertions(+), 322 deletions(-) diff --git a/dev/gen_x86_aorsrsh.jl b/dev/gen_x86_aorsrsh.jl index 401471d706..daa696ae5b 100644 --- a/dev/gen_x86_aorsrsh.jl +++ b/dev/gen_x86_aorsrsh.jl @@ -28,12 +28,19 @@ R8(sx::String) = "R8($sx)" s0 = "s0" s1 = "s1" s2 = "s2" -sp = ["s$ix" for ix in 0:2] # Scrap registers +s3 = "s3" +sp = ["s$ix" for ix in 0:3] # Scrap registers s(ix::Int) = s[ix + 1] # Writes assembly that should be preprocessed by M4. -function addrsh(n::Int) - str = "\tALIGN(16)\nPROLOGUE(flint_mpn_addrsh_$n)\n" +function aorsrsh(n::Int; is_add::Bool = true) + str = "\tALIGN(16)\nPROLOGUE(flint_mpn_$(is_add ? "add" : "sub")rsh_$n)\n" + function push(s0::String) + str *= "\tpush\t$s0\n" + end + function pop(s0::String) + str *= "\tpop\t$s0\n" + end function mov(s0::String, s1::String) str *= "\tmov\t$s0, $s1\n" end @@ -66,6 +73,9 @@ function addrsh(n::Int) end # Initialize variables + if !is_add + push( s3) + end xor( R32(tnc), R32(tnc)) sub( cnt, tnc) # This is modulo 64, so -n = 64 - n. xor( R32(sx), R32(sx)) @@ -76,7 +86,7 @@ function addrsh(n::Int) shrx( cnt, bp(0), s0) mov( bp(ix + 1), s1) elseif ix == n - 1 - shrx( cnt, s1, s0) + shrx( cnt, s1, s1) else shrx( cnt, s1, s0) mov( bp(ix + 1), s1) @@ -89,15 +99,33 @@ function addrsh(n::Int) end end # s1, s2 used function f_c(ix::Int) - if ix == 0 - add( ap(ix), s2) - mov( s2, rp(ix)) - elseif ix == n - 1 - adc( ap(ix), s2) - mov( s2, rp(ix)) + if is_add + if ix == 0 + add( ap(ix), s2) + mov( s2, rp(ix)) + elseif ix == n - 1 + adc( ap(ix), s1) + mov( s1, rp(ix)) + else + adc( ap(ix), s2) + mov( s2, rp(ix)) + end else - adc( ap(ix), s0) - mov( s0, rp(ix)) + # Due to the lack of an `rsub' instruction, we need an extra + # register. 
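+            # For reference, the emitted x86 differs between the two paths:
+            # the add path can fold the memory operand directly, e.g.
+            #     adc  ix*8(ap), s2
+            # while the sub path must go through the extra register,
+            #     mov  ix*8(ap), s3;  sbb  s2, s3;  mov  s3, ix*8(rp)
+            # since sbb can only subtract its source from its destination.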
+ if ix == 0 + mov( ap(ix), s3) + sub( s2, s3) + mov( s3, rp(ix)) + elseif ix == n - 1 + mov( ap(ix), s0) + sub( s1, s0) + mov( s0, rp(ix)) + else + mov( ap(ix), s3) + sbb( s2, s3) + mov( s3, rp(ix)) + end end end # nothing used @@ -111,6 +139,9 @@ function addrsh(n::Int) end f_c(n - 1) + if !is_add + pop( s3) + end setc( R8(sx)) str *= "\tret\nEPILOGUE()\n" @@ -118,8 +149,11 @@ function addrsh(n::Int) return str end -function print_all_addrsh(nmax::Int = 16) +function print_all_aorsrsh(nmax::Int = 16) + for n in 2:nmax + println(aorsrsh(n, is_add = true)) + end for n in 2:nmax - println(addrsh(n)) + println(aorsrsh(n, is_add = false)) end end diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm index ee3583ccab..2b075466af 100644 --- a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -125,22 +125,935 @@ ifelse(OP,`add',` setc R8(sx) ret -EPILOGUE()',`') +EPILOGUE() +',`') + + TEXT + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_1) + shrx cnt, 0*8(bp), s0 + xor R32(sx), R32(sx) + add 0*8(ap), s0 + mov s0, 0*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_2) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + adc 1*8(ap), s1 + mov s1, 1*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_3) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + adc 2*8(ap), s1 + mov s1, 2*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_4) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + adc 3*8(ap), s1 + mov s1, 3*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_5) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + adc 4*8(ap), s1 + mov s1, 4*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_6) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 
2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + adc 5*8(ap), s1 + mov s1, 5*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_7) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + adc 6*8(ap), s1 + mov s1, 6*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_8) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + adc 7*8(ap), s1 + mov s1, 7*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_9) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + adc 8*8(ap), s1 + mov s1, 8*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_10) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 
1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 8*8(ap), s2 + mov s2, 8*8(rp) + adc 9*8(ap), s1 + mov s1, 9*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_11) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s2 + mov s2, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 9*8(ap), s2 + mov s2, 9*8(rp) + adc 10*8(ap), s1 + mov s1, 10*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_12) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s2 + mov s2, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s2 + mov s2, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 10*8(ap), s2 + mov s2, 10*8(rp) + adc 11*8(ap), s1 + mov 
s1, 11*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_13) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s2 + mov s2, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s2 + mov s2, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s2 + mov s2, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 11*8(ap), s2 + mov s2, 11*8(rp) + adc 12*8(ap), s1 + mov s1, 12*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_14) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s2 + mov s2, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s2 + mov s2, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s2 + mov s2, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 13*8(bp), s1 + adc 11*8(ap), s2 + mov s2, 11*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 12*8(ap), s2 + mov s2, 12*8(rp) + adc 13*8(ap), s1 + mov s1, 13*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_15) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, 
s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s2 + mov s2, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s2 + mov s2, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s2 + mov s2, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 13*8(bp), s1 + adc 11*8(ap), s2 + mov s2, 11*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 14*8(bp), s1 + adc 12*8(ap), s2 + mov s2, 12*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 13*8(ap), s2 + mov s2, 13*8(rp) + adc 14*8(ap), s1 + mov s1, 14*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_16) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s2 + mov s2, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s2 + mov s2, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s2 + mov s2, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 13*8(bp), s1 + adc 11*8(ap), s2 + mov s2, 11*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 14*8(bp), s1 + adc 12*8(ap), s2 + mov s2, 12*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 15*8(bp), s1 + adc 13*8(ap), s2 + mov s2, 13*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 14*8(ap), s2 + mov s2, 14*8(rp) + adc 15*8(ap), s1 + mov s1, 15*8(rp) + setc R8(sx) + ret +EPILOGUE() - TEXT - ALIGN(16) -PROLOGUE(flint_mpn_addrsh_1) +PROLOGUE(flint_mpn_subrsh_1) shrx cnt, 0*8(bp), s0 xor R32(sx), R32(sx) - add 0*8(ap), s0 - mov s0, 0*8(rp) + mov 0*8(ap), s1 + sub s0, s1 + mov s1, 0*8(rp) setc 
R8(sx) ret EPILOGUE() +dnl Modified to avoid pushing and popping s3 ALIGN(16) -PROLOGUE(flint_mpn_addrsh_2) +PROLOGUE(flint_mpn_subrsh_2) xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -148,17 +1061,20 @@ PROLOGUE(flint_mpn_addrsh_2) mov 1*8(bp), s1 shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - add 0*8(ap), s2 - mov s2, 0*8(rp) - adc 1*8(ap), s2 - mov s2, 1*8(rp) + shrx cnt, s1, s1 + mov 0*8(ap), tnc + sub s2, tnc + mov tnc, 0*8(rp) + mov 1*8(ap), s0 + sub s1, s0 + mov s0, 1*8(rp) setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_3) +PROLOGUE(flint_mpn_subrsh_3) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -168,21 +1084,26 @@ PROLOGUE(flint_mpn_addrsh_3) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 1*8(ap), s0 - mov s0, 1*8(rp) - adc 2*8(ap), s2 - mov s2, 2*8(rp) + shrx cnt, s1, s1 + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) + mov 2*8(ap), s0 + sub s1, s0 + mov s0, 2*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_4) +PROLOGUE(flint_mpn_subrsh_4) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -192,27 +1113,33 @@ PROLOGUE(flint_mpn_addrsh_4) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 2*8(ap), s0 - mov s0, 2*8(rp) - adc 3*8(ap), s2 - mov s2, 3*8(rp) + shrx cnt, s1, s1 + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) + mov 3*8(ap), s0 + sub s1, s0 + mov s0, 3*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_5) +PROLOGUE(flint_mpn_subrsh_5) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -222,33 +1149,40 @@ PROLOGUE(flint_mpn_addrsh_5) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 3*8(ap), s0 - mov s0, 3*8(rp) - adc 4*8(ap), s2 - mov s2, 4*8(rp) + shrx cnt, s1, s1 + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) + mov 4*8(ap), s0 + sub s1, s0 + mov s0, 4*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_6) +PROLOGUE(flint_mpn_subrsh_6) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -258,39 +1192,47 @@ PROLOGUE(flint_mpn_addrsh_6) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb 
s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 4*8(ap), s0 - mov s0, 4*8(rp) - adc 5*8(ap), s2 - mov s2, 5*8(rp) + shrx cnt, s1, s1 + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) + mov 5*8(ap), s0 + sub s1, s0 + mov s0, 5*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_7) +PROLOGUE(flint_mpn_subrsh_7) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -300,45 +1242,54 @@ PROLOGUE(flint_mpn_addrsh_7) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 5*8(ap), s0 - mov s0, 5*8(rp) - adc 6*8(ap), s2 - mov s2, 6*8(rp) + shrx cnt, s1, s1 + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) + mov 6*8(ap), s0 + sub s1, s0 + mov s0, 6*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_8) +PROLOGUE(flint_mpn_subrsh_8) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -348,51 +1299,61 @@ PROLOGUE(flint_mpn_addrsh_8) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 6*8(ap), s0 - mov s0, 6*8(rp) - adc 7*8(ap), s2 - mov s2, 7*8(rp) + shrx cnt, s1, s1 + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) + mov 7*8(ap), s0 + sub s1, s0 + mov s0, 7*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_9) +PROLOGUE(flint_mpn_subrsh_9) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -402,57 +1363,68 @@ PROLOGUE(flint_mpn_addrsh_9) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 
mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 7*8(ap), s0 - mov s0, 7*8(rp) - adc 8*8(ap), s2 - mov s2, 8*8(rp) + shrx cnt, s1, s1 + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) + mov 8*8(ap), s0 + sub s1, s0 + mov s0, 8*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_10) +PROLOGUE(flint_mpn_subrsh_10) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -462,63 +1434,75 @@ PROLOGUE(flint_mpn_addrsh_10) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 9*8(bp), s1 - adc 7*8(ap), s0 - mov s0, 7*8(rp) + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 8*8(ap), s0 - mov s0, 8*8(rp) - adc 9*8(ap), s2 - mov s2, 9*8(rp) + shrx cnt, s1, s1 + mov 8*8(ap), s3 + sbb s2, s3 + mov s3, 8*8(rp) + mov 9*8(ap), s0 + sub s1, s0 + mov s0, 9*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_11) +PROLOGUE(flint_mpn_subrsh_11) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -528,69 +1512,82 @@ PROLOGUE(flint_mpn_addrsh_11) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 
5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 9*8(bp), s1 - adc 7*8(ap), s0 - mov s0, 7*8(rp) + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 10*8(bp), s1 - adc 8*8(ap), s0 - mov s0, 8*8(rp) + mov 8*8(ap), s3 + sbb s2, s3 + mov s3, 8*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 9*8(ap), s0 - mov s0, 9*8(rp) - adc 10*8(ap), s2 - mov s2, 10*8(rp) + shrx cnt, s1, s1 + mov 9*8(ap), s3 + sbb s2, s3 + mov s3, 9*8(rp) + mov 10*8(ap), s0 + sub s1, s0 + mov s0, 10*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_12) +PROLOGUE(flint_mpn_subrsh_12) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -600,75 +1597,89 @@ PROLOGUE(flint_mpn_addrsh_12) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 9*8(bp), s1 - adc 7*8(ap), s0 - mov s0, 7*8(rp) + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 10*8(bp), s1 - adc 8*8(ap), s0 - mov s0, 8*8(rp) + mov 8*8(ap), s3 + sbb s2, s3 + mov s3, 8*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 11*8(bp), s1 - adc 9*8(ap), s0 - mov s0, 9*8(rp) + mov 9*8(ap), s3 + sbb s2, s3 + mov s3, 9*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 10*8(ap), s0 - mov s0, 10*8(rp) - adc 11*8(ap), s2 - mov s2, 11*8(rp) + shrx cnt, s1, s1 + mov 10*8(ap), s3 + sbb s2, s3 + mov s3, 10*8(rp) + mov 11*8(ap), s0 + sub s1, s0 + mov s0, 11*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_13) +PROLOGUE(flint_mpn_subrsh_13) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -678,81 +1689,96 @@ PROLOGUE(flint_mpn_addrsh_13) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx 
tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 9*8(bp), s1 - adc 7*8(ap), s0 - mov s0, 7*8(rp) + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 10*8(bp), s1 - adc 8*8(ap), s0 - mov s0, 8*8(rp) + mov 8*8(ap), s3 + sbb s2, s3 + mov s3, 8*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 11*8(bp), s1 - adc 9*8(ap), s0 - mov s0, 9*8(rp) + mov 9*8(ap), s3 + sbb s2, s3 + mov s3, 9*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 12*8(bp), s1 - adc 10*8(ap), s0 - mov s0, 10*8(rp) + mov 10*8(ap), s3 + sbb s2, s3 + mov s3, 10*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 11*8(ap), s0 - mov s0, 11*8(rp) - adc 12*8(ap), s2 - mov s2, 12*8(rp) + shrx cnt, s1, s1 + mov 11*8(ap), s3 + sbb s2, s3 + mov s3, 11*8(rp) + mov 12*8(ap), s0 + sub s1, s0 + mov s0, 12*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_14) +PROLOGUE(flint_mpn_subrsh_14) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -762,87 +1788,103 @@ PROLOGUE(flint_mpn_addrsh_14) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 9*8(bp), s1 - adc 7*8(ap), s0 - mov s0, 7*8(rp) + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 10*8(bp), s1 - adc 8*8(ap), s0 - mov s0, 8*8(rp) + mov 8*8(ap), s3 + sbb s2, s3 + mov s3, 8*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 11*8(bp), s1 - adc 9*8(ap), s0 - mov s0, 9*8(rp) + mov 9*8(ap), s3 + sbb s2, s3 + mov s3, 9*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 12*8(bp), s1 - adc 10*8(ap), s0 - mov s0, 10*8(rp) + mov 10*8(ap), s3 + sbb s2, s3 + mov s3, 10*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 13*8(bp), s1 - adc 11*8(ap), s0 - mov s0, 11*8(rp) + mov 11*8(ap), s3 + sbb s2, s3 + mov s3, 11*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 12*8(ap), s0 - mov s0, 12*8(rp) - adc 13*8(ap), s2 - mov s2, 13*8(rp) + shrx cnt, s1, s1 + mov 12*8(ap), s3 + sbb s2, s3 + mov s3, 
12*8(rp) + mov 13*8(ap), s0 + sub s1, s0 + mov s0, 13*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_15) +PROLOGUE(flint_mpn_subrsh_15) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -852,93 +1894,110 @@ PROLOGUE(flint_mpn_addrsh_15) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 9*8(bp), s1 - adc 7*8(ap), s0 - mov s0, 7*8(rp) + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 10*8(bp), s1 - adc 8*8(ap), s0 - mov s0, 8*8(rp) + mov 8*8(ap), s3 + sbb s2, s3 + mov s3, 8*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 11*8(bp), s1 - adc 9*8(ap), s0 - mov s0, 9*8(rp) + mov 9*8(ap), s3 + sbb s2, s3 + mov s3, 9*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 12*8(bp), s1 - adc 10*8(ap), s0 - mov s0, 10*8(rp) + mov 10*8(ap), s3 + sbb s2, s3 + mov s3, 10*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 13*8(bp), s1 - adc 11*8(ap), s0 - mov s0, 11*8(rp) + mov 11*8(ap), s3 + sbb s2, s3 + mov s3, 11*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 14*8(bp), s1 - adc 12*8(ap), s0 - mov s0, 12*8(rp) + mov 12*8(ap), s3 + sbb s2, s3 + mov s3, 12*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 13*8(ap), s0 - mov s0, 13*8(rp) - adc 14*8(ap), s2 - mov s2, 14*8(rp) + shrx cnt, s1, s1 + mov 13*8(ap), s3 + sbb s2, s3 + mov s3, 13*8(rp) + mov 14*8(ap), s0 + sub s1, s0 + mov s0, 14*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_16) +PROLOGUE(flint_mpn_subrsh_16) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -948,93 +2007,110 @@ PROLOGUE(flint_mpn_addrsh_16) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 
- adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 9*8(bp), s1 - adc 7*8(ap), s0 - mov s0, 7*8(rp) + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 10*8(bp), s1 - adc 8*8(ap), s0 - mov s0, 8*8(rp) + mov 8*8(ap), s3 + sbb s2, s3 + mov s3, 8*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 11*8(bp), s1 - adc 9*8(ap), s0 - mov s0, 9*8(rp) + mov 9*8(ap), s3 + sbb s2, s3 + mov s3, 9*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 12*8(bp), s1 - adc 10*8(ap), s0 - mov s0, 10*8(rp) + mov 10*8(ap), s3 + sbb s2, s3 + mov s3, 10*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 13*8(bp), s1 - adc 11*8(ap), s0 - mov s0, 11*8(rp) + mov 11*8(ap), s3 + sbb s2, s3 + mov s3, 11*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 14*8(bp), s1 - adc 12*8(ap), s0 - mov s0, 12*8(rp) + mov 12*8(ap), s3 + sbb s2, s3 + mov s3, 12*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 15*8(bp), s1 - adc 13*8(ap), s0 - mov s0, 13*8(rp) - shlx tnc, s1, s2 - lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 14*8(ap), s0 - mov s0, 14*8(rp) - adc 15*8(ap), s2 - mov s2, 15*8(rp) + mov 13*8(ap), s3 + sbb s2, s3 + mov s3, 13*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + mov 14*8(ap), s3 + sbb s2, s3 + mov s3, 14*8(rp) + mov 15*8(ap), s0 + sub s1, s0 + mov s0, 15*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() From 62a6c2012dcbaa5ff94f53868c66bf8fde76ee41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 15:38:38 +0000 Subject: [PATCH 05/13] Fixup --- src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm index 2b075466af..442d440e53 100644 --- a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -22,6 +22,7 @@ define(`sx', `%rax') define(`s0', `%r9') define(`s1', `%r10') define(`s2', `%r11') +define(`s3', `%rbx') dnl From n = 2 onwards, these are generated by `dev/gen_x86_aorsrsh.jl'. From e63d4f7e7445335a00b03fe086d94d0853ce81a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 15:40:45 +0000 Subject: [PATCH 06/13] fixup --- src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm index 442d440e53..b84a4df265 100644 --- a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -53,11 +53,10 @@ dnl u -= s C sbb dnl rp[i] = u C mov dnl fi -dnl Non-optimized. We probably should optimize add and sub differently. We -dnl probably need to use more registers to interleave more. +dnl Non-optimized version. 
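+dnl For reference, limb i of (bp >> cnt) is assembled as
+dnl   (bp[i] >> cnt) + (bp[i+1] << (64 - cnt)),
+dnl i.e. shrx by cnt, shlx by tnc = -cnt (mod 64), and an lea to combine the
+dnl two disjoint halves before the add/sub against ap[i].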
ifdef(blablablabla,` ALIGN(16) -PROLOGUE(flint_mpn_addrsh_5) +PROLOGUE(flint_mpn_aorsrsh_5) xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) From 2daf43a14f1e5dc7db93eb0823627ed6e122424e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 16:04:02 +0000 Subject: [PATCH 07/13] Add corresponding C sources for aorsrsh --- src/mpn_extras.h | 51 ++++++++++++++++++++++ src/mpn_extras/aorsrsh_n.c | 88 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 src/mpn_extras/aorsrsh_n.c diff --git a/src/mpn_extras.h b/src/mpn_extras.h index 9d4ba63c0c..9b2877e20c 100644 --- a/src/mpn_extras.h +++ b/src/mpn_extras.h @@ -460,6 +460,56 @@ mp_limb_t mpn_rsh1add_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); mp_limb_t mpn_rsh1sub_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); #endif +#if FLINT_HAVE_ASSEMBLY_x86_64_adx +# define FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH 17 + +# define FLINT_HAVE_AORS_FUNC(n) ((n) < FLINT_MPN_AORS_FUNC_TAB_WIDTH) + +# define FLINT_MPN_ADDRSH_HARD(rp, xp, yp, n, cnt) (flint_mpn_addrsh_func_tab[n](rp, xp, yp, cnt)) +# define FLINT_MPN_SUBRSH_HARD(rp, xp, yp, n, cnt) (flint_mpn_subrsh_func_tab[n](rp, xp, yp, cnt)) +#endif + +typedef mp_limb_t (* flint_mpn_aorssh_func_t)(mp_ptr, mp_srcptr, mp_srcptr, unsigned int); + +#ifdef FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH +# define FLINT_USE_AORSRSH_FUNC_TAB 1 +FLINT_DLL extern const flint_mpn_aorssh_func_t flint_mpn_addrsh_func_tab[]; +FLINT_DLL extern const flint_mpn_aorssh_func_t flint_mpn_subrsh_func_tab[]; +#else +# define FLINT_HAVE_AORSRSH_FUNC(n) 0 +# define FLINT_MPN_ADDRSH_HARD(rp, xp, yp, n, cnt) 0 +# define FLINT_MPN_SUBRSH_HARD(rp, xp, yp, n, cnt) 0 +#endif + +MPN_EXTRAS_INLINE +mp_limb_t flint_mpn_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) +{ + FLINT_ASSERT(n >= 1); + + if (FLINT_HAVE_AORSRSH_FUNC(n)) + return FLINT_MPN_ADDRSH_HARD(rp, xp, yp, n, cnt); + else + { + mpn_rshift(rp, yp, n, cnt); + return mpn_add_n(rp, rp, xp, n); + } +} + +MPN_EXTRAS_INLINE +mp_limb_t flint_mpn_subrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) +{ + FLINT_ASSERT(n >= 1); + + if (FLINT_HAVE_AORSRSH_FUNC(n)) + return FLINT_MPN_SUBRSH_HARD(rp, xp, yp, n, cnt); + else + { + /* r = x - 2^c y */ + mpn_rshift(rp, yp, n, cnt); + return mpn_sub_n(rp, xp, rp, n); + } +} + /* multiplication (general) **************************************************/ /* NOTE: This is getting a bit messy. How can we clean this up? */ @@ -541,6 +591,7 @@ mp_limb_t _flint_mpn_mul(mp_ptr r, mp_srcptr x, mp_size_t xn, mp_srcptr y, mp_si void _flint_mpn_mul_n(mp_ptr r, mp_srcptr x, mp_srcptr y, mp_size_t n); mp_limb_t _flint_mpn_sqr(mp_ptr r, mp_srcptr x, mp_size_t n); +/* FIXME: This should be under addition */ MPN_EXTRAS_INLINE mp_limb_t flint_mpn_add_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) { diff --git a/src/mpn_extras/aorsrsh_n.c b/src/mpn_extras/aorsrsh_n.c new file mode 100644 index 0000000000..77bf7690ba --- /dev/null +++ b/src/mpn_extras/aorsrsh_n.c @@ -0,0 +1,88 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . 
+*/ + +#include "mpn_extras.h" + +#define DECL_AORSRSH(n) _DECL_AORSRSH(n) +#define _DECL_AORSRSH(n) \ +mp_limb_t flint_mpn_addrsh_##n(mp_ptr, mp_srcptr, mp_srcptr, unsigned int); \ +mp_limb_t flint_mpn_subrsh_##n(mp_ptr, mp_srcptr, mp_srcptr, unsigned int) + +#define ADDRSH(n) _ADDRSH(n) +#define _ADDRSH(n) flint_mpn_addrsh_##n +#define SUBRSH(n) _SUBRSH(n) +#define _SUBRSH(n) flint_mpn_subrsh_##n + +/* Herein we assume that x86 and ARM are equivalent. */ +#if FLINT_HAVE_ASSEMBLY_x86_64_adx || FLINT_HAVE_ASSEMBLY_armv8 +DECL_AORSRSH(1); +DECL_AORSRSH(2); +DECL_AORSRSH(3); +DECL_AORSRSH(4); +DECL_AORSRSH(5); +DECL_AORSRSH(6); +DECL_AORSRSH(7); +DECL_AORSRSH(8); +DECL_AORSRSH(9); +DECL_AORSRSH(10); +DECL_AORSRSH(11); +DECL_AORSRSH(12); +DECL_AORSRSH(13); +DECL_AORSRSH(14); +DECL_AORSRSH(15); +DECL_AORSRSH(16); + +/* TODO: Should probably rename these types so to not have two different types. + * Probably something like `mpn_binary_h_func`, where `h` is for hardcoded. */ +const flint_mpn_aorssh_func_t flint_mpn_addrsh_func_tab[] = +{ + NULL, + ADDRSH(1), + ADDRSH(2), + ADDRSH(3), + ADDRSH(4), + ADDRSH(5), + ADDRSH(6), + ADDRSH(7), + ADDRSH(8), + ADDRSH(9), + ADDRSH(10), + ADDRSH(11), + ADDRSH(12), + ADDRSH(13), + ADDRSH(14), + ADDRSH(15), + ADDRSH(16) +}; + +const flint_mpn_aorssh_func_t flint_mpn_subsh_func_tab[] = +{ + NULL, + SUBRSH(1), + SUBRSH(2), + SUBRSH(3), + SUBRSH(4), + SUBRSH(5), + SUBRSH(6), + SUBRSH(7), + SUBRSH(8), + SUBRSH(9), + SUBRSH(10), + SUBRSH(11), + SUBRSH(12), + SUBRSH(13), + SUBRSH(14), + SUBRSH(15), + SUBRSH(16) +}; +#else +typedef int this_file_is_empty; +#endif From 96786cd490c1661440376f074b7b5b7f2e5d7c7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 16:55:17 +0000 Subject: [PATCH 08/13] Add tests for aorsrsh --- src/mpn_extras.h | 5 +- src/mpn_extras/test/main.c | 2 + src/mpn_extras/test/t-aors_n.c | 35 ++++++++- src/mpn_extras/test/t-aorsrsh_n.c | 125 ++++++++++++++++++++++++++++++ 4 files changed, 162 insertions(+), 5 deletions(-) create mode 100644 src/mpn_extras/test/t-aorsrsh_n.c diff --git a/src/mpn_extras.h b/src/mpn_extras.h index 9b2877e20c..8f4d5ce307 100644 --- a/src/mpn_extras.h +++ b/src/mpn_extras.h @@ -463,7 +463,7 @@ mp_limb_t mpn_rsh1sub_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); #if FLINT_HAVE_ASSEMBLY_x86_64_adx # define FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH 17 -# define FLINT_HAVE_AORS_FUNC(n) ((n) < FLINT_MPN_AORS_FUNC_TAB_WIDTH) +# define FLINT_HAVE_AORSRSH_FUNC(n) ((n) < FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH) # define FLINT_MPN_ADDRSH_HARD(rp, xp, yp, n, cnt) (flint_mpn_addrsh_func_tab[n](rp, xp, yp, cnt)) # define FLINT_MPN_SUBRSH_HARD(rp, xp, yp, n, cnt) (flint_mpn_subrsh_func_tab[n](rp, xp, yp, cnt)) @@ -485,6 +485,7 @@ MPN_EXTRAS_INLINE mp_limb_t flint_mpn_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) { FLINT_ASSERT(n >= 1); + FLINT_ASSERT(1 <= cnt && cnt < FLINT_BITS); if (FLINT_HAVE_AORSRSH_FUNC(n)) return FLINT_MPN_ADDRSH_HARD(rp, xp, yp, n, cnt); @@ -499,12 +500,12 @@ MPN_EXTRAS_INLINE mp_limb_t flint_mpn_subrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) { FLINT_ASSERT(n >= 1); + FLINT_ASSERT(1 <= cnt && cnt < FLINT_BITS); if (FLINT_HAVE_AORSRSH_FUNC(n)) return FLINT_MPN_SUBRSH_HARD(rp, xp, yp, n, cnt); else { - /* r = x - 2^c y */ mpn_rshift(rp, yp, n, cnt); return mpn_sub_n(rp, xp, rp, n); } diff --git a/src/mpn_extras/test/main.c b/src/mpn_extras/test/main.c index 171a9b7342..7ed9a71dcc 100644 --- a/src/mpn_extras/test/main.c +++ 
b/src/mpn_extras/test/main.c @@ -13,6 +13,7 @@ #include "t-2add_n_inplace.c" #include "t-aors_n.c" +#include "t-aorsrsh_n.c" #include "t-divides.c" #include "t-divrem_preinv1.c" #include "t-divrem_preinvn.c" @@ -40,6 +41,7 @@ test_struct tests[] = { TEST_FUNCTION(flint_mpn_2add_n_inplace), TEST_FUNCTION(flint_mpn_aors_n), + TEST_FUNCTION(flint_mpn_aorsrsh_n), TEST_FUNCTION(flint_mpn_divides), TEST_FUNCTION(flint_mpn_divrem_preinv1), TEST_FUNCTION(flint_mpn_divrem_preinvn), diff --git a/src/mpn_extras/test/t-aors_n.c b/src/mpn_extras/test/t-aors_n.c index 0af210d94c..b40659ee97 100644 --- a/src/mpn_extras/test/t-aors_n.c +++ b/src/mpn_extras/test/t-aors_n.c @@ -26,6 +26,7 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) { int result; int type; + int aliasing; mp_limb_t cf, cg; mp_size_t n; mp_ptr fp, gp, xp, yp; @@ -34,6 +35,11 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) if (n_randint(state, 1 << 10) == UWORD(0)) n += N_STOR; + /* 0: No aliasing + * 1: fp = xp + * 2: fp = yp */ + aliasing = n_randint(state, 3); + fp = flint_malloc(sizeof(mp_limb_t) * n); gp = flint_malloc(sizeof(mp_limb_t) * n); xp = flint_malloc(sizeof(mp_limb_t) * n); @@ -46,12 +52,34 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) if (type == 0) { - cf = flint_mpn_add_n(fp, xp, yp, n); + if (aliasing == 0) + cf = flint_mpn_add_n(fp, xp, yp, n); + else if (aliasing == 1) + { + flint_mpn_copyi(fp, xp, n); + cf = flint_mpn_add_n(fp, fp, yp, n); + } + else + { + flint_mpn_copyi(fp, yp, n); + cf = flint_mpn_add_n(fp, xp, fp, n); + } cg = mpn_add_n(gp, xp, yp, n); } else { - cf = flint_mpn_sub_n(fp, xp, yp, n); + if (aliasing == 0) + cf = flint_mpn_sub_n(fp, xp, yp, n); + else if (aliasing == 1) + { + flint_mpn_copyi(fp, xp, n); + cf = flint_mpn_sub_n(fp, fp, yp, n); + } + else + { + flint_mpn_copyi(fp, yp, n); + cf = flint_mpn_sub_n(fp, xp, fp, n); + } cg = mpn_sub_n(gp, xp, yp, n); } @@ -59,6 +87,7 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) if (!result) TEST_FUNCTION_FAIL( "%s:\n" + "aliasing: %d\n" "ix = %wd\n" "n = %wd\n" "xp = %{ulong*}\n" @@ -66,7 +95,7 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) "FLINT (cy = %wu): %{ulong*}\n" "GMP (cy = %wu): %{ulong*}\n", type == 0 ? "flint_mpn_add_n" : "flint_mpn_sub_n", - ix, n, xp, n, yp, n, cf, fp, n, cg, gp, n + 1); + aliasing, ix, n, xp, n, yp, n, cf, fp, n, cg, gp, n + 1); flint_free(fp); flint_free(gp); diff --git a/src/mpn_extras/test/t-aorsrsh_n.c b/src/mpn_extras/test/t-aorsrsh_n.c new file mode 100644 index 0000000000..040fa59912 --- /dev/null +++ b/src/mpn_extras/test/t-aorsrsh_n.c @@ -0,0 +1,125 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + Copyright (C) 2024 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . 
+*/ + +#include "test_helpers.h" +#include "mpn_extras.h" + +#define N_MIN 1 +#define N_MAX (FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH - 1) +#define N_STOR (FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH + 10) + +static mp_limb_t mpn_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) +{ + mpn_rshift(rp, yp, n, cnt); + return mpn_add_n(rp, rp, xp, n); +} + +static mp_limb_t mpn_subrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) +{ + mpn_rshift(rp, yp, n, cnt); + return mpn_sub_n(rp, xp, rp, n); +} + +TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) +{ +#if FLINT_USE_AORSRSH_FUNC_TAB + slong ix; + + for (ix = 0; ix < 10000 * flint_test_multiplier(); ix++) + { + int result; + int type; + int aliasing; + unsigned int cnt; + mp_limb_t cf, cg; + mp_size_t n; + mp_ptr fp, gp, xp, yp; + + n = N_MIN + n_randint(state, N_MAX - N_MIN + 1); + if (n_randint(state, 1 << 10) == UWORD(0)) + n += N_STOR; + + /* 0: No aliasing + * 1: fp = xp + * 2: fp = yp */ + aliasing = n_randint(state, 3); + + fp = flint_malloc(sizeof(mp_limb_t) * n); + gp = flint_malloc(sizeof(mp_limb_t) * n); + xp = flint_malloc(sizeof(mp_limb_t) * n); + yp = flint_malloc(sizeof(mp_limb_t) * n); + + flint_mpn_rrandom(xp, state, n); + flint_mpn_rrandom(yp, state, n); + cnt = 1 + n_randint(state, FLINT_BITS - 1); + + type = n_randint(state, 2); + + if (type == 0) + { + if (aliasing == 0) + cf = flint_mpn_addrsh_n(fp, xp, yp, n, cnt); + else if (aliasing == 1) + { + flint_mpn_copyi(fp, xp, n); + cf = flint_mpn_addrsh_n(fp, fp, yp, n, cnt); + } + else + { + flint_mpn_copyi(fp, yp, n); + cf = flint_mpn_addrsh_n(fp, xp, fp, n, cnt); + } + cg = mpn_addrsh_n(gp, xp, yp, n, cnt); + } + else + { + if (aliasing == 0) + cf = flint_mpn_subrsh_n(fp, xp, yp, n, cnt); + else if (aliasing == 1) + { + flint_mpn_copyi(fp, xp, n); + cf = flint_mpn_subrsh_n(fp, fp, yp, n, cnt); + } + else + { + flint_mpn_copyi(fp, yp, n); + cf = flint_mpn_subrsh_n(fp, xp, fp, n, cnt); + } + cg = mpn_subrsh_n(gp, xp, yp, n, cnt); + } + + result = (cf == cg && mpn_cmp(fp, gp, n) == 0); + if (!result) + TEST_FUNCTION_FAIL( + "%s:\n" + "aliasing: %d\n" + "ix = %wd\n" + "n = %u\n" + "cnt = %wd\n" + "xp = %{ulong*}\n" + "yp = %{ulong*}\n" + "FLINT (cy = %wu): %{ulong*}\n" + "GMP (cy = %wu): %{ulong*}\n", + type == 0 ? "flint_mpn_add_n" : "flint_mpn_sub_n", + aliasing, ix, n, cnt, xp, n, yp, n, cf, fp, n, cg, gp, n + 1); + + flint_free(fp); + flint_free(gp); + flint_free(xp); + flint_free(yp); + } + + TEST_FUNCTION_END(state); +#else + TEST_FUNCTION_END_SKIPPED(state); +#endif +} From 65fadb40145f538b94460c2362cb583e70404119 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 19:14:58 +0000 Subject: [PATCH 09/13] blabla --- dev/gen_x86_aorsrsh.jl | 2 +- src/mpn_extras/aorsrsh_n.c | 4 +- src/mpn_extras/test/t-aors_n.c | 5 +- src/mpn_extras/test/t-aorsrsh_n.c | 22 +- .../x86_64/broadwell/aorsrsh_hard.asm | 206 +++++++++--------- 5 files changed, 122 insertions(+), 117 deletions(-) diff --git a/dev/gen_x86_aorsrsh.jl b/dev/gen_x86_aorsrsh.jl index daa696ae5b..c11655d8fc 100644 --- a/dev/gen_x86_aorsrsh.jl +++ b/dev/gen_x86_aorsrsh.jl @@ -76,7 +76,7 @@ function aorsrsh(n::Int; is_add::Bool = true) if !is_add push( s3) end - xor( R32(tnc), R32(tnc)) + xor( tnc, tnc) # We do not use 32 bit mode here since tnc = %r8. sub( cnt, tnc) # This is modulo 64, so -n = 64 - n. 
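+    # (shlx/shrx only consume the low 6 bits of the count register, so the
+    # 64-bit negate above still yields the intended shift by 64 - cnt.)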
xor( R32(sx), R32(sx)) diff --git a/src/mpn_extras/aorsrsh_n.c b/src/mpn_extras/aorsrsh_n.c index 77bf7690ba..c561c695e3 100644 --- a/src/mpn_extras/aorsrsh_n.c +++ b/src/mpn_extras/aorsrsh_n.c @@ -40,8 +40,6 @@ DECL_AORSRSH(14); DECL_AORSRSH(15); DECL_AORSRSH(16); -/* TODO: Should probably rename these types so to not have two different types. - * Probably something like `mpn_binary_h_func`, where `h` is for hardcoded. */ const flint_mpn_aorssh_func_t flint_mpn_addrsh_func_tab[] = { NULL, @@ -63,7 +61,7 @@ const flint_mpn_aorssh_func_t flint_mpn_addrsh_func_tab[] = ADDRSH(16) }; -const flint_mpn_aorssh_func_t flint_mpn_subsh_func_tab[] = +const flint_mpn_aorssh_func_t flint_mpn_subrsh_func_tab[] = { NULL, SUBRSH(1), diff --git a/src/mpn_extras/test/t-aors_n.c b/src/mpn_extras/test/t-aors_n.c index b40659ee97..55c4c4ae68 100644 --- a/src/mpn_extras/test/t-aors_n.c +++ b/src/mpn_extras/test/t-aors_n.c @@ -87,7 +87,7 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) if (!result) TEST_FUNCTION_FAIL( "%s:\n" - "aliasing: %d\n" + "aliasing: %s\n" "ix = %wd\n" "n = %wd\n" "xp = %{ulong*}\n" @@ -95,7 +95,8 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) "FLINT (cy = %wu): %{ulong*}\n" "GMP (cy = %wu): %{ulong*}\n", type == 0 ? "flint_mpn_add_n" : "flint_mpn_sub_n", - aliasing, ix, n, xp, n, yp, n, cf, fp, n, cg, gp, n + 1); + aliasing == 0 ? "none" : (aliasing == 1) ? "rp = xp" : "rp = yp", + ix, n, xp, n, yp, n, cf, fp, n, cg, gp, n); flint_free(fp); flint_free(gp); diff --git a/src/mpn_extras/test/t-aorsrsh_n.c b/src/mpn_extras/test/t-aorsrsh_n.c index 040fa59912..25d9075af9 100644 --- a/src/mpn_extras/test/t-aorsrsh_n.c +++ b/src/mpn_extras/test/t-aorsrsh_n.c @@ -17,13 +17,15 @@ #define N_MAX (FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH - 1) #define N_STOR (FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH + 10) -static mp_limb_t mpn_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) +static +mp_limb_t mpn_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) { mpn_rshift(rp, yp, n, cnt); return mpn_add_n(rp, rp, xp, n); } -static mp_limb_t mpn_subrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) +static +mp_limb_t mpn_subrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) { mpn_rshift(rp, yp, n, cnt); return mpn_sub_n(rp, xp, rp, n); @@ -51,7 +53,7 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) /* 0: No aliasing * 1: fp = xp * 2: fp = yp */ - aliasing = n_randint(state, 3); + aliasing = 0; /* n_randint(state, 3); */ fp = flint_malloc(sizeof(mp_limb_t) * n); gp = flint_malloc(sizeof(mp_limb_t) * n); @@ -101,16 +103,17 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) if (!result) TEST_FUNCTION_FAIL( "%s:\n" - "aliasing: %d\n" + "aliasing: %s\n" "ix = %wd\n" - "n = %u\n" - "cnt = %wd\n" + "n = %wd\n" + "cnt = %u\n" "xp = %{ulong*}\n" "yp = %{ulong*}\n" "FLINT (cy = %wu): %{ulong*}\n" "GMP (cy = %wu): %{ulong*}\n", - type == 0 ? "flint_mpn_add_n" : "flint_mpn_sub_n", - aliasing, ix, n, cnt, xp, n, yp, n, cf, fp, n, cg, gp, n + 1); + type == 0 ? "flint_mpn_addrsh_n" : "flint_mpn_subrsh_n", + aliasing == 0 ? "none" : (aliasing == 1) ? 
"rp = xp" : "rp = yp", + ix, n, cnt, xp, n, yp, n, cf, fp, n, cg, gp, n); flint_free(fp); flint_free(gp); @@ -123,3 +126,6 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) TEST_FUNCTION_END_SKIPPED(state); #endif } +#undef N_MIN +#undef N_MAX +#undef N_STOR diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm index b84a4df265..ec24c45e30 100644 --- a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -54,79 +54,79 @@ dnl rp[i] = u C mov dnl fi dnl Non-optimized version. -ifdef(blablablabla,` - ALIGN(16) -PROLOGUE(flint_mpn_aorsrsh_5) - xor R32(tnc), R32(tnc) - sub cnt, tnc - xor R32(sx), R32(sx) - - shrx cnt, 0*8(bp), s0 - mov 1*8(bp), s1 - shlx tnc, s1, s2 - lea (s0, s2), s2 -ifelse(OP,`add',` - add 0*8(ap), s2 - mov s2, 0*8(rp) -',` - mov 0*8(ap), s0 - sub s2, s0 - mov s0, 0*8(rp) -') - - shrx cnt, s1, s0 - mov 2*8(bp), s1 - shlx tnc, s1, s2 - lea (s0, s2), s2 -ifelse(OP,`add',` - adc 1*8(ap), s2 - mov s2, 1*8(rp) -',` - mov 1*8(ap), s0 - sbb s2, s0 - mov s0, 1*8(rp) -') - - shrx cnt, s1, s0 - mov 3*8(bp), s1 - shlx tnc, s1, s2 - lea (s0, s2), s2 -ifelse(OP,`add',` - adc 2*8(ap), s2 - mov s2, 2*8(rp) -',` - mov 2*8(ap), s0 - sbb s2, s0 - mov s0, 2*8(rp) -') - - shrx cnt, s1, s0 - mov 4*8(bp), s1 - shlx tnc, s1, s2 - lea (s0, s2), s2 -ifelse(OP,`add',` - adc 3*8(ap), s2 - mov s2, 3*8(rp) -',` - mov 3*8(ap), s0 - sbb s2, s0 - mov s0, 3*8(rp) -') - - shrx cnt, s1, s0 -ifelse(OP,`add',` - adc 4*8(ap), s0 - mov s0, 4*8(rp) -',` - mov 4*8(ap), s2 - sbb s0, s2 - mov s2, 4*8(rp) -') - - setc R8(sx) - ret -EPILOGUE() -',`') +dnl ifdef(blablablabla,` +dnl ALIGN(16) +dnl PROLOGUE(flint_mpn_aorsrsh_5) +dnl xor tnc, tnc +dnl sub cnt, tnc +dnl xor R32(sx), R32(sx) +dnl +dnl shrx cnt, 0*8(bp), s0 +dnl mov 1*8(bp), s1 +dnl shlx tnc, s1, s2 +dnl lea (s0, s2), s2 +dnl ifelse(OP,`add',` +dnl add 0*8(ap), s2 +dnl mov s2, 0*8(rp) +dnl ',` +dnl mov 0*8(ap), s0 +dnl sub s2, s0 +dnl mov s0, 0*8(rp) +dnl ') +dnl +dnl shrx cnt, s1, s0 +dnl mov 2*8(bp), s1 +dnl shlx tnc, s1, s2 +dnl lea (s0, s2), s2 +dnl ifelse(OP,`add',` +dnl adc 1*8(ap), s2 +dnl mov s2, 1*8(rp) +dnl ',` +dnl mov 1*8(ap), s0 +dnl sbb s2, s0 +dnl mov s0, 1*8(rp) +dnl ') +dnl +dnl shrx cnt, s1, s0 +dnl mov 3*8(bp), s1 +dnl shlx tnc, s1, s2 +dnl lea (s0, s2), s2 +dnl ifelse(OP,`add',` +dnl adc 2*8(ap), s2 +dnl mov s2, 2*8(rp) +dnl ',` +dnl mov 2*8(ap), s0 +dnl sbb s2, s0 +dnl mov s0, 2*8(rp) +dnl ') +dnl +dnl shrx cnt, s1, s0 +dnl mov 4*8(bp), s1 +dnl shlx tnc, s1, s2 +dnl lea (s0, s2), s2 +dnl ifelse(OP,`add',` +dnl adc 3*8(ap), s2 +dnl mov s2, 3*8(rp) +dnl ',` +dnl mov 3*8(ap), s0 +dnl sbb s2, s0 +dnl mov s0, 3*8(rp) +dnl ') +dnl +dnl shrx cnt, s1, s0 +dnl ifelse(OP,`add',` +dnl adc 4*8(ap), s0 +dnl mov s0, 4*8(rp) +dnl ',` +dnl mov 4*8(ap), s2 +dnl sbb s0, s2 +dnl mov s2, 4*8(rp) +dnl ') +dnl +dnl setc R8(sx) +dnl ret +dnl EPILOGUE() +dnl ') TEXT @@ -142,7 +142,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_2) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -160,7 +160,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_3) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -184,7 +184,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_4) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -214,7 +214,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_5) - xor R32(tnc), R32(tnc) + 
xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -250,7 +250,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_6) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -292,7 +292,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_7) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -340,7 +340,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_8) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -394,7 +394,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_9) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -454,7 +454,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_10) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -520,7 +520,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_11) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -592,7 +592,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_12) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -670,7 +670,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_13) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -754,7 +754,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_14) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -844,7 +844,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_15) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -940,7 +940,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_16) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1054,7 +1054,7 @@ EPILOGUE() dnl Modified to avoid pushing and popping s3 ALIGN(16) PROLOGUE(flint_mpn_subrsh_2) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1075,7 +1075,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_3) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1104,7 +1104,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_4) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1140,7 +1140,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_5) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1183,7 +1183,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_6) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1233,7 +1233,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_7) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1290,7 +1290,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_8) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1354,7 +1354,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_9) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1425,7 +1425,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_10) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1503,7 +1503,7 @@ 
EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_11) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1588,7 +1588,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_12) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1680,7 +1680,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_13) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1779,7 +1779,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_14) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1885,7 +1885,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_15) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1998,7 +1998,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_16) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 From 877ec63b6988c3685ffbe7edd66c0d46862f5a2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 19:25:26 +0000 Subject: [PATCH 10/13] bla --- dev/gen_x86_aorsrsh.jl | 2 +- src/mpn_extras/test/t-aorsrsh_n.c | 2 +- .../x86_64/broadwell/aorsrsh_hard.asm | 30 +++++++++---------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/dev/gen_x86_aorsrsh.jl b/dev/gen_x86_aorsrsh.jl index c11655d8fc..8881a996e5 100644 --- a/dev/gen_x86_aorsrsh.jl +++ b/dev/gen_x86_aorsrsh.jl @@ -119,7 +119,7 @@ function aorsrsh(n::Int; is_add::Bool = true) mov( s3, rp(ix)) elseif ix == n - 1 mov( ap(ix), s0) - sub( s1, s0) + sbb( s1, s0) mov( s0, rp(ix)) else mov( ap(ix), s3) diff --git a/src/mpn_extras/test/t-aorsrsh_n.c b/src/mpn_extras/test/t-aorsrsh_n.c index 25d9075af9..24709f0e49 100644 --- a/src/mpn_extras/test/t-aorsrsh_n.c +++ b/src/mpn_extras/test/t-aorsrsh_n.c @@ -102,7 +102,7 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) result = (cf == cg && mpn_cmp(fp, gp, n) == 0); if (!result) TEST_FUNCTION_FAIL( - "%s:\n" + "function: %s\n" "aliasing: %s\n" "ix = %wd\n" "n = %wd\n" diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm index ec24c45e30..ed6d264c06 100644 --- a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -1066,7 +1066,7 @@ PROLOGUE(flint_mpn_subrsh_2) sub s2, tnc mov tnc, 0*8(rp) mov 1*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 1*8(rp) setc R8(sx) ret @@ -1094,7 +1094,7 @@ PROLOGUE(flint_mpn_subrsh_3) sbb s2, s3 mov s3, 1*8(rp) mov 2*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 2*8(rp) pop s3 setc R8(sx) @@ -1130,7 +1130,7 @@ PROLOGUE(flint_mpn_subrsh_4) sbb s2, s3 mov s3, 2*8(rp) mov 3*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 3*8(rp) pop s3 setc R8(sx) @@ -1173,7 +1173,7 @@ PROLOGUE(flint_mpn_subrsh_5) sbb s2, s3 mov s3, 3*8(rp) mov 4*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 4*8(rp) pop s3 setc R8(sx) @@ -1223,7 +1223,7 @@ PROLOGUE(flint_mpn_subrsh_6) sbb s2, s3 mov s3, 4*8(rp) mov 5*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 5*8(rp) pop s3 setc R8(sx) @@ -1280,7 +1280,7 @@ PROLOGUE(flint_mpn_subrsh_7) sbb s2, s3 mov s3, 5*8(rp) mov 6*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 6*8(rp) pop s3 setc R8(sx) @@ -1344,7 +1344,7 @@ PROLOGUE(flint_mpn_subrsh_8) sbb s2, s3 mov s3, 6*8(rp) mov 7*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 7*8(rp) pop s3 setc R8(sx) @@ -1415,7 +1415,7 @@ 
PROLOGUE(flint_mpn_subrsh_9) sbb s2, s3 mov s3, 7*8(rp) mov 8*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 8*8(rp) pop s3 setc R8(sx) @@ -1493,7 +1493,7 @@ PROLOGUE(flint_mpn_subrsh_10) sbb s2, s3 mov s3, 8*8(rp) mov 9*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 9*8(rp) pop s3 setc R8(sx) @@ -1578,7 +1578,7 @@ PROLOGUE(flint_mpn_subrsh_11) sbb s2, s3 mov s3, 9*8(rp) mov 10*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 10*8(rp) pop s3 setc R8(sx) @@ -1670,7 +1670,7 @@ PROLOGUE(flint_mpn_subrsh_12) sbb s2, s3 mov s3, 10*8(rp) mov 11*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 11*8(rp) pop s3 setc R8(sx) @@ -1769,7 +1769,7 @@ PROLOGUE(flint_mpn_subrsh_13) sbb s2, s3 mov s3, 11*8(rp) mov 12*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 12*8(rp) pop s3 setc R8(sx) @@ -1875,7 +1875,7 @@ PROLOGUE(flint_mpn_subrsh_14) sbb s2, s3 mov s3, 12*8(rp) mov 13*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 13*8(rp) pop s3 setc R8(sx) @@ -1988,7 +1988,7 @@ PROLOGUE(flint_mpn_subrsh_15) sbb s2, s3 mov s3, 13*8(rp) mov 14*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 14*8(rp) pop s3 setc R8(sx) @@ -2108,7 +2108,7 @@ PROLOGUE(flint_mpn_subrsh_16) sbb s2, s3 mov s3, 14*8(rp) mov 15*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 15*8(rp) pop s3 setc R8(sx) From a537b899e9c4631969f2b431720dea2c3de11016 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 19:28:21 +0000 Subject: [PATCH 11/13] fixup --- src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm index ed6d264c06..5f79d39516 100644 --- a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -25,6 +25,7 @@ define(`s2', `%r11') define(`s3', `%rbx') dnl From n = 2 onwards, these are generated by `dev/gen_x86_aorsrsh.jl'. +dnl However, flint_mpn_subrsh_2 is touched up afterwards. 
dnl r <- a +/- 2^n b dnl From 55dd1381a4abed605cd253795d98cb2d094fd2d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 20:20:52 +0000 Subject: [PATCH 12/13] fixup --- src/mpn_extras.h | 2 ++ src/mpn_extras/test/t-aorsrsh_n.c | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/mpn_extras.h b/src/mpn_extras.h index 8f4d5ce307..f4c6d216c3 100644 --- a/src/mpn_extras.h +++ b/src/mpn_extras.h @@ -491,6 +491,7 @@ mp_limb_t flint_mpn_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, return FLINT_MPN_ADDRSH_HARD(rp, xp, yp, n, cnt); else { + FLINT_ASSERT(rp != xp); mpn_rshift(rp, yp, n, cnt); return mpn_add_n(rp, rp, xp, n); } @@ -506,6 +507,7 @@ mp_limb_t flint_mpn_subrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, return FLINT_MPN_SUBRSH_HARD(rp, xp, yp, n, cnt); else { + FLINT_ASSERT(rp != xp); mpn_rshift(rp, yp, n, cnt); return mpn_sub_n(rp, xp, rp, n); } diff --git a/src/mpn_extras/test/t-aorsrsh_n.c b/src/mpn_extras/test/t-aorsrsh_n.c index 24709f0e49..acf346d00b 100644 --- a/src/mpn_extras/test/t-aorsrsh_n.c +++ b/src/mpn_extras/test/t-aorsrsh_n.c @@ -53,7 +53,7 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) /* 0: No aliasing * 1: fp = xp * 2: fp = yp */ - aliasing = 0; /* n_randint(state, 3); */ + aliasing = n_randint(state, 3); fp = flint_malloc(sizeof(mp_limb_t) * n); gp = flint_malloc(sizeof(mp_limb_t) * n); @@ -66,6 +66,10 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) type = n_randint(state, 2); + /* FIXME */ + if (n > N_MAX && aliasing == 1) + aliasing = 0; + if (type == 0) { if (aliasing == 0) @@ -112,7 +116,7 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) "FLINT (cy = %wu): %{ulong*}\n" "GMP (cy = %wu): %{ulong*}\n", type == 0 ? "flint_mpn_addrsh_n" : "flint_mpn_subrsh_n", - aliasing == 0 ? "none" : (aliasing == 1) ? "rp = xp" : "rp = yp", + aliasing == 0 ? "none" : (aliasing == 1 ? "rp = xp" : "rp = yp"), ix, n, cnt, xp, n, yp, n, cf, fp, n, cg, gp, n); flint_free(fp); From 9e7f4625487a3a94f4a83b2ece92c22d0e398822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 9 Dec 2024 13:10:33 +0000 Subject: [PATCH 13/13] Stash for aorsrsh for ARM64 --- src/mpn_extras/arm64/aorsrsh_hard.asm | 71 +++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 src/mpn_extras/arm64/aorsrsh_hard.asm diff --git a/src/mpn_extras/arm64/aorsrsh_hard.asm b/src/mpn_extras/arm64/aorsrsh_hard.asm new file mode 100644 index 0000000000..6b21e27fec --- /dev/null +++ b/src/mpn_extras/arm64/aorsrsh_hard.asm @@ -0,0 +1,71 @@ +dnl +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl This file is part of FLINT. +dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . +dnl + +include(`config.m4') + +dnl Everything from n = 2 and onwards is generated by +dnl $topdir/dev/gen_arm_aors.jl. +dnl +dnl This generation was constructed with processors with Apple silicon in mind. +dnl Processors decoding less than 6 operations per cycle, or few store and load +dnl units may have worse performance. 
+
+define(`rp', `x0')
+define(`ap', `x1')
+define(`bp', `x2')
+define(`cnt', `x3')
+
+define(`sx', `x0') C Beware that this is synonymous with rp
+define(`s0', `x3')
+define(`s1', `x4')
+define(`s2', `x5')
+define(`s3', `x6')
+define(`s4', `x7')
+define(`s5', `x8')
+define(`s6', `x9')
+define(`s7', `x10')
+define(`s8', `x11')
+define(`s9', `x12')
+define(`s10', `x13')
+define(`s11', `x14')
+define(`s12', `x15')
+define(`s13', `x16')
+define(`s14', `x17')
+
+dnl r <- a +/- 2^n b
+dnl
+dnl For 0 <= i < n - 1, we have
+dnl
+dnl	r_{i} = a_{i} +/- (b_{i} >> n + b_{i + 1} << (64 - n)),
+dnl
+dnl and
+dnl
+dnl	r_{n - 1} = a_{n - 1} +/- (b_{n - 1} >> n).
+
+PROLOGUE(flint_mpn_aorsrsh(1))
+	C cnt and s0 both live in x3, so consume cnt before loading ap[0].
+	ldr	s1, [bp,#0*8]
+	lsr	s1, s1, cnt
+	ldr	s0, [ap,#0*8]
+	OP	s0, s0, s1
+	str	s0, [rp,#0*8]
+	cset	sx, CC
+	ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors_2)
+	ldp	s0, s2, [ap,#0*8]
+	ldp	s1, s3, [bp,#0*8]
+	OP	s0, s0, s1
+	OPC	s2, s2, s3
+	stp	s0, s2, [rp,#0*8]
+	cset	sx, CC
+	ret
+EPILOGUE()
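
Reviewer note (illustrative, not part of the patch series): the per-limb formula in the arm64 comments above is equivalent to composing a right shift with an addition or subtraction, which is exactly what the generic fallback in mpn_extras.h and the reference functions in t-aorsrsh_n.c do via mpn_rshift followed by mpn_add_n/mpn_sub_n. A minimal portable C sketch of the addrsh case, spelling the formula out limb by limb, follows; the helper name ref_addrsh_n is made up here, and it assumes 1 <= cnt <= GMP_NUMB_BITS - 1, as in the test.

#include <gmp.h>

/* rp[] = xp[] + (yp[] >> cnt); returns the outgoing carry.
   Unlike the generic fallback, which asserts rp != xp, this forward loop
   tolerates both rp == xp and rp == yp. */
static mp_limb_t
ref_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt)
{
    mp_limb_t cy = 0;
    mp_size_t i;

    for (i = 0; i < n; i++)
    {
        mp_limb_t sh, s, c1;

        /* r_i = a_i + ((b_i >> cnt) | (b_{i+1} << (64 - cnt))); the top
           limb has no neighbour and only contributes b_{n-1} >> cnt. */
        sh = yp[i] >> cnt;
        if (i + 1 < n)
            sh |= yp[i + 1] << (GMP_NUMB_BITS - cnt);

        s = xp[i] + sh;
        c1 = (s < xp[i]);
        rp[i] = s + cy;
        cy = c1 | (rp[i] < s);
    }

    return cy;
}

The subrsh case is the same loop with subtraction and borrow propagation in place of addition and carry.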