From d09c98ed7c5ecae20c60e19dcb79e7cf6797a8d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Wed, 27 Nov 2024 15:07:21 +0000 Subject: [PATCH 01/13] Add hardcoded flint_mpn_aors_n for ARM and x86 These are generated from `dev/gen_ARCH_aors.jl`. Also add tests for it. --- dev/gen_arm_aors.jl | 94 +++ dev/gen_x86_aors.jl | 83 +++ src/mpn_extras.h | 41 ++ src/mpn_extras/aors_n.c | 88 +++ src/mpn_extras/arm64/aors_hard.asm | 492 +++++++++++++++ src/mpn_extras/test/main.c | 2 + src/mpn_extras/test/t-aors_n.c | 85 +++ src/mpn_extras/x86_64/broadwell/aors_hard.asm | 565 ++++++++++++++++++ 8 files changed, 1450 insertions(+) create mode 100644 dev/gen_arm_aors.jl create mode 100644 dev/gen_x86_aors.jl create mode 100644 src/mpn_extras/aors_n.c create mode 100644 src/mpn_extras/arm64/aors_hard.asm create mode 100644 src/mpn_extras/test/t-aors_n.c create mode 100644 src/mpn_extras/x86_64/broadwell/aors_hard.asm diff --git a/dev/gen_arm_aors.jl b/dev/gen_arm_aors.jl new file mode 100644 index 0000000000..465b90e64a --- /dev/null +++ b/dev/gen_arm_aors.jl @@ -0,0 +1,94 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. See . +# + +# Generating routines for r <- a OP b, where OP is either + or -. +# +# This generation was constructed with processors with Apple silicon in mind. +# Processors decoding less than 6 operations per cycle, or few store and load +# units may have worse performance. + +r = "rp" +a = "ap" +b = "bp" +rp(ix::Int) = "[$r,#$ix*8]" +ap(ix::Int) = "[$a,#$ix*8]" +bp(ix::Int) = "[$b,#$ix*8]" + +sx = "sx" # Return value for carry or borrow +CC = "CC" + +sp = ["s$ix" for ix in 0:14] # Scrap registers + +# Writes assembly that should be preprocessed by M4. 
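+#
+# As a rough illustration, `aors(2)` emits the body of the n = 2 routine
+# found in `src/mpn_extras/arm64/aors_hard.asm` (before M4 expands the
+# OP/OPC/CC placeholders into adds/adcs/cs or subs/sbcs/cc):
+#
+#   ldp   s0, s2, [ap,#0*8]
+#   ldp   s1, s3, [bp,#0*8]
+#   OP    s0, s0, s1
+#   OPC   s2, s2, s3
+#   stp   s0, s2, [rp,#0*8]
+#   cset  sx, CC
+#   ret
+#
+# Limbs are handled two at a time via ldp/stp, and `shift` below rotates the
+# scrap registers four steps per iteration so the loads for the next pair do
+# not clobber the registers still holding the previous pair.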
+function aors(n::Int) + _str = "PROLOGUE(flint_mpn_aors($n))\n" + function ldr(s0::String, s1::String) + _str *= "\tldr\t$s0, $s1\n" + end + function ldp(s0::String, s1::String, s2::String) + _str *= "\tldp\t$s0, $s1, $s2\n" + end + function str(s0::String, s1::String) + _str *= "\tstr\t$s0, $s1\n" + end + function stp(s0::String, s1::String, s2::String) + _str *= "\tstp\t$s0, $s1, $s2\n" + end + function OP(s0::String, s1::String, s2::String) + _str *= "\tOP\t$s0, $s1, $s2\n" + end + function OPC(s0::String, s1::String, s2::String) + _str *= "\tOPC\t$s0, $s1, $s2\n" + end + function cset(s0::String, s1::String) + _str *= "\tcset\t$s0, $s1\n" + end + + sv = deepcopy(sp) + s(ix::Int) = sv[ix + 1] + function shift(sv::Vector{String}) + sv[(end - 3):end], sv[1:(end - 4)] = sv[1:4], sv[5:end] + end + + ldp( s(0), s(2), ap(0)) + ldp( s(1), s(3), bp(0)) + OP( s(0), s(0), s(1)) + OPC( s(2), s(2), s(3)) + stp( s(0), s(2), rp(0)) + + for ix in 1:(n ÷ 2 - 1) + shift(sv) + ldp( s(0), s(2), ap(2 * ix)) + ldp( s(1), s(3), bp(2 * ix)) + OPC( s(0), s(0), s(1)) + OPC( s(2), s(2), s(3)) + stp( s(0), s(2), rp(2 * ix)) + end + + if n % 2 == 1 + ldr( s(4), ap(n - 1)) + ldr( s(5), bp(n - 1)) + OPC( s(4), s(4), s(5)) + str( s(4), rp(n - 1)) + end + + cset( sx, CC) + + _str *= "\tret\nEPILOGUE()\n" + + return _str +end + +function print_all_aors(nmax::Int = 16) + for n in 2:nmax + println(aors(n)) + end +end diff --git a/dev/gen_x86_aors.jl b/dev/gen_x86_aors.jl new file mode 100644 index 0000000000..0db9110cbd --- /dev/null +++ b/dev/gen_x86_aors.jl @@ -0,0 +1,83 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. See . +# + +# Generating routines for r <- a OP b, where OP is either + or -. +# +# This generation was constructed with processors with descent schedulers in +# mind. + +r = "rp" +a = "ap" +b = "bp" +rp(ix::Int) = "$ix*8($r)" +ap(ix::Int) = "$ix*8($a)" +bp(ix::Int) = "$ix*8($b)" + +sx = "sx" # Return value for carry or borrow, i.e. %rax + +R32(sx::String) = "R32($sx)" +R8(sx::String) = "R8($sx)" + +sp = ["s$ix" for ix in 0:4] # Scrap registers + +# Writes assembly that should be preprocessed by M4. 
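+#
+# As a rough illustration, `aors(2)` emits the body of the n = 2 routine
+# found in `src/mpn_extras/x86_64/broadwell/aors_hard.asm` (before M4
+# expands OP/OPC into add/adc or sub/sbb):
+#
+#   mov   0*8(ap), s0
+#   mov   1*8(ap), s1
+#   xor   R32(sx), R32(sx)
+#   OP    0*8(bp), s0
+#   mov   s0, 0*8(rp)
+#   OPC   1*8(bp), s1
+#   mov   s1, 1*8(rp)
+#   setc  R8(sx)
+#   ret
+#
+# The xor zeroes the return register up front; for larger n the load of
+# ap[i + 1] is issued before the OPC on limb i, and `shift` below rotates
+# the five scrap registers one step per limb.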
+function aors(n::Int) + str = "\tALIGN(16)\nPROLOGUE(flint_mpn_aors($n))\n" + function mov(s0::String, s1::String) + str *= "\tmov\t$s0, $s1\n" + end + function xor(s0::String, s1::String) + str *= "\txor\t$s0, $s1\n" + end + function OP(s0::String, s1::String) + str *= "\tOP\t$s0, $s1\n" + end + function OPC(s0::String, s1::String) + str *= "\tOPC\t$s0, $s1\n" + end + function setc(s0::String) + str *= "\tsetc\t$s0\n" + end + + sv = deepcopy(sp) + s(ix::Int) = sv[ix + 1] + function shift(sv::Vector{String}) + sv[end], sv[1:end - 1] = sv[1], sv[2:end] + end + + mov( ap(0), s(0)) + + mov( ap(1), s(1)) + xor( R32(sx), R32(sx)) + OP( bp(0), s(0)) + mov( s(0), rp(0)) + + for ix in 1:(n - 2) + shift(sv) + mov( ap(ix + 1), s(1)) + OPC( bp(ix), s(0)) + mov( s(0), rp(ix)) + end + + OPC( bp(n - 1), s(1)) + mov( s(1), rp(n - 1)) + setc( R8(sx)) + + str *= "\tret\nEPILOGUE()\n" + + return str +end + +function print_all_aors(nmax::Int = 16) + for n in 2:nmax + println(aors(n)) + end +end diff --git a/src/mpn_extras.h b/src/mpn_extras.h index 90fc8e6436..9d4ba63c0c 100644 --- a/src/mpn_extras.h +++ b/src/mpn_extras.h @@ -462,25 +462,34 @@ mp_limb_t mpn_rsh1sub_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); /* multiplication (general) **************************************************/ +/* NOTE: This is getting a bit messy. How can we clean this up? */ #if FLINT_HAVE_ASSEMBLY_x86_64_adx +# define FLINT_MPN_AORS_FUNC_TAB_WIDTH 17 # define FLINT_MPN_MUL_FUNC_TAB_WIDTH 17 # define FLINT_MPN_SQR_FUNC_TAB_WIDTH 14 +# define FLINT_HAVE_AORS_FUNC(n) ((n) < FLINT_MPN_AORS_FUNC_TAB_WIDTH) # define FLINT_HAVE_MUL_FUNC(n, m) ((n) <= 16) # define FLINT_HAVE_MUL_N_FUNC(n) ((n) <= 16) # define FLINT_HAVE_SQR_FUNC(n) ((n) <= FLINT_MPN_SQR_FUNC_TAB_WIDTH) +# define FLINT_MPN_ADD_HARD(rp, xp, yp, n) (flint_mpn_add_func_tab[n](rp, xp, yp)) +# define FLINT_MPN_SUB_HARD(rp, xp, yp, n) (flint_mpn_sub_func_tab[n](rp, xp, yp)) # define FLINT_MPN_MUL_HARD(rp, xp, xn, yp, yn) (flint_mpn_mul_func_tab[xn][yn](rp, xp, yp)) # define FLINT_MPN_MUL_N_HARD(rp, xp, yp, n) (flint_mpn_mul_n_func_tab[n](rp, xp, yp)) # define FLINT_MPN_SQR_HARD(rp, xp, n) (flint_mpn_sqr_func_tab[n](rp, xp)) #elif FLINT_HAVE_ASSEMBLY_armv8 +# define FLINT_MPN_AORS_FUNC_TAB_WIDTH 17 # define FLINT_MPN_MUL_FUNC_N_TAB_WIDTH 15 # define FLINT_MPN_SQR_FUNC_TAB_WIDTH 9 +# define FLINT_HAVE_AORS_FUNC(n) ((n) < FLINT_MPN_AORS_FUNC_TAB_WIDTH) # define FLINT_HAVE_MUL_FUNC(n, m) FLINT_HAVE_MUL_N_FUNC(n) # define FLINT_HAVE_MUL_N_FUNC(n) ((n) <= FLINT_MPN_MUL_FUNC_N_TAB_WIDTH) # define FLINT_HAVE_SQR_FUNC(n) ((n) <= FLINT_MPN_SQR_FUNC_TAB_WIDTH) +# define FLINT_MPN_ADD_HARD(rp, xp, yp, n) (flint_mpn_add_func_tab[n](rp, xp, yp)) +# define FLINT_MPN_SUB_HARD(rp, xp, yp, n) (flint_mpn_sub_func_tab[n](rp, xp, yp)) # define FLINT_MPN_MUL_HARD(rp, xp, xn, yp, yn) (flint_mpn_mul_func_n_tab[xn](rp, xp, yp, yn)) # define FLINT_MPN_MUL_N_HARD(rp, xp, yp, n) (flint_mpn_mul_func_n_tab[n](rp, xp, yp, n)) # define FLINT_MPN_SQR_HARD(rp, xp, n) (flint_mpn_sqr_func_tab[n](rp, xp)) @@ -506,6 +515,16 @@ typedef mp_limb_t (* flint_mpn_mul_func_t)(mp_ptr, mp_srcptr, mp_srcptr); typedef mp_limb_t (* flint_mpn_mul_func_n_t)(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); typedef mp_limb_t (* flint_mpn_sqr_func_t)(mp_ptr, mp_srcptr); +#ifdef FLINT_MPN_AORS_FUNC_TAB_WIDTH +# define FLINT_USE_AORS_FUNC_TAB 1 +FLINT_DLL extern const flint_mpn_mul_func_t flint_mpn_add_func_tab[]; +FLINT_DLL extern const flint_mpn_mul_func_t flint_mpn_sub_func_tab[]; +#else +# define FLINT_HAVE_AORS_FUNC(n) 0 +# define 
FLINT_MPN_ADD_HARD(rp, xp, yp, n) 0 +# define FLINT_MPN_SUB_HARD(rp, xp, yp, n) 0 +#endif + #ifdef FLINT_MPN_MUL_FUNC_N_TAB_WIDTH FLINT_DLL extern const flint_mpn_mul_func_n_t flint_mpn_mul_func_n_tab[]; #else @@ -522,6 +541,28 @@ mp_limb_t _flint_mpn_mul(mp_ptr r, mp_srcptr x, mp_size_t xn, mp_srcptr y, mp_si void _flint_mpn_mul_n(mp_ptr r, mp_srcptr x, mp_srcptr y, mp_size_t n); mp_limb_t _flint_mpn_sqr(mp_ptr r, mp_srcptr x, mp_size_t n); +MPN_EXTRAS_INLINE +mp_limb_t flint_mpn_add_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) +{ + FLINT_ASSERT(n >= 1); + + if (FLINT_HAVE_AORS_FUNC(n)) + return FLINT_MPN_ADD_HARD(rp, xp, yp, n); + else + return mpn_add_n(rp, xp, yp, n); +} + +MPN_EXTRAS_INLINE +mp_limb_t flint_mpn_sub_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) +{ + FLINT_ASSERT(n >= 1); + + if (FLINT_HAVE_AORS_FUNC(n)) + return FLINT_MPN_SUB_HARD(rp, xp, yp, n); + else + return mpn_sub_n(rp, xp, yp, n); +} + MPN_EXTRAS_INLINE mp_limb_t flint_mpn_mul(mp_ptr r, mp_srcptr x, mp_size_t xn, mp_srcptr y, mp_size_t yn) { diff --git a/src/mpn_extras/aors_n.c b/src/mpn_extras/aors_n.c new file mode 100644 index 0000000000..ee9231aecd --- /dev/null +++ b/src/mpn_extras/aors_n.c @@ -0,0 +1,88 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "mpn_extras.h" + +#define DECL_AORS(n) _DECL_AORS(n) +#define _DECL_AORS(n) \ +mp_limb_t flint_mpn_add_##n(mp_ptr, mp_srcptr, mp_srcptr); \ +mp_limb_t flint_mpn_sub_##n(mp_ptr, mp_srcptr, mp_srcptr) + +#define ADD(n) _ADD(n) +#define _ADD(n) flint_mpn_add_##n +#define SUB(n) _SUB(n) +#define _SUB(n) flint_mpn_sub_##n + +/* Herein we assume that x86 and ARM are equivalent. */ +#if FLINT_HAVE_ASSEMBLY_x86_64_adx || FLINT_HAVE_ASSEMBLY_armv8 +DECL_AORS(1); +DECL_AORS(2); +DECL_AORS(3); +DECL_AORS(4); +DECL_AORS(5); +DECL_AORS(6); +DECL_AORS(7); +DECL_AORS(8); +DECL_AORS(9); +DECL_AORS(10); +DECL_AORS(11); +DECL_AORS(12); +DECL_AORS(13); +DECL_AORS(14); +DECL_AORS(15); +DECL_AORS(16); + +/* TODO: Should probably rename these types so to not have two different types. + * Probably something like `mpn_binary_h_func`, where `h` is for hardcoded. */ +const flint_mpn_mul_func_t flint_mpn_add_func_tab[] = +{ + NULL, + ADD(1), + ADD(2), + ADD(3), + ADD(4), + ADD(5), + ADD(6), + ADD(7), + ADD(8), + ADD(9), + ADD(10), + ADD(11), + ADD(12), + ADD(13), + ADD(14), + ADD(15), + ADD(16) +}; + +const flint_mpn_mul_func_t flint_mpn_sub_func_tab[] = +{ + NULL, + SUB(1), + SUB(2), + SUB(3), + SUB(4), + SUB(5), + SUB(6), + SUB(7), + SUB(8), + SUB(9), + SUB(10), + SUB(11), + SUB(12), + SUB(13), + SUB(14), + SUB(15), + SUB(16) +}; +#else +typedef int this_file_is_empty; +#endif diff --git a/src/mpn_extras/arm64/aors_hard.asm b/src/mpn_extras/arm64/aors_hard.asm new file mode 100644 index 0000000000..ed9cc2a0e0 --- /dev/null +++ b/src/mpn_extras/arm64/aors_hard.asm @@ -0,0 +1,492 @@ +dnl +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl This file is part of FLINT. +dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . 
+dnl + +include(`config.m4') + +dnl Everything from n = 2 and onwards is generated by +dnl $topdir/dev/gen_arm_aors.jl. +dnl +dnl This generation was constructed with processors with Apple silicon in mind. +dnl Processors decoding less than 6 operations per cycle, or few store and load +dnl units may have worse performance. + +define(`rp', `x0') +define(`ap', `x1') +define(`bp', `x2') + +define(`sx', `x0') C Beware that this is synonymous with rp +define(`s0', `x3') +define(`s1', `x4') +define(`s2', `x5') +define(`s3', `x6') +define(`s4', `x7') +define(`s5', `x8') +define(`s6', `x9') +define(`s7', `x10') +define(`s8', `x11') +define(`s9', `x12') +define(`s10', `x13') +define(`s11', `x14') +define(`s12', `x15') +define(`s13', `x16') +define(`s14', `x17') + +define(ALL_AORS,` +PROLOGUE(flint_mpn_aors(1)) + ldr s0, [ap,#0*8] + ldr s1, [bp,#0*8] + OP s0, s0, s1 + str s0, [rp,#0*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(2)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(3)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldr s4, [ap,#2*8] + ldr s5, [bp,#2*8] + OPC s4, s4, s5 + str s4, [rp,#2*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(4)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(5)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldr s8, [ap,#4*8] + ldr s9, [bp,#4*8] + OPC s8, s8, s9 + str s8, [rp,#4*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(6)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(7)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldr s12, [ap,#6*8] + ldr s13, [bp,#6*8] + OPC s12, s12, s13 + str s12, [rp,#6*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(8)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(9)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, 
[rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldr s1, [ap,#8*8] + ldr s2, [bp,#8*8] + OPC s1, s1, s2 + str s1, [rp,#8*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(10)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(11)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldr s5, [ap,#10*8] + ldr s6, [bp,#10*8] + OPC s5, s5, s6 + str s5, [rp,#10*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(12)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldp s5, s7, [ap,#10*8] + ldp s6, s8, [bp,#10*8] + OPC s5, s5, s6 + OPC s7, s7, s8 + stp s5, s7, [rp,#10*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(13)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldp s5, s7, [ap,#10*8] + ldp s6, s8, [bp,#10*8] + OPC s5, s5, s6 + OPC s7, s7, s8 + stp s5, s7, [rp,#10*8] + ldr s9, [ap,#12*8] + ldr s10, [bp,#12*8] + OPC s9, s9, s10 + str s9, [rp,#12*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(14)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, 
s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldp s5, s7, [ap,#10*8] + ldp s6, s8, [bp,#10*8] + OPC s5, s5, s6 + OPC s7, s7, s8 + stp s5, s7, [rp,#10*8] + ldp s9, s11, [ap,#12*8] + ldp s10, s12, [bp,#12*8] + OPC s9, s9, s10 + OPC s11, s11, s12 + stp s9, s11, [rp,#12*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(15)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldp s5, s7, [ap,#10*8] + ldp s6, s8, [bp,#10*8] + OPC s5, s5, s6 + OPC s7, s7, s8 + stp s5, s7, [rp,#10*8] + ldp s9, s11, [ap,#12*8] + ldp s10, s12, [bp,#12*8] + OPC s9, s9, s10 + OPC s11, s11, s12 + stp s9, s11, [rp,#12*8] + ldr s13, [ap,#14*8] + ldr s14, [bp,#14*8] + OPC s13, s13, s14 + str s13, [rp,#14*8] + cset sx, CC + ret +EPILOGUE() + +PROLOGUE(flint_mpn_aors(16)) + ldp s0, s2, [ap,#0*8] + ldp s1, s3, [bp,#0*8] + OP s0, s0, s1 + OPC s2, s2, s3 + stp s0, s2, [rp,#0*8] + ldp s4, s6, [ap,#2*8] + ldp s5, s7, [bp,#2*8] + OPC s4, s4, s5 + OPC s6, s6, s7 + stp s4, s6, [rp,#2*8] + ldp s8, s10, [ap,#4*8] + ldp s9, s11, [bp,#4*8] + OPC s8, s8, s9 + OPC s10, s10, s11 + stp s8, s10, [rp,#4*8] + ldp s12, s14, [ap,#6*8] + ldp s13, s0, [bp,#6*8] + OPC s12, s12, s13 + OPC s14, s14, s0 + stp s12, s14, [rp,#6*8] + ldp s1, s3, [ap,#8*8] + ldp s2, s4, [bp,#8*8] + OPC s1, s1, s2 + OPC s3, s3, s4 + stp s1, s3, [rp,#8*8] + ldp s5, s7, [ap,#10*8] + ldp s6, s8, [bp,#10*8] + OPC s5, s5, s6 + OPC s7, s7, s8 + stp s5, s7, [rp,#10*8] + ldp s9, s11, [ap,#12*8] + ldp s10, s12, [bp,#12*8] + OPC s9, s9, s10 + OPC s11, s11, s12 + stp s9, s11, [rp,#12*8] + ldp s13, s0, [ap,#14*8] + ldp s14, s1, [bp,#14*8] + OPC s13, s13, s14 + OPC s0, s0, s1 + stp s13, s0, [rp,#14*8] + cset sx, CC + ret +EPILOGUE() +') + +define(`flint_mpn_aors',`flint_mpn_add_$1') +define(`OP',`adds') +define(`OPC',`adcs') +define(`CC',`cs') +ALL_AORS +undefine(`flint_mpn_aors') +undefine(`OP') +undefine(`OPC') + +define(`flint_mpn_aors',`flint_mpn_sub_$1') +define(`OP',`subs') +define(`OPC',`sbcs') +define(`CC',`cc') +ALL_AORS +undefine(`flint_mpn_aors') +undefine(`OP') +undefine(`OPC') diff --git a/src/mpn_extras/test/main.c b/src/mpn_extras/test/main.c index f39d688fdd..171a9b7342 100644 --- a/src/mpn_extras/test/main.c +++ b/src/mpn_extras/test/main.c @@ -12,6 +12,7 @@ /* Include functions *********************************************************/ #include "t-2add_n_inplace.c" +#include "t-aors_n.c" #include "t-divides.c" #include "t-divrem_preinv1.c" #include "t-divrem_preinvn.c" @@ -38,6 +39,7 @@ test_struct tests[] = { TEST_FUNCTION(flint_mpn_2add_n_inplace), + TEST_FUNCTION(flint_mpn_aors_n), TEST_FUNCTION(flint_mpn_divides), TEST_FUNCTION(flint_mpn_divrem_preinv1), TEST_FUNCTION(flint_mpn_divrem_preinvn), diff --git a/src/mpn_extras/test/t-aors_n.c b/src/mpn_extras/test/t-aors_n.c new file mode 100644 index 0000000000..0af210d94c --- /dev/null +++ b/src/mpn_extras/test/t-aors_n.c @@ -0,0 +1,85 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + 
Copyright (C) 2024 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "test_helpers.h" +#include "mpn_extras.h" + +#define N_MIN 1 +#define N_MAX (FLINT_MPN_AORS_FUNC_TAB_WIDTH - 1) +#define N_STOR (FLINT_MPN_AORS_FUNC_TAB_WIDTH + 10) + +TEST_FUNCTION_START(flint_mpn_aors_n, state) +{ +#if FLINT_USE_AORS_FUNC_TAB + slong ix; + + for (ix = 0; ix < 10000 * flint_test_multiplier(); ix++) + { + int result; + int type; + mp_limb_t cf, cg; + mp_size_t n; + mp_ptr fp, gp, xp, yp; + + n = N_MIN + n_randint(state, N_MAX - N_MIN + 1); + if (n_randint(state, 1 << 10) == UWORD(0)) + n += N_STOR; + + fp = flint_malloc(sizeof(mp_limb_t) * n); + gp = flint_malloc(sizeof(mp_limb_t) * n); + xp = flint_malloc(sizeof(mp_limb_t) * n); + yp = flint_malloc(sizeof(mp_limb_t) * n); + + flint_mpn_rrandom(xp, state, n); + flint_mpn_rrandom(yp, state, n); + + type = n_randint(state, 2); + + if (type == 0) + { + cf = flint_mpn_add_n(fp, xp, yp, n); + cg = mpn_add_n(gp, xp, yp, n); + } + else + { + cf = flint_mpn_sub_n(fp, xp, yp, n); + cg = mpn_sub_n(gp, xp, yp, n); + } + + result = (cf == cg && mpn_cmp(fp, gp, n) == 0); + if (!result) + TEST_FUNCTION_FAIL( + "%s:\n" + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "yp = %{ulong*}\n" + "FLINT (cy = %wu): %{ulong*}\n" + "GMP (cy = %wu): %{ulong*}\n", + type == 0 ? "flint_mpn_add_n" : "flint_mpn_sub_n", + ix, n, xp, n, yp, n, cf, fp, n, cg, gp, n + 1); + + flint_free(fp); + flint_free(gp); + flint_free(xp); + flint_free(yp); + } + + TEST_FUNCTION_END(state); +#else + TEST_FUNCTION_END_SKIPPED(state); +#endif +} + +#undef N_MIN +#undef N_MAX +#undef N_STOR diff --git a/src/mpn_extras/x86_64/broadwell/aors_hard.asm b/src/mpn_extras/x86_64/broadwell/aors_hard.asm new file mode 100644 index 0000000000..390ee036ec --- /dev/null +++ b/src/mpn_extras/x86_64/broadwell/aors_hard.asm @@ -0,0 +1,565 @@ +dnl +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl This file is part of FLINT. +dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . +dnl + +include(`config.m4') + +dnl Everything from n = 2 and onwards is generated by +dnl $topdir/dev/gen_x86_aors.jl. 
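+dnl
+dnl The ALL_AORS macro below is expanded twice at the bottom of this file:
+dnl once with OP/OPC defined as add/adc, giving flint_mpn_add_1 through
+dnl flint_mpn_add_16, and once with sub/sbb, giving flint_mpn_sub_1 through
+dnl flint_mpn_sub_16.  In every body the carry/borrow out is returned in
+dnl sx (%rax) via setc.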
+ +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`bp', `%rdx') + +define(`sx', `%rax') +define(`s0', `%rcx') +define(`s1', `%r8') +define(`s2', `%r9') +define(`s3', `%r10') +define(`s4', `%r11') + +define(ALL_AORS,` + ALIGN(16) +PROLOGUE(flint_mpn_aors(1)) + mov 0*8(ap), s0 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(2)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(3)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(4)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(5)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(6)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(7)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(8)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(9)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) 
+PROLOGUE(flint_mpn_aors(10)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(11)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(12)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + mov 11*8(ap), s1 + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + OPC 11*8(bp), s1 + mov s1, 11*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(13)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + mov 11*8(ap), s1 + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + mov 12*8(ap), s2 + OPC 11*8(bp), s1 + mov s1, 11*8(rp) + OPC 12*8(bp), s2 + mov s2, 12*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(14)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + mov 11*8(ap), s1 + OPC 10*8(bp), s0 + mov s0, 
10*8(rp) + mov 12*8(ap), s2 + OPC 11*8(bp), s1 + mov s1, 11*8(rp) + mov 13*8(ap), s3 + OPC 12*8(bp), s2 + mov s2, 12*8(rp) + OPC 13*8(bp), s3 + mov s3, 13*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(15)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + mov 11*8(ap), s1 + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + mov 12*8(ap), s2 + OPC 11*8(bp), s1 + mov s1, 11*8(rp) + mov 13*8(ap), s3 + OPC 12*8(bp), s2 + mov s2, 12*8(rp) + mov 14*8(ap), s4 + OPC 13*8(bp), s3 + mov s3, 13*8(rp) + OPC 14*8(bp), s4 + mov s4, 14*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_aors(16)) + mov 0*8(ap), s0 + mov 1*8(ap), s1 + xor R32(sx), R32(sx) + OP 0*8(bp), s0 + mov s0, 0*8(rp) + mov 2*8(ap), s2 + OPC 1*8(bp), s1 + mov s1, 1*8(rp) + mov 3*8(ap), s3 + OPC 2*8(bp), s2 + mov s2, 2*8(rp) + mov 4*8(ap), s4 + OPC 3*8(bp), s3 + mov s3, 3*8(rp) + mov 5*8(ap), s0 + OPC 4*8(bp), s4 + mov s4, 4*8(rp) + mov 6*8(ap), s1 + OPC 5*8(bp), s0 + mov s0, 5*8(rp) + mov 7*8(ap), s2 + OPC 6*8(bp), s1 + mov s1, 6*8(rp) + mov 8*8(ap), s3 + OPC 7*8(bp), s2 + mov s2, 7*8(rp) + mov 9*8(ap), s4 + OPC 8*8(bp), s3 + mov s3, 8*8(rp) + mov 10*8(ap), s0 + OPC 9*8(bp), s4 + mov s4, 9*8(rp) + mov 11*8(ap), s1 + OPC 10*8(bp), s0 + mov s0, 10*8(rp) + mov 12*8(ap), s2 + OPC 11*8(bp), s1 + mov s1, 11*8(rp) + mov 13*8(ap), s3 + OPC 12*8(bp), s2 + mov s2, 12*8(rp) + mov 14*8(ap), s4 + OPC 13*8(bp), s3 + mov s3, 13*8(rp) + mov 15*8(ap), s0 + OPC 14*8(bp), s4 + mov s4, 14*8(rp) + OPC 15*8(bp), s0 + mov s0, 15*8(rp) + setc R8(sx) + ret +EPILOGUE() +') + + TEXT +define(`flint_mpn_aors',`flint_mpn_add_$1') +define(`OP',`add') +define(`OPC',`adc') +ALL_AORS +undefine(`flint_mpn_aors') +undefine(`OP') +undefine(`OPC') + +define(`flint_mpn_aors',`flint_mpn_sub_$1') +define(`OP',`sub') +define(`OPC',`sbb') +ALL_AORS +undefine(`flint_mpn_aors') +undefine(`OP') +undefine(`OPC') From 00ecdcbc6d6b1e6322494f27a9ae3b02d4ec8f32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Sat, 30 Nov 2024 14:47:06 +0000 Subject: [PATCH 02/13] Start on aorsrsh for x86 --- .../x86_64/broadwell/aorsrsh_hard.asm | 220 ++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm new file mode 100644 index 0000000000..4fefb0143b --- /dev/null +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -0,0 +1,220 @@ +dnl +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl This file is part of FLINT. +dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . 
+dnl + +include(`config.m4') + +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`bp', `%rdx') +define(`cnt', `%rcx') + +define(`tnc', `%r8') +define(`sx', `%rax') + +define(`s0', `%r9') +define(`s1', `%r10') +define(`s2', `%r11') + +dnl r <- a +/- 2^n b +dnl +dnl For 0 <= i < n - 1, we have +dnl +dnl r_{i} = a_{i} +/- (b_{i} >> n + b_{i + 1} << (64 - n)), +dnl +dnl and +dnl +dnl r_{n - 1} = a_{n - 1} +/- (b_{n - 1} >> n). + +dnl The idea is the following: +dnl +dnl Assume that bp[i] is loaded in a register b0. +dnl +dnl t = b0 >> n C shrx +dnl b1 = bp[i + 1] C mov, and fullfills assumption for next iteration +dnl s = b1 << (64 - n) C shlx +dnl s = s + t C lea, carry-less +dnl if OP = add, then +dnl s += ap[i] C adc +dnl rp[i] = s C mov +dnl else +dnl u = ap[i] C mov +dnl u -= s C sbb +dnl rp[i] = u C mov +dnl fi + +define(ALL_AORS,` + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_1) + shrx cnt, 0*8(bp), s0 + xor R32(sx), R32(sx) + add 0*8(ap), s0 + mov s0, 0*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_2) + xor R32(tnc), R32(tnc) + sub cnt, tnc + + xor R32(sx), R32(sx) + + mov 1*8(bp), s1 +C + shrx cnt, 0*8(bp), s0 + shlx tnc, s1, s2 + shrx cnt, s1, s1 + C (0, 2), 1 + + adox s2, s0 +C + adcx 0*8(ap), s0 + mov s0, 0*8(rp) + adox sx, s1 C cannot overflow + adcx 1*8(ap), s1 +C + mov s1, 1*8(rp) + + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_3) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 +ifelse(OP,`add',` + add 0*8(ap), s2 + mov s2, 0*8(rp) +',` + mov 0*8(ap), s0 + sub s2, s0 + mov s0, 0*8(rp) +') + C Used: s1 + + shrx cnt, s1, s1 + mov 2*8(bp), s2 + shlx tnc, s2, s0 + lea (s1, s0), s0 +ifelse(OP,`add',` + adc 1*8(ap), s0 + mov s0, 1*8(rp) +',` + mov 1*8(ap), s1 + sbb s0, s1 + mov s1, 1*8(rp) +') + C Used: s2 + + shrx cnt, s2, s2 +ifelse(OP,`add',` + adc 2*8(ap), s2 + mov s2, 2*8(rp) +',` + mov 2*8(ap), s0 + sbb s2, s0 + mov s0, 1*8(rp) +') + + setc R8(sx) + ret +EPILOGUE() +') + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_4) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 +ifelse(OP,`add',` + add 0*8(ap), s2 + mov s2, 0*8(rp) +',` + mov 0*8(ap), s0 + sub s2, s0 + mov s0, 0*8(rp) +') + C Used: s1 + + shrx cnt, s1, s1 + mov 2*8(bp), s2 + shlx tnc, s2, s0 + lea (s1, s0), s0 +ifelse(OP,`add',` + adc 1*8(ap), s0 + mov s0, 1*8(rp) +',` + mov 1*8(ap), s1 + sbb s0, s1 + mov s1, 1*8(rp) +') + C Used: s2 + +C + shrx cnt, s1, s1 + mov 3*8(bp), s2 + shlx tnc, s2, s0 + lea (s1, s0), s0 +ifelse(OP,`add',` + adc 2*8(ap), s0 + mov s0, 2*8(rp) +',` + mov 2*8(ap), s1 + sbb s0, s1 + mov s1, 2*8(rp) +') + C Used: s2 +C + + shrx cnt, s2, s2 +ifelse(OP,`add',` + adc 2*8(ap), s2 + mov s2, 2*8(rp) +',` + mov 2*8(ap), s0 + sbb s2, s0 + mov s0, 1*8(rp) +') + + setc R8(sx) + ret +EPILOGUE() +') + + TEXT +define(`flint_mpn_aorsrsh',`flint_mpn_addrsh_$1') +define(`OP',`add') +define(`OPC',`adc') +ALL_AORSRSH +undefine(`flint_mpn_aorsrsh') +undefine(`OP') +undefine(`OPC') + +define(`flint_mpn_aorsrsh',`flint_mpn_subrsh_$1') +define(`OP',`sub') +define(`OPC',`sbb') +ALL_AORSRSH +undefine(`flint_mpn_aorsrsh') +undefine(`OP') +undefine(`OPC') From 9ce9168e4e6a99ad0e40d279b7fba949a3933401 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 14:07:56 +0000 Subject: [PATCH 03/13] Add generation for hardcoded flint_mpn_addrsh --- 
dev/gen_x86_aorsrsh.jl | 125 ++ .../x86_64/broadwell/aorsrsh_hard.asm | 1026 +++++++++++++++-- 2 files changed, 1048 insertions(+), 103 deletions(-) create mode 100644 dev/gen_x86_aorsrsh.jl diff --git a/dev/gen_x86_aorsrsh.jl b/dev/gen_x86_aorsrsh.jl new file mode 100644 index 0000000000..401471d706 --- /dev/null +++ b/dev/gen_x86_aorsrsh.jl @@ -0,0 +1,125 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. See . +# + +# Generating routines for r <- a OP 2^(cnt) * b, where OP is either + or -. + +r = "rp" +a = "ap" +b = "bp" +cnt = "cnt" +rp(ix::Int) = "$ix*8($r)" +ap(ix::Int) = "$ix*8($a)" +bp(ix::Int) = "$ix*8($b)" + +tnc = "tnc" +sx = "sx" # Return value for carry or borrow, i.e. %rax + +R32(sx::String) = "R32($sx)" +R8(sx::String) = "R8($sx)" + +s0 = "s0" +s1 = "s1" +s2 = "s2" +sp = ["s$ix" for ix in 0:2] # Scrap registers +s(ix::Int) = s[ix + 1] + +# Writes assembly that should be preprocessed by M4. +function addrsh(n::Int) + str = "\tALIGN(16)\nPROLOGUE(flint_mpn_addrsh_$n)\n" + function mov(s0::String, s1::String) + str *= "\tmov\t$s0, $s1\n" + end + function xor(s0::String, s1::String) + str *= "\txor\t$s0, $s1\n" + end + function add(s0::String, s1::String) + str *= "\tadd\t$s0, $s1\n" + end + function adc(s0::String, s1::String) + str *= "\tadc\t$s0, $s1\n" + end + function sub(s0::String, s1::String) + str *= "\tsub\t$s0, $s1\n" + end + function sbb(s0::String, s1::String) + str *= "\tsbb\t$s0, $s1\n" + end + function shrx(s0::String, s1::String, s2::String) + str *= "\tshrx\t$s0, $s1, $s2\n" + end + function shlx(s0::String, s1::String, s2::String) + str *= "\tshlx\t$s0, $s1, $s2\n" + end + function lea(t::Tuple{String, String}, s1::String) + str *= "\tlea\t($(t[1]), $(t[2])), $s1\n" + end + function setc(s0::String) + str *= "\tsetc\t$s0\n" + end + + # Initialize variables + xor( R32(tnc), R32(tnc)) + sub( cnt, tnc) # This is modulo 64, so -n = 64 - n. + xor( R32(sx), R32(sx)) + + # f_a assumes s1 contains ix*8(bp) + function f_a(ix::Int) + if ix == 0 + shrx( cnt, bp(0), s0) + mov( bp(ix + 1), s1) + elseif ix == n - 1 + shrx( cnt, s1, s0) + else + shrx( cnt, s1, s0) + mov( bp(ix + 1), s1) + end + end # s0, s1 used + function f_b(ix::Int) + if ix != n - 1 + shlx( tnc, s1, s2) + lea( (s0, s2), s2) + end + end # s1, s2 used + function f_c(ix::Int) + if ix == 0 + add( ap(ix), s2) + mov( s2, rp(ix)) + elseif ix == n - 1 + adc( ap(ix), s2) + mov( s2, rp(ix)) + else + adc( ap(ix), s0) + mov( s0, rp(ix)) + end + end # nothing used + + # We interleave as follows: + f_a(0) + f_b(0) + for ix in 1:(n - 1) + f_a(ix + 0) + f_c(ix - 1) + f_b(ix + 0) + end + f_c(n - 1) + + setc( R8(sx)) + + str *= "\tret\nEPILOGUE()\n" + + return str +end + +function print_all_addrsh(nmax::Int = 16) + for n in 2:nmax + println(addrsh(n)) + end +end diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm index 4fefb0143b..ee3583ccab 100644 --- a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -23,6 +23,8 @@ define(`s0', `%r9') define(`s1', `%r10') define(`s2', `%r11') +dnl From n = 2 onwards, these are generated by `dev/gen_x86_aorsrsh.jl'. 
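+dnl
+dnl Note that shrx/shlx take their count from a register and do not touch
+dnl the flags, so they can be interleaved freely with the adc/sbb chain.
+dnl The complementary count tnc = 64 - cnt is computed as (0 - cnt) via
+dnl xor/sub, which is equivalent since shift counts are reduced mod 64.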
+ dnl r <- a +/- 2^n b dnl dnl For 0 <= i < n - 1, we have @@ -50,7 +52,83 @@ dnl u -= s C sbb dnl rp[i] = u C mov dnl fi -define(ALL_AORS,` +dnl Non-optimized. We probably should optimize add and sub differently. We +dnl probably need to use more registers to interleave more. +ifdef(blablablabla,` + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_5) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 +ifelse(OP,`add',` + add 0*8(ap), s2 + mov s2, 0*8(rp) +',` + mov 0*8(ap), s0 + sub s2, s0 + mov s0, 0*8(rp) +') + + shrx cnt, s1, s0 + mov 2*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 +ifelse(OP,`add',` + adc 1*8(ap), s2 + mov s2, 1*8(rp) +',` + mov 1*8(ap), s0 + sbb s2, s0 + mov s0, 1*8(rp) +') + + shrx cnt, s1, s0 + mov 3*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 +ifelse(OP,`add',` + adc 2*8(ap), s2 + mov s2, 2*8(rp) +',` + mov 2*8(ap), s0 + sbb s2, s0 + mov s0, 2*8(rp) +') + + shrx cnt, s1, s0 + mov 4*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 +ifelse(OP,`add',` + adc 3*8(ap), s2 + mov s2, 3*8(rp) +',` + mov 3*8(ap), s0 + sbb s2, s0 + mov s0, 3*8(rp) +') + + shrx cnt, s1, s0 +ifelse(OP,`add',` + adc 4*8(ap), s0 + mov s0, 4*8(rp) +',` + mov 4*8(ap), s2 + sbb s0, s2 + mov s2, 4*8(rp) +') + + setc R8(sx) + ret +EPILOGUE()',`') + + TEXT + ALIGN(16) PROLOGUE(flint_mpn_addrsh_1) shrx cnt, 0*8(bp), s0 @@ -65,25 +143,16 @@ EPILOGUE() PROLOGUE(flint_mpn_addrsh_2) xor R32(tnc), R32(tnc) sub cnt, tnc - xor R32(sx), R32(sx) - - mov 1*8(bp), s1 -C shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 shlx tnc, s1, s2 - shrx cnt, s1, s1 - C (0, 2), 1 - - adox s2, s0 -C - adcx 0*8(ap), s0 - mov s0, 0*8(rp) - adox sx, s1 C cannot overflow - adcx 1*8(ap), s1 -C - mov s1, 1*8(rp) - + lea (s0, s2), s2 + shrx cnt, s1, s0 + add 0*8(ap), s2 + mov s2, 0*8(rp) + adc 1*8(ap), s2 + mov s2, 1*8(rp) setc R8(sx) ret EPILOGUE() @@ -93,128 +162,879 @@ PROLOGUE(flint_mpn_addrsh_3) xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) - shrx cnt, 0*8(bp), s0 mov 1*8(bp), s1 shlx tnc, s1, s2 lea (s0, s2), s2 -ifelse(OP,`add',` + shrx cnt, s1, s0 + mov 2*8(bp), s1 add 0*8(ap), s2 mov s2, 0*8(rp) -',` - mov 0*8(ap), s0 - sub s2, s0 - mov s0, 0*8(rp) -') - C Used: s1 - - shrx cnt, s1, s1 - mov 2*8(bp), s2 - shlx tnc, s2, s0 - lea (s1, s0), s0 -ifelse(OP,`add',` + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 adc 1*8(ap), s0 mov s0, 1*8(rp) -',` - mov 1*8(ap), s1 - sbb s0, s1 - mov s1, 1*8(rp) -') - C Used: s2 - - shrx cnt, s2, s2 -ifelse(OP,`add',` adc 2*8(ap), s2 mov s2, 2*8(rp) -',` - mov 2*8(ap), s0 - sbb s2, s0 - mov s0, 1*8(rp) -') - setc R8(sx) ret EPILOGUE() -') ALIGN(16) PROLOGUE(flint_mpn_addrsh_4) xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + adc 3*8(ap), s2 + mov s2, 3*8(rp) + setc R8(sx) + ret +EPILOGUE() + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_5) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 mov 1*8(bp), s1 shlx tnc, s1, s2 lea (s0, s2), s2 -ifelse(OP,`add',` + shrx cnt, s1, s0 + mov 2*8(bp), s1 add 0*8(ap), s2 mov s2, 0*8(rp) -',` - mov 0*8(ap), s0 - sub s2, s0 - mov s0, 0*8(rp) -') - C Used: s1 + shlx tnc, s1, s2 + 
lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + adc 4*8(ap), s2 + mov s2, 4*8(rp) + setc R8(sx) + ret +EPILOGUE() - shrx cnt, s1, s1 - mov 2*8(bp), s2 - shlx tnc, s2, s0 - lea (s1, s0), s0 -ifelse(OP,`add',` + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_6) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 adc 1*8(ap), s0 mov s0, 1*8(rp) -',` - mov 1*8(ap), s1 - sbb s0, s1 - mov s1, 1*8(rp) -') - C Used: s2 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + adc 5*8(ap), s2 + mov s2, 5*8(rp) + setc R8(sx) + ret +EPILOGUE() -C - shrx cnt, s1, s1 - mov 3*8(bp), s2 - shlx tnc, s2, s0 - lea (s1, s0), s0 -ifelse(OP,`add',` + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_7) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 adc 2*8(ap), s0 mov s0, 2*8(rp) -',` - mov 2*8(ap), s1 - sbb s0, s1 - mov s1, 2*8(rp) -') - C Used: s2 -C + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + adc 6*8(ap), s2 + mov s2, 6*8(rp) + setc R8(sx) + ret +EPILOGUE() - shrx cnt, s2, s2 -ifelse(OP,`add',` - adc 2*8(ap), s2 - mov s2, 2*8(rp) -',` - mov 2*8(ap), s0 - sbb s2, s0 + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_8) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 mov s0, 1*8(rp) -') + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + adc 7*8(ap), s2 + mov s2, 7*8(rp) + setc R8(sx) + ret +EPILOGUE() + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_9) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, 
s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + adc 8*8(ap), s2 + mov s2, 8*8(rp) setc R8(sx) ret EPILOGUE() -') - TEXT -define(`flint_mpn_aorsrsh',`flint_mpn_addrsh_$1') -define(`OP',`add') -define(`OPC',`adc') -ALL_AORSRSH -undefine(`flint_mpn_aorsrsh') -undefine(`OP') -undefine(`OPC') - -define(`flint_mpn_aorsrsh',`flint_mpn_subrsh_$1') -define(`OP',`sub') -define(`OPC',`sbb') -ALL_AORSRSH -undefine(`flint_mpn_aorsrsh') -undefine(`OP') -undefine(`OPC') + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_10) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 8*8(ap), s0 + mov s0, 8*8(rp) + adc 9*8(ap), s2 + mov s2, 9*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_11) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, 
s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s0 + mov s0, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 9*8(ap), s0 + mov s0, 9*8(rp) + adc 10*8(ap), s2 + mov s2, 10*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_12) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s0 + mov s0, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s0 + mov s0, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 10*8(ap), s0 + mov s0, 10*8(rp) + adc 11*8(ap), s2 + mov s2, 11*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_13) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s0 + mov s0, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s0 + mov s0, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s0 + mov s0, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 11*8(ap), s0 + mov s0, 11*8(rp) + adc 12*8(ap), s2 + mov s2, 12*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_14) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov 
s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s0 + mov s0, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s0 + mov s0, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s0 + mov s0, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 13*8(bp), s1 + adc 11*8(ap), s0 + mov s0, 11*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 12*8(ap), s0 + mov s0, 12*8(rp) + adc 13*8(ap), s2 + mov s2, 13*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_15) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s0 + mov s0, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s0 + mov s0, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s0 + mov s0, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 13*8(bp), s1 + adc 11*8(ap), s0 + mov s0, 11*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 14*8(bp), s1 + adc 12*8(ap), s0 + mov s0, 12*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 13*8(ap), s0 + mov s0, 13*8(rp) + adc 14*8(ap), s2 + mov s2, 14*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_16) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s0 + mov s0, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s0 + mov s0, 2*8(rp) + shlx 
tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s0 + mov s0, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s0 + mov s0, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s0 + mov s0, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s0 + mov s0, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s0 + mov s0, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s0 + mov s0, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s0 + mov s0, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s0 + mov s0, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 13*8(bp), s1 + adc 11*8(ap), s0 + mov s0, 11*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 14*8(bp), s1 + adc 12*8(ap), s0 + mov s0, 12*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 15*8(bp), s1 + adc 13*8(ap), s0 + mov s0, 13*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + adc 14*8(ap), s0 + mov s0, 14*8(rp) + adc 15*8(ap), s2 + mov s2, 15*8(rp) + setc R8(sx) + ret +EPILOGUE() From c8069306b6f523c8013923eee639463d0612245e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 15:38:01 +0000 Subject: [PATCH 04/13] Also subrsh --- dev/gen_x86_aorsrsh.jl | 62 +- .../x86_64/broadwell/aorsrsh_hard.asm | 1692 ++++++++++++++--- 2 files changed, 1432 insertions(+), 322 deletions(-) diff --git a/dev/gen_x86_aorsrsh.jl b/dev/gen_x86_aorsrsh.jl index 401471d706..daa696ae5b 100644 --- a/dev/gen_x86_aorsrsh.jl +++ b/dev/gen_x86_aorsrsh.jl @@ -28,12 +28,19 @@ R8(sx::String) = "R8($sx)" s0 = "s0" s1 = "s1" s2 = "s2" -sp = ["s$ix" for ix in 0:2] # Scrap registers +s3 = "s3" +sp = ["s$ix" for ix in 0:3] # Scrap registers s(ix::Int) = s[ix + 1] # Writes assembly that should be preprocessed by M4. -function addrsh(n::Int) - str = "\tALIGN(16)\nPROLOGUE(flint_mpn_addrsh_$n)\n" +function aorsrsh(n::Int; is_add::Bool = true) + str = "\tALIGN(16)\nPROLOGUE(flint_mpn_$(is_add ? "add" : "sub")rsh_$n)\n" + function push(s0::String) + str *= "\tpush\t$s0\n" + end + function pop(s0::String) + str *= "\tpop\t$s0\n" + end function mov(s0::String, s1::String) str *= "\tmov\t$s0, $s1\n" end @@ -66,6 +73,9 @@ function addrsh(n::Int) end # Initialize variables + if !is_add + push( s3) + end xor( R32(tnc), R32(tnc)) sub( cnt, tnc) # This is modulo 64, so -n = 64 - n. xor( R32(sx), R32(sx)) @@ -76,7 +86,7 @@ function addrsh(n::Int) shrx( cnt, bp(0), s0) mov( bp(ix + 1), s1) elseif ix == n - 1 - shrx( cnt, s1, s0) + shrx( cnt, s1, s1) else shrx( cnt, s1, s0) mov( bp(ix + 1), s1) @@ -89,15 +99,33 @@ function addrsh(n::Int) end end # s1, s2 used function f_c(ix::Int) - if ix == 0 - add( ap(ix), s2) - mov( s2, rp(ix)) - elseif ix == n - 1 - adc( ap(ix), s2) - mov( s2, rp(ix)) + if is_add + if ix == 0 + add( ap(ix), s2) + mov( s2, rp(ix)) + elseif ix == n - 1 + adc( ap(ix), s1) + mov( s1, rp(ix)) + else + adc( ap(ix), s2) + mov( s2, rp(ix)) + end else - adc( ap(ix), s0) - mov( s0, rp(ix)) + # Due to the lack of an `rsub' instruction, we need an extra + # register. 
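+            # For reference, the emitted x86 differs between the two paths:
+            # the add path can fold the memory operand directly, e.g.
+            #     adc  ix*8(ap), s2
+            # while the sub path must go through the extra register,
+            #     mov  ix*8(ap), s3;  sbb  s2, s3;  mov  s3, ix*8(rp)
+            # since sbb can only subtract its source from its destination.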
+ if ix == 0 + mov( ap(ix), s3) + sub( s2, s3) + mov( s3, rp(ix)) + elseif ix == n - 1 + mov( ap(ix), s0) + sub( s1, s0) + mov( s0, rp(ix)) + else + mov( ap(ix), s3) + sbb( s2, s3) + mov( s3, rp(ix)) + end end end # nothing used @@ -111,6 +139,9 @@ function addrsh(n::Int) end f_c(n - 1) + if !is_add + pop( s3) + end setc( R8(sx)) str *= "\tret\nEPILOGUE()\n" @@ -118,8 +149,11 @@ function addrsh(n::Int) return str end -function print_all_addrsh(nmax::Int = 16) +function print_all_aorsrsh(nmax::Int = 16) + for n in 2:nmax + println(aorsrsh(n, is_add = true)) + end for n in 2:nmax - println(addrsh(n)) + println(aorsrsh(n, is_add = false)) end end diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm index ee3583ccab..2b075466af 100644 --- a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -125,22 +125,935 @@ ifelse(OP,`add',` setc R8(sx) ret -EPILOGUE()',`') +EPILOGUE() +',`') + + TEXT + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_1) + shrx cnt, 0*8(bp), s0 + xor R32(sx), R32(sx) + add 0*8(ap), s0 + mov s0, 0*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_2) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + adc 1*8(ap), s1 + mov s1, 1*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_3) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + adc 2*8(ap), s1 + mov s1, 2*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_4) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + adc 3*8(ap), s1 + mov s1, 3*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_5) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + adc 4*8(ap), s1 + mov s1, 4*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_6) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 
2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + adc 5*8(ap), s1 + mov s1, 5*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_7) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + adc 6*8(ap), s1 + mov s1, 6*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_8) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + adc 7*8(ap), s1 + mov s1, 7*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_9) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + adc 8*8(ap), s1 + mov s1, 8*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_10) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 
1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 8*8(ap), s2 + mov s2, 8*8(rp) + adc 9*8(ap), s1 + mov s1, 9*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_11) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s2 + mov s2, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 9*8(ap), s2 + mov s2, 9*8(rp) + adc 10*8(ap), s1 + mov s1, 10*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_12) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s2 + mov s2, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s2 + mov s2, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 10*8(ap), s2 + mov s2, 10*8(rp) + adc 11*8(ap), s1 + mov 
s1, 11*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_13) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s2 + mov s2, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s2 + mov s2, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s2 + mov s2, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 11*8(ap), s2 + mov s2, 11*8(rp) + adc 12*8(ap), s1 + mov s1, 12*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_14) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s2 + mov s2, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s2 + mov s2, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s2 + mov s2, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 13*8(bp), s1 + adc 11*8(ap), s2 + mov s2, 11*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 12*8(ap), s2 + mov s2, 12*8(rp) + adc 13*8(ap), s1 + mov s1, 13*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_15) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, 
s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s2 + mov s2, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s2 + mov s2, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s2 + mov s2, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 13*8(bp), s1 + adc 11*8(ap), s2 + mov s2, 11*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 14*8(bp), s1 + adc 12*8(ap), s2 + mov s2, 12*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 13*8(ap), s2 + mov s2, 13*8(rp) + adc 14*8(ap), s1 + mov s1, 14*8(rp) + setc R8(sx) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_addrsh_16) + xor R32(tnc), R32(tnc) + sub cnt, tnc + xor R32(sx), R32(sx) + shrx cnt, 0*8(bp), s0 + mov 1*8(bp), s1 + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 2*8(bp), s1 + add 0*8(ap), s2 + mov s2, 0*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 3*8(bp), s1 + adc 1*8(ap), s2 + mov s2, 1*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 4*8(bp), s1 + adc 2*8(ap), s2 + mov s2, 2*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 5*8(bp), s1 + adc 3*8(ap), s2 + mov s2, 3*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 6*8(bp), s1 + adc 4*8(ap), s2 + mov s2, 4*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 7*8(bp), s1 + adc 5*8(ap), s2 + mov s2, 5*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 8*8(bp), s1 + adc 6*8(ap), s2 + mov s2, 6*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 9*8(bp), s1 + adc 7*8(ap), s2 + mov s2, 7*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 10*8(bp), s1 + adc 8*8(ap), s2 + mov s2, 8*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 11*8(bp), s1 + adc 9*8(ap), s2 + mov s2, 9*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 12*8(bp), s1 + adc 10*8(ap), s2 + mov s2, 10*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 13*8(bp), s1 + adc 11*8(ap), s2 + mov s2, 11*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 14*8(bp), s1 + adc 12*8(ap), s2 + mov s2, 12*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s0 + mov 15*8(bp), s1 + adc 13*8(ap), s2 + mov s2, 13*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + adc 14*8(ap), s2 + mov s2, 14*8(rp) + adc 15*8(ap), s1 + mov s1, 15*8(rp) + setc R8(sx) + ret +EPILOGUE() - TEXT - ALIGN(16) -PROLOGUE(flint_mpn_addrsh_1) +PROLOGUE(flint_mpn_subrsh_1) shrx cnt, 0*8(bp), s0 xor R32(sx), R32(sx) - add 0*8(ap), s0 - mov s0, 0*8(rp) + mov 0*8(ap), s1 + sub s0, s1 + mov s1, 0*8(rp) setc 
R8(sx) ret EPILOGUE() +dnl Modified to avoid pushing and popping s3 ALIGN(16) -PROLOGUE(flint_mpn_addrsh_2) +PROLOGUE(flint_mpn_subrsh_2) xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -148,17 +1061,20 @@ PROLOGUE(flint_mpn_addrsh_2) mov 1*8(bp), s1 shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - add 0*8(ap), s2 - mov s2, 0*8(rp) - adc 1*8(ap), s2 - mov s2, 1*8(rp) + shrx cnt, s1, s1 + mov 0*8(ap), tnc + sub s2, tnc + mov tnc, 0*8(rp) + mov 1*8(ap), s0 + sub s1, s0 + mov s0, 1*8(rp) setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_3) +PROLOGUE(flint_mpn_subrsh_3) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -168,21 +1084,26 @@ PROLOGUE(flint_mpn_addrsh_3) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 1*8(ap), s0 - mov s0, 1*8(rp) - adc 2*8(ap), s2 - mov s2, 2*8(rp) + shrx cnt, s1, s1 + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) + mov 2*8(ap), s0 + sub s1, s0 + mov s0, 2*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_4) +PROLOGUE(flint_mpn_subrsh_4) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -192,27 +1113,33 @@ PROLOGUE(flint_mpn_addrsh_4) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 2*8(ap), s0 - mov s0, 2*8(rp) - adc 3*8(ap), s2 - mov s2, 3*8(rp) + shrx cnt, s1, s1 + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) + mov 3*8(ap), s0 + sub s1, s0 + mov s0, 3*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_5) +PROLOGUE(flint_mpn_subrsh_5) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -222,33 +1149,40 @@ PROLOGUE(flint_mpn_addrsh_5) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 3*8(ap), s0 - mov s0, 3*8(rp) - adc 4*8(ap), s2 - mov s2, 4*8(rp) + shrx cnt, s1, s1 + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) + mov 4*8(ap), s0 + sub s1, s0 + mov s0, 4*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_6) +PROLOGUE(flint_mpn_subrsh_6) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -258,39 +1192,47 @@ PROLOGUE(flint_mpn_addrsh_6) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb 
s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 4*8(ap), s0 - mov s0, 4*8(rp) - adc 5*8(ap), s2 - mov s2, 5*8(rp) + shrx cnt, s1, s1 + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) + mov 5*8(ap), s0 + sub s1, s0 + mov s0, 5*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_7) +PROLOGUE(flint_mpn_subrsh_7) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -300,45 +1242,54 @@ PROLOGUE(flint_mpn_addrsh_7) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 5*8(ap), s0 - mov s0, 5*8(rp) - adc 6*8(ap), s2 - mov s2, 6*8(rp) + shrx cnt, s1, s1 + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) + mov 6*8(ap), s0 + sub s1, s0 + mov s0, 6*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_8) +PROLOGUE(flint_mpn_subrsh_8) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -348,51 +1299,61 @@ PROLOGUE(flint_mpn_addrsh_8) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 6*8(ap), s0 - mov s0, 6*8(rp) - adc 7*8(ap), s2 - mov s2, 7*8(rp) + shrx cnt, s1, s1 + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) + mov 7*8(ap), s0 + sub s1, s0 + mov s0, 7*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_9) +PROLOGUE(flint_mpn_subrsh_9) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -402,57 +1363,68 @@ PROLOGUE(flint_mpn_addrsh_9) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 
mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 7*8(ap), s0 - mov s0, 7*8(rp) - adc 8*8(ap), s2 - mov s2, 8*8(rp) + shrx cnt, s1, s1 + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) + mov 8*8(ap), s0 + sub s1, s0 + mov s0, 8*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_10) +PROLOGUE(flint_mpn_subrsh_10) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -462,63 +1434,75 @@ PROLOGUE(flint_mpn_addrsh_10) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 9*8(bp), s1 - adc 7*8(ap), s0 - mov s0, 7*8(rp) + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 8*8(ap), s0 - mov s0, 8*8(rp) - adc 9*8(ap), s2 - mov s2, 9*8(rp) + shrx cnt, s1, s1 + mov 8*8(ap), s3 + sbb s2, s3 + mov s3, 8*8(rp) + mov 9*8(ap), s0 + sub s1, s0 + mov s0, 9*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_11) +PROLOGUE(flint_mpn_subrsh_11) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -528,69 +1512,82 @@ PROLOGUE(flint_mpn_addrsh_11) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 
5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 9*8(bp), s1 - adc 7*8(ap), s0 - mov s0, 7*8(rp) + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 10*8(bp), s1 - adc 8*8(ap), s0 - mov s0, 8*8(rp) + mov 8*8(ap), s3 + sbb s2, s3 + mov s3, 8*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 9*8(ap), s0 - mov s0, 9*8(rp) - adc 10*8(ap), s2 - mov s2, 10*8(rp) + shrx cnt, s1, s1 + mov 9*8(ap), s3 + sbb s2, s3 + mov s3, 9*8(rp) + mov 10*8(ap), s0 + sub s1, s0 + mov s0, 10*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_12) +PROLOGUE(flint_mpn_subrsh_12) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -600,75 +1597,89 @@ PROLOGUE(flint_mpn_addrsh_12) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 9*8(bp), s1 - adc 7*8(ap), s0 - mov s0, 7*8(rp) + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 10*8(bp), s1 - adc 8*8(ap), s0 - mov s0, 8*8(rp) + mov 8*8(ap), s3 + sbb s2, s3 + mov s3, 8*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 11*8(bp), s1 - adc 9*8(ap), s0 - mov s0, 9*8(rp) + mov 9*8(ap), s3 + sbb s2, s3 + mov s3, 9*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 10*8(ap), s0 - mov s0, 10*8(rp) - adc 11*8(ap), s2 - mov s2, 11*8(rp) + shrx cnt, s1, s1 + mov 10*8(ap), s3 + sbb s2, s3 + mov s3, 10*8(rp) + mov 11*8(ap), s0 + sub s1, s0 + mov s0, 11*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_13) +PROLOGUE(flint_mpn_subrsh_13) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -678,81 +1689,96 @@ PROLOGUE(flint_mpn_addrsh_13) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx 
tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 9*8(bp), s1 - adc 7*8(ap), s0 - mov s0, 7*8(rp) + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 10*8(bp), s1 - adc 8*8(ap), s0 - mov s0, 8*8(rp) + mov 8*8(ap), s3 + sbb s2, s3 + mov s3, 8*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 11*8(bp), s1 - adc 9*8(ap), s0 - mov s0, 9*8(rp) + mov 9*8(ap), s3 + sbb s2, s3 + mov s3, 9*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 12*8(bp), s1 - adc 10*8(ap), s0 - mov s0, 10*8(rp) + mov 10*8(ap), s3 + sbb s2, s3 + mov s3, 10*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 11*8(ap), s0 - mov s0, 11*8(rp) - adc 12*8(ap), s2 - mov s2, 12*8(rp) + shrx cnt, s1, s1 + mov 11*8(ap), s3 + sbb s2, s3 + mov s3, 11*8(rp) + mov 12*8(ap), s0 + sub s1, s0 + mov s0, 12*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_14) +PROLOGUE(flint_mpn_subrsh_14) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -762,87 +1788,103 @@ PROLOGUE(flint_mpn_addrsh_14) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 9*8(bp), s1 - adc 7*8(ap), s0 - mov s0, 7*8(rp) + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 10*8(bp), s1 - adc 8*8(ap), s0 - mov s0, 8*8(rp) + mov 8*8(ap), s3 + sbb s2, s3 + mov s3, 8*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 11*8(bp), s1 - adc 9*8(ap), s0 - mov s0, 9*8(rp) + mov 9*8(ap), s3 + sbb s2, s3 + mov s3, 9*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 12*8(bp), s1 - adc 10*8(ap), s0 - mov s0, 10*8(rp) + mov 10*8(ap), s3 + sbb s2, s3 + mov s3, 10*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 13*8(bp), s1 - adc 11*8(ap), s0 - mov s0, 11*8(rp) + mov 11*8(ap), s3 + sbb s2, s3 + mov s3, 11*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 12*8(ap), s0 - mov s0, 12*8(rp) - adc 13*8(ap), s2 - mov s2, 13*8(rp) + shrx cnt, s1, s1 + mov 12*8(ap), s3 + sbb s2, s3 + mov s3, 
12*8(rp) + mov 13*8(ap), s0 + sub s1, s0 + mov s0, 13*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_15) +PROLOGUE(flint_mpn_subrsh_15) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -852,93 +1894,110 @@ PROLOGUE(flint_mpn_addrsh_15) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 - adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 9*8(bp), s1 - adc 7*8(ap), s0 - mov s0, 7*8(rp) + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 10*8(bp), s1 - adc 8*8(ap), s0 - mov s0, 8*8(rp) + mov 8*8(ap), s3 + sbb s2, s3 + mov s3, 8*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 11*8(bp), s1 - adc 9*8(ap), s0 - mov s0, 9*8(rp) + mov 9*8(ap), s3 + sbb s2, s3 + mov s3, 9*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 12*8(bp), s1 - adc 10*8(ap), s0 - mov s0, 10*8(rp) + mov 10*8(ap), s3 + sbb s2, s3 + mov s3, 10*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 13*8(bp), s1 - adc 11*8(ap), s0 - mov s0, 11*8(rp) + mov 11*8(ap), s3 + sbb s2, s3 + mov s3, 11*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 14*8(bp), s1 - adc 12*8(ap), s0 - mov s0, 12*8(rp) + mov 12*8(ap), s3 + sbb s2, s3 + mov s3, 12*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 13*8(ap), s0 - mov s0, 13*8(rp) - adc 14*8(ap), s2 - mov s2, 14*8(rp) + shrx cnt, s1, s1 + mov 13*8(ap), s3 + sbb s2, s3 + mov s3, 13*8(rp) + mov 14*8(ap), s0 + sub s1, s0 + mov s0, 14*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() ALIGN(16) -PROLOGUE(flint_mpn_addrsh_16) +PROLOGUE(flint_mpn_subrsh_16) + push s3 xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) @@ -948,93 +2007,110 @@ PROLOGUE(flint_mpn_addrsh_16) lea (s0, s2), s2 shrx cnt, s1, s0 mov 2*8(bp), s1 - add 0*8(ap), s2 - mov s2, 0*8(rp) + mov 0*8(ap), s3 + sub s2, s3 + mov s3, 0*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 3*8(bp), s1 - adc 1*8(ap), s0 - mov s0, 1*8(rp) + mov 1*8(ap), s3 + sbb s2, s3 + mov s3, 1*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 4*8(bp), s1 - adc 2*8(ap), s0 - mov s0, 2*8(rp) + mov 2*8(ap), s3 + sbb s2, s3 + mov s3, 2*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 5*8(bp), s1 - adc 3*8(ap), s0 - mov s0, 3*8(rp) + mov 3*8(ap), s3 + sbb s2, s3 + mov s3, 3*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 6*8(bp), s1 - adc 4*8(ap), s0 - mov s0, 4*8(rp) + mov 4*8(ap), s3 + sbb s2, s3 + mov s3, 4*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 7*8(bp), s1 
- adc 5*8(ap), s0 - mov s0, 5*8(rp) + mov 5*8(ap), s3 + sbb s2, s3 + mov s3, 5*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 8*8(bp), s1 - adc 6*8(ap), s0 - mov s0, 6*8(rp) + mov 6*8(ap), s3 + sbb s2, s3 + mov s3, 6*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 9*8(bp), s1 - adc 7*8(ap), s0 - mov s0, 7*8(rp) + mov 7*8(ap), s3 + sbb s2, s3 + mov s3, 7*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 10*8(bp), s1 - adc 8*8(ap), s0 - mov s0, 8*8(rp) + mov 8*8(ap), s3 + sbb s2, s3 + mov s3, 8*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 11*8(bp), s1 - adc 9*8(ap), s0 - mov s0, 9*8(rp) + mov 9*8(ap), s3 + sbb s2, s3 + mov s3, 9*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 12*8(bp), s1 - adc 10*8(ap), s0 - mov s0, 10*8(rp) + mov 10*8(ap), s3 + sbb s2, s3 + mov s3, 10*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 13*8(bp), s1 - adc 11*8(ap), s0 - mov s0, 11*8(rp) + mov 11*8(ap), s3 + sbb s2, s3 + mov s3, 11*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 14*8(bp), s1 - adc 12*8(ap), s0 - mov s0, 12*8(rp) + mov 12*8(ap), s3 + sbb s2, s3 + mov s3, 12*8(rp) shlx tnc, s1, s2 lea (s0, s2), s2 shrx cnt, s1, s0 mov 15*8(bp), s1 - adc 13*8(ap), s0 - mov s0, 13*8(rp) - shlx tnc, s1, s2 - lea (s0, s2), s2 - shrx cnt, s1, s0 - adc 14*8(ap), s0 - mov s0, 14*8(rp) - adc 15*8(ap), s2 - mov s2, 15*8(rp) + mov 13*8(ap), s3 + sbb s2, s3 + mov s3, 13*8(rp) + shlx tnc, s1, s2 + lea (s0, s2), s2 + shrx cnt, s1, s1 + mov 14*8(ap), s3 + sbb s2, s3 + mov s3, 14*8(rp) + mov 15*8(ap), s0 + sub s1, s0 + mov s0, 15*8(rp) + pop s3 setc R8(sx) ret EPILOGUE() From 62a6c2012dcbaa5ff94f53868c66bf8fde76ee41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 15:38:38 +0000 Subject: [PATCH 05/13] Fixup --- src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm index 2b075466af..442d440e53 100644 --- a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -22,6 +22,7 @@ define(`sx', `%rax') define(`s0', `%r9') define(`s1', `%r10') define(`s2', `%r11') +define(`s3', `%rbx') dnl From n = 2 onwards, these are generated by `dev/gen_x86_aorsrsh.jl'. From e63d4f7e7445335a00b03fe086d94d0853ce81a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 15:40:45 +0000 Subject: [PATCH 06/13] fixup --- src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm index 442d440e53..b84a4df265 100644 --- a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -53,11 +53,10 @@ dnl u -= s C sbb dnl rp[i] = u C mov dnl fi -dnl Non-optimized. We probably should optimize add and sub differently. We -dnl probably need to use more registers to interleave more. +dnl Non-optimized version. 
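+dnl For reference, limb i of (bp >> cnt) is assembled as
+dnl   (bp[i] >> cnt) + (bp[i+1] << (64 - cnt)),
+dnl i.e. shrx by cnt, shlx by tnc = -cnt (mod 64), and an lea to combine the
+dnl two disjoint halves before the add/sub against ap[i].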
ifdef(blablablabla,` ALIGN(16) -PROLOGUE(flint_mpn_addrsh_5) +PROLOGUE(flint_mpn_aorsrsh_5) xor R32(tnc), R32(tnc) sub cnt, tnc xor R32(sx), R32(sx) From 2daf43a14f1e5dc7db93eb0823627ed6e122424e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 16:04:02 +0000 Subject: [PATCH 07/13] Add corresponding C sources for aorsrsh --- src/mpn_extras.h | 51 ++++++++++++++++++++++ src/mpn_extras/aorsrsh_n.c | 88 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 src/mpn_extras/aorsrsh_n.c diff --git a/src/mpn_extras.h b/src/mpn_extras.h index 9d4ba63c0c..9b2877e20c 100644 --- a/src/mpn_extras.h +++ b/src/mpn_extras.h @@ -460,6 +460,56 @@ mp_limb_t mpn_rsh1add_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); mp_limb_t mpn_rsh1sub_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); #endif +#if FLINT_HAVE_ASSEMBLY_x86_64_adx +# define FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH 17 + +# define FLINT_HAVE_AORS_FUNC(n) ((n) < FLINT_MPN_AORS_FUNC_TAB_WIDTH) + +# define FLINT_MPN_ADDRSH_HARD(rp, xp, yp, n, cnt) (flint_mpn_addrsh_func_tab[n](rp, xp, yp, cnt)) +# define FLINT_MPN_SUBRSH_HARD(rp, xp, yp, n, cnt) (flint_mpn_subrsh_func_tab[n](rp, xp, yp, cnt)) +#endif + +typedef mp_limb_t (* flint_mpn_aorssh_func_t)(mp_ptr, mp_srcptr, mp_srcptr, unsigned int); + +#ifdef FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH +# define FLINT_USE_AORSRSH_FUNC_TAB 1 +FLINT_DLL extern const flint_mpn_aorssh_func_t flint_mpn_addrsh_func_tab[]; +FLINT_DLL extern const flint_mpn_aorssh_func_t flint_mpn_subrsh_func_tab[]; +#else +# define FLINT_HAVE_AORSRSH_FUNC(n) 0 +# define FLINT_MPN_ADDRSH_HARD(rp, xp, yp, n, cnt) 0 +# define FLINT_MPN_SUBRSH_HARD(rp, xp, yp, n, cnt) 0 +#endif + +MPN_EXTRAS_INLINE +mp_limb_t flint_mpn_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) +{ + FLINT_ASSERT(n >= 1); + + if (FLINT_HAVE_AORSRSH_FUNC(n)) + return FLINT_MPN_ADDRSH_HARD(rp, xp, yp, n, cnt); + else + { + mpn_rshift(rp, yp, n, cnt); + return mpn_add_n(rp, rp, xp, n); + } +} + +MPN_EXTRAS_INLINE +mp_limb_t flint_mpn_subrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) +{ + FLINT_ASSERT(n >= 1); + + if (FLINT_HAVE_AORSRSH_FUNC(n)) + return FLINT_MPN_SUBRSH_HARD(rp, xp, yp, n, cnt); + else + { + /* r = x - 2^c y */ + mpn_rshift(rp, yp, n, cnt); + return mpn_sub_n(rp, xp, rp, n); + } +} + /* multiplication (general) **************************************************/ /* NOTE: This is getting a bit messy. How can we clean this up? */ @@ -541,6 +591,7 @@ mp_limb_t _flint_mpn_mul(mp_ptr r, mp_srcptr x, mp_size_t xn, mp_srcptr y, mp_si void _flint_mpn_mul_n(mp_ptr r, mp_srcptr x, mp_srcptr y, mp_size_t n); mp_limb_t _flint_mpn_sqr(mp_ptr r, mp_srcptr x, mp_size_t n); +/* FIXME: This should be under addition */ MPN_EXTRAS_INLINE mp_limb_t flint_mpn_add_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) { diff --git a/src/mpn_extras/aorsrsh_n.c b/src/mpn_extras/aorsrsh_n.c new file mode 100644 index 0000000000..77bf7690ba --- /dev/null +++ b/src/mpn_extras/aorsrsh_n.c @@ -0,0 +1,88 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . 
+*/ + +#include "mpn_extras.h" + +#define DECL_AORSRSH(n) _DECL_AORSRSH(n) +#define _DECL_AORSRSH(n) \ +mp_limb_t flint_mpn_addrsh_##n(mp_ptr, mp_srcptr, mp_srcptr, unsigned int); \ +mp_limb_t flint_mpn_subrsh_##n(mp_ptr, mp_srcptr, mp_srcptr, unsigned int) + +#define ADDRSH(n) _ADDRSH(n) +#define _ADDRSH(n) flint_mpn_addrsh_##n +#define SUBRSH(n) _SUBRSH(n) +#define _SUBRSH(n) flint_mpn_subrsh_##n + +/* Herein we assume that x86 and ARM are equivalent. */ +#if FLINT_HAVE_ASSEMBLY_x86_64_adx || FLINT_HAVE_ASSEMBLY_armv8 +DECL_AORSRSH(1); +DECL_AORSRSH(2); +DECL_AORSRSH(3); +DECL_AORSRSH(4); +DECL_AORSRSH(5); +DECL_AORSRSH(6); +DECL_AORSRSH(7); +DECL_AORSRSH(8); +DECL_AORSRSH(9); +DECL_AORSRSH(10); +DECL_AORSRSH(11); +DECL_AORSRSH(12); +DECL_AORSRSH(13); +DECL_AORSRSH(14); +DECL_AORSRSH(15); +DECL_AORSRSH(16); + +/* TODO: Should probably rename these types so to not have two different types. + * Probably something like `mpn_binary_h_func`, where `h` is for hardcoded. */ +const flint_mpn_aorssh_func_t flint_mpn_addrsh_func_tab[] = +{ + NULL, + ADDRSH(1), + ADDRSH(2), + ADDRSH(3), + ADDRSH(4), + ADDRSH(5), + ADDRSH(6), + ADDRSH(7), + ADDRSH(8), + ADDRSH(9), + ADDRSH(10), + ADDRSH(11), + ADDRSH(12), + ADDRSH(13), + ADDRSH(14), + ADDRSH(15), + ADDRSH(16) +}; + +const flint_mpn_aorssh_func_t flint_mpn_subsh_func_tab[] = +{ + NULL, + SUBRSH(1), + SUBRSH(2), + SUBRSH(3), + SUBRSH(4), + SUBRSH(5), + SUBRSH(6), + SUBRSH(7), + SUBRSH(8), + SUBRSH(9), + SUBRSH(10), + SUBRSH(11), + SUBRSH(12), + SUBRSH(13), + SUBRSH(14), + SUBRSH(15), + SUBRSH(16) +}; +#else +typedef int this_file_is_empty; +#endif From 96786cd490c1661440376f074b7b5b7f2e5d7c7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 16:55:17 +0000 Subject: [PATCH 08/13] Add tests for aorsrsh --- src/mpn_extras.h | 5 +- src/mpn_extras/test/main.c | 2 + src/mpn_extras/test/t-aors_n.c | 35 ++++++++- src/mpn_extras/test/t-aorsrsh_n.c | 125 ++++++++++++++++++++++++++++++ 4 files changed, 162 insertions(+), 5 deletions(-) create mode 100644 src/mpn_extras/test/t-aorsrsh_n.c diff --git a/src/mpn_extras.h b/src/mpn_extras.h index 9b2877e20c..8f4d5ce307 100644 --- a/src/mpn_extras.h +++ b/src/mpn_extras.h @@ -463,7 +463,7 @@ mp_limb_t mpn_rsh1sub_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); #if FLINT_HAVE_ASSEMBLY_x86_64_adx # define FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH 17 -# define FLINT_HAVE_AORS_FUNC(n) ((n) < FLINT_MPN_AORS_FUNC_TAB_WIDTH) +# define FLINT_HAVE_AORSRSH_FUNC(n) ((n) < FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH) # define FLINT_MPN_ADDRSH_HARD(rp, xp, yp, n, cnt) (flint_mpn_addrsh_func_tab[n](rp, xp, yp, cnt)) # define FLINT_MPN_SUBRSH_HARD(rp, xp, yp, n, cnt) (flint_mpn_subrsh_func_tab[n](rp, xp, yp, cnt)) @@ -485,6 +485,7 @@ MPN_EXTRAS_INLINE mp_limb_t flint_mpn_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) { FLINT_ASSERT(n >= 1); + FLINT_ASSERT(1 <= cnt && cnt < FLINT_BITS); if (FLINT_HAVE_AORSRSH_FUNC(n)) return FLINT_MPN_ADDRSH_HARD(rp, xp, yp, n, cnt); @@ -499,12 +500,12 @@ MPN_EXTRAS_INLINE mp_limb_t flint_mpn_subrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) { FLINT_ASSERT(n >= 1); + FLINT_ASSERT(1 <= cnt && cnt < FLINT_BITS); if (FLINT_HAVE_AORSRSH_FUNC(n)) return FLINT_MPN_SUBRSH_HARD(rp, xp, yp, n, cnt); else { - /* r = x - 2^c y */ mpn_rshift(rp, yp, n, cnt); return mpn_sub_n(rp, xp, rp, n); } diff --git a/src/mpn_extras/test/main.c b/src/mpn_extras/test/main.c index 171a9b7342..7ed9a71dcc 100644 --- a/src/mpn_extras/test/main.c +++ 
b/src/mpn_extras/test/main.c @@ -13,6 +13,7 @@ #include "t-2add_n_inplace.c" #include "t-aors_n.c" +#include "t-aorsrsh_n.c" #include "t-divides.c" #include "t-divrem_preinv1.c" #include "t-divrem_preinvn.c" @@ -40,6 +41,7 @@ test_struct tests[] = { TEST_FUNCTION(flint_mpn_2add_n_inplace), TEST_FUNCTION(flint_mpn_aors_n), + TEST_FUNCTION(flint_mpn_aorsrsh_n), TEST_FUNCTION(flint_mpn_divides), TEST_FUNCTION(flint_mpn_divrem_preinv1), TEST_FUNCTION(flint_mpn_divrem_preinvn), diff --git a/src/mpn_extras/test/t-aors_n.c b/src/mpn_extras/test/t-aors_n.c index 0af210d94c..b40659ee97 100644 --- a/src/mpn_extras/test/t-aors_n.c +++ b/src/mpn_extras/test/t-aors_n.c @@ -26,6 +26,7 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) { int result; int type; + int aliasing; mp_limb_t cf, cg; mp_size_t n; mp_ptr fp, gp, xp, yp; @@ -34,6 +35,11 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) if (n_randint(state, 1 << 10) == UWORD(0)) n += N_STOR; + /* 0: No aliasing + * 1: fp = xp + * 2: fp = yp */ + aliasing = n_randint(state, 3); + fp = flint_malloc(sizeof(mp_limb_t) * n); gp = flint_malloc(sizeof(mp_limb_t) * n); xp = flint_malloc(sizeof(mp_limb_t) * n); @@ -46,12 +52,34 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) if (type == 0) { - cf = flint_mpn_add_n(fp, xp, yp, n); + if (aliasing == 0) + cf = flint_mpn_add_n(fp, xp, yp, n); + else if (aliasing == 1) + { + flint_mpn_copyi(fp, xp, n); + cf = flint_mpn_add_n(fp, fp, yp, n); + } + else + { + flint_mpn_copyi(fp, yp, n); + cf = flint_mpn_add_n(fp, xp, fp, n); + } cg = mpn_add_n(gp, xp, yp, n); } else { - cf = flint_mpn_sub_n(fp, xp, yp, n); + if (aliasing == 0) + cf = flint_mpn_sub_n(fp, xp, yp, n); + else if (aliasing == 1) + { + flint_mpn_copyi(fp, xp, n); + cf = flint_mpn_sub_n(fp, fp, yp, n); + } + else + { + flint_mpn_copyi(fp, yp, n); + cf = flint_mpn_sub_n(fp, xp, fp, n); + } cg = mpn_sub_n(gp, xp, yp, n); } @@ -59,6 +87,7 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) if (!result) TEST_FUNCTION_FAIL( "%s:\n" + "aliasing: %d\n" "ix = %wd\n" "n = %wd\n" "xp = %{ulong*}\n" @@ -66,7 +95,7 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) "FLINT (cy = %wu): %{ulong*}\n" "GMP (cy = %wu): %{ulong*}\n", type == 0 ? "flint_mpn_add_n" : "flint_mpn_sub_n", - ix, n, xp, n, yp, n, cf, fp, n, cg, gp, n + 1); + aliasing, ix, n, xp, n, yp, n, cf, fp, n, cg, gp, n + 1); flint_free(fp); flint_free(gp); diff --git a/src/mpn_extras/test/t-aorsrsh_n.c b/src/mpn_extras/test/t-aorsrsh_n.c new file mode 100644 index 0000000000..040fa59912 --- /dev/null +++ b/src/mpn_extras/test/t-aorsrsh_n.c @@ -0,0 +1,125 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + Copyright (C) 2024 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . 
+*/ + +#include "test_helpers.h" +#include "mpn_extras.h" + +#define N_MIN 1 +#define N_MAX (FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH - 1) +#define N_STOR (FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH + 10) + +static mp_limb_t mpn_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) +{ + mpn_rshift(rp, yp, n, cnt); + return mpn_add_n(rp, rp, xp, n); +} + +static mp_limb_t mpn_subrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) +{ + mpn_rshift(rp, yp, n, cnt); + return mpn_sub_n(rp, xp, rp, n); +} + +TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) +{ +#if FLINT_USE_AORSRSH_FUNC_TAB + slong ix; + + for (ix = 0; ix < 10000 * flint_test_multiplier(); ix++) + { + int result; + int type; + int aliasing; + unsigned int cnt; + mp_limb_t cf, cg; + mp_size_t n; + mp_ptr fp, gp, xp, yp; + + n = N_MIN + n_randint(state, N_MAX - N_MIN + 1); + if (n_randint(state, 1 << 10) == UWORD(0)) + n += N_STOR; + + /* 0: No aliasing + * 1: fp = xp + * 2: fp = yp */ + aliasing = n_randint(state, 3); + + fp = flint_malloc(sizeof(mp_limb_t) * n); + gp = flint_malloc(sizeof(mp_limb_t) * n); + xp = flint_malloc(sizeof(mp_limb_t) * n); + yp = flint_malloc(sizeof(mp_limb_t) * n); + + flint_mpn_rrandom(xp, state, n); + flint_mpn_rrandom(yp, state, n); + cnt = 1 + n_randint(state, FLINT_BITS - 1); + + type = n_randint(state, 2); + + if (type == 0) + { + if (aliasing == 0) + cf = flint_mpn_addrsh_n(fp, xp, yp, n, cnt); + else if (aliasing == 1) + { + flint_mpn_copyi(fp, xp, n); + cf = flint_mpn_addrsh_n(fp, fp, yp, n, cnt); + } + else + { + flint_mpn_copyi(fp, yp, n); + cf = flint_mpn_addrsh_n(fp, xp, fp, n, cnt); + } + cg = mpn_addrsh_n(gp, xp, yp, n, cnt); + } + else + { + if (aliasing == 0) + cf = flint_mpn_subrsh_n(fp, xp, yp, n, cnt); + else if (aliasing == 1) + { + flint_mpn_copyi(fp, xp, n); + cf = flint_mpn_subrsh_n(fp, fp, yp, n, cnt); + } + else + { + flint_mpn_copyi(fp, yp, n); + cf = flint_mpn_subrsh_n(fp, xp, fp, n, cnt); + } + cg = mpn_subrsh_n(gp, xp, yp, n, cnt); + } + + result = (cf == cg && mpn_cmp(fp, gp, n) == 0); + if (!result) + TEST_FUNCTION_FAIL( + "%s:\n" + "aliasing: %d\n" + "ix = %wd\n" + "n = %u\n" + "cnt = %wd\n" + "xp = %{ulong*}\n" + "yp = %{ulong*}\n" + "FLINT (cy = %wu): %{ulong*}\n" + "GMP (cy = %wu): %{ulong*}\n", + type == 0 ? "flint_mpn_add_n" : "flint_mpn_sub_n", + aliasing, ix, n, cnt, xp, n, yp, n, cf, fp, n, cg, gp, n + 1); + + flint_free(fp); + flint_free(gp); + flint_free(xp); + flint_free(yp); + } + + TEST_FUNCTION_END(state); +#else + TEST_FUNCTION_END_SKIPPED(state); +#endif +} From 65fadb40145f538b94460c2362cb583e70404119 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 19:14:58 +0000 Subject: [PATCH 09/13] blabla --- dev/gen_x86_aorsrsh.jl | 2 +- src/mpn_extras/aorsrsh_n.c | 4 +- src/mpn_extras/test/t-aors_n.c | 5 +- src/mpn_extras/test/t-aorsrsh_n.c | 22 +- .../x86_64/broadwell/aorsrsh_hard.asm | 206 +++++++++--------- 5 files changed, 122 insertions(+), 117 deletions(-) diff --git a/dev/gen_x86_aorsrsh.jl b/dev/gen_x86_aorsrsh.jl index daa696ae5b..c11655d8fc 100644 --- a/dev/gen_x86_aorsrsh.jl +++ b/dev/gen_x86_aorsrsh.jl @@ -76,7 +76,7 @@ function aorsrsh(n::Int; is_add::Bool = true) if !is_add push( s3) end - xor( R32(tnc), R32(tnc)) + xor( tnc, tnc) # We do not use 32 bit mode here since tnc = %r8. sub( cnt, tnc) # This is modulo 64, so -n = 64 - n. 
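+    # (shlx/shrx only consume the low 6 bits of the count register, so the
+    # 64-bit negate above still yields the intended shift by 64 - cnt.)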
xor( R32(sx), R32(sx)) diff --git a/src/mpn_extras/aorsrsh_n.c b/src/mpn_extras/aorsrsh_n.c index 77bf7690ba..c561c695e3 100644 --- a/src/mpn_extras/aorsrsh_n.c +++ b/src/mpn_extras/aorsrsh_n.c @@ -40,8 +40,6 @@ DECL_AORSRSH(14); DECL_AORSRSH(15); DECL_AORSRSH(16); -/* TODO: Should probably rename these types so to not have two different types. - * Probably something like `mpn_binary_h_func`, where `h` is for hardcoded. */ const flint_mpn_aorssh_func_t flint_mpn_addrsh_func_tab[] = { NULL, @@ -63,7 +61,7 @@ const flint_mpn_aorssh_func_t flint_mpn_addrsh_func_tab[] = ADDRSH(16) }; -const flint_mpn_aorssh_func_t flint_mpn_subsh_func_tab[] = +const flint_mpn_aorssh_func_t flint_mpn_subrsh_func_tab[] = { NULL, SUBRSH(1), diff --git a/src/mpn_extras/test/t-aors_n.c b/src/mpn_extras/test/t-aors_n.c index b40659ee97..55c4c4ae68 100644 --- a/src/mpn_extras/test/t-aors_n.c +++ b/src/mpn_extras/test/t-aors_n.c @@ -87,7 +87,7 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) if (!result) TEST_FUNCTION_FAIL( "%s:\n" - "aliasing: %d\n" + "aliasing: %s\n" "ix = %wd\n" "n = %wd\n" "xp = %{ulong*}\n" @@ -95,7 +95,8 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) "FLINT (cy = %wu): %{ulong*}\n" "GMP (cy = %wu): %{ulong*}\n", type == 0 ? "flint_mpn_add_n" : "flint_mpn_sub_n", - aliasing, ix, n, xp, n, yp, n, cf, fp, n, cg, gp, n + 1); + aliasing == 0 ? "none" : (aliasing == 1) ? "rp = xp" : "rp = yp", + ix, n, xp, n, yp, n, cf, fp, n, cg, gp, n); flint_free(fp); flint_free(gp); diff --git a/src/mpn_extras/test/t-aorsrsh_n.c b/src/mpn_extras/test/t-aorsrsh_n.c index 040fa59912..25d9075af9 100644 --- a/src/mpn_extras/test/t-aorsrsh_n.c +++ b/src/mpn_extras/test/t-aorsrsh_n.c @@ -17,13 +17,15 @@ #define N_MAX (FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH - 1) #define N_STOR (FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH + 10) -static mp_limb_t mpn_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) +static +mp_limb_t mpn_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) { mpn_rshift(rp, yp, n, cnt); return mpn_add_n(rp, rp, xp, n); } -static mp_limb_t mpn_subrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) +static +mp_limb_t mpn_subrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) { mpn_rshift(rp, yp, n, cnt); return mpn_sub_n(rp, xp, rp, n); @@ -51,7 +53,7 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) /* 0: No aliasing * 1: fp = xp * 2: fp = yp */ - aliasing = n_randint(state, 3); + aliasing = 0; /* n_randint(state, 3); */ fp = flint_malloc(sizeof(mp_limb_t) * n); gp = flint_malloc(sizeof(mp_limb_t) * n); @@ -101,16 +103,17 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) if (!result) TEST_FUNCTION_FAIL( "%s:\n" - "aliasing: %d\n" + "aliasing: %s\n" "ix = %wd\n" - "n = %u\n" - "cnt = %wd\n" + "n = %wd\n" + "cnt = %u\n" "xp = %{ulong*}\n" "yp = %{ulong*}\n" "FLINT (cy = %wu): %{ulong*}\n" "GMP (cy = %wu): %{ulong*}\n", - type == 0 ? "flint_mpn_add_n" : "flint_mpn_sub_n", - aliasing, ix, n, cnt, xp, n, yp, n, cf, fp, n, cg, gp, n + 1); + type == 0 ? "flint_mpn_addrsh_n" : "flint_mpn_subrsh_n", + aliasing == 0 ? "none" : (aliasing == 1) ? 
"rp = xp" : "rp = yp", + ix, n, cnt, xp, n, yp, n, cf, fp, n, cg, gp, n); flint_free(fp); flint_free(gp); @@ -123,3 +126,6 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) TEST_FUNCTION_END_SKIPPED(state); #endif } +#undef N_MIN +#undef N_MAX +#undef N_STOR diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm index b84a4df265..ec24c45e30 100644 --- a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -54,79 +54,79 @@ dnl rp[i] = u C mov dnl fi dnl Non-optimized version. -ifdef(blablablabla,` - ALIGN(16) -PROLOGUE(flint_mpn_aorsrsh_5) - xor R32(tnc), R32(tnc) - sub cnt, tnc - xor R32(sx), R32(sx) - - shrx cnt, 0*8(bp), s0 - mov 1*8(bp), s1 - shlx tnc, s1, s2 - lea (s0, s2), s2 -ifelse(OP,`add',` - add 0*8(ap), s2 - mov s2, 0*8(rp) -',` - mov 0*8(ap), s0 - sub s2, s0 - mov s0, 0*8(rp) -') - - shrx cnt, s1, s0 - mov 2*8(bp), s1 - shlx tnc, s1, s2 - lea (s0, s2), s2 -ifelse(OP,`add',` - adc 1*8(ap), s2 - mov s2, 1*8(rp) -',` - mov 1*8(ap), s0 - sbb s2, s0 - mov s0, 1*8(rp) -') - - shrx cnt, s1, s0 - mov 3*8(bp), s1 - shlx tnc, s1, s2 - lea (s0, s2), s2 -ifelse(OP,`add',` - adc 2*8(ap), s2 - mov s2, 2*8(rp) -',` - mov 2*8(ap), s0 - sbb s2, s0 - mov s0, 2*8(rp) -') - - shrx cnt, s1, s0 - mov 4*8(bp), s1 - shlx tnc, s1, s2 - lea (s0, s2), s2 -ifelse(OP,`add',` - adc 3*8(ap), s2 - mov s2, 3*8(rp) -',` - mov 3*8(ap), s0 - sbb s2, s0 - mov s0, 3*8(rp) -') - - shrx cnt, s1, s0 -ifelse(OP,`add',` - adc 4*8(ap), s0 - mov s0, 4*8(rp) -',` - mov 4*8(ap), s2 - sbb s0, s2 - mov s2, 4*8(rp) -') - - setc R8(sx) - ret -EPILOGUE() -',`') +dnl ifdef(blablablabla,` +dnl ALIGN(16) +dnl PROLOGUE(flint_mpn_aorsrsh_5) +dnl xor tnc, tnc +dnl sub cnt, tnc +dnl xor R32(sx), R32(sx) +dnl +dnl shrx cnt, 0*8(bp), s0 +dnl mov 1*8(bp), s1 +dnl shlx tnc, s1, s2 +dnl lea (s0, s2), s2 +dnl ifelse(OP,`add',` +dnl add 0*8(ap), s2 +dnl mov s2, 0*8(rp) +dnl ',` +dnl mov 0*8(ap), s0 +dnl sub s2, s0 +dnl mov s0, 0*8(rp) +dnl ') +dnl +dnl shrx cnt, s1, s0 +dnl mov 2*8(bp), s1 +dnl shlx tnc, s1, s2 +dnl lea (s0, s2), s2 +dnl ifelse(OP,`add',` +dnl adc 1*8(ap), s2 +dnl mov s2, 1*8(rp) +dnl ',` +dnl mov 1*8(ap), s0 +dnl sbb s2, s0 +dnl mov s0, 1*8(rp) +dnl ') +dnl +dnl shrx cnt, s1, s0 +dnl mov 3*8(bp), s1 +dnl shlx tnc, s1, s2 +dnl lea (s0, s2), s2 +dnl ifelse(OP,`add',` +dnl adc 2*8(ap), s2 +dnl mov s2, 2*8(rp) +dnl ',` +dnl mov 2*8(ap), s0 +dnl sbb s2, s0 +dnl mov s0, 2*8(rp) +dnl ') +dnl +dnl shrx cnt, s1, s0 +dnl mov 4*8(bp), s1 +dnl shlx tnc, s1, s2 +dnl lea (s0, s2), s2 +dnl ifelse(OP,`add',` +dnl adc 3*8(ap), s2 +dnl mov s2, 3*8(rp) +dnl ',` +dnl mov 3*8(ap), s0 +dnl sbb s2, s0 +dnl mov s0, 3*8(rp) +dnl ') +dnl +dnl shrx cnt, s1, s0 +dnl ifelse(OP,`add',` +dnl adc 4*8(ap), s0 +dnl mov s0, 4*8(rp) +dnl ',` +dnl mov 4*8(ap), s2 +dnl sbb s0, s2 +dnl mov s2, 4*8(rp) +dnl ') +dnl +dnl setc R8(sx) +dnl ret +dnl EPILOGUE() +dnl ') TEXT @@ -142,7 +142,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_2) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -160,7 +160,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_3) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -184,7 +184,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_4) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -214,7 +214,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_5) - xor R32(tnc), R32(tnc) + 
xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -250,7 +250,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_6) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -292,7 +292,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_7) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -340,7 +340,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_8) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -394,7 +394,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_9) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -454,7 +454,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_10) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -520,7 +520,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_11) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -592,7 +592,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_12) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -670,7 +670,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_13) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -754,7 +754,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_14) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -844,7 +844,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_15) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -940,7 +940,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_16) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1054,7 +1054,7 @@ EPILOGUE() dnl Modified to avoid pushing and popping s3 ALIGN(16) PROLOGUE(flint_mpn_subrsh_2) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1075,7 +1075,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_3) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1104,7 +1104,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_4) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1140,7 +1140,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_5) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1183,7 +1183,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_6) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1233,7 +1233,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_7) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1290,7 +1290,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_8) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1354,7 +1354,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_9) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1425,7 +1425,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_10) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1503,7 +1503,7 @@ 
EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_11) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1588,7 +1588,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_12) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1680,7 +1680,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_13) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1779,7 +1779,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_14) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1885,7 +1885,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_15) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1998,7 +1998,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_16) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 From 877ec63b6988c3685ffbe7edd66c0d46862f5a2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 19:25:26 +0000 Subject: [PATCH 10/13] bla --- dev/gen_x86_aorsrsh.jl | 2 +- src/mpn_extras/test/t-aorsrsh_n.c | 2 +- .../x86_64/broadwell/aorsrsh_hard.asm | 30 +++++++++---------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/dev/gen_x86_aorsrsh.jl b/dev/gen_x86_aorsrsh.jl index c11655d8fc..8881a996e5 100644 --- a/dev/gen_x86_aorsrsh.jl +++ b/dev/gen_x86_aorsrsh.jl @@ -119,7 +119,7 @@ function aorsrsh(n::Int; is_add::Bool = true) mov( s3, rp(ix)) elseif ix == n - 1 mov( ap(ix), s0) - sub( s1, s0) + sbb( s1, s0) mov( s0, rp(ix)) else mov( ap(ix), s3) diff --git a/src/mpn_extras/test/t-aorsrsh_n.c b/src/mpn_extras/test/t-aorsrsh_n.c index 25d9075af9..24709f0e49 100644 --- a/src/mpn_extras/test/t-aorsrsh_n.c +++ b/src/mpn_extras/test/t-aorsrsh_n.c @@ -102,7 +102,7 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) result = (cf == cg && mpn_cmp(fp, gp, n) == 0); if (!result) TEST_FUNCTION_FAIL( - "%s:\n" + "function: %s\n" "aliasing: %s\n" "ix = %wd\n" "n = %wd\n" diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm index ec24c45e30..ed6d264c06 100644 --- a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -1066,7 +1066,7 @@ PROLOGUE(flint_mpn_subrsh_2) sub s2, tnc mov tnc, 0*8(rp) mov 1*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 1*8(rp) setc R8(sx) ret @@ -1094,7 +1094,7 @@ PROLOGUE(flint_mpn_subrsh_3) sbb s2, s3 mov s3, 1*8(rp) mov 2*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 2*8(rp) pop s3 setc R8(sx) @@ -1130,7 +1130,7 @@ PROLOGUE(flint_mpn_subrsh_4) sbb s2, s3 mov s3, 2*8(rp) mov 3*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 3*8(rp) pop s3 setc R8(sx) @@ -1173,7 +1173,7 @@ PROLOGUE(flint_mpn_subrsh_5) sbb s2, s3 mov s3, 3*8(rp) mov 4*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 4*8(rp) pop s3 setc R8(sx) @@ -1223,7 +1223,7 @@ PROLOGUE(flint_mpn_subrsh_6) sbb s2, s3 mov s3, 4*8(rp) mov 5*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 5*8(rp) pop s3 setc R8(sx) @@ -1280,7 +1280,7 @@ PROLOGUE(flint_mpn_subrsh_7) sbb s2, s3 mov s3, 5*8(rp) mov 6*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 6*8(rp) pop s3 setc R8(sx) @@ -1344,7 +1344,7 @@ PROLOGUE(flint_mpn_subrsh_8) sbb s2, s3 mov s3, 6*8(rp) mov 7*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 7*8(rp) pop s3 setc R8(sx) @@ -1415,7 +1415,7 @@ 
PROLOGUE(flint_mpn_subrsh_9) sbb s2, s3 mov s3, 7*8(rp) mov 8*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 8*8(rp) pop s3 setc R8(sx) @@ -1493,7 +1493,7 @@ PROLOGUE(flint_mpn_subrsh_10) sbb s2, s3 mov s3, 8*8(rp) mov 9*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 9*8(rp) pop s3 setc R8(sx) @@ -1578,7 +1578,7 @@ PROLOGUE(flint_mpn_subrsh_11) sbb s2, s3 mov s3, 9*8(rp) mov 10*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 10*8(rp) pop s3 setc R8(sx) @@ -1670,7 +1670,7 @@ PROLOGUE(flint_mpn_subrsh_12) sbb s2, s3 mov s3, 10*8(rp) mov 11*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 11*8(rp) pop s3 setc R8(sx) @@ -1769,7 +1769,7 @@ PROLOGUE(flint_mpn_subrsh_13) sbb s2, s3 mov s3, 11*8(rp) mov 12*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 12*8(rp) pop s3 setc R8(sx) @@ -1875,7 +1875,7 @@ PROLOGUE(flint_mpn_subrsh_14) sbb s2, s3 mov s3, 12*8(rp) mov 13*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 13*8(rp) pop s3 setc R8(sx) @@ -1988,7 +1988,7 @@ PROLOGUE(flint_mpn_subrsh_15) sbb s2, s3 mov s3, 13*8(rp) mov 14*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 14*8(rp) pop s3 setc R8(sx) @@ -2108,7 +2108,7 @@ PROLOGUE(flint_mpn_subrsh_16) sbb s2, s3 mov s3, 14*8(rp) mov 15*8(ap), s0 - sub s1, s0 + sbb s1, s0 mov s0, 15*8(rp) pop s3 setc R8(sx) From a537b899e9c4631969f2b431720dea2c3de11016 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 19:28:21 +0000 Subject: [PATCH 11/13] fixup --- src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm index ed6d264c06..5f79d39516 100644 --- a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -25,6 +25,7 @@ define(`s2', `%r11') define(`s3', `%rbx') dnl From n = 2 onwards, these are generated by `dev/gen_x86_aorsrsh.jl'. +dnl However, flint_mpn_subrsh_2 is touched up afterwards. 
dnl r <- a +/- 2^n b dnl From 55dd1381a4abed605cd253795d98cb2d094fd2d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 2 Dec 2024 20:20:52 +0000 Subject: [PATCH 12/13] fixup --- src/mpn_extras.h | 2 ++ src/mpn_extras/test/t-aorsrsh_n.c | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/mpn_extras.h b/src/mpn_extras.h index 8f4d5ce307..f4c6d216c3 100644 --- a/src/mpn_extras.h +++ b/src/mpn_extras.h @@ -491,6 +491,7 @@ mp_limb_t flint_mpn_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, return FLINT_MPN_ADDRSH_HARD(rp, xp, yp, n, cnt); else { + FLINT_ASSERT(rp != xp); mpn_rshift(rp, yp, n, cnt); return mpn_add_n(rp, rp, xp, n); } @@ -506,6 +507,7 @@ mp_limb_t flint_mpn_subrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, return FLINT_MPN_SUBRSH_HARD(rp, xp, yp, n, cnt); else { + FLINT_ASSERT(rp != xp); mpn_rshift(rp, yp, n, cnt); return mpn_sub_n(rp, xp, rp, n); } diff --git a/src/mpn_extras/test/t-aorsrsh_n.c b/src/mpn_extras/test/t-aorsrsh_n.c index 24709f0e49..acf346d00b 100644 --- a/src/mpn_extras/test/t-aorsrsh_n.c +++ b/src/mpn_extras/test/t-aorsrsh_n.c @@ -53,7 +53,7 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) /* 0: No aliasing * 1: fp = xp * 2: fp = yp */ - aliasing = 0; /* n_randint(state, 3); */ + aliasing = n_randint(state, 3); fp = flint_malloc(sizeof(mp_limb_t) * n); gp = flint_malloc(sizeof(mp_limb_t) * n); @@ -66,6 +66,10 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) type = n_randint(state, 2); + /* FIXME */ + if (n > N_MAX && aliasing == 1) + aliasing = 0; + if (type == 0) { if (aliasing == 0) @@ -112,7 +116,7 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) "FLINT (cy = %wu): %{ulong*}\n" "GMP (cy = %wu): %{ulong*}\n", type == 0 ? "flint_mpn_addrsh_n" : "flint_mpn_subrsh_n", - aliasing == 0 ? "none" : (aliasing == 1) ? "rp = xp" : "rp = yp", + aliasing == 0 ? "none" : (aliasing == 1 ? "rp = xp" : "rp = yp"), ix, n, cnt, xp, n, yp, n, cf, fp, n, cg, gp, n); flint_free(fp); From 9e7f4625487a3a94f4a83b2ece92c22d0e398822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albin=20Ahlb=C3=A4ck?= Date: Mon, 9 Dec 2024 13:10:33 +0000 Subject: [PATCH 13/13] Stash for aorsrsh for ARM64 --- src/mpn_extras/arm64/aorsrsh_hard.asm | 71 +++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 src/mpn_extras/arm64/aorsrsh_hard.asm diff --git a/src/mpn_extras/arm64/aorsrsh_hard.asm b/src/mpn_extras/arm64/aorsrsh_hard.asm new file mode 100644 index 0000000000..6b21e27fec --- /dev/null +++ b/src/mpn_extras/arm64/aorsrsh_hard.asm @@ -0,0 +1,71 @@ +dnl +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl This file is part of FLINT. +dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . +dnl + +include(`config.m4') + +dnl Everything from n = 2 and onwards is generated by +dnl $topdir/dev/gen_arm_aors.jl. +dnl +dnl This generation was constructed with processors with Apple silicon in mind. +dnl Processors decoding less than 6 operations per cycle, or few store and load +dnl units may have worse performance. 
+
+define(`rp', `x0')
+define(`ap', `x1')
+define(`bp', `x2')
+define(`cnt', `x3')
+
+define(`sx', `x0') C Beware that this is synonymous with rp
+define(`s0', `x3')
+define(`s1', `x4')
+define(`s2', `x5')
+define(`s3', `x6')
+define(`s4', `x7')
+define(`s5', `x8')
+define(`s6', `x9')
+define(`s7', `x10')
+define(`s8', `x11')
+define(`s9', `x12')
+define(`s10', `x13')
+define(`s11', `x14')
+define(`s12', `x15')
+define(`s13', `x16')
+define(`s14', `x17')
+
+dnl r <- a +/- 2^n b
+dnl
+dnl For 0 <= i < n - 1, we have
+dnl
+dnl	r_{i} = a_{i} +/- (b_{i} >> n + b_{i + 1} << (64 - n)),
+dnl
+dnl and
+dnl
+dnl	r_{n - 1} = a_{n - 1} +/- (b_{n - 1} >> n).
+
+PROLOGUE(flint_mpn_aorsrsh(1))
+	C cnt and s0 both live in x3, so consume cnt before loading ap[0].
+	ldr	s1, [bp,#0*8]
+	lsr	s1, s1, cnt
+	ldr	s0, [ap,#0*8]
+	OP	s0, s0, s1
+	str	s0, [rp,#0*8]
+	cset	sx, CC
+	ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_aors_2)
+	ldp	s0, s2, [ap,#0*8]
+	ldp	s1, s3, [bp,#0*8]
+	OP	s0, s0, s1
+	OPC	s2, s2, s3
+	stp	s0, s2, [rp,#0*8]
+	cset	sx, CC
+	ret
+EPILOGUE()
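
Reviewer note (illustrative, not part of the patch series): the per-limb formula in the arm64 comments above is equivalent to composing a right shift with an addition or subtraction, which is exactly what the generic fallback in mpn_extras.h and the reference functions in t-aorsrsh_n.c do via mpn_rshift followed by mpn_add_n/mpn_sub_n. A minimal portable C sketch of the addrsh case, spelling the formula out limb by limb, follows; the helper name ref_addrsh_n is made up here, and it assumes 1 <= cnt <= GMP_NUMB_BITS - 1, as in the test.

#include <gmp.h>

/* rp[] = xp[] + (yp[] >> cnt); returns the outgoing carry.
   Unlike the generic fallback, which asserts rp != xp, this forward loop
   tolerates both rp == xp and rp == yp. */
static mp_limb_t
ref_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt)
{
    mp_limb_t cy = 0;
    mp_size_t i;

    for (i = 0; i < n; i++)
    {
        mp_limb_t sh, s, c1;

        /* r_i = a_i + ((b_i >> cnt) | (b_{i+1} << (64 - cnt))); the top
           limb has no neighbour and only contributes b_{n-1} >> cnt. */
        sh = yp[i] >> cnt;
        if (i + 1 < n)
            sh |= yp[i + 1] << (GMP_NUMB_BITS - cnt);

        s = xp[i] + sh;
        c1 = (s < xp[i]);
        rp[i] = s + cy;
        cy = c1 | (rp[i] < s);
    }

    return cy;
}

The subrsh case is the same loop with subtraction and borrow propagation in place of addition and carry.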