Skip to content

Commit 6879047

Browse files
Some explanations for mpn_ctx_mpn_mul
1 parent ad44e4c commit 6879047

File tree

1 file changed

+88
-3
lines changed

1 file changed

+88
-3
lines changed

src/fft_small/mpn_mul.c

Lines changed: 88 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*
22
Copyright (C) 2022 Daniel Schultz
3+
Copyright (C) 2024 Fredrik Johansson
34
45
This file is part of FLINT.
56
@@ -17,6 +18,92 @@
1718
#include "crt_helpers.h"
1819
#include "fft_small.h"
1920

21+
/*
22+
The following profiles are hardcoded.
23+
24+
np bits n m
25+
4 84 4 3
26+
4 88 4 3
27+
4 92 4 3
28+
5 112 4 4
29+
5 116 4 4
30+
5 120 4 4
31+
6 136 5 4
32+
6 140 5 4
33+
6 144 5 4
34+
7 160 6 5
35+
7 164 6 5
36+
7 168 6 5
37+
8 184 7 6
38+
8 188 7 6
39+
8 192 7 6
40+
41+
Here np is the number of primes {p[0], p[1], ...p[np-1]}. By default the
42+
following 50-bit primes are used:
43+
44+
p[0] = 1108307720798209
45+
p[1] = 659706976665601
46+
p[2] = 1086317488242689
47+
p[3] = 910395627798529
48+
p[4] = 699289395265537
49+
p[5] = 1022545813831681
50+
p[6] = 1013749720809473
51+
p[7] = 868614185943041
52+
53+
We suppose that the inputs to multiply are
54+
55+
{a[0], ..., a[an-1]} and {b[0], ..., b[bn-1]}
56+
57+
in base 2^64. They will be converted to
58+
59+
{a'[0], ..., a'[an'-1]} and {b'[0], ..., a'[bn'-1]}.
60+
61+
in base 2^bits, where an' = ceil(64*an/bits) and
62+
bn' = ceil(64*bn/bits). The code for performing the conversion
63+
actually processes the inputs as base 2^32 arrays of twice the lengths.
64+
65+
Each dot product in the convolution of a' and b' is bounded by
66+
67+
bn' * (2^bits-1)^2
68+
69+
and we need this to be smaller than prod_primes = p[0], ..., p[np-1]
70+
for the CRT reconstruction. A slight relaxation of this inequality is
71+
72+
ceil(64*bn/bits) * 2^(2*bits) <= prod_primes.
73+
74+
The function crt_data_find_bn_bound determines an upper bound for
75+
permissible bn such that this holds. Numerical values of the bn bounds
76+
are as follows:
77+
78+
np = 4, bits = 84, bn <= 2536637511
79+
np = 4, bits = 88, bn <= 10380583
80+
np = 4, bits = 92, bn <= 42390
81+
np = 5, bits = 112, bn <= 32822700
82+
np = 5, bits = 116, bn <= 132790
83+
np = 5, bits = 120, bn <= 534
84+
np = 6, bits = 136, bn <= 144789875
85+
np = 6, bits = 140, bn <= 582218
86+
np = 6, bits = 144, bn <= 2337
87+
np = 7, bits = 160, bn <= 613493872
88+
np = 7, bits = 164, bn <= 2456369
89+
np = 7, bits = 168, bn <= 9826
90+
np = 8, bits = 184, bn <= 2177184315
91+
np = 8, bits = 188, bn <= 8689506
92+
np = 8, bits = 192, bn <= 34662
93+
94+
Note that we do not require an >= bn for correctness, but for highly
95+
unbalanced multiplications calling mpn_ctx_mpn_mul with operands
96+
in this order gives a better bound.
97+
98+
The profile parameters n and m describe the size of CRT operands.
99+
Let P = p[0] * ... * p[n-1]. One CRT sum has the form
100+
101+
s = f[0]*x[0] + ... + f[np-1]*x[np-1]
102+
103+
where x[i] is a single limb, f[i] has m limbs, and the limb count n
104+
is large enough to hold s <= P * np.
105+
*/
106+
20107
void crt_data_init(crt_data_t C, ulong prime, ulong coeff_len, ulong nprimes)
21108
{
22109
C->prime = prime;
@@ -30,8 +117,6 @@ void crt_data_clear(crt_data_t C)
30117
flint_free(C->data);
31118
}
32119

33-
34-
35120
/*
36121
need ceil(64*bn/bits) <= prod_primes/2^(2*bits)
37122
i.e. (64*bn+bits-1)/bits <= prod_primes/2^(2*bits)
@@ -987,7 +1072,7 @@ void mpn_ctx_init(mpn_ctx_t R, ulong p)
9871072
R->profiles[i].bits = bits_; \
9881073
R->profiles[i].bn_bound = crt_data_find_bn_bound(R->crts + np_ - 1, bits_); \
9891074
R->profiles[i].to_ffts = CAT3(mpn_to_ffts, np_, bits_); \
990-
/*flint_printf("profile np = %wu, bits = %3wu, bn <= 0x%16wx\n", R->profiles[i].np, R->profiles[i].bits, R->profiles[i].bn_bound);*/ \
1075+
/*flint_printf("profile np = %wu, bits = %3wu, bn <= %wu\n", R->profiles[i].np, R->profiles[i].bits, R->profiles[i].bn_bound); */ \
9911076
R->profiles_size = i + 1;
9921077

9931078
PUSH_PROFILE(4, 84, 4,3);

0 commit comments

Comments
 (0)