Skip to content

Mlkem aarch64 intrinsics restructured #24419

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 0 additions & 20 deletions src/hotspot/cpu/aarch64/register_aarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,23 +58,3 @@ const char* PRegister::PRegisterImpl::name() const {
};
return is_valid() ? names[encoding()] : "pnoreg";
}

// convenience methods for splitting 8-way vector register sequences
// in half -- needed because vector operations can normally only be
// benefit from 4-way instruction parallelism

VSeq<4> vs_front(const VSeq<8>& v) {
return VSeq<4>(v.base(), v.delta());
}

VSeq<4> vs_back(const VSeq<8>& v) {
return VSeq<4>(v.base() + 4 * v.delta(), v.delta());
}

VSeq<4> vs_even(const VSeq<8>& v) {
return VSeq<4>(v.base(), v.delta() * 2);
}

VSeq<4> vs_odd(const VSeq<8>& v) {
return VSeq<4>(v.base() + 1, v.delta() * 2);
}
93 changes: 78 additions & 15 deletions src/hotspot/cpu/aarch64/register_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -436,19 +436,20 @@ enum RC { rc_bad, rc_int, rc_float, rc_predicate, rc_stack };
// inputs into front and back halves or odd and even halves (see
// convenience methods below).

// helper macro for computing register masks: yields the bit for the
// i'th register of a sequence starting at base with stride delta.
// arguments are fully parenthesized so that expression arguments
// (e.g. delta * 2 or base + 1) expand with the intended precedence.
#define VS_MASK_BIT(base, delta, i) (1 << ((base) + (delta) * (i)))

template<int N> class VSeq {
static_assert(N >= 2, "vector sequence length must be greater than 1");
static_assert(N <= 8, "vector sequence length must not exceed 8");
static_assert((N & (N - 1)) == 0, "vector sequence length must be power of two");
private:
int _base; // index of first register in sequence
int _delta; // increment to derive successive indices
public:
VSeq(FloatRegister base_reg, int delta = 1) : VSeq(base_reg->encoding(), delta) { }
VSeq(int base, int delta = 1) : _base(base), _delta(delta) {
assert (_base >= 0, "invalid base register");
assert (_delta >= 0, "invalid register delta");
assert ((_base + (N - 1) * _delta) < 32, "range exceeded");
assert (_base >= 0 && _base <= 31, "invalid base register");
assert ((_base + (N - 1) * _delta) >= 0, "register range underflow");
assert ((_base + (N - 1) * _delta) < 32, "register range overflow");
}
// indexed access to sequence
FloatRegister operator [](int i) const {
Expand All @@ -457,27 +458,89 @@ template<int N> class VSeq {
}
int mask() const {
int m = 0;
int bit = 1 << _base;
for (int i = 0; i < N; i++) {
m |= bit << (i * _delta);
m |= VS_MASK_BIT(_base, _delta, i);
}
return m;
}
int base() const { return _base; }
int delta() const { return _delta; }
bool is_constant() const { return _delta == 0; }
};

// declare convenience methods for splitting vector register sequences

VSeq<4> vs_front(const VSeq<8>& v);
VSeq<4> vs_back(const VSeq<8>& v);
VSeq<4> vs_even(const VSeq<8>& v);
VSeq<4> vs_odd(const VSeq<8>& v);

// methods for use in asserts to check VSeq inputs and oupts are
// methods for use in asserts to check VSeq inputs and outputs are
// either disjoint or equal

template<int N, int M> bool vs_disjoint(const VSeq<N>& n, const VSeq<M>& m) { return (n.mask() & m.mask()) == 0; }
template<int N> bool vs_same(const VSeq<N>& n, const VSeq<N>& m) { return n.mask() == m.mask(); }

// method for use in asserts to check whether registers appearing in
// an output sequence will be written before they are read from an
// input sequence.
//
// Operation i is assumed to read vin[i] and then write vout[i], with
// the N operations issued in index order. A same-index overlap
// (vout[i] == vin[i]) is therefore benign; only a write at some step
// j < i that hits the register read at step i counts as a clash.

template<int N> bool vs_write_before_read(const VSeq<N>& vout, const VSeq<N>& vin) {
  int b_in = vin.base();
  int d_in = vin.delta();
  int b_out = vout.base();
  int d_out = vout.delta();
  int mask_read = vin.mask(); // all pending reads
  int mask_write = 0;         // no writes as yet

  // iterate to i == N - 1 inclusive so that the write recorded at
  // step N - 2 is still checked against the read pending at step
  // N - 1 (the write at step N - 1 itself cannot precede any read)
  for (int i = 0; i < N; i++) {
    // check whether a pending read clashes with an earlier write
    if ((mask_write & mask_read) != 0) {
      return true;
    }
    // retire this step's read -- unless the input is a constant
    // sequence (delta == 0), in which case the same register is
    // still read by every later step and must remain pending
    if (d_in != 0) {
      mask_read ^= VS_MASK_BIT(b_in, d_in, i);
    }
    // record this step's write
    mask_write |= VS_MASK_BIT(b_out, d_out, i);
  }
  // no write before read
  return false;
}

// convenience methods for splitting 8-way or 4-way vector register
// sequences in half -- needed because vector operations can normally
// benefit from 4-way instruction parallelism or, occasionally, 2-way
// parallelism

template<int N>
VSeq<N/2> vs_front(const VSeq<N>& seq) {
  static_assert(N >= 2 && N % 2 == 0, "sequence length must be even");
  // the first half starts at the original base and keeps the stride
  return VSeq<N/2>(seq.base(), seq.delta());
}

template<int N>
VSeq<N/2> vs_back(const VSeq<N>& seq) {
  static_assert(N >= 2 && N % 2 == 0, "sequence length must be even");
  // the second half begins N/2 strides beyond the original base
  int half_offset = (N / 2) * seq.delta();
  return VSeq<N/2>(seq.base() + half_offset, seq.delta());
}

template<int N>
VSeq<N/2> vs_even(const VSeq<N>& seq) {
  static_assert(N >= 2 && N % 2 == 0, "sequence length must be even");
  // elements 0, 2, 4, ...: same base, doubled stride
  return VSeq<N/2>(seq.base(), 2 * seq.delta());
}

template<int N>
VSeq<N/2> vs_odd(const VSeq<N>& seq) {
  static_assert(N >= 2 && N % 2 == 0, "sequence length must be even");
  // elements 1, 3, 5, ...: base shifted by one stride, doubled stride
  return VSeq<N/2>(seq.base() + seq.delta(), 2 * seq.delta());
}

// convenience method to construct a vector register sequence that
// indexes its elements in reverse order to the original

template<int N>
VSeq<N> vs_reverse(const VSeq<N>& seq) {
  // start from the last register of the sequence and step backwards
  int last = seq.base() + (N - 1) * seq.delta();
  return VSeq<N>(last, -seq.delta());
}

#endif // CPU_AARCH64_REGISTER_AARCH64_HPP
2 changes: 1 addition & 1 deletion src/hotspot/cpu/aarch64/stubDeclarations_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
do_arch_blob, \
do_arch_entry, \
do_arch_entry_init) \
do_arch_blob(compiler, 55000 ZGC_ONLY(+5000)) \
do_arch_blob(compiler, 75000 ZGC_ONLY(+5000)) \
do_stub(compiler, vector_iota_indices) \
do_arch_entry(aarch64, compiler, vector_iota_indices, \
vector_iota_indices, vector_iota_indices) \
Expand Down
Loading