Skip to content

Optimize L2P for GPUs #158

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 128 additions & 51 deletions sumpy/e2p.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,12 @@
"""

from abc import ABC, abstractmethod
from pytools import memoize_method

import numpy as np
import loopy as lp
from loopy.kernel.data import LocalInameTag
import pymbolic.primitives as prim

from sumpy.tools import KernelCacheMixin, gather_loopy_arguments
from sumpy.codegen import register_optimization_preambles
Expand Down Expand Up @@ -71,7 +74,7 @@ def __init__(self, ctx, expansion, kernels,

self.context = ctx
self.expansion = expansion
self.kernels = kernels
self.kernels = tuple(kernels)
self.name = name or self.default_name
self.device = device

Expand All @@ -82,15 +85,18 @@ def __init__(self, ctx, expansion, kernels,
def default_name(self):
pass

@memoize_method
def get_loopy_evaluator_and_optimizations(self):
return self.expansion.loopy_evaluator_and_optimizations(self.kernels)

def get_cache_key(self):
return (type(self).__name__, self.expansion, tuple(self.kernels))

def add_loopy_eval_callable(
self, loopy_knl: lp.TranslationUnit) -> lp.TranslationUnit:
inner_knl = self.expansion.loopy_evaluator(self.kernels)
inner_knl, _ = self.get_loopy_evaluator_and_optimizations()
loopy_knl = lp.merge([loopy_knl, inner_knl])
loopy_knl = lp.inline_callable_kernel(loopy_knl, "e2p")
loopy_knl = lp.remove_unused_inames(loopy_knl)
for kernel in self.kernels:
loopy_knl = kernel.prepare_loopy_kernel(loopy_knl)
loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")
Expand Down Expand Up @@ -118,33 +124,41 @@ class E2PFromSingleBox(E2PBase):
def default_name(self):
return "e2p_from_single_box"

def get_kernel(self):
def get_kernel(self, max_ntargets_in_one_box):
ncoeffs = len(self.expansion)
loopy_args = self.get_loopy_args()
max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box))

loopy_knl = lp.make_kernel(
[
"{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
"{[itgt,idim]: itgt_start<=itgt<itgt_end and 0<=idim<dim}",
"{[idim]: 0<=idim<dim}",
"{[itgt_offset]: 0<=itgt_offset<max_ntargets_in_one_box}",
"{[icoeff]: 0<=icoeff<ncoeffs}",
"{[iknl]: 0<=iknl<nresults}",
"{[dummy]: 0<=dummy<max_work_items}",
],
self.get_kernel_scaling_assignment()
+ ["""
for itgt_box
<> tgt_ibox = target_boxes[itgt_box]
<> itgt_start = box_target_starts[tgt_ibox]
<> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox]
<> tgt_ibox = target_boxes[itgt_box] {id=fetch_init0}
<> itgt_start = box_target_starts[tgt_ibox] {id=fetch_init1}
<> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] \
{id=fetch_init2}

<> center[idim] = centers[idim, tgt_ibox] {id=fetch_center}

<> coeffs[icoeff] = \
src_expansions[tgt_ibox - src_base_ibox, icoeff] \
{id=fetch_coeffs}

for itgt
<> tgt[idim] = targets[idim, itgt] {id=fetch_tgt,dup=idim}
<> result_temp[iknl] = 0 {id=init_result,dup=iknl}
for itgt_offset
<> itgt = itgt_start + itgt_offset
<> run_itgt = itgt<itgt_end
<> tgt[idim] = targets[idim, itgt] {id=fetch_tgt, \
dup=idim,if=run_itgt}
<> result_temp[iknl] = 0 {id=init_result,dup=iknl, \
if=run_itgt}
[iknl]: result_temp[iknl] = e2p(
[iknl]: result_temp[iknl],
[icoeff]: coeffs[icoeff],
Expand All @@ -156,9 +170,9 @@ def get_kernel(self):
targets,
""" + ",".join(arg.name for arg in loopy_args) + """
) {dep=fetch_coeffs:fetch_center:init_result:fetch_tgt,\
id=update_result}
id=update_result,if=run_itgt}
result[iknl, itgt] = result_temp[iknl] * kernel_scaling \
{id=write_result,dep=update_result}
{id=write_result,dep=update_result,if=run_itgt}
end
end
"""],
Expand All @@ -183,7 +197,9 @@ def get_kernel(self):
silenced_warnings="write_race(*_result)",
default_offset=lp.auto,
fixed_parameters={"dim": self.dim, "nresults": len(self.kernels),
"ncoeffs": ncoeffs},
"ncoeffs": ncoeffs,
"max_work_items": max_work_items,
"max_ntargets_in_one_box": max_ntargets_in_one_box},
lang_version=MOST_RECENT_LANGUAGE_VERSION)

loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
Expand All @@ -192,15 +208,41 @@ def get_kernel(self):

return loopy_knl

def get_optimized_kernel(self):
# FIXME
knl = self.get_kernel()
def get_optimized_kernel(self, max_ntargets_in_one_box):
inner_knl, optimizations = self.get_loopy_evaluator_and_optimizations()
knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box)
knl = lp.tag_inames(knl, {"itgt_box": "g.0"})
knl = lp.split_iname(knl, "itgt_offset", 256, inner_tag="l.0")
knl = lp.split_iname(knl, "icoeff", 256, inner_tag="l.0")
knl = lp.add_inames_to_insn(knl, "dummy",
"id:fetch_init* or id:fetch_center or id:kernel_scaling")
knl = lp.add_inames_to_insn(knl, "itgt_box", "id:kernel_scaling")
knl = lp.tag_inames(knl, {"dummy": "l.0"})
knl = lp.set_temporary_address_space(knl, "coeffs", lp.AddressSpace.LOCAL)
knl = lp.set_options(knl,
enforce_variable_access_ordered="no_check")
enforce_variable_access_ordered="no_check", write_code=False)

for transform in optimizations:
knl = transform(knl)
knl = register_optimization_preambles(knl, self.device)

# If there are inames tagged as local in the inner kernel
# we need to remove the iname itgt_offset_inner from instructions
# within those inames and also remove the predicate run_itgt
# which depends on itgt_offset_inner
tagged_inames = [iname.name for iname in
knl.default_entrypoint.inames.values() if
iname.name.startswith("e2p_") and any(
isinstance(tag, LocalInameTag) for tag in iname.tags)]
if tagged_inames:
insn_ids = [insn.id for insn in knl.default_entrypoint.instructions
if any(iname in tagged_inames for iname in insn.within_inames)]
match = " or ".join(f"id:{insn_id}" for insn_id in insn_ids)
knl = lp.remove_inames_from_insn(knl,
frozenset(["itgt_offset_inner"]), match)
knl = lp.remove_predicates_from_insn(knl,
frozenset([prim.Variable("run_itgt")]), match)

return knl

def __call__(self, queue, **kwargs):
Expand All @@ -212,7 +254,9 @@ def __call__(self, queue, **kwargs):
:arg centers:
:arg targets:
"""
knl = self.get_cached_kernel_executor()
max_ntargets_in_one_box = kwargs.pop("max_ntargets_in_one_box")
knl = self.get_cached_kernel_executor(
max_ntargets_in_one_box=max_ntargets_in_one_box)

centers = kwargs.pop("centers")
# "1" may be passed for rscale, which won't have its type
Expand All @@ -231,42 +275,49 @@ class E2PFromCSR(E2PBase):
def default_name(self):
return "e2p_from_csr"

def get_kernel(self):
def get_kernel(self, max_ntargets_in_one_box):
ncoeffs = len(self.expansion)
loopy_args = self.get_loopy_args()
max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box))

loopy_knl = lp.make_kernel(
[
"{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
"{[itgt]: itgt_start<=itgt<itgt_end}",
"{[itgt_offset]: 0<=itgt_offset<max_ntargets_in_one_box}",
"{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_end }",
"{[idim]: 0<=idim<dim}",
"{[icoeff]: 0<=icoeff<ncoeffs}",
"{[iknl]: 0<=iknl<nresults}",
"{[dummy]: 0<=dummy<max_work_items}",
],
self.get_kernel_scaling_assignment()
+ ["""
for itgt_box
<> tgt_ibox = target_boxes[itgt_box]
<> itgt_start = box_target_starts[tgt_ibox]
<> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox]

for itgt
<> tgt[idim] = targets[idim,itgt] {id=fetch_tgt,dup=idim}

<> isrc_box_start = source_box_starts[itgt_box]
<> isrc_box_end = source_box_starts[itgt_box+1]

<> result_temp[iknl] = 0 {id=init_result,dup=iknl}
for isrc_box
<> src_ibox = source_box_lists[isrc_box]
<> coeffs[icoeff] = \
<> tgt_ibox = target_boxes[itgt_box] {id=init_box0}
<> itgt_start = box_target_starts[tgt_ibox] {id=init_box1}
<> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] \
{id=init_box2}
<> isrc_box_start = source_box_starts[itgt_box] {id=init_box3}
<> isrc_box_end = source_box_starts[itgt_box+1] {id=init_box4}

<> result_temp[itgt_offset, iknl] = 0 \
{id=init_result,dup=iknl}
for isrc_box
<> src_ibox = source_box_lists[isrc_box] {id=fetch_src_box}
<> coeffs[icoeff] = \
src_expansions[src_ibox - src_base_ibox, icoeff] \
{id=fetch_coeffs,dup=icoeff}
<> center[idim] = centers[idim, src_ibox] \
{id=fetch_coeffs}
<> center[idim] = centers[idim, src_ibox] \
{dup=idim,id=fetch_center}
[iknl]: result_temp[iknl] = e2p(
[iknl]: result_temp[iknl],

for itgt_offset
<> itgt = itgt_start + itgt_offset
<> run_itgt = itgt<itgt_end
<> tgt[idim] = targets[idim,itgt] \
{id=fetch_tgt,dup=idim,if=run_itgt}

[iknl]: result_temp[itgt_offset, iknl] = e2p(
[iknl]: result_temp[itgt_offset, iknl],
[icoeff]: coeffs[icoeff],
[idim]: center[idim],
[idim]: tgt[idim],
Expand All @@ -276,11 +327,18 @@ def get_kernel(self):
targets,
""" + ",".join(arg.name for arg in loopy_args) + """
) {id=update_result, \
dep=fetch_coeffs:fetch_center:fetch_tgt:init_result}
dep=fetch_coeffs:fetch_center:fetch_tgt:init_result, \
if=run_itgt}
end
result[iknl, itgt] = result[iknl, itgt] + result_temp[iknl] \
* kernel_scaling \
{dep=update_result:init_result,id=write_result,dup=iknl}
end
for itgt_offset
<> itgt2 = itgt_start + itgt_offset {id=init_itgt_for_write}
<> run_itgt2 = itgt_start + itgt_offset < itgt_end \
{id=init_cond_for_write}
result[iknl, itgt2] = result[iknl, itgt2] + result_temp[ \
itgt_offset, iknl] * kernel_scaling \
{dep=update_result:init_result,id=write_result, \
dup=iknl,if=run_itgt2}
end
end
"""],
Expand Down Expand Up @@ -308,30 +366,49 @@ def get_kernel(self):
fixed_parameters={
"ncoeffs": ncoeffs,
"dim": self.dim,
"max_work_items": max_work_items,
"max_ntargets_in_one_box": max_ntargets_in_one_box,
"nresults": len(self.kernels)},
lang_version=MOST_RECENT_LANGUAGE_VERSION)

loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
loopy_knl = lp.tag_inames(loopy_knl, "iknl*:unr")
loopy_knl = lp.prioritize_loops(loopy_knl, "itgt_box,itgt,isrc_box")
loopy_knl = lp.prioritize_loops(loopy_knl, "itgt_box,isrc_box,itgt_offset")
loopy_knl = self.add_loopy_eval_callable(loopy_knl)
loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")

return loopy_knl

def get_optimized_kernel(self):
# FIXME
knl = self.get_kernel()
knl = lp.tag_inames(knl, {"itgt_box": "g.0"})
def get_optimized_kernel(self, max_ntargets_in_one_box):
_, optimizations = self.get_loopy_evaluator_and_optimizations()
knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a comment to explain the idea?

knl = lp.tag_inames(knl, {"itgt_box": "g.0", "dummy": "l.0"})
knl = lp.unprivatize_temporaries_with_inames(knl,
"itgt_offset", "result_temp")
knl = lp.split_iname(knl, "itgt_offset", 256, inner_tag="l.0")
knl = lp.split_iname(knl, "icoeff", 256, inner_tag="l.0")
knl = lp.privatize_temporaries_with_inames(knl,
"itgt_offset_outer", "result_temp")
knl = lp.duplicate_inames(knl, "itgt_offset_outer", "id:init_result")
knl = lp.duplicate_inames(knl, "itgt_offset_outer",
"id:write_result or id:init_itgt_for_write or id:init_cond_for_write")
knl = lp.add_inames_to_insn(knl, "dummy",
"id:init_box* or id:fetch_src_box or id:fetch_center "
"or id:kernel_scaling")
knl = lp.add_inames_to_insn(knl, "itgt_box", "id:kernel_scaling")
knl = lp.add_inames_to_insn(knl, "itgt_offset_inner", "id:fetch_init*")
knl = lp.set_temporary_address_space(knl, "coeffs", lp.AddressSpace.LOCAL)
knl = lp.set_options(knl,
enforce_variable_access_ordered="no_check")
enforce_variable_access_ordered="no_check", write_code=False)
for transform in optimizations:
knl = transform(knl)
knl = register_optimization_preambles(knl, self.device)

return knl

def __call__(self, queue, **kwargs):
knl = self.get_cached_kernel_executor()
max_ntargets_in_one_box = kwargs.pop("max_ntargets_in_one_box")
knl = self.get_cached_kernel_executor(
max_ntargets_in_one_box=max_ntargets_in_one_box)

centers = kwargs.pop("centers")
# "1" may be passed for rscale, which won't have its type
Expand Down
11 changes: 7 additions & 4 deletions sumpy/expansion/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@

from abc import ABC, abstractmethod
from typing import (
Any, ClassVar, Dict, Hashable, List, Optional, Sequence, Tuple, Type)
Any, ClassVar, Dict, Hashable, List, Optional, Sequence, Tuple, Type,
Callable)

from pytools import memoize_method
import loopy as lp
Expand Down Expand Up @@ -66,9 +67,9 @@ class ExpansionBase(ABC):
.. automethod:: get_coefficient_identifiers
.. automethod:: coefficients_from_source
.. automethod:: coefficients_from_source_vec
.. automethod:: loopy_expansion_formation
.. automethod:: evaluate
.. automethod:: loopy_evaluator
.. automethod:: loopy_expansion_formation
.. automethod:: loopy_evaluator_and_optimizations

.. automethod:: with_kernel
.. automethod:: copy
Expand Down Expand Up @@ -183,7 +184,9 @@ def evaluate(self, kernel, coeffs, bvec, rscale, sac=None):
in *coeffs*.
"""

def loopy_evaluator(self, kernels: Sequence[Kernel]) -> lp.TranslationUnit:
def loopy_evaluator_and_optimizations(self, kernels: Sequence[Kernel]) \
-> Tuple[lp.TranslationUnit, Sequence[
Callable[[lp.TranslationUnit], lp.TranslationUnit]]]:
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • One optimization callable might suffice.
  • loopy_evaluator_and_transform?
  • Consistency with get_inner_knl_and_optimizations?

(Don't feel strongly about any of this.)

"""
:returns: a :mod:`loopy` kernel that returns the evaluated
target transforms of the potential given by *kernels*.
Expand Down
15 changes: 15 additions & 0 deletions sumpy/expansion/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@
VolumeTaylorExpansionMixin,
LinearPDEConformingVolumeTaylorExpansion)
from sumpy.tools import add_to_sac, mi_increment_axis
from sumpy.kernel import Kernel

import loopy as lp

from typing import Sequence, Callable, Tuple

import logging
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -405,6 +410,16 @@ def loopy_translate_from(self, src_expansion):
f"A direct loopy kernel for translation from "
f"{src_expansion} to {self} is not implemented.")

def loopy_evaluator_and_optimizations(self, kernels: Sequence[Kernel]) \
-> Tuple[lp.TranslationUnit, Sequence[
Callable[[lp.TranslationUnit], lp.TranslationUnit]]]:
from sumpy.expansion.loopy import (make_l2p_loopy_kernel_for_volume_taylor,
make_e2p_loopy_kernel)
try:
return make_l2p_loopy_kernel_for_volume_taylor(self, kernels)
except NotImplementedError:
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make a sub-brand of NotImplementedError to make sure this is specific.

return make_e2p_loopy_kernel(self, kernels)


class VolumeTaylorLocalExpansion(
VolumeTaylorExpansion,
Expand Down
Loading