diff --git a/sumpy/e2p.py b/sumpy/e2p.py index 55af69b3a..9b6991203 100644 --- a/sumpy/e2p.py +++ b/sumpy/e2p.py @@ -21,9 +21,12 @@ """ from abc import ABC, abstractmethod +from pytools import memoize_method import numpy as np import loopy as lp +from loopy.kernel.data import LocalInameTag +import pymbolic.primitives as prim from sumpy.tools import KernelCacheMixin, gather_loopy_arguments from sumpy.codegen import register_optimization_preambles @@ -71,7 +74,7 @@ def __init__(self, ctx, expansion, kernels, self.context = ctx self.expansion = expansion - self.kernels = kernels + self.kernels = tuple(kernels) self.name = name or self.default_name self.device = device @@ -82,15 +85,18 @@ def __init__(self, ctx, expansion, kernels, def default_name(self): pass + @memoize_method + def get_loopy_evaluator_and_optimizations(self): + return self.expansion.loopy_evaluator_and_optimizations(self.kernels) + def get_cache_key(self): return (type(self).__name__, self.expansion, tuple(self.kernels)) def add_loopy_eval_callable( self, loopy_knl: lp.TranslationUnit) -> lp.TranslationUnit: - inner_knl = self.expansion.loopy_evaluator(self.kernels) + inner_knl, _ = self.get_loopy_evaluator_and_optimizations() loopy_knl = lp.merge([loopy_knl, inner_knl]) loopy_knl = lp.inline_callable_kernel(loopy_knl, "e2p") - loopy_knl = lp.remove_unused_inames(loopy_knl) for kernel in self.kernels: loopy_knl = kernel.prepare_loopy_kernel(loopy_knl) loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C") @@ -118,23 +124,27 @@ class E2PFromSingleBox(E2PBase): def default_name(self): return "e2p_from_single_box" - def get_kernel(self): + def get_kernel(self, max_ntargets_in_one_box): ncoeffs = len(self.expansion) loopy_args = self.get_loopy_args() + max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box)) loopy_knl = lp.make_kernel( [ "{[itgt_box]: 0<=itgt_box tgt_ibox = target_boxes[itgt_box] - <> itgt_start = box_target_starts[tgt_ibox] - <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] + <> tgt_ibox = target_boxes[itgt_box] {id=fetch_init0} + <> itgt_start = box_target_starts[tgt_ibox] {id=fetch_init1} + <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] \ + {id=fetch_init2} <> center[idim] = centers[idim, tgt_ibox] {id=fetch_center} @@ -142,9 +152,13 @@ def get_kernel(self): src_expansions[tgt_ibox - src_base_ibox, icoeff] \ {id=fetch_coeffs} - for itgt - <> tgt[idim] = targets[idim, itgt] {id=fetch_tgt,dup=idim} - <> result_temp[iknl] = 0 {id=init_result,dup=iknl} + for itgt_offset + <> itgt = itgt_start + itgt_offset + <> run_itgt = itgt tgt[idim] = targets[idim, itgt] {id=fetch_tgt, \ + dup=idim,if=run_itgt} + <> result_temp[iknl] = 0 {id=init_result,dup=iknl, \ + if=run_itgt} [iknl]: result_temp[iknl] = e2p( [iknl]: result_temp[iknl], [icoeff]: coeffs[icoeff], @@ -156,9 +170,9 @@ def get_kernel(self): targets, """ + ",".join(arg.name for arg in loopy_args) + """ ) {dep=fetch_coeffs:fetch_center:init_result:fetch_tgt,\ - id=update_result} + id=update_result,if=run_itgt} result[iknl, itgt] = result_temp[iknl] * kernel_scaling \ - {id=write_result,dep=update_result} + {id=write_result,dep=update_result,if=run_itgt} end end """], @@ -183,7 +197,9 @@ def get_kernel(self): silenced_warnings="write_race(*_result)", default_offset=lp.auto, fixed_parameters={"dim": self.dim, "nresults": len(self.kernels), - "ncoeffs": ncoeffs}, + "ncoeffs": ncoeffs, + "max_work_items": max_work_items, + "max_ntargets_in_one_box": max_ntargets_in_one_box}, lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") @@ -192,15 +208,41 @@ def get_kernel(self): return loopy_knl - def get_optimized_kernel(self): - # FIXME - knl = self.get_kernel() + def get_optimized_kernel(self, max_ntargets_in_one_box): + inner_knl, optimizations = self.get_loopy_evaluator_and_optimizations() + knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box) knl = lp.tag_inames(knl, {"itgt_box": "g.0"}) + knl = lp.split_iname(knl, "itgt_offset", 256, inner_tag="l.0") + knl = lp.split_iname(knl, "icoeff", 256, inner_tag="l.0") + knl = lp.add_inames_to_insn(knl, "dummy", + "id:fetch_init* or id:fetch_center or id:kernel_scaling") knl = lp.add_inames_to_insn(knl, "itgt_box", "id:kernel_scaling") + knl = lp.tag_inames(knl, {"dummy": "l.0"}) + knl = lp.set_temporary_address_space(knl, "coeffs", lp.AddressSpace.LOCAL) knl = lp.set_options(knl, - enforce_variable_access_ordered="no_check") + enforce_variable_access_ordered="no_check", write_code=False) + + for transform in optimizations: + knl = transform(knl) knl = register_optimization_preambles(knl, self.device) + # If there are inames tagged as local in the inner kernel + # we need to remove the iname itgt_offset_inner from instructions + # within those inames and also remove the predicate run_itgt + # which depends on itgt_offset_inner + tagged_inames = [iname.name for iname in + knl.default_entrypoint.inames.values() if + iname.name.startswith("e2p_") and any( + isinstance(tag, LocalInameTag) for tag in iname.tags)] + if tagged_inames: + insn_ids = [insn.id for insn in knl.default_entrypoint.instructions + if any(iname in tagged_inames for iname in insn.within_inames)] + match = " or ".join(f"id:{insn_id}" for insn_id in insn_ids) + knl = lp.remove_inames_from_insn(knl, + frozenset(["itgt_offset_inner"]), match) + knl = lp.remove_predicates_from_insn(knl, + frozenset([prim.Variable("run_itgt")]), match) + return knl def __call__(self, queue, **kwargs): @@ -212,7 +254,9 @@ def __call__(self, queue, **kwargs): :arg centers: :arg targets: """ - knl = self.get_cached_kernel_executor() + max_ntargets_in_one_box = kwargs.pop("max_ntargets_in_one_box") + knl = self.get_cached_kernel_executor( + max_ntargets_in_one_box=max_ntargets_in_one_box) centers = kwargs.pop("centers") # "1" may be passed for rscale, which won't have its type @@ -231,42 +275,49 @@ class E2PFromCSR(E2PBase): def default_name(self): return "e2p_from_csr" - def get_kernel(self): + def get_kernel(self, max_ntargets_in_one_box): ncoeffs = len(self.expansion) loopy_args = self.get_loopy_args() + max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box)) loopy_knl = lp.make_kernel( [ "{[itgt_box]: 0<=itgt_box tgt_ibox = target_boxes[itgt_box] - <> itgt_start = box_target_starts[tgt_ibox] - <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] - - for itgt - <> tgt[idim] = targets[idim,itgt] {id=fetch_tgt,dup=idim} - - <> isrc_box_start = source_box_starts[itgt_box] - <> isrc_box_end = source_box_starts[itgt_box+1] - - <> result_temp[iknl] = 0 {id=init_result,dup=iknl} - for isrc_box - <> src_ibox = source_box_lists[isrc_box] - <> coeffs[icoeff] = \ + <> tgt_ibox = target_boxes[itgt_box] {id=init_box0} + <> itgt_start = box_target_starts[tgt_ibox] {id=init_box1} + <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] \ + {id=init_box2} + <> isrc_box_start = source_box_starts[itgt_box] {id=init_box3} + <> isrc_box_end = source_box_starts[itgt_box+1] {id=init_box4} + + <> result_temp[itgt_offset, iknl] = 0 \ + {id=init_result,dup=iknl} + for isrc_box + <> src_ibox = source_box_lists[isrc_box] {id=fetch_src_box} + <> coeffs[icoeff] = \ src_expansions[src_ibox - src_base_ibox, icoeff] \ - {id=fetch_coeffs,dup=icoeff} - <> center[idim] = centers[idim, src_ibox] \ + {id=fetch_coeffs} + <> center[idim] = centers[idim, src_ibox] \ {dup=idim,id=fetch_center} - [iknl]: result_temp[iknl] = e2p( - [iknl]: result_temp[iknl], + + for itgt_offset + <> itgt = itgt_start + itgt_offset + <> run_itgt = itgt tgt[idim] = targets[idim,itgt] \ + {id=fetch_tgt,dup=idim,if=run_itgt} + + [iknl]: result_temp[itgt_offset, iknl] = e2p( + [iknl]: result_temp[itgt_offset, iknl], [icoeff]: coeffs[icoeff], [idim]: center[idim], [idim]: tgt[idim], @@ -276,11 +327,18 @@ def get_kernel(self): targets, """ + ",".join(arg.name for arg in loopy_args) + """ ) {id=update_result, \ - dep=fetch_coeffs:fetch_center:fetch_tgt:init_result} + dep=fetch_coeffs:fetch_center:fetch_tgt:init_result, \ + if=run_itgt} end - result[iknl, itgt] = result[iknl, itgt] + result_temp[iknl] \ - * kernel_scaling \ - {dep=update_result:init_result,id=write_result,dup=iknl} + end + for itgt_offset + <> itgt2 = itgt_start + itgt_offset {id=init_itgt_for_write} + <> run_itgt2 = itgt_start + itgt_offset < itgt_end \ + {id=init_cond_for_write} + result[iknl, itgt2] = result[iknl, itgt2] + result_temp[ \ + itgt_offset, iknl] * kernel_scaling \ + {dep=update_result:init_result,id=write_result, \ + dup=iknl,if=run_itgt2} end end """], @@ -308,30 +366,49 @@ def get_kernel(self): fixed_parameters={ "ncoeffs": ncoeffs, "dim": self.dim, + "max_work_items": max_work_items, + "max_ntargets_in_one_box": max_ntargets_in_one_box, "nresults": len(self.kernels)}, lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = lp.tag_inames(loopy_knl, "iknl*:unr") - loopy_knl = lp.prioritize_loops(loopy_knl, "itgt_box,itgt,isrc_box") + loopy_knl = lp.prioritize_loops(loopy_knl, "itgt_box,isrc_box,itgt_offset") loopy_knl = self.add_loopy_eval_callable(loopy_knl) loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C") return loopy_knl - def get_optimized_kernel(self): - # FIXME - knl = self.get_kernel() - knl = lp.tag_inames(knl, {"itgt_box": "g.0"}) + def get_optimized_kernel(self, max_ntargets_in_one_box): + _, optimizations = self.get_loopy_evaluator_and_optimizations() + knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box) + knl = lp.tag_inames(knl, {"itgt_box": "g.0", "dummy": "l.0"}) + knl = lp.unprivatize_temporaries_with_inames(knl, + "itgt_offset", "result_temp") + knl = lp.split_iname(knl, "itgt_offset", 256, inner_tag="l.0") + knl = lp.split_iname(knl, "icoeff", 256, inner_tag="l.0") + knl = lp.privatize_temporaries_with_inames(knl, + "itgt_offset_outer", "result_temp") + knl = lp.duplicate_inames(knl, "itgt_offset_outer", "id:init_result") + knl = lp.duplicate_inames(knl, "itgt_offset_outer", + "id:write_result or id:init_itgt_for_write or id:init_cond_for_write") + knl = lp.add_inames_to_insn(knl, "dummy", + "id:init_box* or id:fetch_src_box or id:fetch_center " + "or id:kernel_scaling") knl = lp.add_inames_to_insn(knl, "itgt_box", "id:kernel_scaling") + knl = lp.add_inames_to_insn(knl, "itgt_offset_inner", "id:fetch_init*") + knl = lp.set_temporary_address_space(knl, "coeffs", lp.AddressSpace.LOCAL) knl = lp.set_options(knl, - enforce_variable_access_ordered="no_check") + enforce_variable_access_ordered="no_check", write_code=False) + for transform in optimizations: + knl = transform(knl) knl = register_optimization_preambles(knl, self.device) - return knl def __call__(self, queue, **kwargs): - knl = self.get_cached_kernel_executor() + max_ntargets_in_one_box = kwargs.pop("max_ntargets_in_one_box") + knl = self.get_cached_kernel_executor( + max_ntargets_in_one_box=max_ntargets_in_one_box) centers = kwargs.pop("centers") # "1" may be passed for rscale, which won't have its type diff --git a/sumpy/expansion/__init__.py b/sumpy/expansion/__init__.py index dc3ce3baa..a657ff733 100644 --- a/sumpy/expansion/__init__.py +++ b/sumpy/expansion/__init__.py @@ -22,7 +22,8 @@ from abc import ABC, abstractmethod from typing import ( - Any, ClassVar, Dict, Hashable, List, Optional, Sequence, Tuple, Type) + Any, ClassVar, Dict, Hashable, List, Optional, Sequence, Tuple, Type, + Callable) from pytools import memoize_method import loopy as lp @@ -66,9 +67,9 @@ class ExpansionBase(ABC): .. automethod:: get_coefficient_identifiers .. automethod:: coefficients_from_source .. automethod:: coefficients_from_source_vec - .. automethod:: loopy_expansion_formation .. automethod:: evaluate - .. automethod:: loopy_evaluator + .. automethod:: loopy_expansion_formation + .. automethod:: loopy_evaluator_and_optimizations .. automethod:: with_kernel .. automethod:: copy @@ -183,7 +184,9 @@ def evaluate(self, kernel, coeffs, bvec, rscale, sac=None): in *coeffs*. """ - def loopy_evaluator(self, kernels: Sequence[Kernel]) -> lp.TranslationUnit: + def loopy_evaluator_and_optimizations(self, kernels: Sequence[Kernel]) \ + -> Tuple[lp.TranslationUnit, Sequence[ + Callable[[lp.TranslationUnit], lp.TranslationUnit]]]: """ :returns: a :mod:`loopy` kernel that returns the evaluated target transforms of the potential given by *kernels*. diff --git a/sumpy/expansion/local.py b/sumpy/expansion/local.py index 7f0607f0d..7c0433de6 100644 --- a/sumpy/expansion/local.py +++ b/sumpy/expansion/local.py @@ -32,6 +32,11 @@ VolumeTaylorExpansionMixin, LinearPDEConformingVolumeTaylorExpansion) from sumpy.tools import add_to_sac, mi_increment_axis +from sumpy.kernel import Kernel + +import loopy as lp + +from typing import Sequence, Callable, Tuple import logging logger = logging.getLogger(__name__) @@ -405,6 +410,16 @@ def loopy_translate_from(self, src_expansion): f"A direct loopy kernel for translation from " f"{src_expansion} to {self} is not implemented.") + def loopy_evaluator_and_optimizations(self, kernels: Sequence[Kernel]) \ + -> Tuple[lp.TranslationUnit, Sequence[ + Callable[[lp.TranslationUnit], lp.TranslationUnit]]]: + from sumpy.expansion.loopy import (make_l2p_loopy_kernel_for_volume_taylor, + make_e2p_loopy_kernel) + try: + return make_l2p_loopy_kernel_for_volume_taylor(self, kernels) + except NotImplementedError: + return make_e2p_loopy_kernel(self, kernels) + class VolumeTaylorLocalExpansion( VolumeTaylorExpansion, diff --git a/sumpy/expansion/loopy.py b/sumpy/expansion/loopy.py index 00bbd3dbe..d9fa8d718 100644 --- a/sumpy/expansion/loopy.py +++ b/sumpy/expansion/loopy.py @@ -20,8 +20,9 @@ THE SOFTWARE. """ -from typing import Sequence +from typing import Sequence, Callable, Tuple import pymbolic +import pymbolic.primitives as prim import loopy as lp import numpy as np from sumpy.expansion import ExpansionBase @@ -29,13 +30,15 @@ import sumpy.symbolic as sym from sumpy.assignment_collection import SymbolicAssignmentCollection from sumpy.tools import gather_loopy_arguments, gather_loopy_source_arguments +from math import prod, gcd import logging logger = logging.getLogger(__name__) -def make_e2p_loopy_kernel( - expansion: ExpansionBase, kernels: Sequence[Kernel]) -> lp.TranslationUnit: +def make_e2p_loopy_kernel(expansion: ExpansionBase, kernels: Sequence[Kernel]) \ + -> Tuple[lp.TranslationUnit, Sequence[ + Callable[[lp.TranslationUnit], lp.TranslationUnit]]]: """ This is a helper function to create a loopy kernel for multipole/local evaluation. This function uses symbolic expressions given by the expansion class, @@ -126,7 +129,9 @@ def make_e2p_loopy_kernel( for kernel in kernels: loopy_knl = kernel.prepare_loopy_kernel(loopy_knl) - return loopy_knl + optimizations = [] + + return loopy_knl, optimizations def make_p2e_loopy_kernel( @@ -225,3 +230,332 @@ def make_p2e_loopy_kernel( loopy_knl = kernel.prepare_loopy_kernel(loopy_knl) return loopy_knl + + +def make_l2p_loopy_kernel_for_volume_taylor(expansion, kernels): + dim = expansion.dim + order = expansion.order + ncoeffs = len(expansion) + + code_transformers = [expansion.get_code_transformer()] \ + + [kernel.get_code_transformer() for kernel in kernels] + pymbolic_conv = sym.SympyToPymbolicMapper() + + max_deriv_order = 0 + sym_expr_dicts = [] + for kernel in kernels: + expr_dict = {(0,)*dim: 1} + expr_dict = kernel.get_derivative_coeff_dict_at_target(expr_dict) + max_deriv_order = max(max_deriv_order, max(sum(mi) for mi in expr_dict)) + sym_expr_dict = {} + for mi, coeff in expr_dict.items(): + coeff = pymbolic_conv(coeff) + for transform in code_transformers: + coeff = transform(coeff) + sym_expr_dict[mi] = coeff + sym_expr_dicts.append(sym_expr_dict) + + domains = [ + "{[idim]: 0<=idim=", c)]), + )) + + if c != sync_split: + # We now copy before synchronization + v = [pymbolic.var(f"z{i}") for i in range(dim)] + v[slowest_axis], v[0] = v[0], v[slowest_axis] + iorder = pymbolic.var("iorder3") + idx = get_idx(v) + domains += get_domains(v, iorder, with_sync=True)[2:] + + insns.append(lp.Assignment( + assignee=coeffs_copy[0, idx], + expression=coeffs_copy[1, idx], + id="copy_sync", + depends_on=frozenset(["update_coeffs"]), + depends_on_is_final=True, + predicates=frozenset([prim.Comparison(v[0], ">=", c)]), + )) + + v = [pymbolic.var(f"y{i}") for i in range(dim)] + v[slowest_axis], v[0] = v[0], v[slowest_axis] + iorder = pymbolic.var("iorder2") + idx = get_idx(v) + domains += get_domains(v, iorder, with_sync=False)[1:] + + if c == sync_split: + # We did not have to sync within the c rows. + # We last wrote to coeffs_copy[v[0]//c % 2, :] and we read from it. + fetch_idx = (v[0]//c) % 2 + else: + # We need to sync within the c rows. + # We last wrote to coeffs_copy[0, :] and we read from it. + fetch_idx = 0 + + for ikernel, expr_dict in enumerate(sym_expr_dicts): + expr = sum(coeff * prod(powers[i, + v[i] + max_deriv_order - mi[i]] for i in range(dim)) + * (1 / rscale ** sum(mi)) + for mi, coeff in expr_dict.items()) + + insn = lp.Assignment( + assignee=result[ikernel], + expression=(result[ikernel] + + coeffs_copy[fetch_idx, idx] * expr), + id=f"write_{ikernel}", + depends_on=frozenset(["update_monomials", + "update_coeffs" if c == sync_split else "copy_sync"]), + depends_on_is_final=True, + ) + insns.append(insn) + + tags = { + "e2p_iorder1": "l.0", + f"e2p_{x0}_outer": "unr", + f"e2p_{x0}_inner": "unr", + f"e2p_{v[0]}_inner": "unr", + "e2p_iorder2": "unr", + } + if c != sync_split: + tags["e2p_iorder3"] = "l.0" + + nsplit = min(256, ncoeffs) + + optimizations += [ + lambda knl: lp.tag_inames(knl, tags), + lambda knl: lp.set_temporary_address_space(knl, "e2p_coeffs_copy", + lp.AddressSpace.LOCAL), + lambda knl: lp.split_iname(knl, "e2p_icoeff", nsplit, inner_tag="l.0"), + ] + + target_args = gather_loopy_arguments((expansion,) + tuple(kernels)) + loopy_knl = lp.make_function(domains, insns, + kernel_data=[ + lp.GlobalArg("result", shape=(len(kernels),), is_input=True, + is_output=True), + lp.GlobalArg("coeffs", + shape=(ncoeffs,), is_input=True, is_output=False), + lp.GlobalArg("center, target", + shape=(dim,), is_input=True, is_output=False), + lp.ValueArg("rscale", is_input=True), + lp.ValueArg("itgt", is_input=True), + lp.ValueArg("ntargets", is_input=True), + lp.GlobalArg("targets", + shape=(dim, "ntargets"), is_input=True, is_output=False), + *target_args, + *temporary_variables, + ...], + name="e2p", + lang_version=lp.MOST_RECENT_LANGUAGE_VERSION, + fixed_parameters={ + "dim": dim, + "nresults": len(kernels), + "order": order, + "max_deriv_order": max_deriv_order, + "ncoeffs": ncoeffs, + }, + ) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + + for kernel in kernels: + loopy_knl = kernel.prepare_loopy_kernel(loopy_knl) + + return loopy_knl, optimizations diff --git a/sumpy/expansion/multipole.py b/sumpy/expansion/multipole.py index d63a094b4..4ceaa0a58 100644 --- a/sumpy/expansion/multipole.py +++ b/sumpy/expansion/multipole.py @@ -90,17 +90,10 @@ def coefficients_from_source(self, kernel, avec, bvec, rscale, sac=None): rscale, (1,), sac=sac) def evaluate(self, kernel, coeffs, bvec, rscale, sac=None): - from sumpy.derivative_taker import DifferentiatedExprDerivativeTaker if not self.use_rscale: rscale = 1 base_taker = kernel.get_derivative_taker(bvec, rscale, sac) - # Following is a no-op, but AxisTargetDerivative.postprocess_at_target and - # DirectionalTargetDerivative.postprocess_at_target only handles - # DifferentiatedExprDerivativeTaker and sympy expressions, so we need to - # make the taker a DifferentitatedExprDerivativeTaker instance. - base_taker = DifferentiatedExprDerivativeTaker(base_taker, - {tuple([0]*self.dim): 1}) taker = kernel.postprocess_at_target(base_taker, bvec) result = [] diff --git a/sumpy/fmm.py b/sumpy/fmm.py index de7ac6aec..3c6a4ca20 100644 --- a/sumpy/fmm.py +++ b/sumpy/fmm.py @@ -933,6 +933,7 @@ def eval_multipoles(self, result=pot, rscale=self.level_to_rscale(isrc_level), + max_ntargets_in_one_box=self.max_ntargets_in_one_box, wait_for=wait_for, @@ -1069,6 +1070,7 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): result=pot, rscale=self.level_to_rscale(lev), + max_ntargets_in_one_box=self.max_ntargets_in_one_box, **kwargs) events.append(evt) diff --git a/sumpy/kernel.py b/sumpy/kernel.py index 6c50350b3..a794cb0a2 100644 --- a/sumpy/kernel.py +++ b/sumpy/kernel.py @@ -249,7 +249,17 @@ def postprocess_at_target(self, expr, bvec): :arg expr: may be a :class:`sympy.core.expr.Expr` or a :class:`sumpy.derivative_taker.DifferentiatedExprDerivativeTaker`. """ - return expr + from sumpy.derivative_taker import (ExprDerivativeTaker, + DifferentiatedExprDerivativeTaker) + expr_dict = {(0,)*self.dim: 1} + expr_dict = self.get_derivative_coeff_dict_at_target(expr_dict) + if isinstance(expr, ExprDerivativeTaker): + return DifferentiatedExprDerivativeTaker(expr, expr_dict) + + result = 0 + for mi, coeff in expr_dict.items(): + result += coeff * self._diff(expr, bvec, mi) + return result def get_derivative_coeff_dict_at_source(self, expr_dict): r"""Get the derivative transformation of the expression at source @@ -263,6 +273,18 @@ def get_derivative_coeff_dict_at_source(self, expr_dict): """ return expr_dict + def get_derivative_coeff_dict_at_target(self, expr_dict): + r"""Get the derivative transformation of the expression at target + represented by the dictionary expr_dict which is mapping from multi-index + `mi` to coefficient `coeff`. + Expression represented by the dictionary `expr_dict` is + :math:`\sum_{mi} \frac{\partial^mi}{x^mi}G * coeff`. Returns an + expression of the same type. + + This function is meant to be overridden by child classes where necessary. + """ + return expr_dict + def get_global_scaling_const(self): r"""Return a global scaling constant of the kernel. Typically, this ensures that the kernel is scaled so that @@ -959,8 +981,8 @@ def get_expression(self, scaled_dist_vec): def get_derivative_coeff_dict_at_source(self, expr_dict): return self.inner_kernel.get_derivative_coeff_dict_at_source(expr_dict) - def postprocess_at_target(self, expr, bvec): - return self.inner_kernel.postprocess_at_target(expr, bvec) + def get_derivative_coeff_dict_at_target(self, expr_dict): + return self.inner_kernel.get_derivative_coeff_dict_at_target(expr_dict) def get_global_scaling_const(self): return self.inner_kernel.get_global_scaling_const() @@ -1043,23 +1065,19 @@ def __str__(self): def __repr__(self): return f"AxisTargetDerivative({self.axis}, {self.inner_kernel!r})" - def postprocess_at_target(self, expr, bvec): - from sumpy.derivative_taker import (DifferentiatedExprDerivativeTaker, - diff_derivative_coeff_dict) + def get_derivative_coeff_dict_at_target(self, expr_dict): from sumpy.symbolic import make_sym_vector as make_sympy_vector - target_vec = make_sympy_vector(self.target_array_name, self.dim) - # bvec = tgt - ctr - expr = self.inner_kernel.postprocess_at_target(expr, bvec) - if isinstance(expr, DifferentiatedExprDerivativeTaker): - transformation = diff_derivative_coeff_dict(expr.derivative_coeff_dict, - self.axis, target_vec) - return DifferentiatedExprDerivativeTaker(expr.taker, transformation) - else: - # Since `bvec` and `tgt` are two different symbolic variables - # need to differentiate by both to get the correct answer - return expr.diff(bvec[self.axis]) + expr.diff(target_vec[self.axis]) + expr_dict = self.inner_kernel.get_derivative_coeff_dict_at_target( + expr_dict) + result = defaultdict(lambda: 0) + for mi, coeff in expr_dict.items(): + new_mi = list(mi) + new_mi[self.axis] += 1 + result[tuple(new_mi)] += coeff + result[mi] += sym.diff(coeff, target_vec[self.axis]) + return dict(result) def replace_base_kernel(self, new_base_kernel): return type(self)(self.axis, @@ -1141,35 +1159,23 @@ def transform(expr): return transform - def postprocess_at_target(self, expr, bvec): - from sumpy.derivative_taker import (DifferentiatedExprDerivativeTaker, - diff_derivative_coeff_dict) - + def get_derivative_coeff_dict_at_target(self, expr_dict): from sumpy.symbolic import make_sym_vector as make_sympy_vector dir_vec = make_sympy_vector(self.dir_vec_name, self.dim) target_vec = make_sympy_vector(self.target_array_name, self.dim) - expr = self.inner_kernel.postprocess_at_target(expr, bvec) + expr_dict = self.inner_kernel.get_derivative_coeff_dict_at_target( + expr_dict) - # bvec = tgt - center - if not isinstance(expr, DifferentiatedExprDerivativeTaker): - result = 0 + # avec = tgt - center + result = defaultdict(lambda: 0) + for mi, coeff in expr_dict.items(): for axis in range(self.dim): - # Since `bvec` and `tgt` are two different symbolic variables - # need to differentiate by both to get the correct answer - result += (expr.diff(bvec[axis]) + expr.diff(target_vec[axis])) \ - * dir_vec[axis] - return result - - new_transformation = defaultdict(lambda: 0) - for axis in range(self.dim): - axis_transformation = diff_derivative_coeff_dict( - expr.derivative_coeff_dict, axis, target_vec) - for mi, coeff in axis_transformation.items(): - new_transformation[mi] += coeff * dir_vec[axis] - - return DifferentiatedExprDerivativeTaker(expr.taker, - dict(new_transformation)) + new_mi = list(mi) + new_mi[axis] += 1 + result[tuple(new_mi)] += coeff * dir_vec[axis] + result[mi] += sym.diff(coeff, target_vec[axis]) + return result def get_args(self): return [ @@ -1268,25 +1274,14 @@ def replace_base_kernel(self, new_base_kernel): def replace_inner_kernel(self, new_inner_kernel): return type(self)(self.axis, new_inner_kernel) - def postprocess_at_target(self, expr, avec): + def get_derivative_coeff_dict_at_target(self, expr_dict): from sumpy.symbolic import make_sym_vector as make_sympy_vector - from sumpy.derivative_taker import (ExprDerivativeTaker, - DifferentiatedExprDerivativeTaker) - - expr = self.inner_kernel.postprocess_at_target(expr, avec) target_vec = make_sympy_vector(self.target_array_name, self.dim) - zeros = tuple([0]*self.dim) - mult = target_vec[self.axis] + expr_dict = self.inner_kernel.get_derivative_coeff_dict_at_target( + expr_dict) - if isinstance(expr, DifferentiatedExprDerivativeTaker): - transform = {mi: coeff * mult for mi, coeff in - expr.derivative_coeff_dict.items()} - return DifferentiatedExprDerivativeTaker(expr.taker, transform) - elif isinstance(expr, ExprDerivativeTaker): - return DifferentiatedExprDerivativeTaker({zeros: mult}) - else: - return mult * expr + return {mi: coeff*target_vec[self.axis] for mi, coeff in expr_dict.items()} def get_code_transformer(self): from sumpy.codegen import VectorComponentRewriter diff --git a/sumpy/symbolic.py b/sumpy/symbolic.py index 48d43a5ef..99706c9b7 100644 --- a/sumpy/symbolic.py +++ b/sumpy/symbolic.py @@ -120,6 +120,7 @@ def _find_symbolic_backend(): functions = sym.functions Number = sym.Number Float = sym.Float +diff = sym.diff def _coeff_isneg(a): diff --git a/sumpy/toys.py b/sumpy/toys.py index a794ff152..1bc7b6883 100644 --- a/sumpy/toys.py +++ b/sumpy/toys.py @@ -259,6 +259,7 @@ def _e2p(psource, targets, e2p): target_boxes=boxes, box_target_starts=box_target_starts, box_target_counts_nonchild=box_target_counts_nonchild, + max_ntargets_in_one_box=ntargets, centers=centers, rscale=psource.rscale, targets=vector_to_device(queue, make_obj_array(targets)), diff --git a/test/test_kernels.py b/test/test_kernels.py index 2ddbd9c5f..2ac9b5415 100644 --- a/test/test_kernels.py +++ b/test/test_kernels.py @@ -374,6 +374,7 @@ def test_p2e2p(actx_factory, base_knl, expn_class, order, with_source_derivative target_boxes=source_boxes, box_target_starts=box_target_starts, box_target_counts_nonchild=box_target_counts_nonchild, + max_ntargets_in_one_box=ntargets, centers=centers, targets=targets, rscale=rscale, @@ -571,6 +572,7 @@ def eval_at(e2p, source_box_nr, rscale): target_boxes=e2p_target_boxes, box_target_starts=e2p_box_target_starts, box_target_counts_nonchild=e2p_box_target_counts_nonchild, + max_ntargets_in_one_box=ntargets, centers=centers, targets=targets,