Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cmake/modules/OpenCL.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ if(USE_OPENCL)
message(STATUS "Set OpenCL Target version to " ${CMAKE_MATCH_1})
endif()
endif(USE_OPENCL_EXTN_QCOM)
if(PROFILE_SHADER_DUMP)
add_definitions(-DPROFILE_SHADER_DUMP)
endif(PROFILE_SHADER_DUMP)
else()
list(APPEND COMPILER_SRCS src/target/opt/build_opencl_off.cc)
endif(USE_OPENCL)
9 changes: 9 additions & 0 deletions include/tvm/relax/exec_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,15 @@ class ExecBuilderNode : public Object {
rv = value;
return ConvertConstant_(rv);
}
/*!
 * \brief Update memory scopes.
 *
 * This function records the memory scope associated with a constant in the executable being built.
*
* \param idx Index of the constant
* \param scope The memory scope.
*/
void SaveMemoryScope(vm::Instruction::Arg idx, ffi::String scope);
/*!
* \brief Raw access to underlying executable build in progress.
*/
Expand Down
12 changes: 12 additions & 0 deletions include/tvm/runtime/vm/executable.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ class VMExecutable : public ffi::ModuleObj {
std::unordered_map<std::string, Index> func_map;
/*! \brief The global constant pool. */
std::vector<ffi::Any> constants;
/*! \brief The VDevice memory scopes */
std::unordered_map<Index, std::string> memory_scopes;
/*! \brief The offset of instruction. */
std::vector<Index> instr_offset;
/*! \brief The byte data of instruction. */
Expand All @@ -177,6 +179,11 @@ class VMExecutable : public ffi::ModuleObj {
 * \param strm The output stream.
*/
void SaveGlobalSection(dmlc::Stream* strm) const;
/*!
* \brief Save the memory scopes.
* \param strm The output stream.
*/
void SaveMemoryScopeSection(dmlc::Stream* strm) const;
/*!
* \brief Save the constant pool.
 * \param strm The output stream.
Expand All @@ -197,6 +204,11 @@ class VMExecutable : public ffi::ModuleObj {
* \param strm The input stream.
*/
void LoadGlobalSection(dmlc::Stream* strm);
/*!
* \brief Load the memory scopes.
* \param strm The input stream.
*/
void LoadMemoryScopeSection(dmlc::Stream* strm);
/*!
* \brief Load the constant pool.
* \param strm The input stream.
Expand Down
6 changes: 6 additions & 0 deletions include/tvm/tir/transform.h
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,12 @@ TVM_DLL Pass DefaultGPUSchedule();
*/
TVM_DLL Pass UseAssumeToReduceBranches();

/*!
* \brief Inject Texture Allocation intrinsic.
* \return The pass.
*/
TVM_DLL Pass InjectTextureAlloc();

} // namespace transform
} // namespace tir
} // namespace tvm
Expand Down
3 changes: 3 additions & 0 deletions python/tvm/dlight/adreno/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,6 @@
Adreno schedule rules.
"""
from .convolution import Conv2d
from .layout_transform import LayoutTransform
from .fallback import Fallback
from .pool import Pool2D
242 changes: 56 additions & 186 deletions python/tvm/dlight/adreno/convolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,215 +16,85 @@
# under the License.
# pylint: disable=missing-docstring, invalid-name
"""A Conv2d schedule rule for Adreno GPU operators."""
from dataclasses import dataclass
from typing import List, Optional
from typing import Optional, Union

from tvm import tir
from tvm.target import Target
from tvm.tir import IterVar
from tvm.tir.schedule.schedule import BlockRV

from ..analysis import BlockInfo, IterInfo
from .utils import schedule_inline_blocks, schedule_default
from .. import analysis
from .base import AdrenoScheduleRule


def is_spatial_block(sch: tir.Schedule, block: BlockRV) -> bool:
    """Return True when every iterator of ``block`` is data-parallel (purely spatial)."""
    kinds = {iv.iter_type for iv in sch.get(block).iter_vars}
    return kinds == {IterVar.DataPar}


def is_reduction_block(sch: tir.Schedule, block: BlockRV) -> bool:
    """Return True when ``block`` has exactly data-parallel plus commutative-reduction iterators."""
    kinds = {iv.iter_type for iv in sch.get(block).iter_vars}
    return kinds == {IterVar.CommReduce, IterVar.DataPar}


def _collect_producers(sch: tir.Schedule, block: tir.schedule.BlockRV):
    """Collect every transitive producer of ``block`` in depth-first pre-order."""
    found = []
    for prod in sch.get_producers(block):
        found.append(prod)
        found += _collect_producers(sch, prod)
    return found


def _collect_consumers(sch: tir.Schedule, block: tir.schedule.BlockRV):
    """Collect every transitive consumer of ``block`` in depth-first pre-order."""
    found = []
    for cons in sch.get_consumers(block):
        found.append(cons)
        found += _collect_consumers(sch, cons)
    return found


def get_block_info(sch: tir.Schedule, block: tir.schedule.BlockRV) -> BlockInfo:
    """Build a BlockInfo descriptor for ``block``.

    Each iterator is classified as "S" (spatial / data-parallel), "R"
    (commutative reduction) or "O" (other), and the block is marked as a
    reduction block when any iterator is classified "R".
    """

    def _kind(iter_var: tir.IterVar) -> str:
        # Map TIR iterator types onto the single-letter codes used by BlockInfo.
        if iter_var.iter_type == tir.IterVar.DataPar:
            return "S"
        if iter_var.iter_type == tir.IterVar.CommReduce:
            return "R"
        return "O"

    stmt = sch.get(block)
    iter_infos = [
        IterInfo(kind=_kind(iv), var=iv.var, dom=iv.dom.extent, loop_rv=rv)
        for rv, iv in zip(sch.get_loops(block), stmt.iter_vars)
    ]
    return BlockInfo(
        name=stmt.name_hint,
        iters=iter_infos,
        block_rv=block,
        reduction_block=any(_kind(iv) == "R" for iv in stmt.iter_vars),
    )


def get_reduction_blocks(
    sch: tir.Schedule, blocks: List[tir.schedule.BlockRV]
) -> Optional[tir.schedule.BlockRV]:
    """Return the unique reduction block among ``blocks``, or None.

    Returns None when any block is neither spatial nor reduction, or when the
    number of reduction blocks is not exactly one.
    """
    # NOTE: We assume there is only one reduction block in the function;
    # all blocks are required to be spatial or reduction.
    if not all(is_reduction_block(sch, blk) or is_spatial_block(sch, blk) for blk in blocks):
        return None

    # There must be exactly one reduction block.
    reduction_blocks = [blk for blk in blocks if is_reduction_block(sch, blk)]
    if len(reduction_blocks) != 1:
        return None

    return reduction_blocks[0]


def is_convolution(sch: tir.Schedule, block: tir.schedule.BlockRV) -> bool:
    """Heuristically detect a conv2d_NCHWc_OIHWo block.

    A block matches when its name hint contains "conv2d_NCHWc_OIHWo" and its
    iterator pattern is exactly five spatial axes followed by three reduction
    axes ("SSSSSRRR").
    """
    # TODO: Use buffer access patterns to discover convolution type kernels instead of using name.
    # Fix: the original `name_hint.count(...) and ...` returned the int 0 (not
    # False) when the name did not match; always return a proper bool.
    if "conv2d_NCHWc_OIHWo" not in sch.get(block).name_hint:
        return False
    pattern = "".join(it.kind for it in get_block_info(sch, block).iters)
    return pattern == "SSSSSRRR"


class Conv2d(AdrenoScheduleRule):
"""The schedule rule for convolution computation"""

@dataclass
class Config:
block_size_x: int = 8
block_size_y: int = 8
vector_size: int = 1
unroll: int = 256 # 0 means no unroll
use_shared: bool = True
storage_align: bool = False
inner_x: bool = False

def get_configs(self, target: Target) -> Config:
"""Get the schedule config for the target"""
if target.kind.name == "cuda" or target.kind.name == "rocm":
return Conv2d.Config(
block_size_x=8,
block_size_y=16,
vector_size=2,
unroll=256,
use_shared=True,
storage_align=True,
inner_x=False,
)
elif target.kind.name == "opencl" and (
("android" in str(target.host)) or ("adreno" in str(target.attrs))
):
return Conv2d.Config(
block_size_x=32,
block_size_y=4,
vector_size=8,
unroll=16,
use_shared=False,
storage_align=False,
inner_x=True,
)
else:
return Conv2d.Config()
@staticmethod
def schedule_conv2d(sch: tir.Schedule, blk: tir.schedule.BlockRV):
n, oc, oh, ow, ob, ic, kh, kw = sch.get_loops(blk)

bz, vz, tz = sch.split(oc, [None, 8, 1], preserve_unit_iters=True)
by, vy, ty = sch.split(oh, [None, 1, 16], preserve_unit_iters=True)
bx, vx, tx = sch.split(ow, [None, 1, 16], preserve_unit_iters=True)

bz = sch.fuse(n, bz, preserve_unit_iters=True)
sch.reorder(bz, by, bx, vz, vy, vx, tz, ty, tx, ob)
sch.bind(bz, "blockIdx.z")
sch.bind(by, "blockIdx.y")
sch.bind(bx, "blockIdx.x")
sch.bind(vz, "vthread.z")
sch.bind(vy, "vthread.y")
sch.bind(vx, "vthread.x")
sch.bind(tz, "threadIdx.z")
sch.bind(ty, "threadIdx.y")
sch.bind(tx, "threadIdx.x")

rblk = sch.cache_read(blk, 0, "local")
ico, icb = sch.split(ic, [None, 4], preserve_unit_iters=True)
sch.reorder(ico, kh, kw, icb, ob)

sch.compute_at(rblk, kw, preserve_unit_loops=True)
sch.vectorize(sch.get_loops(rblk)[-1])
wblk = sch.cache_write(blk, 0, "local")
sch.reverse_compute_at(wblk, tx, preserve_unit_loops=True)
sch.vectorize(sch.get_loops(wblk)[-1])
init_blk = sch.decompose_reduction(blk, tx)
sch.vectorize(sch.get_loops(init_blk)[-1])

def apply( # pylint: disable=too-many-locals,missing-docstring
self,
func: tir.PrimFunc,
func: Union[tir.PrimFunc, tir.Schedule],
target: Target,
_: bool,
) -> Optional[tir.Schedule]:
if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target):
if not (isinstance(func, (tir.PrimFunc, tir.Schedule))) or not self.is_target_available(
target
):
return None

if isinstance(func, tir.PrimFunc):
sch = tir.Schedule(func)
sch.work_on("main")
elif isinstance(func, tir.Schedule):
sch = func

# config = self.get_configs(target)
root_block = analysis.get_root_block(sch)
root_block = analysis.get_root_block(sch, sch.func_working_on)
blocks = sch.get_child_blocks(root_block)
reduction_block = get_reduction_blocks(sch, blocks)
reduction_blocks = list(
filter(lambda block: analysis.get_block_info(sch, block).is_reduction(), blocks)
)
remaining_blocks = [blk for blk in blocks if blk not in reduction_blocks]

if reduction_block is None:
return None
if not is_convolution(sch, reduction_block):
def is_convolution(blk):
block_info = analysis.get_block_info(sch, blk)
return "conv2d_NCHWc" in block_info.name

if len(reduction_blocks) != 1 or not is_convolution(reduction_blocks[0]):
return None

def schedule_data_pad(blk):
axes = sch.get_loops(blk)
axes, vec = axes[:-1], axes[-1]
axis = sch.fuse(*axes)
bx, ty, tx = sch.split(axis, [None, 16, 16])
sch.bind(bx, "blockIdx.x")
sch.bind(ty, "threadIdx.y")
sch.bind(tx, "threadIdx.x")
sch.vectorize(vec)

def schedule_conv2d(blk):
# TODO: The loop pattern may not be reliable; perform better analysis.
n, oc, oh, ow, ob, ic, kh, kw = sch.get_loops(blk)
sch.reorder(n, oc, oh, ow, ic, kh, kw, ob)
main_lp = sch.fuse(n, oc, oh, ow)
bx, ty, tx = sch.split(main_lp, [None, 16, 16])
sch.bind(tx, "threadIdx.x")
sch.bind(ty, "threadIdx.y")
sch.bind(bx, "blockIdx.x")

ico, icv = sch.split(ic, [None, 4])
sch.reorder(ico, kh, kw, icv, ob)
rblk = sch.cache_read(blk, 0, "local")
sch.compute_at(rblk, kw)
sch.vectorize(sch.get_loops(rblk)[-1])
wblk = sch.cache_write(blk, 0, "local")
sch.reverse_compute_at(wblk, tx)
sch.vectorize(sch.get_loops(wblk)[-1])
sch.vectorize(ob)
init_blk = sch.decompose_reduction(blk, ico)
sch.vectorize(sch.get_loops(init_blk)[-1])

def is_data_pad(block: tir.stmt.Block):
return is_spatial_block(sch, block) and tir.analysis.has_if_then_else(sch.get(block))

def schedule_conv2d_blocks():

# Do analysis to find block type
blocks = sch.get_child_blocks(root_block)
passed_reduction = False
for blk in blocks:
if is_reduction_block(sch, blk):
schedule_conv2d(blk)
passed_reduction = True
elif is_data_pad(blk):
schedule_data_pad(blk)
elif is_spatial_block(sch, blk):
try:
if not passed_reduction:
sch.compute_inline(blk)
else:
sch.reverse_compute_inline(blk)
except: # pylint: disable=W0702
pass
else:
raise TypeError("Can't Schedule this Block", sch.get(blk))

schedule_conv2d_blocks()
conv_blk = reduction_blocks[0]
Conv2d.schedule_conv2d(sch, conv_blk)
remaining_blocks = schedule_inline_blocks(sch, remaining_blocks)
schedule_default(sch, remaining_blocks)

return sch
Loading
Loading