[rocmlibs] Add support for gfx1150

saiislam · saiislam · commit f521cd27dddb · 2025-04-08T13:44:03.000-05:00
Added support for gfx1150, and registered a clr patch that removes the single-ISA restriction in rocclr's Device::create().
diff --git a/bin/patches/clr.patch b/bin/patches/clr.patch
@@ -11,3 +11,19 @@ index 72f406210..6004cf325 100644
    PATHS
      /opt/rocm/
      ${ROCM_INSTALL_PATH}
+diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
+index 751161a58..3271157b7 100644
+--- a/rocclr/device/rocm/rocdevice.cpp
++++ b/rocclr/device/rocm/rocdevice.cpp
+@@ -679,11 +679,6 @@ bool Device::create() {
+                    pciDeviceId_);
+     return false;
+   }
+-  if (agent_isas.count != 1) {
+-    LogPrintfError("HSA device %s (PCI ID %x) has %u ISAs but can only support a single ISA",
+-                   agent_name, pciDeviceId_, agent_isas.count);
+-    return false;
+-  }
+ 
+   uint32_t isa_name_length = 0;
+   if (HSA_STATUS_SUCCESS !=
diff --git a/bin/patches/patch-control-file_20.0.txt b/bin/patches/patch-control-file_20.0.txt
@@ -11,3 +11,4 @@ rocprofiler: rocprofiler-combined-no-aql-ok-fix-cov6.patch
 babelstream: babelstream-usm.patch
 llvm-project: ATD_ASO_full.patch
 UMT: umt.patch
+clr : clr.patch
diff --git a/bin/patches/patch-control-file_21.0.txt b/bin/patches/patch-control-file_21.0.txt
@@ -11,3 +11,4 @@ rocprofiler: rocprofiler-combined-no-aql-ok-fix-cov6.patch
 babelstream: babelstream-usm.patch
 llvm-project: ATD_ASO_full.patch
 UMT: umt.patch
+clr : clr.patch
diff --git a/bin/rocmlibs/build_powerinfer.sh b/bin/rocmlibs/build_powerinfer.sh
@@ -118,6 +118,8 @@ if [ "$1" == "install" ] ; then
    pushd $_repo_dir
    cd gguf-py
    echo "Installing gguf python package"
+   python3 -m venv $AOMP_INSTALL_DIR/../venv
+   source  $AOMP_INSTALL_DIR/../venv/bin/activate
    pip install .
    if [ $? != 0 ] ; then
       echo "ERROR pip install failed for PowerInfer/gguf-py package"
@@ -130,6 +132,7 @@ if [ "$1" == "install" ] ; then
       echo "ERROR pip install failed for PowerInfer/powerinfer-py package"
       exit 1
    fi
+   deactivate
    popd
    removepatch $_repo_dir
 else 
diff --git a/bin/rocmlibs/patches/patch-control-file_20.0.txt b/bin/rocmlibs/patches/patch-control-file_20.0.txt
@@ -1,4 +1,5 @@
 Tensile: tensile_aca95d17.patch
+clr : clr.patch
 rocBLAS: rocblas.patch
 rocPRIM: rocprim.patch
 rocSPARSE: rocsparse.patch
diff --git a/bin/rocmlibs/patches/patch-control-file_21.0.txt b/bin/rocmlibs/patches/patch-control-file_21.0.txt
@@ -1,4 +1,5 @@
 Tensile: tensile_aca95d17.patch
+clr : clr.patch
 rocBLAS: rocblas.patch
 rocPRIM: rocprim.patch
 rocSPARSE: rocsparse.patch
diff --git a/bin/rocmlibs/patches/powerinfer.patch b/bin/rocmlibs/patches/powerinfer.patch
@@ -7,7 +7,7 @@ index 4cf28d5..2cf69a1 100644
              # in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
              # and select the line that matches the current nixpkgs version of rocBLAS.
 -            "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
-+            "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx90c;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1103"
++            "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx90c;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1103;gfx1150"
            ];
          };
          apps.llama-server = {
diff --git a/bin/rocmlibs/patches/rocblas.patch b/bin/rocmlibs/patches/rocblas.patch
@@ -1,18 +1,18 @@
 diff --git a/CMakeLists.txt b/CMakeLists.txt
-index f70de1cf..5eedfcc3 100644
+index 2cb3b303..ad442656 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
 @@ -112,7 +112,7 @@ if (NOT BUILD_ADDRESS_SANITIZER)
    set( TARGET_LIST_ROCM_5.6 "gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102")
    set( TARGET_LIST_ROCM_5.7 "gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102")
    set( TARGET_LIST_ROCM_6.0 "gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102")
 -  set( TARGET_LIST_ROCM_6.3 "gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201")
-+  set( TARGET_LIST_ROCM_6.3 "gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx1103;gfx90c")
++  set( TARGET_LIST_ROCM_6.3 "gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201;gfx1103;gfx90c")
  else()
    set( TARGET_LIST_ROCM_5.6 "gfx908:xnack+;gfx90a:xnack+")
    set( TARGET_LIST_ROCM_5.7 "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+")
 diff --git a/library/src/handle.cpp b/library/src/handle.cpp
-index ea1fa1a7..cb14874c 100644
+index ea1fa1a7..7d2a05c3 100644
 --- a/library/src/handle.cpp
 +++ b/library/src/handle.cpp
 @@ -106,6 +106,10 @@ static Processor getActiveArch(int deviceId)
@@ -26,19 +26,23 @@ index ea1fa1a7..cb14874c 100644
      else if(deviceString.find("gfx940") != std::string::npos)
      {
          return Processor::gfx940;
-@@ -146,6 +150,10 @@ static Processor getActiveArch(int deviceId)
+@@ -146,6 +150,14 @@ static Processor getActiveArch(int deviceId)
      {
          return Processor::gfx1102;
      }
 +    else if(deviceString.find("gfx1103") != std::string::npos)
 +    {
 +        return Processor::gfx1103;
++    }
++    else if(deviceString.find("gfx1150") != std::string::npos)
++    {
++        return Processor::gfx1150;
 +    }
      else if(deviceString.find("gfx1151") != std::string::npos)
      {
          return Processor::gfx1151;
 diff --git a/library/src/include/handle.hpp b/library/src/include/handle.hpp
-index 94d18c7b..c47cefe8 100644
+index 70844136..6532c78e 100644
 --- a/library/src/include/handle.hpp
 +++ b/library/src/include/handle.hpp
 @@ -82,6 +82,7 @@ enum class Processor : int
@@ -49,11 +53,12 @@ index 94d18c7b..c47cefe8 100644
      gfx940  = 940,
      gfx941  = 941,
      gfx942  = 942,
-@@ -96,6 +97,7 @@ enum class Processor : int
+@@ -96,6 +97,8 @@ enum class Processor : int
      gfx1100 = 1100,
      gfx1101 = 1101,
      gfx1102 = 1102,
 +    gfx1103 = 1103,
++    gfx1150 = 1150,
      gfx1151 = 1151,
      gfx1200 = 1200,
      gfx1201 = 1201
diff --git a/bin/rocmlibs/patches/rocprim.patch b/bin/rocmlibs/patches/rocprim.patch
@@ -1,78 +1,13 @@
 diff --git a/CMakeLists.txt b/CMakeLists.txt
-index a5b9b127..0153b24d 100644
+index 462a5928..6caba409 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
 @@ -99,7 +99,7 @@ if(NOT USE_HIP_CPU)
        )
      else()
        rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
 -        TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201"
-+        TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx90c;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1103;gfx1151;gfx1200;gfx1201"
++        TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
        )
      endif()
      
-diff --git a/rocprim/include/rocprim/device/config_types.hpp b/rocprim/include/rocprim/device/config_types.hpp
-index 58729b1d..22f7272a 100644
---- a/rocprim/include/rocprim/device/config_types.hpp
-+++ b/rocprim/include/rocprim/device/config_types.hpp
-@@ -169,9 +169,11 @@ enum class target_arch : unsigned int
-     gfx906  = 906,
-     gfx908  = 908,
-     gfx90a  = 910,
-+    gfx90c  = 912,
-     gfx1030 = 1030,
-     gfx1100 = 1100,
-     gfx1102 = 1102,
-+    gfx1103 = 1103,
-     unknown = std::numeric_limits<unsigned int>::max(),
- };
- #endif // DOXYGEN_SHOULD_SKIP_THIS
-@@ -204,16 +206,18 @@ constexpr bool prefix_equals(const char* lhs, const char* rhs, std::size_t n)
- constexpr target_arch get_target_arch_from_name(const char* const arch_name, const std::size_t n)
- {
-     constexpr const char* target_names[]
--        = {"gfx803", "gfx900", "gfx906", "gfx908", "gfx90a", "gfx1030", "gfx1100", "gfx1102"};
-+        = {"gfx803", "gfx900", "gfx906", "gfx908", "gfx90a", "gfx90c", "gfx1030", "gfx1100", "gfx1102", "gfx1103"};
-     constexpr target_arch target_architectures[] = {
-         target_arch::gfx803,
-         target_arch::gfx900,
-         target_arch::gfx906,
-         target_arch::gfx908,
-         target_arch::gfx90a,
-+        target_arch::gfx90c,
-         target_arch::gfx1030,
-         target_arch::gfx1100,
-         target_arch::gfx1102,
-+        target_arch::gfx1103,
-     };
-     static_assert(sizeof(target_names) / sizeof(target_names[0])
-                       == sizeof(target_architectures) / sizeof(target_architectures[0]),
-@@ -266,12 +270,16 @@ auto dispatch_target_arch(const target_arch target_arch)
-             return Config::template architecture_config<target_arch::gfx908>::params;
-         case target_arch::gfx90a:
-             return Config::template architecture_config<target_arch::gfx90a>::params;
-+        case target_arch::gfx90c:
-+            return Config::template architecture_config<target_arch::gfx90c>::params;
-         case target_arch::gfx1030:
-             return Config::template architecture_config<target_arch::gfx1030>::params;
-         case target_arch::gfx1100:
-             return Config::template architecture_config<target_arch::gfx1100>::params;
-         case target_arch::gfx1102:
-             return Config::template architecture_config<target_arch::gfx1102>::params;
-+        case target_arch::gfx1103:
-+            return Config::template architecture_config<target_arch::gfx1103>::params;
-         case target_arch::invalid:
-             assert(false && "Invalid target architecture selected at runtime.");
-     }
-diff --git a/scripts/autotune/create_optimization.py b/scripts/autotune/create_optimization.py
-index 130bdb3c..171de162 100755
---- a/scripts/autotune/create_optimization.py
-+++ b/scripts/autotune/create_optimization.py
-@@ -41,7 +41,7 @@ from collections import defaultdict
- from typing import Dict, List, Callable, Optional, Tuple
- from jinja2 import Environment, PackageLoader, select_autoescape
- 
--TARGET_ARCHITECTURES = ['gfx803', 'gfx900', 'gfx906', 'gfx908', 'gfx90a', 'gfx1030', 'gfx1100', 'gfx1102']
-+TARGET_ARCHITECTURES = ['gfx803', 'gfx900', 'gfx906', 'gfx908', 'gfx90a', 'gfx90c', 'gfx1030', 'gfx1100', 'gfx1102', 'gfx1103']
- # C++ typename used for optional types
- EMPTY_TYPENAME = "empty_type"
diff --git a/bin/rocmlibs/patches/tensile_aca95d17.patch b/bin/rocmlibs/patches/tensile_aca95d17.patch
@@ -1,8 +1,8 @@
 diff --git a/Tensile/AsmCaps.py b/Tensile/AsmCaps.py
-index 548b31f2..de4c2dd5 100644
+index 548b31f2..09adb3ef 100644
 --- a/Tensile/AsmCaps.py
 +++ b/Tensile/AsmCaps.py
-@@ -771,6 +771,50 @@ CACHED_ASM_CAPS = \
+@@ -771,6 +771,94 @@ CACHED_ASM_CAPS = \
                'v_mov_b64': False,
                'v_pk_fma_f16': True,
                'v_pk_fmac_f16': False},
@@ -49,23 +49,69 @@ index 548b31f2..de4c2dd5 100644
 +              'v_mad_mix_f32': False,
 +              'v_mov_b64': False,
 +              'v_pk_fma_f16': True,
++              'v_pk_fmac_f16': False},
++ (11, 5, 0): {'HasAddLshl': True,
++              'HasAtomicAdd': True,
++              'HasDirectToLdsDest': False,
++              'HasDirectToLdsNoDest': False,
++              'HasExplicitCO': True,
++              'HasExplicitNC': True,
++              'HasGLCModifier': True,
++              'HasNTModifier': False,
++              'HasLshlOr': True,
++              'HasMFMA': False,
++              'HasMFMA_b8': False,
++              'HasMFMA_bf16_1k': False,
++              'HasMFMA_bf16_original': False,
++              'HasMFMA_constSrc': False,
++              'HasMFMA_f64': False,
++              'HasMFMA_f8': False,
++              'HasMFMA_i8_908': False,
++              'HasMFMA_i8_940': False,
++              'HasMFMA_vgpr': False,
++              'HasMFMA_xf32': False,
++              'HasSMulHi': True,
++              'HasWMMA': True,
++              'KernargPreloading': False,
++              'MaxLgkmcnt': 15,
++              'MaxVmcnt': 63,
++              'SupportedISA': True,
++              'SupportedSource': True,
++              'VOP3v_dot4_i32_i8': False,
++              'v_dot2_f32_f16': True,
++              'v_dot2c_f32_f16': True,
++              'v_dot4_i32_i8': False,
++              'v_dot4c_i32_i8': False,
++              'v_fma_f16': True,
++              'v_fma_f32': True,
++              'v_fma_f64': True,
++              'v_fma_mix_f32': True,
++              'v_fmac_f16': False,
++              'v_fmac_f32': True,
++              'v_mac_f16': False,
++              'v_mac_f32': False,
++              'v_mad_mix_f32': False,
++              'v_mov_b64': False,
++              'v_pk_fma_f16': True,
 +              'v_pk_fmac_f16': False},
   (11, 5, 1): {'HasAddLshl': True,
                'HasAtomicAdd': True,
                'HasDirectToLdsDest': False,
 diff --git a/Tensile/Common.py b/Tensile/Common.py
-index 66f2caa2..4a77df59 100644
+index 66f2caa2..46c2d274 100644
 --- a/Tensile/Common.py
 +++ b/Tensile/Common.py
-@@ -253,7 +253,7 @@ globalParameters["SupportedISA"] = [(8,0,3),
+@@ -253,8 +253,8 @@ globalParameters["SupportedISA"] = [(8,0,3),
                                      (9,0,0), (9,0,6), (9,0,8), (9,0,10),
                                      (9,4,0), (9,4,1), (9,4,2),
                                      (10,1,0), (10,1,1), (10,1,2), (10,3,0), (10,3,1),
 -                                    (11,0,0), (11,0,1), (11,0,2),
+-                                    (11,5,1),
 +                                    (11,0,0), (11,0,1), (11,0,2), (11,0,3),
-                                     (11,5,1),
++                                    (11, 5, 0), (11,5,1),
                                      (12,0,0), (12,0,1)] # assembly kernels writer supports these architectures
  
+ globalParameters["CleanupBuildFiles"] = False                     # cleanup build files (e.g. kernel assembly) once no longer needed
 @@ -316,7 +316,7 @@ globalParameters["SeparateArchitectures"] = False # write Tensile library metada
  
  globalParameters["LazyLibraryLoading"] = False # Load library and code object files when needed instead of at startup
@@ -75,7 +121,7 @@ index 66f2caa2..4a77df59 100644
  
  globalParameters["ExperimentalLogicDir"] = "/experimental/"
  
-@@ -328,13 +328,13 @@ architectureMap = {
+@@ -328,13 +328,14 @@ architectureMap = {
    'all':'_','gfx000':'none', 'gfx803':'r9nano', 'gfx900':'vega10', 'gfx900:xnack-':'vega10',
    'gfx906':'vega20', 'gfx906:xnack+':'vega20', 'gfx906:xnack-':'vega20',
    'gfx908':'arcturus','gfx908:xnack+':'arcturus', 'gfx908:xnack-':'arcturus',
@@ -88,10 +134,11 @@ index 66f2caa2..4a77df59 100644
    'gfx1030':'navi21', 'gfx1031':'navi22', 'gfx1032':'navi23', 'gfx1034':'navi24', 'gfx1035':'rembrandt',
 -  'gfx1100':'navi31', 'gfx1101':'navi32', 'gfx1102':'navi33',
 +  'gfx1100':'navi31', 'gfx1101':'navi32', 'gfx1102':'navi33', 'gfx1103':'phoenix',
++  'gfx1150':'gfx1150',
    'gfx1151':'gfx1151',
    'gfx1200':'gfx1200',
    'gfx1201':'gfx1201'
-@@ -2461,7 +2461,7 @@ def assignGlobalParameters( config ):
+@@ -2461,7 +2462,7 @@ def assignGlobalParameters( config ):
      if os.name == "nt":
        globalParameters["CurrentISA"] = (9,0,6)
        printWarning("Failed to detect ISA so forcing (gfx906) on windows")
@@ -292,7 +339,7 @@ index 6e22a2c7..09345113 100644
      Build Tensile client executable; used for stand alone benchmarking (default).
  \-\-client-config 
 diff --git a/pytest.ini b/pytest.ini
-index 13c43039..23a53d35 100644
+index 13c43039..70ed7a3c 100644
 --- a/pytest.ini
 +++ b/pytest.ini
 @@ -92,6 +92,7 @@ markers =
@@ -317,9 +364,10 @@ index 13c43039..23a53d35 100644
   skip-gfx940:  architecture
   skip-gfx941:  architecture
   skip-gfx942:  architecture
-@@ -125,4 +128,5 @@ markers =
+@@ -125,4 +128,6 @@ markers =
   skip-gfx1100: architecture
   skip-gfx1101: architecture
   skip-gfx1102: architecture
 + skip-gfx1103: architecture
++ skip-gfx1150: architecture
   skip-gfx1151: architecture
diff --git a/bin/rocmlibs/test_powerinfer.sh b/bin/rocmlibs/test_powerinfer.sh
@@ -26,8 +26,8 @@ pushd $MODEL_DIR
 pushd $MODEL_DIR
 
 # Install huggingface-cli to download the PowerInfer GGUF models
-pip install -U "huggingface_hub[cli]"
-export PATH=$PATH:$HOME/.local/bin
+pipx install -U "huggingface_hub[cli]"
+export PATH=$PATH:$HOME/.local/share/pipx/venvs/huggingface-hub/bin
 
 # Use huggingface-cli to download the PowerInfer GGUF version of LLaMA(ReLU)-2-7B model
 huggingface-cli download --resume-download --local-dir ReluLLaMA-7B --local-dir-use-symlinks False PowerInfer/ReluLLaMA-7B-PowerInfer-GGUF

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`Tensile: tensile_aca95d17.patch`
	`2`	`+clr : clr.patch`
`2`	`3`	`rocBLAS: rocblas.patch`
`3`	`4`	`rocPRIM: rocprim.patch`
`4`	`5`	`rocSPARSE: rocsparse.patch`