Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
784210d
Add cuda test
sunnycase Sep 25, 2025
9df6355
Initial cuda support
sunnycase Sep 26, 2025
df859d9
Add cuda target
sunnycase Sep 28, 2025
a515486
Add cuda runtime module
sunnycase Oct 11, 2025
5dac66e
Update
sunnycase Oct 13, 2025
2570775
Add warp hierarchy
sunnycase Dec 12, 2025
f8ab72e
Remove trailing whitespace
sunnycase Jun 25, 2026
11bc83a
Apply code-format changes
sunnycase Jun 25, 2026
8961ef0
Gate CUDA runtime build
sunnycase Jun 25, 2026
926ca8b
Fix half fallback type
sunnycase Jun 25, 2026
d603370
Exclude CUDA tests from compiler CI
sunnycase Jun 25, 2026
f68bbc2
Align CPU runtime thread entry ABI
sunnycase Jun 25, 2026
114276c
Fix scoped local rdata serialization
sunnycase Jun 25, 2026
b010247
Fix compiler CI profiling and test result permissions
sunnycase Jun 29, 2026
8577f98
Fix macOS NTT gencode CI failure
sunnycase Jun 29, 2026
8f5b850
Skip uncached HuggingFace tests in PR CI
sunnycase Jun 29, 2026
bd57399
Fix NTT CI test target handling
sunnycase Jun 29, 2026
8e86755
Stabilize ONNX ReduceL1 CI test
sunnycase Jun 29, 2026
4ccc168
Fix NTT nested function local buffer ABI
sunnycase Jun 30, 2026
f0dcc31
Keep device function calls on operator ABI
sunnycase Jun 30, 2026
9f91b69
Normalize reduce axes in IR construction
sunnycase Jun 30, 2026
bc13b56
Make half operations callable from CUDA device code
sunnycase Jun 30, 2026
356735a
Normalize reduce axes in NTT vectorization
sunnycase Jun 30, 2026
1677f37
Find CUDA toolkit when enabling CUDA runtime
sunnycase Jun 30, 2026
49f5205
Add CUDA kernel CI job
sunnycase Jul 1, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 116 additions & 2 deletions .github/workflows/compiler-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

on: [push, pull_request]

# Explicit GITHUB_TOKEN scopes for this workflow: repository contents are
# read-only, while checks and pull requests are writable.
permissions:
contents: read
# NOTE(review): presumably required so test results can be published as check runs — confirm.
checks: write
# NOTE(review): presumably required so result summaries can be posted on pull requests — confirm.
pull-requests: write

concurrency:
group: compiler-build-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/master' && !startsWith(github.ref, 'refs/heads/release/') }}
Expand Down Expand Up @@ -138,7 +143,7 @@ jobs:
working-directory: ${{github.workspace}}
run: |
dotnet tool install --global dotnet-coverage
dotnet-coverage collect -s tools/dotnet_coverage.settings.xml -f cobertura -o coverage/unit.xml "dotnet test -c ${{matrix.config.buildType}} -s test.runsettings --no-build --verbosity normal --blame"
dotnet-coverage collect -s tools/dotnet_coverage.settings.xml -f cobertura -o coverage/unit.xml "dotnet test -c ${{matrix.config.buildType}} -s test.runsettings --no-build --verbosity normal --filter FullyQualifiedName!~Nncase.Tests.TargetTest.UnitTestCUDAKernels --blame"
dotnet-coverage merge -o coverage.unit.xml -f cobertura -r coverage/*.xml

- name: Upload Coverage
Expand All @@ -156,6 +161,111 @@ jobs:
path: ${{github.workspace}}/src/Nncase.Compiler/bin/${{matrix.config.buildType}}/net${{matrix.dotnet-version}}/${{matrix.config.rid}}/publish
if-no-files-found: error

# CUDA kernel test job: builds the native runtime with CUDA enabled and runs
# only the UnitTestCUDAKernels tests on a self-hosted runner labelled `cuda`.
test-cuda-kernels:
name: test-x86_64-linux-cuda
runs-on: [self-hosted, linux, x64, cuda]
timeout-minutes: 120

# Job-wide settings. NNCASE_CUDA_ARCHITECTURES is forwarded to
# CMAKE_CUDA_ARCHITECTURES in the "Build CUDA native runtime" step.
env:
BUILD_TYPE: Release
DOTNET_VERSION: '8.0'
NNCASE_CUDA_ARCHITECTURES: '120'

steps:
# NOTE(review): checkout is on v3 while the other first-party actions in this
# job use v4/v5, and gha-setup-ninja follows the moving `master` branch rather
# than a fixed tag — consider aligning/pinning.
- uses: actions/checkout@v3
- uses: seanmiddleditch/gha-setup-ninja@master

- name: Setup .NET
uses: actions/setup-dotnet@v4
with:
dotnet-version: ${{env.DOTNET_VERSION}}

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'

# Resolves the CUDA toolkit root in this order: $CUDA_HOME, $CUDA_PATH,
# /usr/local/cuda-12.8 (if that directory exists), then /usr/local/cuda.
# The step fails if nvcc is not executable under that root. It then exports
# the toolkit location, clang as the C/C++/CUDA compiler, and the library
# search path to all later steps through GITHUB_ENV / GITHUB_PATH.
- name: Set up CUDA build environment
shell: bash
run: |
CUDA_ROOT="${CUDA_HOME:-${CUDA_PATH:-}}"
if [ -z "$CUDA_ROOT" ]; then
if [ -d /usr/local/cuda-12.8 ]; then
CUDA_ROOT=/usr/local/cuda-12.8
else
CUDA_ROOT=/usr/local/cuda
fi
fi
test -x "$CUDA_ROOT/bin/nvcc"
nvidia-smi
"$CUDA_ROOT/bin/nvcc" --version
clang++ --version
echo "CUDA_HOME=$CUDA_ROOT" >> $GITHUB_ENV
echo "CUDA_PATH=$CUDA_ROOT" >> $GITHUB_ENV
echo "CUDAToolkit_ROOT=$CUDA_ROOT" >> $GITHUB_ENV
echo "CMAKE_CUDA_COMPILER=clang++" >> $GITHUB_ENV
echo "CC=clang" >> $GITHUB_ENV
echo "CXX=clang++" >> $GITHUB_ENV
echo "$CUDA_ROOT/bin" >> $GITHUB_PATH
echo "LD_LIBRARY_PATH=$GITHUB_WORKSPACE/install/lib:$CUDA_ROOT/targets/x86_64-linux/lib:$CUDA_ROOT/lib64:${LD_LIBRARY_PATH:-}" >> $GITHUB_ENV

# Installs pinned Conan and CMake versions with pip and configures the Conan
# remotes (the sunnycase remote is inserted at index 0).
- name: Install Conan
shell: bash
run: |
pip install conan==2.6.0
pip install cmake==3.30.3
conan remote add sunnycase https://conan.sunnycase.moe --index 0 --force
conan remote update conancenter --url "https://center2.conan.io"

# Installs dependencies with the x86_64-linux-cuda Conan profile, configures
# with clang++ as the CUDA compiler and ENABLE_CUDA_RUNTIME=ON, then builds
# and installs into ./install (the directory placed first on LD_LIBRARY_PATH).
- name: Build CUDA native runtime
shell: bash
run: |
conan install . --build=missing -s build_type=$BUILD_TYPE -pr:a=toolchains/x86_64-linux-cuda.profile.jinja -o "&:runtime=False" -o "&:python=True" -o "&:tests=False"
cmake --preset conan-release \
-DCMAKE_CUDA_COMPILER=clang++ \
-DCMAKE_CUDA_ARCHITECTURES=$NNCASE_CUDA_ARCHITECTURES \
-DCUDAToolkit_ROOT="$CUDAToolkit_ROOT" \
-DENABLE_CUDA_RUNTIME=ON
cmake --build build/$BUILD_TYPE --config $BUILD_TYPE --parallel
cmake --install build/$BUILD_TYPE --prefix install

# Only the test project is built; the test step below relies on this via --no-build.
- name: Build .NET tests
shell: bash
run: |
dotnet restore -r linux-x64
dotnet build src/Nncase.Tests/Nncase.Tests.csproj -c $BUILD_TYPE --no-restore

# Writes test.runsettings with an unquoted heredoc, so $GITHUB_WORKSPACE and
# $CUDA_HOME are expanded by the shell when the file is generated.
- name: Set up Dotnet Test settings
shell: bash
run: |
cat > test.runsettings <<EOF
<?xml version="1.0" encoding="utf-8"?>
<RunSettings>
<RunConfiguration>
<EnvironmentVariables>
<LD_LIBRARY_PATH>$GITHUB_WORKSPACE/install/lib:$CUDA_HOME/targets/x86_64-linux/lib:$CUDA_HOME/lib64</LD_LIBRARY_PATH>
<NNCASE_TILING_MAX_SOLUTIONS>1</NNCASE_TILING_MAX_SOLUTIONS>
</EnvironmentVariables>
</RunConfiguration>
</RunSettings>
EOF

# Runs only tests whose fully qualified name contains
# Nncase.Tests.TargetTest.UnitTestCUDAKernels and records a TRX log.
- name: Dotnet Test CUDA Kernels
working-directory: ${{github.workspace}}
shell: bash
run: |
dotnet test src/Nncase.Tests/Nncase.Tests.csproj -c $BUILD_TYPE -s test.runsettings --no-build --verbosity normal --filter "FullyQualifiedName~Nncase.Tests.TargetTest.UnitTestCUDAKernels" --blame --logger "trx;LogFileName=cuda-kernels.trx"

# Uploads TRX and --blame sequence files even when earlier steps failed;
# a missing result file is not treated as an error.
- name: Upload CUDA Test Results
uses: actions/upload-artifact@v4
if: always()
with:
name: nncase-cuda-test-results
path: |
**/TestResults/*.trx
**/TestResults/*_Sequence.xml
if-no-files-found: ignore

test-compiler:
needs: [build-compiler]
name: test-${{matrix.config.name}}
Expand Down Expand Up @@ -241,7 +351,11 @@ jobs:
run: |
dotnet tool install --global dotnet-coverage
dotnet-coverage collect -s tools/dotnet_coverage.settings.xml -f cobertura -o coverage/llm_ffi.xml pytest tests/other/ --doctest-modules --junitxml=test_results/llm_ffi.xml
dotnet-coverage collect -s tools/dotnet_coverage.settings.xml -f cobertura -o coverage/llm_huggingface.xml pytest tests/importer/huggingface_/ --doctest-modules --junitxml=test_results/llm_huggingface.xml
if [ -n "${HF_HOME:-}" ]; then
dotnet-coverage collect -s tools/dotnet_coverage.settings.xml -f cobertura -o coverage/llm_huggingface.xml pytest tests/importer/huggingface_/ --doctest-modules --junitxml=test_results/llm_huggingface.xml
else
echo "Skipping HuggingFace importer tests because HF_HOME is not configured."
fi

- name: Test
working-directory: ${{github.workspace}}
Expand Down
13 changes: 13 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,19 @@ option(BUILD_TESTING "Build test programs" OFF)
option(ENABLE_OP_PROFILE "Profile ops cast time" OFF)
option(ENABLE_DUMP_MANAGER "Enable dump manager" OFF)
option(ENABLE_DUMP_MEM "Dump mem usage" OFF)
option(ENABLE_CUDA_RUNTIME "Enable CUDA runtime" OFF)

# Explicitly selecting a CUDA compiler (e.g. -DCMAKE_CUDA_COMPILER=clang++)
# switches the CUDA runtime on, overriding the cached option value.
if(DEFINED CMAKE_CUDA_COMPILER AND NOT "${CMAKE_CUDA_COMPILER}" STREQUAL "")
    set(ENABLE_CUDA_RUNTIME ON CACHE BOOL "Enable CUDA runtime" FORCE)
endif()

if(ENABLE_CUDA_RUNTIME)
    # Fall back to architecture 120 only when the caller expressed no preference.
    # CMake initializes CMAKE_CUDA_ARCHITECTURES from the CUDAARCHS environment
    # variable during enable_language(CUDA), but only if the variable is not
    # already set, so the fallback must not be applied when CUDAARCHS is provided.
    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND "$ENV{CUDAARCHS}" STREQUAL "")
        set(CMAKE_CUDA_ARCHITECTURES 120)
    endif()
    enable_language(CUDA)
    find_package(CUDAToolkit REQUIRED)
endif()

if (BUILDING_RUNTIME)
# option(ENABLE_VULKAN_RUNTIME "Enable Vulkan runtime" OFF)
Expand Down
7 changes: 6 additions & 1 deletion cmake/compile_flags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ if (MSVC)
set(PYBIND11_CPP_STANDARD "/std:c++latest")
else()
add_compile_options(-fvisibility=hidden)
add_compile_options(-Wall -Wextra -pedantic -Werror -Wno-multichar -Wno-missing-field-initializers -Wno-unused-function -Wno-type-limits -Wno-unused-local-typedefs -Wno-sign-compare)
add_compile_options(-Wall -Wextra -Wno-missing-field-initializers -Wno-unused-function -Wno-type-limits -Wno-unused-local-typedefs -Wno-sign-compare)
if (APPLE)
add_compile_options(-Wno-four-char-constants -Wno-sometimes-uninitialized -Wno-deprecated -Wno-braced-scalar-init)
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
Expand All @@ -15,6 +15,11 @@ else()
endif()
endif()

# Emit a configure-time status line whenever a CUDA compiler has been selected.
if (CMAKE_CUDA_COMPILER)
message(STATUS "Configuring for CUDA")
# Disabled debugging aid; NOTE(review): -save-temps presumably keeps the
# compiler's intermediate files for inspection — confirm before re-enabling.
#add_compile_options(-save-temps)
endif()

if(${CMAKE_SYSTEM_PROCESSOR} MATCHES
"(x86)|(X86)|(amd64)|(AMD64)|(x86_64)|(X86_64)")
if (MSVC)
Expand Down
5 changes: 5 additions & 0 deletions conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class nncaseConan(ConanFile):
"k230_runtime": [True, False],
"k80_runtime": [True, False],
"vulkan_runtime": [True, False],
"cuda_runtime": [True, False],
"tests": [True, False],
"python": [True, False],
"python_root": ["ANY"]
Expand All @@ -40,6 +41,7 @@ class nncaseConan(ConanFile):
"k230_runtime": False,
"k80_runtime": False,
"vulkan_runtime": False,
"cuda_runtime": False,
"tests": False,
"python": True,
"python_root": ""
Expand Down Expand Up @@ -88,8 +90,11 @@ def generate(self):
tc.variables['ENABLE_K230_RUNTIME'] = self.options.k230_runtime
tc.variables['ENABLE_K80_RUNTIME'] = self.options.k80_runtime
tc.variables['ENABLE_VULKAN_RUNTIME'] = self.options.vulkan_runtime
tc.variables['ENABLE_CUDA_RUNTIME'] = self.options.cuda_runtime
tc.variables['BUILD_PYTHON_BINDING'] = self.options.python
tc.variables['BUILD_TESTING'] = self.options.tests
if self.options.cuda_runtime:
tc.variables['CMAKE_CUDA_ARCHITECTURES'] = "120"
if self.options.get_safe("python_root", default="") != "":
tc.variables['Python3_ROOT_DIR'] = self.options.python_root
if self.options.runtime:
Expand Down
8 changes: 4 additions & 4 deletions modules/Nncase.Modules.NTT/CodeGen/CPU/CSourceBuiltn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -80,16 +80,16 @@ public static string TopoAwareRuntimeDef(NTTTargetOptions options, ulong dataAli
return content;
}

/// <summary>
/// Renders the "module_topology_def.h" Razor template and returns the generated text.
/// </summary>
/// <remarks>
/// This span is a rendered diff: the first signature/body line is the previous version
/// (whole <c>options</c> object as the model), the second is the new version, which passes
/// an anonymous model holding <c>options.Hierarchies[0]</c> and the <c>isCUDA</c> flag.
/// </remarks>
/// <param name="options">NTT target options; the new version reads only <c>Hierarchies[0]</c>.</param>
/// <param name="isCUDA">Forwarded to the template as <c>IsCUDA</c> (new version only).</param>
/// <returns>The rendered template content.</returns>
public static string ModuleTopologyDef(NTTTargetOptions options)
public static string ModuleTopologyDef(NTTTargetOptions options, bool isCUDA)
{
// NOTE(review): .Result blocks the calling thread until rendering completes.
var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/module_topology_def.h.cshtml", options).Result;
// NOTE(review): indexing Hierarchies[0] assumes at least one hierarchy is configured — confirm against callers.
var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/module_topology_def.h.cshtml", new { Hierarchies = options.Hierarchies[0], IsCUDA = isCUDA }).Result;
return content;
}

/// <summary>
/// Renders the "CMakeLists.txt" Razor template and returns the generated text.
/// </summary>
/// <remarks>
/// This span is a rendered diff: the first signature/render line is the previous version,
/// the second is the new version, which additionally passes <c>IsCUDA</c> to the template.
/// </remarks>
/// <param name="isCUDA">Forwarded to the template as <c>IsCUDA</c> (new version only).</param>
/// <returns>The rendered template content.</returns>
public static string CMakeDef()
public static string CMakeDef(bool isCUDA)
{
// Path of Runtime/cmake/ntt_module.cmake, resolved relative to the directory of this assembly
// and passed through the CMakePath helper.
var cmakePath = CMakePath(Path.Combine(Path.GetDirectoryName(typeof(CSourceBuiltn).Assembly.Location)!, "Runtime", "cmake", "ntt_module.cmake"));
// NOTE(review): .Result blocks the calling thread until rendering completes.
var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/CMakeLists.txt.cshtml", new { CMakePath = cmakePath }).Result;
var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/CMakeLists.txt.cshtml", new { CMakePath = cmakePath, IsCUDA = isCUDA }).Result;
return content;
}

Expand Down
17 changes: 14 additions & 3 deletions modules/Nncase.Modules.NTT/CodeGen/CPU/CSourceCompiler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ public class CSourceCompiler
{
private static string? _vcVarPath;

private readonly bool _isCUDA;

/// <summary>
/// compiler exe name.
/// </summary>
Expand All @@ -37,8 +39,9 @@ public class CSourceCompiler
/// </summary>
private string _ext = string.Empty;

/// <summary>
/// Initializes a new instance of the <see cref="CSourceCompiler"/> class.
/// </summary>
/// <remarks>
/// This span is a rendered diff: the parameterless signature is the previous version,
/// the <c>bool isCUDA</c> signature is the new one.
/// </remarks>
/// <param name="isCUDA">Stored in <c>_isCUDA</c> before the platform/arch setup runs (new version only).</param>
public CSourceCompiler()
public CSourceCompiler(bool isCUDA)
{
// The flag is assigned first so it is already set when the two setup helpers below execute.
_isCUDA = isCUDA;
PlatformSpecific();
ArchSpecific();
}
Expand Down Expand Up @@ -186,8 +189,16 @@ private void ArchSpecific()

private string ArgumentsSpecific(string sourcePath, string outPath)
{
var archConfig = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ?
"-DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl" : string.Empty;
string archConfig = string.Empty;
if (_isCUDA)
{
archConfig = $"-DCMAKE_CUDA_ARCHITECTURES=120 -DCMAKE_CUDA_COMPILER=clang++";
}
else
{
archConfig = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ?
"-DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl" : string.Empty;
}

#if DEBUG
var config = "Release";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public static void WriteWithProfiler(string functionName, string tagName = "")
IndentScope.Writer.IndWrite("{\n");
#if false // Disable device profiling for now.
IndentScope.Writer.Write($"constexpr std::string_view function_name = \"{tagName}\";\n");
IndentScope.Writer.Write($"auto_profiler profiler(function_name, runtime::profiling_level::device);\n");
IndentScope.Writer.Write($"profile_scope profiler(function_name, profile_level::device);\n");
#endif
IndentScope.Writer.Write($"{functionName};\n");
IndentScope.Writer.IndWrite("}\n");
Expand All @@ -69,7 +69,7 @@ public static void WriteIndWithProfiler(string functionName, string tagName = ""
IndentScope.Writer.IndWrite("{\n");
#if false // Disable device profiling for now.
IndentScope.Writer.IndWrite($"constexpr std::string_view function_name = \"{tagName}\";\n");
IndentScope.Writer.IndWrite($"auto_profiler profiler(function_name, runtime::profiling_level::device);\n");
IndentScope.Writer.IndWrite($"profile_scope profiler(function_name, profile_level::device);\n");
#endif
IndentScope.Writer.IndWrite($"{functionName};\n");
IndentScope.Writer.IndWrite("}\n");
Expand All @@ -94,7 +94,7 @@ protected override CSymbol VisitPrimFunction(PrimFunction expr)
}

var ctype = $"template<{string.Join(", ", Enumerable.Range(0, expr.Parameters.Length).Select(x => $"class T{x}"))}>" + Environment.NewLine +
$"void {expr.Name}({string.Join(", ", expr.Parameters.AsValueEnumerable().Select(Visit).Select((s, i) => $"T{i} &&{s.Name}").ToArray())})";
$"NTT_DEVICE void {expr.Name}({string.Join(", ", expr.Parameters.AsValueEnumerable().Select(Visit).Select((s, i) => $"T{i} &&{s.Name}").ToArray())})";

using (var scope = new IndentScope(_deviceBuilder))
{
Expand Down Expand Up @@ -192,7 +192,7 @@ protected override CSymbol VisitPhysicalBuffer(PhysicalBuffer expr)
_ => throw new NotSupportedException(expr.Location.ToString()),
};

var str = $"std::span<std::byte, {size.Name}>({name} + {start.Name}, {size.Name})";
var str = $"ntt::span<std::byte, {size.Name}>({name} + {start.Name}, {size.Name})";
symbol = new(start.Type, str);
_exprMemo.Add(expr, symbol);
return symbol;
Expand Down
Loading
Loading