microsoft · markwallace-microsoft · Aug 7, 2025 · Aug 5, 2025 · Aug 5, 2025 · Aug 5, 2025
@@ -59,6 +59,7 @@
     <PackageVersion Include="Microsoft.Identity.Client.Extensions.Msal" Version="4.74.1" />
     <PackageVersion Include="Microsoft.IdentityModel.JsonWebTokens" Version="8.13.0" />
     <PackageVersion Include="Microsoft.ML.OnnxRuntime" Version="1.22.1" />
+    <PackageVersion Include="Microsoft.ML.OnnxRuntime.Gpu" Version="1.22.1" />
     <PackageVersion Include="Microsoft.ML.Tokenizers.Data.Cl100kBase" Version="1.0.1" />
     <PackageVersion Include="Microsoft.SemanticKernel.Abstractions" Version="1.58.0" />
     <PackageVersion Include="Microsoft.SemanticKernel.Connectors.OpenAI" Version="1.58.0" />

@@ -41,6 +41,7 @@
     <Project Path="samples/Demos/ModelContextProtocolPluginAuth/ModelContextProtocolPluginAuth.csproj" />
     <Project Path="samples/Demos/OllamaFunctionCalling/OllamaFunctionCalling.csproj" />
     <Project Path="samples/Demos/OnnxSimpleRAG/OnnxSimpleRAG.csproj" />
+    <Project Path="samples/Demos/OnnxSimpleChatWithCuda/OnnxSimpleChatWithCuda.csproj" />
     <Project Path="samples/Demos/OpenAIRealtime/OpenAIRealtime.csproj" />
     <Project Path="samples/Demos/ProcessWithDapr/ProcessWithDapr.csproj" />
     <Project Path="samples/Demos/QualityCheck/QualityCheckWithFilters/QualityCheckWithFilters.csproj" />

@@ -0,0 +1,20 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net8.0</TargetFramework>
+    <NoWarn>$(NoWarn);CA2007,CA2208,CS1591,CA1024,IDE0009,IDE0055,IDE0073,IDE0211,VSTHRD111,SKEXP0001</NoWarn>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <!--
+      TODO: remove this WORKAROUND once the packages below work together.
+        The CUDA provider set up with Microsoft.ML.OnnxRuntimeGenAI.Cuda 0.8.3 + Microsoft.ML.OnnxRuntime.Gpu 1.22.1
+          - doesn't work with Microsoft.ML.OnnxRuntime 1.22.1
+          - works with Microsoft.ML.OnnxRuntime 1.22.0
+        Hence Microsoft.ML.OnnxRuntime is pinned to 1.22.0 for this project only, and the
+        resulting NU1605 (package downgrade) warning is suppressed on that reference.
+    -->
+    <PackageReference Include="Microsoft.ML.OnnxRuntime" VersionOverride="1.22.0" NoWarn="NU1605" />
+    <PackageReference Include="Microsoft.ML.OnnxRuntime.Gpu" />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\..\..\src\Connectors\Connectors.Onnx\Connectors.Onnx.csproj" />
+    <ProjectReference Include="..\..\..\src\SemanticKernel.Abstractions\SemanticKernel.Abstractions.csproj" />
+  </ItemGroup>
+
+</Project>
@@ -0,0 +1,48 @@
+using System;
+using System.Collections.Generic;
+using Microsoft.Extensions.AI;
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.Connectors.Onnx;
+
+// Path to the folder containing your downloaded ONNX CUDA model,
+// e.g. D:\repo\huggingface\Phi-3-mini-4k-instruct-onnx\cuda\cuda-int4-rtn-block-32
+string modelPath = "MODEL_PATH";
+
+// Register the ONNX Runtime GenAI chat client on the kernel builder.
+IKernelBuilder builder = Kernel.CreateBuilder();
+builder.AddOnnxRuntimeGenAIChatClient(
+    modelPath: modelPath,
+
+    // Specify the provider you want to use, e.g., "cuda" for GPU support
+    // For other execution providers, check: https://onnxruntime.ai/docs/genai/reference/config#provideroptions
+    providers: [new Provider("cuda")]
+);
+
+Kernel kernel = builder.Build();
+
+using IChatClient chatClient = kernel.GetRequiredService<IChatClient>();
+
+// The whole conversation so far; it is passed to the chat client on every request.
+List<ChatMessage> chatHistory = [];
+
+// Chat loop: runs until the user submits an empty line (or input ends).
+while (true)
+{
+    Console.Write("User > ");
+
+    // Console.ReadLine returns null when no more input is available (e.g. redirected
+    // input at end-of-stream), so the value is declared nullable and treated like an empty line.
+    string? userMessage = Console.ReadLine();
+    if (string.IsNullOrEmpty(userMessage))
+    {
+        break;
+    }
+
+    chatHistory.Add(new ChatMessage(ChatRole.User, userMessage));
+
+    try
+    {
+        ChatResponse result = await chatClient.GetResponseAsync(chatHistory, new() { MaxOutputTokens = 1024 });
+        Console.WriteLine($"Assistant > {result.Text}");
+
+        chatHistory.AddRange(result.Messages);
+    }
+    catch (Exception e)
+    {
+        Console.WriteLine(e.Message);
+
+        // The request failed, so the user message just added has no reply.
+        // Remove it so the next request does not carry a dangling, unanswered turn.
+        chatHistory.RemoveAt(chatHistory.Count - 1);
+    }
+}
@@ -0,0 +1,44 @@
+# Onnx Simple Chat with Cuda Execution Provider
+
+This sample demonstrates how to use the ONNX Connector with the CUDA Execution Provider to run local models straight from files using Semantic Kernel.
+
+In this example we set up a Chat Client from the ONNX Connector with [Microsoft's Phi-3-ONNX](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) model.
+
+> [!IMPORTANT]
+> You can modify the sample to use any other model enabled for the ONNX runtime.
+
+## Semantic Kernel Features Used
+
+- [Chat Client](https://learn.microsoft.com/dotnet/api/microsoft.extensions.ai.ichatclient) - Using the `IChatClient` registered by the [Onnx Connector](https://github.com/microsoft/semantic-kernel/tree/main/dotnet/src/Connectors/Connectors.Onnx) to generate responses from the local model.
+
+## Prerequisites
+
+- [.NET 8](https://dotnet.microsoft.com/download/dotnet/8.0).
+- [NVIDIA GPU](https://www.nvidia.com/en-us/geforce/graphics-cards)
+- [NVIDIA CUDA v12 Toolkit](https://developer.nvidia.com/cuda-12-0-0-download-archive)
+- [NVIDIA cuDNN v9.11](https://developer.nvidia.com/cudnn-9-11-0-download-archive)
+- Windows users only:
+
+  Ensure the `PATH` environment variable includes the `bin` folders of the CUDA Toolkit and cuDNN, e.g.:
+    - `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.0\bin`
+    - `C:\Program Files\NVIDIA\CUDNN\v9.11\bin\12.9`
+
+- Downloaded ONNX Models (see below).
+
+## Downloading the Model
+
+For this example we chose Hugging Face as the repository from which to download the local models. Go to the directory where the models should be downloaded and run the following commands:
+
+```powershell
+git lfs install
+git clone https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx
+```
+
+Update the line below in `Program.cs` with the path to the model you downloaded in the previous step.
+
+```csharp
+// i.e. Running on Windows
+string modelPath = "D:\\repo\\huggingface\\Phi-3-mini-4k-instruct-onnx\\cuda\\cuda-int4-rtn-block-32";
+```
+
@@ -1,9 +1,12 @@
 // Copyright (c) Microsoft. All rights reserved.
 
+using System.Collections.Generic;
 using System.Linq;
 using Microsoft.Extensions.AI;
 using Microsoft.Extensions.DependencyInjection;
+using Microsoft.ML.OnnxRuntimeGenAI;
 using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.Connectors.Onnx;
 using Xunit;
 
 namespace SemanticKernel.Connectors.Onnx.UnitTests;
@@ -74,4 +77,75 @@ public void AddOnnxRuntimeGenAIChatClientToKernelBuilderWithServiceId()
         Assert.NotNull(serviceDescriptor);
         Assert.Equal(ServiceLifetime.Singleton, serviceDescriptor.Lifetime);
     }
+
+    [Fact]
+    public void AddOnnxRuntimeGenAIChatClientWithProvidersToServiceCollection()
+    {
+        // Arrange: an empty service collection and two execution providers.
+        ServiceCollection services = new();
+        List<Provider> executionProviders = new() { new Provider("cuda"), new Provider("cpu") };
+
+        // Act: register the chat client directly on the service collection.
+        services.AddOnnxRuntimeGenAIChatClient("modelPath", executionProviders);
+
+        // Assert: an IChatClient descriptor exists, is factory-based and has singleton lifetime.
+        var descriptor = services.FirstOrDefault(d => d.ServiceType == typeof(IChatClient));
+        Assert.NotNull(descriptor);
+        Assert.NotNull(descriptor.ImplementationFactory);
+        Assert.Equal(ServiceLifetime.Singleton, descriptor.Lifetime);
+    }
+
+    [Fact]
+    public void AddOnnxRuntimeGenAIChatClientWithProvidersToKernelBuilder()
+    {
+        // Arrange: a kernel builder backed by a fresh service collection, plus two execution providers.
+        ServiceCollection services = new();
+        IKernelBuilder kernelBuilder = services.AddKernel();
+        List<Provider> executionProviders = new() { new Provider("cuda"), new Provider("cpu") };
+
+        // Act: register the chat client through the kernel builder.
+        kernelBuilder.AddOnnxRuntimeGenAIChatClient("modelPath", executionProviders);
+
+        // Assert: the registration landed in the underlying collection as a factory-based singleton.
+        var descriptor = services.FirstOrDefault(d => d.ServiceType == typeof(IChatClient));
+        Assert.NotNull(descriptor);
+        Assert.NotNull(descriptor.ImplementationFactory);
+        Assert.Equal(ServiceLifetime.Singleton, descriptor.Lifetime);
+    }
+
+    [Fact]
+    public void AddOnnxRuntimeGenAIChatClientWithProvidersAndServiceIdToServiceCollection()
+    {
+        // Arrange
+        var collection = new ServiceCollection();
+        var providers = new List<Provider> { new("cuda") };
+
+        // Act
+        collection.AddOnnxRuntimeGenAIChatClient("modelPath", providers, serviceId: "test-service");
+
+        // The built provider is disposable; dispose it when the test ends so nothing is leaked.
+        using var serviceProvider = collection.BuildServiceProvider();
+
+        // Assert
+        // Resolving the keyed client is expected to throw, with a message that mentions
+        // genai_config.json (presumably because "modelPath" is not a real model folder).
+        var exception = Assert.Throws<OnnxRuntimeGenAIException>(() => serviceProvider.GetRequiredKeyedService<IChatClient>("test-service"));
+
+        Assert.Contains("genai_config.json", exception.Message);
+    }
+
+    [Fact]
+    public void AddOnnxRuntimeGenAIChatClientWithProvidersAndServiceIdToKernelBuilder()
+    {
+        // Arrange
+        var collection = new ServiceCollection();
+        var kernelBuilder = collection.AddKernel();
+        var providers = new List<Provider> { new("cuda") };
+
+        // Act
+        kernelBuilder.AddOnnxRuntimeGenAIChatClient("modelPath", providers, serviceId: "test-service");
+
+        // The built provider is disposable; dispose it when the test ends so nothing is leaked.
+        using var serviceProvider = collection.BuildServiceProvider();
+
+        // Assert
+        // Resolving the client by service id through the kernel is expected to throw, with a message
+        // that mentions genai_config.json (presumably because "modelPath" is not a real model folder).
+        var kernel = serviceProvider.GetRequiredService<Kernel>();
+        var exception = Assert.Throws<OnnxRuntimeGenAIException>(() => kernel.GetRequiredService<IChatClient>("test-service"));
+
+        Assert.Contains("genai_config.json", exception.Message);
+    }
 }
@@ -1,6 +1,10 @@
 // Copyright (c) Microsoft. All rights reserved.
 
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.Extensions.AI;
 using Microsoft.Extensions.DependencyInjection;
+using Microsoft.ML.OnnxRuntimeGenAI;
 using Microsoft.SemanticKernel;
 using Microsoft.SemanticKernel.ChatCompletion;
 using Microsoft.SemanticKernel.Connectors.Onnx;
@@ -46,4 +50,76 @@ public void AddOnnxRuntimeGenAIChatCompletionToKernelBuilder()
         Assert.NotNull(service);
         Assert.IsType<OnnxRuntimeGenAIChatCompletionService>(service);
     }
+
+    [Fact]
+    public void AddOnnxRuntimeGenAIChatCompletionWithProvidersToServiceCollection()
+    {
+        // Arrange: an empty service collection and two execution providers.
+        ServiceCollection services = new();
+        List<Provider> executionProviders = new() { new Provider("cuda"), new Provider("cpu") };
+
+        // Act: register the chat completion service directly on the service collection.
+        services.AddOnnxRuntimeGenAIChatCompletion("modelId", "modelPath", executionProviders);
+
+        // Assert: an IChatCompletionService descriptor exists, is factory-based and has singleton lifetime.
+        var descriptor = services.FirstOrDefault(d => d.ServiceType == typeof(IChatCompletionService));
+        Assert.NotNull(descriptor);
+        Assert.NotNull(descriptor.ImplementationFactory);
+        Assert.Equal(ServiceLifetime.Singleton, descriptor.Lifetime);
+    }
+
+    [Fact]
+    public void AddOnnxRuntimeGenAIChatCompletionWithProvidersToKernelBuilder()
+    {
+        // Arrange: a kernel builder backed by a fresh service collection, plus two execution providers.
+        ServiceCollection services = new();
+        IKernelBuilder kernelBuilder = services.AddKernel();
+        List<Provider> executionProviders = new() { new Provider("cuda"), new Provider("cpu") };
+
+        // Act: register the chat completion service through the kernel builder.
+        kernelBuilder.AddOnnxRuntimeGenAIChatCompletion("modelId", "modelPath", executionProviders);
+
+        // Assert: the registration landed in the underlying collection as a factory-based singleton.
+        var descriptor = services.FirstOrDefault(d => d.ServiceType == typeof(IChatCompletionService));
+        Assert.NotNull(descriptor);
+        Assert.NotNull(descriptor.ImplementationFactory);
+        Assert.Equal(ServiceLifetime.Singleton, descriptor.Lifetime);
+    }
+
+    [Fact]
+    public void AddOnnxRuntimeGenAIChatCompletionWithProvidersAndServiceIdToServiceCollection()
+    {
+        // Arrange
+        var collection = new ServiceCollection();
+        var providers = new List<Provider> { new("cuda") };
+
+        // Act
+        collection.AddOnnxRuntimeGenAIChatCompletion("modelId", "modelPath", providers, serviceId: "test-service");
+
+        // The built provider is disposable; dispose it when the test ends so nothing is leaked.
+        using var serviceProvider = collection.BuildServiceProvider();
+
+        // Assert
+        // Resolving the keyed service is expected to throw, with a message that mentions
+        // genai_config.json (presumably because "modelPath" is not a real model folder).
+        var exception = Assert.Throws<OnnxRuntimeGenAIException>(() => serviceProvider.GetRequiredKeyedService<IChatCompletionService>("test-service"));
+
+        Assert.Contains("genai_config.json", exception.Message);
+    }
+
+    [Fact]
+    public void AddOnnxRuntimeGenAIChatCompletionWithProvidersAndServiceIdToKernelBuilder()
+    {
+        // Arrange
+        var collection = new ServiceCollection();
+        var kernelBuilder = collection.AddKernel();
+        var providers = new List<Provider> { new("cuda") };
+
+        // Act
+        kernelBuilder.AddOnnxRuntimeGenAIChatCompletion("modelId", "modelPath", providers, serviceId: "test-service");
+
+        // The built provider is disposable; dispose it when the test ends so nothing is leaked.
+        using var serviceProvider = collection.BuildServiceProvider();
+
+        // Assert
+        // Resolving the service by id through the kernel is expected to throw, with a message that
+        // mentions genai_config.json (presumably because "modelPath" is not a real model folder).
+        var kernel = serviceProvider.GetRequiredService<Kernel>();
+        var exception = Assert.Throws<OnnxRuntimeGenAIException>(() => kernel.GetRequiredService<IChatCompletionService>("test-service"));
+
+        Assert.Contains("genai_config.json", exception.Message);
+    }
 }