Commit 9931d0e

Mtmd Implementation base
1 parent de00c15 commit 9931d0e


41 files changed: +2832 -1067 lines changed

LLama.Examples/ExampleRunner.cs

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@ public class ExampleRunner
        { "Chat Session: Automatic conversation", TalkToYourself.Run },
        { "Chat Session: Chinese characters", ChatChineseGB2312.Run },
        { "Executor: Interactive mode chat", InteractiveModeExecute.Run },
-       { "Executor: Llava Interactive mode chat", LlavaInteractiveModeExecute.Run },
+       { "Executor: Mtmd Interactive mode chat", MtmdInteractiveModeExecute.Run },
        { "Executor: Instruct mode chat", InstructModeExecute.Run },
        { "Executor: Stateless mode chat", StatelessModeExecute.Run },
        { "Save and Load: chat session", SaveAndLoadSession.Run },
@@ -33,7 +33,7 @@ public class ExampleRunner
        { "Batched Executor: Save/Load", BatchedExecutorSaveAndLoad.Run },
        { "Batched Executor: Fork", BatchedExecutorFork.Run },
        { "Batched Executor: Rewind", BatchedExecutorRewind.Run },
-       { "Batched Executor: LLava", BatchedExecutorLLava.Run },
+       { "Batched Executor: Mtmd", BatchedExecutorMtmd.Run },
        { "Batched Executor: BoolQ Benchmark", BatchedExecutorBoolQ.Run },
        { "Batched Executor: Beam Search", BatchedExecutorBeamSearch.Run },
        { "Custom Sampling Pipeline", CustomSampler.Run },

LLama.Examples/Examples/BatchedExecutorLLava.cs

Lines changed: 0 additions & 91 deletions
This file was deleted.
LLama.Examples/Examples/BatchedExecutorMtmd.cs

Lines changed: 126 additions & 0 deletions
This file was added.
@@ -0,0 +1,126 @@
using System;
using System.Collections.Generic;
using System.IO;
using LLama.Batched;
using LLama.Common;
using LLama.Exceptions;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// Demonstrates how to evaluate an image with MTMD helpers and continue generation by
/// manually scheduling batches, similar to what the batched executor does internally.
/// </summary>
public class BatchedExecutorMtmd
{
    /// <summary>
    /// Number of completion tokens to generate after sending the image prompt.
    /// </summary>
    public const int TokenCount = 10000;

    public static async Task Run()
    {
        // Load the base LLM and its clip/mtmd sidecar weights so the executor has everything it needs.
        var parameters = new ModelParams(UserSettings.GetModelPath());
        using var model = await LLamaWeights.LoadFromFileAsync(parameters);
        var mtmdParams = MtmdContextParams.Default(); // reuse llama.cpp defaults for helper settings
        mtmdParams.UseGpu = false;
        var marker = mtmdParams.MediaMarker ?? NativeApi.MtmdDefaultMarker() ?? "<media>";

        using var mtmd = await SafeMtmdWeights.LoadFromFileAsync(UserSettings.GetMMProjPath(), model, mtmdParams); // multimodal helper weights

        using var executor = new BatchedExecutor(model, parameters, mtmd); // drives batched token + chunk evaluation

        // Prepend the media marker so the helper knows where to inject the encoded image tokens.
        var defaultPrompt = "\nUSER: Provide a full description of the image.\nASSISTANT: ";
        var promptSuffix = AnsiConsole.Ask("Prompt (or ENTER for default):", defaultPrompt);
        var promptText = string.Concat(marker, promptSuffix);

        var imagePath = UserSettings.GetImagePath();
        AnsiConsole.Write(new CanvasImage(imagePath));

        var vocab = executor.Context.NativeHandle.ModelHandle.Vocab;

        // Simple low-temperature sampler keeps the demo deterministic-ish.
        var sampler = new DefaultSamplingPipeline
        {
            Temperature = 0.1f
        };

        // Stream decoded text to the console as soon as tokens arrive.
        var decoder = new StreamingTokenDecoder(executor.Context)
        {
            DecodeSpecialTokens = false
        };

        try
        {
            // Each conversation tracks its own KV cache sequence IDs.
            var conversation = executor.Create();
            // enqueue the image so MtmdHelper sees it
            conversation.QueueMedia(imagePath);
            // schedule multimodal prompt
            conversation.Prompt(promptText, addBos: true, special: true);

            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.WriteLine("Prompt queued with multimodal chunks. Generating response...\n");
            Console.ResetColor();

            var remaining = TokenCount;

            // Run one decode/sampling/prompt cycle – mirrors the batched executor inner loop.
            async Task<bool> ProcessNextAsync()
            {
                var decodeResult = await executor.Infer();
                if (decodeResult == DecodeResult.NoKvSlot) // KV cache exhausted – surface to the user
                {
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine("Insufficient KV cache space for multimodal evaluation.");
                    Console.ResetColor();
                    return false;
                }

                if (decodeResult != DecodeResult.Ok)
                    throw new RuntimeError($"Failed to evaluate batch: {decodeResult}.");

                if (!conversation.RequiresSampling) // another conversation may still be queued
                    return true;

                var token = conversation.Sample(sampler); // pull logits (or -1 for mtmd chunk) and sample
                if (token.IsEndOfGeneration(vocab))
                    return false;

                decoder.Add(token);
                var delta = decoder.Read();
                if (!string.IsNullOrEmpty(delta))
                    Console.Write(delta);

                sampler.Accept(token); // keep sampler state in sync
                conversation.Prompt(token); // feed the accepted token back into the batch
                remaining--;
                return remaining > 0;
            }

            while (remaining > 0 && await ProcessNextAsync()) // continue until EOS or budget is reached
            {
            }

            Console.WriteLine();
        }
        catch (IOException ex)
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine($"Could not load media '{imagePath}': {ex.Message}");
            Console.ResetColor();
        }
        catch (RuntimeError ex)
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine($"MTMD processing failed: {ex.Message}");
            Console.ResetColor();
        }
    }
}
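If the DecodeResult.NoKvSlot branch above fires, the usual remedy is to give the context more room before the executor is created. A minimal sketch, assuming the existing ContextSize and BatchSize knobs on ModelParams (the values are illustrative and not part of this commit):

    // Hypothetical tuning: enlarge the KV cache so the encoded media chunks plus the
    // completion budget fit in a single sequence. ContextSize/BatchSize are ModelParams
    // properties; 8192/1024 are illustrative values only.
    var parameters = new ModelParams(UserSettings.GetModelPath())
    {
        ContextSize = 8192,
        BatchSize = 1024
    };
    using var model = await LLamaWeights.LoadFromFileAsync(parameters);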

LLama.Examples/Examples/LlavaInteractiveModeExecute.cs renamed to LLama.Examples/Examples/MtmdInteractiveModeExecute.cs

Lines changed: 55 additions & 28 deletions
@@ -1,3 +1,5 @@
+using System.Collections.Generic;
+using System.IO;
 using System.Text.RegularExpressions;
 using LLama.Common;
 using Spectre.Console;
@@ -6,27 +8,32 @@

 namespace LLama.Examples.Examples
 {
-    // This example shows how to chat with LLaVA model with both image and text as input.
+    // This example shows how to chat with Mtmd model with both image and text as input.
     // It uses the interactive executor to inference.
-    public class LlavaInteractiveModeExecute
+    public class MtmdInteractiveModeExecute
     {
         public static async Task Run()
         {
             string multiModalProj = UserSettings.GetMMProjPath();
             string modelPath = UserSettings.GetModelPath();
             string modelImage = UserSettings.GetImagePath();
-            const int maxTokens = 1024;
+            const int maxTokens = 2048;

             var prompt = $"{{{modelImage}}}\nUSER:\nProvide a full description of the image.\nASSISTANT:\n";

             var parameters = new ModelParams(modelPath);

+            var mtmdParameters = MtmdContextParams.Default();
+            mtmdParameters.UseGpu = false;
+
             using var model = await LLamaWeights.LoadFromFileAsync(parameters);
             using var context = model.CreateContext(parameters);
-
-            // Llava Init
-            using var clipModel = await LLavaWeights.LoadFromFileAsync(multiModalProj);
-
+
+            // Mtmd Init
+            using var clipModel = await SafeMtmdWeights.LoadFromFileAsync(multiModalProj, model, mtmdParameters);
+
+            var mediaMarker = mtmdParameters.MediaMarker ?? NativeApi.MtmdDefaultMarker() ?? "<media>";
+
             var ex = new InteractiveExecutor(context, clipModel);

             Console.ForegroundColor = ConsoleColor.Yellow;
@@ -40,38 +47,61 @@ public static async Task Run()
                    Temperature = 0.1f
                },

-               AntiPrompts = new List<string> { "\nUSER:" },
+               AntiPrompts = new List<string> { "\nASSISTANT:" },
                MaxTokens = maxTokens

            };

            do
            {

-               // Evaluate if we have images
+               // Evaluate if we have media
                //
-               var imageMatches = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
-               var imageCount = imageMatches.Count();
-               var hasImages = imageCount > 0;
+               var mediaMatches = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
+               var mediaCount = mediaMatches.Count();
+               var hasMedia = mediaCount > 0;

-               if (hasImages)
+               if (hasMedia)
                {
-                   var imagePathsWithCurlyBraces = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
-                   var imagePaths = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Groups[1].Value).ToList();
+                   var mediaPathsWithCurlyBraces = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
+                   var mediaPaths = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Groups[1].Value).ToList();

-                   List<byte[]> imageBytes;
+                   var embeds = new List<SafeMtmdEmbed>();
+                   var imageList = new List<byte[]>();
+                   var imageExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
+                   {
+                       ".png",
+                       ".jpg",
+                       ".jpeg",
+                       ".bmp",
+                       ".gif",
+                       ".webp"
+                   };
+
                    try
                    {
-                       imageBytes = imagePaths.Select(File.ReadAllBytes).ToList();
+                       foreach (var mediaPath in mediaPaths)
+                       {
+                           var extension = Path.GetExtension(mediaPath);
+                           if (!string.IsNullOrEmpty(extension) && imageExtensions.Contains(extension))
+                           {
+                               // Keep the raw image data so the caller can reuse or inspect the images later.
+                               imageList.Add(File.ReadAllBytes(mediaPath));
+                           }
+
+                           var embed = clipModel.LoadMedia(mediaPath);
+                           embeds.Add(embed);
+                       }
                    }
                    catch (IOException exception)
                    {
                        Console.ForegroundColor = ConsoleColor.Red;
                        Console.Write(
-                           $"Could not load your {(imageCount == 1 ? "image" : "images")}:");
+                           $"Could not load your {(mediaCount == 1 ? "media" : "medias")}:");
                        Console.Write($"{exception.Message}");
                        Console.ForegroundColor = ConsoleColor.Yellow;
                        Console.WriteLine("Please try again.");
+                       clipModel.ClearMedia();
                        break;
                    }

@@ -81,19 +111,17 @@ public static async Task Run()
                    // https://github.com/ggerganov/llama.cpp/discussions/3620
                    ex.Context.NativeHandle.MemorySequenceRemove( LLamaSeqId.Zero, -1, -1 );

-                   int index = 0;
-                   foreach (var path in imagePathsWithCurlyBraces)
+                   // Replace placeholders with media markers (one marker per image)
+                   foreach (var path in mediaPathsWithCurlyBraces)
                    {
-                       // First image replace to tag <image, the rest of the images delete the tag
-                       prompt = prompt.Replace(path, index++ == 0 ? "<image>" : "");
+                       prompt = prompt.Replace(path, mediaMarker, StringComparison.Ordinal);
                    }

-
                    Console.ForegroundColor = ConsoleColor.Yellow;
                    Console.WriteLine($"Here are the images, that are sent to the chat model in addition to your message.");
                    Console.WriteLine();

-                   foreach (var consoleImage in imageBytes?.Select(bytes => new CanvasImage(bytes)) ?? Array.Empty<CanvasImage>())
+                   foreach (var consoleImage in imageList.Select(image => new CanvasImage(image.ToArray())))
                    {
                        consoleImage.MaxWidth = 50;
                        AnsiConsole.Write(consoleImage);
@@ -108,10 +136,9 @@ public static async Task Run()

                    // Initialize Images in executor
                    //
-                   foreach (var image in imagePaths)
-                   {
-                       ex.Images.Add(await File.ReadAllBytesAsync(image));
-                   }
+                   ex.Embeds.Clear();
+                   foreach (var embed in embeds)
+                       ex.Embeds.Add(embed);
                }

                Console.ForegroundColor = Color.White;
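Taken together, the rename swaps the Llava-era flow (LLavaWeights plus raw image bytes on ex.Images) for the MTMD flow (SafeMtmdWeights plus SafeMtmdEmbed handles on ex.Embeds). Below is a condensed sketch of the new initialization path, assembled from the calls added in this commit; the file paths and the final streaming InferAsync loop are assumptions for illustration, not part of the diff:

    // Condensed sketch of the post-commit multimodal flow (paths are placeholders).
    var parameters = new ModelParams("model.gguf");
    using var model = await LLamaWeights.LoadFromFileAsync(parameters);
    using var context = model.CreateContext(parameters);

    var mtmdParameters = MtmdContextParams.Default();   // llama.cpp helper defaults
    mtmdParameters.UseGpu = false;
    using var clipModel = await SafeMtmdWeights.LoadFromFileAsync("mmproj.gguf", model, mtmdParameters);

    // The media marker is where the helper splices the encoded image tokens into the prompt.
    var mediaMarker = mtmdParameters.MediaMarker ?? NativeApi.MtmdDefaultMarker() ?? "<media>";

    var ex = new InteractiveExecutor(context, clipModel);
    ex.Embeds.Add(clipModel.LoadMedia("image.png"));    // one SafeMtmdEmbed per media file

    var inferenceParams = new InferenceParams
    {
        SamplingPipeline = new DefaultSamplingPipeline { Temperature = 0.1f },
        AntiPrompts = new List<string> { "\nASSISTANT:" },
        MaxTokens = 2048
    };

    var prompt = $"{mediaMarker}\nUSER:\nProvide a full description of the image.\nASSISTANT:\n";
    await foreach (var text in ex.InferAsync(prompt, inferenceParams))   // assumed: the interactive executor's streaming API
        Console.Write(text);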
