1
+ using System . Collections . Generic ;
2
+ using System . IO ;
1
3
using System . Text . RegularExpressions ;
2
4
using LLama . Common ;
3
5
using Spectre . Console ;
6
8
7
9
namespace LLama . Examples . Examples
8
10
{
9
- // This example shows how to chat with LLaVA model with both image and text as input.
11
+ // This example shows how to chat with Mtmd model with both image and text as input.
10
12
// It uses the interactive executor to run inference.
11
- public class LlavaInteractiveModeExecute
13
+ public class MtmdInteractiveModeExecute
12
14
{
13
15
public static async Task Run ( )
14
16
{
15
17
string multiModalProj = UserSettings . GetMMProjPath ( ) ;
16
18
string modelPath = UserSettings . GetModelPath ( ) ;
17
19
string modelImage = UserSettings . GetImagePath ( ) ;
18
- const int maxTokens = 1024 ;
20
+ const int maxTokens = 2048 ;
19
21
20
22
var prompt = $ "{{{modelImage}}}\n USER:\n Provide a full description of the image.\n ASSISTANT:\n ";
21
23
22
24
var parameters = new ModelParams ( modelPath ) ;
23
25
26
+ var mtmdParameters = MtmdContextParams . Default ( ) ;
27
+ mtmdParameters . UseGpu = false ;
28
+
24
29
using var model = await LLamaWeights . LoadFromFileAsync ( parameters ) ;
25
30
using var context = model . CreateContext ( parameters ) ;
26
-
27
- // Llava Init
28
- using var clipModel = await LLavaWeights . LoadFromFileAsync ( multiModalProj ) ;
29
-
31
+
32
+ // Mtmd Init
33
+ using var clipModel = await SafeMtmdWeights . LoadFromFileAsync ( multiModalProj , model , mtmdParameters ) ;
34
+
35
+ var mediaMarker = mtmdParameters . MediaMarker ?? NativeApi . MtmdDefaultMarker ( ) ?? "<media>" ;
36
+
30
37
var ex = new InteractiveExecutor ( context , clipModel ) ;
31
38
32
39
Console . ForegroundColor = ConsoleColor . Yellow ;
@@ -40,38 +47,61 @@ public static async Task Run()
40
47
Temperature = 0.1f
41
48
} ,
42
49
43
- AntiPrompts = new List < string > { "\n USER :" } ,
50
+ AntiPrompts = new List < string > { "\n ASSISTANT :" } ,
44
51
MaxTokens = maxTokens
45
52
46
53
} ;
47
54
48
55
do
49
56
{
50
57
51
- // Evaluate if we have images
58
+ // Check whether the prompt references any media files ({path} placeholders)
52
59
//
53
- var imageMatches = Regex . Matches ( prompt , "{([^}]*)}" ) . Select ( m => m . Value ) ;
54
- var imageCount = imageMatches . Count ( ) ;
55
- var hasImages = imageCount > 0 ;
60
+ var mediaMatches = Regex . Matches ( prompt , "{([^}]*)}" ) . Select ( m => m . Value ) ;
61
+ var mediaCount = mediaMatches . Count ( ) ;
62
+ var hasMedia = mediaCount > 0 ;
56
63
57
- if ( hasImages )
64
+ if ( hasMedia )
58
65
{
59
- var imagePathsWithCurlyBraces = Regex . Matches ( prompt , "{([^}]*)}" ) . Select ( m => m . Value ) ;
60
- var imagePaths = Regex . Matches ( prompt , "{([^}]*)}" ) . Select ( m => m . Groups [ 1 ] . Value ) . ToList ( ) ;
66
+ var mediaPathsWithCurlyBraces = Regex . Matches ( prompt , "{([^}]*)}" ) . Select ( m => m . Value ) ;
67
+ var mediaPaths = Regex . Matches ( prompt , "{([^}]*)}" ) . Select ( m => m . Groups [ 1 ] . Value ) . ToList ( ) ;
61
68
62
- List < byte [ ] > imageBytes ;
69
+ var embeds = new List < SafeMtmdEmbed > ( ) ;
70
+ var imageList = new List < byte [ ] > ( ) ;
71
+ var imageExtensions = new HashSet < string > ( StringComparer . OrdinalIgnoreCase )
72
+ {
73
+ ".png" ,
74
+ ".jpg" ,
75
+ ".jpeg" ,
76
+ ".bmp" ,
77
+ ".gif" ,
78
+ ".webp"
79
+ } ;
80
+
63
81
try
64
82
{
65
- imageBytes = imagePaths . Select ( File . ReadAllBytes ) . ToList ( ) ;
83
+ foreach ( var mediaPath in mediaPaths )
84
+ {
85
+ var extension = Path . GetExtension ( mediaPath ) ;
86
+ if ( ! string . IsNullOrEmpty ( extension ) && imageExtensions . Contains ( extension ) )
87
+ {
88
+ // Keep the raw image bytes so the images can be previewed in the console below.
89
+ imageList . Add ( File . ReadAllBytes ( mediaPath ) ) ;
90
+ }
91
+
92
+ var embed = clipModel . LoadMedia ( mediaPath ) ;
93
+ embeds . Add ( embed ) ;
94
+ }
66
95
}
67
96
catch ( IOException exception )
68
97
{
69
98
Console . ForegroundColor = ConsoleColor . Red ;
70
99
Console . Write (
71
- $ "Could not load your { ( imageCount == 1 ? "image " : "images " ) } :") ;
100
+ $ "Could not load your { ( mediaCount == 1 ? "media " : "medias " ) } :") ;
72
101
Console . Write ( $ "{ exception . Message } ") ;
73
102
Console . ForegroundColor = ConsoleColor . Yellow ;
74
103
Console . WriteLine ( "Please try again." ) ;
104
+ clipModel . ClearMedia ( ) ;
75
105
break ;
76
106
}
77
107
@@ -81,19 +111,17 @@ public static async Task Run()
81
111
// https://github.com/ggerganov/llama.cpp/discussions/3620
82
112
ex . Context . NativeHandle . MemorySequenceRemove ( LLamaSeqId . Zero , - 1 , - 1 ) ;
83
113
84
- int index = 0 ;
85
- foreach ( var path in imagePathsWithCurlyBraces )
114
+ // Replace each {path} placeholder with the media marker (one marker per media file)
115
+ foreach ( var path in mediaPathsWithCurlyBraces )
86
116
{
87
- // First image replace to tag <image, the rest of the images delete the tag
88
- prompt = prompt . Replace ( path , index ++ == 0 ? "<image>" : "" ) ;
117
+ prompt = prompt . Replace ( path , mediaMarker , StringComparison . Ordinal ) ;
89
118
}
90
119
91
-
92
120
Console . ForegroundColor = ConsoleColor . Yellow ;
93
121
Console . WriteLine ( $ "Here are the images, that are sent to the chat model in addition to your message.") ;
94
122
Console . WriteLine ( ) ;
95
123
96
- foreach ( var consoleImage in imageBytes ? . Select ( bytes => new CanvasImage ( bytes ) ) ?? Array . Empty < CanvasImage > ( ) )
124
+ foreach ( var consoleImage in imageList . Select ( image => new CanvasImage ( image . ToArray ( ) ) ) )
97
125
{
98
126
consoleImage . MaxWidth = 50 ;
99
127
AnsiConsole . Write ( consoleImage ) ;
@@ -108,10 +136,9 @@ public static async Task Run()
108
136
109
137
// Initialize media embeds in executor
110
138
//
111
- foreach ( var image in imagePaths )
112
- {
113
- ex . Images . Add ( await File . ReadAllBytesAsync ( image ) ) ;
114
- }
139
+ ex . Embeds . Clear ( ) ;
140
+ foreach ( var embed in embeds )
141
+ ex . Embeds . Add ( embed ) ;
115
142
}
116
143
117
144
Console . ForegroundColor = Color . White ;
0 commit comments