modelCachePath) {
+ this.modelCachePath = modelCachePath;
+ return this;
+ }
+
+ public Builder modelName(String modelName) {
+ this.modelName = modelName;
+ return this;
+ }
+
+ public Builder quantization(String quantization) {
+ this.quantization = quantization;
+ return this;
+ }
+
+ public Builder onGPU(Boolean onGPU) {
+ this.onGPU = onGPU;
+ return this;
+ }
+
+ public Builder temperature(Double temperature) {
+ this.temperature = temperature;
+ return this;
+ }
+
+ public Builder topP(Double topP) {
+ this.topP = topP;
+ return this;
+ }
+
+ public Builder maxTokens(Integer maxTokens) {
+ this.maxTokens = maxTokens;
+ return this;
+ }
+
+ public Builder seed(Integer seed) {
+ this.seed = seed;
+ return this;
+ }
+
+ public GPULlama3ChatModel build() {
+ return new GPULlama3ChatModel(this);
+ }
+ }
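+
+    // Sketch of typical builder usage (model name and quantization values are
+    // illustrative, not defaults confirmed by this change):
+    //
+    //   GPULlama3ChatModel model = GPULlama3ChatModel.builder()
+    //           .modelName("beehive-lab/Llama-3.2-1B-Instruct-GGUF")
+    //           .quantization("Q4_0")
+    //           .onGPU(Boolean.TRUE)
+    //           .temperature(0.3)
+    //           .maxTokens(512)
+    //           .build();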
+}
diff --git a/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/GPULlama3ModelRegistry.java b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/GPULlama3ModelRegistry.java
new file mode 100644
index 000000000..de62ca701
--- /dev/null
+++ b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/GPULlama3ModelRegistry.java
@@ -0,0 +1,249 @@
+package io.quarkiverse.langchain4j.gpullama3;
+
+import java.io.*;
+import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+
+import org.jboss.logging.Logger;
+
+/**
+ * A registry for managing GPULlama3.java models on local disk.
+ *
+ * Models are downloaded from HuggingFace (for example, the Beehive Lab repository).
+ *
+ * Reuses the implementation of {@link io.quarkiverse.langchain4j.llama3.Llama3ModelRegistry}.
+ */
+@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
+public class GPULlama3ModelRegistry {
+
+ private static final Logger LOG = Logger.getLogger(GPULlama3ModelRegistry.class);
+
+ private static final String DEFAULT_MODEL_CACHE_PATH = System.getProperty("user.home", "") + File.separator + ".langchain4j"
+ + File.separator + "models";
+
+    public static final String FINISHED_MARKER = ".finished";
+
+ private final Path modelCachePath;
+
+ private GPULlama3ModelRegistry(Path modelCachePath) {
+ this.modelCachePath = modelCachePath;
+ if (!Files.exists(modelCachePath)) {
+ try {
+ Files.createDirectories(modelCachePath);
+ } catch (IOException e) {
+ throw new IOError(e);
+ }
+ }
+ }
+
+    public static GPULlama3ModelRegistry getOrCreate(Optional<Path> modelCachePath) {
+ return new GPULlama3ModelRegistry(modelCachePath.orElse(Path.of(DEFAULT_MODEL_CACHE_PATH)));
+ }
+
+ public Path constructModelDirectoryPath(ModelInfo modelInfo) {
+ return Paths.get(modelCachePath.toAbsolutePath().toString(), modelInfo.owner() + "_" + modelInfo.name());
+ }
+
+ public Path constructGgufModelFilePath(ModelInfo modelInfo, String quantization) {
+ String effectiveFileName = getEffectiveFileName(modelInfo, quantization);
+ Path modelDirectory = constructModelDirectoryPath(modelInfo);
+ return modelDirectory.resolve(effectiveFileName);
+ }
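+
+    // Example (assumed values): for model "beehive-lab/Llama-3.2-1B-Instruct-GGUF"
+    // with quantization "Q4_0", this resolves to
+    //   <cache>/beehive-lab_Llama-3.2-1B-Instruct-GGUF/Llama-3.2-1B-Instruct-Q4_0.gguf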
+
+    public Path downloadModel(String modelName, String quantization, Optional<String> authToken,
+            Optional<ProgressReporter> maybeProgressReporter)
+ throws IOException, InterruptedException {
+ ModelInfo modelInfo = ModelInfo.from(modelName);
+
+ String effectiveFileName = getEffectiveFileName(modelInfo, quantization);
+ Path modelDirectory = constructModelDirectoryPath(modelInfo);
+ Path result = modelDirectory.resolve(effectiveFileName);
+ if (Files.exists(result) && Files.exists(modelDirectory.resolve(FINISHED_MARKER))) {
+ return result;
+ }
+
+ HttpClient client = HttpClient.newBuilder().followRedirects(HttpClient.Redirect.ALWAYS).build();
+ URI uri = URI.create(
+ String.format("https://huggingface.co/%s/%s/resolve/main/%s", modelInfo.owner(), modelInfo.name(),
+ effectiveFileName));
+        HttpRequest.Builder requestBuilder = HttpRequest.newBuilder().uri(uri);
+        // Pass the HuggingFace token when provided (needed for gated or private repositories)
+        authToken.ifPresent(token -> requestBuilder.header("Authorization", "Bearer " + token));
+        HttpRequest request = requestBuilder.build();
+        HttpResponse<InputStream> httpResponse = client.send(request, HttpResponse.BodyHandlers.ofInputStream());
+ if (httpResponse.statusCode() != 200) {
+ throw new RuntimeException(
+ "Unable to download model " + modelName + ". Response code from " + uri + " is : "
+ + httpResponse.statusCode());
+ }
+ Files.createDirectories(result.getParent());
+ long totalBytes = httpResponse.headers().firstValueAsLong("content-length").orElse(-1);
+ ProgressReporter progressReporter = maybeProgressReporter.orElse((filename, sizeDownloaded, totalSize) -> {
+ });
+
+ if (maybeProgressReporter.isEmpty()) {
+ LOG.info("Downloading file " + result.toAbsolutePath());
+ }
+ String resultFileName = result.getFileName().toString();
+ progressReporter.update(resultFileName, 0L, totalBytes);
+
+ try (CountingInputStream inStream = new CountingInputStream(httpResponse.body())) {
+            CompletableFuture<Long> cf = CompletableFuture.supplyAsync(() -> {
+ try {
+ return Files.copy(inStream, result, StandardCopyOption.REPLACE_EXISTING);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ });
+            while (!cf.isDone()) {
+                progressReporter.update(resultFileName, inStream.getCount(), totalBytes);
+                Thread.sleep(100); // poll periodically instead of busy-waiting
+            }
+            if (cf.isCompletedExceptionally()) {
+                progressReporter.update(resultFileName, inStream.getCount(), totalBytes);
+            } else {
+                progressReporter.update(resultFileName, totalBytes, totalBytes);
+            }
+
+ try {
+ cf.get();
+ } catch (Throwable e) {
+ throw new IOException("Failed to download file: " + resultFileName, e);
+ }
+ if (maybeProgressReporter.isEmpty()) {
+ LOG.info("Downloaded file " + result.toAbsolutePath());
+ }
+ }
+
+ // create a finished marker
+ Files.createFile(modelDirectory.resolve(FINISHED_MARKER));
+ return result;
+ }
+
+ private String getEffectiveFileName(ModelInfo modelInfo, String quantization) {
+ String effectiveFileName = modelInfo.name();
+ if (effectiveFileName.endsWith("-GGUF")) {
+ effectiveFileName = effectiveFileName.substring(0, effectiveFileName.length() - 5);
+ }
+ effectiveFileName = effectiveFileName + "-" + quantization + ".gguf";
+ return effectiveFileName;
+ }
+
+ /**
+ * This interface reports the progress of a .gguf file download.
+ * The implementation of the update method is used to communicate this progress.
+ */
+ public interface ProgressReporter {
+
+ void update(String filename, long sizeDownloaded, long totalSize);
+ }
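+
+    // A minimal console-based reporter (illustrative):
+    //
+    //   ProgressReporter consoleReporter = (filename, downloaded, total) ->
+    //           System.out.printf("%s: %d/%d bytes%n", filename, downloaded, total);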
+
+ /**
+ * ModelInfo is a simple data class that represents a model's owner and name.
+ *
+ * Reused implementation of {@link io.quarkiverse.langchain4j.llama3.Llama3ModelRegistry}
+ */
+ public record ModelInfo(String owner, String name) {
+
+ public static ModelInfo from(String modelName) {
+ String[] parts = modelName.split("/");
+ if (parts.length == 0 || parts.length > 2) {
+ throw new IllegalArgumentException("Model must be in the form owner/name");
+ }
+
+ String owner;
+ String name;
+
+ if (parts.length == 1) {
+ owner = null;
+ name = modelName;
+ } else {
+ owner = parts[0];
+ name = parts[1];
+ }
+
+ return new ModelInfo(owner, name);
+ }
+
+ public String toFileName() {
+ return owner + "_" + name;
+ }
+ }
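+
+    // For example: ModelInfo.from("beehive-lab/Llama-3.2-1B-Instruct-GGUF") yields
+    // owner "beehive-lab" and name "Llama-3.2-1B-Instruct-GGUF"; a bare name yields a null owner.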
+
+ /**
+ * An {@link InputStream} that counts the number of bytes read.
+ *
+ * @author Chris Nokleberg
+ *
+ * Copied from Guava
+ */
+ public static final class CountingInputStream extends FilterInputStream {
+
+ private long count;
+ private long mark = -1;
+
+ /**
+ * Wraps another input stream, counting the number of bytes read.
+ *
+ * @param in the input stream to be wrapped
+ */
+ public CountingInputStream(InputStream in) {
+ super(Objects.requireNonNull(in));
+ }
+
+ /** Returns the number of bytes read. */
+ public long getCount() {
+ return count;
+ }
+
+ @Override
+ public int read() throws IOException {
+ int result = in.read();
+ if (result != -1) {
+ count++;
+ }
+ return result;
+ }
+
+ @Override
+ public int read(byte[] b, int off, int len) throws IOException {
+ int result = in.read(b, off, len);
+ if (result != -1) {
+ count += result;
+ }
+ return result;
+ }
+
+ @Override
+ public long skip(long n) throws IOException {
+ long result = in.skip(n);
+ count += result;
+ return result;
+ }
+
+ @Override
+ public synchronized void mark(int readlimit) {
+ in.mark(readlimit);
+ mark = count;
+ // it's okay to mark even if mark isn't supported, as reset won't work
+ }
+
+ @Override
+ public synchronized void reset() throws IOException {
+ if (!in.markSupported()) {
+ throw new IOException("Mark not supported");
+ }
+ if (mark == -1) {
+ throw new IOException("Mark not set");
+ }
+
+ in.reset();
+ count = mark;
+ }
+ }
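+
+    // Sketch: downloading a model through the registry (values are illustrative):
+    //
+    //   GPULlama3ModelRegistry registry = GPULlama3ModelRegistry.getOrCreate(Optional.empty());
+    //   Path gguf = registry.downloadModel(
+    //           "beehive-lab/Llama-3.2-1B-Instruct-GGUF", // assumed HuggingFace repo id
+    //           "Q4_0",
+    //           Optional.empty(), // no auth token
+    //           Optional.empty()); // default (no-op) progress reporting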
+}
diff --git a/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/GPULlama3ResponseParser.java b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/GPULlama3ResponseParser.java
new file mode 100644
index 000000000..76a50a562
--- /dev/null
+++ b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/GPULlama3ResponseParser.java
@@ -0,0 +1,201 @@
+package io.quarkiverse.langchain4j.gpullama3;
+
+import dev.langchain4j.model.chat.response.PartialThinking;
+import dev.langchain4j.model.chat.response.StreamingChatResponseHandler;
+
+public class GPULlama3ResponseParser {
+
+ private GPULlama3ResponseParser() {
+ // Utility class - prevent instantiation
+ }
+
+ public static class ParsedResponse {
+ private final String thinkingContent;
+ private final String actualResponse;
+
+ /**
+ * Creates a new ParsedResponse.
+ *
+ * @param thinkingContent the thinking content including tags, or null if none
+ * @param actualResponse the cleaned response content
+ */
+ public ParsedResponse(String thinkingContent, String actualResponse) {
+ this.thinkingContent = thinkingContent;
+ this.actualResponse = actualResponse;
+ }
+
+ /**
+ * Returns the thinking content including <think> and </think> tags.
+ *
+ * @return the thinking content with tags, or null if no thinking content was found
+ */
+ public String getThinkingContent() {
+ return thinkingContent;
+ }
+
+ /**
+ * Returns the actual response content with thinking tags removed.
+ *
+ * @return the cleaned response content
+ */
+ public String getActualResponse() {
+ return actualResponse;
+ }
+
+ /**
+ * Returns true if the response contained thinking content.
+ *
+ * @return true if thinking content was found, false otherwise
+ */
+ public boolean hasThinking() {
+ return thinkingContent != null && !thinkingContent.trim().isEmpty();
+ }
+ }
+
+ public static ParsedResponse parseResponse(String rawResponse) {
+ if (rawResponse == null) {
+ throw new IllegalArgumentException("Raw response cannot be null");
+ }
+
+ String thinking = null;
+ String actualResponse = rawResponse;
+
+        // Find the <think> and </think> positions
+        int thinkStart = rawResponse.indexOf("<think>");
+        int thinkEnd = rawResponse.indexOf("</think>");
+
+        if (thinkStart != -1 && thinkEnd != -1 && thinkEnd > thinkStart) {
+            // Extract thinking content INCLUDING the tags
+            thinking = rawResponse.substring(thinkStart, thinkEnd + 8).trim(); // include "</think>" (8 chars)
+
+            // Remove the entire thinking block from response
+            String beforeThink = rawResponse.substring(0, thinkStart);
+            String afterThink = rawResponse.substring(thinkEnd + 8); // skip past "</think>"
+ actualResponse = (beforeThink + afterThink).trim();
+
+ // Clean up any extra whitespace
+ actualResponse = actualResponse.replaceAll("\\s+", " ").trim();
+ }
+
+ return new ParsedResponse(thinking, actualResponse);
+ }
+
+ public static String extractThinking(String rawResponse) {
+ return parseResponse(rawResponse).getThinkingContent();
+ }
+
+ public static String extractResponse(String rawResponse) {
+ return parseResponse(rawResponse).getActualResponse();
+ }
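+
+    // Sketch: separating thinking from the answer (input is illustrative):
+    //
+    //   ParsedResponse parsed = GPULlama3ResponseParser
+    //           .parseResponse("<think>reasoning...</think>The answer is 42.");
+    //   parsed.getThinkingContent(); // "<think>reasoning...</think>"
+    //   parsed.getActualResponse(); // "The answer is 42."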
+
+ public static StreamingParser createStreamingParser(
+ StreamingChatResponseHandler handler, org.beehive.gpullama3.model.Model model) {
+ return new StreamingParser(handler, model);
+ }
+
+ /**
+ * Parser for handling streaming responses with real-time thinking content separation.
+ *
+ * This parser detects thinking content as tokens are generated and routes it to
+ * the appropriate handler methods (onPartialThinking vs onPartialResponse).
+ * The thinking tags are preserved and streamed as part of the thinking content.
+ */
+ public static class StreamingParser {
+ private final StreamingChatResponseHandler handler;
+ private final org.beehive.gpullama3.model.Model model;
+ private final StringBuilder buffer = new StringBuilder();
+ private boolean insideThinking = false;
+ private int lastProcessedLength = 0;
+
+ /**
+ * Creates a new streaming parser.
+ *
+ * @param handler the streaming response handler
+ * @param model the GPULlama3 model instance for token decoding
+ */
+ public StreamingParser(StreamingChatResponseHandler handler, org.beehive.gpullama3.model.Model model) {
+ this.handler = handler;
+ this.model = model;
+ }
+
+ /**
+ * Processes each token as it's generated by the model.
+ *
+ * @param tokenId the token ID generated by the model
+ */
+ public void onToken(int tokenId) {
+ // Check if this is a stop token and skip it
+ if (model.chatFormat().getStopTokens().contains(tokenId)) {
+ return; // Don't stream stop tokens like <|im_end|>
+ }
+
+ // Decode the token and add to buffer
+ String tokenStr = model.tokenizer().decode(java.util.List.of(tokenId));
+ buffer.append(tokenStr);
+
+ String currentText = buffer.toString();
+
+ // Process any new content since last time
+ processNewContent(currentText);
+ }
+
+ /**
+ * Processes new content in the buffer, detecting thinking state transitions
+ * and routing content to appropriate handler methods.
+ */
+ private void processNewContent(String currentText) {
+ if (currentText.length() <= lastProcessedLength) {
+ return; // No new content
+ }
+
+ String newContent = currentText.substring(lastProcessedLength);
+
+ // Process each character in the new content
+ for (int i = 0; i < newContent.length(); i++) {
+ int currentPosition = lastProcessedLength + i;
+
+ // Check if we're starting thinking
+ if (!insideThinking && isStartOfThinkTag(currentText, currentPosition)) {
+ insideThinking = true;
+ // Stream the opening tag as thinking
+ handler.onPartialThinking(new PartialThinking(""));
+ i += 6; // Skip the rest of ""
+ continue;
+ }
+
+ // Check if we're ending thinking
+ if (insideThinking && isStartOfEndThinkTag(currentText, currentPosition)) {
+ // Stream the closing tag as thinking
+ handler.onPartialThinking(new PartialThinking(""));
+ insideThinking = false;
+ i += 7; // Skip the rest of ""
+ continue;
+ }
+
+ // Stream the character to appropriate handler
+ char c = newContent.charAt(i);
+ if (insideThinking) {
+ handler.onPartialThinking(new PartialThinking(String.valueOf(c)));
+ } else {
+ handler.onPartialResponse(String.valueOf(c));
+ }
+ }
+
+ lastProcessedLength = currentText.length();
+ }
+
+ /**
+ * Checks if the text at the given position starts with "<think>".
+ */
+ private boolean isStartOfThinkTag(String text, int position) {
+            return position + 7 <= text.length() && text.regionMatches(position, "<think>", 0, 7);
+ }
+
+ /**
+ * Checks if the text at the given position starts with "</think>".
+ */
+ private boolean isStartOfEndThinkTag(String text, int position) {
+            return position + 8 <= text.length() && text.regionMatches(position, "</think>", 0, 8);
+ }
+ }
+}
diff --git a/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/GPULlama3Recorder.java b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/GPULlama3Recorder.java
new file mode 100644
index 000000000..9b738c269
--- /dev/null
+++ b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/GPULlama3Recorder.java
@@ -0,0 +1,89 @@
+package io.quarkiverse.langchain4j.gpullama3.runtime;
+
+import java.util.function.Supplier;
+
+import org.jboss.logging.Logger;
+
+import dev.langchain4j.model.chat.ChatModel;
+import dev.langchain4j.model.chat.DisabledChatModel;
+import io.quarkiverse.langchain4j.gpullama3.GPULlama3ChatModel;
+import io.quarkiverse.langchain4j.gpullama3.runtime.config.LangChain4jGPULlama3FixedRuntimeConfig;
+import io.quarkiverse.langchain4j.gpullama3.runtime.config.LangChain4jGPULlama3RuntimeConfig;
+import io.quarkiverse.langchain4j.runtime.NamedConfigUtil;
+import io.quarkus.runtime.RuntimeValue;
+import io.quarkus.runtime.annotations.Recorder;
+
+@Recorder
+public class GPULlama3Recorder {
+
+ private static final Logger LOG = Logger.getLogger(GPULlama3Recorder.class);
+
+    private final RuntimeValue<LangChain4jGPULlama3RuntimeConfig> runtimeConfig;
+    private final RuntimeValue<LangChain4jGPULlama3FixedRuntimeConfig> fixedRuntimeConfig;
+
+    public GPULlama3Recorder(RuntimeValue<LangChain4jGPULlama3RuntimeConfig> runtimeConfig,
+            RuntimeValue<LangChain4jGPULlama3FixedRuntimeConfig> fixedRuntimeConfig) {
+ this.runtimeConfig = runtimeConfig;
+ this.fixedRuntimeConfig = fixedRuntimeConfig;
+ }
+
+    public Supplier<ChatModel> chatModel(String configName) {
+ var gpuLlama3Config = correspondingConfig(configName);
+ var gpuLlama3FixedRuntimeConfig = correspondingFixedConfig(configName);
+
+ if (gpuLlama3Config.enableIntegration()) {
+ LOG.info("Creating GPULlama3ChatModel for config: " + configName);
+ var chatModelConfig = gpuLlama3Config.chatModel();
+
+ var builder = GPULlama3ChatModel.builder()
+ .modelName(gpuLlama3FixedRuntimeConfig.chatModel().modelName())
+ .quantization(gpuLlama3FixedRuntimeConfig.chatModel().quantization())
+ .onGPU(Boolean.TRUE)
+ .modelCachePath(fixedRuntimeConfig.getValue().modelsPath());
+
+ if (chatModelConfig.temperature().isPresent()) {
+ builder.temperature(chatModelConfig.temperature().getAsDouble());
+ }
+ if (chatModelConfig.topP().isPresent()) {
+ builder.topP(chatModelConfig.topP().getAsDouble());
+ }
+ if (chatModelConfig.maxTokens().isPresent()) {
+ builder.maxTokens(chatModelConfig.maxTokens().getAsInt());
+ }
+ if (chatModelConfig.seed().isPresent()) {
+ builder.seed(chatModelConfig.seed().getAsInt());
+ }
+
+ return new Supplier<>() {
+ @Override
+ public ChatModel get() {
+ return builder.build();
+ }
+ };
+ } else {
+ return new Supplier<>() {
+ @Override
+ public ChatModel get() {
+ return new DisabledChatModel();
+ }
+ };
+ }
+ }
+
+ private LangChain4jGPULlama3RuntimeConfig.GPULlama3Config correspondingConfig(String configName) {
+ return NamedConfigUtil.isDefault(configName)
+ ? runtimeConfig.getValue().defaultConfig()
+ : runtimeConfig.getValue().namedConfig().get(configName);
+ }
+
+ private LangChain4jGPULlama3FixedRuntimeConfig.GPULlama3Config correspondingFixedConfig(String configName) {
+ return NamedConfigUtil.isDefault(configName)
+ ? fixedRuntimeConfig.getValue().defaultConfig()
+ : fixedRuntimeConfig.getValue().namedConfig().get(configName);
+ }
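+
+    // Named models add a name segment to the property prefix (illustrative):
+    //
+    //   quarkus.langchain4j.gpu-llama3.<model-name>.chat-model.temperature=0.5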
+
+ private boolean inDebugMode() {
+ return LOG.isDebugEnabled();
+ }
+
+}
diff --git a/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/NameAndQuantization.java b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/NameAndQuantization.java
new file mode 100644
index 000000000..a229037da
--- /dev/null
+++ b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/NameAndQuantization.java
@@ -0,0 +1,4 @@
+package io.quarkiverse.langchain4j.gpullama3.runtime;
+
+public record NameAndQuantization(String name, String quantization) {
+}
diff --git a/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/config/ChatModelConfig.java b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/config/ChatModelConfig.java
new file mode 100644
index 000000000..c177b8008
--- /dev/null
+++ b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/config/ChatModelConfig.java
@@ -0,0 +1,41 @@
+package io.quarkiverse.langchain4j.gpullama3.runtime.config;
+
+import java.util.OptionalDouble;
+import java.util.OptionalInt;
+
+import io.quarkus.runtime.annotations.ConfigDocDefault;
+import io.quarkus.runtime.annotations.ConfigGroup;
+import io.smallrye.config.WithDefault;
+
+@ConfigGroup
+public interface ChatModelConfig {
+
+ /**
+ * What sampling temperature to use, between 0.0 and 1.0.
+ */
+ @ConfigDocDefault("0.3")
+ @WithDefault("${quarkus.langchain4j.temperature}")
+ OptionalDouble temperature();
+
+ /**
+ * What sampling topP to use, between 0.0 and 1.0.
+ */
+ @ConfigDocDefault("0.85")
+ @WithDefault("${quarkus.langchain4j.top-p}")
+ OptionalDouble topP();
+
+    /**
+     * The seed to use for random sampling.
+     */
+ @ConfigDocDefault("1234")
+ @WithDefault("${quarkus.langchain4j.seed}")
+ OptionalInt seed();
+
+ /**
+ * The maximum number of tokens to generate in the completion.
+ */
+ @ConfigDocDefault("512")
+ OptionalInt maxTokens();
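+
+    // Example application.properties entries for the default model (values are illustrative):
+    //
+    //   quarkus.langchain4j.gpu-llama3.chat-model.temperature=0.3
+    //   quarkus.langchain4j.gpu-llama3.chat-model.top-p=0.85
+    //   quarkus.langchain4j.gpu-llama3.chat-model.max-tokens=512
+    //   quarkus.langchain4j.gpu-llama3.chat-model.seed=1234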
+}
\ No newline at end of file
diff --git a/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/config/ChatModelFixedRuntimeConfig.java b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/config/ChatModelFixedRuntimeConfig.java
new file mode 100644
index 000000000..c7f8359ce
--- /dev/null
+++ b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/config/ChatModelFixedRuntimeConfig.java
@@ -0,0 +1,21 @@
+package io.quarkiverse.langchain4j.gpullama3.runtime.config;
+
+import io.quarkiverse.langchain4j.gpullama3.Consts;
+import io.quarkus.runtime.annotations.ConfigGroup;
+import io.smallrye.config.WithDefault;
+
+@ConfigGroup
+public interface ChatModelFixedRuntimeConfig {
+
+ /**
+ * Model name to use
+ */
+ @WithDefault(Consts.DEFAULT_CHAT_MODEL_NAME)
+ String modelName();
+
+ /**
+ * Quantization of the model to use
+ */
+ @WithDefault(Consts.DEFAULT_CHAT_MODEL_QUANTIZATION)
+ String quantization();
+}
\ No newline at end of file
diff --git a/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/config/LangChain4jGPULlama3FixedRuntimeConfig.java b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/config/LangChain4jGPULlama3FixedRuntimeConfig.java
new file mode 100644
index 000000000..f6835644c
--- /dev/null
+++ b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/config/LangChain4jGPULlama3FixedRuntimeConfig.java
@@ -0,0 +1,67 @@
+package io.quarkiverse.langchain4j.gpullama3.runtime.config;
+
+import static io.quarkus.runtime.annotations.ConfigPhase.BUILD_AND_RUN_TIME_FIXED;
+
+import java.nio.file.Path;
+import java.util.Map;
+import java.util.Optional;
+
+import io.quarkus.runtime.annotations.*;
+import io.smallrye.config.ConfigMapping;
+import io.smallrye.config.WithDefaults;
+import io.smallrye.config.WithParentName;
+
+/**
+ * Fixed runtime configuration for GPULlama3 extension.
+ *
+ * This configuration is read at build time and remains fixed for the lifetime of the application.
+ * It includes settings that cannot be changed after the application is built, such as
+ * the model name and quantization. These values are baked into the application during the build process.
+ *
+ * To change these settings, the application must be rebuilt with the new configuration values.
+ * This ensures optimal performance and allows for build-time validation and optimization.
+ *
+ * Example configuration:
+ *
+ * <pre>
+ * quarkus.langchain4j.gpu-llama3.chat-model.model-name=owner/model-name-GGUF
+ * quarkus.langchain4j.gpu-llama3.chat-model.quantization=Q4_0
+ * </pre>
+ *
+ * Note: These properties must be set in {@code application.properties} at build time
+ * and cannot be overridden at runtime through environment variables or system properties.
+ */
+@ConfigRoot(phase = BUILD_AND_RUN_TIME_FIXED)
+@ConfigMapping(prefix = "quarkus.langchain4j.gpu-llama3")
+public interface LangChain4jGPULlama3FixedRuntimeConfig {
+
+ /**
+ * Default model config.
+ */
+ @WithParentName
+ GPULlama3Config defaultConfig();
+
+ /**
+ * Named model config.
+ */
+ @ConfigDocSection
+ @ConfigDocMapKey("model-name")
+ @WithParentName
+ @WithDefaults
+    Map<String, GPULlama3Config> namedConfig();
+
+ /**
+     * Location on the file system that serves as a cache for downloaded models.
+     */
+    @ConfigDocDefault("${user.home}/.langchain4j/models")
+    Optional<Path> modelsPath();
+
+ @ConfigGroup
+ interface GPULlama3Config {
+
+ /**
+ * Chat model related settings
+ */
+ ChatModelFixedRuntimeConfig chatModel();
+ }
+}
\ No newline at end of file
diff --git a/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/config/LangChain4jGPULlama3RuntimeConfig.java b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/config/LangChain4jGPULlama3RuntimeConfig.java
new file mode 100644
index 000000000..f85dc7f24
--- /dev/null
+++ b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/config/LangChain4jGPULlama3RuntimeConfig.java
@@ -0,0 +1,71 @@
+package io.quarkiverse.langchain4j.gpullama3.runtime.config;
+
+import static io.quarkus.runtime.annotations.ConfigPhase.RUN_TIME;
+
+import java.util.Map;
+import java.util.Optional;
+
+import io.quarkus.runtime.annotations.ConfigDocDefault;
+import io.quarkus.runtime.annotations.ConfigDocMapKey;
+import io.quarkus.runtime.annotations.ConfigDocSection;
+import io.quarkus.runtime.annotations.ConfigRoot;
+import io.smallrye.config.ConfigMapping;
+import io.smallrye.config.WithDefault;
+import io.smallrye.config.WithDefaults;
+import io.smallrye.config.WithParentName;
+
+/**
+ * Runtime configuration for GPULlama3 extension.
+ *
+ * This configuration is read at runtime and can be changed without rebuilding the application.
+ * It includes dynamic settings such as model parameters (temperature, max tokens),
+ * logging preferences, and integration control.
+ */
+@ConfigRoot(phase = RUN_TIME)
+@ConfigMapping(prefix = "quarkus.langchain4j.gpu-llama3")
+public interface LangChain4jGPULlama3RuntimeConfig {
+
+ /**
+ * Default model config.
+ */
+ @WithParentName
+ GPULlama3Config defaultConfig();
+
+ /**
+ * Named model config.
+ */
+ @ConfigDocSection
+ @ConfigDocMapKey("model-name")
+ @WithParentName
+ @WithDefaults
+    Map<String, GPULlama3Config> namedConfig();
+
+ interface GPULlama3Config {
+
+ /**
+ * Chat model related settings
+ */
+ ChatModelConfig chatModel();
+
+ /**
+ * Whether to enable the integration. Set to {@code false} to disable
+ * all requests.
+ */
+ @WithDefault("true")
+ Boolean enableIntegration();
+
+ /**
+ * Whether GPULlama3 should log requests
+ */
+ @ConfigDocDefault("false")
+ @WithDefault("${quarkus.langchain4j.log-requests}")
+        Optional<Boolean> logRequests();
+
+ /**
+ * Whether GPULlama3 client should log responses
+ */
+ @ConfigDocDefault("false")
+ @WithDefault("${quarkus.langchain4j.log-responses}")
+        Optional<Boolean> logResponses();
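+
+        // Example: disabling the integration at runtime (illustrative):
+        //
+        //   quarkus.langchain4j.gpu-llama3.enable-integration=false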
+ }
+}
\ No newline at end of file
diff --git a/model-providers/pom.xml b/model-providers/pom.xml
index 0b0e65720..429ab7b18 100644
--- a/model-providers/pom.xml
+++ b/model-providers/pom.xml
@@ -38,6 +38,7 @@