Skip to content

Commit 91eb9d5

Browse files
orionpapadakis
authored and geoand committed
Introduce ModelRegistry for GPULlama3 for automatic model management.
1 parent b9f3b76 commit 91eb9d5

File tree

12 files changed

+568
-93
lines changed

12 files changed

+568
-93
lines changed
Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
1+
quarkus.langchain4j.gpu-llama3.include-models-in-artifact=false
2+
13
# Configure GPULlama3
2-
quarkus.langchain4j.gpu-llama3.chat-model.model-path=/Users/orion/LLMModels/beehive-llama-3.2-1b-instruct-fp16.gguf
34
quarkus.langchain4j.gpu-llama3.enable-integration=true
5+
quarkus.langchain4j.gpu-llama3.chat-model.model-name=beehive-lab/Llama-3.2-1B-Instruct-GGUF
6+
quarkus.langchain4j.gpu-llama3.chat-model.quantization=FP16
47
quarkus.langchain4j.gpu-llama3.chat-model.temperature=0.7
5-
quarkus.langchain4j.gpu-llama3.chat-model.max-tokens=100
8+
quarkus.langchain4j.gpu-llama3.chat-model.max-tokens=513
9+
10+
# other supported models:
11+
#model-name=ggml-org/Qwen3-0.6B-GGUF
12+
#quantization=f16

model-providers/gpu-llama3/deployment/src/main/java/io/quarkiverse/langchain4j/gpullama3/deployment/GPULlama3Processor.java

Lines changed: 159 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,44 @@
22

33
import static io.quarkiverse.langchain4j.deployment.LangChain4jDotNames.CHAT_MODEL;
44

5+
import java.io.IOException;
6+
import java.math.BigDecimal;
7+
import java.math.RoundingMode;
8+
import java.nio.file.Files;
9+
import java.nio.file.Path;
10+
import java.util.ArrayList;
511
import java.util.List;
12+
import java.util.Optional;
13+
import java.util.concurrent.TimeUnit;
14+
import java.util.concurrent.atomic.AtomicReference;
615

716
import jakarta.enterprise.context.ApplicationScoped;
817

18+
import org.jboss.logging.Logger;
19+
920
import io.quarkiverse.langchain4j.deployment.items.ChatModelProviderCandidateBuildItem;
1021
import io.quarkiverse.langchain4j.deployment.items.SelectedChatModelProviderBuildItem;
22+
import io.quarkiverse.langchain4j.gpullama3.GPULlama3ModelRegistry;
1123
import io.quarkiverse.langchain4j.gpullama3.runtime.GPULlama3Recorder;
24+
import io.quarkiverse.langchain4j.gpullama3.runtime.NameAndQuantization;
25+
import io.quarkiverse.langchain4j.gpullama3.runtime.config.ChatModelFixedRuntimeConfig;
26+
import io.quarkiverse.langchain4j.gpullama3.runtime.config.LangChain4jGPULlama3FixedRuntimeConfig;
27+
import io.quarkiverse.langchain4j.runtime.NamedConfigUtil;
1228
import io.quarkus.arc.deployment.SyntheticBeanBuildItem;
13-
import io.quarkus.deployment.annotations.BuildProducer;
14-
import io.quarkus.deployment.annotations.BuildStep;
15-
import io.quarkus.deployment.annotations.ExecutionTime;
29+
import io.quarkus.builder.item.MultiBuildItem;
30+
import io.quarkus.deployment.annotations.*;
1631
import io.quarkus.deployment.annotations.Record;
1732
import io.quarkus.deployment.builditem.FeatureBuildItem;
33+
import io.quarkus.deployment.builditem.LaunchModeBuildItem;
34+
import io.quarkus.deployment.builditem.ServiceStartBuildItem;
35+
import io.quarkus.deployment.console.ConsoleInstalledBuildItem;
36+
import io.quarkus.deployment.console.StartupLogCompressor;
37+
import io.quarkus.deployment.logging.LoggingSetupBuildItem;
1838

1939
public class GPULlama3Processor {
2040

41+
private final static Logger LOG = Logger.getLogger(GPULlama3Processor.class);
42+
2143
private static final String PROVIDER = "gpu-llama3";
2244
private static final String FEATURE = "langchain4j-gpu-llama3";
2345

@@ -55,4 +77,138 @@ void generateBeans(GPULlama3Recorder recorder,
5577
}
5678
}
5779
}
80+
81+
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
82+
@Produce(ServiceStartBuildItem.class)
83+
@BuildStep
84+
void downloadModels(List<SelectedChatModelProviderBuildItem> selectedChatModels,
85+
LoggingSetupBuildItem loggingSetupBuildItem,
86+
Optional<ConsoleInstalledBuildItem> consoleInstalledBuildItem,
87+
LaunchModeBuildItem launchMode,
88+
LangChain4jGPULlama3BuildTimeConfig buildTimeConfig,
89+
LangChain4jGPULlama3FixedRuntimeConfig fixedRuntimeConfig,
90+
BuildProducer<ModelDownloadedBuildItem> modelDownloadedProducer) {
91+
if (!buildTimeConfig.includeModelsInArtifact()) {
92+
return;
93+
}
94+
GPULlama3ModelRegistry registry = GPULlama3ModelRegistry.getOrCreate(fixedRuntimeConfig.modelsPath());
95+
96+
BigDecimal ONE_HUNDRED = new BigDecimal("100");
97+
98+
if (buildTimeConfig.chatModel().enabled().orElse(true)) {
99+
List<NameAndQuantization> modelsNeeded = new ArrayList<>();
100+
for (var selected : selectedChatModels) {
101+
if (PROVIDER.equals(selected.getProvider())) {
102+
String configName = selected.getConfigName();
103+
104+
ChatModelFixedRuntimeConfig matchingConfig = NamedConfigUtil.isDefault(configName)
105+
? fixedRuntimeConfig.defaultConfig().chatModel()
106+
: fixedRuntimeConfig.namedConfig().get(configName).chatModel();
107+
modelsNeeded.add(new NameAndQuantization(matchingConfig.modelName(), matchingConfig.quantization()));
108+
}
109+
}
110+
111+
if (!modelsNeeded.isEmpty()) {
112+
StartupLogCompressor compressor = new StartupLogCompressor(
113+
(launchMode.isTest() ? "(test) " : "") + "GPULlama3.java model pull:",
114+
consoleInstalledBuildItem,
115+
loggingSetupBuildItem);
116+
117+
for (var model : modelsNeeded) {
118+
GPULlama3ModelRegistry.ModelInfo modelInfo = GPULlama3ModelRegistry.ModelInfo.from(model.name());
119+
Path pathOfModelDirOnDisk = registry.constructModelDirectoryPath(modelInfo);
120+
// Check if the model is already downloaded
121+
// this is done automatically by download model, but we want to provide a good progress experience, so we do it again here
122+
if (Files.exists(pathOfModelDirOnDisk.resolve(GPULlama3ModelRegistry.FINISHED_MARKER))) {
123+
LOG.debug("Model " + model.name() + " already exists in " + pathOfModelDirOnDisk);
124+
} else {
125+
// we pull one model at a time and provide progress updates to the user via logging
126+
LOG.info("Pulling model " + model.name());
127+
128+
AtomicReference<Long> LAST_UPDATE_REF = new AtomicReference<>();
129+
130+
try {
131+
registry.downloadModel(model.name(), model.quantization(), Optional.empty(),
132+
Optional.of(new GPULlama3ModelRegistry.ProgressReporter() {
133+
@Override
134+
public void update(String filename, long sizeDownloaded, long totalSize) {
135+
// GPULlama3 downloads a bunch of files for each model of which only the
136+
// weights file is large
137+
// and makes sense to report progress on
138+
if (totalSize < 100_000) {
139+
return;
140+
}
141+
142+
if (!logUpdate(LAST_UPDATE_REF.get())) {
143+
return;
144+
}
145+
146+
LAST_UPDATE_REF.set(System.nanoTime());
147+
148+
BigDecimal percentage = new BigDecimal(sizeDownloaded)
149+
.divide(new BigDecimal(totalSize),
150+
4,
151+
RoundingMode.HALF_DOWN)
152+
.multiply(ONE_HUNDRED);
153+
BigDecimal progress = percentage.setScale(2, RoundingMode.HALF_DOWN);
154+
if (progress.compareTo(ONE_HUNDRED) >= 0) {
155+
// avoid showing 100% for too long
156+
LOG.infof("Verifying and cleaning up\n", progress);
157+
} else {
158+
LOG.infof("%s - Progress: %s%%\n", model.name(), progress);
159+
}
160+
}
161+
162+
/**
163+
* @param lastUpdate The last update time in nanoseconds
164+
* Determines whether we should log an update.
165+
* This is done in order to not overwhelm the console with updates which might
166+
* make
167+
* canceling the download difficult. See
168+
* <a href=
169+
* "https://github.com/quarkiverse/quarkus-langchain4j/issues/1044">this</a>
170+
*/
171+
private boolean logUpdate(Long lastUpdate) {
172+
if (lastUpdate == null) {
173+
return true;
174+
} else {
175+
return TimeUnit.NANOSECONDS.toMillis(System.nanoTime())
176+
- TimeUnit.NANOSECONDS.toMillis(lastUpdate) > 1_000;
177+
}
178+
}
179+
}));
180+
} catch (IOException e) {
181+
compressor.closeAndDumpCaptured();
182+
} catch (InterruptedException e) {
183+
throw new RuntimeException(e);
184+
}
185+
}
186+
187+
modelDownloadedProducer.produce(new ModelDownloadedBuildItem(model, pathOfModelDirOnDisk));
188+
}
189+
190+
compressor.close();
191+
}
192+
}
193+
194+
}
195+
196+
public static final class ModelDownloadedBuildItem extends MultiBuildItem {
197+
198+
private final NameAndQuantization model;
199+
private final Path directory;
200+
201+
public ModelDownloadedBuildItem(NameAndQuantization model, Path directory) {
202+
this.model = model;
203+
this.directory = directory;
204+
}
205+
206+
public NameAndQuantization getModel() {
207+
return model;
208+
}
209+
210+
public Path getDirectory() {
211+
return directory;
212+
}
213+
}
58214
}

model-providers/gpu-llama3/deployment/src/main/java/io/quarkiverse/langchain4j/gpullama3/deployment/LangChain4jGPULlama3BuildTimeConfig.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,19 @@
44

55
import io.quarkus.runtime.annotations.ConfigRoot;
66
import io.smallrye.config.ConfigMapping;
7+
import io.smallrye.config.WithDefault;
78

89
@ConfigRoot(phase = BUILD_TIME)
910
@ConfigMapping(prefix = "quarkus.langchain4j.gpu-llama3")
1011
public interface LangChain4jGPULlama3BuildTimeConfig {
1112

13+
/**
14+
* Determines whether the necessary GPULlama3 models are downloaded and included in the jar at build time.
15+
* Currently, this option is only valid for {@code fast-jar} deployments.
16+
*/
17+
@WithDefault("true")
18+
boolean includeModelsInArtifact();
19+
1220
/**
1321
* Chat model related settings
1422
*/
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package io.quarkiverse.langchain4j.gpullama3;
2+
3+
public final class Consts {
4+
5+
private Consts() {
6+
}
7+
8+
/**
9+
* working links:
10+
* https://huggingface.co/beehive-lab/Llama-3.2-1B-Instruct-GGUF/blob/main/Llama-3.2-1B-Instruct-FP16.gguf
11+
* https://huggingface.co/ggml-org/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-f16.gguf
12+
*/
13+
14+
public static final String DEFAULT_CHAT_MODEL_NAME = "beehive-lab/Llama-3.2-1B-Instruct-GGUF";
15+
public static final String DEFAULT_CHAT_MODEL_QUANTIZATION = "FP16";
16+
17+
}

model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/GPULlama3ChatModel.java

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
package io.quarkiverse.langchain4j.gpullama3;
22

33
import static dev.langchain4j.internal.Utils.getOrDefault;
4-
import static java.util.Objects.requireNonNull;
54

5+
import java.io.IOException;
6+
import java.io.UncheckedIOException;
67
import java.nio.file.Path;
8+
import java.util.Optional;
79

810
import dev.langchain4j.data.message.AiMessage;
911
import dev.langchain4j.internal.ChatRequestValidationUtils;
@@ -16,13 +18,22 @@ public class GPULlama3ChatModel extends GPULlama3BaseModel implements ChatModel
1618

1719
// @formatter:off
1820
private GPULlama3ChatModel(Builder builder) {
19-
init(
20-
requireNonNull(builder.modelPath, "modelPath is required and must be specified"),
21-
getOrDefault(builder.temperature, 0.1),
22-
getOrDefault(builder.topP, 1.0),
23-
getOrDefault(builder.seed, 12345),
24-
getOrDefault(builder.maxTokens, 512),
25-
getOrDefault(builder.onGPU, Boolean.TRUE));
21+
GPULlama3ModelRegistry gpuLlama3ModelRegistry = GPULlama3ModelRegistry.getOrCreate(builder.modelCachePath);
22+
try {
23+
Path modelPath = gpuLlama3ModelRegistry.downloadModel(builder.modelName, builder.quantization,
24+
Optional.empty(), Optional.empty());
25+
init(
26+
modelPath,
27+
getOrDefault(builder.temperature, 0.1),
28+
getOrDefault(builder.topP, 1.0),
29+
getOrDefault(builder.seed, 12345),
30+
getOrDefault(builder.maxTokens, 512),
31+
getOrDefault(builder.onGPU, Boolean.TRUE));
32+
} catch (IOException e) {
33+
throw new UncheckedIOException(e);
34+
} catch (InterruptedException e) {
35+
throw new RuntimeException(e);
36+
}
2637
}
2738
// @formatter:on
2839

@@ -58,7 +69,9 @@ public ChatResponse doChat(ChatRequest chatRequest) {
5869

5970
public static class Builder {
6071

61-
protected Path modelPath;
72+
private Optional<Path> modelCachePath;
73+
private String modelName = Consts.DEFAULT_CHAT_MODEL_NAME;
74+
private String quantization = Consts.DEFAULT_CHAT_MODEL_QUANTIZATION;
6275
protected Double temperature;
6376
protected Double topP;
6477
protected Integer seed;
@@ -69,8 +82,18 @@ public Builder() {
6982
// This is public so it can be extended
7083
}
7184

72-
public Builder modelPath(Path modelPath) {
73-
this.modelPath = modelPath;
85+
public Builder modelCachePath(Optional<Path> modelCachePath) {
86+
this.modelCachePath = modelCachePath;
87+
return this;
88+
}
89+
90+
public Builder modelName(String modelName) {
91+
this.modelName = modelName;
92+
return this;
93+
}
94+
95+
public Builder quantization(String quantization) {
96+
this.quantization = quantization;
7497
return this;
7598
}
7699

0 commit comments

Comments
 (0)