Add ramalama run --keepalive option #789

Merged · 1 commit · Feb 12, 2025
4 changes: 2 additions & 2 deletions container-images/scripts/build_llama_and_whisper.sh
@@ -90,7 +90,7 @@ configure_common_flags() {
 
 clone_and_build_whisper_cpp() {
   local whisper_flags=("${common_flags[@]}")
-  local whisper_cpp_sha="8a9ad7844d6e2a10cddf4b92de4089d7ac2b14a9"
+  local whisper_cpp_sha="d682e150908e10caa4c15883c633d7902d385237"
   whisper_flags+=("-DBUILD_SHARED_LIBS=NO")
 
   git clone https://github.com/ggerganov/whisper.cpp
@@ -104,7 +104,7 @@ clone_and_build_whisper_cpp() {
 }
 
 clone_and_build_llama_cpp() {
-  local llama_cpp_sha="aa6fb1321333fae8853d0cdc26bcb5d438e650a1"
+  local llama_cpp_sha="4078c77f9891831f29ffc7c315c8ec6695ba5ce7"
 
   git clone https://github.com/ggerganov/llama.cpp
   cd llama.cpp
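For context, the build script pins both dependencies to exact commits rather than branch tips, so image builds stay reproducible. A hedged sketch of that pattern in Python (the `reset --hard` step is assumed, not shown in this hunk):

```python
import subprocess

# Sketch of the pin-to-commit pattern: clone, then hard-reset to the recorded
# sha so every container build compiles the same llama.cpp revision.
llama_cpp_sha = "4078c77f9891831f29ffc7c315c8ec6695ba5ce7"
subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp"], check=True)
subprocess.run(["git", "-C", "llama.cpp", "reset", "--hard", llama_cpp_sha], check=True)
```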
15 changes: 12 additions & 3 deletions docs/ramalama-run.1.md
@@ -34,6 +34,9 @@ size of the prompt context (default: 2048, 0 = loaded from model)
 #### **--help**, **-h**
 show this help message and exit
 
+#### **--keepalive**
+duration to keep a model loaded (e.g. 5m)
+
 #### **--name**, **-n**
 name of the container to run the Model in
 
@@ -70,9 +73,9 @@ ramalama run granite
 >
 ```
 
-Run command with local downloaoded model
+Run command with local downloaded model for 10 minutes
 ```
-ramalama run file:///tmp/mymodel
+ramalama run --keepalive 10m file:///tmp/mymodel
 >
 ```
 
@@ -89,9 +92,15 @@ This program is a Python script that allows the user to interact with a terminal
 [end of text]
 ```
 
+## Exit Codes:
+
+0 Success
+124 RamaLama command did not exit within the keepalive time.
+
+
 ## NVIDIA CUDA Support
 
-See **[ramalama-cuda(7)](ramalama.7.md)** for setting up the host Linux system for CUDA support.
+See **[ramalama-cuda(7)](ramalama-cuda.7.md)** for setting up the host Linux system for CUDA support.
 
 ## SEE ALSO
 **[ramalama(1)](ramalama.1.md)**, **[ramalama-cuda(7)](ramalama-cuda.7.md)**
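The new exit code comes from wrapping the model process in coreutils `timeout` (see the `ramalama/model.py` change below), which returns 124 when the time limit expires. A minimal sketch of checking for it from a caller (the model name `tiny` is only an example):

```python
import subprocess

# Run a model with a keepalive window and distinguish the documented exit
# codes: 0 = success, 124 = the keepalive timer stopped the command.
result = subprocess.run(["ramalama", "run", "--keepalive", "10m", "tiny"])
if result.returncode == 124:
    print("model was unloaded after the keepalive window expired")
elif result.returncode == 0:
    print("run completed normally")
```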
28 changes: 12 additions & 16 deletions ramalama/cli.py
@@ -796,6 +796,16 @@ def _run(parser):
         help="size of the prompt context (0 = loaded from model)",
     )
     parser.add_argument("-n", "--name", dest="name", help="name of container in which the Model will be run")
+    # Disable network access by default, and give the option to pass any supported network mode into
+    # podman if needed:
+    # https://docs.podman.io/en/latest/markdown/podman-run.1.html#network-mode-net
+    parser.add_argument(
+        "--network-mode",
+        type=str,
+        default="none",
+        help="set the network mode for the container",
+    )
+
     parser.add_argument("--seed", help="override random seed")
     parser.add_argument(
         "--temp", default=config.get('temp', "0.8"), help="temperature of the response from the AI model"
@@ -807,23 +817,15 @@
         help="require HTTPS and verify certificates when contacting registries",
     )
 
 
 def run_parser(subparsers):
     parser = subparsers.add_parser("run", help="run specified AI Model as a chatbot")
     _run(parser)
-    # Disable network access by default, and give the option to pass any supported network mode into
-    # podman if needed:
-    # https://docs.podman.io/en/latest/markdown/podman-run.1.html#network-mode-net
-    parser.add_argument(
-        "--network-mode",
-        type=str,
-        default="none",
-        help="set the network mode for the container",
-    )
+    parser.add_argument("--keepalive", type=str, help="Duration to keep a model loaded (e.g. 5m)")
     parser.add_argument("MODEL")  # positional argument
     parser.add_argument(
         "ARGS", nargs="*", help="Overrides the default prompt, and the output is returned without entering the chatbot"
    )
+    parser._actions.sort(key=lambda x: x.option_strings)
     parser.set_defaults(func=run_cli)
 
 
@@ -845,12 +847,6 @@ def serve_parser(subparsers):
     parser.add_argument(
         "-p", "--port", default=config.get('port', "8080"), help="port for AI Model server to listen on"
     )
-    parser.add_argument(
-        "--network-mode",
-        type=str,
-        default="",
-        help="set the network mode for the container",
-    )
     parser.add_argument("MODEL")  # positional argument
     parser.set_defaults(func=serve_cli)
 
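A self-contained sketch of the argparse pattern used above: shared flags are registered once in `_run()`, per-command flags are appended afterwards, and `_actions` is sorted so `--help` output stays alphabetical (names and values here are illustrative, not the exact RamaLama code):

```python
import argparse

def _run(parser):
    # Shared across `run` and `serve`: network access is off by default, and
    # any podman-supported network mode can be passed through.
    parser.add_argument("--network-mode", type=str, default="none",
                        help="set the network mode for the container")

parser = argparse.ArgumentParser(prog="ramalama run")
_run(parser)
parser.add_argument("--keepalive", type=str,
                    help="Duration to keep a model loaded (e.g. 5m)")
parser.add_argument("MODEL")
# Sorting keeps optional flags alphabetized in --help output.
parser._actions.sort(key=lambda x: x.option_strings)
args = parser.parse_args(["--keepalive", "5m", "tiny"])
print(args.network_mode, args.keepalive, args.MODEL)
```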
6 changes: 4 additions & 2 deletions ramalama/model.py
@@ -188,11 +188,11 @@ def setup_container(self, args):
             conman_args += ["-p", f"{args.port}:{args.port}"]
 
         # Check for env var RAMALAMA_DEVICE to explicitly declare the GPU device path
-        device_override=0
+        device_override = 0
         gpu_device = os.environ.get("RAMALAMA_DEVICE")
         if gpu_device:
             conman_args += ["--device", gpu_device]
-            device_override=1
+            device_override = 1
         if device_override != 1:
             if (sys.platform == "darwin" and os.path.basename(args.engine) != "docker") or os.path.exists("/dev/dri"):
                 conman_args += ["--device", "/dev/dri"]
@@ -265,6 +265,8 @@ def run(self, args):
         prompt = self.build_prompt(args)
         model_path = self.get_model_path(args)
         exec_args = self.build_exec_args_run(args, model_path, prompt)
+        if args.keepalive:
+            exec_args = ["timeout", args.keepalive] + exec_args
         self.execute_model(model_path, exec_args, args)
 
     def perplexity(self, args):
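The keepalive mechanism itself is just a command prefix: when the flag is set, the exec args are wrapped in coreutils `timeout`, which kills the model process after the duration and exits with status 124. A tiny illustration (the base command is hypothetical):

```python
# Hypothetical exec args for a model run; only the wrapping is the point.
exec_args = ["llama-run", "/tmp/mymodel"]
keepalive = "10m"
if keepalive:
    exec_args = ["timeout", keepalive] + exec_args
assert exec_args == ["timeout", "10m", "llama-run", "/tmp/mymodel"]
```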
9 changes: 5 additions & 4 deletions ramalama/ollama.py
@@ -64,10 +64,10 @@ def init_pull(repos, accept, registry_head, model_name, model_tag, models, model
 def in_existing_cache(model_name, model_tag):
     if not available("ollama"):
         return None
-    default_ollama_caches=[
+    default_ollama_caches = [
         os.path.join(os.environ['HOME'], '.ollama/models'),
         '/usr/share/ollama/.ollama/models',
-        f'C:\\Users\\{os.getlogin()}\\.ollama\\models'
+        f'C:\\Users\\{os.getlogin()}\\.ollama\\models',
     ]
 
     for cache_dir in default_ollama_caches:
@@ -79,10 +79,11 @@ def in_existing_cache(model_name, model_tag):
                 if layer["mediaType"] == "application/vnd.ollama.image.model":
                     layer_digest = layer["digest"]
                     ollama_digest_path = os.path.join(cache_dir, 'blobs', layer_digest)
-                    if os.path.exists(str(ollama_digest_path).replace(':','-')):
-                        return str(ollama_digest_path).replace(':','-')
+                    if os.path.exists(str(ollama_digest_path).replace(':', '-')):
+                        return str(ollama_digest_path).replace(':', '-')
     return None
 
+
 class Ollama(Model):
     def __init__(self, model):
         model = rm_until_substring(model, "ollama.com/library/")
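The cache probe above depends on ollama's on-disk blob layout, where the `:` in a digest becomes `-` in the filename. A short sketch of that translation (the digest value is made up):

```python
import os

layer_digest = "sha256:0123456789abcdef"  # hypothetical manifest digest
cache_dir = os.path.join(os.environ['HOME'], ".ollama/models")
# Blobs live at e.g. blobs/sha256-0123..., so ':' is mapped to '-'.
blob_path = os.path.join(cache_dir, "blobs", layer_digest).replace(":", "-")
print(blob_path)
```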
12 changes: 10 additions & 2 deletions test/system/030-run.bats
@@ -64,8 +64,16 @@ load helpers
 }
 
 @test "ramalama run tiny with prompt" {
-    skip_if_notlocal
-    run_ramalama run --name foobar tiny "Write a 1 line poem"
+    skip_if_notlocal
+    run_ramalama run --name foobar tiny "Write a 1 line poem"
 }
 
+@test "ramalama run --keepalive" {
+    # FIXME: the following skips can be removed, once we install llama-run on
+    # test systems.
+    skip_if_nocontainer
+    skip_if_darwin
+    run_ramalama 124 run --keepalive 1s tiny
+}
+
 # vim: filetype=sh