diff --git a/container-images/scripts/build_llama_and_whisper.sh b/container-images/scripts/build_llama_and_whisper.sh
index 4a66b02f..48cad908 100755
--- a/container-images/scripts/build_llama_and_whisper.sh
+++ b/container-images/scripts/build_llama_and_whisper.sh
@@ -90,7 +90,7 @@ configure_common_flags() {
 
 clone_and_build_whisper_cpp() {
   local whisper_flags=("${common_flags[@]}")
-  local whisper_cpp_sha="8a9ad7844d6e2a10cddf4b92de4089d7ac2b14a9"
+  local whisper_cpp_sha="d682e150908e10caa4c15883c633d7902d385237"
   whisper_flags+=("-DBUILD_SHARED_LIBS=NO")
 
   git clone https://github.com/ggerganov/whisper.cpp
@@ -104,7 +104,7 @@ clone_and_build_whisper_cpp() {
 }
 
 clone_and_build_llama_cpp() {
-  local llama_cpp_sha="aa6fb1321333fae8853d0cdc26bcb5d438e650a1"
+  local llama_cpp_sha="4078c77f9891831f29ffc7c315c8ec6695ba5ce7"
 
   git clone https://github.com/ggerganov/llama.cpp
   cd llama.cpp
diff --git a/docs/ramalama-run.1.md b/docs/ramalama-run.1.md
index 6fdc205b..0995a60a 100644
--- a/docs/ramalama-run.1.md
+++ b/docs/ramalama-run.1.md
@@ -34,6 +34,9 @@ size of the prompt context (default: 2048, 0 = loaded from model)
 #### **--help**, **-h**
 show this help message and exit
 
+#### **--keepalive**
+duration to keep a model loaded (e.g. 5m)
+
 #### **--name**, **-n**
 name of the container to run the Model in
 
@@ -70,9 +73,9 @@ ramalama run granite
 >
 ```
 
-Run command with local downloaoded model
+Run command with local downloaded model for 10 minutes
 ```
-ramalama run file:///tmp/mymodel
+ramalama run --keepalive 10m file:///tmp/mymodel
 >
 ```
 
@@ -89,9 +92,15 @@ This program is a Python script that allows the user to interact with a terminal
 [end of text]
 ```
 
+## Exit Codes:
+
+0 Success
+124 RamaLama command did not exit within the keepalive time.
+
+
 ## NVIDIA CUDA Support
 
-See **[ramalama-cuda(7)](ramalama.7.md)** for setting up the host Linux system for CUDA support.
+See **[ramalama-cuda(7)](ramalama-cuda.7.md)** for setting up the host Linux system for CUDA support.
 
 ## SEE ALSO
 **[ramalama(1)](ramalama.1.md)**, **[ramalama-cuda(7)](ramalama-cuda.7.md)**
diff --git a/ramalama/cli.py b/ramalama/cli.py
index 4a5ca2f0..b07492f0 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -796,6 +796,16 @@ def _run(parser):
         help="size of the prompt context (0 = loaded from model)",
     )
     parser.add_argument("-n", "--name", dest="name", help="name of container in which the Model will be run")
+    # Disable network access by default, and give the option to pass any supported network mode into
+    # podman if needed:
+    # https://docs.podman.io/en/latest/markdown/podman-run.1.html#network-mode-net
+    parser.add_argument(
+        "--network-mode",
+        type=str,
+        default="none",
+        help="set the network mode for the container",
+    )
+    parser.add_argument("--seed", help="override random seed")
     parser.add_argument(
         "--temp", default=config.get('temp', "0.8"), help="temperature of the response from the AI model"
     )
@@ -807,23 +817,15 @@ def _run(parser):
         help="require HTTPS and verify certificates when contacting registries",
     )
 
-
 def run_parser(subparsers):
     parser = subparsers.add_parser("run", help="run specified AI Model as a chatbot")
     _run(parser)
-    # Disable network access by default, and give the option to pass any supported network mode into
-    # podman if needed:
-    # https://docs.podman.io/en/latest/markdown/podman-run.1.html#network-mode-net
-    parser.add_argument(
-        "--network-mode",
-        type=str,
-        default="none",
-        help="set the network mode for the container",
-    )
+    parser.add_argument("--keepalive", type=str, help="Duration to keep a model loaded (e.g. 5m)")
     parser.add_argument("MODEL")  # positional argument
     parser.add_argument(
         "ARGS", nargs="*", help="Overrides the default prompt, and the output is returned without entering the chatbot"
     )
+    parser._actions.sort(key=lambda x: x.option_strings)
     parser.set_defaults(func=run_cli)
 
 
@@ -845,12 +847,6 @@ def serve_parser(subparsers):
     parser.add_argument(
         "-p", "--port", default=config.get('port', "8080"), help="port for AI Model server to listen on"
     )
-    parser.add_argument(
-        "--network-mode",
-        type=str,
-        default="",
-        help="set the network mode for the container",
-    )
     parser.add_argument("MODEL")  # positional argument
     parser.set_defaults(func=serve_cli)
 
diff --git a/ramalama/model.py b/ramalama/model.py
index 802e2e0d..8f5e6939 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -188,11 +188,11 @@ def setup_container(self, args):
             conman_args += ["-p", f"{args.port}:{args.port}"]
 
         # Check for env var RAMALAMA_DEVICE to explicitly declare the GPU device path
-        device_override=0
+        device_override = 0
         gpu_device = os.environ.get("RAMALAMA_DEVICE")
         if gpu_device:
             conman_args += ["--device", gpu_device]
-            device_override=1
+            device_override = 1
         if device_override != 1:
             if (sys.platform == "darwin" and os.path.basename(args.engine) != "docker") or os.path.exists("/dev/dri"):
                 conman_args += ["--device", "/dev/dri"]
@@ -265,6 +265,8 @@ def run(self, args):
         prompt = self.build_prompt(args)
         model_path = self.get_model_path(args)
         exec_args = self.build_exec_args_run(args, model_path, prompt)
+        if args.keepalive:
+            exec_args = ["timeout", args.keepalive] + exec_args
         self.execute_model(model_path, exec_args, args)
 
     def perplexity(self, args):
diff --git a/ramalama/ollama.py b/ramalama/ollama.py
index b80a8922..75378a64 100644
--- a/ramalama/ollama.py
+++ b/ramalama/ollama.py
@@ -64,10 +64,10 @@ def init_pull(repos, accept, registry_head, model_name, model_tag, models, model
 def in_existing_cache(model_name, model_tag):
     if not available("ollama"):
         return None
-    default_ollama_caches=[
+    default_ollama_caches = [
         os.path.join(os.environ['HOME'], '.ollama/models'),
         '/usr/share/ollama/.ollama/models',
-        f'C:\\Users\\{os.getlogin()}\\.ollama\\models'
+        f'C:\\Users\\{os.getlogin()}\\.ollama\\models',
     ]
 
     for cache_dir in default_ollama_caches:
@@ -79,10 +79,11 @@ def in_existing_cache(model_name, model_tag):
                     if layer["mediaType"] == "application/vnd.ollama.image.model":
                         layer_digest = layer["digest"]
                         ollama_digest_path = os.path.join(cache_dir, 'blobs', layer_digest)
-                        if os.path.exists(str(ollama_digest_path).replace(':','-')):
-                            return str(ollama_digest_path).replace(':','-')
+                        if os.path.exists(str(ollama_digest_path).replace(':', '-')):
+                            return str(ollama_digest_path).replace(':', '-')
     return None
 
+
 class Ollama(Model):
     def __init__(self, model):
         model = rm_until_substring(model, "ollama.com/library/")
diff --git a/test/system/030-run.bats b/test/system/030-run.bats
index cc40bfd3..75cd2d0b 100755
--- a/test/system/030-run.bats
+++ b/test/system/030-run.bats
@@ -64,8 +64,16 @@ load helpers
 }
 
 @test "ramalama run tiny with prompt" {
-	skip_if_notlocal
-	run_ramalama run --name foobar tiny "Write a 1 line poem"
+    skip_if_notlocal
+    run_ramalama run --name foobar tiny "Write a 1 line poem"
+}
+
+@test "ramalama run --keepalive" {
+    # FIXME: the following skips can be removed, once we install llama-run on
+    # test systems.
+    skip_if_nocontainer
+    skip_if_darwin
+    run_ramalama 124 run --keepalive 1s tiny
 }
 
 # vim: filetype=sh
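
Note on the mechanism, as a minimal sketch rather than code from this patch: the `--keepalive` value is simply prepended to the model command line as coreutils `timeout DURATION`, so an expired keepalive surfaces as exit status 124, matching the new "Exit Codes" section in docs/ramalama-run.1.md and the `run_ramalama 124 run --keepalive 1s tiny` assertion in the bats test. The `llama-run` argv below is a hypothetical stand-in for whatever `build_exec_args_run()` actually returns.

```
# Illustrative sketch only; "llama-run ..." is a placeholder argv, not taken from the patch.
exec_args = ["llama-run", "-c", "2048", "/tmp/mymodel"]

keepalive = "10m"  # parsed value of --keepalive
if keepalive:
    # Same wrapping as Model.run() above: timeout(1) stops the model process
    # once the duration elapses and exits with status 124.
    exec_args = ["timeout", keepalive] + exec_args

print(exec_args)
# ['timeout', '10m', 'llama-run', '-c', '2048', '/tmp/mymodel']
```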