From 3332c623b2614bdb4164c3bdf8ac472c18166e25 Mon Sep 17 00:00:00 2001
From: Eric Curtin
Date: Sun, 2 Feb 2025 15:30:54 +0000
Subject: [PATCH] Make the default of ngl be -1

This means a value is assigned automatically, which may be 999 or 0
depending on hardware.

Signed-off-by: Eric Curtin
---
 docs/ramalama.1.md      | 3 ++-
 docs/ramalama.conf      | 3 ++-
 docs/ramalama.conf.5.md | 5 +++--
 ramalama/cli.py         | 4 ++--
 ramalama/model.py       | 2 +-
 5 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/docs/ramalama.1.md b/docs/ramalama.1.md
index 7c0fe6d3..c4cdb7e9 100644
--- a/docs/ramalama.1.md
+++ b/docs/ramalama.1.md
@@ -115,7 +115,8 @@ pass --group-add keep-groups to podman (default: False)
 Needed to access the gpu on some systems, but has an impact on security, use with caution.
 
 #### **--ngl**
-number of gpu layers (default: 999)
+number of gpu layers; 0 means CPU inferencing, 999 means use max layers (default: -1)
+The default of -1 means use whatever is automatically deemed appropriate (0 or 999)
 
 #### **--nocontainer**
 do not run RamaLama in the default container (default: False)
diff --git a/docs/ramalama.conf b/docs/ramalama.conf
index e8c18d73..8ce47613 100644
--- a/docs/ramalama.conf
+++ b/docs/ramalama.conf
@@ -50,8 +50,9 @@
 #keep_groups = false
 
 # Default number of layers offloaded to the gpu
+# -1 means use whatever is automatically deemed appropriate (0 or 999)
 #
-#ngl = 999
+#ngl = -1
 
 # Specify default port for services to listen on
 #
diff --git a/docs/ramalama.conf.5.md b/docs/ramalama.conf.5.md
index 4b7e21ab..044a6b5c 100644
--- a/docs/ramalama.conf.5.md
+++ b/docs/ramalama.conf.5.md
@@ -92,9 +92,10 @@ RAMALAMA_IMAGE environment variable overrides this field.
 Pass `--group-add keep-groups` to podman, when using podman.
 In some cases this is needed to access the gpu from a rootless container
 
-**ngl**=999
+**ngl**=-1
 
-Default number of layers to offload to the gpu
+number of gpu layers; 0 means CPU inferencing, 999 means use max layers (default: -1)
+The default of -1 means use whatever is automatically deemed appropriate (0 or 999)
 
 **port**="8080"
 
diff --git a/ramalama/cli.py b/ramalama/cli.py
index 81829201..3901c19c 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -196,8 +196,8 @@ def configure_arguments(parser):
         "--ngl",
         dest="ngl",
         type=int,
-        default=config.get("ngl", 999),
-        help="Number of layers to offload to the gpu, if available",
+        default=config.get("ngl", -1),
+        help="Number of layers to offload to the gpu, if available"
     )
     parser.add_argument(
         "--keep-groups",
diff --git a/ramalama/model.py b/ramalama/model.py
index 24df822a..283f107a 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -195,7 +195,7 @@ def setup_container(self, args):
     def gpu_args(self, args, runner=False):
         gpu_args = []
         if (
-            args.gpu
+            args.gpu > 0
             or os.getenv("HIP_VISIBLE_DEVICES")
             or os.getenv("ASAHI_VISIBLE_DEVICES")
             or os.getenv("CUDA_VISIBLE_DEVICES")
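
To illustrate the behaviour the patch describes, here is a minimal sketch of how a -1 ngl
default could be resolved at runtime. It is not part of the patch and not RamaLama's actual
implementation; the helper name resolve_ngl is hypothetical, and only the environment
variable names are taken from the diff above.

import os

def resolve_ngl(ngl):
    # Hypothetical helper (not in this patch): map the -1 sentinel to
    # 999 (offload max layers) or 0 (CPU inferencing).
    if ngl >= 0:
        return ngl  # 0 or an explicit layer count from the user/config
    # -1: decide automatically based on whether a GPU appears visible,
    # using the same environment variables gpu_args() checks.
    gpu_env = ("HIP_VISIBLE_DEVICES", "ASAHI_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES")
    if any(os.getenv(v) for v in gpu_env):
        return 999  # use max layers
    return 0  # no GPU detected, CPU inferencing

Keeping -1 as a sentinel in the argument parser, rather than resolving it there, lets the
container/runtime code make the 0-or-999 decision once the hardware is actually known.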