
Commit 2fd9054

Added additional, upgraded numpy, and updated handlers to meet expected behavior (accepting properties via serving.properties or env variables, throwing an error when CSV files contain non-numeric data)
1 parent 1769295 commit 2fd9054

File tree: 8 files changed, +732 −336 lines changed


engines/python/setup/djl_python/encode_decode.py

Lines changed: 10 additions & 7 deletions
@@ -22,26 +22,29 @@
 
 
 def decode_csv(inputs: Input, require_headers=True):  # type: (str) -> np.array
-    csv_string = inputs.get_as_string()
+    csv_content = inputs.get_as_string()
 
     if require_headers:
-        if not any(header in csv_string.splitlines()[0].lower()
+        if not any(header in csv_content.splitlines()[0].lower()
                    for header in ["question", "context", "inputs"]):
             raise ValueError(
                 "You need to provide the correct CSV with Header columns to use it with the inference toolkit default handler.",
             )
-        stream = StringIO(csv_string)
+        stream = StringIO(csv_content)
         request_list = list(csv.DictReader(stream))
         if "inputs" in request_list[0].keys():
             return {"inputs": [entry["inputs"] for entry in request_list]}
         else:
             return {"inputs": request_list}
     else:
         # for predictive ML inputs
-        try:
-            return np.genfromtxt(StringIO(csv_string), delimiter=",")
-        except (ValueError, TypeError) as e:
-            raise ValueError(f"Failed to parse CSV data: {str(e)}")
+        result = np.genfromtxt(StringIO(csv_content), delimiter=",")
+        # Check for NaN values which indicate non-numeric data
+        if np.isnan(result).any():
+            raise ValueError(
+                "CSV contains non-numeric data. Please provide numeric data only."
+            )
+        return result
 
 
 def encode_csv(content):  # type: (str) -> np.array
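
Note: np.genfromtxt converts cells it cannot parse into NaN instead of raising, so the old try/except never surfaced non-numeric input; the new code checks for NaN explicitly. A minimal standalone sketch of that validation path, without the djl_python Input wrapper (parse_numeric_csv is an illustrative name):

from io import StringIO

import numpy as np


def parse_numeric_csv(csv_content: str) -> np.ndarray:
    # genfromtxt maps unparseable cells to NaN rather than raising,
    # so non-numeric data is detected with an explicit NaN check.
    result = np.genfromtxt(StringIO(csv_content), delimiter=",")
    if np.isnan(result).any():
        raise ValueError(
            "CSV contains non-numeric data. Please provide numeric data only.")
    return result


print(parse_numeric_csv("1,2,3\n4,5,6"))  # [[1. 2. 3.] [4. 5. 6.]]
# parse_numeric_csv("1,two,3")            # raises ValueError
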

engines/python/setup/djl_python/sklearn_handler.py

Lines changed: 3 additions & 3 deletions
@@ -36,8 +36,8 @@ def _get_trusted_types(self, properties: dict):
         trusted_types_str = properties.get("skops_trusted_types", "")
         if not trusted_types_str:
             raise ValueError(
-                "SKLEARN_SKOPS_TRUSTED_TYPES environment variable must be set to load skops models. "
-                "Example: SKLEARN_SKOPS_TRUSTED_TYPES='sklearn.ensemble._forest.RandomForestClassifier,numpy.ndarray'"
+                "option.skops_trusted_types must be set to load skops models. "
+                "Example: option.skops_trusted_types='sklearn.ensemble._forest.RandomForestClassifier,numpy.ndarray'"
             )
         trusted_types = [
             t.strip() for t in trusted_types_str.split(",") if t.strip()
@@ -75,7 +75,7 @@ def initialize(self, properties: dict):
         if properties.get("trust_insecure_model_files",
                           "false").lower() != "true":
             raise ValueError(
-                f"trust_insecure_model_files must be set to 'true' to use {model_format} format (only skops is secure by default)"
+                f"option.trust_insecure_model_files must be set to 'true' to use {model_format} format (only skops is secure by default)"
             )
 
         if model_format == "joblib":
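
Note: the error messages now point at the serving.properties keys (option.skops_trusted_types, option.trust_insecure_model_files) rather than an environment variable. As a rough standalone sketch of how such a comma-separated option value becomes the trusted-types list, mirroring _get_trusted_types above (parse_trusted_types is an illustrative name):

def parse_trusted_types(trusted_types_str: str) -> list:
    # Empty values are rejected, matching the handler's error above.
    if not trusted_types_str:
        raise ValueError(
            "option.skops_trusted_types must be set to load skops models.")
    # Split on commas, dropping surrounding whitespace and empty entries.
    return [t.strip() for t in trusted_types_str.split(",") if t.strip()]


print(parse_trusted_types(
    "sklearn.ensemble._forest.RandomForestClassifier, numpy.ndarray"))
# ['sklearn.ensemble._forest.RandomForestClassifier', 'numpy.ndarray']
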

engines/python/setup/djl_python/xgboost_handler.py

Lines changed: 7 additions & 4 deletions
@@ -31,7 +31,8 @@ def __init__(self):
 
     def initialize(self, properties: dict):
         model_dir = properties.get("model_dir")
-        model_format = properties.get("model_format", "json")
+        model_format = (properties.get("model_format")
+                        or os.environ.get("MODEL_FORMAT") or "json")
 
         format_extensions = {
             "json": ["json"],
@@ -56,10 +57,12 @@ def initialize(self, properties: dict):
             self.model = xgb.Booster()
             self.model.load_model(model_file)
         else:  # unsafe formats: pickle, xgb
-            if properties.get("trust_insecure_model_files",
-                              "false").lower() != "true":
+            trust_insecure = (properties.get("trust_insecure_model_files")
+                              or os.environ.get("TRUST_INSECURE_MODEL_FILES")
+                              or "false")
+            if trust_insecure.lower() != "true":
                 raise ValueError(
-                    "trust_insecure_model_files must be set to 'true' to use unsafe formats (only json/ubj are secure by default)"
+                    "option.trust_insecure_model_files must be set to 'true' to use unsafe formats (only json/ubj are secure by default)"
                 )
             if model_format == "pickle":
                 with open(model_file, 'rb') as f:
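
Note: the handler now falls back from serving.properties to an environment variable before applying the default ("json" for model_format, "false" for trust_insecure_model_files). A hedged sketch of that lookup order as a standalone helper (resolve_option is an illustrative name, not part of the handler):

import os


def resolve_option(properties: dict, key: str, env_var: str, default: str) -> str:
    # serving.properties wins, then the environment variable, then the default.
    return properties.get(key) or os.environ.get(env_var) or default


os.environ["MODEL_FORMAT"] = "ubj"
print(resolve_option({}, "model_format", "MODEL_FORMAT", "json"))  # ubj
print(resolve_option({"model_format": "json"}, "model_format",
                     "MODEL_FORMAT", "json"))                      # json
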

serving/docker/Dockerfile

Lines changed: 1 addition & 0 deletions
@@ -84,6 +84,7 @@ RUN scripts/install_python.sh ${python_version} && \
     scripts/patch_oss_dlc.sh python && \
     pip3 install torch=="${torch_version}" torchvision --extra-index-url https://download.pytorch.org/whl/cpu && \
     pip3 install scikit-learn=="${sklearn_version}" skops cloudpickle xgboost=="${xgboost_version}" pydantic=="${pydantic_version}" && \
+    pip3 install --upgrade numpy && \
     echo "${djl_serving_version} cpufull" > /opt/djl/bin/telemetry && \
     rm -rf /opt/djl/logs && \
     chown -R djl:djl /opt/djl && \

tests/integration/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@
 /logs
 /all_logs
 /models
+/client_logs

tests/integration/download_models.sh

Lines changed: 5 additions & 12 deletions
@@ -34,6 +34,7 @@ python_skl_models_urls=(
   "https://resources.djl.ai/test-models/python/sklearn/sklearn_multi_model_v2.zip"
   "https://resources.djl.ai/test-models/python/sklearn/sklearn_unsafe_model_v2.zip"
   "https://resources.djl.ai/test-models/python/sklearn/sklearn_custom_model_v2.zip"
+  "https://resources.djl.ai/test-models/python/sklearn/sklearn_skops_model_env_v2.zip"
 )
 
 python_xgb_models_urls=(
@@ -47,18 +48,10 @@ python_xgb_models_urls=(
 download() {
   urls=("$@")
   for url in "${urls[@]}"; do
-    if [[ "$url" == */ ]]; then
-      # Directory URL - use wget to download recursively
-      dirname=$(basename "${url%/}")
-      if ! [ -d "${dirname}" ]; then
-        wget -r -np -nH --cut-dirs=3 -R "index.html*" "$url"
-      fi
-    else
-      # File URL - use curl with cache-busting headers
-      filename=${url##*/}
-      if ! [ -f "${filename}" ]; then
-        curl -sf -H "Cache-Control: no-cache" -H "Pragma: no-cache" -O "$url"
-      fi
+    filename=${url##*/}
+    # does not download the file, if file already exists
+    if ! [ -f "${filename}" ]; then
+      curl -sf -O "$url"
     fi
   done
 }
