diff --git a/.github/workflows/ansible-lint.yml b/.github/workflows/ansible-lint.yml index 322a9882ad..7ef3c5c66e 100644 --- a/.github/workflows/ansible-lint.yml +++ b/.github/workflows/ansible-lint.yml @@ -10,6 +10,7 @@ on: - pub/multiple_login_node - pub/local_repo_arch - pub/k8s_plugins + - pub/slurm_multi_arch jobs: build: diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index b8b7289834..060376d94e 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -9,6 +9,7 @@ on: - pub/multiple_login_node - pub/local_repo_arch - pub/k8s_plugins + - pub/slurm_multi_arch jobs: build: diff --git a/accelerator/ansible.cfg b/accelerator/ansible.cfg index 8fa2a0501b..18cb3ca935 100644 --- a/accelerator/ansible.cfg +++ b/accelerator/ansible.cfg @@ -14,4 +14,4 @@ connect_timeout = 180 [ssh_connection] retries = 3 -ssh_args = -o ControlMaster=auto -o ControlPersist=60 -o ConnectTimeout=60 \ No newline at end of file +ssh_args = -o ControlMaster=auto -o ControlPersist=60 -o ConnectTimeout=60 diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index 004541fefc..11fd45f643 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -75,7 +75,7 @@ files["omnia_config"], files["high_availability_config"] ], - "k8s": [ + "compute_k8s": [ files['roles_config'], files["omnia_config"], files["high_availability_config"] diff --git a/common/library/module_utils/input_validation/schema/credential_rules.json b/common/library/module_utils/input_validation/schema/credential_rules.json index b5c10d041b..6ffa66cf9b 100644 --- a/common/library/module_utils/input_validation/schema/credential_rules.json +++ b/common/library/module_utils/input_validation/schema/credential_rules.json @@ -130,5 +130,17 @@ "maxLength": 128, "pattern": 
"^(?!admin$)[^\\\\\\-'\"]+$", "description": "Password for grafana UI. Should not be kept 'admin. Length must be at least 5 characters and must not contain backslashes (\\), hyphens (-), single quotes ('), or double quotes (\\\")." + }, + "csi_username": { + "minLength": 4, + "maxLength": 64, + "description": "Username for Powerscale UI. Must not contain semicolons (;), square brackets ([]), or backticks (`).", + "pattern": "^[^;\\[\\]`]+$" + }, + "csi_password": { + "description": "Password for Powerscale UI. Must not contain hyphens (-), single quotes ('), double quotes (\"), at symbols (@), or backslashes (\\).", + "minLength": 5, + "maxLength": 32, + "pattern": "^[^\\-\\'\\\"@\\\\]*$" } } diff --git a/common/library/module_utils/input_validation/schema/omnia_config.json b/common/library/module_utils/input_validation/schema/omnia_config.json index c2bd0f58eb..0f5c359d1f 100644 --- a/common/library/module_utils/input_validation/schema/omnia_config.json +++ b/common/library/module_utils/input_validation/schema/omnia_config.json @@ -59,6 +59,16 @@ "k8s_offline_install": { "type": "boolean", "description": "Whether to pull packages/images from local repo." 
+ }, + "csi_powerscale_driver_secret_file_path": { + "description": "Absolute file path for the secret.yaml file.", + "type": "string", + "pattern": "^(|/?([a-zA-Z0-9._-]+/)*[a-zA-Z0-9._-]+\\.yaml)$" + }, + "csi_powerscale_driver_values_file_path": { + "description": "File path for the values.yaml file.", + "type": "string", + "pattern": "^(|/?([a-zA-Z0-9._-]+/)*[a-zA-Z0-9._-]+\\.yaml)$" } }, "required": [ @@ -74,6 +84,20 @@ "then": { "required": ["topology_manager_scope"] } + }, + { + "if": { + "properties": { + "csi_powerscale_driver_secret_file_path": { + "type": "string", + "minLength": 1 + } + }, + "required": ["csi_powerscale_driver_secret_file_path"] + }, + "then": { + "required": ["csi_powerscale_driver_values_file_path"] + } } ] } @@ -122,6 +146,16 @@ "k8s_offline_install": { "type": "boolean", "description": "Whether to pull packages/images from local repo." + }, + "csi_powerscale_driver_secret_file_path": { + "description": "Absolute file path for the secret.yaml file.", + "type": "string", + "pattern": "^(|/?([a-zA-Z0-9._-]+/)*[a-zA-Z0-9._-]+\\.yaml)$" + }, + "csi_powerscale_driver_values_file_path": { + "description": "File path for the values.yaml file.", + "type": "string", + "pattern": "^(|/?([a-zA-Z0-9._-]+/)*[a-zA-Z0-9._-]+\\.yaml)$" } }, "required": [ @@ -137,6 +171,20 @@ "then": { "required": ["topology_manager_scope"] } + }, + { + "if": { + "properties": { + "csi_powerscale_driver_secret_file_path": { + "type": "string", + "minLength": 1 + } + }, + "required": ["csi_powerscale_driver_secret_file_path"] + }, + "then": { + "required": ["csi_powerscale_driver_values_file_path"] + } } ] } diff --git a/common/library/module_utils/input_validation/schema/telemetry_config.json b/common/library/module_utils/input_validation/schema/telemetry_config.json index d8f330412d..f4c9d9b7f0 100644 --- a/common/library/module_utils/input_validation/schema/telemetry_config.json +++ b/common/library/module_utils/input_validation/schema/telemetry_config.json @@ -3,6 
+3,15 @@ "title": "Telemetry Configuration", "type": "object", "properties": { + "kube_prometheus_support": { + "type": "boolean" + }, + "prometheus_scrape_interval": { + "type": "integer", + "minimum": 1, + "default": 15 + }, + "idrac_telemetry_support": { "type": "boolean" }, @@ -17,6 +26,8 @@ } }, "required": [ + "kube_prometheus_support", + "prometheus_scrape_interval", "idrac_telemetry_support", "visualization_support", "federated_idrac_telemetry_collection" diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 3c4f4f6527..fceeebdc1c 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -17,13 +17,13 @@ """ import json import os -import ipaddress import yaml +import ipaddress +import subprocess from ast import literal_eval import ansible.module_utils.input_validation.common_utils.data_fetch as get +from ansible.module_utils.input_validation.validation_flows import csi_driver_validation import ansible.module_utils.input_validation.common_utils.data_validation as validate -from ansible.modules.validate_input import generate_log_failure_message - from ansible.module_utils.input_validation.common_utils import ( validation_utils, config, @@ -32,7 +32,6 @@ ) from ansible.module_utils.input_validation.validation_flows import scheduler_validation - from ansible.module_utils.local_repo.software_utils import ( load_json, set_version_variables, @@ -947,7 +946,11 @@ def is_ip_in_range(ip_str, ip_range_str): except ValueError: return False -def validate_k8s(data, admin_bmc_networks, softwares, ha_config, tag_names, errors): + + + +def validate_k8s(data, admin_bmc_networks, softwares, ha_config, tag_names, errors, + omnia_base_dir, project_name, logger, module, input_file_path): """ Validates Kubernetes cluster 
configurations. @@ -962,10 +965,10 @@ def validate_k8s(data, admin_bmc_networks, softwares, ha_config, tag_names, erro bmc_static_range = admin_bmc_networks["bmc_network"]["static_range"] bmc_dynamic_range = admin_bmc_networks["bmc_network"]["dynamic_range"] primary_oim_admin_ip = admin_bmc_networks["admin_network"]["primary_oim_admin_ip"] - + # service_k8s_cluster = data["service_k8s_cluster"] cluster_set = {} - if "k8s" in softwares and "k8s" in tag_names: + if "compute_k8s" in softwares and "compute_k8s" in tag_names: cluster_set["compute_k8s_cluster"] = data.get( "compute_k8s_cluster", []) if "service_k8s" in softwares and "service_k8s" in tag_names: @@ -997,7 +1000,7 @@ def validate_k8s(data, admin_bmc_networks, softwares, ha_config, tag_names, erro f"{cluster_name} not found in high_availability_config.yml" )) pod_external_ip_range = kluster.get("pod_external_ip_range") - if not pod_external_ip_range: + if not pod_external_ip_range or str(pod_external_ip_range).strip() == "": errors.append( create_error_msg( "Pod External IP Range -", @@ -1031,7 +1034,41 @@ def validate_k8s(data, admin_bmc_networks, softwares, ha_config, tag_names, erro create_error_msg( "IP overlap -", None, - en_us_validation_msg.IP_OVERLAP_FAIL_MSG)) + en_us_validation_msg.IP_OVERLAP_FAIL_MSG)) + + #csi validation + if ( + "csi_driver_powerscale" in softwares + and ("compute_k8s" in softwares or "service_k8s" in softwares) + ): + + csi_secret_file_path = kluster.get("csi_powerscale_driver_secret_file_path") + csi_values_file_path = kluster.get("csi_powerscale_driver_values_file_path") + + # Validate secret file path + if not csi_secret_file_path or \ + not csi_secret_file_path.strip() or \ + not os.path.exists(csi_secret_file_path.strip()): + errors.append( + create_error_msg( + "csi_powerscale_driver_secret_file_path", + csi_secret_file_path, + en_us_validation_msg.CSI_DRIVER_SECRET_FAIL_MSG, + ) + ) + else: + # If secret path is valid, ensure values path is also valid + if not 
csi_values_file_path or \ + not csi_values_file_path.strip() or \ + not os.path.exists(csi_values_file_path.strip()): + errors.append( + create_error_msg( + "csi_powerscale_driver_values_file_path", + csi_values_file_path, + en_us_validation_msg.CSI_DRIVER_VALUES_FAIL_MSG, + ) + ) + csi_driver_validation.validate_powerscale_secret_and_values_file(csi_secret_file_path,csi_values_file_path, errors, input_file_path) def validate_omnia_config( input_file_path, @@ -1077,8 +1114,8 @@ def validate_omnia_config( ) ) - if ("k8s" in sw_list or "service_k8s" in sw_list) and \ - ("k8s" in tag_names or "service_k8s" in tag_names): + if ("compute_k8s" in sw_list or "service_k8s" in sw_list) and \ + ("compute_k8s" in tag_names or "service_k8s" in tag_names): admin_bmc_networks = get_admin_bmc_networks( input_file_path, logger, module, omnia_base_dir, module_utils_base, project_name) ha_config_path = create_file_path( @@ -1087,7 +1124,8 @@ def validate_omnia_config( ha_config = yaml.safe_load(f) for k in ["service_k8s_cluster_ha", "compute_k8s_cluster_ha"]: ha_config[k] = [xha["cluster_name"] for xha in ha_config.get(k, [])] - validate_k8s(data, admin_bmc_networks, sw_list, ha_config, tag_names, errors) + validate_k8s(data, admin_bmc_networks, sw_list, ha_config, tag_names, + errors, omnia_base_dir, project_name, logger, module, input_file_path) return errors def validate_telemetry_config( @@ -1233,3 +1271,4 @@ def validate_additional_software( ) ) return errors + diff --git a/common/library/module_utils/input_validation/validation_flows/csi_driver_validation.py b/common/library/module_utils/input_validation/validation_flows/csi_driver_validation.py new file mode 100644 index 0000000000..27066c0b0a --- /dev/null +++ b/common/library/module_utils/input_validation/validation_flows/csi_driver_validation.py @@ -0,0 +1,261 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=import-error,no-name-in-module,too-many-positional-arguments,too-many-arguments,unused-argument +""" +Validates csi driver configuration files for Omnia. +""" +import os +import yaml +from ansible.module_utils.input_validation.common_utils import validation_utils +from ansible.module_utils.input_validation.common_utils import config + +file_names = config.files +create_error_msg = validation_utils.create_error_msg +create_file_path = validation_utils.create_file_path +contains_software = validation_utils.contains_software +check_mandatory_fields = validation_utils.check_mandatory_fields +flatten_sub_groups = validation_utils.flatten_sub_groups + + +def validate_secret_isilon_clusters(data): + """ + Validates csi secret file inputs for Omnia. 
+ """ + + cluster_errors = [] + clusters = data.get("isilonClusters") + + # Check if isilonClusters is a defined, non-empty list + if not isinstance(clusters, list) or len(clusters) == 0: + cluster_errors.append("isilonClusters must be a non-empty list.") + return cluster_errors # Stop further checks + + for idx, item in enumerate(clusters): + cluster_prefix = f"Cluster {idx + 1}" + + # Validate clusterName + if not isinstance(item.get("clusterName"), str) or not item["clusterName"].strip(): + cluster_errors.append(f"{cluster_prefix}: Invalid or missing 'clusterName'.") + + # Validate username + if not isinstance(item.get("username"), str) or not item["username"].strip(): + cluster_errors.append(f"{cluster_prefix}: Invalid or missing 'username'.") + + # Validate password + if not isinstance(item.get("password"), str) or not item["password"].strip(): + cluster_errors.append(f"{cluster_prefix}: Invalid or missing 'password'.") + + # Validate endpoint + if not isinstance(item.get("endpoint"), str) or not item["endpoint"].strip(): + cluster_errors.append(f"{cluster_prefix}: Invalid or missing 'endpoint'.") + + # Validate endpointPort if defined + if "endpointPort" in item: + if not isinstance(item["endpointPort"], int) or not 0 < item["endpointPort"] < 65536: + cluster_errors.append( + f"{cluster_prefix}: 'endpointPort' must be an integer between 1 and 65535.") + + # Validate isDefault + if "isDefault" not in item or not isinstance(item["isDefault"], bool): + cluster_errors.append( + f"{cluster_prefix}: 'isDefault' must be a boolean and must be defined.") + + # Validate skipCertificateValidation if defined + if "skipCertificateValidation" in item: + if item["skipCertificateValidation"] is not True: + cluster_errors.append( + f"{cluster_prefix}: 'skipCertificateValidation' must be true if defined.") + + # Validate isiPath if defined + if "isiPath" in item: + isi_path = item["isiPath"] + if ( + not isinstance(isi_path, str) or + not isi_path.strip() or + not 
isi_path.lstrip().startswith('/') + ) + cluster_errors.append( + f"{cluster_prefix}: 'isiPath' must be a non-empty valid Unix absolute path.") + + # Validate isiVolumePathPermissions if defined + if "isiVolumePathPermissions" in item: + perms = item["isiVolumePathPermissions"] + if not isinstance(perms, str) or not perms.strip().isdigit(): + msg = ( + f"{cluster_prefix}: 'isiVolumePathPermissions' must be " + "a numeric (octal) string." + ) + cluster_errors.append(msg) + return cluster_errors + +def validate_value_file_inputs(values_data): + """ + Validates csi value file inputs for Omnia. + """ + + value_errors = [] + + def add_error(field_path, value, msg): + value_errors.append( + f"Validation Error - {field_path}: '{value}' -> {msg}" + ) + + # Helper to safely get nested values + def get_nested(data, keys, default=None): + for key in keys: + if not isinstance(data, dict) or key not in data: + return default + data = data[key] + return data + + # 1. controller.controllerCount == 1 + controller_count = get_nested(values_data, ["controller", "controllerCount"]) + if controller_count != 1: + add_error("controller.controllerCount", controller_count, "Must be 1") + + # 2. controller.replication.enabled == false + replication_enabled = get_nested(values_data, ["controller", "replication", "enabled"]) + if replication_enabled is None or replication_enabled is not False: + add_error("controller.replication.enabled", replication_enabled, "Must be false") + + # 3. controller.resizer.enabled in [true, false] + resizer_enabled = get_nested(values_data, ["controller", "resizer", "enabled"]) + if resizer_enabled not in [True, False]: + add_error("controller.resizer.enabled", resizer_enabled, "Must be true or false") + + # 4. controller.snapshot.enabled == true + snapshot_enabled = get_nested(values_data, ["controller", "snapshot", "enabled"]) + if snapshot_enabled is not True: + add_error("controller.snapshot.enabled", snapshot_enabled, "Must be true") + + # 5. 
endpointPort is int in 1..65535 + endpoint_port = values_data.get("endpointPort") + if endpoint_port is None or not isinstance(endpoint_port, int) or not 1 <= endpoint_port <= 65535: + add_error("endpointPort", endpoint_port, "Must be between 1 and 65535") + + # 6. skipCertificateValidation == true + skip_cert = values_data.get("skipCertificateValidation") + if skip_cert is not True: + add_error("skipCertificateValidation", skip_cert, "Must be true") + + # 7. isiAuthType in [0, 1] + isi_auth = values_data.get("isiAuthType") + if isi_auth not in [0, 1]: + add_error("isiAuthType", isi_auth, "Must be 0 or 1") + + # 8. isiAccessZone is non-empty string + isi_access = values_data.get("isiAccessZone") + if not isi_access or not isinstance(isi_access, str) or not isi_access.strip(): + add_error("isiAccessZone", isi_access, "Must be a non-empty string") + + # 9. isiPath is Unix absolute path + isi_path = values_data.get("isiPath") + if not isinstance(isi_path, str) or not isi_path.startswith("/"): + add_error("isiPath", isi_path, "Must be a valid Unix absolute path") + + # 10. 
isiVolumePathPermissions is a non-empty string + permissions = values_data.get("isiVolumePathPermissions") + if not permissions or not isinstance(permissions, str) or not permissions.strip(): + add_error("isiVolumePathPermissions", permissions, "Must be a valid octal string") + + return value_errors + +def encrypt_file(secret_file_path, vault_secret_file_path): + """ + encrypt the secret file + """ + + cmd = [ + "ansible-vault", + "encrypt", + secret_file_path, + "--vault-password-file", + vault_secret_file_path, + ] + return validation_utils.run_subprocess(cmd) + +def decrypt_file(secret_file_path, vault_secret_file_path): + """ + decrypt the secret file + Takes 2 inputs: file name and secret file path + """ + + cmd = [ + "ansible-vault", + "decrypt", + secret_file_path, + "--vault-password-file", + vault_secret_file_path, + ] + return validation_utils.run_subprocess(cmd) + +def process_encrypted_file(secret_file_path,vault_secret_file_path,errors): + """ + Process the secret file + decrypt the file first then parse it to get data + """ + + decrypted_file = decrypt_file(secret_file_path, vault_secret_file_path) + if decrypted_file: + try: + with open(secret_file_path, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) + encrypt_file(secret_file_path, vault_secret_file_path) + return data + except FileNotFoundError: + errors.append(create_error_msg("File not found", + secret_file_path, "Please check the associated file exists")) + except yaml.YAMLError: + errors.append(create_error_msg("Error loading yaml file", + secret_file_path, "Please check the associated file syntax")) + else: + errors.append(create_error_msg("Error occurred when attempting to decrypt file.", + secret_file_path, "Please check that the associated vault file exists")) + return decrypted_file + +def validate_powerscale_secret_and_values_file( + secret_file_path, values_file_path, + errors, input_file_path): + """ + Driver code to initiate the powerscale secret and values file input 
validation + """ + + #validate secret file inputs + secrets_file_encrypted = validation_utils.is_file_encrypted(secret_file_path) + file_path = os.path.dirname(input_file_path) + vault_secret_file_path = os.path.join(file_path, ".csi_powerscale_secret_vault") + + if secrets_file_encrypted: + secret_data = process_encrypted_file(secret_file_path, vault_secret_file_path,errors) + if secret_data is None or secret_data is False: + errors.append(create_error_msg( + "Secret File Load", + secret_file_path, + "Failed to load or parse secret.yaml file. It may be invalid or empty." + )) + else: + secret_validation_errors = validate_secret_isilon_clusters(secret_data) + if secret_validation_errors: + for err in secret_validation_errors: + errors.append( + create_error_msg("Powerscale Secret File Validation Error:", err, None)) + + #validate values file input + with open(values_file_path, "r", encoding="utf-8") as f: + values_data = yaml.safe_load(f) + values_validation_errors = validate_value_file_inputs(values_data) + if values_validation_errors: + for value_err in values_validation_errors: + errors.append( + create_error_msg("Powerscale Value File Validation Error:", value_err, None)) diff --git a/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py b/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py index 1529875d6b..e6ab242401 100644 --- a/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py @@ -655,7 +655,8 @@ def validate_ha_config(ha_data, mandatory_fields, errors, config_type=None): ("oim_ha", ["admin_virtual_ip_address", "active_node_service_tag", "passive_nodes"]), ("service_node_ha", ["service_nodes"]), ("slurm_head_node_ha", ["virtual_ip_address", "active_node_service_tag", "passive_nodes"]), - ("k8s_head_node_ha", ["virtual_ip_address", 
"active_node_service_tags"]) + ("compute_k8s_head_node_ha", ["virtual_ip_address", "active_node_service_tags"]), + ("service_k8s_head_node_ha", ["virtual_ip_address", "active_node_service_tags"]) ] for config_name, mandatory_fields in ha_configs: diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index 22b8d26357..d2e1580966 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -101,7 +101,7 @@ "create_container_remote_auth": "pulp container remote create --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'", - "update_container_remote_auth": "pulp container remote update --name %s --url %s --upstream-name %s --policy %s --include-tags '[\"%s\"]' --username %s --password '%s'" + "update_container_remote_auth": "pulp container remote update --name %s --url %s --upstream-name %s --policy %s --include-tags '%s' --username %s --password '%s'" } OMNIA_CREDENTIALS_YAML_PATH = "/opt/omnia/input/project_default/omnia_config_credentials.yml" diff --git a/common/library/module_utils/local_repo/download_image.py b/common/library/module_utils/local_repo/download_image.py index 07293d9c6c..c9b3020a7b 100644 --- a/common/library/module_utils/local_repo/download_image.py +++ b/common/library/module_utils/local_repo/download_image.py @@ -82,7 +82,8 @@ def create_container_remote_with_auth(remote_name, remote_url, package, policy_t return True new_tags = existing_tags + [tag] - tags_str = ",".join(new_tags) + tags_str = json.dumps(new_tags) + update_command = pulp_container_commands["update_container_remote_auth"] % ( remote_name, remote_url, package, policy_type, tags_str, docker_username, docker_password diff --git a/examples/rhel_software_config.json b/examples/rhel_software_config.json index 8e3fe97df5..212f5876f3 100644 --- a/examples/rhel_software_config.json +++ b/examples/rhel_software_config.json @@ 
-13,14 +13,15 @@ {"name": "nfs"}, {"name": "beegfs", "version": "7.4.5"}, {"name": "slurm"}, - {"name": "k8s", "version": "1.31.4"}, + {"name": "compute_k8s", "version": "1.31.4"}, {"name": "service_k8s", "version": "1.31.4"}, {"name": "intel_benchmarks", "version": "2024.1.0"}, {"name": "amd_benchmarks"}, {"name": "utils"}, {"name": "ucx", "version": "1.15.0"}, {"name": "openmpi", "version": "4.1.6"}, - {"name": "racadm"} + {"name": "racadm"}, + {"name": "csi_driver_powerscale", "version":"v2.14.0"} ], "amdgpu": [ diff --git a/examples/software_config_template/template_rhel_9.6_software_config.json b/examples/software_config_template/template_rhel_9.6_software_config.json index 8e3fe97df5..48467e2fbc 100644 --- a/examples/software_config_template/template_rhel_9.6_software_config.json +++ b/examples/software_config_template/template_rhel_9.6_software_config.json @@ -13,14 +13,15 @@ {"name": "nfs"}, {"name": "beegfs", "version": "7.4.5"}, {"name": "slurm"}, - {"name": "k8s", "version": "1.31.4"}, + {"name": "compute_k8s", "version": "1.31.4"}, {"name": "service_k8s", "version": "1.31.4"}, {"name": "intel_benchmarks", "version": "2024.1.0"}, {"name": "amd_benchmarks"}, {"name": "utils"}, {"name": "ucx", "version": "1.15.0"}, {"name": "openmpi", "version": "4.1.6"}, - {"name": "racadm"} + {"name": "racadm"}, + {"name": "csi_driver_powerscale", "version":"v2.14.0"} ], "amdgpu": [ diff --git a/input/config/rhel/9.6/k8s.json b/input/config/rhel/9.6/compute_k8s.json similarity index 99% rename from input/config/rhel/9.6/k8s.json rename to input/config/rhel/9.6/compute_k8s.json index 76776b3b23..a3619bc8b1 100644 --- a/input/config/rhel/9.6/k8s.json +++ b/input/config/rhel/9.6/compute_k8s.json @@ -1,5 +1,5 @@ { - "k8s": { + "compute_k8s": { "cluster": [ { "package": "firewalld", diff --git a/input/config/rhel/9.6/csi_driver_powerscale.json b/input/config/rhel/9.6/csi_driver_powerscale.json new file mode 100644 index 0000000000..47d6cd6f18 --- /dev/null +++ 
b/input/config/rhel/9.6/csi_driver_powerscale.json @@ -0,0 +1,89 @@ +{ + "csi_driver_powerscale": { + "cluster": [ + { + "package": "csi-powerscale", + "url": "https://github.com/dell/csi-powerscale.git", + "type": "git", + "version": "v2.14.0" + }, + { + "package": "external-snapshotter", + "url": "https://github.com/kubernetes-csi/external-snapshotter.git", + "type": "git", + "version": "v8.3.0" + }, + { + "package": "helm-charts", + "url": "https://github.com/dell/helm-charts.git", + "type": "git", + "version": "csi-isilon-2.14.0" + }, + { + "package": "quay.io/dell/container-storage-modules/csi-isilon", + "tag": "v2.14.0", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-attacher", + "tag": "v4.8.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-provisioner", + "tag": "v5.2.0", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-snapshotter", + "tag": "v8.2.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-resizer", + "tag": "v1.13.2", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-node-driver-registrar", + "tag": "v2.13.0", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-external-health-monitor-controller", + "tag": "v0.14.0", + "type": "image" + }, + { + "package": "quay.io/dell/container-storage-modules/dell-csi-replicator", + "tag": "v1.12.0", + "type": "image" + }, + { + "package": "quay.io/dell/container-storage-modules/podmon", + "tag": "v1.13.0", + "type": "image" + }, + { + "package": "quay.io/dell/container-storage-modules/csm-authorization-sidecar", + "tag": "v2.2.0", + "type": "image" + }, + { + "package": "quay.io/dell/container-storage-modules/csi-metadata-retriever", + "tag": "v1.11.0", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/snapshot-controller", + "tag": "v8.2.1", + "type": "image" + }, + { + "package": "docker.io/dellemc/csm-encryption", + "tag": "v0.6.0", + "type": 
"image" + } + ] + } +} diff --git a/input/config/rhel/9.6/service_k8s.json b/input/config/rhel/9.6/service_k8s.json index 4328b50069..84e0da7b8d 100644 --- a/input/config/rhel/9.6/service_k8s.json +++ b/input/config/rhel/9.6/service_k8s.json @@ -60,6 +60,22 @@ "package": "prettytable==3.14.0", "type": "pip_module" }, + { + "package": "whereabouts", + "url": "https://github.com/k8snetworkplumbingwg/whereabouts.git", + "type": "git", + "version": "v0.8.0" + }, + { + "package": "ghcr.io/k8snetworkplumbingwg/whereabouts", + "tag": "latest", + "type": "image" + }, + { + "package": "nfs-subdir-external-provisioner-4.0.18", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" + }, { "package": "kubectl-v1.31.4", "type": "tarball", @@ -239,6 +255,46 @@ "package": "ghcr.io/kube-vip/kube-vip", "tag": "v0.8.9", "type": "image" + }, + { + "package": "kube-prometheus-stack-75.11.0", + "type": "tarball", + "url": "https://github.com/prometheus-community/helm-charts/releases/download/kube-prometheus-stack-75.11.0/kube-prometheus-stack-75.11.0.tgz" + }, + { + "package": "quay.io/prometheus-operator/prometheus-operator", + "tag": "v0.83.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-state-metrics/kube-state-metrics", + "tag": "v2.16.0", + "type": "image" + }, + { + "package": "quay.io/prometheus-operator/prometheus-config-reloader", + "tag": "v0.83.0", + "type": "image" + }, + { + "package": "quay.io/prometheus/alertmanager", + "tag": "v0.28.1", + "type": "image" + }, + { + "package": "quay.io/prometheus/node-exporter", + "tag": "v1.9.1", + "type": "image" + }, + { + "package": "quay.io/prometheus/prometheus", + "tag": "v3.5.0", + "type": "image" + }, + { + "package": "registry.k8s.io/ingress-nginx/kube-webhook-certgen", + "tag": "v1.6.0", + "type": "image" } ] } diff --git a/input/omnia_config.yml 
b/input/omnia_config.yml index c006061727..17db854d46 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -83,6 +83,8 @@ service_k8s_cluster: topology_manager_policy: "none" topology_manager_scope: "container" k8s_offline_install: true + csi_powerscale_driver_secret_file_path: "" + csi_powerscale_driver_values_file_path: "" compute_k8s_cluster: - cluster_name: compute_cluster @@ -94,3 +96,5 @@ compute_k8s_cluster: topology_manager_policy: "none" topology_manager_scope: "container" k8s_offline_install: true + csi_powerscale_driver_secret_file_path: "" + csi_powerscale_driver_values_file_path: "" diff --git a/input/software_config.json b/input/software_config.json index 9509515233..85bb808ee2 100644 --- a/input/software_config.json +++ b/input/software_config.json @@ -9,7 +9,7 @@ {"name": "ofed", "version": "24.10-3.2.5.0"}, {"name": "openldap"}, {"name": "nfs"}, - {"name": "k8s", "version":"1.31.4"}, + {"name": "compute_k8s", "version":"1.31.4"}, {"name": "service_k8s","version": "1.31.4"}, {"name": "slurm"} ], diff --git a/input/telemetry_config.yml b/input/telemetry_config.yml index b06c52b76a..52a8521e90 100644 --- a/input/telemetry_config.yml +++ b/input/telemetry_config.yml @@ -1,4 +1,4 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,6 +18,18 @@ # SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. # *********************************************************************** +# This variable signifies support for Kubernetes metric collection and deployment of the Kube-Prometheus stack. +# Set to true to deploy Prometheus, Node Exporter, Alertmanager, and related components. 
+# Accepted values: true or false +# Default value: false +kube_prometheus_support: false + +# Specifies the time interval (in seconds) at which Prometheus scrapes metrics from its targets. +# Must be an integer greater than 0. +# Unit: seconds +# Default: 15 +prometheus_scrape_interval: 15 + # This variable is used to enable iDRAC telemetry support # Accepted values: true or false idrac_telemetry_support: false diff --git a/input_validation/validate_config.yml b/input_validation/validate_config.yml index 911bc373e1..36c7c27e9c 100644 --- a/input_validation/validate_config.yml +++ b/input_validation/validate_config.yml @@ -21,7 +21,7 @@ - provision - security - local_repo - - k8s + - compute_k8s - service_k8s - roce - storage diff --git a/local_repo/roles/parse_and_download/tasks/create_k8s_local_repo_metadata.yml b/local_repo/roles/parse_and_download/tasks/create_k8s_local_repo_metadata.yml index 8fcb9d01ce..c79ec7ff3f 100644 --- a/local_repo/roles/parse_and_download/tasks/create_k8s_local_repo_metadata.yml +++ b/local_repo/roles/parse_and_download/tasks/create_k8s_local_repo_metadata.yml @@ -26,8 +26,8 @@ metadata_raw.content | b64decode | from_yaml if metadata_raw is defined and metadata_raw.content is defined else { - "k8s_local_repo_versions": [], - "last_k8s_local_repo_version": "", + "compute_k8s_local_repo_versions": [], + "last_compute_k8s_local_repo_version": "", "service_k8s_local_repo_versions": [], "last_service_k8s_local_repo_version": "" } diff --git a/local_repo/roles/parse_and_download/vars/main.yml b/local_repo/roles/parse_and_download/vars/main.yml index f2da0eefb7..893f09b1b9 100644 --- a/local_repo/roles/parse_and_download/vars/main.yml +++ b/local_repo/roles/parse_and_download/vars/main.yml @@ -47,7 +47,10 @@ local_repo_py_module_vars: beegfs: timeout: 7200 nthreads: 1 - k8s: + compute_k8s: + timeout: 7200 + nthreads: 8 + service_k8s: timeout: 7200 nthreads: 8 slurm: diff --git a/local_repo/roles/validation/tasks/main.yml 
b/local_repo/roles/validation/tasks/main.yml index 3ca1e444ad..37810473be 100644 --- a/local_repo/roles/validation/tasks/main.yml +++ b/local_repo/roles/validation/tasks/main.yml @@ -34,7 +34,7 @@ - name: Validate metadata ansible.builtin.include_tasks: validate_metadata.yml -- name: Validate k8s json +- name: Validate compute k8s and service k8s json ansible.builtin.include_tasks: validate_k8s_json.yml - name: Loop through cluster configs to generate JSON files diff --git a/local_repo/roles/validation/tasks/validate_k8s_json.yml b/local_repo/roles/validation/tasks/validate_k8s_json.yml index 22488abecd..3b9a43d5f5 100644 --- a/local_repo/roles/validation/tasks/validate_k8s_json.yml +++ b/local_repo/roles/validation/tasks/validate_k8s_json.yml @@ -13,30 +13,30 @@ # limitations under the License. --- -- name: Validate k8s json - when: k8s_support is true +- name: Validate compute k8s json + when: compute_k8s_support is true block: - - name: Load k8s.json + - name: Load compute_k8s.json ansible.builtin.set_fact: - k8s_packages_json: "{{ lookup('file', base_k8s_json_file) | from_json }}" + compute_k8s_packages_json: "{{ lookup('file', base_compute_k8s_json_file) | from_json }}" - name: Extract and set facts for tarball URLs ansible.builtin.set_fact: - kubeadm_package: "{{ k8s_packages_json['k8s']['cluster'] | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'kubeadm') | map(attribute='package') | join }}" # noqa: yaml[line-length] + kubeadm_package: "{{ compute_k8s_packages_json['compute_k8s']['cluster'] | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'kubeadm') | map(attribute='package') | join }}" # noqa: yaml[line-length] - - name: Set default fact for dynamic k8s generation + - name: Set default fact for dynamic compute k8s generation ansible.builtin.set_fact: - dynamic_k8s_generation: false + dynamic_compute_k8s_generation: false - - name: Set fact for dynamic genaration when kubeadm package is not 
kubeadm-1.31.4 + - name: Set fact for compute k8s dynamic generation when kubeadm package is not kubeadm-1.31.4 ansible.builtin.set_fact: - dynamic_k8s_generation: true + dynamic_compute_k8s_generation: true when: kubeadm_package != kubeadm_package_name - - name: Set fact for dynamic genaration when k8s version is not equal to 1.31.4 + - name: Set fact for dynamic generation when compute k8s version is not equal to 1.31.4 ansible.builtin.set_fact: - dynamic_k8s_generation: true - when: k8s_version != "1.31.4" + dynamic_compute_k8s_generation: true + when: compute_k8s_version != "1.31.4" - name: Validate service k8s json when: service_k8s_support is true @@ -67,19 +67,19 @@ ansible.builtin.set_fact: cluster_configs: [] -# Append k8s cluster config if k8s_support is true -- name: Add k8s cluster config to cluster_configs +# Append compute_k8s cluster config if compute_k8s_support is true +- name: Add compute k8s cluster config to cluster_configs ansible.builtin.set_fact: - cluster_configs: "{{ cluster_configs + [k8s_config] }}" + cluster_configs: "{{ cluster_configs + [compute_k8s_config] }}" vars: - k8s_config: - name: "k8s" - version: "{{ k8s_version }}" - json_file: "{{ dynamic_k8s_json_folder }}/v{{ k8s_version }}/k8s_v{{ k8s_version }}.json" - base_json_file: "{{ project_input_path }}/config/{{ cluster_os_type }}/{{ cluster_os_version }}/k8s.json" - backup_json_file: "{{ dynamic_k8s_json_folder }}/k8s_bkp.json" - run_condition: "{{ k8s_support and dynamic_k8s_generation }}" - when: k8s_support | bool + compute_k8s_config: + name: "compute_k8s" + version: "{{ compute_k8s_version }}" + json_file: "{{ dynamic_k8s_json_folder }}/v{{ compute_k8s_version }}/compute_k8s_v{{ compute_k8s_version }}.json" + base_json_file: "{{ project_input_path }}/config/{{ cluster_os_type }}/{{ cluster_os_version }}/compute_k8s.json" + backup_json_file: "{{ dynamic_k8s_json_folder }}/compute_k8s_bkp.json" + run_condition: "{{ compute_k8s_support and dynamic_compute_k8s_generation }}" + 
when: compute_k8s_support | bool # Append service_k8s cluster config if service_k8s_support is true - name: Add service_k8s cluster config to cluster_configs @@ -101,9 +101,9 @@ {{ [ { - 'name': 'k8s', - 'version': k8s_version | default(''), - 'enabled': k8s_support | default(false) + 'name': 'compute_k8s', + 'version': compute_k8s_version | default(''), + 'enabled': compute_k8s_support | default(false) }, { 'name': 'service_k8s', diff --git a/local_repo/roles/validation/tasks/validate_software_config_json.yml b/local_repo/roles/validation/tasks/validate_software_config_json.yml index 6c1c41ccc0..c632235f6a 100644 --- a/local_repo/roles/validation/tasks/validate_software_config_json.yml +++ b/local_repo/roles/validation/tasks/validate_software_config_json.yml @@ -19,7 +19,7 @@ rocm_version: "omnia_default" bcm_roce_libraries_version: "omnia_default" intelgaudi_version: "omnia_default" - k8s_support: false + compute_k8s_support: false service_k8s_support: false - name: Check that the software_config.json exists @@ -65,9 +65,9 @@ repo_config: "{{ software_config.repo_config }}" software: "{{ software_config.softwares }}" -- name: Check if k8s support is true +- name: Check if compute_k8s support is true ansible.builtin.set_fact: - k8s_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'k8s') | list | length > 0 }}" + compute_k8s_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'compute_k8s') | list | length > 0 }}" - name: Check if service k8s support is true ansible.builtin.set_fact: diff --git a/local_repo/roles/validation/vars/main.yml b/local_repo/roles/validation/vars/main.yml index ed25184069..9be41c6a97 100644 --- a/local_repo/roles/validation/vars/main.yml +++ b/local_repo/roles/validation/vars/main.yml @@ -23,7 +23,7 @@ usage_message: | Example: ansible-playbook local_repo.yml -e "softwares=additional_software" - ansible-playbook local_repo.yml -e "softwares=k8s,slurm" + ansible-playbook local_repo.yml -e 
"softwares=compute_k8s,slurm" If you have NOT modified any software JSON files, you can run: ansible-playbook local_repo.yml" @@ -31,14 +31,12 @@ softwares_var_not_provided: "No softwares variable provided. Skipping." softwares_invalid_msg: "Invalid software_name(s) found: {{ softwares_list | difference(software_names) }}. Allowed values: {{ software_names }}" # Usage: dynamic_k8s_json.yml -# dynamic_k8s_json_file: "{{ dynamic_k8s_json_folder }}/v{{ k8s_version }}/k8s_v{{ k8s_version }}.json" -base_k8s_json_file: "{{ project_input_path }}/config/{{ cluster_os_type }}/{{ cluster_os_version }}/k8s.json" +base_compute_k8s_json_file: "{{ project_input_path }}/config/{{ cluster_os_type }}/{{ cluster_os_version }}/compute_k8s.json" base_service_k8s_json_file: "{{ project_input_path }}/config/{{ cluster_os_type }}/{{ cluster_os_version }}/service_k8s.json" dynamic_k8s_json_folder: "/opt/omnia/k8s_dynamic_json" kubespray_url: "https://github.com/kubernetes-sigs/kubespray.git" package_type_list: ["rpm", "deb", "tarball", "image", "manifest", "pip_module", "git"] dynamic_k8s_json_log_dir: "/opt/omnia/log/local_repo" -# base_k8s_json_bkp_file: "{{ dynamic_k8s_json_folder }}/k8s_bkp.json" # Usage: validate_k8s_json.yml kubeadm_package_name: "kubeadm-v1.31.4" @@ -84,7 +82,7 @@ are not being supported on {{ cluster_os_type }} : {{ missing_json_list | map('r specific_softwares: - 'beegfs' - 'amdgpu' - - 'k8s' + - 'compute_k8s' - 'cuda' - 'ofed' - 'bcm_roce' diff --git a/prepare_oim/roles/deploy_containers/idrac_telemetry/vars/main.yml b/prepare_oim/roles/deploy_containers/idrac_telemetry/vars/main.yml index 25f5c10ee9..4bb4a55f23 100644 --- a/prepare_oim/roles/deploy_containers/idrac_telemetry/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/idrac_telemetry/vars/main.yml @@ -112,7 +112,7 @@ idrac_telemetry_receiver_dir_path: "{{ idrac_telemetry_dir }}/idrac_telemetry_re idrac_telemetry_github: https://github.com/dell/iDRAC-Telemetry-Reference-Tools.git 
idrac_telemetry_reference_git_clone_path: "/opt/omnia/telemetry/idrac_telemetry/idrac_telemetry_receiver/iDRAC-Telemetry-Reference-Tools" idrac_telemetry_reference_path: "{{ idrac_telemetry_receiver_dir_path }}/iDRAC-Telemetry-Reference-Tools" -reference_tools_stable_commit: "94e7621" +reference_tools_stable_commit: "9a1c72b" max_retries: 10 delay_count: 5 idrac_git_clone_error_msg: | diff --git a/prepare_oim/roles/deploy_containers/pcs/vars/main.yml b/prepare_oim/roles/deploy_containers/pcs/vars/main.yml index 1dd93646ff..1d097be9dc 100644 --- a/prepare_oim/roles/deploy_containers/pcs/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/pcs/vars/main.yml @@ -76,7 +76,7 @@ stop_interval: "0s" stop_timeout: "60s" migration_threshold: 0 -ha_migration_threshold: 1 +ha_migration_threshold: 3 failure_timeout: "60s" pcs_group: omnia vip_group: omnia_vip diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml b/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml index ce8490f49b..2e70d9d5fa 100644 --- a/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml +++ b/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml @@ -16,7 +16,7 @@ # Check if k8s is mentioned in software_config.json - name: Check if k8s support is true ansible.builtin.set_fact: - k8s_support: "{{ software_config.softwares | selectattr('name', 'in', ['k8s', 'service_k8s']) | list | length > 0 }}" + k8s_support: "{{ software_config.softwares | selectattr('name', 'in', ['compute_k8s', 'service_k8s']) | list | length > 0 }}" project_input_path: "{{ hostvars['localhost']['input_project_dir'] }}" cluster_os_type: "{{ software_config.cluster_os_type }}" cluster_os_version: "{{ software_config.cluster_os_version }}" @@ -27,7 +27,7 @@ block: - name: Extract k8s version ansible.builtin.set_fact: - k8s_versions: "{{ software_config.softwares | selectattr('name', 'in', ['k8s', 'service_k8s']) | map(attribute='version') | list | unique }}" # 
noqa: yaml[line-length] + k8s_versions: "{{ software_config.softwares | selectattr('name', 'in', ['compute_k8s', 'service_k8s']) | map(attribute='version') | list | unique }}" # noqa: yaml[line-length] - name: Check if the k8s metadata file exists ansible.builtin.stat: path: "{{ metadata_file }}" diff --git a/prepare_oim/roles/prepare_oim_validation/vars/main.yml b/prepare_oim/roles/prepare_oim_validation/vars/main.yml index 14d30536d4..e32f5e31a3 100644 --- a/prepare_oim/roles/prepare_oim_validation/vars/main.yml +++ b/prepare_oim/roles/prepare_oim_validation/vars/main.yml @@ -37,12 +37,12 @@ supported_k8s_version: "v2.28.0": [1.32.5, 1.30.0] dynamic_k8s_json_folder: "/opt/omnia/k8s_dynamic_json" -dynamic_k8s_json_file: "{{ dynamic_k8s_json_folder }}/v{{ k8s_version }}/k8s_v{{ k8s_version }}.json" -base_k8s_json_file: "{{ project_input_path }}/config/{{ cluster_os_type }}/{{ cluster_os_version }}/k8s.json" +dynamic_k8s_json_file: "{{ dynamic_k8s_json_folder }}/v{{ k8s_version }}/compute_k8s_v{{ k8s_version }}.json" +base_k8s_json_file: "{{ project_input_path }}/config/{{ cluster_os_type }}/{{ cluster_os_version }}/compute_k8s.json" kubespray_url: "https://github.com/kubernetes-sigs/kubespray.git" package_type_list: ["rpm", "deb", "tarball", "image", "manifest", "pip_module", "git"] dynamic_k8s_json_log_dir: "/opt/omnia/log/local_repo" -base_k8s_json_bkp_file: "{{ dynamic_k8s_json_folder }}/k8s_bkp.json" +base_k8s_json_bkp_file: "{{ dynamic_k8s_json_folder }}/compute_k8s_bkp.json" dynamic_k8s_generation: false file_permission: "0755" diff --git a/scheduler/roles/cluster_validation/tasks/csi_powerscale_driver_api_validation.yml b/scheduler/roles/cluster_validation/tasks/csi_powerscale_driver_api_validation.yml index 54909d4186..9d2a567d0c 100644 --- a/scheduler/roles/cluster_validation/tasks/csi_powerscale_driver_api_validation.yml +++ b/scheduler/roles/cluster_validation/tasks/csi_powerscale_driver_api_validation.yml @@ -14,11 +14,8 @@ --- - name: Generate Base64 
authentication token - ansible.builtin.shell: > - set -o pipefail && \ - echo -n "{{ item.username }}:{{ item.password }}" | base64 - register: auth_token - changed_when: false + ansible.builtin.set_fact: + auth_token: "{{ (hostvars['127.0.0.1']['csi_username'] + ':' + hostvars['127.0.0.1']['csi_password']) | b64encode }}" no_log: true - name: Set the URL for the API request @@ -34,7 +31,7 @@ url: "{{ api_url }}" method: GET headers: - Authorization: "Basic {{ auth_token.stdout }}" + Authorization: "Basic {{ auth_token }}" validate_certs: false register: response ignore_errors: true diff --git a/scheduler/roles/cluster_validation/tasks/csi_powerscale_driver_input_validation.yml b/scheduler/roles/cluster_validation/tasks/csi_powerscale_driver_input_validation.yml index b681fa20b6..2095022bcf 100644 --- a/scheduler/roles/cluster_validation/tasks/csi_powerscale_driver_input_validation.yml +++ b/scheduler/roles/cluster_validation/tasks/csi_powerscale_driver_input_validation.yml @@ -16,7 +16,7 @@ # Encryption for secret file - name: Check if csi_powerscale_secret_vault exists ansible.builtin.stat: - path: "{{ role_path }}/../k8s_csi_powerscale_plugin/files/{{ csi_powerscale_secret_vaultname }}" + path: "{{ input_project_dir }}/{{ csi_powerscale_secret_vaultname }}" register: vault_key_result - name: Create ansible vault key if it does not exist @@ -26,7 +26,7 @@ - name: Save vault key ansible.builtin.lineinfile: - path: "{{ role_path }}/../k8s_csi_powerscale_plugin/files/{{ csi_powerscale_secret_vaultname }}" + path: "{{ input_project_dir }}/{{ csi_powerscale_secret_vaultname }}" line: "{{ vault_key }}" mode: "{{ vault_key_permission }}" owner: root @@ -41,7 +41,7 @@ - name: Decrpyt secret file ansible.builtin.command: >- ansible-vault decrypt {{ hostvars['localhost']['csi_powerscale_driver_secret_file_path'] }} - --vault-password-file {{ role_path }}/../k8s_csi_powerscale_plugin/files/{{ csi_powerscale_secret_vaultname }} + --vault-password-file {{ input_project_dir 
}}/{{ csi_powerscale_secret_vaultname }} when: "'$ANSIBLE_VAULT;' in config_content.stdout" changed_when: false @@ -54,234 +54,16 @@ - name: Encrypt secret file ansible.builtin.command: >- ansible-vault encrypt {{ hostvars['localhost']['csi_powerscale_driver_secret_file_path'] }} - --vault-password-file {{ role_path }}/../k8s_csi_powerscale_plugin/files/{{ csi_powerscale_secret_vaultname }} + --vault-password-file {{ input_project_dir }}/{{ csi_powerscale_secret_vaultname }} changed_when: false -# Validate secret file -- name: Validate isilonClusters configuration - block: - - name: Ensure isilonClusters is a list - ansible.builtin.assert: - that: - - clusters.isilonClusters is defined - - clusters.isilonClusters is iterable - - clusters.isilonClusters | length > 0 - msg: "{{ fail_msg_isilon_clusters }}" - - - name: Validate each cluster entry - block: - - name: Validate clusterName in secret.yaml - block: - - name: Validate clusterName is a non-empty string - ansible.builtin.assert: - that: - - item.clusterName is defined - - item.clusterName | length > 0 - loop: "{{ clusters.isilonClusters }}" - no_log: true - rescue: - - name: Invalid clusterName - ansible.builtin.fail: - msg: "{{ fail_msg_cluster_name }}" - - - name: Validate username in secret.yaml - block: - - name: Validate username is a non-empty string - ansible.builtin.assert: - that: - - item.username is defined - - item.username | length > 0 - loop: "{{ clusters.isilonClusters }}" - no_log: true - rescue: - - name: Invalid username - ansible.builtin.fail: - msg: "{{ fail_msg_user_name }}" - - - name: Validate password in secret.yaml - block: - - name: Validate password is a non-empty string - ansible.builtin.assert: - that: - - item.password is defined - - item.password | length > 0 - loop: "{{ clusters.isilonClusters }}" - no_log: true - rescue: - - name: Invalid password - ansible.builtin.fail: - msg: "{{ fail_msg_password }}" - - - name: Validate endpoint in secret.yaml - block: - - name: Validate 
endpoint is a non-empty string - ansible.builtin.assert: - that: - - item.endpoint is defined - - item.endpoint | length > 0 - loop: "{{ clusters.isilonClusters }}" - no_log: true - rescue: - - name: Invalid endpoint - ansible.builtin.fail: - msg: "{{ fail_msg_endpoint }}" - - - name: Validate endpointPort in secret.yaml - block: - - name: Validate endpointPort is a non-empty string - when: item.endpointPort is defined - ansible.builtin.assert: - that: - - item.endpointPort is integer - - item.endpointPort > 0 and item.endpointPort < 65536 - loop: "{{ clusters.isilonClusters }}" - no_log: true - rescue: - - name: Invalid endpointPort - ansible.builtin.fail: - msg: "{{ fail_msg_endpoint_port }}" - - - name: Validate isDefault in secret.yaml - block: - - name: Validate isDefault is boolean - ansible.builtin.assert: - that: - - item.isDefault is defined - - item.isDefault is boolean - loop: "{{ clusters.isilonClusters }}" - no_log: true - rescue: - - name: Invalid isDefault - ansible.builtin.fail: - msg: "{{ fail_msg_isdefault }}" - - - name: Validate skipCertificateValidation in secret.yaml - block: - - name: Validate skipCertificateValidation is true - when: item.skipCertificateValidation is defined - ansible.builtin.assert: - that: - - item.skipCertificateValidation in [true] - loop: "{{ clusters.isilonClusters }}" - no_log: true - rescue: - - name: Invalid skipCertificateValidation - ansible.builtin.fail: - msg: "{{ fail_msg_skip_certificate_validation }}" - - - name: Validate isiPath in secret.yaml - block: - - name: Validate isiPath is a valid Unix absolute path - when: item.isiPath is defined - ansible.builtin.assert: - that: - - item.isiPath is match('^/[^/].*') - loop: "{{ clusters.isilonClusters }}" - no_log: true - rescue: - - name: Invalid isiPath - ansible.builtin.fail: - msg: "{{ fail_msg_isipath }}" - - - name: Validate isiVolumePathPermissions in secret.yaml - block: - - name: Validate isiVolumePathPermissions is a valid octal mode number - when: 
item.isiVolumePathPermissions is defined - ansible.builtin.assert: - that: - - item.isiVolumePathPermissions is string - - item.isiVolumePathPermissions | length > 0 - loop: "{{ clusters.isilonClusters }}" - no_log: true - rescue: - - name: Invalid isiVolumePathPermissions - ansible.builtin.fail: - msg: "{{ fail_msg_isi_volume_path_permissions }}" - # Validate mandate user input in values file for csi driver - name: Load values.yaml file ansible.builtin.include_vars: file: "{{ hostvars['localhost']['csi_powerscale_driver_values_file_path'] }}" name: csi_powerscale_values_file -- name: Validate controller count - ansible.builtin.assert: - that: - - csi_powerscale_values_file.controller.controllerCount == 1 - msg: | - "Invalid controllerCount value: {{ csi_powerscale_values_file.controller.controllerCount }}. It must be 1 in values.yaml file." - -- name: Validate replication enabled - ansible.builtin.assert: - that: - - csi_powerscale_values_file.controller.replication.enabled is defined - - csi_powerscale_values_file.controller.replication.enabled in [false] - msg: | - "Invalid replication enabled value: {{ csi_powerscale_values_file.controller.replication.enabled }}. It must be false in values.yaml file." - -- name: Validate resizer enabled - ansible.builtin.assert: - that: - - csi_powerscale_values_file.controller.resizer.enabled is defined - - csi_powerscale_values_file.controller.resizer.enabled in [false, true] - msg: "Invalid resizer enabled value: {{ csi_powerscale_values_file.controller.resizer.enabled }}. It must be true or false in values.yaml file." - -- name: Validate snapshot enabled - ansible.builtin.assert: - that: - - csi_powerscale_values_file.controller.snapshot.enabled is defined - - csi_powerscale_values_file.controller.snapshot.enabled in [true] - msg: "Invalid snapshot enabled value: {{ csi_powerscale_values_file.controller.snapshot.enabled }}. It must be true in values.yaml file." 
- -- name: Validate endpointPort - ansible.builtin.assert: - that: - - csi_powerscale_values_file.endpointPort is defined - - csi_powerscale_values_file.endpointPort | int >= 1 - - csi_powerscale_values_file.endpointPort | int <= 65535 - msg: "Invalid endpointPort: {{ csi_powerscale_values_file.endpointPort }}. It must be between 1 and 65535 in values.yaml file." - -- name: Validate skipCertificateValidation - ansible.builtin.assert: - that: - - csi_powerscale_values_file.skipCertificateValidation is defined - - csi_powerscale_values_file.skipCertificateValidation in [true] - msg: "Invalid skipCertificateValidation value: {{ csi_powerscale_values_file.skipCertificateValidation }}. It must be true in values.yaml file." - -- name: Set skipCertificateValidation to be used later - ansible.builtin.set_fact: - skip_certificate_validation_value: csi_powerscale_values_file.skipCertificateValidation - -- name: Validate isiAuthType - ansible.builtin.assert: - that: - - csi_powerscale_values_file.isiAuthType is defined - - csi_powerscale_values_file.isiAuthType in [0, 1] - msg: | - "Invalid isiAuthType: {{ csi_powerscale_values_file.isiAuthType }}. - It must be 0 (basic authentication) or 1 (session-based authentication) in values.yaml file." - -- name: Validate isiAccessZone - ansible.builtin.assert: - that: - - csi_powerscale_values_file.isiAccessZone is defined - - csi_powerscale_values_file.isiAccessZone | length > 0 - msg: "Invalid isiAccessZone: {{ csi_powerscale_values_file.isiAccessZone }}. It must be a non-empty string in values.yaml file." - -- name: Validate isiPath - ansible.builtin.assert: - that: - - csi_powerscale_values_file.isiPath is defined - - csi_powerscale_values_file.isiPath | regex_search('^/[^/].*') # Basic validation for Unix absolute path - msg: "Invalid isiPath: {{ csi_powerscale_values_file.isiPath }}. It must be a valid Unix absolute path in values.yaml file." 
- -- name: Validate isiVolumePathPermissions - ansible.builtin.assert: - that: - - csi_powerscale_values_file.isiVolumePathPermissions is defined - - csi_powerscale_values_file.isiVolumePathPermissions | length > 0 - msg: "Invalid isiVolumePathPermissions: {{ csi_powerscale_values_file.isiVolumePathPermissions }}. It must be valid octal mode in values.yaml file." - - name: Validate powerscale ip and credential in secret.yaml file using API call to powerscale ansible.builtin.include_tasks: csi_powerscale_driver_api_validation.yml loop: "{{ clusters.isilonClusters }}" + no_log: true diff --git a/scheduler/roles/cluster_validation/tasks/fetch_oim_metadata.yml b/scheduler/roles/cluster_validation/tasks/fetch_oim_metadata.yml new file mode 100644 index 0000000000..3afff67d10 --- /dev/null +++ b/scheduler/roles/cluster_validation/tasks/fetch_oim_metadata.yml @@ -0,0 +1,34 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Include oim metadata + when: k8s_support + block: + - name: Include oim metadata vars + ansible.builtin.include_vars: "{{ omnia_metadata_file }}" + register: include_metadata + no_log: true + + - name: Create directory for storing data + ansible.builtin.file: + path: "{{ pvc_data_folder }}" + state: directory + mode: "{{ file_perm }}" + + - name: Set fact for nfs server details + ansible.builtin.set_fact: + k8s_nfs_server_ip: "{{ nfs_server_ip }}" + k8s_server_share_path: "{{ nfs_server_share_path }}/omnia/k8s_pvc_data" + plugins_deployment: true diff --git a/scheduler/roles/cluster_validation/tasks/fetch_software_config.yml b/scheduler/roles/cluster_validation/tasks/fetch_software_config.yml index 7b06022169..6a6be6fa3c 100644 --- a/scheduler/roles/cluster_validation/tasks/fetch_software_config.yml +++ b/scheduler/roles/cluster_validation/tasks/fetch_software_config.yml @@ -30,10 +30,10 @@ file: "{{ software_config_json_file }}" name: software_config -# Check if k8s is mentioned in software_config.json -- name: Check if k8s support is true +# Check if compute k8s is mentioned in software_config.json +- name: Check if compute k8s support is true ansible.builtin.set_fact: - compute_k8s_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'k8s') | list | length > 0 }}" + compute_k8s_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'compute_k8s') | list | length > 0 }}" - name: Check if service k8s support is true ansible.builtin.set_fact: @@ -63,7 +63,7 @@ - name: Set software as compute_k8s ansible.builtin.set_fact: - software_stack: "k8s" + software_stack: "compute_k8s" when: compute_k8s_support - name: Check if slurm support is true diff --git a/scheduler/roles/cluster_validation/tasks/fetch_storage_config.yml b/scheduler/roles/cluster_validation/tasks/fetch_storage_config.yml index ad42808fdb..9bc59a9287 100644 --- a/scheduler/roles/cluster_validation/tasks/fetch_storage_config.yml +++ 
b/scheduler/roles/cluster_validation/tasks/fetch_storage_config.yml @@ -46,7 +46,7 @@ # Get k8s nfs_server_ip and nfs_server_path - name: Get nfs_server details when k8s_support is true - when: k8s_support + when: k8s_support and compute_k8s_playbook is defined block: - name: Set k8s_nfs_server_ip where k8s_share is true ansible.builtin.set_fact: diff --git a/scheduler/roles/cluster_validation/tasks/main.yml b/scheduler/roles/cluster_validation/tasks/main.yml index b404130bf3..4be2dbe0f6 100644 --- a/scheduler/roles/cluster_validation/tasks/main.yml +++ b/scheduler/roles/cluster_validation/tasks/main.yml @@ -38,6 +38,11 @@ - name: Fetch storage_config.yml inputs ansible.builtin.include_tasks: fetch_storage_config.yml + when: compute_k8s_playbook is defined + +- name: Fetch inputs for nfs client provisioner + ansible.builtin.include_tasks: fetch_oim_metadata.yml + when: service_k8s_playbook is defined - name: Include gather_facts playbook ansible.builtin.include_tasks: gather_fact_resolution.yml diff --git a/scheduler/roles/cluster_validation/tasks/set_facts.yml b/scheduler/roles/cluster_validation/tasks/set_facts.yml index 08665e4db1..1ad78a1836 100644 --- a/scheduler/roles/cluster_validation/tasks/set_facts.yml +++ b/scheduler/roles/cluster_validation/tasks/set_facts.yml @@ -77,6 +77,8 @@ topology_manager_policy: "{{ selected_cluster.topology_manager_policy }}" topology_manager_scope: "{{ selected_cluster.topology_manager_scope }}" k8s_offline_install: "{{ selected_cluster.k8s_offline_install }}" + csi_powerscale_driver_secret_file_path: "{{ selected_cluster.csi_powerscale_driver_secret_file_path }}" + csi_powerscale_driver_values_file_path: "{{ selected_cluster.csi_powerscale_driver_values_file_path }}" - name: Create a directory to store kubespray log files ansible.builtin.file: diff --git a/scheduler/roles/cluster_validation/vars/main.yml b/scheduler/roles/cluster_validation/vars/main.yml index 14e7a9359a..465e83e3f1 100644 --- 
a/scheduler/roles/cluster_validation/vars/main.yml +++ b/scheduler/roles/cluster_validation/vars/main.yml @@ -123,8 +123,8 @@ To install k8s provide scheduler_type: k8s. To install slurm and k8s provide sch install_scheduler_msg: "Installing job scheduler:" # # Usage: Fetch_software_config.yml -# csi_driver_powerscale_packages_file: >- -# {{ role_path }}/../../../input/config/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/csi_driver_powerscale.json +csi_driver_powerscale_packages_file: >- + {{ input_project_dir }}/config/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/csi_driver_powerscale.json # Usage: fetch_omnia_inputs.yml csi_driver_secret_file_path_success_msg: "Success. csi_driver_secret_file_path is valid in omnia_config.yml" @@ -135,16 +135,6 @@ csi_driver_values_file_path_fail_msg: "Failed. csi_driver_values_file_path is no # Usage: csi_powerscale_driver_input_validation.yml csi_powerscale_secret_vaultname: ".csi_powerscale_secret_vault" -fail_msg_isilon_clusters: "isilonClusters must be a valid list of powerscale details in secret.yaml file." -fail_msg_cluster_name: "clusterName is not valid. Provide powerscale cluster name in secret.yaml file." -fail_msg_user_name: "userName is not valid. Provide powerscale user name in secret.yaml file." -fail_msg_password: "Password is not valid. Provide powerscale password in secret.yaml file." -fail_msg_endpoint: "Endpoint is not valid. Provide powerscale IP or hostname in secret.yaml file." -fail_msg_endpoint_port: "endpointPort is not valid. Provide valid port number in secret.yaml file." -fail_msg_isdefault: "isDefault value should be true or false in secret.yaml file." -fail_msg_skip_certificate_validation: "skipCertificateValidation must be true in secret.yaml file." -fail_msg_isipath: "isiPath must be a valid Unix absolute path in secret.yaml file." 
-fail_msg_isi_volume_path_permissions: "isiVolumePathPermissions must be a valid directory permission (example: 0777) in secret.yaml file." fail_msg_api_call: "Please recheck powerscale username, password, endpoint and endpointPort details provided in secret.yaml and values.yaml (if endpointPort is provided only in values.yaml) file. API call to powerscale was not successful" vault_key_permission: "0644" @@ -169,3 +159,7 @@ msg_version_mismatch: >- but you requested version {{ k8s_version }} for {{ software_stack }} deployment. Please rerun local_repo.yml with version {{ k8s_version }}. k8s_version_msg_success: "{{ software_stack }} version {{ k8s_version }} is valid and matches last_{{ software_stack }}_local_repo_version" + +# Usage: fetch_oim_metadata.yml +omnia_metadata_file: "/opt/omnia/.data/oim_metadata.yml" +pvc_data_folder: "/opt/omnia/k8s_pvc_data" diff --git a/scheduler/roles/common_plugins/vars/main.yml b/scheduler/roles/common_plugins/vars/main.yml index 1e424c6a2e..d319f66bb5 100644 --- a/scheduler/roles/common_plugins/vars/main.yml +++ b/scheduler/roles/common_plugins/vars/main.yml @@ -13,16 +13,6 @@ # limitations under the License. --- -# Usage: main.yml -common_plugin: - kubernetes_dashboard: true - habanalabs-device-plugin: false - mpi-operator: true - xilinx-device-plugin: true - nfs-subdir-external-provisioner: true - multus-device-plugin: true - whereabouts-device-plugin: true - # Usage: prereq.yml k8s_tmp_dir: "/root/k8s" helm_install_fail_msg: "Failed to fetch helm path. Please verify helm installation and ensure the environment variable PATH is correctly set in the node. 
diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/files/empty_certificate_template.yml b/scheduler/roles/k8s_csi_powerscale_plugin/files/empty_certificate_template.yml new file mode 100644 index 0000000000..2462770283 --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/files/empty_certificate_template.yml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +metadata: + name: isilon-certs-0 + namespace: isilon +type: Opaque +data: + cert-0: "" diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/tasks/assign_csi_as_default_storage_class.yml b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/assign_csi_as_default_storage_class.yml new file mode 100644 index 0000000000..fbfe320ebf --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/assign_csi_as_default_storage_class.yml @@ -0,0 +1,57 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Check for existing NFS Client Provisioner + kubernetes.core.k8s_info: + api_version: storage.k8s.io/v1 + kind: StorageClass + name: nfs-client + register: nfs_client_sc + ignore_errors: true + +- name: Check if CSI Provisioner StorageClass (ps01) exists + kubernetes.core.k8s_info: + api_version: storage.k8s.io/v1 + kind: StorageClass + name: ps01 + register: csi_storage_class + ignore_errors: true + +- name: Remove default annotation from NFS Client Provisioner if it is default and ps01 exists + kubernetes.core.k8s: + api_version: storage.k8s.io/v1 + kind: StorageClass + name: nfs-client + merge_type: strategic-merge + definition: + metadata: + annotations: + storageclass.kubernetes.io/is-default-class: "false" + when: (nfs_client_sc.resources | length > 0) and + (csi_storage_class.resources | length > 0) and + ('storageclass.kubernetes.io/is-default-class' in nfs_client_sc.resources[0].metadata.annotations) and + (nfs_client_sc.resources[0].metadata.annotations['storageclass.kubernetes.io/is-default-class'] == "true") + +- name: Set CSI Provisioner as default StorageClass if it exists + kubernetes.core.k8s: + api_version: storage.k8s.io/v1 + kind: StorageClass + name: ps01 + merge_type: strategic-merge + definition: + metadata: + annotations: + storageclass.kubernetes.io/is-default-class: "true" + when: csi_storage_class.resources | length > 0 diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_config_certificate.yml b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_config_certificate.yml new file mode 100644 index 0000000000..af77fd16fa --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_config_certificate.yml @@ -0,0 +1,35 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Empty certificate creation + block: + - name: Copy empty certificate yaml file + ansible.builtin.copy: + dest: "{{ empty_certificate_path }}" + src: "{{ empty_certificate_template_path }}" + mode: "{{ permission_644 }}" + + - name: Apply the Secret YAML to Kubernetes + block: + - name: Create empty certificate secret + ansible.builtin.command: + cmd: "kubectl apply -f {{ empty_certificate_path }}" + register: result + changed_when: result.changed + + rescue: + - name: Empty certificate secret creation failure + ansible.builtin.fail: + msg: "{{ fail_msg_empty_certificate }}" diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_config_secret.yml b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_config_secret.yml new file mode 100644 index 0000000000..280406d1a2 --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_config_secret.yml @@ -0,0 +1,78 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Remove existing isilon-creds secret if present + kubernetes.core.k8s: + api_version: v1 + kind: Secret + name: isilon-creds + namespace: "{{ powerscale_ns }}" + state: absent + register: delete_secret_result + failed_when: false + changed_when: delete_secret_result.changed + +- name: Create isilon-creds secret in isilon namespace + ansible.builtin.command: kubectl create secret generic isilon-creds -n {{ powerscale_ns }} --from-file=config="{{ csi_powerscale_secret_path }}" + failed_when: false + register: apply_secret + changed_when: apply_secret.changed + +- name: Get existing isilon-creds secret + kubernetes.core.k8s_info: + api_version: v1 + kind: Secret + namespace: "{{ powerscale_ns }}" + name: isilon-creds + register: existing_secret + +- name: Decode the config from secret + ansible.builtin.set_fact: + decoded_config: "{{ existing_secret.resources[0].data.config | b64decode | from_yaml }}" + +- name: Update username and password in decoded config + ansible.builtin.set_fact: + updated_config: >- + {{ + decoded_config | combine({ + 'isilonClusters': [ decoded_config.isilonClusters[0] | combine({ + 'username': hostvars['127.0.0.1']['csi_username'], + 'password': hostvars['127.0.0.1']['csi_password'] + }) ] + }) + }} + +- name: Encode updated config to base64 + ansible.builtin.set_fact: + encoded_config: "{{ updated_config | to_nice_yaml(indent=2) | b64encode }}" + +- name: Patch isilon-creds secret with updated credentials + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: Secret + metadata: + name: isilon-creds + namespace: "{{ powerscale_ns }}" + data: + config: "{{ encoded_config }}" + type: Opaque + +# Remove the secret file +- name: Remove secret file + ansible.builtin.file: + path: "{{ csi_powerscale_secret_path }}" + state: absent + failed_when: false diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_image_pull.yml 
b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_image_pull.yml new file mode 100644 index 0000000000..343e7b4c98 --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_image_pull.yml @@ -0,0 +1,61 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Set empty image list + ansible.builtin.set_fact: + csi_powerscale_image_versions: [] + +- name: Fetch and store image versions + ansible.builtin.set_fact: + csi_powerscale_image_versions: "{{ csi_powerscale_image_versions + [item.package + ':' + item.tag] }}" + loop: "{{ hostvars['localhost']['csi_driver_powerscale_packages_json']['csi_driver_powerscale']['cluster'] }}" + when: item.type == 'image' + +# Pulling images from pulp - always,partial, never +- name: Pull K8s services docker images from pulp + ansible.builtin.command: nerdctl pull {{ item }} + with_items: "{{ csi_powerscale_image_versions }}" + changed_when: true + failed_when: false + when: + - hostvars['localhost']['k8s_offline_install'] + +# Pulling images directly when k8s_offline_install: false, enable_routed_internet: true + +- name: Pull K8s services docker images from proxy + ansible.builtin.command: nerdctl pull {{ item }} + with_items: "{{ csi_powerscale_image_versions }}" + changed_when: true + failed_when: false + environment: + http_proxy: "http://{{ hostvars['localhost']['admin_nic_ip'] }}:{{ proxy_port }}" + https_proxy: 
"http://{{ hostvars['localhost']['admin_nic_ip'] }}:{{ proxy_port }}" + no_proxy: "localhost,127.0.0.1,{{ hostvars['localhost']['admin_nic_ip'] }},{{ hostvars['localhost']['oim_hostname'] }}" # noqa: yaml[line-length] + when: + - not hostvars['localhost']['k8s_offline_install'] + - hostvars['localhost']['enable_routed_internet'] + +# Pulling images directly when k8s_offline_install: false, enable_routed_internet: false + +- name: Pull K8s services docker images from dedicated internet + ansible.builtin.command: nerdctl pull {{ item }} + with_items: "{{ csi_powerscale_image_versions }}" + changed_when: true + failed_when: false + environment: + no_proxy: "localhost,127.0.0.1,{{ hostvars['localhost']['admin_nic_ip'] }},{{ hostvars['localhost']['oim_hostname'] }}" # noqa: yaml[line-length] + when: + - not hostvars['localhost']['k8s_offline_install'] + - not hostvars['localhost']['enable_routed_internet'] diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_install.yml b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_install.yml new file mode 100644 index 0000000000..6c9d58ecc5 --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_install.yml @@ -0,0 +1,72 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Deploy external-snapshotter config CRDs + ansible.builtin.command: + cmd: "kubectl apply -f client/config/crd/" + chdir: "{{ csi_powerscale_path }}/csi-powerscale/external-snapshotter/" + register: install_result + failed_when: false + changed_when: install_result.changed + +- name: Deploy external-snapshotter snapshot-controller CRDs + ansible.builtin.command: + cmd: "kubectl apply -f deploy/kubernetes/snapshot-controller/" + chdir: "{{ csi_powerscale_path }}/csi-powerscale/external-snapshotter/" + register: install_result + failed_when: false + changed_when: install_result.changed + +- name: Execute CSI driver installation script with timeout of {{ async_time }} seconds + ansible.builtin.command: + cmd: "./csi-install.sh --namespace {{ isilon_ns }} --values {{ csi_powerscale_values_path }}" + chdir: "{{ csi_powerscale_path }}/{{ csi_powerscale_git | regex_replace('\\.tar\\.gz$', '') }}/dell-csi-helm-installer" + register: install_result + async: "{{ async_time }}" + poll: "{{ poll_time }}" + failed_when: false + changed_when: install_result.changed + +- name: Wait for csi pods to be in Running state + ansible.builtin.shell: > + set -o pipefail && \ + kubectl get pod -n {{ isilon_ns }} --no-headers | grep {{ powerscale_pod_indcator }} | grep -v "Running" + register: isilon_non_running_pods + failed_when: false + changed_when: false + until: isilon_non_running_pods.stdout_lines | length == 0 + retries: "{{ max_attempts }}" + delay: "{{ wait_time }}" + +- name: Verify csi driver installation + ansible.builtin.pause: + seconds: "{{ warning_wait_time }}" + prompt: "{{ fail_msg_csi_powerscale_driver }}" + when: isilon_non_running_pods.stdout_lines | length > 0 + +- name: Create powerscale storage class if deployment was successful + ansible.builtin.command: + cmd: "kubectl apply -f ps_storage_class.yml" + chdir: "{{ csi_powerscale_path }}" + register: sc_command_result + failed_when: false + changed_when: sc_command_result.changed + when: 
isilon_non_running_pods.stdout_lines | length == 0 + +- name: Remove ps_storage_class.yml file + ansible.builtin.file: + path: "{{ csi_powerscale_path }}/ps_storage_class.yml" + state: absent + force: true diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_prereq.yml b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_prereq.yml new file mode 100644 index 0000000000..29e95b870b --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_prereq.yml @@ -0,0 +1,208 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Check if k8s is running + block: + - name: Check if Kubernetes is running + kubernetes.core.k8s_info: + api_version: v1 + kind: Node + register: node_info + failed_when: node_info.resources is not defined or node_info.resources | length == 0 + rescue: + - name: Kubernetes is not running + ansible.builtin.fail: + msg: "{{ k8s_not_deployed }}" + +# Check if powerscale is already deployed +- name: Verify powerscale is deployed on cluster + ansible.builtin.shell: > + set -o pipefail && \ + kubectl get pod -n {{ powerscale_ns }} --no-headers | grep {{ powerscale_pod_indcator }} + register: powerscale_precheck + changed_when: false + failed_when: false + +- name: Set flag if powerscale is already deployed + ansible.builtin.set_fact: + powerscale_already_deployed: "{{ powerscale_precheck.rc == 0 }}" + +- name: Pause to notify powerscale already deployed + ansible.builtin.pause: + seconds: "{{ warning_wait_time }}" + prompt: "{{ warning_msg_already_deployed }}" + when: powerscale_already_deployed + +- name: Proceed prereq if powerscale not already deployed + when: not powerscale_already_deployed + block: + # Check helm is deployed on cluster + - name: Verify helm is deployed on cluster + ansible.builtin.command: helm + register: helm_return_code + changed_when: false + failed_when: false + + - name: Fail if helm is not deployed + ansible.builtin.assert: + that: + - helm_return_code.rc == 0 + fail_msg: "{{ helm_not_deployed }}" + + - name: Remove /opt/omnia/csi-driver-powerscale directory if already present + ansible.builtin.file: + path: "{{ csi_powerscale_path }}" + state: absent + + - name: Create csi-driver-powerscale directory under /opt/omnia + ansible.builtin.file: + path: "{{ csi_powerscale_path }}" + mode: "{{ permission_644 }}" + state: directory + owner: "{{ owner_value }}" + group: "{{ group_value }}" + + - name: Check if secret file is encrypted + ansible.builtin.command: cat "{{ hostvars['localhost']['csi_powerscale_driver_secret_file_path'] }}" 
+ changed_when: false + register: config_content + connection: local + delegate_to: localhost + + - name: Decrypt secret file + ansible.builtin.command: >- + ansible-vault decrypt {{ hostvars['localhost']['csi_powerscale_driver_secret_file_path'] }} + --vault-password-file {{ input_project_dir }}/{{ csi_powerscale_secret_vaultname }} + when: "'$ANSIBLE_VAULT;' in config_content.stdout" + changed_when: true + connection: local + delegate_to: localhost + + # Copy secret file to /opt/omnia + - name: Copy secret file + ansible.builtin.copy: + src: "{{ hostvars['localhost']['csi_powerscale_driver_secret_file_path'] }}" + dest: "{{ csi_powerscale_secret_path }}" + owner: "{{ owner_value }}" + group: "{{ group_value }}" + mode: "{{ permission_644 }}" + + # check if powerscale is pinging by reading endpoint value from secrets.yaml file + - name: Load values.yaml file + ansible.builtin.include_vars: + file: "{{ hostvars['localhost']['csi_powerscale_driver_values_file_path'] }}" + name: csi_powerscale_values_file + + - name: Load secret file for input validation + ansible.builtin.include_vars: + file: "{{ hostvars['localhost']['csi_powerscale_driver_secret_file_path'] }}" + name: clusters + no_log: true + + - name: Extract PowerScale endpoint IP or Host from loaded secret data + ansible.builtin.set_fact: + powerscale_host: "{{ clusters.isilonClusters[0].endpoint | regex_replace('https?://', '') | regex_replace('/.*', '') }}" + + - name: Check if the extracted PowerScale IP or Host is reachable + ansible.builtin.command: + cmd: "ping -c 1 {{ powerscale_host }}" # Uses powerscale_host extracted from the secret endpoint above + register: ping_result + ignore_errors: true # Continue even if the ping fails + changed_when: false + + - name: Print ping result or error if ping fails + ansible.builtin.debug: + msg: > + {% if ping_result.rc == 0 %} + Powerscale Host reachable! Output: {{ ping_result.stdout }} + {% else %} + Powerscale Host not reachable. 
Error: {{ ping_result.stderr }} + {% endif %} + + - name: Encrypt secret file + ansible.builtin.command: >- + ansible-vault encrypt {{ hostvars['localhost']['csi_powerscale_driver_secret_file_path'] }} + --vault-password-file {{ input_project_dir }}/{{ csi_powerscale_secret_vaultname }} + changed_when: false + connection: local + delegate_to: localhost + + # Copy values file to /opt/omnia + - name: Copy values file + ansible.builtin.copy: + src: "{{ hostvars['localhost']['csi_powerscale_driver_values_file_path'] }}" + dest: "{{ csi_powerscale_values_path }}" + owner: "{{ owner_value }}" + group: "{{ group_value }}" + mode: "{{ permission_644 }}" + + - name: Get dependencies from local repo + block: + - name: Get csi-powerscale git tar + ansible.builtin.get_url: + url: "{{ hostvars['localhost']['offline_git_path'] }}/csi-powerscale/{{ csi_powerscale_git }}" + dest: "{{ csi_powerscale_path }}/{{ csi_powerscale_git }}" + mode: "{{ permission_644 }}" + + - name: Extract csi-powerscale tar file + ansible.builtin.unarchive: + src: "{{ csi_powerscale_path }}/{{ csi_powerscale_git }}" + dest: "{{ csi_powerscale_path }}" + remote_src: true + + - name: Get dell/helm-charts git tar + ansible.builtin.get_url: + url: "{{ hostvars['localhost']['offline_git_path'] }}/helm-charts/{{ helm_charts_git }}" + dest: "{{ csi_powerscale_path }}/csi-powerscale/{{ helm_charts_git }}" + mode: "{{ permission_644 }}" + + - name: Get external-snapshotter git tar + ansible.builtin.get_url: + url: "{{ hostvars['localhost']['offline_git_path'] }}/external-snapshotter/{{ external_snapshotter_git }}" + dest: "{{ csi_powerscale_path }}/csi-powerscale/{{ external_snapshotter_git }}" + mode: "{{ permission_644 }}" + rescue: + - name: Handle dependency failure + ansible.builtin.fail: + msg: "{{ fail_msg_download }}" + + - name: Extract dell/helm-charts tar file under csi-powerscale directory + ansible.builtin.unarchive: + src: "{{ csi_powerscale_path }}/csi-powerscale/{{ helm_charts_git }}" + dest: "{{ 
csi_powerscale_path }}/csi-powerscale" + remote_src: true + + - name: Extract external snapshotter tar file under csi-powerscale directory + ansible.builtin.unarchive: + src: "{{ csi_powerscale_path }}/csi-powerscale/{{ external_snapshotter_git }}" + dest: "{{ csi_powerscale_path }}/csi-powerscale" + remote_src: true + + - name: Transfer storage class template to kube_control_plane + ansible.builtin.template: + src: ps_storage_class.j2 + dest: "{{ csi_powerscale_path }}/ps_storage_class.yml" + owner: "{{ owner_value }}" + group: "{{ group_value }}" + mode: "{{ permission_644 }}" + + - name: Create isilon namespace + kubernetes.core.k8s: + api_version: v1 + kind: Namespace + name: isilon + state: present + register: command_result + changed_when: command_result.changed + failed_when: false diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/tasks/main.yml b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/main.yml new file mode 100644 index 0000000000..860a97e8aa --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/main.yml @@ -0,0 +1,36 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: CSI powerscale driver installation + when: + - hostvars['127.0.0.1']['csi_driver_powerscale_precheck_pass'] + block: + - name: Fetch required files to kube control plane + ansible.builtin.include_tasks: csi_powerscale_prereq.yml + + - name: Deploy powerscale if not already deployed + when: not powerscale_already_deployed + block: + - name: Configure secret + ansible.builtin.include_tasks: csi_powerscale_config_secret.yml + + - name: Configure certificate + ansible.builtin.include_tasks: csi_powerscale_config_certificate.yml + + - name: Install powerscale driver + ansible.builtin.include_tasks: csi_powerscale_install.yml + + - name: Assign CSI as default storage class + ansible.builtin.include_tasks: assign_csi_as_default_storage_class.yml diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/templates/ps_storage_class.j2 b/scheduler/roles/k8s_csi_powerscale_plugin/templates/ps_storage_class.j2 new file mode 100644 index 0000000000..a8158d410b --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/templates/ps_storage_class.j2 @@ -0,0 +1,13 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: ps01 +provisioner: csi-isilon.dellemc.com +reclaimPolicy: Delete +allowVolumeExpansion: true +volumeBindingMode: Immediate +parameters: + AccessZone: {{ ps_access_zone }} + Isipath: {{ ps_isipath }} + RootClientEnabled: "true" + csi.storage.k8s.io/fstype: "nfs" diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/vars/main.yml b/scheduler/roles/k8s_csi_powerscale_plugin/vars/main.yml new file mode 100644 index 0000000000..58de70c3aa --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/vars/main.yml @@ -0,0 +1,61 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# Usage: csi_powerscale_config_certificate.yml +empty_certificate_path: "{{ csi_powerscale_path }}/empty_isilon-certs.yaml" +fail_msg_empty_certificate: "Failed. Unable to create empty certificate." +empty_certificate_template_path: "{{ role_path }}/files/empty_certificate_template.yml" + +# Usage: csi_powerscale_config_secret.yml, csi_powerscale_prereq.yml +input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" +csi_powerscale_secret_path: "{{ csi_powerscale_path }}/csi_powerscale_secret.yaml" + +# Usage: csi_powerscale_install.yml, csi_powerscale_prereq.yml +csi_powerscale_path: "/opt/omnia/csi-driver-powerscale" +ansible_python_interpreter: "/usr/bin/{{ hostvars['localhost']['python_package'] }}" + +# Usage: csi_powerscale_install.yml, csi_powerscale_prereq.yml +csi_powerscale_git: "csi-powerscale.tar.gz" + +# Usage: csi_powerscale_install.yml +fail_msg_csi_powerscale_driver: "Error. Deployment of csi driver was not successful. Please review the deployment. Run playbook with -vvv for more details" +pass_msg_csi_powerscale_driver: "CSI Powerscale driver installation completed successfully." +wait_time: 10 +warning_wait_time: 30 +max_attempts: 5 +isilon_ns: "isilon" +async_time: 180 +poll_time: 10 + +# Usage: csi_powerscale_prereq.yml +permission_644: "0644" +owner_value: "root" +group_value: "root" +powerscale_ns: "isilon" +powerscale_pod_indcator: "isilon-" +csi_powerscale_values_path: "{{ csi_powerscale_path }}/values.yaml" +fail_msg_download: "Failed to get required dependencies. 
Make sure to verify entries in csi_driver_powerscale.json and run local_repo.yml first." +helm_charts_git: "helm-charts.tar.gz" +external_snapshotter_git: "external-snapshotter.tar.gz" +k8s_not_deployed: "Failed, Kubernetes is not deployed on the cluster. Run omnia.yml with k8s entry in software_config.json to install kubernetes first." +helm_not_deployed: "Failed, Helm is not deployed on the cluster." +csi_powerscale_secret_vaultname: ".csi_powerscale_secret_vault" +vault_key_permission: "0644" +warning_msg_already_deployed: "Powerscale will not be deployed. Existing powerscale deployment is already present on the cluster. + Please remove the existing powerscale deployment first using steps mentioned in omnia document and rerun playbook to install powerscale." + +# Usage: template ps_storage_class.j2 +ps_isipath: "{{ hostvars['localhost']['csi_powerscale_values_file']['isiPath'] }}" +ps_access_zone: "{{ hostvars['localhost']['csi_powerscale_values_file']['isiAccessZone'] }}" diff --git a/scheduler/scheduler.yml b/scheduler/scheduler.yml index 53dcb4f7a0..578ad4edcb 100644 --- a/scheduler/scheduler.yml +++ b/scheduler/scheduler.yml @@ -19,7 +19,7 @@ tasks: - name: Set dynamic run tags including k8s ansible.builtin.set_fact: - omnia_run_tags: "{{ (ansible_run_tags | default([]) + ['k8s', 'slurm']) | unique }}" + omnia_run_tags: "{{ (ansible_run_tags | default([]) + ['compute_k8s', 'slurm', 'csi_driver_powerscale']) | unique }}" cacheable: true - name: Invoke get_config_credentials.yml @@ -138,6 +138,15 @@ - name: Deploy plugins on kube control nodes hosts: kube_control_plane[0] gather_facts: false + vars: + common_plugin: + kubernetes_dashboard: true + habanalabs-device-plugin: false + mpi-operator: true + xilinx-device-plugin: true + nfs-subdir-external-provisioner: true + multus-device-plugin: true + whereabouts-device-plugin: true roles: - common_plugins @@ -153,6 +162,22 @@ roles: - k8s_amd +- name: CSI powerscale image pulling + hosts: kube_node, 
kube_control_plane + tasks: + - name: Pull images + ansible.builtin.include_role: + name: k8s_csi_powerscale_plugin + tasks_from: csi_powerscale_image_pull.yml + when: + - hostvars['127.0.0.1']['csi_driver_powerscale_precheck_pass'] | default(false) | bool + +- name: Install CSI powerscale plugin on kube control nodes + hosts: kube_control_plane[0] + gather_facts: false + roles: + - k8s_csi_powerscale_plugin + - name: Install Slurm hosts: slurm_control_node, slurm_node, login any_errors_fatal: true diff --git a/scheduler/service_k8s_cluster.yml b/scheduler/service_k8s_cluster.yml index 0f22de5487..08bddc4c33 100644 --- a/scheduler/service_k8s_cluster.yml +++ b/scheduler/service_k8s_cluster.yml @@ -19,7 +19,7 @@ tasks: - name: Set dynamic run tags including k8s ansible.builtin.set_fact: - omnia_run_tags: "{{ (ansible_run_tags | default([]) + ['service_k8s']) | unique }}" + omnia_run_tags: "{{ (ansible_run_tags | default([]) + ['service_k8s', 'csi_driver_powerscale']) | unique }}" cacheable: true - name: Invoke get_config_credentials.yml @@ -124,3 +124,41 @@ gather_facts: false roles: - update_containerd_config + +- name: Pull k8s images + hosts: kube_control_plane, kube_node, etcd + gather_facts: false + any_errors_fatal: true + roles: + - k8s_prepare_images + +- name: Deploy plugins on kube control nodes + hosts: kube_control_plane[0] + gather_facts: false + vars: + common_plugin: + kubernetes_dashboard: false + habanalabs-device-plugin: false + mpi-operator: false + xilinx-device-plugin: false + nfs-subdir-external-provisioner: true + multus-device-plugin: false + whereabouts-device-plugin: true + roles: + - common_plugins + +- name: CSI powerscale image pulling + hosts: kube_node, kube_control_plane + tasks: + - name: Pull images + ansible.builtin.include_role: + name: k8s_csi_powerscale_plugin + tasks_from: csi_powerscale_image_pull.yml + when: + - hostvars['127.0.0.1']['csi_driver_powerscale_precheck_pass'] | default(false) | bool + +- name: Install CSI 
powerscale plugin on kube control nodes + hosts: kube_control_plane[0] + gather_facts: false + roles: + - k8s_csi_powerscale_plugin diff --git a/telemetry/roles/idrac_telemetry/tasks/trigger_telemetry_collection.yml b/telemetry/roles/idrac_telemetry/tasks/trigger_telemetry_collection.yml index 05a21a2564..dff1a493fa 100644 --- a/telemetry/roles/idrac_telemetry/tasks/trigger_telemetry_collection.yml +++ b/telemetry/roles/idrac_telemetry/tasks/trigger_telemetry_collection.yml @@ -13,10 +13,18 @@ # limitations under the License. --- -# Initiate iDRAC collection -- name: Initiate telemetry-collector - containers.podman.podman_container_exec: - name: "{{ idrac_telemetry_container }}" - command: "/bin/bash {{ idrac_telemetry_receiver_entry_script }}" - detach: true - when: hostvars['localhost']['idrac_telemetry_support'] +# Restart iDRAC telemetry container +- name: Restart iDRAC telemetry container + when: + - hostvars['localhost']['idrac_telemetry_support'] + - not hostvars['localhost']['federated_idrac_telemetry_collection'] + block: + - name: Restart telemetry-collector + containers.podman.podman_container: + name: "{{ idrac_telemetry_container }}" + state: started + restart: true + rescue: + - name: Telemetry container restart failed + ansible.builtin.fail: + msg: "{{ idrac_telemetry_restart_failure_msg.splitlines() | join(' ') }}" diff --git a/telemetry/roles/idrac_telemetry/vars/main.yml b/telemetry/roles/idrac_telemetry/vars/main.yml index 301e9f9939..96fe43fac1 100644 --- a/telemetry/roles/idrac_telemetry/vars/main.yml +++ b/telemetry/roles/idrac_telemetry/vars/main.yml @@ -56,10 +56,14 @@ unreachable_service_node_bmc_msg: > invalid_bmc_warning_msg: | [WARNING] Some BMC IPs are not valid. Kindly address the issues mentioned above and execute telemetry.yml. Telemetry feature wont be enabled for these BMC IPs from {{ bmc_group_data_filename }} file. 
+service_cluster_idrac_telemetry_dir_path: "{{ omnia_nfs_share }}/service_cluster/telemetry/idrac_telemetry" +idrac_telemetry_scripting_git_clone_path: "{{ service_cluster_idrac_telemetry_dir_path }}/iDRAC-Telemetry-Scripting" # Usage: trigger_telemetry_collection.yml idrac_telemetry_container: "idrac_telemetry_receiver" idrac_telemetry_receiver_entry_script: "/usr/local/bin/idrac_telemetry_receiver_init.sh" +idrac_telemetry_restart_failure_msg: | + Failed to restart idrac_telemetry_receiver container. Please check the logs using the command `podman logs idrac_telemetry_receiver` and try again later. # Usage: prometheus_config_reload.yml prometheus_reload: diff --git a/telemetry/roles/k8s_prometheus/tasks/download_images.yml b/telemetry/roles/k8s_prometheus/tasks/download_images.yml deleted file mode 100644 index d1a5e20ac9..0000000000 --- a/telemetry/roles/k8s_prometheus/tasks/download_images.yml +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Check if telemetry support is required - when: hostvars['127.0.0.1']['telemetry_entry_present'] - block: - - name: Check if k8s_prometheus_support is required - when: hostvars['127.0.0.1']['k8s_prometheus_support'] - block: - - name: Load telemetry.json file - ansible.builtin.set_fact: - telemetry_package_json: "{{ lookup('file', telemetry_packages_file) | from_json }}" - - - name: Find images and tags from JSON - ansible.builtin.set_fact: - prom_image_versions: >- - {{ telemetry_package_json.telemetry.cluster | selectattr('package', 'in', prom_image_names) | map(attribute='package') - | zip(telemetry_package_json.telemetry.cluster - | selectattr('package', 'in', prom_image_names) | map(attribute='tag')) | map('join', ':') | list }} - - - name: Pull Kube prometheus images - ansible.builtin.command: nerdctl pull {{ item }} - with_items: "{{ prom_image_versions }}" - changed_when: true - failed_when: false - environment: - http_proxy: "{{ hostvars['localhost']['http_proxy'] }}" - https_proxy: "{{ hostvars['localhost']['https_proxy'] }}" - no_proxy: "{{ hostvars['localhost']['oim_hostname'] }},{{ hostvars['localhost']['admin_nic_ip'] }}" - - - name: Check if prometheus_gaudi_support is required - when: hostvars['127.0.0.1']['prometheus_gaudi_support'] - block: - - name: Find images and tags from JSON in case of prometheus_gaudi_support - ansible.builtin.set_fact: - gaudi_exporter_image_versions: >- - {{ telemetry_package_json.telemetry.cluster | selectattr('package', 'in', gaudi_exporter_image_names) | map(attribute='package') - | zip(telemetry_package_json.telemetry.cluster - | selectattr('package', 'in', gaudi_exporter_image_names) | map(attribute='tag')) | map('join', ':') | list }} - - - name: Pull gaudi exporter images - ansible.builtin.command: nerdctl pull {{ item }} - with_items: "{{ gaudi_exporter_image_versions }}" - changed_when: true - failed_when: false - environment: - http_proxy: "{{ hostvars['localhost']['http_proxy'] }}" - https_proxy: 
"{{ hostvars['localhost']['https_proxy'] }}" - no_proxy: "{{ hostvars['localhost']['oim_hostname'] }},{{ hostvars['localhost']['admin_nic_ip'] }}" diff --git a/telemetry/roles/k8s_prometheus/tasks/install_kube_prometheus.yml b/telemetry/roles/k8s_prometheus/tasks/install_kube_prometheus.yml index c6b87a8333..8b16fb6361 100644 --- a/telemetry/roles/k8s_prometheus/tasks/install_kube_prometheus.yml +++ b/telemetry/roles/k8s_prometheus/tasks/install_kube_prometheus.yml @@ -13,21 +13,20 @@ # limitations under the License. --- -- name: Get K8s namespace - ansible.builtin.command: kubectl get ns - changed_when: false - register: k8s_ns +- name: Load service_k8s.json file + ansible.builtin.set_fact: + service_k8s_package_json: "{{ lookup('file', service_k8s_packages_file) | from_json }}" -- name: Get K8s pods - ansible.builtin.command: kubectl get pods --all-namespaces - changed_when: false - register: k8s_pods +- name: Set fact for kube-prometheus-stack + ansible.builtin.set_fact: + kube_prometheus_stack_pkg: "{{ service_k8s_package_json['service_k8s']['cluster'] | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'kube-prometheus-stack') | map(attribute='package') | join }}" # noqa: yaml[line-length] -- name: Create monitoring namespace - ansible.builtin.command: - cmd: kubectl create namespace monitoring - changed_when: false - when: "'monitoring' not in k8s_ns.stdout" +- name: Ensure monitoring namespace exists + ansible.builtin.k8s: + api_version: v1 + kind: Namespace + name: monitoring + state: present - name: Create directory for temp k8s files ansible.builtin.file: @@ -41,8 +40,12 @@ dest: "{{ kube_prometheus_values_file_dest }}" mode: "{{ file_mode }}" -- name: Install kube-prometheus stack - ansible.builtin.command: "helm install prometheus '{{ kube_prometheus_stack_repo }}' --namespace monitoring -f '{{ kube_prometheus_values_file_dest }}'" - changed_when: true - when: - - prometheus_pod_name not in k8s_pods.stdout +- name: Install 
kube-prometheus stack via Helm + community.kubernetes.helm: + name: prometheus + chart_ref: "{{ kube_prometheus_stack_repo }}" + release_namespace: monitoring + values_files: + - "{{ kube_prometheus_values_file_dest }}" + create_namespace: true + state: present diff --git a/telemetry/roles/k8s_prometheus/tasks/main.yml b/telemetry/roles/k8s_prometheus/tasks/main.yml index 7ec8359174..e4ba7baf65 100644 --- a/telemetry/roles/k8s_prometheus/tasks/main.yml +++ b/telemetry/roles/k8s_prometheus/tasks/main.yml @@ -14,8 +14,7 @@ --- - name: Check if telemetry support is required - when: hostvars['127.0.0.1']['telemetry_entry_present'] + when: hostvars['127.0.0.1']['kube_prometheus_support'] block: - name: Install kube prometheus stack ansible.builtin.include_tasks: install_kube_prometheus.yml - when: hostvars['localhost']['k8s_prometheus_support'] diff --git a/telemetry/roles/k8s_prometheus/vars/main.yml b/telemetry/roles/k8s_prometheus/vars/main.yml index 0874cb85e9..7dcfde0441 100644 --- a/telemetry/roles/k8s_prometheus/vars/main.yml +++ b/telemetry/roles/k8s_prometheus/vars/main.yml @@ -13,21 +13,9 @@ # limitations under the License. 
--- -# Usage: download_images.yml -telemetry_packages_file: "{{ role_path }}/../../../input/config/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/telemetry.json" -prom_image_names: - - quay.io/prometheus-operator/prometheus-operator - - registry.k8s.io/kube-state-metrics/kube-state-metrics - - quay.io/prometheus-operator/prometheus-config-reloader - - quay.io/prometheus/alertmanager - - quay.io/prometheus/node-exporter - - quay.io/prometheus/prometheus - - registry.k8s.io/ingress-nginx/kube-webhook-certgen -gaudi_exporter_image_names: - - vault.habana.ai/gaudi-metric-exporter/metric-exporter - # Usage: install_kube_prometheus.yml -kube_prometheus_stack_repo: "{{ hostvars['localhost']['offline_tarball_path'] }}/kube-prometheus-stack-62.3.0.tar.gz" +service_k8s_packages_file: "{{ role_path }}/../../../input/config/{{ hostvars['localhost']['software_config'].cluster_os_type }}/{{ hostvars['localhost']['software_config'].cluster_os_version }}/service_k8s.json" # noqa: yaml[line-length] +kube_prometheus_stack_repo: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ kube_prometheus_stack_pkg }}/{{ kube_prometheus_stack_pkg }}.tar.gz" k8s_tmp_dir: "/root/k8s" kube_prometheus_values_file_source: "{{ role_path }}/templates/kube_prometheus_values.yml.j2" kube_prometheus_values_file_dest: "{{ k8s_tmp_dir }}/k8s_prometheus_values.yml" @@ -35,4 +23,3 @@ file_mode: "0655" prometheus: storage: 50Gi storageClassName: nfs-client -prometheus_pod_name: "prometheus-prometheus-kube-prometheus-prometheus" diff --git a/telemetry/roles/service_k8s_telemetry/tasks/idrac_telemetry_deployment.yml b/telemetry/roles/service_k8s_telemetry/tasks/idrac_telemetry_deployment.yml index b040b3d3a9..8bb148e583 100644 --- a/telemetry/roles/service_k8s_telemetry/tasks/idrac_telemetry_deployment.yml +++ b/telemetry/roles/service_k8s_telemetry/tasks/idrac_telemetry_deployment.yml @@ -13,16 +13,6 @@ # limitations under the License. 
--- -- name: Get prometheus svc IP - ansible.builtin.command: kubectl get svc "{{ prometheus_k8s_name }}" -n "{{ telemetry_namespace }}" -o=jsonpath='{.spec.clusterIP}' - changed_when: false - register: prometheus_svc_ip - -- name: Get mysqldb svc IP - ansible.builtin.command: kubectl get svc "{{ mysqldb_k8s_name }}" -n "{{ telemetry_namespace }}" -o=jsonpath='{.spec.clusterIP}' - changed_when: false - register: mysql_svc_ip - - name: Create directory for iDRAC telemetry ansible.builtin.file: path: "{{ service_cluster_idrac_telemetry_dir_path }}" @@ -72,24 +62,21 @@ msg: "{{ idrac_script_git_clone_error_msg.splitlines() | join(' ') }}" when: clone_idrac_script is failed -- name: Deploy idrac-telemetry pod +- name: Deployment definition for idrac-telemetry StatefulSet kubernetes.core.k8s: state: present definition: apiVersion: apps/v1 - kind: Deployment + kind: StatefulSet metadata: name: "{{ idrac_telemetry_k8s_name }}" namespace: "{{ telemetry_namespace }}" - labels: - app: "{{ idrac_telemetry_k8s_name }}" spec: + serviceName: "{{ idrac_telemetry_service_name }}" + replicas: "{{ statefulset_replicas }}" selector: matchLabels: app: "{{ idrac_telemetry_k8s_name }}" - replicas: 1 - strategy: - type: RollingUpdate template: metadata: labels: @@ -105,14 +92,42 @@ - ip: "127.0.0.1" hostnames: - "activemq" - - ip: "{{ prometheus_svc_ip.stdout }}" - hostnames: - - "prometheus" - - ip: "{{ mysql_svc_ip.stdout }}" + # - ip: "{{ prometheus_svc_ip.stdout }}" + # hostnames: + # - "prometheus" + - ip: "127.0.0.1" hostnames: - "mysqldb" containers: + - name: mysqldb + image: "{{ mysql_image }}" + imagePullPolicy: IfNotPresent + volumeMounts: + - name: mysqldb-pvc + mountPath: /var/lib/mysql/ + env: + - name: MYSQL_DATABASE + value: "{{ mysqldb_name }}" + - name: MYSQL_USER + valueFrom: + secretKeyRef: + name: "{{ mysqldb_secrets_name }}" + key: mysqldb_user + - name: MYSQL_PASSWORD + valueFrom: + secretKeyRef: + name: "{{ mysqldb_secrets_name }}" + key: mysqldb_password + - 
name: MYSQL_ROOT_PASSWORD + valueFrom: + secretKeyRef: + name: "{{ mysqldb_secrets_name }}" + key: mysqldb_root_password + ports: + - containerPort: "{{ mysqldb_container_port1 }}" + - containerPort: "{{ mysqldb_container_port2 }}" + - name: activemq image: "{{ activemq_image }}" imagePullPolicy: "IfNotPresent" @@ -173,3 +188,35 @@ - "/bin/sh" - "-c" args: ["go run cmd/prometheuspump/prometheuspump.go"] + + volumeClaimTemplates: + - metadata: + name: mysqldb-pvc + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: "{{ mysqldb_storage }}" + +- name: Service for idrac telemetry + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: Service + metadata: + name: "{{ idrac_telemetry_service_name }}" + namespace: "{{ telemetry_namespace }}" + labels: + app: "{{ idrac_telemetry_service_name }}" + spec: + clusterIP: None + ports: + - name: mysql-port-1 + port: "{{ mysqldb_container_port1 }}" + - name: mysql-port-2 + port: "{{ mysqldb_container_port2 }}" + - name: pump-port + port: "{{ prometheus_pump_port }}" + selector: + app: "{{ idrac_telemetry_k8s_name }}" diff --git a/telemetry/roles/service_k8s_telemetry/tasks/main.yml b/telemetry/roles/service_k8s_telemetry/tasks/main.yml index dd8eee92a8..ee818b1ab6 100644 --- a/telemetry/roles/service_k8s_telemetry/tasks/main.yml +++ b/telemetry/roles/service_k8s_telemetry/tasks/main.yml @@ -24,9 +24,6 @@ - name: Configure k8s secrets ansible.builtin.include_tasks: secrets_creation.yml - - name: Deployment of mysqldb pod - ansible.builtin.include_tasks: mysqldb_deployment.yml - - name: Deployment of prometheus pod ansible.builtin.include_tasks: prometheus_deployment.yml diff --git a/telemetry/roles/service_k8s_telemetry/tasks/mysqldb_deployment.yml b/telemetry/roles/service_k8s_telemetry/tasks/mysqldb_deployment.yml deleted file mode 100644 index bb55cee2dd..0000000000 --- a/telemetry/roles/service_k8s_telemetry/tasks/mysqldb_deployment.yml +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 
2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Persistent volume claim for mysqldb - kubernetes.core.k8s: - state: present - definition: - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: "{{ mysqldb_pvc_name }}" - namespace: "{{ telemetry_namespace }}" - spec: - storageClassName: "{{ storage_class_name }}" - accessModes: - - ReadWriteOnce - resources: - requests: - storage: "{{ mysqldb_storage }}" - -- name: Mysqldb pod definition - kubernetes.core.k8s: - state: present - definition: - apiVersion: apps/v1 - kind: StatefulSet - metadata: - name: "{{ mysqldb_k8s_name }}" - namespace: "{{ telemetry_namespace }}" - spec: - selector: - matchLabels: - app: "{{ mysqldb_k8s_name }}" - serviceName: "{{ mysqldb_k8s_name }}" - replicas: "{{ statefulset_replicas }}" - template: - metadata: - labels: - app: "{{ mysqldb_k8s_name }}" - spec: - volumes: - - name: mysqldb-pvc - persistentVolumeClaim: - claimName: "{{ mysqldb_pvc_name }}" - - containers: - - name: mysqldb - image: "{{ mysql_image }}" - imagePullPolicy: "IfNotPresent" - volumeMounts: - - mountPath: /var/lib/mysql/ - name: mysqldb-pvc - env: - - name: MYSQL_DATABASE - value: "{{ mysqldb_name }}" - - name: MYSQL_USER - valueFrom: - secretKeyRef: - name: "{{ mysqldb_secrets_name }}" - key: mysqldb_user - - name: MYSQL_PASSWORD - valueFrom: - secretKeyRef: - name: "{{ mysqldb_secrets_name }}" - key: mysqldb_password - - name: 
MYSQL_ROOT_PASSWORD - valueFrom: - secretKeyRef: - name: "{{ mysqldb_secrets_name }}" - key: mysqldb_root_password - ports: - - containerPort: "{{ mysqldb_container_port1 }}" - - containerPort: "{{ mysqldb_container_port2 }}" - -- name: Service for mysqldb - kubernetes.core.k8s: - state: present - definition: - apiVersion: v1 - kind: Service - metadata: - name: "{{ mysqldb_k8s_name }}" - namespace: "{{ telemetry_namespace }}" - labels: - app: "{{ mysqldb_k8s_name }}" - spec: - type: ClusterIP - ports: - - name: mysqldb-http-port-1 - port: "{{ mysqldb_container_port1 }}" - - name: mysqldb-http-port-2 - port: "{{ mysqldb_container_port2 }}" - selector: - app: "{{ mysqldb_k8s_name }}" diff --git a/telemetry/roles/service_k8s_telemetry/tasks/prereq_checks.yml b/telemetry/roles/service_k8s_telemetry/tasks/prereq_checks.yml index 35cec831e2..0f61c58de9 100644 --- a/telemetry/roles/service_k8s_telemetry/tasks/prereq_checks.yml +++ b/telemetry/roles/service_k8s_telemetry/tasks/prereq_checks.yml @@ -57,3 +57,9 @@ ansible.builtin.fail: msg: "{{ storage_class_missing_fail_msg }}" when: storage_class_name not in (sc_info.resources | map(attribute='metadata.name') | list) + +- name: Get Kubernetes service cluster node count + kubernetes.core.k8s_info: + api_version: v1 + kind: Node + register: node_count diff --git a/telemetry/roles/service_k8s_telemetry/vars/main.yml b/telemetry/roles/service_k8s_telemetry/vars/main.yml index f7e922c002..8b83ff08ed 100644 --- a/telemetry/roles/service_k8s_telemetry/vars/main.yml +++ b/telemetry/roles/service_k8s_telemetry/vars/main.yml @@ -20,15 +20,6 @@ k8s_not_installed_fail_msg: "Failed. Kubernetes installation required. storage_class_missing_fail_msg: "Failed. StorageClass {{ storage_class_name }} is not present in the cluster. To resolve this, create the StorageClass by running scheduler/service_k8s_cluster.yml playbook." 
-# Usage: mysqldb_deployment.yml -mysqldb_storage: 1Gi -mysqldb_pvc_name: mysqldb-storage-claim -mysqldb_k8s_name: mysqldb -mysqldb_name: "idrac_telemetrydb" -mysqldb_container_port1: 3306 -mysqldb_container_port2: 33060 -mysql_image: "docker.io/library/mysql:9.3.0" - # Usage: k8s_secrets.yml mysqldb_secrets_name: mysqldb-credentials @@ -37,7 +28,7 @@ omnia_nfs_share: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia" service_cluster_idrac_telemetry_dir_path: "{{ omnia_nfs_share }}/service_cluster/telemetry/idrac_telemetry" dir_permissions_755: "0755" idrac_telemetry_github_repo: https://github.com/dell/iDRAC-Telemetry-Reference-Tools.git -reference_tools_stable_commit: "94e7621" +reference_tools_stable_commit: "9a1c72b" idrac_telemetry_reference_git_clone_path: "{{ service_cluster_idrac_telemetry_dir_path }}/iDRAC-Telemetry-Reference-Tools" idrac_git_clone_error_msg: | Failed to clone iDRAC Telemetry GitHub repository from {{ idrac_telemetry_github_repo }} @@ -58,6 +49,15 @@ activemq_http_port_1: 8161 activemq_http_port_2: 61616 messagebus_http_port: 61613 configui_http_port: 8082 +mysqldb_storage: 1Gi +mysqldb_pvc_name: mysqldb-storage-claim +mysqldb_k8s_name: mysqldb +mysqldb_name: "idrac_telemetrydb" +idrac_telemetry_service_name: "idrac-telemetry-service" +mysqldb_container_port1: 3306 +mysqldb_container_port2: 33060 +prometheus_pump_port: 2112 +mysql_image: "docker.io/library/mysql:9.3.0" # Usage: prometheus_deployment.yml prometheus_pvc_name: "prometheus-pvc" @@ -65,6 +65,6 @@ prometheus_k8s_name: "prometheus" prometheus_configmap_name: "prometheus-config" prometheus_image: "docker.io/prom/prometheus:v3.4.1" prometheus_container_port: 9090 -statefulset_replicas: 1 +statefulset_replicas: "{{ node_count.resources | length }}" prometheus_storage: 1Gi prometheus_service_port: 30090 diff --git a/telemetry/roles/telemetry_validation/tasks/main.yml b/telemetry/roles/telemetry_validation/tasks/main.yml index 3989426baa..c363cfcb67 100644 ---
a/telemetry/roles/telemetry_validation/tasks/main.yml +++ b/telemetry/roles/telemetry_validation/tasks/main.yml @@ -34,12 +34,16 @@ - name: Include local_repo_access variables ansible.builtin.include_tasks: read_software_config.yml - when: federated_idrac_telemetry_collection + when: federated_idrac_telemetry_collection or kube_prometheus_support - name: Validate iDRAC inventory ansible.builtin.include_tasks: validate_idrac_inventory.yml when: idrac_telemetry_support + - name: Validate kube prometheus + ansible.builtin.include_tasks: validate_k8s_prometheus.yml + when: kube_prometheus_support + # - name: Custom validation for federated way of telemetry # ansible.builtin.include_tasks: validate_service_node_status.yml # when: federated_idrac_telemetry_collection diff --git a/telemetry/roles/telemetry_validation/tasks/read_software_config.yml b/telemetry/roles/telemetry_validation/tasks/read_software_config.yml index 685aad8be3..696ff3d796 100644 --- a/telemetry/roles/telemetry_validation/tasks/read_software_config.yml +++ b/telemetry/roles/telemetry_validation/tasks/read_software_config.yml @@ -105,7 +105,7 @@ python_package_name: >- {{ telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'rpm') - | selectattr('package', 'search', 'python3') + | selectattr('package', 'search', '^python3\.\d+$') | map(attribute='package') | join }} k8s_pip_packages: >- {{ telemetry_packages['service_k8s']['cluster'] diff --git a/telemetry/roles/telemetry_validation/tasks/validate_k8s_prometheus_prometheus_gaudi.yml b/telemetry/roles/telemetry_validation/tasks/validate_k8s_prometheus.yml similarity index 83% rename from telemetry/roles/telemetry_validation/tasks/validate_k8s_prometheus_prometheus_gaudi.yml rename to telemetry/roles/telemetry_validation/tasks/validate_k8s_prometheus.yml index 4196677c9f..8e05a8acbc 100644 --- a/telemetry/roles/telemetry_validation/tasks/validate_k8s_prometheus_prometheus_gaudi.yml +++ 
b/telemetry/roles/telemetry_validation/tasks/validate_k8s_prometheus.yml @@ -23,7 +23,7 @@ - name: Validate kube_control_plane group ansible.builtin.assert: - that: "groups['kube_control_plane'] | length | int == 1" + that: "groups['kube_control_plane'] | length | int >= 1" fail_msg: "{{ kube_control_plane_group_fail_msg }}" - name: Validate kube_node group @@ -40,10 +40,3 @@ ansible.builtin.assert: that: "groups['etcd'] | length | int % 2 == 1" fail_msg: "{{ etcd_odd_entry_fail_msg }}" - -- name: Assert prometheus_scrape_interval - ansible.builtin.assert: - that: - - prometheus_scrape_interval is integer - - prometheus_scrape_interval > 0 - fail_msg: "{{ prometheus_scrape_interval_fail_msg }}" diff --git a/telemetry/roles/telemetry_validation/tasks/validate_k8s_setup.yml b/telemetry/roles/telemetry_validation/tasks/validate_k8s_setup.yml deleted file mode 100644 index dac76ba9a7..0000000000 --- a/telemetry/roles/telemetry_validation/tasks/validate_k8s_setup.yml +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Check if telemetry support is required - when: hostvars['127.0.0.1']['telemetry_entry_present'] - block: - - name: Validate k8s cluster - when: hostvars['localhost']['k8s_prometheus_support'] or hostvars['localhost']['prometheus_gaudi_support'] - block: - - name: Set fact for k8s installation status - ansible.builtin.set_fact: - k8s_installation_status: false - - - name: Check whether k8s is installed - ansible.builtin.command: kubectl get nodes -o='Name' - register: kubectl_status - changed_when: false - failed_when: false - - - name: Set the k8s installation status - ansible.builtin.set_fact: - k8s_installation_status: true - when: k8s_error_message not in kubectl_status.msg - - - name: Fail when K8S Cluster is not setup - ansible.builtin.fail: - msg: "{{ k8s_cluster_fail_msg }}" - when: not k8s_installation_status diff --git a/telemetry/roles/telemetry_validation/tasks/validate_telemetry_config.yml b/telemetry/roles/telemetry_validation/tasks/validate_telemetry_config.yml index f3bfefdf52..7e43a22c0d 100644 --- a/telemetry/roles/telemetry_validation/tasks/validate_telemetry_config.yml +++ b/telemetry/roles/telemetry_validation/tasks/validate_telemetry_config.yml @@ -45,6 +45,7 @@ idrac_telemetry_support: "{{ idrac_telemetry_support | lower }}" visualization_support: "{{ visualization_support | lower }}" federated_idrac_telemetry_collection: "{{ federated_idrac_telemetry_collection | lower }}" + kube_prometheus_support: "{{ kube_prometheus_support | lower }}" - name: Warning for all telemetry support category values set as false ansible.builtin.pause: @@ -53,6 +54,7 @@ when: - not idrac_telemetry_support - not visualization_support + - not kube_prometheus_support - name: Warning for idrac_telemetry_support is currently set to false ansible.builtin.pause: @@ -66,11 +68,6 @@ prompt: "{{ warning_idrac_telemetry_support_true }}" when: idrac_telemetry_support -- name: Failed, Federated iDRAC Telemetry Collection not supported - ansible.builtin.fail: - 
msg: "{{ warning_federated_telemetry_support }}" - when: federated_idrac_telemetry_collection - # - name: Validate k8s prometheus, scrape interval and prometheus gaudi # ansible.builtin.include_tasks: validate_k8s_prometheus_prometheus_gaudi.yml # when: k8s_prometheus_support or prometheus_gaudi_support diff --git a/telemetry/roles/telemetry_validation/vars/main.yml b/telemetry/roles/telemetry_validation/vars/main.yml index ee593c3e13..1b16e8d16a 100644 --- a/telemetry/roles/telemetry_validation/vars/main.yml +++ b/telemetry/roles/telemetry_validation/vars/main.yml @@ -20,7 +20,7 @@ fail_msg_telemetry_config_file: "telemetry_config.yml file doesn't exist." pause_time_15: 15 bmc_group_data_filename: "/opt/omnia/telemetry/bmc_group_data.csv" warning_telemetry_support_false: | - "[WARNING] idrac_telemetry_support and visualization_support are false in telemetry_config.yml. + "[WARNING] kube_prometheus_support, idrac_telemetry_support and visualization_support are false in telemetry_config.yml. Omnia does not deploy telemetry feature if none of the support category is true." telemetry_config_syntax_fail_msg: "Failed. Syntax errors present in telemetry_config.yml. Fix errors and re-run playbook again." warning_idrac_telemetry_support_false: | @@ -34,17 +34,19 @@ warning_idrac_telemetry_support_true: | Confirm that all BMC IPs are reachable from the OIM and respective service cluster nodes for telemetry to function properly. Make sure that Redfish is enabled and the iDRAC has a datacenter license. Also, ensure that the firmware version is greater than 4 for iDRAC9 or greater than 1 for iDRAC10." -warning_federated_telemetry_support: "Failed: Federated iDRAC Telemetry Collection is not supported yet and will be available in future releases." # # Usage: include_provision_config.yml # provision_config_file: "{{ input_project_dir }}/provision_config.yml" # fail_msg_provision_config_file: "provision_config.yml file doesn't exist." # fail_timezone_msg: "Failed. 
Incorrect timezone provided. Please check the file timezone.txt in discovery/roles/discovery_validations/common/files/ folder." -# # Usage: validate_k8s_prometheus_prometheus_gaudi.yml -# k8s_prom_gaudi_inventory_fail_msg: "Inventory comprising kube_control_plane, kube_node and etcd groups should be passed \ -# when k8s_prometheus_support or prometheus_gaudi_support is true in telemetry_config.yml." -# prometheus_scrape_interval_fail_msg: "Failed. prometheus_scrape_interval accepts integer values greater than 0" +# Usage: validate_k8s_prometheus.yml +k8s_prom_gaudi_inventory_fail_msg: "Inventory comprising kube_control_plane, kube_node and etcd groups should be passed \ + when kube_prometheus_support is true in telemetry_config.yml." +kube_control_plane_group_fail_msg: "kube_control_plane group should contain at least 1 node in inventory" +kube_node_group_fail_msg: "kube_node group should contain at least 1 node in inventory" +etcd_group_fail_msg: "etcd group should contain at least 1 node in inventory" +etcd_odd_entry_fail_msg: "etcd group should have odd number of nodes in inventory" # Usage: validate_image_tars.yml # noqa: yaml[line-length] @@ -62,7 +64,7 @@ software_config_file: "{{ input_project_dir }}/software_config.json" # and then execute telemetry.yml. local_repo_service_missing_msg: | [ERROR] It seems local_repo not executed with service_k8s/service_node entry in software_config.yml. - Kindly execute local_repo.yml with `service_k8s` or `service_node` entry in softwares list in software_config.yml + Kindly execute local_repo.yml with `service_k8s` or `service_node` entry in softwares list in software_config.json and then execute telemetry.yml. local_repo_access_path: "/opt/omnia/provision/local_repo_access.yml" # sn_packages_file: "{{ input_project_dir }}/config/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_node.json" @@ -75,11 +77,6 @@ software_config_syntax_fail_msg: "Failed.
Syntax errors present in software_conf # fail_msg_prometheus_gaudi_support: "Failed. prometheus_gaudi_support is only available for cluster_os_type: ubuntu and cluster_os_version: 22.04 , 24.04. \ # Please update prometheus_gaudi_support to false in telemetry_config.yml." -# # Usage: validate_k8s_setup.yml -# k8s_error_message: "No such file or directory" -# k8s_cluster_fail_msg: "Failed. k8s cluster setup not found. Hence k8s prometheus or prometheus gaudi will not be deployed. \ -# Please run scheduler/scheduler.yml to setup k8s cluster" - # # Usage: validate_site_config.yml # site_config_file: "{{ input_project_dir }}/site_config.yml" # invalid_proxy_failure_msg: "Failed. Both http_proxy and https_proxy should be set for proxy variable provided in site_config.yml" diff --git a/telemetry/telemetry.yml b/telemetry/telemetry.yml index 0b5c9a14d9..e7c848f215 100644 --- a/telemetry/telemetry.yml +++ b/telemetry/telemetry.yml @@ -65,6 +65,17 @@ - hostvars['localhost']['idrac_telemetry_support'] - not hostvars['localhost']['federated_idrac_telemetry_collection'] +- name: Validate service k8s cluster + hosts: kube_control_plane[0] + gather_facts: false + any_errors_fatal: true + tasks: + - name: Validate service k8s cluster + ansible.builtin.include_role: + name: service_k8s_telemetry + tasks_from: prereq_checks.yml + when: hostvars['localhost']['kube_prometheus_support'] or hostvars['localhost']['federated_idrac_telemetry_collection'] + # - name: Update Repositories/Registries on nodes # ansible.builtin.import_playbook: ../utils/update_user_repo.yml # when: not ( hostvars['127.0.0.1']['update_user_repo_executed'] | default(false) | bool ) @@ -123,6 +134,15 @@ ansible.builtin.include_role: name: service_k8s_telemetry +- name: Deployment of kube prometheus in service cluster + hosts: kube_control_plane[0] + connection: ssh + gather_facts: false + tasks: + - name: Deployment of kube prometheus + ansible.builtin.include_role: + name: k8s_prometheus + - name: Enable idrac 
telemetry in OIM hosts: localhost connection: local diff --git a/utils/credential_utility/roles/create_config/templates/omnia_credential.j2 b/utils/credential_utility/roles/create_config/templates/omnia_credential.j2 index 10a8d38030..d07145e4a6 100644 --- a/utils/credential_utility/roles/create_config/templates/omnia_credential.j2 +++ b/utils/credential_utility/roles/create_config/templates/omnia_credential.j2 @@ -13,7 +13,7 @@ pulp_password: "" docker_username: "" docker_password: "" -#Omnia credentials +# Omnia credentials slurm_db_password: "" # Security credentials @@ -25,11 +25,15 @@ openldap_monitor_password: "" kerberos_admin_password: "" directory_manager_password: "" -# idrac telemetry +# iDrac Telemetry credentials mysqldb_user: "" mysqldb_password: "" mysqldb_root_password: "" -#visualization +# Visualization credentials grafana_username: "" grafana_password: "" + +# csi powerscale credentials +csi_username: "" +csi_password: "" diff --git a/utils/credential_utility/roles/update_config/vars/main.yml b/utils/credential_utility/roles/update_config/vars/main.yml index 3c75ae4271..4eb83bae9e 100644 --- a/utils/credential_utility/roles/update_config/vars/main.yml +++ b/utils/credential_utility/roles/update_config/vars/main.yml @@ -66,3 +66,6 @@ omnia_credentials: visualization: mandatory: - { username: grafana_username, password: grafana_password } + csi_driver_powerscale: + mandatory: + - { username: csi_username, password: csi_password } diff --git a/utils/roles/common/tasks/fetch_software_config.yml b/utils/roles/common/tasks/fetch_software_config.yml index 068b512a68..39cb07482e 100644 --- a/utils/roles/common/tasks/fetch_software_config.yml +++ b/utils/roles/common/tasks/fetch_software_config.yml @@ -16,6 +16,7 @@ - name: Initialise variables ansible.builtin.set_fact: k8s_support: false + compute_k8s_support: false service_k8s_support: false slurm_support: false @@ -24,27 +25,27 @@ file: "{{ software_config_json_file }}" name: software_config -# Check 
if k8s is mentioned in software_config.json -- name: Check if k8s support is true +# Check if compute k8s is mentioned in software_config.json +- name: Check if compute k8s support is true ansible.builtin.set_fact: - k8s_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'k8s') | list | length > 0 }}" + compute_k8s_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'compute_k8s') | list | length > 0 }}" - name: Check if service k8s support is true ansible.builtin.set_fact: service_k8s_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'service_k8s') | list | length > 0 }}" - name: Prepare kubernetes installation variables - when: k8s_support is true + when: compute_k8s_support is true block: - - name: Extract k8s version + - name: Extract compute_k8s version ansible.builtin.set_fact: - k8s_version: "{{ software_config.softwares | selectattr('name', 'equalto', 'k8s') | map(attribute='version') | first }}" + compute_k8s_version: "{{ software_config.softwares | selectattr('name', 'equalto', 'compute_k8s') | map(attribute='version') | first }}" # noqa: var-naming[no-jinja] - name: Prepare kubernetes installation variables when: service_k8s_support is true block: - - name: Extract k8s version + - name: Extract service k8s version ansible.builtin.set_fact: service_k8s_version: "{{ software_config.softwares | selectattr('name', 'equalto', 'service_k8s') | map(attribute='version') | first }}" # noqa: var-naming[no-jinja] diff --git a/utils/roles/common/tasks/validate_k8s_metadata.yml b/utils/roles/common/tasks/validate_k8s_metadata.yml index 1f4d1c6240..8ab528f045 100644 --- a/utils/roles/common/tasks/validate_k8s_metadata.yml +++ b/utils/roles/common/tasks/validate_k8s_metadata.yml @@ -14,7 +14,7 @@ --- - name: Validate input passed to playbook - when: k8s_support or service_k8s_support + when: compute_k8s_support or service_k8s_support block: - name: Fail if cluster_name is not passed as an argument to the 
playbook ansible.builtin.fail: @@ -75,6 +75,12 @@ msg: "{{ confirmation_fail_msg }}" when: pause_result.user_input | default('yes') != 'yes' + - name: Set facts when compute_k8s_cluster + ansible.builtin.set_fact: + k8s_version: "{{ compute_k8s_version }}" + k8s_support: "{{ compute_k8s_support }}" + when: cluster_var_name == "compute_k8s_cluster" + - name: Set facts when service_k8s_cluster ansible.builtin.set_fact: k8s_version: "{{ service_k8s_version }}" @@ -94,7 +100,7 @@ k8s_service_addresses: "{{ selected_cluster.k8s_service_addresses }}" k8s_pod_network_cidr: "{{ selected_cluster.k8s_pod_network_cidr }}" -- name: Validate k8s metadata +- name: Validate compute k8s and service k8s metadata when: k8s_support block: - name: Check if the k8s metadata file exists @@ -117,12 +123,12 @@ ansible.builtin.set_fact: metadata_dict: "{{ metadata_content.content | b64decode | from_yaml }}" - - name: Check if the given k8s version exists in metadata + - name: Check if the given compute k8s or service k8s version exists in metadata ansible.builtin.set_fact: matched_kubespray_version: "{{ metadata_dict.k8s_kubespray_versions[k8s_version] }}" when: k8s_version in metadata_dict.k8s_kubespray_versions - - name: Fail if k8s version is not found in metadata + - name: Fail if compute k8s or service k8s is not found in metadata ansible.builtin.fail: msg: "{{ k8s_version_failed_msg }}" when: k8s_version not in metadata_dict.k8s_kubespray_versions diff --git a/utils/roles/inventory_validation/tasks/fetch_software_config.yml b/utils/roles/inventory_validation/tasks/fetch_software_config.yml index 84dd4a7ed4..b97d3f0fc2 100644 --- a/utils/roles/inventory_validation/tasks/fetch_software_config.yml +++ b/utils/roles/inventory_validation/tasks/fetch_software_config.yml @@ -15,7 +15,7 @@ - name: Initialise variables ansible.builtin.set_fact: - k8s_support: false + compute_k8s_support: false service_k8s_support: false slurm_support: false freeipa_support: false @@ -58,7 +58,7 @@ - name: Set 
k8s, slurm, freeipa and openldap support ansible.builtin.set_fact: - k8s_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'k8s') | list | length > 0 }}" + compute_k8s_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'compute_k8s') | list | length > 0 }}" service_k8s_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'service_k8s') | list | length > 0 }}" slurm_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'slurm') | list | length > 0 }}" freeipa_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'freeipa') | list | length > 0 }}" diff --git a/utils/roles/inventory_validation/tasks/validate_inventory.yml b/utils/roles/inventory_validation/tasks/validate_inventory.yml index 728e4de32d..01a8559ef6 100644 --- a/utils/roles/inventory_validation/tasks/validate_inventory.yml +++ b/utils/roles/inventory_validation/tasks/validate_inventory.yml @@ -43,11 +43,11 @@ # Validate K8s requirements for all the nodes - name: Fetch omnia_config.yml - when: k8s_support and compute_k8s_playbook is defined + when: compute_k8s_support and compute_k8s_playbook is defined ansible.builtin.include_tasks: fetch_omnia_config.yml - name: Validate K8s nodes requirements - when: k8s_support and compute_k8s_playbook is defined + when: compute_k8s_support and compute_k8s_playbook is defined ansible.builtin.include_tasks: k8s_validations.yml # Validate service K8s requirements for all the nodes