diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index d7af9d4792..7b494487e2 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -314,6 +314,8 @@ def server_spec_network_key_fail_msg(nic_device): "Check high_availability_config.yml and network_spec.yml") VIRTUAL_IP_NOT_VALID = ("should be outside the admin static and dynamic ranges. " "Check high_availability_config.yml and network_spec.yml") +VIRTUAL_IP_NOT_POD_EXT = ("should be outside the pod_external_ip ranges. " + "Check high_availability_config.yml and omnia_config.yml") BMC_VIRTUAL_IP_NOT_VALID = ("should be outside any bmc static and dynamic ranges. " "Check high_availability_config.yml, network_spec.yml, and " "roles_config.yml") diff --git a/common/library/module_utils/input_validation/schema/credential_rules.json b/common/library/module_utils/input_validation/schema/credential_rules.json index 20700becb4..ab21ea8796 100644 --- a/common/library/module_utils/input_validation/schema/credential_rules.json +++ b/common/library/module_utils/input_validation/schema/credential_rules.json @@ -103,9 +103,9 @@ "pattern": "^[^;\\[\\]`]+$" }, "csi_password": { - "description": "Password for Powerscale UI. Must not contain hyphens (-), single quotes ('), double quotes (\"), at symbols (@), or backslashes (\\).", + "description": "Password for Powerscale UI. Can contain any characters. 
Length must be between 5 and 32.", "minLength": 5, "maxLength": 32, - "pattern": "^[^\\-\\'\\\"@\\\\]*$" + "pattern": "^.{5,32}$" } } diff --git a/common/library/module_utils/input_validation/schema/high_availability_config.json b/common/library/module_utils/input_validation/schema/high_availability_config.json index b591a947c2..2107af70f2 100644 --- a/common/library/module_utils/input_validation/schema/high_availability_config.json +++ b/common/library/module_utils/input_validation/schema/high_availability_config.json @@ -49,8 +49,19 @@ ], "properties": { "virtual_ip_address": { - "type": "string", - "pattern": "^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}$" + "type": "string", + "allOf": [ + { "pattern": "^[0-9.]+$" }, + { "format": "ipv4" } + ] + }, + "active_node_service_tags": { + "type": ["array", "null"], + "minItems": 0, + "items": { + "type": "string", + "pattern": "^[a-zA-Z0-9]+$" + } } } } diff --git a/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py b/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py index b158294bf5..af22106858 100644 --- a/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py @@ -15,7 +15,9 @@ """ This module contains functions for validating high availability configuration. """ - +import csv +import os +import yaml from ansible.module_utils.input_validation.common_utils import validation_utils from ansible.module_utils.input_validation.common_utils import config from ansible.module_utils.input_validation.common_utils import en_us_validation_msg @@ -41,7 +43,8 @@ def get_roles_config_json(input_file_path, logger, module, omnia_base_dir, proje Returns: dict: The roles configuration as json. 
""" - roles_config_file_path = create_file_path(input_file_path, file_names["functional_groups_config"]) + roles_config_file_path = create_file_path(input_file_path, + file_names["functional_groups_config"]) roles_config_json = validation_utils.load_yaml_as_json( roles_config_file_path, omnia_base_dir, project_name, logger, module ) @@ -285,8 +288,8 @@ def validate_vip_address( errors, config_type, vip_address, - service_node_vip, admin_network, + pod_external_ip_list, admin_netmaskbits, oim_admin_ip ): @@ -307,50 +310,53 @@ def validate_vip_address( - None: The function does not return any value, it only appends error messages to the errors list. """ - # validate if the same virtual_ip_address is already use - if vip_address in service_node_vip: + + # virtual_ip_address is mutually exclusive with admin dynamic ranges + vip_within_dynamic_range = validation_utils.is_ip_within_range( + admin_network["dynamic_range"], vip_address + ) + + if vip_within_dynamic_range: errors.append( create_error_msg( - f"{config_type} virtual_ip_address:", + f"{config_type} virtual_ip_address", vip_address, - en_us_validation_msg.DUPLICATE_VIRTUAL_IP, + en_us_validation_msg.VIRTUAL_IP_NOT_VALID, ) ) - else: - # virtual_ip_address is mutually exclusive with admin static and dynamic ranges - vip_within_static_range = validation_utils.is_ip_within_range( - admin_network["static_range"], vip_address - ) - vip_within_dynamic_range = validation_utils.is_ip_within_range( - admin_network["dynamic_range"], vip_address + # pod external + for pod_ext in pod_external_ip_list: + vip_within_pod_external = validation_utils.is_ip_within_range( + pod_ext, vip_address ) - if vip_within_static_range or vip_within_dynamic_range: + if vip_within_pod_external: errors.append( create_error_msg( - f"{config_type} virtual_ip_address", + f"{config_type} vip in pod external", vip_address, - en_us_validation_msg.VIRTUAL_IP_NOT_VALID, + en_us_validation_msg.VIRTUAL_IP_NOT_POD_EXT, ) ) - # validate 
virtual_ip_address is in the admin subnet - if not validation_utils.is_ip_in_subnet(oim_admin_ip, admin_netmaskbits, vip_address): - errors.append( - create_error_msg( - f"{config_type} virtual_ip_address", - vip_address, - en_us_validation_msg.VIRTUAL_IP_NOT_IN_ADMIN_SUBNET, - ) + # pxe_map IPs + # validate virtual_ip_address is in the admin subnet + if not validation_utils.is_ip_in_subnet(oim_admin_ip, admin_netmaskbits, vip_address): + errors.append( + create_error_msg( + f"{config_type} virtual_ip_address", + vip_address, + en_us_validation_msg.VIRTUAL_IP_NOT_IN_ADMIN_SUBNET, ) + ) -def validate_k8s_head_node_ha( +def validate_service_k8s_cluster_ha( errors, config_type, ha_data, + input_file_path, network_spec_data, - roles_config_json, all_service_tags, ha_node_vip_list ): @@ -375,10 +381,24 @@ def validate_k8s_head_node_ha( None: Errors are collected in the provided `errors` list. """ admin_network = network_spec_data["admin_network"] - admin_static_range = admin_network.get("static_range", "N/A") admin_dynamic_range = admin_network.get("dynamic_range", "N/A") + admin_netmaskbits = network_spec_data.get("admin_netmaskbits") oim_admin_ip = network_spec_data["oim_admin_ip"] + with open(os.path.join(input_file_path, "provision_config.yml"), "r", encoding="utf-8") as f: + prov_cfg = yaml.safe_load(f) + + with open(prov_cfg.get('pxe_mapping_file_path'), newline='', encoding='utf-8') as csvfile: + pxe_list = list(csv.DictReader(csvfile, delimiter=",")) + pxe_admin_ips = [item["ADMIN_IP"] for item in pxe_list] + pxe_bmc_ips = [item["BMC_IP"] for item in pxe_list] + + with open(os.path.join(input_file_path, "omnia_config.yml"), "r", encoding="utf-8") as omniacfg: + omnia_config = yaml.safe_load(omniacfg) + pod_external_ip_list = [item.get("pod_external_ip_range") + for item in omnia_config.get('service_k8s_cluster') + if item.get('deployment', False)] + if not isinstance(ha_data, list): ha_data = [ha_data] for hdata in ha_data: @@ -388,7 +408,7 @@ def 
validate_k8s_head_node_ha( # validate active_node_service_tag and passive_node_service_tag all_service_tags_set = set(all_service_tags) active_node_service_tags_set = set(active_node_service_tags) - + vip_address = hdata.get("virtual_ip_address") # Find the intersection common_tags = all_service_tags_set & active_node_service_tags_set @@ -402,8 +422,26 @@ def validate_k8s_head_node_ha( ) ) + if vip_address: + for ip_list in (ha_node_vip_list, pxe_admin_ips, pxe_bmc_ips): + if vip_address in ip_list: + errors.append( + create_error_msg( + f"{config_type} virtual_ip_duplicate", + vip_address, + en_us_validation_msg.DUPLICATE_VIRTUAL_IP)) + validate_vip_address( + errors, + config_type, + vip_address, + admin_network, + pod_external_ip_list, + admin_netmaskbits, + oim_admin_ip + ) + if external_loadbalancer_ip: - ip_ranges = [admin_static_range, admin_dynamic_range, external_loadbalancer_ip] + ip_ranges = [admin_dynamic_range, external_loadbalancer_ip] does_overlap, _ = validation_utils.check_overlap(ip_ranges) if does_overlap: @@ -412,62 +450,62 @@ def validate_k8s_head_node_ha( ) - -def validate_slurm_head_node_ha( - errors, - config_type, - ha_data, - network_spec_data, - _roles_config_json, - all_service_tags, - ha_node_vip_list -): +def load_network_spec(input_file_path): """ - Validates the high availability configuration for a service node. + Loads network specification from a YAML file and returns it as a dictionary. - Parameters: - errors (list): A list to store error messages. - config_type (str): The type of high availability configuration. - ha_data (dict): A dictionary containing high availability data. - network_spec_data (dict): A dictionary containing network specification data. - _roles_config_json (dict): A dictionary containing roles configuration data. - all_service_tags (list): A list of all service tags. - ha_node_vip_list (list): A list of virtual IP addresses for high availability nodes. 
+ Args: + input_file_path (str): The path to the directory containing the YAML file. Returns: - None + dict: A dictionary containing network specification information. """ - active_node_service_tag = ha_data.get("active_node_service_tag") - passive_nodes = ha_data.get("passive_nodes", []) - vip_address = ha_data.get("virtual_ip_address") + with open(os.path.join(input_file_path, "network_spec.yml"), "r", encoding="utf-8") as f: + network_spec_json = yaml.safe_load(f) + network_spec_info = { + "admin_network": get_admin_static_dynamic_ranges(network_spec_json), + "admin_nic_name": get_admin_nic_name(network_spec_json), + "bmc_network": get_bmc_network(network_spec_json), + "bmc_nic_name": get_bmc_nic_name(network_spec_json), + "admin_netmaskbits": get_admin_netmaskbits(network_spec_json), + "admin_uncorrelated_node_start_ip": get_admin_uncorrelated_node_start_ip( + network_spec_json + ), + "oim_admin_ip": get_primary_oim_admin_ip(network_spec_json) + } + return network_spec_info - # get network_spec data - admin_network = network_spec_data["admin_network"] - admin_netmaskbits = network_spec_data["admin_netmaskbits"] - oim_admin_ip = network_spec_data["oim_admin_ip"] +def validate_ha_config(ha_data, mandatory_fields, errors, config_type, + input_file_path, all_service_tags, ha_node_vip_list): + """ + Validates high availability configuration. - # validate active_node_service_tag and passive_node_service_tag - validate_service_tag_presence( - errors, config_type, all_service_tags, active_node_service_tag, passive_nodes - ) + Args: + ha_data (dict): The high availability configuration data. + mandatory_fields (list): The list of mandatory fields in the HA configuration. + errors (list): The list to store error messages. + config_type (str): The type of HA configuration. + input_file_path (str): The path to the directory containing the YAML file. + all_service_tags (list): The list of all service tags. + ha_node_vip_list (list): The list of HA node VIPs. 
- # validate if duplicate virtual ip address is present - if vip_address: - validate_vip_address( + Returns: + None + """ + ha_validation = { + "service_k8s_cluster_ha": validate_service_k8s_cluster_ha + } + network_spec_info = load_network_spec(input_file_path) + check_mandatory_fields(mandatory_fields, ha_data, errors) + if config_type in ha_validation: + ha_validation[config_type]( errors, config_type, - vip_address, - ha_node_vip_list, - admin_network, - admin_netmaskbits, - oim_admin_ip - ) - -# Dispatch table maps config_type to validation handler -ha_validation = { - "service_k8s_cluster_ha": validate_k8s_head_node_ha -} - + ha_data, + input_file_path, + network_spec_info, + all_service_tags, + ha_node_vip_list) def validate_high_availability_config( input_file_path, data, logger, module, omnia_base_dir, _module_utils_base, project_name @@ -490,91 +528,35 @@ def validate_high_availability_config( errors = [] ha_node_vip_list = [] all_service_tags = set() - network_spec_file_path = create_file_path(input_file_path, file_names["network_spec"]) - network_spec_json = validation_utils.load_yaml_as_json( - network_spec_file_path, omnia_base_dir, project_name, logger, module - ) - - # load roles_config for L2 validations - roles_config_json = get_roles_config_json( - input_file_path, logger, module, omnia_base_dir, project_name - ) - - network_spec_info = { - "admin_network": get_admin_static_dynamic_ranges(network_spec_json), - "admin_nic_name": get_admin_nic_name(network_spec_json), - "bmc_network": get_bmc_network(network_spec_json), - "bmc_nic_name": get_bmc_nic_name(network_spec_json), - "admin_netmaskbits": get_admin_netmaskbits(network_spec_json), - "admin_uncorrelated_node_start_ip": get_admin_uncorrelated_node_start_ip( - network_spec_json - ), - "oim_admin_ip": get_primary_oim_admin_ip(network_spec_json) - } - - # pylint: disable=too-many-branches - def validate_ha_config(ha_data, mandatory_fields, errors, config_type=None): - try: - 
check_mandatory_fields(mandatory_fields, ha_data, errors) - - if config_type in ha_validation: - ha_validation[config_type]( - errors, - config_type, - ha_data, - network_spec_info, - roles_config_json, - all_service_tags, - ha_node_vip_list, - ) - - # append all the active and passive node service tags to a set - if "active_node_service_tag" in ha_data: - all_service_tags.add(ha_data["active_node_service_tag"]) - elif "active_node_service_tags" in ha_data: - all_service_tags.update(ha_data.get("active_node_service_tags", [])) - - if "passive_nodes" in ha_data: - for node_service_tag in ha_data.get("passive_nodes", []): - all_service_tags.update(node_service_tag.get("node_service_tags", [])) - - if "virtual_ip_address" in ha_data: - ha_node_vip_list.append(ha_data["virtual_ip_address"]) - elif "admin_virtual_ip_address" in ha_data: - ha_node_vip_list.append(ha_data["admin_virtual_ip_address"]) - elif "bmc_virtual_ip_address" in ha_data: - ha_node_vip_list.append(ha_data["bmc_virtual_ip_address"]) - - except KeyError as e: - logger.error(f"Missing key in HA data: {e}") - errors.append(f"Missing key in HA data: {e}") ha_configs = [ - ("service_k8s_cluster_ha", ["virtual_ip_address", "active_node_service_tags"]) + ("service_k8s_cluster_ha", ["virtual_ip_address", "active_node_service_tags"], + "enable_k8s_ha") ] - for config_name, mandatory_fields in ha_configs: + for config_name, mandatory_fields, enable_key in ha_configs: ha_data = data.get(config_name) if ha_data: ha_data = ha_data[0] if isinstance(ha_data, list) else ha_data - enable_key = f'enable_{config_name.split("_", maxsplit=1)[0]}_ha' if ha_data.get(enable_key): - if config_name == "oim_ha": - ha_role = "oim_ha_node" # expected role to be defined in roles_config - check_and_validate_ha_role_in_roles_config(errors, roles_config_json, ha_role) - validate_ha_config(ha_data, mandatory_fields, errors, config_type=config_name) - elif config_name == "service_node_ha": - ha_role = "service_node" # expected role to 
be defined in roles_config - check_and_validate_ha_role_in_roles_config(errors, roles_config_json, ha_role) - for service_node in ha_data["service_nodes"]: - validate_ha_config( - service_node, - ["virtual_ip_address", "active_node_service_tag", "passive_nodes"], - errors, - config_type=config_name, - ) - else: - validate_ha_config(ha_data, mandatory_fields, errors, config_type=config_name) + # append all the active and passive node service tags to a set + if "active_node_service_tag" in ha_data: + all_service_tags.add(ha_data["active_node_service_tag"]) + elif "active_node_service_tags" in ha_data: + all_service_tags.update(ha_data.get("active_node_service_tags", [])) + + if "passive_nodes" in ha_data: + for node_service_tag in ha_data.get("passive_nodes", []): + all_service_tags.update(node_service_tag.get("node_service_tags", [])) + + if "admin_virtual_ip_address" in ha_data: + ha_node_vip_list.append(ha_data["admin_virtual_ip_address"]) + elif "bmc_virtual_ip_address" in ha_data: + ha_node_vip_list.append(ha_data["bmc_virtual_ip_address"]) + # oim_ha and service_node_ha has been removed + validate_ha_config(ha_data, mandatory_fields, errors, config_name, + os.path.dirname(input_file_path), + all_service_tags, ha_node_vip_list) else: logger.warning(f"Configuration for {config_name} not found.") diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 164e22f6cc..12592ccdcc 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -108,7 +108,7 @@ - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - - chmod +x /etc/slurm/epilog.d/logout_user.sh + - 
chmod {{ file_mode_755 }} /etc/slurm/epilog.d/logout_user.sh - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 8d3f6909b4..472192563c 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -135,7 +135,7 @@ - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - - chmod +x /etc/slurm/epilog.d/logout_user.sh + - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/logout_user.sh - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd diff --git a/prepare_oim/roles/deploy_containers/auth/templates/auth.j2 b/prepare_oim/roles/deploy_containers/auth/templates/auth.j2 index 65dc9f982c..b851516edb 100644 --- a/prepare_oim/roles/deploy_containers/auth/templates/auth.j2 +++ b/prepare_oim/roles/deploy_containers/auth/templates/auth.j2 @@ -11,9 +11,9 @@ PublishPort=0.0.0.0:{{ port }}:{{ port }} {% endfor %} # Mount configuration and bootstrap files (read-only, with SELinux relabel) -Volume={{ slapd_conf_dest }}:/etc/openldap/slapd.conf:ro,Z -Volume={{ bootstrap_ldif_dest }}:/container-init/bootstrap.ldif:ro,Z -Volume={{ openldap_tls_certs_directory }}:/etc/openldap/certs:ro,Z +Volume={{ slapd_conf_dest }}:/etc/openldap/slapd.conf:ro,z +Volume={{ bootstrap_ldif_dest }}:/container-init/bootstrap.ldif:ro,z +Volume={{ openldap_tls_certs_directory }}:/etc/openldap/certs:ro,z [Service] Restart=always diff --git 
a/scheduler/roles/common_plugins/tasks/pod_status.yml b/scheduler/roles/common_plugins/tasks/pod_status.yml index f8318030f6..1ac37b8ec8 100644 --- a/scheduler/roles/common_plugins/tasks/pod_status.yml +++ b/scheduler/roles/common_plugins/tasks/pod_status.yml @@ -66,6 +66,8 @@ ansible.builtin.set_fact: k8s_cluster_issue_msg: >- Cluster issues detected: Please login to kube control plane and check for more details. + After resolving the issue, make sure to rerun the playbook. + For more information, see the Troubleshooting section in the Omnia documentation. {% if k8s_status.not_ready_nodes | length > 0 %} - NotReady nodes: {{ k8s_status.not_ready_nodes }} {% endif %} diff --git a/telemetry/roles/telemetry_validation/vars/main.yml b/telemetry/roles/telemetry_validation/vars/main.yml index 989d40bca8..45318b0d3d 100644 --- a/telemetry/roles/telemetry_validation/vars/main.yml +++ b/telemetry/roles/telemetry_validation/vars/main.yml @@ -22,6 +22,10 @@ bmc_group_data_filename: "/opt/omnia/telemetry/bmc_group_data.csv" warning_telemetry_support_false: | "[WARNING] idrac_telemetry_support are false in telemetry_config.yml. Omnia does not deploy telemetry feature if none of the support category is true." +warning_bmc_group_data_file_not_updated_msg: | + "[WARNING] The following BMC IPs are missing from {{ bmc_group_data_filename }}: + {{ missing_bmc_ips | join('\n') }} + If telemetry collection required for missing IPs then re-run the playbook." telemetry_config_syntax_fail_msg: "Failed. Syntax errors present in telemetry_config.yml. Fix errors and re-run playbook again." warning_idrac_telemetry_support_false: | "[WARNING] idrac_telemetry_support is set to false in telemetry_config.yml. This means iDRAC telemetry will not be activated. 
diff --git a/utils/roles/idrac/tasks/check_prerequisites.yml b/utils/roles/idrac/tasks/check_prerequisites.yml deleted file mode 100644 index e5e20f881a..0000000000 --- a/utils/roles/idrac/tasks/check_prerequisites.yml +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Initialize variables - ansible.builtin.set_fact: - deploy_os_status: false - -- name: Show status of the Lifecycle Controller - dellemc.openmanage.idrac_lifecycle_controller_status_info: - idrac_ip: "{{ inventory_hostname }}" - idrac_user: "{{ hostvars['localhost']['bmc_username'] }}" - idrac_password: "{{ hostvars['localhost']['bmc_password'] }}" - validate_certs: false - register: lc_check_status - -- name: LC not available - ansible.builtin.fail: - msg: "{{ lc_check_fail_msg }}" - when: not lc_check_status.lc_status_info.LCReady - register: lc_fail - -- name: Get system inventory - dellemc.openmanage.idrac_system_info: - idrac_ip: "{{ inventory_hostname }}" - idrac_user: "{{ hostvars['localhost']['bmc_username'] }}" - idrac_password: "{{ hostvars['localhost']['bmc_password'] }}" - validate_certs: false - register: idrac_info diff --git a/utils/roles/idrac/tasks/configure_pxe_boot.yml b/utils/roles/idrac/tasks/configure_pxe_boot.yml deleted file mode 100644 index 4cb1b7b6e1..0000000000 --- a/utils/roles/idrac/tasks/configure_pxe_boot.yml +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright 2025 
Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Initialize active_nic - ansible.builtin.set_fact: - active_nic: [] - -- name: Set active_nic from NIC details - ansible.builtin.set_fact: - active_nic: "{{ active_nic + [idrac_info.system_info.NIC[my_idx].FQDD] }}" - with_items: "{{ idrac_info.system_info.NIC }}" - when: - - idrac_info.system_info.NIC[my_idx].LinkStatus is defined - - '"up" in idrac_info.system_info.NIC[my_idx].LinkStatus | lower' - loop_control: - index_var: my_idx - -- name: Set unique active_nic and active_nic_count - ansible.builtin.set_fact: - active_nic: "{{ active_nic | unique }}" - active_nic_count: "{{ active_nic | unique | length }}" - -- name: No active_nic present for the server - ansible.builtin.fail: - msg: "{{ active_nic_fail_msg }}" - when: active_nic_count == "0" - -- name: Configure PXE boot - block: - - name: Configure boot order for PXE booting of 1 active_nic - dellemc.openmanage.idrac_bios: - idrac_ip: "{{ inventory_hostname }}" - idrac_user: "{{ hostvars['localhost']['bmc_username'] }}" - idrac_password: "{{ hostvars['localhost']['bmc_password'] }}" - validate_certs: false - attributes: - SetBootOrderEn: NIC.PxeDevice.1-1,NIC.PxeDevice.2-1,NIC.PxeDevice.3-1,NIC.PxeDevice.4-1 - PxeDev1Interface: "{{ active_nic[0] }}" - register: deploy_os_pxe1 - until: not deploy_os_pxe1.failed - retries: "{{ retries_count }}" - when: active_nic_count == "1" - - - name: Configure 
boot order for PXE booting of 2 active_nic - dellemc.openmanage.idrac_bios: - idrac_ip: "{{ inventory_hostname }}" - idrac_user: "{{ hostvars['localhost']['bmc_username'] }}" - idrac_password: "{{ hostvars['localhost']['bmc_password'] }}" - validate_certs: false - attributes: - SetBootOrderEn: NIC.PxeDevice.1-1,NIC.PxeDevice.2-1,NIC.PxeDevice.3-1,NIC.PxeDevice.4-1 - PxeDev1Interface: "{{ active_nic[0] }}" - PxeDev2Interface: "{{ active_nic[1] }}" - register: deploy_os_pxe2 - until: not deploy_os_pxe2.failed - retries: "{{ retries_count }}" - when: active_nic_count == "2" - - - name: Configure boot order for PXE booting of 3 active_nic - dellemc.openmanage.idrac_bios: - idrac_ip: "{{ inventory_hostname }}" - idrac_user: "{{ hostvars['localhost']['bmc_username'] }}" - idrac_password: "{{ hostvars['localhost']['bmc_password'] }}" - validate_certs: false - attributes: - SetBootOrderEn: NIC.PxeDevice.1-1,NIC.PxeDevice.2-1,NIC.PxeDevice.3-1,NIC.PxeDevice.4-1 - PxeDev1Interface: "{{ active_nic[0] }}" - PxeDev2Interface: "{{ active_nic[1] }}" - PxeDev3Interface: "{{ active_nic[2] }}" - register: deploy_os_pxe3 - until: not deploy_os_pxe3.failed - retries: "{{ retries_count }}" - when: active_nic_count == "3" - - - name: Configure boot order for PXE booting of 4 active_nic - dellemc.openmanage.idrac_bios: - idrac_ip: "{{ inventory_hostname }}" - idrac_user: "{{ hostvars['localhost']['bmc_username'] }}" - idrac_password: "{{ hostvars['localhost']['bmc_password'] }}" - validate_certs: false - attributes: - SetBootOrderEn: NIC.PxeDevice.1-1,NIC.PxeDevice.2-1,NIC.PxeDevice.3-1,NIC.PxeDevice.4-1 - PxeDev1Interface: "{{ active_nic[0] }}" - PxeDev2Interface: "{{ active_nic[1] }}" - PxeDev3Interface: "{{ active_nic[2] }}" - PxeDev4Interface: "{{ active_nic[3] }}" - register: deploy_os_pxe4 - until: not deploy_os_pxe4.failed - retries: "{{ retries_count }}" - when: active_nic_count >= "4" - rescue: - - name: Retry configuring boot order - block: - - name: Retry configuring boot 
order for PXE booting - dellemc.openmanage.idrac_bios: - idrac_ip: "{{ inventory_hostname }}" - idrac_user: "{{ hostvars['localhost']['bmc_username'] }}" - idrac_password: "{{ hostvars['localhost']['bmc_password'] }}" - validate_certs: false - attributes: - SetBootOrderEn: NIC.PxeDevice.1-1,NIC.PxeDevice.2-1,NIC.PxeDevice.3-1,NIC.PxeDevice.4-1 - register: deploy_os_pxe_retry - rescue: - - name: OS provisioning failed using PXE - ansible.builtin.fail: - msg: "{{ pxe_provisioning_fail_msg }}" - always: - - name: Set deploy_os_status when provision_method == PXE for 1 active_nic - ansible.builtin.set_fact: - deploy_os_status: "{{ not deploy_os_pxe1.failed }}" - when: active_nic_count == "1" - - - name: Set deploy_os_status when provision_method == PXE for 2 active_nic - ansible.builtin.set_fact: - deploy_os_status: "{{ not deploy_os_pxe2.failed }}" - when: active_nic_count == "2" - - - name: Set deploy_os_status when provision_method == PXE for 3 active_nic - ansible.builtin.set_fact: - deploy_os_status: "{{ not deploy_os_pxe3.failed }}" - when: active_nic_count == "3" - - - name: Set deploy_os_status when provision_method == PXE for 4 active_nic - ansible.builtin.set_fact: - deploy_os_status: "{{ not deploy_os_pxe4.failed }}" - when: active_nic_count >= "4" - - - name: Set deploy_os_status when provision_method == PXE for retry - ansible.builtin.set_fact: - deploy_os_status: "{{ not deploy_os_pxe_retry.failed }}" - when: deploy_os_pxe_retry.failed is defined - -- name: Provision OS status - ansible.builtin.debug: - msg: "{{ provision_os_msg }}" - when: deploy_os_status diff --git a/utils/roles/idrac/tasks/main.yml b/utils/roles/idrac/tasks/main.yml deleted file mode 100644 index 172254514e..0000000000 --- a/utils/roles/idrac/tasks/main.yml +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Check prequisites - ansible.builtin.include_tasks: check_prerequisites.yml - -- name: Configure PXE boot - ansible.builtin.include_tasks: configure_pxe_boot.yml diff --git a/utils/roles/idrac/tasks/validate_inventory.yml b/utils/roles/idrac/tasks/validate_inventory.yml deleted file mode 100644 index 6658b8a75b..0000000000 --- a/utils/roles/idrac/tasks/validate_inventory.yml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Inventory not provided - ansible.builtin.fail: - msg: "{{ bmc_empty_inventory_fail_msg.splitlines() | join(' ') }}" - when: - - groups['all'] is defined - - (groups['all'] | length == 0) - -- name: Validate bmc group - ansible.builtin.assert: - that: "groups['bmc'] | length | int >= 1" - success_msg: "{{ bmc_validation_fail_msg }}" diff --git a/utils/roles/idrac/vars/main.yml b/utils/roles/idrac/vars/main.yml deleted file mode 100644 index 413026c19a..0000000000 --- a/utils/roles/idrac/vars/main.yml +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -# Usage: check_prerequisites.yml -lc_check_fail_msg: "Failed. LC is not ready. Retry again after LC is ready" - -# Usage: configure_pxe_boot.yml -active_nic_fail_msg: "No host active nic present for the device. Please check the host connection for the server and retry again." -provision_os_msg: "OS provisioning is initiated. Wait for installation to complete for all servers." -pxe_provisioning_fail_msg: "OS provisioning using PXE failed. This could be due to outdated NIC firmware. Re-run provision.yml after fixing the issue" -retries_count: 3 - -# Usage: validate_inventory.yml -bmc_empty_inventory_fail_msg: | - Failed. Inventory not provided. - Please re-run the playbook with an inventory that includes the groups 'bmc' by using the -i inventory option. -bmc_validation_fail_msg: "Failed. 
bmc group in inventory must have atleast one bmc ip." diff --git a/utils/roles/idrac_pxe_boot/tasks/main.yml b/utils/roles/idrac_pxe_boot/tasks/main.yml new file mode 100644 index 0000000000..0a9078f667 --- /dev/null +++ b/utils/roles/idrac_pxe_boot/tasks/main.yml @@ -0,0 +1,58 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Show status of the Lifecycle Controller + dellemc.openmanage.idrac_lifecycle_controller_status_info: + idrac_ip: "{{ inventory_hostname }}" + idrac_user: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}" + idrac_password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}" + validate_certs: false + register: lc_check_status + +- name: Check LC availability + ansible.builtin.fail: + msg: "{{ lc_check_fail_msg }}" + when: not lc_check_status.lc_status_info.LCReady + +- name: Set reboot type + ansible.builtin.set_fact: + reboot_type: "{{ 'none' if not restart_host else ('force_restart' if force_restart else 'graceful_restart') }}" + +- name: Set boot from pxe + dellemc.openmanage.idrac_boot: + idrac_ip: "{{ inventory_hostname }}" + idrac_user: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}" + idrac_password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}" + validate_certs: false + boot_source_override_mode: uefi + boot_source_override_target: "{{ boot_source_override_target }}" + 
boot_source_override_enabled: "{{ boot_source_override_enabled }}" + reset_type: "{{ reboot_type }}" + register: pxe_provisioning + ignore_errors: true + ignore_unreachable: true + +- name: OS provisioning failed using PXE + ansible.builtin.fail: + msg: "{{ pxe_provisioning_fail_msg }}" + when: pxe_provisioning is failed + +- name: IDRAC might be unreachable during OS provisioning + ansible.builtin.debug: + msg: "{{ unreachable_idrac_msg }}" + when: pxe_provisioning is unreachable + +- name: Provision OS status + ansible.builtin.debug: + msg: "{{ provision_os_msg }}" diff --git a/utils/roles/idrac_pxe_boot/vars/main.yml b/utils/roles/idrac_pxe_boot/vars/main.yml new file mode 100644 index 0000000000..bebd2b4a42 --- /dev/null +++ b/utils/roles/idrac_pxe_boot/vars/main.yml @@ -0,0 +1,32 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Change to false for not restarting host. only setting pxe_boot will happen +restart_host: true + +# Change to true for forceful reboot. by default graceful will happen +force_restart: false + +# Set boot source override mode. Valid values are once, continuous, or disabled +boot_source_override_enabled: continuous + +# Set boot source override target. Valid values are pxe,uefi_http,sd_card,uefi_target,utilities,bios_setup,hdd,cd,floppy,none +boot_source_override_target: pxe + +# Usage: main.yml +lc_check_fail_msg: "Failed. iDRAC is not ready. 
Retry again after iDRAC is ready" +provision_os_msg: "OS provisioning is initiated. Wait for installation to complete for all servers." +pxe_provisioning_fail_msg: "OS booting using PXE failed. This could be due to outdated NIC firmware. Re-run set_pxe_boot.yml after fixing the issue" +bmc_validation_fail_msg: "Failed. bmc group in inventory must have at least one bmc ip." +unreachable_idrac_msg: "iDRAC is unreachable. pxe boot might be set. Please check the host reboot status manually" diff --git a/utils/roles/node_repo_update/tasks/main.yml b/utils/roles/node_repo_update/tasks/main.yml index 4ea6bbb74d..c6d1755de1 100644 --- a/utils/roles/node_repo_update/tasks/main.yml +++ b/utils/roles/node_repo_update/tasks/main.yml @@ -30,8 +30,10 @@ - name: Set hosts dict ansible.builtin.set_fact: - hosts_dict: "{{ hosts_dict | default({}) | combine({item: hostvars[item]['ansible_default_ipv4']['address'] + ' ' - + hostvars[item]['ansible_fqdn'] + ' ' + hostvars[item]['ansible_fqdn'] | split('.') | first}) }}" + hosts_dict: "{{ hosts_dict | default({}) | + combine({item: (hostvars[item]['ansible_default_ipv4']['address'] | + default(hostvars[item]['ansible_all_ipv4_addresses'][0]) | default(hostvars[item]['inventory_hostname'])) + + ' ' + hostvars[item]['ansible_fqdn'] + ' ' + hostvars[item]['ansible_fqdn'] | split('.') | first}) }}" delegate_to: localhost run_once: true loop: "{{ ansible_play_hosts | default([]) }}" @@ -53,12 +55,11 @@ - name: Update hosts file ansible.builtin.lineinfile: dest: "{{ hosts_file_dest }}" - line: "{{ hostvars[item]['ansible_default_ipv4']['address'] }} {{ hostvars[item]['ansible_fqdn'] }} - {{ hostvars[item]['ansible_fqdn'] | split('.') | first }}" + line: "{{ item }}" state: present create: true mode: "{{ hosts_file_mode }}" - with_items: "{{ ansible_play_hosts | default([]) }}" + loop: "{{ hosts_dict.values() | default([]) }}" rescue: - name: Updating hosts file failed ansible.builtin.fail: diff --git a/utils/set_pxe_boot.yml 
b/utils/set_pxe_boot.yml index 52aaced6c5..f6cf1258c7 100644 --- a/utils/set_pxe_boot.yml +++ b/utils/set_pxe_boot.yml @@ -12,7 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. --- - +# ------------------------------------------------------------------------- +# PXE PREREQUISITES +# ------------------------------------------------------------------------- +# 1. Dell iDRAC BMCs must be reachable from the Ansible controller +# 2. PXE (Pre‑boot eXecution Environment) support – the NIC's +# firmware must implement the PXE option and must be enabled. +# 3. The `dellemc.openmanage` Ansible collection must be installed: +# ansible-galaxy collection install dellemc.openmanage +# 4. iDRAC firmware version must support the 'Boot Source Override' +# API (most modern iDRAC9/10 firmware do). +# 5. The TFTP/NFS/HTTP server that provides the PXE +# boot image must be reachable by the target nodes once the iDRAC +# is set to PXE mode. +# ------------------------------------------------------------------------- - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local @@ -27,19 +40,20 @@ - name: Invoke get_config_credentials.yml ansible.builtin.import_playbook: credential_utility/get_config_credentials.yml -- name: Validate inventory - hosts: localhost - connection: local - gather_facts: false - tasks: - - name: Validate inventory - ansible.builtin.include_role: - name: idrac - tasks_from: validate_inventory.yml - -- name: Deploy OS via idrac +# This configures Dell iDRAC BMCs to boot a host from PXE (network) and optionally reboots the server. +# This will set the boot mode to pxe +# Note: Restart will not happen if the server is powered off, only pxe mode will be set. +- name: Reboot Host via PXE hosts: bmc connection: local gather_facts: false + pre_tasks: + - name: Validate bmc group + ansible.builtin.assert: + that: groups['bmc'] | length | int >= 1 + fail_msg: "Failed. 
bmc group in inventory must have at least one bmc ip." roles: - - idrac + - role: idrac_pxe_boot + # vars: + # restart_host: false # By default restart will be true, set to false not to restart + # force_restart: true # By default graceful_restart will happen, set to true to force restart