From 43e02202880e8a0d6fe0ebcbb00d37454f2f162d Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Mon, 6 Jan 2025 19:11:13 +0100 Subject: [PATCH 01/31] WIP slurmctld config --- pyslurm/core/slurmctld.pxd | 101 ++- pyslurm/core/slurmctld.pyx | 1285 +++++++++++++++++++++++++++++++++++- pyslurm/settings.pyx | 2 +- pyslurm/slurm/extra.pxi | 19 + pyslurm/utils/cstr.pxd | 4 +- pyslurm/utils/cstr.pyx | 21 +- pyslurm/utils/helpers.pyx | 31 + 7 files changed, 1449 insertions(+), 14 deletions(-) diff --git a/pyslurm/core/slurmctld.pxd b/pyslurm/core/slurmctld.pxd index 8bafb01f..f354083c 100644 --- a/pyslurm/core/slurmctld.pxd +++ b/pyslurm/core/slurmctld.pxd @@ -1,7 +1,7 @@ ######################################################################### # slurmctld.pxd - pyslurm slurmctld api ######################################################################### -# Copyright (C) 2023 Toni Harzendorf +# Copyright (C) 2025 Toni Harzendorf # # This file is part of PySlurm # @@ -28,12 +28,111 @@ from pyslurm.slurm cimport ( slurm_load_ctl_conf, slurm_free_ctl_conf, slurm_preempt_mode_string, + slurm_accounting_enforce_string, + slurm_sprint_cpu_bind_type, + slurm_ctl_conf_2_key_pairs, + cpu_bind_type_t, try_xmalloc, + list_t, + xfree, ) from pyslurm.utils cimport cstr from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, int64_t from pyslurm.utils.uint cimport * +from pyslurm.db.util cimport ( + SlurmList, + SlurmListItem, +) + + +cdef dict _parse_config_key_pairs(void *ptr, owned=*) + + +ctypedef struct config_key_pair_t: + char *name + char *value + cdef class Config: cdef slurm_conf_t *ptr + + cdef public: + CgroupConfig cgroup_config + AccountingGatherConfig accounting_gather_config + MPIConfig mpi_config + + +cdef class MPIConfig: + + cdef public: + pmix_cli_tmp_dir_base + pmix_coll_fence + pmix_debug + pmix_direct_conn + pmix_direct_conn_early + pmix_direct_conn_ucx + pmix_direct_same_arch + pmix_environment + pmix_fence_barrier + pmix_net_devices_ucx + 
pmix_timeout + pmix_tls_ucx + + @staticmethod + cdef MPIConfig from_ptr(void *ptr) + +cdef class CgroupConfig: + + cdef public: + mountpoint + plugin + systemd_timeout + ignore_systemd + ignore_systemd_on_failure + enable_controllers + + allowed_ram_space + allowed_swap_space + constrain_cores + constrain_devices + constrain_ram_space + constrain_swap_space + max_ram_percent + max_swap_percent + memory_swappiness + min_ram_space + + signal_children_processes + + @staticmethod + cdef CgroupConfig from_ptr(void *ptr) + + +cdef class AccountingGatherConfig: + + cdef public: + energy_ipmi_frequency + energy_ipmi_calc_adjustment + energy_ipmi_power_sensors + energy_ipmi_user_name + energy_ipmi_password + energy_ipmi_timeout + + profile_hdf5_dir + profile_hdf5_default + + profile_influxdb_database + profile_influxdb_default + profile_influxdb_host + profile_influxdb_password + profile_influxdb_rtpolicy + profile_influxdb_user + profile_influxdb_timeout + + infiniband_ofed_port + + sysfs_interfaces + + @staticmethod + cdef AccountingGatherConfig from_ptr(void *ptr) diff --git a/pyslurm/core/slurmctld.pyx b/pyslurm/core/slurmctld.pyx index 7f06966e..a288aaa2 100644 --- a/pyslurm/core/slurmctld.pyx +++ b/pyslurm/core/slurmctld.pyx @@ -1,7 +1,7 @@ ######################################################################### # slurmctld.pyx - pyslurm slurmctld api ######################################################################### -# Copyright (C) 2023 Toni Harzendorf +# Copyright (C) 2025 Toni Harzendorf # # This file is part of PySlurm # @@ -23,6 +23,150 @@ # cython: language_level=3 from pyslurm.core.error import verify_rpc, RPCError +from pyslurm.utils.uint import * +from pyslurm.utils.ctime import _raw_time +from pyslurm.utils.helpers import cpu_freq_int_to_str +from pyslurm.utils.helpers import instance_to_dict +from pyslurm.utils import cstr + + +cdef class MPIConfig: + + def __init__(self): + raise RuntimeError("Cannot instantiate class directly") + + def 
to_dict(self): + """MPI config formatted as a dictionary. + + Returns: + (dict): Config as a dict + + Examples: + >>> from pyslurm import slurmctld + >>> config = slurmctld.Config.load() + >>> mpi_config = config.mpi_config.to_dict() + """ + return instance_to_dict(self) + + @staticmethod + cdef MPIConfig from_ptr(void *ptr): + cdef: + dict conf = _parse_config_key_pairs(ptr) + MPIConfig out = MPIConfig.__new__(MPIConfig) + + out.pmix_cli_tmp_dir_base = conf.get("PMIxCliTmpDirBase") + out.pmix_coll_fence = conf.get("PMIxCollFence") + out.pmix_debug = bool(int(conf.get("PMIxDebug", 0))) + out.pmix_direct_conn = _true_false_to_bool(conf.get("PMIxDirectConn", "true")) + out.pmix_direct_conn_early = _true_false_to_bool(conf.get("PMIxDirectConnEarly", "false")) + out.pmix_direct_conn_ucx = _true_false_to_bool(conf.get("PMIxDirectConnUCX", "false")) + out.pmix_direct_same_arch = _true_false_to_bool(conf.get("PMIxDirectSameArch", "false")) + out.pmix_environment = cstr.to_dict( + conf.get("PMIxEnv", ""), delim1=";", delim2="=") + out.pmix_fence_barrier = _true_false_to_bool(conf.get("PMIxFenceBarrier", "false")) + out.pmix_net_devices_ucx = conf.get("PMIxNetDevicesUCX") + out.pmix_timeout = int(conf.get("PMIxTimeout", 300)) + out.pmix_tls_ucx = cstr.to_list(conf.get("PMIxTlsUCX", "")) + + return out + + +cdef class CgroupConfig: + + def __init__(self): + raise RuntimeError("Cannot instantiate class directly") + + def to_dict(self): + """Cgroup config formatted as a dictionary. 
+ + Returns: + (dict): Config as a dict + + Examples: + >>> from pyslurm import slurmctld + >>> config = slurmctld.Config.load() + >>> cgroup_config = config.cgroup_config.to_dict() + """ + return instance_to_dict(self) + + @staticmethod + cdef CgroupConfig from_ptr(void *ptr): + cdef: + dict conf = _parse_config_key_pairs(ptr) + CgroupConfig out = CgroupConfig.__new__(CgroupConfig) + + out.mountpoint = conf.get("CgroupMountpoint", "/sys/fs/cgroup") + out.plugin = conf.get("CgroupPlugin", "autodetect") + out.systemd_timeout = int(conf.get("SystemdTimeout", 1000)) + out.ignore_systemd = _yesno_to_bool(conf.get("IgnoreSystemd")) + out.ignore_systemd_on_failure = _yesno_to_bool(conf.get("IgnoreSystemdOnFailure")) + out.enable_controllers = _yesno_to_bool(conf.get("EnableControllers")) + + out.allowed_ram_space = int(conf.get("AllowedRAMSpace", 100)) + out.allowed_swap_space = int(conf.get("AllowedSwapSpace", 0)) + out.constrain_cores = _yesno_to_bool(conf.get("ConstrainCores", "no")) + out.constrain_devices = _yesno_to_bool(conf.get("ConstrainDevices", "no")) + out.constrain_ram_space = _yesno_to_bool(conf.get("ConstrainRAMSpace", "no")) + out.constrain_swap_space = _yesno_to_bool(conf.get("ConstrainSwapSpace", "no")) + out.max_ram_percent = int(conf.get("MaxRAMPercent", 100)) + out.max_swap_percent = int(conf.get("MaxSwapPercent", 100)) + out.memory_swappiness = int(conf.get("MemorySwappiness", -1)) + out.min_ram_space = int(conf.get("MinRAMSpace", 30*1024)) + + out.signal_children_processes = _yesno_to_bool(conf.get("SignalChildrenProcesses", "no")) + + return out + +cdef class AccountingGatherConfig: + + def __init__(self): + raise RuntimeError("Cannot instantiate class directly") + + def to_dict(self): + """AccountingGather config formatted as a dictionary. 
+ + Returns: + (dict): Config as a dict + + Examples: + >>> from pyslurm import slurmctld + >>> config = slurmctld.Config.load() + >>> acctg_config_dict = config.accounting_gather_config.to_dict() + """ + return instance_to_dict(self) + + @staticmethod + cdef AccountingGatherConfig from_ptr(void *ptr): + cdef: + dict conf = _parse_config_key_pairs(ptr) + AccountingGatherConfig out = AccountingGatherConfig.__new__(AccountingGatherConfig) + + out.energy_ipmi_frequency = int(conf.get("EnergyIPMIFrequency", 30)) + out.energy_ipmi_calc_adjustment = _yesno_to_bool( + conf.get("EnergyIPMICalcAdjustment")) + + # TODO: dict + out.energy_ipmi_power_sensors = conf.get("EnergyIPMIPowerSensors") + + out.energy_ipmi_user_name = conf.get("EnergyIPMIUsername") + out.energy_ipmi_password = conf.get("EnergyIPMIPassword") + out.energy_ipmi_timeout = int(conf.get("EnergyIPMITimeout", 10)) + + out.profile_hdf5_dir = conf.get("ProfileHDF5Dir") + out.profile_hdf5_default = conf.get("ProfileHDF5Default", "").split(",") + + out.profile_influxdb_database = conf.get("ProfileInfluxDBDatabase") + out.profile_influxdb_default = conf.get("ProfileInfluxDBDefault", "").split(",") + out.profile_influxdb_host = conf.get("ProfileInfluxDBHost") + out.profile_influxdb_password = conf.get("ProfileInfluxDBPass") + out.profile_influxdb_rtpolicy = conf.get("ProfileInfluxDBRTPolicy") + out.profile_influxdb_user = conf.get("ProfileInfluxDBUser") + out.profile_influxdb_timeout = int(conf.get("ProfileInfluxDBTimeout", 10)) + + out.infiniband_ofed_port = int(conf.get("InfinibandOFEDPort", 1)) + out.sysfs_interfaces = conf.get("SysfsInterfaces", []) + + return out cdef class Config: @@ -37,26 +181,1147 @@ cdef class Config: slurm_free_ctl_conf(self.ptr) self.ptr = NULL + @staticmethod + def load_scontrol(): + cdef Config conf = Config.__new__(Config) + verify_rpc(slurm_load_ctl_conf(0, &conf.ptr)) + + out = _parse_config_key_pairs(slurm_ctl_conf_2_key_pairs(conf.ptr), + owned=True) + 
out["CgroupSupportConfiguration"] = _parse_config_key_pairs( + conf.ptr.cgroup_conf) + out["AccountingGatherConfiguration"] = _parse_config_key_pairs( + conf.ptr.acct_gather_conf) + out["MPIPluginsConfiguration"] = _parse_config_key_pairs( + conf.ptr.mpi_conf) + + return out + @staticmethod def load(): cdef Config conf = Config.__new__(Config) verify_rpc(slurm_load_ctl_conf(0, &conf.ptr)) + + conf.cgroup_config = CgroupConfig.from_ptr(conf.ptr.cgroup_conf) + conf.accounting_gather_config = AccountingGatherConfig.from_ptr( + conf.ptr.acct_gather_conf) + conf.mpi_config = MPIConfig.from_ptr(conf.ptr.mpi_conf) + return conf - + + def to_dict(self): + """Slurmctld config formatted as a dictionary. + + Returns: + (dict): slurmctld config as a dict + + Examples: + >>> import pyslurm + >>> config = pyslurm.slurmctld.Config.load() + >>> config_dict = config.to_dict() + """ + out = instance_to_dict(self) + out["cgroup_config"] = self.cgroup_config.to_dict() + out["accounting_gather_config"] = self.accounting_gather_config.to_dict() + out["mpi_config"] = self.mpi_config.to_dict() + return out + + @property + def accounting_storage_tres(self): + return cstr.to_list(self.ptr.accounting_storage_tres) + + @property + def accounting_storage_enforce(self): + cdef char tmp[128] + slurm_accounting_enforce_string(self.ptr.accounting_storage_enforce, + tmp, sizeof(tmp)) + out = cstr.to_unicode(tmp) + if not out or out == "none": + return [] + + return out.upper().split(",") + + @property + def accounting_storage_backup_host(self): + return cstr.to_unicode(self.ptr.accounting_storage_backup_host) + + @property + def accounting_storage_external_hosts(self): + return cstr.to_list(self.ptr.accounting_storage_ext_host) + + @property + def accounting_storage_host(self): + return cstr.to_unicode(self.ptr.accounting_storage_host) + @property - def cluster(self): + def accounting_storage_parameters(self): + return cstr.to_dict(self.ptr.accounting_storage_params) + + @property + def 
accounting_storage_password(self): + return cstr.to_unicode(self.ptr.accounting_storage_pass) + + @property + def accounting_storage_port(self): + return u16_parse(self.ptr.accounting_storage_port) + + @property + def accounting_storage_type(self): + return cstr.to_unicode(self.ptr.accounting_storage_type) + + @property + def accounting_storage_user(self): + return cstr.to_unicode(self.ptr.accounting_storage_user) + + @property + def accounting_store_flags(self): + return _acct_store_flags_int_to_str(self.ptr.conf_flags) + + @property + def accounting_gather_node_frequency(self): + return u16_parse(self.ptr.acct_gather_node_freq) + + @property + def accounting_gather_energy_type(self): + return cstr.to_unicode(self.ptr.acct_gather_energy_type) + + @property + def accounting_gather_interconnect_type(self): + return cstr.to_unicode(self.ptr.acct_gather_interconnect_type) + + @property + def accounting_gather_filesystem_type(self): + return cstr.to_unicode(self.ptr.acct_gather_filesystem_type) + + @property + def accounting_gather_profile_type(self): + return cstr.to_unicode(self.ptr.acct_gather_profile_type) + + @property + def allow_spec_resource_usage(self): + if self.ptr.conf_flags & slurm.CONF_FLAG_ASRU: + return True + + return False + + @property + def auth_alt_types(self): + return cstr.to_list(self.ptr.authalttypes) + + @property + def auth_info(self): + return cstr.to_list(self.ptr.authinfo) + + @property + def auth_alt_params(self): + # TODO: maybe dict? 
+ return cstr.to_list(self.ptr.authalt_params) + + @property + def auth_type(self): + return cstr.to_unicode(self.ptr.authtype) + + @property + def batch_start_timeout(self): + # seconds + return u16_parse(self.ptr.batch_start_timeout) + + @property + def bcast_exclude_paths(self): + return cstr.to_list(self.ptr.bcast_exclude) + + @property + def bcast_parameters(self): + return cstr.to_list(self.ptr.bcast_parameters) + + @property + def burst_buffer_type(self): + return cstr.to_unicode(self.ptr.bb_type) + + @property + def boot_time(self): + return _raw_time(self.ptr.boot_time) + + @property + def certmgr_parameters(self): + return cstr.to_list(self.ptr.certmgr_params) + + @property + def certmgr_type(self): + return cstr.to_unicode(self.ptr.certmgr_type) + + @property + def cli_filter_plugins(self): + return cstr.to_list(self.ptr.cli_filter_plugins) + + @property + def cluster_name(self): return cstr.to_unicode(self.ptr.cluster_name) @property - def preempt_mode(self): - cdef char *tmp = slurm_preempt_mode_string(self.ptr.preempt_mode) - return cstr.to_unicode(tmp) + def communication_parameters(self): + return cstr.to_list(self.ptr.comm_params) @property - def suspend_program(self): - return cstr.to_unicode(self.ptr.suspend_program) + def complete_wait_time(self): + # seconds + return u16_parse(self.ptr.complete_wait) @property - def resume_program(self): - return cstr.to_unicode(self.ptr.resume_program) + def disable_root_jobs(self): + if self.ptr.conf_flags & slurm.CONF_FLAG_DRJ: + return True + return False + + @property + def default_cpu_frequency(self): + return cpu_freq_int_to_str(self.ptr.cpu_freq_def) + + @property + def cpu_frequency_governors(self): + return cpu_freq_int_to_str(self.ptr.cpu_freq_govs) + + @property + def credential_type(self): + return cstr.to_unicode(self.ptr.cred_type) + + @property + def data_parser_parameters(self): + return cstr.to_unicode(self.ptr.data_parser_parameters) + + @property + def debug_flags(self): + return 
_debug_flags_int_to_list(self.ptr.debug_flags) + + @property + def default_memory_per_cpu(self): + return _get_memory(self.ptr.def_mem_per_cpu, per_cpu=True) + + @property + def default_memory_per_node(self): + return _get_memory(self.ptr.def_mem_per_cpu, per_cpu=False) + + @property + def dependency_parameters(self): + return cstr.to_list(self.ptr.dependency_params) + + @property + def eio_timeout(self): + # seconds + return u16_parse(self.ptr.eio_timeout) + + @property + def enforce_partition_limits(self): + return _enforce_part_limits_int_to_str(self.ptr.enforce_part_limits) + + @property + def epilog(self): + return cstr.to_list_with_count(self.ptr.epilog, + self.ptr.epilog_cnt) + + @property + def epilog_msg_time(self): + # ms + return u32_parse(self.ptr.epilog_msg_time) + + @property + def epilog_slurmctld(self): + return cstr.to_list_with_count(self.ptr.epilog_slurmctld, + self.ptr.epilog_slurmctld_cnt) + +# @property +# def external_sensors_type(self): +# return cstr.to_unicode(self.ptr.ext_sensors_type) + +# @property +# def external_sensors_frequency(self): +# return u16_parse(self.ptr.ext_sensors_freq) + + # TODO: void *ext_sensors_conf put into own class? + + @property + def federation_parameters(self): + return cstr.to_list(self.ptr.fed_params) + + @property + def first_job_id(self): + return u32_parse(self.ptr.first_job_id) + + @property + def fair_share_dampening_factor(self): + return u16_parse(self.ptr.fs_dampening_factor) + + # getnameinfo_cache_timeout + + @property + def get_environment_timeout(self): + return u16_parse(self.ptr.get_env_timeout) + + @property + def gres_types(self): + return cstr.to_list(self.ptr.gres_plugins) + + @property + def group_update_time(self): + return u16_parse(self.ptr.group_time) + + @property + def group_update_force(self): + # TODO: maybe bool? 
+ return u16_parse_bool(self.ptr.group_force) + + @property + def default_gpu_frequency(self): + return cstr.to_unicode(self.ptr.gpu_freq_def) + + @property + def hash_plugin(self): + return cstr.to_unicode(self.ptr.hash_plugin) + + @property + def hash_value(self): + val = u32_parse(self.ptr.hash_val) + if not val: + return None + + return hex(val) + + @property + def health_check_interval(self): + return u16_parse(self.ptr.health_check_interval) + + @property + def health_check_node_state(self): + return _health_check_node_state_int_to_list( + self.ptr.health_check_node_state) + + @property + def health_check_program(self): + return cstr.to_unicode(self.ptr.health_check_program) + + @property + def inactive_limit(self): + # seconds + return u16_parse(self.ptr.inactive_limit) + + @property + def interactive_step_options(self): + return cstr.to_unicode(self.ptr.interactive_step_opts) + + @property + def job_accounting_gather_frequency(self): + return cstr.to_dict(self.ptr.job_acct_gather_freq) + + @property + def job_accounting_gather_type(self): + return cstr.to_unicode(self.ptr.job_acct_gather_type) + + @property + def job_accounting_gather_parameters(self): + return cstr.to_list(self.ptr.job_acct_gather_params) + + @property + def job_completion_host(self): + return cstr.to_unicode(self.ptr.job_comp_host) + + @property + def job_completion_location(self): + return cstr.to_unicode(self.ptr.job_comp_loc) + + @property + def job_completion_parameters(self): + return cstr.to_list(self.ptr.job_comp_params) + +# @property +# def job_completion_password(self): +# return cstr.to_unicode(self.ptr.job_comp_pass) + + @property + def job_completion_port(self): + return u32_parse(self.ptr.job_comp_port) + + @property + def job_completion_type(self): + return cstr.to_unicode(self.ptr.job_comp_type) + + @property + def job_completion_user(self): + return cstr.to_unicode(self.ptr.job_comp_user) + + @property + def job_container_type(self): + return 
cstr.to_unicode(self.ptr.job_container_plugin) + + @property + def job_defaults(self): + cdef char *data = slurm.job_defaults_str(self.ptr.job_defaults_list) + out = cstr.to_dict(data) + xfree(data) + return out + + @property + def job_file_append(self): + return u16_parse_bool(self.ptr.job_file_append) + + @property + def job_requeue(self): + return u16_parse_bool(self.ptr.job_requeue) + + @property + def job_submit_plugins(self): + return cstr.to_list(self.ptr.job_submit_plugins) + + @property + def keepalive_interval(self): + return u32_parse(self.ptr.keepalive_interval) + + @property + def kill_on_bad_exit(self): + return u16_parse_bool(self.ptr.kill_on_bad_exit) + + @property + def kill_wait(self): + # seconds + return u16_parse(self.ptr.kill_wait) + + @property + def launch_parameters(self): + return cstr.to_list(self.ptr.launch_params) + + @property + def licenses(self): + return cstr.to_dict(self.ptr.licenses, delim1=",", + delim2=":", def_value=1) + + @property + def log_time_format(self): + return _log_fmt_int_to_str(self.ptr.log_fmt) + + @property + def mail_domain(self): + return cstr.to_unicode(self.ptr.mail_domain) + + @property + def mail_program(self): + return cstr.to_unicode(self.ptr.mail_prog) + + @property + def max_array_size(self): + return u32_parse(self.ptr.max_array_sz) + + @property + def max_batch_requeue(self): + return u32_parse(self.ptr.max_batch_requeue) + + @property + def max_dbd_msgs(self): + return u32_parse(self.ptr.max_dbd_msgs) + + @property + def max_job_count(self): + return u32_parse(self.ptr.max_job_cnt) + + @property + def max_job_id(self): + return u32_parse(self.ptr.max_job_id) + + @property + def max_memory_per_cpu(self): + return _get_memory(self.ptr.max_mem_per_cpu, per_cpu=True) + + @property + def max_memory_per_node(self): + return _get_memory(self.ptr.max_mem_per_cpu, per_cpu=False) + + @property + def max_node_count(self): + return u32_parse(self.ptr.max_node_cnt) + + @property + def max_step_count(self): + 
return u32_parse(self.ptr.max_step_cnt) + + @property + def max_tasks_per_node(self): + return u32_parse(self.ptr.max_tasks_per_node) + + @property + def mcs_plugin(self): + return cstr.to_unicode(self.ptr.mcs_plugin) + + @property + def mcs_parameters(self): + return cstr.to_list(self.ptr.mcs_plugin_params) + + @property + def min_job_age(self): + return u32_parse(self.ptr.min_job_age) + + @property + def mpi_default(self): + return cstr.to_unicode(self.ptr.mpi_default) + + @property + def mpi_parameters(self): + return cstr.to_list(self.ptr.mpi_params) + + @property + def message_timeout(self): + return u16_parse(self.ptr.msg_timeout) + + @property + def next_job_id(self): + return u32_parse(self.ptr.next_job_id) + + # TODO: void *node_features_conf put into own class? + + @property + def node_features_plugins(self): + return cstr.to_list(self.ptr.node_features_plugins) + + @property + def over_time_limit(self): + return u16_parse(self.ptr.over_time_limit) + + @property + def plugin_path(self): + # TODO: maybe list + return cstr.to_unicode(self.ptr.plugindir) + + @property + def plugin_stack_config(self): + return cstr.to_unicode(self.ptr.plugstack) + +# @property +# def power_parameters(self): +# return cstr.to_list(self.ptr.power_parameters) + +# @property +# def power_plugin(self): +# return cstr.to_unicode(self.ptr.power_plugin) + + @property + def preempt_exempt_time(self): + # seconds? 
+ return _raw_time(self.ptr.preempt_exempt_time) + + @property + def preempt_mode(self): + cdef char *tmp = slurm_preempt_mode_string(self.ptr.preempt_mode) + return cstr.to_unicode(tmp) + + @property + def preempt_parameters(self): + return cstr.to_list(self.ptr.preempt_params) + + @property + def preempt_type(self): + return cstr.to_unicode(self.ptr.preempt_type) + + @property + def prep_parameters(self): + return cstr.to_list(self.ptr.prep_params) + + @property + def prep_plugins(self): + return cstr.to_list(self.ptr.prep_plugins) + + @property + def priority_decay_half_life(self): + # seconds + return u32_parse(self.ptr.priority_decay_hl) + + @property + def priority_calc_period(self): + # seconds + return u32_parse(self.ptr.priority_calc_period) + + @property + def priority_favor_small(self): + return u16_parse_bool(self.ptr.priority_favor_small) + + @property + def priority_flags(self): + return _priority_flags_int_to_list(self.ptr.priority_flags) + + @property + def priority_max_age(self): + # seconds? 
+ return u32_parse(self.ptr.priority_max_age) + + @property + def priority_parameters(self): + return cstr.to_unicode(self.ptr.priority_params) + + @property + def priority_usage_reset_period(self): + return _priority_reset_int_to_str(self.ptr.priority_reset_period) + + @property + def priority_type(self): + return cstr.to_unicode(self.ptr.priority_type) + + @property + def priority_weight_age(self): + return u32_parse(self.ptr.priority_weight_age) + + @property + def priority_weight_assoc(self): + return u32_parse(self.ptr.priority_weight_assoc) + + @property + def priority_weight_fair_share(self): + return u32_parse(self.ptr.priority_weight_fs) + + @property + def priority_weight_job_size(self): + return u32_parse(self.ptr.priority_weight_js) + + @property + def priority_weight_partition(self): + return u32_parse(self.ptr.priority_weight_part) + + @property + def priority_weight_qos(self): + return u32_parse(self.ptr.priority_weight_qos) + + @property + def priority_weight_tres(self): + return cstr.to_dict(self.ptr.priority_weight_tres) + + @property + def private_data(self): + return _private_data_int_to_list(self.ptr.private_data) + + @property + def proctrack_type(self): + return cstr.to_unicode(self.ptr.proctrack_type) + + @property + def prolog(self): + return cstr.to_list_with_count(self.ptr.prolog, + self.ptr.prolog_cnt) + + @property + def prolog_epilog_timeout(self): + # seconds + return u16_parse(self.ptr.prolog_epilog_timeout) + + @property + def prolog_slurmctld(self): + return cstr.to_list_with_count(self.ptr.prolog_slurmctld, + self.ptr.prolog_slurmctld_cnt) + + @property + def propagate_prio_process(self): + return u16_parse(self.ptr.propagate_prio_process, zero_is_noval=False) + + @property + def prolog_flags(self): + return _prolog_flags_int_to_list(self.ptr.prolog_flags) + + @property + def propagate_resource_limits(self): + return cstr.to_list(self.ptr.propagate_rlimits) + + @property + def propagate_resource_limits_except(self): + return 
cstr.to_list(self.ptr.propagate_rlimits_except) + + @property + def reboot_program(self): + return cstr.to_unicode(self.ptr.reboot_program) + + @property + def reconfig_flags(self): + return _reconfig_flags_int_to_list(self.ptr.reconfig_flags) + + @property + def requeue_exit(self): + return cstr.to_unicode(self.ptr.requeue_exit) + + @property + def requeue_exit_hold(self): + return cstr.to_unicode(self.ptr.requeue_exit_hold) + + @property + def resume_fail_program(self): + return cstr.to_unicode(self.ptr.resume_fail_program) + + @property + def resume_program(self): + return cstr.to_unicode(self.ptr.resume_program) + + @property + def resume_rate(self): + # minutes? + return u16_parse(self.ptr.resume_rate) + + @property + def resume_timeout(self): + # seconds + return u16_parse(self.ptr.resume_timeout) + + @property + def reservation_epilog(self): + return cstr.to_unicode(self.ptr.resv_epilog) + + @property + def reservation_over_run(self): + # minutes + return u16_parse(self.ptr.resv_over_run) + + @property + def reservation_prolog(self): + return cstr.to_unicode(self.ptr.resv_prolog) + + @property + def return_to_service(self): + return u16_parse(self.ptr.ret2service, zero_is_noval=False) + + @property + def scheduler_log_file(self): + return cstr.to_unicode(self.ptr.sched_logfile) + + @property + def scheduler_log_level(self): + return u16_parse(self.ptr.sched_log_level, zero_is_noval=False) + + @property + def scheduler_parameters(self): + return cstr.to_list(self.ptr.sched_params) + + @property + def scheduler_time_slice(self): + # seconds + return u16_parse(self.ptr.sched_time_slice) + + @property + def scheduler_type(self): + return cstr.to_unicode(self.ptr.schedtype) + + @property + def scron_parameters(self): + return cstr.to_list(self.ptr.scron_params) + + @property + def select_type(self): + return cstr.to_unicode(self.ptr.select_type) + + @property + def select_type_parameters(self): + cdef char *tmp = 
slurm.select_type_param_string(self.ptr.select_type_param) + return cstr.to_list(tmp) + + @property + def priority_site_factor_plugin(self): + return cstr.to_unicode(self.ptr.site_factor_plugin) + + @property + def priority_site_factor_parameters(self): + return cstr.to_unicode(self.ptr.site_factor_params) + + @property + def slurm_conf_path(self): + return cstr.to_unicode(self.ptr.slurm_conf) + + @property + def slurm_user_id(self): + return self.ptr.slurm_user_id + + @property + def slurm_user_name(self): + return cstr.to_unicode(self.ptr.slurm_user_name) + + @property + def slurmd_user_id(self): + return self.ptr.slurmd_user_id + + @property + def slurmd_user_name(self): + return cstr.to_unicode(self.ptr.slurmd_user_name) + + # TODO: char *slurmctld_addr + + @property + def slurmctld_log_level(self): + return _log_level_int_to_str(self.ptr.slurmctld_debug) + + @property + def slurmctld_log_file(self): + return cstr.to_unicode(self.ptr.slurmctld_logfile) + + @property + def slurmctld_pid_file(self): + return cstr.to_unicode(self.ptr.slurmctld_pidfile) + + @property + def slurmctld_port(self): + port = self.ptr.slurmctld_port + if self.ptr.slurmctld_port_count > 1: + # Slurmctld port can be a range actually, calculated by using the + # number of ports in use that slurm conf reports for slurmctld + last_port = port + self.ptr.slurmctld_port_count - 1 + port = f"{port}-{last_port}" + + return str(port) + + @property + def slurmctld_primary_off_program(self): + return cstr.to_unicode(self.ptr.slurmctld_primary_off_prog) + + @property + def slurmctld_primary_on_program(self): + return cstr.to_unicode(self.ptr.slurmctld_primary_on_prog) + + @property + def slurmctld_syslog_level(self): + return _log_level_int_to_str(self.ptr.slurmctld_syslog_debug) + + @property + def slurmctld_timeout(self): + # seconds + return u16_parse(self.ptr.slurmctld_timeout) + + @property + def slurmctld_parameters(self): + return cstr.to_list(self.ptr.slurmctld_params) + + @property + def 
slurmd_log_level(self): + return _log_level_int_to_str(self.ptr.slurmd_debug) + + @property + def slurmd_log_file(self): + return cstr.to_unicode(self.ptr.slurmd_logfile) + + @property + def slurmd_parameters(self): + return cstr.to_list(self.ptr.slurmd_params) + + @property + def slurmd_pid_file(self): + return cstr.to_unicode(self.ptr.slurmd_pidfile) + + @property + def slurmd_port(self): + return self.ptr.slurmd_port + + @property + def slurmd_spool_directory(self): + return cstr.to_unicode(self.ptr.slurmd_spooldir) + + @property + def slurmd_syslog_debug_level(self): + return _log_level_int_to_str(self.ptr.slurmd_syslog_debug) + + @property + def slurmd_timeout(self): + return u16_parse(self.ptr.slurmd_timeout) + + @property + def srun_epilog(self): + return cstr.to_unicode(self.ptr.srun_epilog) + + @property + def srun_port_range(self): + if not self.ptr.srun_port_range: + return None + + low = self.ptr.srun_port_range[0] + high = self.ptr.srun_port_range[1] + return f"{low}-{high}" + + @property + def srun_prolog(self): + return cstr.to_unicode(self.ptr.srun_prolog) + + @property + def state_save_location(self): + return cstr.to_unicode(self.ptr.state_save_location) + + @property + def suspend_exclude_nodes(self): + return cstr.to_unicode(self.ptr.suspend_exc_nodes) + + @property + def suspend_exclude_partitions(self): + return cstr.to_list(self.ptr.suspend_exc_parts) + + @property + def suspend_exclude_states(self): + return cstr.to_list(self.ptr.suspend_exc_states) + + @property + def suspend_program(self): + return cstr.to_unicode(self.ptr.suspend_program) + + @property + def suspend_rate(self): + return u16_parse(self.ptr.suspend_rate) + + @property + def suspend_time(self): + return u32_parse(self.ptr.suspend_time) + + @property + def suspend_timeout(self): + return u16_parse(self.ptr.suspend_timeout) + + @property + def switch_type(self): + return cstr.to_unicode(self.ptr.switch_type) + + @property + def switch_parameters(self): + return 
cstr.to_list(self.ptr.switch_param) + + @property + def task_epilog(self): + return cstr.to_unicode(self.ptr.task_epilog) + + @property + def task_plugin(self): + return cstr.to_unicode(self.ptr.task_plugin) + + @property + def task_plugin_parameters(self): + cdef char cpu_bind[256] + slurm_sprint_cpu_bind_type(cpu_bind, + self.ptr.task_plugin_param) + if cpu_bind == "(null type)": + return [] + + return cstr.to_list(cpu_bind) + + @property + def task_prolog(self): + return cstr.to_unicode(self.ptr.task_prolog) + + @property + def tls_parameters(self): + return cstr.to_list(self.ptr.tls_params) + + @property + def tls_type(self): + return cstr.to_unicode(self.ptr.tls_type) + + @property + def tcp_timeout(self): + return u16_parse(self.ptr.tcp_timeout) + + @property + def temporary_filesystem(self): + return cstr.to_unicode(self.ptr.tmp_fs) + + @property + def topology_parameters(self): + return cstr.to_list(self.ptr.topology_param) + + @property + def topology_plugin(self): + return cstr.to_unicode(self.ptr.topology_plugin) + + @property + def tree_width(self): + return u16_parse(self.ptr.tree_width) + + @property + def unkillable_step_program(self): + return cstr.to_unicode(self.ptr.unkillable_program) + + @property + def unkillable_step_timeout(self): + return u16_parse(self.ptr.unkillable_timeout) + + @property + def track_wckey(self): + if self.ptr.conf_flags & slurm.CONF_FLAG_WCKEY: + return True + return False + + @property + def use_pam(self): + if self.ptr.conf_flags & slurm.CONF_FLAG_PAM: + return True + return False + + @property + def version(self): + return cstr.to_unicode(self.ptr.version) + + @property + def virtual_memory_size_factor(self): + return u16_parse(self.ptr.vsize_factor) + + @property + def default_job_wait_time(self): + return u16_parse(self.ptr.wait_time) + + @property + def x11_parameters(self): + return cstr.to_unicode(self.ptr.x11_params) + + +def _str_to_bool(val, true_str, false_str): + if not val: + return False + + v = val.lower() 
+ if v == true_str: + return True + elif v == false_str: + return False + else: + return False + + +def _yesno_to_bool(val): + return _str_to_bool(val, "yes", "no") + + +def _true_false_to_bool(val): + return _str_to_bool(val, "true", "false") + + +cdef dict _parse_config_key_pairs(void *ptr, owned=False): + cdef: + SlurmList conf = SlurmList.wrap(ptr, owned=owned) + SlurmListItem item + config_key_pair_t *key_pair + dict out = {} + + for item in conf: + key_pair = item.data + name = cstr.to_unicode(key_pair.name) + val = cstr.to_unicode(key_pair.value) + out[name] = val + + return out + + +def _debug_flags_int_to_list(flags): + cdef char *data = slurm.debug_flags2str(flags) + return cstr.to_list_free(&data) + + +def _debug_flags_str_to_int(flags): + pass + + +# https://github.com/SchedMD/slurm/blob/01a3aac7c59c9b32a9dd4e395aa5a97a8aea4f08/slurm/slurm.h#L621 +def _enforce_part_limits_int_to_str(limits): + cdef char* data = slurm.parse_part_enforce_type_2str(limits) + return cstr.to_unicode(data) + + +# https://github.com/SchedMD/slurm/blob/01a3aac7c59c9b32a9dd4e395aa5a97a8aea4f08/slurm/slurm.h#L2741 +def _health_check_node_state_int_to_list(state): + cdef char *data = slurm.health_check_node_state_str(state) + return cstr.to_list_free(&data) + + +def _log_fmt_int_to_str(flag): + if flag == slurm.LOG_FMT_ISO8601_MS: + return "iso8601_ms" + elif flag == slurm.LOG_FMT_ISO8601: + return "iso8601" + elif flag == slurm.LOG_FMT_RFC5424_MS: + return "rfc5424_ms" + elif flag == slurm.LOG_FMT_RFC5424: + return "rfc5424" + elif flag == slurm.LOG_FMT_CLOCK: + return "clock" + elif flag == slurm.LOG_FMT_SHORT: + return "short" + elif flag == slurm.LOG_FMT_THREAD_ID: + return "thread_id" + elif flag == slurm.LOG_FMT_RFC3339: + return "rfc3339" + else: + return None + + +def _priority_flags_int_to_list(flags): + cdef char *data = slurm.priority_flags_string(flags) + return cstr.to_list_free(&data) + + +def _priority_reset_int_to_str(flag): + if flag == slurm.PRIORITY_RESET_NONE: 
+ return None + elif flag == slurm.PRIORITY_RESET_NOW: + return "NOW" + elif flag == slurm.PRIORITY_RESET_DAILY: + return "DAILY" + elif flag == slurm.PRIORITY_RESET_WEEKLY: + return "WEEKLY" + elif flag == slurm.PRIORITY_RESET_MONTHLY: + return "MONTHLY" + elif flag == slurm.PRIORITY_RESET_QUARTERLY: + return "QUARTERLY" + elif flag == slurm.PRIORITY_RESET_YEARLY: + return "YEARLY" + else: + return None + + +def _private_data_int_to_list(flags): + cdef char tmp[128] + slurm.private_data_string(flags, tmp, sizeof(tmp)) + out = cstr.to_unicode(tmp) + if not out or out == "none": + return [] + + return out.split(",") + + +def _prolog_flags_int_to_list(flags): + cdef char *data = slurm.prolog_flags2str(flags) + return cstr.to_list_free(&data) + + +def _reconfig_flags_int_to_list(flags): + cdef char *tmp = slurm.reconfig_flags2str(flags) + return cstr.to_list_free(&tmp) + + +def _log_level_int_to_str(flags): + data = cstr.to_unicode(slurm.log_num2string(flags)) + if data == "(null)": + return None + else: + return data + +def _acct_store_flags_int_to_str(flags): + cdef list out = [] + + if flags & slurm.CONF_FLAG_SJC: + out.append("JOB_COMMENT") + if flags & slurm.CONF_FLAG_SJE: + out.append("JOB_ENV") + if flags & slurm.CONF_FLAG_SJX: + out.append("JOB_EXTRA") + if flags & slurm.CONF_FLAG_SJS: + out.append("JOB_SCRIPT") + if flags & slurm.CONF_FLAG_NO_STDIO: + out.append("NO_STDIO") + + return out + +def _get_memory(value, per_cpu): + if value != slurm.NO_VAL64: + if value & slurm.MEM_PER_CPU and per_cpu: + if value == slurm.MEM_PER_CPU: + return UNLIMITED + return u64_parse(value & (~slurm.MEM_PER_CPU)) + + # For these values, Slurm interprets 0 as being equal to + # INFINITE/UNLIMITED + elif value == 0 and not per_cpu: + return UNLIMITED + + elif not value & slurm.MEM_PER_CPU and not per_cpu: + return u64_parse(value) + return None diff --git a/pyslurm/settings.pyx b/pyslurm/settings.pyx index 5085a9f5..36b834c2 100644 --- a/pyslurm/settings.pyx +++ 
b/pyslurm/settings.pyx @@ -30,4 +30,4 @@ from pyslurm.utils cimport cstr LOCAL_CLUSTER = cstr.to_unicode(slurm.slurm_conf.cluster_name) if not LOCAL_CLUSTER: slurm_conf = slurmctld.Config.load() - LOCAL_CLUSTER = slurm_conf.cluster + LOCAL_CLUSTER = slurm_conf.cluster_name diff --git a/pyslurm/slurm/extra.pxi b/pyslurm/slurm/extra.pxi index bab83890..c686f9fd 100644 --- a/pyslurm/slurm/extra.pxi +++ b/pyslurm/slurm/extra.pxi @@ -259,6 +259,7 @@ cdef extern int slurm_addto_step_list(list_t *step_list, char *names) cdef extern int slurmdb_report_set_start_end_time(time_t *start, time_t *end) cdef extern uint16_t slurm_get_track_wckey() cdef extern void slurm_sprint_cpu_bind_type(char *str, cpu_bind_type_t cpu_bind_type) +cdef extern void slurm_accounting_enforce_string(uint16_t enforce, char *str, int str_len) # Slurm bit functions @@ -300,3 +301,21 @@ cdef extern void slurmdb_init_tres_cond(slurmdb_tres_cond_t *tres, bool free_it) cdef extern void slurm_free_update_part_msg(update_part_msg_t *msg) cdef extern void slurm_free_partition_info_members(partition_info_t *node) + +# +# Slurmctld stuff +# + +cdef extern char *debug_flags2str(uint64_t debug_flags) +cdef extern int debug_str2flags(const char* debug_flags, uint64_t *flags_out) +cdef extern char *parse_part_enforce_type_2str(uint16_t type) +cdef extern char *health_check_node_state_str(uint32_t node_state) +cdef extern char *priority_flags_string(uint16_t priority_flags) +cdef extern char* prolog_flags2str(uint16_t prolog_flags) +cdef extern uint16_t prolog_str2flags(char *prolog_flags) +cdef extern char *log_num2string(uint16_t inx) +cdef extern char *private_data_string(uint16_t private_data, char *str, int str_len) +cdef extern char *reconfig_flags2str(uint16_t reconfig_flags) +cdef extern uint16_t reconfig_str2flags(char *reconfig_flags) +cdef extern char *select_type_param_string(uint16_t select_type_param) +cdef extern char *job_defaults_str(list_t *in_list) diff --git a/pyslurm/utils/cstr.pxd 
b/pyslurm/utils/cstr.pxd index e8014a5f..eca6f25e 100644 --- a/pyslurm/utils/cstr.pxd +++ b/pyslurm/utils/cstr.pxd @@ -32,8 +32,10 @@ cdef fmalloc(char **old, val) cdef fmalloc2(char **p1, char **p2, val) cdef free_array(char **arr, count) cpdef list to_list(char *str_list, default=*) +cdef list to_list_free(char **str_list) +cdef list to_list_with_count(char **str_list, cnt) cdef from_list(char **old, vals, delim=*) cdef from_list2(char **p1, char **p2, vals, delim=*) -cpdef dict to_dict(char *str_dict, str delim1=*, str delim2=*) +cpdef dict to_dict(char *str_dict, str delim1=*, str delim2=*, def_value=*) cdef from_dict(char **old, vals, prepend=*, str delim1=*, str delim2=*) cpdef dict to_gres_dict(char *gres) diff --git a/pyslurm/utils/cstr.pyx b/pyslurm/utils/cstr.pyx index 412ef5d3..80cb99d3 100644 --- a/pyslurm/utils/cstr.pyx +++ b/pyslurm/utils/cstr.pyx @@ -106,6 +106,22 @@ cpdef list to_list(char *str_list, default=[]): return ret.split(",") +cdef list to_list_free(char **str_list): + out = to_list(str_list[0]) + xfree(str_list[0]) + return out + + +cdef list to_list_with_count(char **str_list, cnt): + cdef list out = [] + + if cnt and cnt != slurm.NO_VAL: + for i in range(cnt): + out.append(to_unicode(str_list[i])) + + return out + + def list_to_str(vals, delim=","): """Convert list to a C-String.""" cdef object final = vals @@ -125,7 +141,8 @@ cdef from_list2(char **p1, char **p2, vals, delim=","): from_list(p2, vals, delim) -cpdef dict to_dict(char *str_dict, str delim1=",", str delim2="="): +cpdef dict to_dict(char *str_dict, str delim1=",", str delim2="=", + def_value=None): """Convert a char* key=value pair to dict. 
With a char* Slurm represents key-values pairs usually in the form of: @@ -144,6 +161,8 @@ cpdef dict to_dict(char *str_dict, str delim1=",", str delim2="="): if delim2 in kv: key, val = kv.split(delim2, 1) out[key] = int(val) if val.isdigit() else val + elif def_value is not None: + out[kv] = def_value return out diff --git a/pyslurm/utils/helpers.pyx b/pyslurm/utils/helpers.pyx index 577a1c9a..bac5f5f7 100644 --- a/pyslurm/utils/helpers.pyx +++ b/pyslurm/utils/helpers.pyx @@ -387,3 +387,34 @@ cpdef gres_from_tres_dict(dict tres_dict): for k, v in tres_dict.items() if gres_prefix in k } + + +def cpu_freq_int_to_str(freq): + """Convert a numerical cpufreq value to its string representation.""" + if freq == slurm.CPU_FREQ_LOW: + return "LOW" + elif freq == slurm.CPU_FREQ_MEDIUM: + return "MEDIUM" + elif freq == slurm.CPU_FREQ_HIGHM1: + return "HIGHM1" + elif freq == slurm.CPU_FREQ_HIGH: + return "HIGH" + elif freq == slurm.CPU_FREQ_CONSERVATIVE: + return "CONSERVATIVE" + elif freq == slurm.CPU_FREQ_PERFORMANCE: + return "PERFORMANCE" + elif freq == slurm.CPU_FREQ_POWERSAVE: + return "POWERSAVE" + elif freq == slurm.CPU_FREQ_USERSPACE: + return "USERSPACE" + elif freq == slurm.CPU_FREQ_ONDEMAND: + return "ONDEMAND" + elif freq == slurm.CPU_FREQ_SCHEDUTIL: + return "SCHEDUTIL" + elif freq & slurm.CPU_FREQ_RANGE_FLAG: + return None + elif freq == slurm.NO_VAL or freq == 0: + return None + else: + # This is in kHz + return freq From 914953f52fa3ed7cee6116f18b42864514ea4f7c Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Mon, 6 Jan 2025 19:31:36 +0100 Subject: [PATCH 02/31] move cpu_freq_int_to_str to utils.helpers --- pyslurm/core/job/job.pyx | 1 + pyslurm/core/job/step.pyx | 2 +- pyslurm/core/job/util.pyx | 31 ------------------------------- pyslurm/core/slurmctld.pyx | 6 ++++-- pyslurm/db/step.pyx | 2 +- tests/unit/test_job.py | 1 + 6 files changed, 8 insertions(+), 35 deletions(-) diff --git a/pyslurm/core/job/job.pyx b/pyslurm/core/job/job.pyx index 
6f4870c1..9605d0f7 100644 --- a/pyslurm/core/job/job.pyx +++ b/pyslurm/core/job/job.pyx @@ -50,6 +50,7 @@ from pyslurm.utils.helpers import ( _getpwall_to_dict, instance_to_dict, _get_exit_code, + cpu_freq_int_to_str, ) diff --git a/pyslurm/core/job/step.pyx b/pyslurm/core/job/step.pyx index 719257eb..b3a09509 100644 --- a/pyslurm/core/job/step.pyx +++ b/pyslurm/core/job/step.pyx @@ -34,8 +34,8 @@ from pyslurm.utils.helpers import ( uid_to_name, humanize_step_id, dehumanize_step_id, + cpu_freq_int_to_str, ) -from pyslurm.core.job.util import cpu_freq_int_to_str from pyslurm.utils.ctime import ( secs_to_timestr, mins_to_timestr, diff --git a/pyslurm/core/job/util.pyx b/pyslurm/core/job/util.pyx index 6014ad50..2d30a0f2 100644 --- a/pyslurm/core/job/util.pyx +++ b/pyslurm/core/job/util.pyx @@ -246,37 +246,6 @@ def cpu_freq_str_to_int(freq): raise ValueError(f"Invalid cpu freq value: {freq}.") -def cpu_freq_int_to_str(freq): - """Convert a numerical cpufreq value to its string representation.""" - if freq == slurm.CPU_FREQ_LOW: - return "LOW" - elif freq == slurm.CPU_FREQ_MEDIUM: - return "MEDIUM" - elif freq == slurm.CPU_FREQ_HIGHM1: - return "HIGHM1" - elif freq == slurm.CPU_FREQ_HIGH: - return "HIGH" - elif freq == slurm.CPU_FREQ_CONSERVATIVE: - return "CONSERVATIVE" - elif freq == slurm.CPU_FREQ_PERFORMANCE: - return "PERFORMANCE" - elif freq == slurm.CPU_FREQ_POWERSAVE: - return "POWERSAVE" - elif freq == slurm.CPU_FREQ_USERSPACE: - return "USERSPACE" - elif freq == slurm.CPU_FREQ_ONDEMAND: - return "ONDEMAND" - elif freq == slurm.CPU_FREQ_SCHEDUTIL: - return "SCHEDUTIL" - elif freq & slurm.CPU_FREQ_RANGE_FLAG: - return None - elif freq == slurm.NO_VAL or freq == 0: - return None - else: - # This is in kHz - return freq - - def dependency_str_to_dict(dep): if not dep: return None diff --git a/pyslurm/core/slurmctld.pyx b/pyslurm/core/slurmctld.pyx index a288aaa2..82796d7d 100644 --- a/pyslurm/core/slurmctld.pyx +++ b/pyslurm/core/slurmctld.pyx @@ -25,8 +25,10 @@ 
from pyslurm.core.error import verify_rpc, RPCError from pyslurm.utils.uint import * from pyslurm.utils.ctime import _raw_time -from pyslurm.utils.helpers import cpu_freq_int_to_str -from pyslurm.utils.helpers import instance_to_dict +from pyslurm.utils.helpers import ( + cpu_freq_int_to_str, + instance_to_dict, +) from pyslurm.utils import cstr diff --git a/pyslurm/db/step.pyx b/pyslurm/db/step.pyx index 0faf4d79..11cc1875 100644 --- a/pyslurm/db/step.pyx +++ b/pyslurm/db/step.pyx @@ -34,8 +34,8 @@ from pyslurm.utils.helpers import ( instance_to_dict, _get_exit_code, humanize_step_id, + cpu_freq_int_to_str, ) -from pyslurm.core.job.util import cpu_freq_int_to_str cdef class JobSteps(dict): diff --git a/tests/unit/test_job.py b/tests/unit/test_job.py index 7ce7f05b..5bb58b6e 100644 --- a/tests/unit/test_job.py +++ b/tests/unit/test_job.py @@ -24,6 +24,7 @@ import pyslurm from pyslurm import Job from pyslurm.core.job.util import * +from pyslurm.utils.helpers import cpu_freq_int_to_str def test_create_instance(): job = Job(9999) From 32b9012a86870c188ca7c80dec564a8edd3e63a0 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Tue, 7 Jan 2025 22:00:16 +0100 Subject: [PATCH 03/31] cstr: add configurable delim to to_list --- pyslurm/utils/cstr.pxd | 2 +- pyslurm/utils/cstr.pyx | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyslurm/utils/cstr.pxd b/pyslurm/utils/cstr.pxd index eca6f25e..0579819a 100644 --- a/pyslurm/utils/cstr.pxd +++ b/pyslurm/utils/cstr.pxd @@ -31,7 +31,7 @@ cdef to_unicode(char *s, default=*) cdef fmalloc(char **old, val) cdef fmalloc2(char **p1, char **p2, val) cdef free_array(char **arr, count) -cpdef list to_list(char *str_list, default=*) +cpdef list to_list(char *str_list, default=*, delim=*) cdef list to_list_free(char **str_list) cdef list to_list_with_count(char **str_list, cnt) cdef from_list(char **old, vals, delim=*) diff --git a/pyslurm/utils/cstr.pyx b/pyslurm/utils/cstr.pyx index 80cb99d3..ad7e1015 100644 --- 
a/pyslurm/utils/cstr.pyx +++ b/pyslurm/utils/cstr.pyx @@ -96,14 +96,14 @@ cdef fmalloc(char **old, val): old[0] = NULL -cpdef list to_list(char *str_list, default=[]): +cpdef list to_list(char *str_list, default=None, delim=","): """Convert C-String to a list.""" cdef str ret = to_unicode(str_list) if not ret: - return default + return [] if default is None else default - return ret.split(",") + return ret.split(delim) cdef list to_list_free(char **str_list): From 27480d9823d9aaa8156c1e793b60f21c0e4c566b Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Tue, 7 Jan 2025 22:00:51 +0100 Subject: [PATCH 04/31] update partition unit tests --- tests/unit/test_partition.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_partition.py b/tests/unit/test_partition.py index b699893c..490fba1b 100644 --- a/tests/unit/test_partition.py +++ b/tests/unit/test_partition.py @@ -32,7 +32,10 @@ def test_create_instance(): def test_parse_all(): - assert Partition("normal").to_dict() + part = Partition("normal") + assert part.to_dict() + assert part.allowed_submit_nodes == ["ALL"] + assert part.allowed_accounts == ["ALL"] def test_parse_memory(): From 20d3671083b077d6378948a17290ccf0fca0c893 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Tue, 7 Jan 2025 22:01:26 +0100 Subject: [PATCH 05/31] tests: integration tests for slurmctld --- tests/integration/test_slurmctld.py | 58 +++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 tests/integration/test_slurmctld.py diff --git a/tests/integration/test_slurmctld.py b/tests/integration/test_slurmctld.py new file mode 100644 index 00000000..d94e03ae --- /dev/null +++ b/tests/integration/test_slurmctld.py @@ -0,0 +1,58 @@ +######################################################################### +# test_slurmctld.py - slurmctld integration tests +######################################################################### +# Copyright (C) 2025 Toni Harzendorf +# +# This file 
is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +"""test_slurmctld.py - integration test basic slurmctld functionalities.""" + +import pytest +import pyslurm +from pyslurm import slurmctld + + +def test_ping(): + resp = slurmctld.ping_primary() + assert resp.is_responding + assert resp.is_primary + assert resp.index == 0 + assert resp.hostname is not None + assert resp.latency > 0 + assert resp.to_dict() + + +def test_ping_all(): + pings = slurmctld.ping_all() + assert isinstance(pings, list) + + for resp in pings: + assert resp.hostname is not None + assert resp.latency > 0 + + +def test_reconfigure(): + slurmctld.reconfigure() + + +def test_load_config(): + conf = slurmctld.Config.load() + + assert conf + assert conf.to_dict() + assert conf.cgroup_config + assert conf.accounting_gather_config + assert conf.mpi_config From 863c287727da45fcb163a32673481f54f3f8ffff Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Tue, 7 Jan 2025 22:50:34 +0100 Subject: [PATCH 06/31] partition: use _get_memory from slurmctld module --- pyslurm/core/partition.pyx | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/pyslurm/core/partition.pyx b/pyslurm/core/partition.pyx index 9c50390b..5399ccf2 100644 --- a/pyslurm/core/partition.pyx +++ b/pyslurm/core/partition.pyx @@ -31,6 +31,7 @@ 
from pyslurm.core.error import RPCError, verify_rpc from pyslurm.utils.ctime import timestamp_to_date, _raw_time from pyslurm.constants import UNLIMITED from pyslurm.settings import LOCAL_CLUSTER +from pyslurm.core.slurmctld import _get_memory from pyslurm import xcollections from pyslurm.utils.helpers import ( uid_to_name, @@ -832,21 +833,3 @@ cdef _concat_job_default_str(typ, val, char **job_defaults_str): current.update({typ : _val}) cstr.from_dict(job_defaults_str, current) - - -def _get_memory(value, per_cpu): - if value != slurm.NO_VAL64: - if value & slurm.MEM_PER_CPU and per_cpu: - if value == slurm.MEM_PER_CPU: - return UNLIMITED - return u64_parse(value & (~slurm.MEM_PER_CPU)) - - # For these values, Slurm interprets 0 as being equal to - # INFINITE/UNLIMITED - elif value == 0 and not per_cpu: - return UNLIMITED - - elif not value & slurm.MEM_PER_CPU and not per_cpu: - return u64_parse(value) - - return None From 18af65874a7cba5e187003db32d325c23ce8a5ab Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Tue, 7 Jan 2025 22:50:47 +0100 Subject: [PATCH 07/31] partition: bump copyright --- pyslurm/core/partition.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyslurm/core/partition.pyx b/pyslurm/core/partition.pyx index 5399ccf2..e0d916aa 100644 --- a/pyslurm/core/partition.pyx +++ b/pyslurm/core/partition.pyx @@ -2,7 +2,7 @@ # partition.pyx - interface to work with partitions in slurm ######################################################################### # Copyright (C) 2023 Toni Harzendorf -# Copyright (C) 2023 PySlurm Developers +# Copyright (C) 2025 PySlurm Developers # # This file is part of PySlurm # From 8c5af16aff206375c7368e61ecbe492a07acd638 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Tue, 7 Jan 2025 22:51:21 +0100 Subject: [PATCH 08/31] slurmctld: start adding documentation --- pyslurm/core/slurmctld.pxd | 27 ++++- pyslurm/core/slurmctld.pyx | 208 ++++++++++++++++++++++++++++++------- 2 files changed, 199 
insertions(+), 36 deletions(-) diff --git a/pyslurm/core/slurmctld.pxd b/pyslurm/core/slurmctld.pxd index f354083c..747513b1 100644 --- a/pyslurm/core/slurmctld.pxd +++ b/pyslurm/core/slurmctld.pxd @@ -31,6 +31,12 @@ from pyslurm.slurm cimport ( slurm_accounting_enforce_string, slurm_sprint_cpu_bind_type, slurm_ctl_conf_2_key_pairs, + slurm_reconfigure, + slurm_shutdown, + slurm_ping, + slurm_takeover, + ping_all_controllers, + controller_ping_t, cpu_bind_type_t, try_xmalloc, list_t, @@ -38,7 +44,12 @@ from pyslurm.slurm cimport ( ) from pyslurm.utils cimport cstr from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, int64_t -from pyslurm.utils.uint cimport * +from pyslurm.utils.uint cimport ( + u16_parse, + u32_parse, + u64_parse, + u16_parse_bool, +) from pyslurm.db.util cimport ( SlurmList, @@ -54,6 +65,17 @@ ctypedef struct config_key_pair_t: char *value +cdef class PingResponse: + """Slurm Controller Ping response information""" + + cdef public: + is_primary + is_responding + index + hostname + latency + + cdef class Config: cdef slurm_conf_t *ptr @@ -64,6 +86,7 @@ cdef class Config: cdef class MPIConfig: + """Slurm MPI Config (mpi.conf)""" cdef public: pmix_cli_tmp_dir_base @@ -83,6 +106,7 @@ cdef class MPIConfig: cdef MPIConfig from_ptr(void *ptr) cdef class CgroupConfig: + """Slurm Cgroup Config (cgroup.conf)""" cdef public: mountpoint @@ -110,6 +134,7 @@ cdef class CgroupConfig: cdef class AccountingGatherConfig: + """Slurm Accounting Gather Config (acct_gather.conf)""" cdef public: energy_ipmi_frequency diff --git a/pyslurm/core/slurmctld.pyx b/pyslurm/core/slurmctld.pyx index 82796d7d..9606e809 100644 --- a/pyslurm/core/slurmctld.pyx +++ b/pyslurm/core/slurmctld.pyx @@ -23,13 +23,167 @@ # cython: language_level=3 from pyslurm.core.error import verify_rpc, RPCError -from pyslurm.utils.uint import * +from pyslurm.utils.uint import ( + u16_parse, + u32_parse, + u64_parse, +) +from pyslurm.constants import UNLIMITED from pyslurm.utils.ctime import 
_raw_time from pyslurm.utils.helpers import ( cpu_freq_int_to_str, instance_to_dict, ) from pyslurm.utils import cstr +from typing import Union +import time +from enum import IntEnum + + +class ShutdownMode(IntEnum): + """Mode of operation for shutdown action""" + ALL = 0 + CORE_FILE = 1 + CONTROLLER_ONLY = 2 + + +cdef class PingResponse: + + def to_dict(self): + """Slurmctld ping response formatted as dictionary. + + Returns: + (dict): Ping response as a dict + + Examples: + >>> from pyslurm import slurmctld + >>> ctld_primary = slurmctld.Config.ping(0) + >>> primary_dict = ctld_primary.to_dict() + """ + return instance_to_dict(self) + + +def ping(index): + """Ping a Slurm controller + + Returns: + (pyslurm.slurmctld.PingResponse): a ping response + + Examples: + >>> from pyslurm import slurmctld + >>> resp = slurmctld.ping(0) + >>> print(resp.hostname, resp.latency) + slurmctl 1.246 + """ + t0 = time.perf_counter() + rc = slurm_ping(index) + t1 = time.perf_counter() + + verify_rpc(rc) + ctl_cnt = slurm.slurm_conf.control_cnt + + if index >= ctl_cnt: + raise RPCError(msg="Invalid Index specified.") + + info = PingResponse() + info.is_primary = index == 0 + info.is_responding = not rc + info.index = index + info.hostname = cstr.to_unicode(slurm.slurm_conf.control_machine[index]) + info.latency = round((t1 - t0) * 1000, 3) + + return info + + +def ping_primary(): + """Ping the primary Slurm Controller. + + See `ping()` for more information and examples. + + Returns: + (pyslurm.slurmctld.PingResponse): a ping response + """ + return ping(0) + + +def ping_backup(): + """Ping the first backup Slurm Controller. + + See `ping()` for more information and examples. + + Returns: + (pyslurm.slurmctld.PingResponse): a ping response + """ + return ping(1) + + +def ping_all(): + """Ping all Slurm Controllers. + + Returns: + (list[pyslurm.slurmctld.PingResponse]): a list of ping responses + + Raises: + (pyslurm.RPCError): When the ping was not successful. 
+ + Examples: + >>> from pyslurm import slurmctld + >>> resps = slurmctld.ping_all() + >>> for resp in resps: + ... print(resp.hostname, resp.latency) + ... + slurmctl 1.246 + slurmctlbackup 1.373 + """ + cdef list out = [] + + ctl_cnt = slurm.slurm_conf.control_cnt + for i in range(ctl_cnt): + out.append(ping(i)) + + return out + + +def shutdown(mode: Union[ShutdownMode, int]): + """Shutdown Slurm Controller or all Daemons + + Args: + mode: + Whether only the Slurm controller shut be downed, or also all other + slurmd daemons. + + Raises: + (pyslurm.RPCError): When shutdowning the daemons was not successful. + """ + verify_rpc(slurm_shutdown(int(mode))) + + +def reconfigure(): + """Trigger Slurm Controller to reload the Config + + Raises: + (pyslurm.RPCError): When reconfiguring was not successful. + """ + verify_rpc(slurm_reconfigure()) + + +def takeover(index = 1): + """Let a Backup Slurm Controller take over as the Primary. + + Args: + index (int, optional = 1): + Index of the Backup Controller that should take over. By default, + the `index` is `1`, meaning the next Controller configured after + the Primary in slurm.conf (second `SlurmctlHost` entry) will be + asked to take over operation. + + If you have more than one backup controller configured, you can for + example also pass `2` as the index. + + Raises: + (pyslurm.RPCError): When reconfiguring was not successful. 
+ """ + verify_rpc(slurm_takeover(index)) cdef class MPIConfig: @@ -75,7 +229,7 @@ cdef class MPIConfig: cdef class CgroupConfig: - def __init__(self, job_id): + def __init__(self): raise RuntimeError("Cannot instantiate class directly") def to_dict(self): @@ -121,7 +275,7 @@ cdef class CgroupConfig: cdef class AccountingGatherConfig: - def __init__(self, job_id): + def __init__(self): raise RuntimeError("Cannot instantiate class directly") def to_dict(self): @@ -147,7 +301,7 @@ cdef class AccountingGatherConfig: out.energy_ipmi_calc_adjustment = _yesno_to_bool( conf.get("EnergyIPMICalcAdjustment")) - # TODO: dict + # TODO: maybe dict? out.energy_ipmi_power_sensors = conf.get("EnergyIPMIPowerSensors") out.energy_ipmi_user_name = conf.get("EnergyIPMIUsername") @@ -176,8 +330,9 @@ cdef class Config: def __cinit__(self): self.ptr = NULL - def __init__(self, job_id): - raise RuntimeError("Cannot instantiate class directly") + def __init__(self): + raise RuntimeError("Cannot instantiate class directly. 
" + "Use slurmctld.Config.load() to get an instance.") def __dealloc__(self): slurm_free_ctl_conf(self.ptr) @@ -201,6 +356,13 @@ cdef class Config: @staticmethod def load(): + """Load the current Slurm configuration (slurm.conf) + + This also loads the following other configurations: + * `cgroup.conf` (`cgroup_config`) + * `acct_gather.conf` (`accounting_gather_config`) + * `mpi.conf` (`mpi_config`) + """ cdef Config conf = Config.__new__(Config) verify_rpc(slurm_load_ctl_conf(0, &conf.ptr)) @@ -208,6 +370,7 @@ cdef class Config: conf.accounting_gather_config = AccountingGatherConfig.from_ptr( conf.ptr.acct_gather_conf) conf.mpi_config = MPIConfig.from_ptr(conf.ptr.mpi_conf) + # TODO: node_features_conf return conf @@ -431,16 +594,6 @@ cdef class Config: return cstr.to_list_with_count(self.ptr.epilog_slurmctld, self.ptr.epilog_slurmctld_cnt) -# @property -# def external_sensors_type(self): -# return cstr.to_unicode(self.ptr.ext_sensors_type) - -# @property -# def external_sensors_frequency(self): -# return u16_parse(self.ptr.ext_sensors_freq) - - # TODO: void *ext_sensors_conf put into own class? - @property def federation_parameters(self): return cstr.to_list(self.ptr.fed_params) @@ -469,7 +622,6 @@ cdef class Config: @property def group_update_force(self): - # TODO: maybe bool? return u16_parse_bool(self.ptr.group_force) @property @@ -485,7 +637,6 @@ cdef class Config: val = u32_parse(self.ptr.hash_val) if not val: return None - return hex(val) @property @@ -534,10 +685,6 @@ cdef class Config: def job_completion_parameters(self): return cstr.to_list(self.ptr.job_comp_params) -# @property -# def job_completion_password(self): -# return cstr.to_unicode(self.ptr.job_comp_pass) - @property def job_completion_port(self): return u32_parse(self.ptr.job_comp_port) @@ -675,8 +822,6 @@ cdef class Config: def next_job_id(self): return u32_parse(self.ptr.next_job_id) - # TODO: void *node_features_conf put into own class? 
- @property def node_features_plugins(self): return cstr.to_list(self.ptr.node_features_plugins) @@ -686,22 +831,13 @@ cdef class Config: return u16_parse(self.ptr.over_time_limit) @property - def plugin_path(self): - # TODO: maybe list - return cstr.to_unicode(self.ptr.plugindir) + def plugin_dirs(self): + return cstr.to_list(self.ptr.plugindir, None, ":") @property def plugin_stack_config(self): return cstr.to_unicode(self.ptr.plugstack) -# @property -# def power_parameters(self): -# return cstr.to_list(self.ptr.power_parameters) - -# @property -# def power_plugin(self): -# return cstr.to_unicode(self.ptr.power_plugin) - @property def preempt_exempt_time(self): # seconds? @@ -1295,6 +1431,7 @@ def _log_level_int_to_str(flags): else: return data + def _acct_store_flags_int_to_str(flags): cdef list out = [] @@ -1311,6 +1448,7 @@ def _acct_store_flags_int_to_str(flags): return out + def _get_memory(value, per_cpu): if value != slurm.NO_VAL64: if value & slurm.MEM_PER_CPU and per_cpu: From af7f04bf1fca32be3198aa0d45a027b204d4da37 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Wed, 8 Jan 2025 23:43:26 +0100 Subject: [PATCH 09/31] add debug flags, fs dampening factor and logging level RPCs --- pyslurm/core/slurmctld.pxd | 4 + pyslurm/core/slurmctld.pyx | 206 ++++++++++++++++++++++++++++++++++++- pyslurm/slurm/extra.pxi | 1 + 3 files changed, 208 insertions(+), 3 deletions(-) diff --git a/pyslurm/core/slurmctld.pxd b/pyslurm/core/slurmctld.pxd index 747513b1..234d236c 100644 --- a/pyslurm/core/slurmctld.pxd +++ b/pyslurm/core/slurmctld.pxd @@ -35,6 +35,10 @@ from pyslurm.slurm cimport ( slurm_shutdown, slurm_ping, slurm_takeover, + slurm_set_debugflags, + slurm_set_debug_level, + slurm_set_schedlog_level, + slurm_set_fs_dampeningfactor, ping_all_controllers, controller_ping_t, cpu_bind_type_t, diff --git a/pyslurm/core/slurmctld.pyx b/pyslurm/core/slurmctld.pyx index 9606e809..cd743422 100644 --- a/pyslurm/core/slurmctld.pyx +++ b/pyslurm/core/slurmctld.pyx @@ 
-186,6 +186,192 @@ def takeover(index = 1): verify_rpc(slurm_takeover(index)) +def add_debug_flags(flags): + """Add DebugFlags to slurmctld + + Args: + flags (list): + For an available list of possible values, please check the + `slurm.conf` documentation under `DebugFlags`. + + Raises: + (pyslurm.RPCError): When setting the debug flags was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.add_debug_flags(["CpuFrequency"]) + """ + if not flags: + return + + data = _debug_flags_str_to_int(flags) + if not data: + raise RPCError(msg="Invalid Debug Flags specified.") + + verify_rpc(slurm_set_debugflags(data, 0)) + + +def remove_debug_flags(flags): + """Remove DebugFlags from slurmctld. + + Args: + flags (list): + For an available list of possible values, please check the + `slurm.conf` documentation under `DebugFlags`. + + Raises: + (pyslurm.RPCError): When removing the debug flags was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.remove_debug_flags(["CpuFrequency"]) + """ + if not flags: + return + + data = _debug_flags_str_to_int(flags) + if not data: + raise RPCError(msg="Invalid Debug Flags specified.") + + verify_rpc(slurm_set_debugflags(0, data)) + + +def clear_debug_flags(): + """Remove all currently set debug flags from slurmctld. + + Raises: + (pyslurm.RPCError): When removing the debug flags was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.clear_debug_flags() + """ + current_flags = get_debug_flags() + if not current_flags: + return + + data = _debug_flags_str_to_int(current_flags) + verify_rpc(slurm_set_debugflags(0, data)) + + +def get_debug_flags(): + """Get the current list of debug flags for the slurmctld. + + Raises: + (pyslurm.RPCError): When getting the debug flags was not successful. 
+ + Examples: + >>> from pyslurm import slurmctld + >>> flags = slurmctld.get_debug_flags() + >>> print(flags) + ['Backfill', 'CpuFrequency'] + """ + return Config.load().debug_flags + + +def set_log_level(level): + """Set the logging level for slurmctld. + + Args: + level (str): + For an available list of possible values, please check the + `slurm.conf` documentation under `SlurmctldDebug`. + + Raises: + (pyslurm.RPCError): When setting the log level was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.set_log_level("quiet") + """ + data = _log_level_str_to_int(level) + verify_rpc(slurm_set_debug_level(data)) + + +def get_log_level(): + """Get the current log level for the slurmctld. + + Raises: + (pyslurm.RPCError): When getting the log level was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> level = slurmctld.get_log_level() + >>> print(level) + quiet + """ + return Config.load().slurmctld_log_level + + +def enable_scheduler_logging(): + """Enable scheduler logging for slurmctld. + + Raises: + (pyslurm.RPCError): When enabling scheduler logging was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.enable_scheduler_logging() + """ + verify_rpc(slurm_set_schedlog_level(1)) + + +def is_scheduler_logging_enabled(): + """Check whether scheduler logging is enabled for slurmctld. + + Returns: + (bool): Whether scheduler logging is enabled or not. + + Raises: + (pyslurm.RPCError): When getting the scheduler logging was not + successful. + + Examples: + >>> from pyslurm import slurmctld + >>> print(slurmctld.is_scheduler_logging_enabled()) + False + """ + return Config.load().scheduler_logging + + +def set_fair_share_dampening_factor(factor): + """Set the FairShare Dampening factor. + + Args: + factor (int): + The factor to set. A minimum value of `1`, and a maximum value of + `65535` are allowed. + + Raises: + (pyslurm.RPCError): When setting the factor was not successful.
+ + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.set_fair_share_dampening_factor(100) + """ + max_value = (2 ** 16) - 1 + if not factor or not 1 <= factor <= max_value: + raise RPCError(msg=f"Invalid Dampening factor: {factor}. " + f"Factor must be between 1 and {max_value}.") + + verify_rpc(slurm_set_fs_dampeningfactor(factor)) + + +def get_fair_share_dampening_factor(): + """Get the currently set FairShare Dampening factor. + + Raises: + (pyslurm.RPCError): When getting the factor was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> factor = slurmctld.get_fair_share_dampening_factor() + >>> print(factor) + 100 + """ + return Config.load().fair_share_dampening_factor + + cdef class MPIConfig: def __init__(self): @@ -1022,8 +1208,8 @@ cdef class Config: return cstr.to_unicode(self.ptr.sched_logfile) @property - def scheduler_log_level(self): - return u16_parse(self.ptr.sched_log_level, zero_is_noval=False) + def scheduler_logging(self): + return u16_parse_bool(self.ptr.sched_log_level) @property def scheduler_parameters(self): @@ -1344,7 +1530,13 @@ def _debug_flags_int_to_list(flags): def _debug_flags_str_to_int(flags): - pass + cdef: + uint64_t flags_num = 0 + char *flags_str = NULL + + flags_str = cstr.from_unicode(cstr.list_to_str(flags)) + slurm.debug_str2flags(flags_str, &flags_num) + return flags_num # https://github.com/SchedMD/slurm/blob/01a3aac7c59c9b32a9dd4e395aa5a97a8aea4f08/slurm/slurm.h#L621 @@ -1432,6 +1624,14 @@ def _log_level_int_to_str(flags): return data +def _log_level_str_to_int(level): + cdef uint16_t data = slurm.log_string2num(str(level)) + if u16_parse(data, zero_is_noval=False) is None: + raise RPCError(msg=f"Invalid Log level: {level}.") + + return data + + def _acct_store_flags_int_to_str(flags): cdef list out = [] diff --git a/pyslurm/slurm/extra.pxi b/pyslurm/slurm/extra.pxi index c686f9fd..09dfae70 100644 --- a/pyslurm/slurm/extra.pxi +++ b/pyslurm/slurm/extra.pxi @@ -314,6 +314,7 @@ cdef extern char
*priority_flags_string(uint16_t priority_flags) cdef extern char* prolog_flags2str(uint16_t prolog_flags) cdef extern uint16_t prolog_str2flags(char *prolog_flags) cdef extern char *log_num2string(uint16_t inx) +cdef extern uint16_t log_string2num(const char *name) cdef extern char *private_data_string(uint16_t private_data, char *str, int str_len) cdef extern char *reconfig_flags2str(uint16_t reconfig_flags) cdef extern uint16_t reconfig_str2flags(char *reconfig_flags) From 3467456618ec61a397436a4b03c42e49a431912f Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Wed, 8 Jan 2025 23:43:51 +0100 Subject: [PATCH 10/31] add more tests for slurmctld functions --- tests/integration/test_slurmctld.py | 51 +++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/tests/integration/test_slurmctld.py b/tests/integration/test_slurmctld.py index d94e03ae..4d1f1004 100644 --- a/tests/integration/test_slurmctld.py +++ b/tests/integration/test_slurmctld.py @@ -56,3 +56,54 @@ def test_load_config(): assert conf.cgroup_config assert conf.accounting_gather_config assert conf.mpi_config + + +def test_debug_flags(): + slurmctld.clear_debug_flags() + + slurmctld.add_debug_flags([]) + assert slurmctld.get_debug_flags() == [] + + slurmctld.add_debug_flags(["CpuFrequency", "Backfill"]) + assert slurmctld.get_debug_flags() == ["Backfill", "CpuFrequency"] + + slurmctld.add_debug_flags(["Agent"]) + assert slurmctld.get_debug_flags() == ["Agent", "Backfill", "CpuFrequency"] + + slurmctld.remove_debug_flags(["CpuFrequency"]) + assert slurmctld.get_debug_flags() == ["Agent", "Backfill"] + + slurmctld.clear_debug_flags() + assert slurmctld.get_debug_flags() == [] + + +def test_log_level(): + slurmctld.set_log_level("debug5") + assert slurmctld.get_log_level() == "debug5" + + slurmctld.set_log_level("debug2") + assert slurmctld.get_log_level() == "debug2" + + with pytest.raises(pyslurm.RPCError, + match=r"Invalid Log*"): + slurmctld.set_log_level("invalid") + + +def 
test_scheduler_log_level(): + assert not slurmctld.is_scheduler_logging_enabled() + + +def test_fair_share_dampening_factor(): + slurmctld.set_fair_share_dampening_factor(100) + assert slurmctld.get_fair_share_dampening_factor() == 100 + + slurmctld.set_fair_share_dampening_factor(1) + assert slurmctld.get_fair_share_dampening_factor() == 1 + + with pytest.raises(pyslurm.RPCError, + match=r"Invalid Dampening*"): + slurmctld.set_fair_share_dampening_factor(0) + + with pytest.raises(pyslurm.RPCError, + match=r"Invalid Dampening*"): + slurmctld.set_fair_share_dampening_factor(99999999) From 93a1d64eeeb9a726b90a1df483e84edfb65df9e2 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Tue, 14 Jan 2025 15:50:21 +0100 Subject: [PATCH 11/31] almost complete documentation for the config --- pyslurm/core/slurmctld.pxd | 823 ++++++++++++++++++++++++++++++++++++- 1 file changed, 822 insertions(+), 1 deletion(-) diff --git a/pyslurm/core/slurmctld.pxd b/pyslurm/core/slurmctld.pxd index 234d236c..7e011700 100644 --- a/pyslurm/core/slurmctld.pxd +++ b/pyslurm/core/slurmctld.pxd @@ -3,6 +3,9 @@ ######################################################################### # Copyright (C) 2025 Toni Harzendorf # +# Note: Some classes are annotated with additional Copyright notices further +# down +# # This file is part of PySlurm # # PySlurm is free software; you can redistribute it and/or modify @@ -80,7 +83,747 @@ cdef class PingResponse: latency +# Documentation for the attributes in the Config class have been largely taken +# from the official slurm.conf overview at: +# https://slurm.schedmd.com/slurm.conf.html +# +# Therefore, the following Copyright notices that slurm.conf has (see +# https://slurm.schedmd.com/slurm.conf.html#SECTION_COPYING), are also listed +# here: +# +# Copyright (C) 2002-2007 The Regents of the University of California. Produced +# at Lawrence Livermore National Laboratory (cf, pyslurm/slurm/SLURM_DISCLAIMER). 
+# Copyright (C) 2008-2010 Lawrence Livermore +# National Security. Copyright (C) 2010-2022 SchedMD LLC. cdef class Config: + """The Slurm Configuration. + + All attributes in this class are read-only. + + Attributes: + accounting_storage_enforce (list[str]): + List of enforcements on Job submissions. + {slurm.conf::OPT_AccountingStorageEnforce} + accounting_storage_backup_host (str): + Name of the backup machine hosting the Slurm database. + {slurm.conf::OPT_AccountingStorageBackupHost} + accounting_storage_external_hosts (list[str]): + List of external slurmdbds to register with. + {slurm.conf::OPT_AccountingStorageExternalHost} + accounting_storage_host (str): + Name of the machine hosting the slurm database. + {slurm.conf::OPT_AccountingStorageHost} + accounting_storage_parameters (dict[str, str]): + Options for the accounting storage Plugin + {slurm.conf::OPT_AccountingStorageParameters} + accounting_storage_port (int): + Listening port of the Accounting Database Server + {slurm.conf::OPT_AccountingStoragePort} + accounting_storage_tres (list): + List of configured Resources to track on the Cluster. + {slurm.conf::OPT_AccountingStorageTRES} + accounting_storage_type (str): + The accounting storage type used. + {slurm.conf::OPT_AccountingStorageType} + accounting_storage_user (str): + The user account used for accessing the accounting database. + {slurm.conf::OPT_AccountingStorageUser} + accounting_store_flags (list[str]): + List of fields that the slurmctld also sends to the accounting + database. + {slurm.conf::OPT_AccountingStoreFlags} + accounting_gather_node_frequency (int): + Accounting-Gather plugins sampling interval for node accounting. + {slurm.conf::OPT_AcctGatherNodeFreq} + accounting_gather_energy_type (str): + Plugin used for energy consumption accounting. + {slurm.conf::OPT_AcctGatherEnergyType} + accounting_gather_interconnect_type (str): + Plugin used for interconnect network traffic accounting.
+ {slurm.conf::OPT_AcctGatherInterconnectType} + accounting_gather_filesystem_type (str): + Plugin used for filesystem traffic accounting. + {slurm.conf::OPT_AcctGatherFilesystemType} + accounting_gather_profile_type (str): + Plugin used for detailed job profiling. + {slurm.conf::OPT_AcctGatherProfileType} + allow_spec_resource_usage (bool): + Whether Slurm allows jobs to override the node's configured + `CoreSpecCount` + {slurm.conf::OPT_AllowSpecResourcesUsage} + auth_alt_types (list[str]): + List of alternative authentication plugins the slurmctld permits. + {slurm.conf::OPT_AuthAltTypes} + auth_alt_parameters (dict[str, str]): + Options for the alternative authentication plugins. + {slurm.conf::OPT_AuthAltParameters} + auth_info (list[str]): + List of additional information used for authentication of + communication between Slurm daemons. + {slurm.conf::OPT_AuthInfo} + auth_type (str): + Primary authentication method for communications between Slurm + components. + {slurm.conf::OPT_AuthType} + batch_start_timeout (int): + The maximum time (in seconds) that a batch job is permitted for + launching before being considered missing and releasing the + allocation. + {slurm.conf::OPT_BatchStartTimeout} + bcast_exclude_paths (list[str]): + List of absolute directory paths to be excluded when + autodetecting and broadcasting executable shared object dependencies + through `sbcast` or `srun --bcast`. + {slurm.conf::OPT_BcastExclude} + bcast_parameters (dict[str, str]): + Options for `sbcast` and `srun --bcast` behaviour. + {slurm.conf::OPT_BcastParameters} + burst_buffer_type (str): + Plugin used to manage burst buffers. + {slurm.conf::OPT_BurstBufferType} + slurmctld_boot_time (int): + Timestamp of when the slurmctld last booted. + certmgr_parameters (str): + List of options for the certmgr Plugin. + certmgr_type (str): + Plugin used for certmgr mechanism. + cli_filter_plugins (list[str]): + List of CLI Filter plugins to use.
+ {slurm.conf::OPT_CliFilterPlugins} + cluster_name (str): + Name of the Cluster. + {slurm.conf::OPT_ClusterName} + communication_parameters (dict[str, Union[str, int]]): + Communication options for Cluster daemons. + {slurm.conf::OPT_CommunicationParameters} + complete_wait_time (int): + The time to wait, in seconds, when any job is in the COMPLETING state + before any additional jobs are scheduled. + {slurm.conf::OPT_CompleteWait} + default_cpu_frequency_governor (str): + Default CPU governor to use when a Job has not specified the + `--cpu-freq` option. + {slurm.conf::OPT_CpuFreqDef} + cpu_frequency_governors (list[str]): + List of CPU Governors allowed to be set on Job submission. + {slurm.conf::OPT_CpuFreqGovernors} + credential_type (str): + Cryptographic signature tool to be used when creating job step + credentials. + {slurm.conf::OPT_CredType} + data_parser_parameters (str): + Default value to apply for `data_parser` plugin parameters. + {slurm.conf::OPT_DataParserParameters} + debug_flags (list[str]): + List of DebugFlags currently set for Daemons. + {slurm.conf::OPT_DebugFlags} + default_memory_per_cpu (int): + Default real memory size available per allocated CPU in Mebibytes. + {slurm.conf::OPT_DefMemPerCPU} + default_memory_per_node (int): + Default real memory size available per allocated Node in Mebibytes. + {slurm.conf::OPT_DefMemPerNode} + dependency_parameters (list[str]): + List of parameters for dependencies. + {slurm.conf::OPT_DependencyParameters} + disable_root_jobs (bool): + Whether root can submit Jobs or not. + {slurm.conf::OPT_DisableRootJobs} + eio_timeout (int): + The number of seconds srun waits for slurmstepd to close the TCP/IP + connection used to relay data between the user application and srun + when the user application terminates. + {slurm.conf::OPT_EioTimeout} + enforce_partition_limits (str): + Controls which Limits are enforced on Partition level. 
+ {slurm.conf::OPT_EnforcePartLimits} + epilog (list[str]): + List of Epilog scripts in use that are executed as root on every + node when a Job completes. + {slurm.conf::OPT_Epilog} + epilog_msg_time (int): + The number of microseconds that the slurmctld daemon requires to + process an epilog completion message from the slurmd daemons. + {slurm.conf::OPT_EpilogMsgTime} + epilog_slurmctld (list[str]): + List of Epilog scripts in use that are executed by slurmctld at job + termination. + {slurm.conf::OPT_EpilogSlurmctld} + fair_share_dampening_factor (int): + Dampen the effect of exceeding a user or group's fair share of + allocated resources. + {slurm.conf::OPT_FairShareDampeningFactor} + federation_parameters (list[str]): + Options for Federations + {slurm.conf::OPT_FederationParameters} + first_job_id (int): + The job id to be used for the first job submitted. + {slurm.conf::OPT_FirstJobId} + get_environment_timeout (int): + How long a Job waits (in seconds) to load the User's environment + before attempting to load it from a cache file. + {slurm.conf::OPT_GetEnvTimeout} + gres_types (list[str]): + List of generic resources to be managed. + {slurm.conf::OPT_GresTypes} + group_update_force (bool): + Whether user group membership information is updated periodically, + even if there are no changes to `/etc/group`. + {slurm.conf::OPT_GroupUpdateForce} + group_update_time (int): + How frequently information about user group membership is updated, + and how long it is cached (in seconds). + {slurm.conf::OPT_GroupUpdateTime} + default_gpu_frequency (str): + Default GPU frequency to use when running a job step if it has not + been explicitly set using the --gpu-freq option. + {slurm.conf::OPT_GpuFreqDef} + hash_plugin (str): + Type of hash plugin used for network communication. + {slurm.conf::OPT_HashPlugin} + hash_value (str): + Current configuration hash value (hex).
+ health_check_interval (int): + Interval in seconds between executions of `HealthCheckProgram` + {slurm.conf::OPT_HealthCheckInterval} + health_check_node_state (list[str]): + List of node states which are eligible to execute + `HealthCheckProgram` + {slurm.conf::OPT_HealthCheckNodeState} + health_check_program (str): + Pathname of a script that is periodically executed as root user on + all compute nodes. + {slurm.conf::OPT_HealthCheckProgram} + inactive_limit (int): + The interval, in seconds, after which a non-responsive job + allocation command (e.g. `srun` or `salloc`) will result in the job + being terminated. + {slurm.conf::OPT_InactiveLimit} + interactive_step_options (str): + When `LaunchParameters=use_interactive_step` is enabled, launching + salloc will automatically start an srun process with + `interactive_step_options` to launch a terminal on a node in the job + allocation. + {slurm.conf::OPT_InteractiveStepOptions} + job_accounting_gather_type (str): + The job accounting gather plugin used to collect usage information + about Jobs. + {slurm.conf::OPT_JobAcctGatherType} + job_accounting_gather_frequency (dict[str, int]): + The job accounting and profiling sampling intervals. + {slurm.conf::OPT_JobAcctGatherFrequency} + job_accounting_gather_parameters (list[str]): + Arbitrary parameters for `job_accounting_gather_type` + {slurm.conf::OPT_JobAcctGatherParams} + job_completion_host (str): + Name of the machine hosting the job completion database. + {slurm.conf::OPT_JobCompHost} + job_completion_location (str): + Sets a string which has different meaning depending on + `job_completion_type` + {slurm.conf::OPT_JobCompLoc} + job_completion_parameters (list[str]): + Arbitrary text passed to the Job completion plugin. + {slurm.conf::OPT_JobCompParams} + job_completion_port (int): + The listening port of the job completion database server.
+ {slurm.conf::OPT_JobCompPort} + job_completion_type (str): + Job completion logging mechanism type + {slurm.conf::OPT_JobCompType} + job_completion_user (str): + User account used for accessing the job completion database. + {slurm.conf::OPT_JobCompUser} + job_container_type (str): + Plugin used for job isolation through Linux namespaces. + {slurm.conf::OPT_JobContainerType} + job_file_append (bool): + This option controls what to do if a job's output or error file + exists when the job is started. If `True`, then append to the + existing file. `False`, which is the default, means any existing + files are truncated. + {slurm.conf::OPT_JobFileAppend} + job_requeue (bool): + Whether jobs are requeuable by default + {slurm.conf::OPT_JobRequeue} + job_submit_plugins (list[str]): + Site specific list of plugins used for setting default job + parameters and/or logging events + {slurm.conf::OPT_JobSubmitPlugins} + kill_on_bad_exit (bool): + Whether a step will be terminated immediately if any task is + crashed or aborted. + {slurm.conf::OPT_KillOnBadExit} + kill_wait_time (int): + The interval, in seconds, given to a job's processes between the + `SIGTERM` and `SIGKILL` signals upon reaching its time limit. + {slurm.conf::OPT_KillWait} + launch_parameters (list[str]): + Options for the job launch plugin. + {slurm.conf::OPT_LaunchParameters} + licenses (dict[str, int]): + Licenses that can be allocated to jobs. + {slurm.conf::OPT_Licenses} + log_time_format (str): + Format of the timestamp in slurmctld and slurmd log-files. + {slurm.conf::OPT_LogTimeFormat} + mail_domain (str): + Domain name to qualify usernames if email address is not explicitly + given with the `--mail-user` option. + {slurm.conf::OPT_MailDomain} + mail_program (str): + Pathname to the program used to send emails per user request + {slurm.conf::OPT_MailProg} + max_array_size (int): + Maximum job array task index value allowed.
+ {slurm.conf::OPT_MaxArraySize} + max_batch_requeue (int): + Maximum number of times a batch job may be automatically requeued + before being marked as `JobHeldAdmin`. + {slurm.conf::OPT_MaxBatchRequeue} + max_dbd_msgs (int): + Maximum number of messages the Slurm controller queues before + starting to drop them when the slurmdbd is down. + {slurm.conf::OPT_MaxDBDMsgs} + max_job_count (int): + Maximum number of jobs slurmctld can have in memory at one time. + {slurm.conf::OPT_MaxJobCount} + max_job_id (int): + Highest job ID possible for Jobs that will be assigned + automatically on submission. + {slurm.conf::OPT_MaxJobId} + max_memory_per_cpu (int): + Maximum real memory size available per allocated CPU in Mebibytes. + {slurm.conf::OPT_MaxMemPerCPU} + max_memory_per_node (int): + Maximum real memory size available per allocated Node in Mebibytes. + {slurm.conf::OPT_MaxMemPerNode} + max_node_count (int): + Maximum count of nodes which may exist in the slurmctld. + {slurm.conf::OPT_MaxNodeCount} + max_step_count (int): + Maximum number of Steps that any Job can initiate. + {slurm.conf::OPT_MaxStepCount} + max_tasks_per_node (int): + Maximum number of tasks Slurm will allow for a job step to spawn on + a single node. + {slurm.conf::OPT_MaxTasksPerNode} + mcs_plugin (str): + Associate a security label to jobs, for resource sharing among jobs + with the same label. + {slurm.conf::OPT_MCSPlugin} + mcs_parameters (list[str]): + Parameters for the MCS Plugin. + {slurm.conf::OPT_MCSParameters} + min_job_age (int): + Minimum age (in seconds) of a completed Job before its record is + cleared from slurmctld's memory. + {slurm.conf::OPT_MinJobAge} + mpi_default (str): + Default type of MPI that will be used. + {slurm.conf::OPT_MpiDefault} + mpi_parameters (list[str]): + Parameters for MPI. + message_timeout (int): + Time permitted for a round-trip communication to complete in + seconds.
+ {slurm.conf::OPT_MessageTimeout} + next_job_id (int): + Next Job-ID that will be assigned. + node_features_plugins (list[str]): + Plugins to be used for support of node features which can change + through time. + {slurm.conf::OPT_NodeFeaturesPlugins} + over_time_limit (int): + Number of minutes by which a job can exceed its time limit before + being canceled. + {slurm.conf::OPT_OverTimeLimit} + plugin_dirs (list[str]): + List of paths where Slurm looks for plugins. + {slurm.conf::OPT_PluginDir} + plugin_stack_config (str): + Location of the config file for Slurm stackable plugins. + {slurm.conf::OPT_PlugStackConfig} + preempt_exempt_time (int): + Minimum run time for all jobs before they can be considered for + preemption. + {slurm.conf::OPT_PreemptExemptTime} + preempt_mode (str): + Mechanism used to preempt jobs or enable gang scheduling. + {slurm.conf::OPT_PreemptMode} + preempt_parameters (list[str]): + Options for the Preempt Plugin. + {slurm.conf::OPT_PreemptParameters} + preempt_type (str): + Plugin used to identify which jobs can be preempted. + {slurm.conf::OPT_PreemptType} + prep_parameters (list[str]): + Parameters passed to the PrEpPlugins. + {slurm.conf::OPT_PrEpParameters} + prep_plugins (list[str]): + List of PrEp Plugins to be used. + {slurm.conf::OPT_PrEpPlugins} + priority_decay_half_life (int): + Controls how long (in seconds) prior resource use is considered in + determining how over- or under-serviced an association is. + {slurm.conf::OPT_PriorityDecayHalfLife} + priority_calc_period (int): + Period (in minutes) in which the half-life decay will be + re-calculated. + {slurm.conf::OPT_PriorityCalcPeriod} + priority_favor_small (bool): + Whether small jobs should be given preferential scheduling + priority. + {slurm.conf::OPT_PriorityFavorSmall} + priority_flags (list[str]): + List of flags that modify priority behaviour.
+ {slurm.conf::OPT_PriorityFlags} + priority_max_age (int): + Job age that is needed before receiving the maximum age factor in + computing priority. + {slurm.conf::OPT_PriorityMaxAge} + priority_parameters (str): + Arbitrary string used by the `priority_type` plugin. + {slurm.conf::OPT_PriorityParameters} + priority_usage_reset_period (str): + At this interval the usage of associations will be reset to 0. + {slurm.conf::OPT_PriorityUsageResetPeriod} + priority_type (str): + Specifies the plugin to be used in establishing a job's scheduling + priority. + {slurm.conf::OPT_PriorityType} + priority_weight_age (int): + An integer value that sets the degree to which the queue wait time + component contributes to the job's priority. + {slurm.conf::OPT_PriorityWeightAge} + priority_weight_assoc (int): + An integer value that sets the degree to which the association + component contributes to the job's priority. + {slurm.conf::OPT_PriorityWeightAssoc} + priority_weight_fair_share (int): + An integer value that sets the degree to which the fair-share + component contributes to the job's priority. + {slurm.conf::OPT_PriorityWeightFairShare} + priority_weight_job_size (int): + An integer value that sets the degree to which the job size + component contributes to the job's priority. + {slurm.conf::OPT_PriorityWeightJobSize} + priority_weight_partition (int): + Partition factor used by priority/multifactor plugin in calculating + job priority. + {slurm.conf::OPT_PriorityWeightPartition} + priority_weight_qos (int): + An integer value that sets the degree to which the Quality Of + Service component contributes to the job's priority + {slurm.conf::OPT_PriorityWeightQOS} + priority_weight_tres (dict[str, int]): + TRES Types and weights that sets the degree that each TRES Type + contributes to the job's priority. + {slurm.conf::OPT_PriorityWeightTRES} + private_data (list[str]): + Defines what type of information is hidden from regular users. 
+ {slurm.conf::OPT_PrivateData} + proctrack_type (str): + Identifies the plugin to be used for process tracking on a job step + basis. + {slurm.conf::OPT_ProctrackType} + prolog (list[str]): + List of pathnames of programs for the slurmd to execute whenever + it is asked to run a job step from a new job allocation. + {slurm.conf::OPT_Prolog} + prolog_epilog_timeout (int): + The interval in seconds Slurm waits for Prolog and Epilog before + terminating them. + {slurm.conf::OPT_PrologEpilogTimeout} + prolog_slurmctld (list[str]): + List of pathnames of programs for the slurmctld daemon to execute + before granting a new job allocation. + {slurm.conf::OPT_PrologSlurmctld} + propagate_prio_process (int): + Controls the scheduling priority (nice value) of user spawned + tasks. + {slurm.conf::OPT_PropagatePrioProcess} + prolog_flags (list[str]): + Flags to control the Prolog behavior. + {slurm.conf::OPT_PrologFlags} + propagate_resource_limits (list[str]): + List of resource limit names that are propagated to the Job + environment. + {slurm.conf::OPT_PropagateResourceLimits} + propagate_resource_limits_except (list[str]): + List of resource limit names that are excluded from propagation to + the Job environment. + {slurm.conf::OPT_PropagateResourceLimitsExcept} + reboot_program (str): + Program to be executed on each compute node to reboot it. + {slurm.conf::OPT_RebootProgram} + reconfig_flags (list[str]): + List of flags to control various actions that may be taken when a + reconfigure command is issued (for example with `scontrol + reconfig`). + {slurm.conf::OPT_ReconfigFlags} + requeue_exit (str): + Enables automatic requeue for batch jobs which exit with the + specified values. + {slurm.conf::OPT_RequeueExit} + requeue_exit_hold (str): + Enables automatic requeue for batch jobs which exit with the + specified values, with these jobs being held until released + manually by the user.
+ {slurm.conf::OPT_RequeueExitHold} + resume_fail_program (str): + The program that will be executed when nodes fail to resume by + `resume_timeout`. + {slurm.conf::OPT_ResumeFailProgram} + resume_program (str): + Program that will be executed when a node in power save mode is + assigned work to perform. + {slurm.conf::OPT_ResumeProgram} + resume_rate (int): + Number of nodes per minute that will be restored from power save + mode to normal operation by `resume_program`. + {slurm.conf::OPT_ResumeRate} + resume_timeout (int): + Maximum time permitted (in seconds) between when a node resume + request is issued and when the node is actually available for use. + {slurm.conf::OPT_ResumeTimeout} + reservation_epilog (str): + Pathname of a program for the slurmctld to execute when a + reservation ends. + {slurm.conf::OPT_ResvEpilog} + reservation_over_run (int): + Describes how long (in minutes) a job already running in a + reservation should be permitted to execute after the end time of + the reservation has been reached + {slurm.conf::OPT_ResvOverRun} + reservation_prolog (str): + Pathname of a program for the slurmctld to execute when a + reservation begins. + {slurm.conf::OPT_ResvProlog} + return_to_service (int): + Controls when a `DOWN` node will be returned to service + {slurm.conf::OPT_ReturnToService} + scheduler_log_file (str): + Pathname of the scheduling event logging file. + {slurm.conf::OPT_SlurmSchedLogFile} + scheduler_logging_enabled (bool): + Whether scheduling event logging is enabled. + {slurm.conf::OPT_SlurmSchedLogLevel} + scheduler_parameters (list[str]): + List of options for the `scheduler_type` plugin. + {slurm.conf::OPT_SchedulerParameters} + scheduler_time_slice (int): + Number of seconds in each time slice when gang scheduling is + enabled. + {slurm.conf::OPT_SchedulerTimeSlice} + scheduler_type (str): + Identifies the type of scheduler to be used. + {slurm.conf::OPT_SchedulerType} + scron_parameters (list[str]): + Parameters for scron.
+ {slurm.conf::OPT_ScronParameters} + select_type (str): + Identifies the type of resource selection algorithm to be used. + {slurm.conf::OPT_SelectType} + select_type_parameters (list[str]): + Parameters passed to the `select_type` plugin. + {slurm.conf::OPT_SelectTypeParameters} + priority_site_factor_plugin (str): + Specifies an optional plugin to be used alongside + "priority/multifactor", which is meant to initially set and + continuously update the SiteFactor priority factor. + {slurm.conf::OPT_PrioritySiteFactorPlugin} + priority_site_factor_parameters (str): + Arbitrary string used by the PrioritySiteFactorPlugin plugin. + {slurm.conf::OPT_PrioritySiteFactorParameters} + slurm_conf_path (str): + Path of the current slurm.conf file used. + slurm_user_id (int): + UID of the `slurm_user_name` + slurm_user_name (str): + Name of the Slurm User + slurmd_user_id (int): + UID of the `slurmd_user_name` + slurmd_user_name (str): + Name of the User slurmd runs as. + slurmctld_log_level (str): + The level of detail provided in the `slurmctld` daemon's logs. + {slurm.conf::OPT_SlurmctldDebug} + slurmctld_log_file (str): + Pathname of a file into which the `slurmctld` daemon's logs are + written. + {slurm.conf::OPT_SlurmctldLogFile} + slurmctld_pid_file (str): + Pathname of a file into which the `slurmctld` daemon may write its + process id. + {slurm.conf::OPT_SlurmctldPidFile} + slurmctld_port (str): + Port number where `slurmctld` listens to for work. + Note that this can also be a port range. + {slurm.conf::OPT_SlurmctldPort} + slurmctld_primary_off_program (str): + This program is executed when a `slurmctld` daemon running as the + primary server becomes a backup server. + {slurm.conf::OPT_SlurmctldPrimaryOffProg} + slurmctld_primary_on_program (str): + This program is executed when a `slurmctld` daemon running as a + backup server becomes the primary server.
+ {slurm.conf::OPT_SlurmctldPrimaryOnProg} + slurmctld_syslog_level (str): + Level of detail that the `slurmctld` logs to the syslog. + {slurm.conf::OPT_SlurmctldSyslogDebug} + slurmctld_timeout (int): + The interval, in seconds, that the backup controller waits for the + primary controller to respond before assuming control. + {slurm.conf::OPT_SlurmctldTimeout} + slurmctld_parameters (list[str]): + Options set for the `slurmctld`. + {slurm.conf::OPT_SlurmctldParameters} + slurmd_log_level (str): + Level of detail `slurmd` is logging. + {slurm.conf::OPT_SlurmdDebug} + slurmd_log_file (str): + Pathname of the file where `slurmd` writes logs to. + {slurm.conf::OPT_SlurmdLogFile} + slurmd_parameters (list[str]): + Parameters for the `slurmd`. + {slurm.conf::OPT_SlurmdParameters} + slurmd_pid_file (str): + Pathname of a file into which the `slurmd` daemon may write its + process id. + {slurm.conf::OPT_SlurmdPidFile} + slurmd_port (int): + Port number where `slurmd` listens to for work. + {slurm.conf::OPT_SlurmdPort} + slurmd_spool_directory (str): + Pathname of a directory into which the `slurmd` daemon's state + information and batch job script information are written. + {slurm.conf::OPT_SlurmdSpoolDir} + slurmd_syslog_level (str): + Level of detail that the `slurmd` logs to the syslog. + {slurm.conf::OPT_SlurmdSyslogDebug} + slurmd_timeout (int): + The interval, in seconds, that `slurmctld` waits for `slurmd` to + respond before configuring that node's state to `DOWN`. + {slurm.conf::OPT_SlurmdTimeout} + srun_epilog (str): + Pathname of an executable to be run by `srun` following the + completion of a job step. + {slurm.conf::OPT_SrunEpilog} + srun_port_range (str): + Ports `srun` creates to communicate with the `slurmctld`, the + `slurmstepd` and to handle the application I/O. + {slurm.conf::OPT_SrunPortRange} + srun_prolog (str): + Pathname of an executable to be run by `srun` prior to the launch + of a job step. 
+ {slurm.conf::OPT_SrunProlog} + state_save_location (str): + Pathname of a directory where `slurmctld` saves its state. + {slurm.conf::OPT_StateSaveLocation} + suspend_exclude_nodes (str): + Specifies the nodes which are to not be placed in power save mode, + even if the node remains idle for an extended period of time. + {slurm.conf::OPT_SuspendExcNodes} + suspend_exclude_partitions (str): + Specifies the partitions whose nodes are to not be placed in power + save mode, even if the node remains idle for an extended period of + time. + {slurm.conf::OPT_SuspendExcParts} + suspend_exclude_states (list[str]): + Specifies node states that are not to be powered down + automatically. + {slurm.conf::OPT_SuspendExcStates} + suspend_program (str): + Program that will be executed when a node remains idle for an + extended period of time. + {slurm.conf::OPT_SuspendProgram} + suspend_rate (int): + Number of nodes per minute that are placed into power save mode. + {slurm.conf::OPT_SuspendRate} + suspend_time (int): + Nodes which remain idle or down for this number of seconds will be + placed into power save mode. + {slurm.conf::OPT_SuspendTime} + suspend_timeout (int): + Maximum time permitted (in seconds) between when a node suspend + request is issued and when the node is shutdown. + {slurm.conf::OPT_SuspendTimeout} + switch_type (str): + Identifies the type of switch or interconnect used for application + communications. + {slurm.conf::OPT_SwitchType} + switch_parameters (list[str]): + Optional parameters for the switch plugin. + {slurm.conf::OPT_SwitchParameters} + task_epilog (str): + Pathname of a program to be executed as the slurm job's owner after + termination of each task. + {slurm.conf::OPT_TaskEpilog} + task_plugin (str): + Identifies the type of task launch plugin, typically used to + provide resource management within a node. + {slurm.conf::OPT_TaskPlugin} + task_plugin_parameters (list[str]): + Optional Parameters for `task_plugin`. 
+ {slurm.conf::OPT_TaskPluginParam} + task_prolog (str): + Pathname of a program to be executed as the slurm job's owner prior + to initiation of each task. + {slurm.conf::OPT_TaskProlog} + tls_parameters (list[str]): + Parameters for `tls_type`. + tls_type (str): + TLS Plugin used. + tcp_timeout (int): + Time permitted for TCP connection to be established. + {slurm.conf::OPT_TCPTimeout} + temporary_filesystem (str): + Pathname of the file system available to user jobs for temporary + storage. + {slurm.conf::OPT_TmpFS} + topology_parameters (list[str]): + List of network topology options + {slurm.conf::OPT_TopologyParam} + topology_plugin (str): + Identifies the plugin to be used for determining the network + topology and optimizing job allocations to minimize network + contention. + {slurm.conf::OPT_TopologyPlugin} + tree_width (int): + Specifies the width of the virtual network tree `slurmd` uses for + communication. + {slurm.conf::OPT_TreeWidth} + unkillable_step_program (str): + Program that will be executed when the processes in a job step are + determined unkillable. + {slurm.conf::OPT_UnkillableStepProgram} + unkillable_step_timeout (int): + The length of time, in seconds, that Slurm will wait before + deciding that processes in a job step are unkillable. + {slurm.conf::OPT_UnkillableStepTimeout} + track_wckey (bool): + Whether WCKeys are tracked or not. + {slurm.conf::OPT_TrackWCKey} + use_pam (bool): + Whether PAM (Pluggable Authentication Modules for Linux) will be + enabled or not. + {slurm.conf::OPT_UsePAM} + version (str): + Version as returned by the `slurmctld`. + virtual_memory_size_factor (int): + Specifies the job's or job step's virtual memory limit as a + percentage of its real memory limit. + {slurm.conf::OPT_VSizeFactor} + default_job_wait_time (int): + Specifies how many seconds the srun command should by default wait + after the first task terminates before terminating all remaining + tasks. 
+ {slurm.conf::OPT_WaitTime} + x11_parameters (list[str]): + Parameters for Slurm's built-in X11 forwarding implementation. + {slurm.conf::OPT_X11Parameters} + """ cdef slurm_conf_t *ptr cdef public: @@ -109,6 +852,7 @@ cdef class MPIConfig: @staticmethod cdef MPIConfig from_ptr(void *ptr) + cdef class CgroupConfig: """Slurm Cgroup Config (cgroup.conf)""" @@ -137,9 +881,86 @@ cdef class CgroupConfig: cdef CgroupConfig from_ptr(void *ptr) +# Documentation for the attributes in the AccountingGatherConfig class have +# been largely taken from the official acct_gather.conf overview at: +# https://slurm.schedmd.com/acct_gather.conf.html +# +# Therefore, the following Copyright notices that acct_gather.conf has (see +# https://slurm.schedmd.com/acct_gather.conf.html#SECTION_COPYING), are also +# listed here: +# +# Copyright (C) 2012-2013 Bull. +# Copyright (C) 2012-2022 SchedMD LLC. cdef class AccountingGatherConfig: - """Slurm Accounting Gather Config (acct_gather.conf)""" + """Slurm Accounting Gather Config (acct_gather.conf) + Attributes: + energy_ipmi_frequency (int): + Number of seconds between BMC access samples or XCC samples, + depending on the plugin used. + {acct_gather.conf::OPT_EnergyIPMIFrequency} + energy_ipmi_calc_adjustment (bool): + When `True`, the consumption between the last BMC access sample and + a step consumption update is approximated to get more accurate task + consumption. + {acct_gather.conf::OPT_EnergyIPMICalcAdjustment} + energy_ipmi_power_sensors (str): + IDs of the sensors to be used. + {acct_gather.conf::OPT_EnergyIPMIPowerSensors} + energy_ipmi_user_name (str): + BMC Username + {acct_gather.conf::OPT_EnergyIPMIUsername} + energy_ipmi_password (str): + BMC Password + {acct_gather.conf::OPT_EnergyIPMIPassword} + energy_ipmi_timeout (int): + Timeout, in seconds, for initializing the IPMI XCC context for a + new gathering thread. Default is 10 seconds.
+ {acct_gather.conf::OPT_EnergyIPMITimeout} + profile_hdf5_dir (str): + Path to the shared folder into which the `acct_gather_profile` + plugin will write detailed data. + {acct_gather.conf::OPT_ProfileHDF5Dir} + profile_hdf5_default (list[str]): + List of data types to be collected for each job submission. + {acct_gather.conf::OPT_ProfileHDF5Default} + profile_influxdb_database (str): + InfluxDB v1.x database name where profiling information is to be + written. InfluxDB v2.x bucket name where profiling information is + to be written. + {acct_gather.conf::OPT_ProfileInfluxDBDatabase} + profile_influxdb_default (list[str]): + List of data types to be collected for each job submission. + {acct_gather.conf::OPT_ProfileInfluxDBDefault} + profile_influxdb_host (str): + The hostname of the machine where the InfluxDB instance is executed + and the port used by the HTTP API. + {acct_gather.conf::OPT_ProfileInfluxDBHost} + profile_influxdb_password (str): + Password for `profile_influxdb_user` + {acct_gather.conf::OPT_ProfileInfluxDBPass} + profile_influxdb_rtpolicy (str): + The InfluxDB v1.x retention policy name for the database configured + in ProfileInfluxDBDatabase option. The InfluxDB v2.x retention + policy bucket name for the database configured in + ProfileInfluxDBDatabase option. + {acct_gather.conf::OPT_ProfileInfluxDBRTPolicy} + profile_influxdb_user (str): + InfluxDB username that should be used to gain access to the + database configured in `profile_influxdb_database`. + {acct_gather.conf::OPT_ProfileInfluxDBUser} + profile_influxdb_timeout (int): + The maximum time in seconds that an HTTP query to the InfluxDB + server can take. + {acct_gather.conf::OPT_ProfileInfluxDBTimeout} + infiniband_ofed_port (int): + Represents the port number of the local Infiniband card that we are + willing to monitor. + {acct_gather.conf::OPT_InfinibandOFEDPort} + sysfs_interfaces (list[str]): + List of interface names to collect statistics from.
+ {acct_gather.conf::OPT_SysfsInterfaces} + """ cdef public: energy_ipmi_frequency energy_ipmi_calc_adjustment From 7724b57176f68f5242745ab2dda923a713cf48d6 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Tue, 14 Jan 2025 15:51:00 +0100 Subject: [PATCH 12/31] slurmctld.Config: some name changes and further considerations --- pyslurm/core/slurmctld.pyx | 118 +++++++++++++++++++------------------ 1 file changed, 60 insertions(+), 58 deletions(-) diff --git a/pyslurm/core/slurmctld.pyx b/pyslurm/core/slurmctld.pyx index cd743422..2ddf4db9 100644 --- a/pyslurm/core/slurmctld.pyx +++ b/pyslurm/core/slurmctld.pyx @@ -331,7 +331,7 @@ def is_scheduler_logging_enabled(): >>> print(slurmctld.is_scheduler_logging_enabled()) False """ - return Config.load().scheduler_logging + return Config.load().scheduler_logging_enabled def set_fair_share_dampening_factor(factor): @@ -577,10 +577,6 @@ cdef class Config: out["mpi_config"] = self.mpi_config.to_dict() return out - @property - def accounting_storage_tres(self): - return cstr.to_list(self.ptr.accounting_storage_tres) - @property def accounting_storage_enforce(self): cdef char tmp[128] @@ -608,14 +604,14 @@ cdef class Config: def accounting_storage_parameters(self): return cstr.to_dict(self.ptr.accounting_storage_params) - @property - def accounting_storage_password(self): - return cstr.to_unicode(self.ptr.accounting_storage_pass) - @property def accounting_storage_port(self): return u16_parse(self.ptr.accounting_storage_port) + @property + def accounting_storage_tres(self): + return cstr.to_list(self.ptr.accounting_storage_tres) + @property def accounting_storage_type(self): return cstr.to_unicode(self.ptr.accounting_storage_type) @@ -664,9 +660,9 @@ cdef class Config: return cstr.to_list(self.ptr.authinfo) @property - def auth_alt_params(self): - # TODO: maybe dict? 
- return cstr.to_list(self.ptr.authalt_params) + def auth_alt_parameters(self): + return cstr.to_dict(self.ptr.authalt_params, delim1=",", + delim2="=", def_value=True) @property def auth_type(self): @@ -683,14 +679,15 @@ cdef class Config: @property def bcast_parameters(self): - return cstr.to_list(self.ptr.bcast_parameters) + return cstr.to_dict(self.ptr.bcast_parameters, delim1=",", + delim2="=", def_value=True) @property def burst_buffer_type(self): return cstr.to_unicode(self.ptr.bb_type) @property - def boot_time(self): + def slurmctld_boot_time(self): return _raw_time(self.ptr.boot_time) @property @@ -711,7 +708,9 @@ cdef class Config: @property def communication_parameters(self): - return cstr.to_list(self.ptr.comm_params) + # TODO: check again + return cstr.to_dict(self.ptr.comm_params, delim1=",", + delim2="=", def_value=True) @property def complete_wait_time(self): @@ -719,13 +718,7 @@ cdef class Config: return u16_parse(self.ptr.complete_wait) @property - def disable_root_jobs(self): - if self.ptr.conf_flags & slurm.CONF_FLAG_DRJ: - return True - return False - - @property - def default_cpu_frequency(self): + def default_cpu_frequency_governor(self): return cpu_freq_int_to_str(self.ptr.cpu_freq_def) @property @@ -752,10 +745,20 @@ cdef class Config: def default_memory_per_node(self): return _get_memory(self.ptr.def_mem_per_cpu, per_cpu=False) + # TODO: DefCpuPerGPU + # TODO: DefMemPerGPU + @property def dependency_parameters(self): + # TODO: check format again return cstr.to_list(self.ptr.dependency_params) + @property + def disable_root_jobs(self): + if self.ptr.conf_flags & slurm.CONF_FLAG_DRJ: + return True + return False + @property def eio_timeout(self): # seconds @@ -780,6 +783,10 @@ cdef class Config: return cstr.to_list_with_count(self.ptr.epilog_slurmctld, self.ptr.epilog_slurmctld_cnt) + @property + def fair_share_dampening_factor(self): + return u16_parse(self.ptr.fs_dampening_factor) + @property def federation_parameters(self): return
cstr.to_list(self.ptr.fed_params) @@ -788,12 +795,6 @@ cdef class Config: def first_job_id(self): return u32_parse(self.ptr.first_job_id) - @property - def fair_share_dampening_factor(self): - return u16_parse(self.ptr.fs_dampening_factor) - - # getnameinfo_cache_timeout - @property def get_environment_timeout(self): return u16_parse(self.ptr.get_env_timeout) @@ -802,14 +803,14 @@ cdef class Config: def gres_types(self): return cstr.to_list(self.ptr.gres_plugins) - @property - def group_update_time(self): - return u16_parse(self.ptr.group_time) - @property def group_update_force(self): return u16_parse_bool(self.ptr.group_force) + @property + def group_update_time(self): + return u16_parse(self.ptr.group_time) + @property def default_gpu_frequency(self): return cstr.to_unicode(self.ptr.gpu_freq_def) @@ -847,14 +848,14 @@ cdef class Config: def interactive_step_options(self): return cstr.to_unicode(self.ptr.interactive_step_opts) - @property - def job_accounting_gather_frequency(self): - return cstr.to_dict(self.ptr.job_acct_gather_freq) - @property def job_accounting_gather_type(self): return cstr.to_unicode(self.ptr.job_acct_gather_type) + @property + def job_accounting_gather_frequency(self): + return cstr.to_dict(self.ptr.job_acct_gather_freq) + @property def job_accounting_gather_parameters(self): return cstr.to_list(self.ptr.job_acct_gather_params) @@ -869,6 +870,7 @@ cdef class Config: @property def job_completion_parameters(self): + # TODO: maybe dict? 
return cstr.to_list(self.ptr.job_comp_params) @property @@ -906,16 +908,12 @@ cdef class Config: def job_submit_plugins(self): return cstr.to_list(self.ptr.job_submit_plugins) - @property - def keepalive_interval(self): - return u32_parse(self.ptr.keepalive_interval) - @property def kill_on_bad_exit(self): return u16_parse_bool(self.ptr.kill_on_bad_exit) @property - def kill_wait(self): + def kill_wait_time(self): # seconds return u16_parse(self.ptr.kill_wait) @@ -998,6 +996,7 @@ cdef class Config: @property def mpi_parameters(self): + # TODO: check format again return cstr.to_list(self.ptr.mpi_params) @property @@ -1036,6 +1035,7 @@ cdef class Config: @property def preempt_parameters(self): + # TODO: check format again return cstr.to_list(self.ptr.preempt_params) @property @@ -1057,7 +1057,7 @@ cdef class Config: @property def priority_calc_period(self): - # seconds + # TODO: seconds or minutes? return u32_parse(self.ptr.priority_calc_period) @property @@ -1070,7 +1070,7 @@ cdef class Config: @property def priortiy_max_age(self): - # seconds? + # TODO: seconds or minutes? 
return u32_parse(self.ptr.priority_max_age) @property @@ -1087,27 +1087,27 @@ cdef class Config: @property def priority_weight_age(self): - return u32_parse(self.ptr.priority_weight_age) + return u32_parse(self.ptr.priority_weight_age, zero_is_noval=False) @property def priority_weight_assoc(self): - return u32_parse(self.ptr.priority_weight_assoc) + return u32_parse(self.ptr.priority_weight_assoc, zero_is_noval=False) @property def priority_weight_fair_share(self): - return u32_parse(self.ptr.priority_weight_fs) + return u32_parse(self.ptr.priority_weight_fs, zero_is_noval=False) @property def priority_weight_job_size(self): - return u32_parse(self.ptr.priority_weight_js) + return u32_parse(self.ptr.priority_weight_js, zero_is_noval=False) @property def priority_weight_partition(self): - return u32_parse(self.ptr.priority_weight_part) + return u32_parse(self.ptr.priority_weight_part, zero_is_noval=False) @property def priority_weight_qos(self): - return u32_parse(self.ptr.priority_weight_qos) + return u32_parse(self.ptr.priority_weight_qos, zero_is_noval=False) @property def priority_weight_tres(self): @@ -1178,12 +1178,10 @@ cdef class Config: @property def resume_rate(self): - # minutes? 
return u16_parse(self.ptr.resume_rate) @property def resume_timeout(self): - # seconds return u16_parse(self.ptr.resume_timeout) @property @@ -1192,8 +1190,7 @@ cdef class Config: @property def reservation_over_run(self): - # minutes - return u16_parse(self.ptr.resv_over_run) + return u16_parse(self.ptr.resv_over_run, zero_is_noval=False) @property def reservation_prolog(self): @@ -1208,11 +1205,13 @@ cdef class Config: return cstr.to_unicode(self.ptr.sched_logfile) @property - def scheduler_logging(self): + def scheduler_logging_enabled(self): + # TODO: check again return u16_parse_bool(self.ptr.sched_log_level) @property def scheduler_parameters(self): + # TODO: check format again return cstr.to_list(self.ptr.sched_params) @property @@ -1265,8 +1264,6 @@ cdef class Config: def slurmd_user_name(self): return cstr.to_unicode(self.ptr.slurmd_user_name) - # TODO: char *slurmctld_addr - @property def slurmctld_log_level(self): return _log_level_int_to_str(self.ptr.slurmctld_debug) @@ -1309,6 +1306,7 @@ cdef class Config: @property def slurmctld_parameters(self): + # TODO: check format again return cstr.to_list(self.ptr.slurmctld_params) @property @@ -1321,6 +1319,7 @@ cdef class Config: @property def slurmd_parameters(self): + # TODO: Check again return cstr.to_list(self.ptr.slurmd_params) @property @@ -1336,7 +1335,7 @@ cdef class Config: return cstr.to_unicode(self.ptr.slurmd_spooldir) @property - def slurmd_syslog_debug_level(self): + def slurmd_syslog_level(self): return _log_level_int_to_str(self.ptr.slurmd_syslog_debug) @property @@ -1398,6 +1397,7 @@ cdef class Config: @property def switch_parameters(self): + # TODO: Check format again return cstr.to_list(self.ptr.switch_param) @property @@ -1440,6 +1440,7 @@ cdef class Config: @property def topology_parameters(self): + # TODO: check format again return cstr.to_list(self.ptr.topology_param) @property @@ -1480,11 +1481,12 @@ cdef class Config: @property def default_job_wait_time(self): + # TODO: reconsider name 
return u16_parse(self.ptr.wait_time) @property def x11_parameters(self): - return cstr.to_unicode(self.ptr.x11_params) + return cstr.to_list(self.ptr.x11_params) def _str_to_bool(val, true_str, false_str): From 88171463c315a2dcc117802cd1cee01629d4a57f Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Fri, 17 Jan 2025 16:13:38 +0100 Subject: [PATCH 13/31] refactor the slurmctld stuff into seperate modules --- pyslurm/core/slurmctld/__init__.pxd | 4 + pyslurm/core/slurmctld/__init__.py | 27 + pyslurm/core/slurmctld/base.pxd | 50 ++ pyslurm/core/slurmctld/base.pyx | 383 ++++++++++++ .../{slurmctld.pxd => slurmctld/config.pxd} | 174 +++++- .../{slurmctld.pyx => slurmctld/config.pyx} | 574 +++--------------- 6 files changed, 702 insertions(+), 510 deletions(-) create mode 100644 pyslurm/core/slurmctld/__init__.pxd create mode 100644 pyslurm/core/slurmctld/__init__.py create mode 100644 pyslurm/core/slurmctld/base.pxd create mode 100644 pyslurm/core/slurmctld/base.pyx rename pyslurm/core/{slurmctld.pxd => slurmctld/config.pxd} (86%) rename pyslurm/core/{slurmctld.pyx => slurmctld/config.pyx} (72%) diff --git a/pyslurm/core/slurmctld/__init__.pxd b/pyslurm/core/slurmctld/__init__.pxd new file mode 100644 index 00000000..edc6a267 --- /dev/null +++ b/pyslurm/core/slurmctld/__init__.pxd @@ -0,0 +1,4 @@ +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from .config cimport Config diff --git a/pyslurm/core/slurmctld/__init__.py b/pyslurm/core/slurmctld/__init__.py new file mode 100644 index 00000000..f8130aab --- /dev/null +++ b/pyslurm/core/slurmctld/__init__.py @@ -0,0 +1,27 @@ +from .config import ( + Config, + MPIConfig, + AccountingGatherConfig, + CgroupConfig, +) +from .base import ( + PingResponse, + ShutdownMode, + ping, + ping_primary, + ping_backup, + ping_all, + shutdown, + reconfigure, + takeover, + add_debug_flags, + remove_debug_flags, + clear_debug_flags, + get_debug_flags, + set_log_level, + get_log_level, + 
enable_scheduler_logging, + is_scheduler_logging_enabled, + set_fair_share_dampening_factor, + get_fair_share_dampening_factor, +) diff --git a/pyslurm/core/slurmctld/base.pxd b/pyslurm/core/slurmctld/base.pxd new file mode 100644 index 00000000..51a6de64 --- /dev/null +++ b/pyslurm/core/slurmctld/base.pxd @@ -0,0 +1,50 @@ +######################################################################### +# slurmctld/base.pxd - pyslurm slurmctld api functions +######################################################################### +# Copyright (C) 2025 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from pyslurm cimport slurm +from pyslurm.slurm cimport ( + slurm_conf_t, + slurm_reconfigure, + slurm_shutdown, + slurm_ping, + slurm_takeover, + slurm_set_debugflags, + slurm_set_debug_level, + slurm_set_schedlog_level, + slurm_set_fs_dampeningfactor, +) +from libc.stdint cimport uint16_t, uint64_t +from pyslurm.utils.uint cimport u16_parse +from pyslurm.utils cimport cstr + + +cdef class PingResponse: + """Slurm Controller Ping response information""" + + cdef public: + is_primary + is_responding + index + hostname + latency diff --git a/pyslurm/core/slurmctld/base.pyx b/pyslurm/core/slurmctld/base.pyx new file mode 100644 index 00000000..48425bd8 --- /dev/null +++ b/pyslurm/core/slurmctld/base.pyx @@ -0,0 +1,383 @@ +######################################################################### +# slurmctld/base.pyx - pyslurm slurmctld api functions +######################################################################### +# Copyright (C) 2025 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from pyslurm.core.error import verify_rpc, RPCError +from pyslurm.utils.helpers import instance_to_dict +from pyslurm.utils import cstr +from typing import Union +import time +from enum import IntEnum +from .config import Config +from pyslurm.utils.uint import u16_parse + + +class ShutdownMode(IntEnum): + """Mode of operation for shutdown action""" + ALL = 0 + CORE_FILE = 1 + CONTROLLER_ONLY = 2 + + +cdef class PingResponse: + + def to_dict(self): + """Slurmctld ping response formatted as dictionary. + + Returns: + (dict): Ping response as a dict + + Examples: + >>> from pyslurm import slurmctld + >>> ctld_primary = slurmctld.ping(0) + >>> primary_dict = ctld_primary.to_dict() + """ + return instance_to_dict(self) + + +def ping(index): + """Ping a Slurm controller + + Returns: + (pyslurm.slurmctld.PingResponse): a ping response + + Examples: + >>> from pyslurm import slurmctld + >>> resp = slurmctld.ping(0) + >>> print(resp.hostname, resp.latency) + slurmctl 1.246 + """ + t0 = time.perf_counter() + rc = slurm_ping(index) + t1 = time.perf_counter() + + verify_rpc(rc) + ctl_cnt = slurm.slurm_conf.control_cnt + + if index >= ctl_cnt: + raise RPCError(msg="Invalid Index specified.") + + info = PingResponse() + info.is_primary = index == 0 + info.is_responding = not rc + info.index = index + info.hostname = cstr.to_unicode(slurm.slurm_conf.control_machine[index]) + info.latency = round((t1 - t0) * 1000, 3) + + return info + + +def ping_primary(): + """Ping the primary Slurm Controller. + + See `ping()` for more information and examples.
+ + Returns: + (pyslurm.slurmctld.PingResponse): a ping response + """ + return ping(1) + + +def ping_all(): + """Ping all Slurm Controllers. + + Returns: + (list[pyslurm.slurmctld.PingResponse]): a list of ping responses + + Raises: + (pyslurm.RPCError): When the ping was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> resps = slurmctld.ping_all() + >>> for resp in resps: + ... print(resp.hostname, resp.latency) + ... + slurmctl 1.246 + slurmctlbackup 1.373 + """ + cdef list out = [] + + ctl_cnt = slurm.slurm_conf.control_cnt + for i in range(ctl_cnt): + out.append(ping(i)) + + return out + + +def shutdown(mode: Union[ShutdownMode, int]): + """Shutdown Slurm Controller or all Daemons + + Args: + mode: + Whether only the Slurm controller should be shut down, or also all other + slurmd daemons. + + Raises: + (pyslurm.RPCError): When shutting down the daemons was not successful. + """ + verify_rpc(slurm_shutdown(int(mode))) + + +def reconfigure(): + """Trigger Slurm Controller to reload the Config + + Raises: + (pyslurm.RPCError): When reconfiguring was not successful. + """ + verify_rpc(slurm_reconfigure()) + + +def takeover(index = 1): + """Let a Backup Slurm Controller take over as the Primary. + + Args: + index (int, optional = 1): + Index of the Backup Controller that should take over. By default, + the `index` is `1`, meaning the next Controller configured after + the Primary in slurm.conf (second `SlurmctldHost` entry) will be + asked to take over operation. + + If you have more than one backup controller configured, you can for + example also pass `2` as the index. + + Raises: + (pyslurm.RPCError): When the takeover was not successful. + """ + verify_rpc(slurm_takeover(index)) + + +def add_debug_flags(flags): + """Add DebugFlags to slurmctld + + Args: + flags (list): + For an available list of possible values, please check the + `slurm.conf` documentation under `DebugFlags`.
+ + Raises: + (pyslurm.RPCError): When setting the debug flags was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.add_debug_flags(["CpuFrequency"]) + """ + if not flags: + return + + data = _debug_flags_str_to_int(flags) + if not data: + raise RPCError(msg="Invalid Debug Flags specified.") + + verify_rpc(slurm_set_debugflags(data, 0)) + + +def remove_debug_flags(flags): + """Remove DebugFlags from slurmctld. + + Args: + flags (list): + For an available list of possible values, please check the + `slurm.conf` documentation under `DebugFlags`. + + Raises: + (pyslurm.RPCError): When removing the debug flags was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.remove_debug_flags(["CpuFrequency"]) + """ + if not flags: + return + + data = _debug_flags_str_to_int(flags) + if not data: + raise RPCError(msg="Invalid Debug Flags specified.") + + verify_rpc(slurm_set_debugflags(0, data)) + + +def clear_debug_flags(): + """Remove all currently set debug flags from slurmctld. + + Raises: + (pyslurm.RPCError): When removing the debug flags was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.clear_debug_flags() + """ + current_flags = get_debug_flags() + if not current_flags: + return + + data = _debug_flags_str_to_int(current_flags) + verify_rpc(slurm_set_debugflags(0, data)) + + +def get_debug_flags(): + """Get the current list of debug flags for the slurmctld. + + Raises: + (pyslurm.RPCError): When getting the debug flags was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> flags = slurmctld.get_debug_flags() + >>> print(flags) + ['CpuFrequency', 'Backfill'] + """ + return Config.load().debug_flags + + +def set_log_level(level): + """Set the logging level for slurmctld. + + Args: + level (str): + For an available list of possible values, please check the + `slurm.conf` documentation under `SlurmctldDebug`. 
+ + Raises: + (pyslurm.RPCError): When setting the log level was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.set_log_level("quiet") + """ + data = _log_level_str_to_int(level) + verify_rpc(slurm_set_debug_level(data)) + + +def get_log_level(): + """Get the current log level for the slurmctld. + + Raises: + (pyslurm.RPCError): When getting the log level was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> level = slurmctld.get_log_level() + >>> print(level) + quiet + """ + return Config.load().slurmctld_log_level + + +def enable_scheduler_logging(): + """Enable scheduler logging for slurmctld. + + Raises: + (pyslurm.RPCError): When enabling scheduler logging was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.enable_scheduler_logging() + """ + verify_rpc(slurm_set_schedlog_level(1)) + + +def is_scheduler_logging_enabled(): + """Check whether scheduler logging is enabled for slurmctld. + + Returns: + (bool): Whether scheduler logging is enabled or not. + + Raises: + (pyslurm.RPCError): When getting the scheduler logging was not + successful. + + Examples: + >>> from pyslurm import slurmctld + >>> print(slurmctld.is_scheduler_logging_enabled()) + False + """ + return Config.load().scheduler_logging_enabled + + +def set_fair_share_dampening_factor(factor): + """Set the FairShare Dampening factor. + + Args: + factor (int): + The factor to set. A minimum value of `1`, and a maximum value of + `65535` are allowed. + + Raises: + (pyslurm.RPCError): When setting the factor was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.set_fair_share_dampening_factor(100) + """ + max_value = (2 ** 16) - 1 + if not factor or factor >= max_value: + raise RPCError(msg=f"Invalid Dampening factor: {factor}. 
" + f"Factor must be between 0 and {max_value}.") + + verify_rpc(slurm_set_fs_dampeningfactor(factor)) + + +def get_fair_share_dampening_factor(): + """Get the currently set FairShare Dampening factor. + + Raises: + (pyslurm.RPCError): When getting the factor was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> factor = slurmctld.get_fair_share_dampening_factor() + >>> print(factor) + 100 + """ + return Config.load().fair_share_dampening_factor + + +def _debug_flags_str_to_int(flags): + cdef: + uint64_t flags_num = 0 + char *flags_str = NULL + + flags_str = cstr.from_unicode(cstr.list_to_str(flags)) + slurm.debug_str2flags(flags_str, &flags_num) + return flags_num + + +def _log_level_str_to_int(level): + cdef uint16_t data = slurm.log_string2num(str(level)) + if u16_parse(data, zero_is_noval=False) is None: + raise RPCError(msg=f"Invalid Log level: {level}.") + + return data + diff --git a/pyslurm/core/slurmctld.pxd b/pyslurm/core/slurmctld/config.pxd similarity index 86% rename from pyslurm/core/slurmctld.pxd rename to pyslurm/core/slurmctld/config.pxd index 7e011700..1277cd65 100644 --- a/pyslurm/core/slurmctld.pxd +++ b/pyslurm/core/slurmctld/config.pxd @@ -1,5 +1,5 @@ ######################################################################### -# slurmctld.pxd - pyslurm slurmctld api +# slurmctld/config.pxd - pyslurm slurmctld config api ######################################################################### # Copyright (C) 2025 Toni Harzendorf # @@ -34,18 +34,7 @@ from pyslurm.slurm cimport ( slurm_accounting_enforce_string, slurm_sprint_cpu_bind_type, slurm_ctl_conf_2_key_pairs, - slurm_reconfigure, - slurm_shutdown, - slurm_ping, - slurm_takeover, - slurm_set_debugflags, - slurm_set_debug_level, - slurm_set_schedlog_level, - slurm_set_fs_dampeningfactor, - ping_all_controllers, - controller_ping_t, cpu_bind_type_t, - try_xmalloc, list_t, xfree, ) @@ -72,17 +61,6 @@ ctypedef struct config_key_pair_t: char *value -cdef class PingResponse: 
- """Slurm Controller Ping response information""" - - cdef public: - is_primary - is_responding - index - hostname - latency - - # Documentation for the attributes in the Config class have been largely taken # from the official slurm.conf overview at: # https://slurm.schedmd.com/slurm.conf.html @@ -93,8 +71,8 @@ cdef class PingResponse: # # Copyright (C) 2002-2007 The Regents of the University of California. Produced # at Lawrence Livermore National Laboratory (cf, pyslurm/slurm/SLURM_DISCLAIMER). -# Copyright (C) 2008-2010 Lawrence Livermore -# National Security. Copyright (C) 2010-2022 SchedMD LLC. +# Copyright (C) 2008-2010 Lawrence Livermore National Security. +# Copyright (C) 2010-2022 SchedMD LLC. cdef class Config: """The Slurm Configuration. @@ -676,8 +654,12 @@ cdef class Config: The interval, in seconds, that the backup controller waits for the primary controller to respond before assuming control. {slurm.conf::OPT_SlurmctldTimeout} - slurmctld_parameters (list[str]): + slurmctld_parameters (dict[str, Union[str, int, bool]]): Options set for the `slurmctld`. + + If a value in this dict is `True`, it means this parameter does not + have any additional options specified, and is just an "enabled" + option. {slurm.conf::OPT_SlurmctldParameters} slurmd_log_level (str): Level of detail `slurmd` is logging. @@ -832,9 +814,61 @@ cdef class Config: MPIConfig mpi_config +# Documentation for the attributes in the MPIConfig class have +# been largely taken from the official mpi.conf overview at: +# https://slurm.schedmd.com/mpi.conf.html +# +# Therefore, the following Copyright notices that mpi.conf has (see +# https://slurm.schedmd.com/mpi.conf.html#SECTION_COPYING), are also +# listed here: +# +# Copyright (C) 2022 SchedMD LLC. cdef class MPIConfig: - """Slurm MPI Config (mpi.conf)""" + """Slurm MPI Config (mpi.conf) + Attributes: + pmix_cli_tmp_dir_base (str): + Directory to have PMIx use for temporary files. 
+ {mpi.conf::OPT_PMIxCliTmpDirBase} + pmix_coll_fence (str): + Defines the type of fence to use for collecting inter-node data. + {mpi.conf::OPT_PMIxCollFence} + pmix_debug (bool): + Whether debug logging for the PMIx Plugin is enabled or not. + {mpi.conf::OPT_PMIxDebug} + pmix_direct_conn (bool): + Whether direct launching of tasks is enabled or not. + {mpi.conf::OPT_PMIxDirectConn} + pmix_direct_conn_early (bool): + Whether early connection to a parent node are allowed or not. + {mpi.conf::OPT_PMIxDirectConnEarly} + pmix_direct_conn_ucx (bool): + Whether PMIx is allowed to use UCX for communication. + {mpi.conf::OPT_PMIxDirectConnUCX} + pmix_direct_same_arch (bool): + Whether additional communication optimizations are enabled when + `pmix_direct_conn` is also set to `True`, also assuming all nodes + of the job have the same architecture. + {mpi.conf::OPT_PMIxDirectSameArch} + pmix_environment (dict[str, Union[str, int]): + Environment variables to bet set in the Job environment, used by + PMIx. + {mpi.conf::OPT_PMIxEnv} + pmix_fence_barrier (bool): + Whether to fence inter-node communication for data collection. + {mpi.conf::OPT_PMIxFenceBarrier} + pmix_net_devices_ucx (str): + Type of network device to use for communication. + {mpi.conf::OPT_PMIxNetDevicesUCX} + pmix_timeout (int): + The maximum time (in seconds) allowed for communication between + hosts to take place. + {mpi.conf::OPT_PMIxTimeout} + pmix_tls_ucx (list[str]): + List of values for the UCX_TLS variable which restrict the + transports to use. 
+ {mpi.conf::OPT_PMIxTlsUCX} + """ cdef public: pmix_cli_tmp_dir_base pmix_coll_fence @@ -853,9 +887,93 @@ cdef class MPIConfig: cdef MPIConfig from_ptr(void *ptr) +# Documentation for the attributes in the CgroupConfig class have +# been largely taken from the official cgroup.conf overview at: +# https://slurm.schedmd.com/cgroup.conf.html +# +# Therefore, the following Copyright notices that cgroup.conf has (see +# https://slurm.schedmd.com/cgroup.conf.html#SECTION_COPYING), are also +# listed here: +# +# Copyright (C) 2010-2012 Lawrence Livermore National Security. (cf, +# pyslurm/slurm/SLURM_DISCLAIMER). +# Copyright (C) 2010-2022 SchedMD LLC. cdef class CgroupConfig: - """Slurm Cgroup Config (cgroup.conf)""" + """Slurm Cgroup Config (cgroup.conf) + Attributes: + mountpoint (str): + Specifies the PATH under which cgroup controllers should be + mounted. + {cgroup.conf::OPT_CgroupMountpoint) + plugin (str): + Specifies the plugin to be used when interacting with the cgroup + subsystem. + {cgroup.conf::OPT_CgroupPlugin) + systemd_timeout (int): + Maximum time (in milliseconds) that Slurm will wait for the slurmd + scope to be ready before failing. + {cgroup.conf::OPT_SystemdTimeout) + ignore_systemd (bool): + If `True`, it will avoid any call to dbus and contact with systemd, + and cgroup hierarchy preparation is done manually. Only for + `cgroup/v2` + {cgroup.conf::OPT_IgnoreSystemd) + ignore_systemd_on_failure (bool): + Similar to `ignore_systemd`, but only in the case that a dbus call + does not succeed. Only for `cgroup/v2`. + {cgroup.conf::OPT_IgnoreSystemdOnFailure) + enable_controllers (bool): + When enabled, `slurmd` gets the available controllers from root`s + cgroup.controllers file located in `mountpoint`. + {cgroup.conf::OPT_EnableControllers) + allowed_ram_space (int): + Constrains the job/step cgroup RAM to this percentage of the + allocated memory. 
+ {cgroup.conf::OPT_AllowedRAMSpace) + allowed_swap_space (float): + Constrain the job cgroup swap space to this percentage of the + allocated memory. + {cgroup.conf::OPT_AllowedSwapSpace) + constrain_cores (bool): + When `True`, then constrain allowed cores to the subset of + allocated resources. + {cgroup.conf::OPT_ConstrainCores) + constrain_devices (bool): + When `True`, then constrain the job's allowed devices based on GRES + allocated resources. + {cgroup.conf::OPT_ConstrainDevices) + constrain_ram_space (bool): + When `True`, then constrain the job's RAM usage by setting the + memory soft limit to the allocated memory and the hard limit to the + allocated memory * `allowed_ram_space`. + {cgroup.conf::OPT_ConstrainRAMSpace) + constrain_swap_space (bool): + When `True`, then constrain the job's swap space usage. + {cgroup.conf::OPT_ConstrainSwapSpace) + max_ram_percent (float): + Upper bound in percent of total RAM (configured RealMemory of the + node) on the RAM constraint for a job. + {cgroup.conf::OPT_MaxRAMPercent) + max_swap_percent (float): + Upper bound (in percent of total RAM, configured RealMemory of the + node) on the amount of RAM+Swap that may be used for a job. + {cgroup.conf::OPT_MaxSwapPercent) + memory_swappiness (float): + Configures the kernel's priority for swapping out anonymous pages + verses file cache pages for the job cgroup. Only for `cgroup/v1`. + A value of `-1.0` means that the kernel's default swappiness value + will be used. + {cgroup.conf::OPT_MemorySwappiness) + min_ram_space (int): + Lower bound (in Mebibytes) on the memory limits defined by + `allowed_ram_space` and `allowed_swap_space`. + {cgroup.conf::OPT_MinRAMSpace) + signal_children_processes (bool): + When `True`, then send signals (for cancelling, suspending, + resuming, etc.) to all children processes in a job/step. 
+ {cgroup.conf::OPT_SignalChildrenProcesses) + """ cdef public: mountpoint plugin diff --git a/pyslurm/core/slurmctld.pyx b/pyslurm/core/slurmctld/config.pyx similarity index 72% rename from pyslurm/core/slurmctld.pyx rename to pyslurm/core/slurmctld/config.pyx index 2ddf4db9..54455525 100644 --- a/pyslurm/core/slurmctld.pyx +++ b/pyslurm/core/slurmctld/config.pyx @@ -1,5 +1,5 @@ ######################################################################### -# slurmctld.pyx - pyslurm slurmctld api +# slurmctld/config.pyx - pyslurm slurmctld config api ######################################################################### # Copyright (C) 2025 Toni Harzendorf # @@ -36,340 +36,6 @@ from pyslurm.utils.helpers import ( ) from pyslurm.utils import cstr from typing import Union -import time -from enum import IntEnum - - -class ShutdownMode(IntEnum): - """Mode of operation for shutdown action""" - ALL = 0 - CORE_FILE = 1 - CONTROLLER_ONLY = 2 - - -cdef class PingResponse: - - def to_dict(self): - """Slurmctld ping response formatted as dictionary. 
- - Returns: - (dict): Ping response as a dict - - Examples: - >>> from pyslurm import slurmctld - >>> ctld_primary = slurmctld.Config.ping(0) - >>> primary_dict = ctld_primary.to_dict() - """ - return instance_to_dict(self) - - -def ping(index): - """Ping a Slurm controller - - Returns: - (pyslurm.slurmctld.PingResponse): a ping response - - Examples: - >>> from pyslurm import slurmctld - >>> resp = slurmctld.ping(0) - >>> print(resp.hostname, resp.latency) - slurmctl 1.246 - """ - t0 = time.perf_counter() - rc = slurm_ping(index) - t1 = time.perf_counter() - - verify_rpc(rc) - ctl_cnt = slurm.slurm_conf.control_cnt - - if index >= ctl_cnt: - raise RPCError(msg="Invalid Index specified.") - - info = PingResponse() - info.is_primary = index == 0 - info.is_responding = not rc - info.index = index - info.hostname = cstr.to_unicode(slurm.slurm_conf.control_machine[index]) - info.latency = round((t1 - t0) * 1000, 3) - - return info - - -def ping_primary(): - """Ping the primary Slurm Controller. - - See `ping()` for more information and examples. - - Returns: - (pyslurm.slurmctld.PingResponse): a ping response - """ - return ping(0) - - -def ping_backup(): - """Ping the first backup Slurm Controller. - - See `ping()` for more information and examples. - - Returns: - (pyslurm.slurmctld.PingResponse): a ping response - """ - return ping(1) - - -def ping_all(): - """Ping all Slurm Controllers. - - Returns: - (list[pyslurm.slurmctld.PingResponse]): a list of ping responses - - Raises: - (pyslurm.RPCError): When the ping was not successful. - - Examples: - >>> from pyslurm import slurmctld - >>> resps = slurmctld.ping_all() - >>> for resp in resps: - ... print(resp.hostname, resp.latency) - ... 
- slurmctl 1.246 - slurmctlbackup 1.373 - """ - cdef list out = [] - - ctl_cnt = slurm.slurm_conf.control_cnt - for i in range(ctl_cnt): - out.append(ping(i)) - - return out - - -def shutdown(mode: Union[ShutdownMode, int]): - """Shutdown Slurm Controller or all Daemons - - Args: - mode: - Whether only the Slurm controller shut be downed, or also all other - slurmd daemons. - - Raises: - (pyslurm.RPCError): When shutdowning the daemons was not successful. - """ - verify_rpc(slurm_shutdown(int(mode))) - - -def reconfigure(): - """Trigger Slurm Controller to reload the Config - - Raises: - (pyslurm.RPCError): When reconfiguring was not successful. - """ - verify_rpc(slurm_reconfigure()) - - -def takeover(index = 1): - """Let a Backup Slurm Controller take over as the Primary. - - Args: - index (int, optional = 1): - Index of the Backup Controller that should take over. By default, - the `index` is `1`, meaning the next Controller configured after - the Primary in slurm.conf (second `SlurmctlHost` entry) will be - asked to take over operation. - - If you have more than one backup controller configured, you can for - example also pass `2` as the index. - - Raises: - (pyslurm.RPCError): When reconfiguring was not successful. - """ - verify_rpc(slurm_takeover(index)) - - -def add_debug_flags(flags): - """Add DebugFlags to slurmctld - - Args: - flags (list): - For an available list of possible values, please check the - `slurm.conf` documentation under `DebugFlags`. - - Raises: - (pyslurm.RPCError): When setting the debug flags was not successful. - - Examples: - >>> from pyslurm import slurmctld - >>> slurmctld.add_debug_flags(["CpuFrequency"]) - """ - if not flags: - return - - data = _debug_flags_str_to_int(flags) - if not data: - raise RPCError(msg="Invalid Debug Flags specified.") - - verify_rpc(slurm_set_debugflags(data, 0)) - - -def remove_debug_flags(flags): - """Remove DebugFlags from slurmctld. 
- - Args: - flags (list): - For an available list of possible values, please check the - `slurm.conf` documentation under `DebugFlags`. - - Raises: - (pyslurm.RPCError): When removing the debug flags was not successful. - - Examples: - >>> from pyslurm import slurmctld - >>> slurmctld.remove_debug_flags(["CpuFrequency"]) - """ - if not flags: - return - - data = _debug_flags_str_to_int(flags) - if not data: - raise RPCError(msg="Invalid Debug Flags specified.") - - verify_rpc(slurm_set_debugflags(0, data)) - - -def clear_debug_flags(): - """Remove all currently set debug flags from slurmctld. - - Raises: - (pyslurm.RPCError): When removing the debug flags was not successful. - - Examples: - >>> from pyslurm import slurmctld - >>> slurmctld.clear_debug_flags() - """ - current_flags = get_debug_flags() - if not current_flags: - return - - data = _debug_flags_str_to_int(current_flags) - verify_rpc(slurm_set_debugflags(0, data)) - - -def get_debug_flags(): - """Get the current list of debug flags for the slurmctld. - - Raises: - (pyslurm.RPCError): When getting the debug flags was not successful. - - Examples: - >>> from pyslurm import slurmctld - >>> flags = slurmctld.get_debug_flags() - >>> print(flags) - ['CpuFrequency', 'Backfill'] - """ - return Config.load().debug_flags - - -def set_log_level(level): - """Set the logging level for slurmctld. - - Args: - level (str): - For an available list of possible values, please check the - `slurm.conf` documentation under `SlurmctldDebug`. - - Raises: - (pyslurm.RPCError): When setting the log level was not successful. - - Examples: - >>> from pyslurm import slurmctld - >>> slurmctld.set_log_level("quiet") - """ - data = _log_level_str_to_int(level) - verify_rpc(slurm_set_debug_level(data)) - - -def get_log_level(): - """Get the current log level for the slurmctld. - - Raises: - (pyslurm.RPCError): When getting the log level was not successful. 
- - Examples: - >>> from pyslurm import slurmctld - >>> level = slurmctld.get_log_level() - >>> print(level) - quiet - """ - return Config.load().slurmctld_log_level - - -def enable_scheduler_logging(): - """Enable scheduler logging for slurmctld. - - Raises: - (pyslurm.RPCError): When enabling scheduler logging was not successful. - - Examples: - >>> from pyslurm import slurmctld - >>> slurmctld.enable_scheduler_logging() - """ - verify_rpc(slurm_set_schedlog_level(1)) - - -def is_scheduler_logging_enabled(): - """Check whether scheduler logging is enabled for slurmctld. - - Returns: - (bool): Whether scheduler logging is enabled or not. - - Raises: - (pyslurm.RPCError): When getting the scheduler logging was not - successful. - - Examples: - >>> from pyslurm import slurmctld - >>> print(slurmctld.is_scheduler_logging_enabled()) - False - """ - return Config.load().scheduler_logging_enabled - - -def set_fair_share_dampening_factor(factor): - """Set the FairShare Dampening factor. - - Args: - factor (int): - The factor to set. A minimum value of `1`, and a maximum value of - `65535` are allowed. - - Raises: - (pyslurm.RPCError): When setting the factor was not successful. - - Examples: - >>> from pyslurm import slurmctld - >>> slurmctld.set_fair_share_dampening_factor(100) - """ - max_value = (2 ** 16) - 1 - if not factor or factor >= max_value: - raise RPCError(msg=f"Invalid Dampening factor: {factor}. " - f"Factor must be between 0 and {max_value}.") - - verify_rpc(slurm_set_fs_dampeningfactor(factor)) - - -def get_fair_share_dampening_factor(): - """Get the currently set FairShare Dampening factor. - - Raises: - (pyslurm.RPCError): When getting the factor was not successful. 
- - Examples: - >>> from pyslurm import slurmctld - >>> factor = slurmctld.get_fair_share_dampening_factor() - >>> print(factor) - 100 - """ - return Config.load().fair_share_dampening_factor cdef class MPIConfig: @@ -444,15 +110,15 @@ cdef class CgroupConfig: out.ignore_systemd_on_failure = _yesno_to_bool(conf.get("IgnoreSystemdOnFailure")) out.enable_controllers = _yesno_to_bool(conf.get("EnableControllers")) - out.allowed_ram_space = int(conf.get("AllowedRAMSpace", 100)) - out.allowed_swap_space = int(conf.get("AllowedSwapSpace", 0)) + out.allowed_ram_space = float(conf.get("AllowedRAMSpace", 100.0)) + out.allowed_swap_space = float(conf.get("AllowedSwapSpace", 0.0)) out.constrain_cores = _yesno_to_bool(conf.get("ConstrainCores", "no")) out.constrain_devices = _yesno_to_bool(conf.get("ConstrainDevices", "no")) out.constrain_ram_space = _yesno_to_bool(conf.get("ConstrainRAMSpace", "no")) out.constrain_swap_space = _yesno_to_bool(conf.get("ConstrainSwapSpace", "no")) - out.max_ram_percent = int(conf.get("MaxRAMPercent", 100)) - out.max_swap_percent = int(conf.get("MaxSwapPercent", 100)) - out.memory_swappiness = int(conf.get("MemorySwappiness", -1)) + out.max_ram_percent = float(conf.get("MaxRAMPercent", 100.0)) + out.max_swap_percent = float(conf.get("MaxSwapPercent", 100.0)) + out.memory_swappiness = float(conf.get("MemorySwappiness", -1.0)) out.min_ram_space = int(conf.get("MinRAMSpace", 30*1024)) out.signal_children_processes = _yesno_to_bool(conf.get("SignalChildrenProcesses", "no")) @@ -622,7 +288,21 @@ cdef class Config: @property def accounting_store_flags(self): - return _acct_store_flags_int_to_str(self.ptr.conf_flags) + out = [] + flags = self.ptr.conf_flags + + if flags & slurm.CONF_FLAG_SJC: + out.append("JOB_COMMENT") + if flags & slurm.CONF_FLAG_SJE: + out.append("JOB_ENV") + if flags & slurm.CONF_FLAG_SJX: + out.append("JOB_EXTRA") + if flags & slurm.CONF_FLAG_SJS: + out.append("JOB_SCRIPT") + if flags & slurm.CONF_FLAG_NO_STDIO: + 
out.append("NO_STDIO") + + return out @property def accounting_gather_node_frequency(self): @@ -735,7 +415,8 @@ cdef class Config: @property def debug_flags(self): - return _debug_flags_int_to_list(self.ptr.debug_flags) + cdef char *data = slurm.debug_flags2str(self.ptr.debug_flags) + return cstr.to_list_free(&data) @property def default_memory_per_cpu(self): @@ -766,7 +447,9 @@ cdef class Config: @property def enforce_partition_limits(self): - return _enforce_part_limits_int_to_str(self.ptr.enforce_part_limits) + cdef char* data = slurm.parse_part_enforce_type_2str( + self.ptr.enforce_part_limits) + return cstr.to_unicode(data) @property def epilog(self): @@ -832,8 +515,9 @@ cdef class Config: @property def health_check_node_state(self): - return _health_check_node_state_int_to_list( + cdef char *data = slurm.health_check_node_state_str( self.ptr.health_check_node_state) + return cstr.to_list_free(&data) @property def health_check_program(self): @@ -928,7 +612,25 @@ cdef class Config: @property def log_time_format(self): - return _log_fmt_int_to_str(self.ptr.log_fmt) + flag = self.ptr.log_fmt + if flag == slurm.LOG_FMT_ISO8601_MS: + return "iso8601_ms" + elif flag == slurm.LOG_FMT_ISO8601: + return "iso8601" + elif flag == slurm.LOG_FMT_RFC5424_MS: + return "rfc5424_ms" + elif flag == slurm.LOG_FMT_RFC5424: + return "rfc5424" + elif flag == slurm.LOG_FMT_CLOCK: + return "clock" + elif flag == slurm.LOG_FMT_SHORT: + return "short" + elif flag == slurm.LOG_FMT_THREAD_ID: + return "thread_id" + elif flag == slurm.LOG_FMT_RFC3339: + return "rfc3339" + else: + return None @property def mail_domain(self): @@ -1066,7 +768,8 @@ cdef class Config: @property def priority_flags(self): - return _priority_flags_int_to_list(self.ptr.priority_flags) + cdef char *data = slurm.priority_flags_string(self.ptr.priority_flags) + return cstr.to_list_free(&data) @property def priortiy_max_age(self): @@ -1079,7 +782,23 @@ cdef class Config: @property def 
priority_usage_reset_period(self): - return _priority_reset_int_to_str(self.ptr.priority_reset_period) + flag = self.ptr.priority_reset_period + if flag == slurm.PRIORITY_RESET_NONE: + return None + elif flag == slurm.PRIORITY_RESET_NOW: + return "NOW" + elif flag == slurm.PRIORITY_RESET_DAILY: + return "DAILY" + elif flag == slurm.PRIORITY_RESET_WEEKLY: + return "WEEKLY" + elif flag == slurm.PRIORITY_RESET_MONTHLY: + return "MONTHLY" + elif flag == slurm.PRIORITY_RESET_QUARTERLY: + return "QUARTERLY" + elif flag == slurm.PRIORITY_RESET_YEARLY: + return "YEARLY" + else: + return None @property def priority_type(self): @@ -1115,7 +834,13 @@ cdef class Config: @property def private_data(self): - return _private_data_int_to_list(self.ptr.private_data) + cdef char tmp[128] + slurm.private_data_string(self.ptr.private_data, tmp, sizeof(tmp)) + out = cstr.to_unicode(tmp) + if not out or out == "none": + return [] + + return out.split(",") @property def proctrack_type(self): @@ -1142,7 +867,8 @@ cdef class Config: @property def prolog_flags(self): - return _prolog_flags_int_to_list(self.ptr.prolog_flags) + cdef char *data = slurm.prolog_flags2str(self.ptr.prolog_flags) + return cstr.to_list_free(&data) @property def propagate_resource_limits(self): @@ -1158,7 +884,8 @@ cdef class Config: @property def reconfig_flags(self): - return _reconfig_flags_int_to_list(self.ptr.reconfig_flags) + cdef char *tmp = slurm.reconfig_flags2str(self.ptr.reconfig_flags) + return cstr.to_list_free(&tmp) @property def requeue_exit(self): @@ -1306,8 +1033,8 @@ cdef class Config: @property def slurmctld_parameters(self): - # TODO: check format again - return cstr.to_list(self.ptr.slurmctld_params) + return cstr.to_dict(self.ptr.slurmctld_params, delim1=",", + delim2="=", def_value=True) @property def slurmd_log_level(self): @@ -1489,27 +1216,6 @@ cdef class Config: return cstr.to_list(self.ptr.x11_params) -def _str_to_bool(val, true_str, false_str): - if not val: - return False - - v = 
val.lower() - if v == true_str: - return True - elif v == false_str: - return False - else: - return False - - -def _yesno_to_bool(val): - return _str_to_bool(val, "yes", "no") - - -def _true_false_to_bool(val): - return _str_to_bool(val, "true", "false") - - cdef dict _parse_config_key_pairs(void *ptr, owned=False): cdef: SlurmList conf = SlurmList.wrap(ptr, owned=owned) @@ -1526,96 +1232,25 @@ cdef dict _parse_config_key_pairs(void *ptr, owned=False): return out -def _debug_flags_int_to_list(flags): - cdef char *data = slurm.debug_flags2str(flags) - return cstr.to_list_free(&data) - - -def _debug_flags_str_to_int(flags): - cdef: - uint64_t flags_num = 0 - char *flags_str = NULL - - flags_str = cstr.from_unicode(cstr.list_to_str(flags)) - slurm.debug_str2flags(flags_str, &flags_num) - return flags_num - - -# https://github.com/SchedMD/slurm/blob/01a3aac7c59c9b32a9dd4e395aa5a97a8aea4f08/slurm/slurm.h#L621 -def _enforce_part_limits_int_to_str(limits): - cdef char* data = slurm.parse_part_enforce_type_2str(limits) - return cstr.to_unicode(data) - - -# https://github.com/SchedMD/slurm/blob/01a3aac7c59c9b32a9dd4e395aa5a97a8aea4f08/slurm/slurm.h#L2741 -def _health_check_node_state_int_to_list(state): - cdef char *data = slurm.health_check_node_state_str(state) - return cstr.to_list_free(&data) - - -def _log_fmt_int_to_str(flag): - if flag == slurm.LOG_FMT_ISO8601_MS: - return "iso8601_ms" - elif flag == slurm.LOG_FMT_ISO8601: - return "iso8601" - elif flag == slurm.LOG_FMT_RFC5424_MS: - return "rfc5424_ms" - elif flag == slurm.LOG_FMT_RFC5424: - return "rfc5424" - elif flag == slurm.LOG_FMT_CLOCK: - return "clock" - elif flag == slurm.LOG_FMT_SHORT: - return "short" - elif flag == slurm.LOG_FMT_THREAD_ID: - return "thread_id" - elif flag == slurm.LOG_FMT_RFC3339: - return "rfc3339" - else: - return None - - -def _priority_flags_int_to_list(flags): - cdef char *data = slurm.priority_flags_string(flags) - return cstr.to_list_free(&data) - +def _str_to_bool(val, true_str, 
false_str): + if not val: + return False -def _priority_reset_int_to_str(flag): - if flag == slurm.PRIORITY_RESET_NONE: - return None - elif flag == slurm.PRIORITY_RESET_NOW: - return "NOW" - elif flag == slurm.PRIORITY_RESET_DAILY: - return "DAILY" - elif flag == slurm.PRIORITY_RESET_WEEKLY: - return "WEEKLY" - elif flag == slurm.PRIORITY_RESET_MONTHLY: - return "MONTHLY" - elif flag == slurm.PRIORITY_RESET_QUARTERLY: - return "QUARTERLY" - elif flag == slurm.PRIORITY_RESET_YEARLY: - return "YEARLY" + v = val.lower() + if v == true_str: + return True + elif v == false_str: + return False else: - return None - - -def _private_data_int_to_list(flags): - cdef char tmp[128] - slurm.private_data_string(flags, tmp, sizeof(tmp)) - out = cstr.to_unicode(tmp) - if not out or out == "none": - return [] - - return out.split(",") + return False -def _prolog_flags_int_to_list(flags): - cdef char *data = slurm.prolog_flags2str(flags) - return cstr.to_list_free(&data) +def _yesno_to_bool(val): + return _str_to_bool(val, "yes", "no") -def _reconfig_flags_int_to_list(flags): - cdef char *tmp = slurm.reconfig_flags2str(flags) - return cstr.to_list_free(&tmp) +def _true_false_to_bool(val): + return _str_to_bool(val, "true", "false") def _log_level_int_to_str(flags): @@ -1626,31 +1261,6 @@ def _log_level_int_to_str(flags): return data -def _log_level_str_to_int(level): - cdef uint16_t data = slurm.log_string2num(str(level)) - if u16_parse(data, zero_is_noval=False) is None: - raise RPCError(msg=f"Invalid Log level: {level}.") - - return data - - -def _acct_store_flags_int_to_str(flags): - cdef list out = [] - - if flags & slurm.CONF_FLAG_SJC: - out.append("JOB_COMMENT") - if flags & slurm.CONF_FLAG_SJE: - out.append("JOB_ENV") - if flags & slurm.CONF_FLAG_SJX: - out.append("JOB_EXTRA") - if flags & slurm.CONF_FLAG_SJS: - out.append("JOB_SCRIPT") - if flags & slurm.CONF_FLAG_NO_STDIO: - out.append("NO_STDIO") - - return out - - def _get_memory(value, per_cpu): if value != 
slurm.NO_VAL64: if value & slurm.MEM_PER_CPU and per_cpu: From bc8929bb7c2ea86f80b0fd757d76cca923d8211c Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Fri, 17 Jan 2025 16:14:31 +0100 Subject: [PATCH 14/31] partition: update from where we take _get_memory function --- pyslurm/core/partition.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyslurm/core/partition.pyx b/pyslurm/core/partition.pyx index e0d916aa..53e41648 100644 --- a/pyslurm/core/partition.pyx +++ b/pyslurm/core/partition.pyx @@ -31,7 +31,7 @@ from pyslurm.core.error import RPCError, verify_rpc from pyslurm.utils.ctime import timestamp_to_date, _raw_time from pyslurm.constants import UNLIMITED from pyslurm.settings import LOCAL_CLUSTER -from pyslurm.core.slurmctld import _get_memory +from pyslurm.core.slurmctld.config import _get_memory from pyslurm import xcollections from pyslurm.utils.helpers import ( uid_to_name, From 339af33bac4efa674c49da7581c55d5d82270c88 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Fri, 17 Jan 2025 19:15:57 +0100 Subject: [PATCH 15/31] slurmctld.config: update documentation to make it prettier --- pyslurm/core/slurmctld/config.pxd | 723 ++++++++++++++++++++---------- 1 file changed, 482 insertions(+), 241 deletions(-) diff --git a/pyslurm/core/slurmctld/config.pxd b/pyslurm/core/slurmctld/config.pxd index 1277cd65..490894e2 100644 --- a/pyslurm/core/slurmctld/config.pxd +++ b/pyslurm/core/slurmctld/config.pxd @@ -81,84 +81,108 @@ cdef class Config: Attributes: accounting_storage_enforce (list[str]): List of enforcements on Job submissions. - {slurm.conf::OPT_AccountingStorageEnforce} + + {slurm.conf#OPT_AccountingStorageEnforce} accounting_storage_backup_host (str): Name of the backup machine hosting the Slurm database. - {slurm.conf::OPT_AccountingStorageBackupHost} + + {slurm.conf#OPT_AccountingStorageBackupHost} accounting_storage_external_hosts (list[str]): List of external slurmdbds to register with. 
- {slurm.conf::OPT_AccountingStorageExternalHost} + + {slurm.conf#OPT_AccountingStorageExternalHost} accounting_storage_host (str): Name of the machine hosting the slurm database. - {slurm.conf::OPT_AccountingStorageHost + {slurm.conf#OPT_AccountingStorageHost} + accounting_storage_parameters (dict[str, str]): Options for the accounting storage Plugin - {slurm.conf::OPT_AccountingStorageParameters} + + {slurm.conf#OPT_AccountingStorageParameters} accounting_storage_port (int): Listening port of the Accounting Database Server - {slurm.conf::OPT_AccountingStoragePort} + + {slurm.conf#OPT_AccountingStoragePort} accounting_storage_tres (list): List of configured Resources to track on the Cluster. - {slurm.conf::OPT_AccountingStorageTRES} + + {slurm.conf#OPT_AccountingStorageTRES} accounting_storage_type (str): The accounting storage type used. - {slurm.conf::OPT_AccountingStorageType} + + {slurm.conf#OPT_AccountingStorageType} accounting_storage_user (str): The User accounting accessing the accounting database. - {slurm.conf::OPT_AccountingStorageUser} + + {slurm.conf#OPT_AccountingStorageUser} accounting_store_flags (list[str]): List of fields that the slurmctld also sends to the accounting database. - {slurm.conf::OPT_AccountingStoreFlags} + + {slurm.conf#OPT_AccountingStoreFlags} accounting_gather_node_frequency (int): Accounting-Gather plugins sampling interval for node accounting. - {slurm.conf::OPT_AcctGatherNodeFreq} + + {slurm.conf#OPT_AcctGatherNodeFreq} accounting_gather_energy_type (str): Plugin used for energy consumption accounting. - {slurm.conf::OPT_AcctGatherEnergyType} + + {slurm.conf#OPT_AcctGatherEnergyType} accounting_gather_interconnect_type (str): Plugin used for interconnect network traffic accounting. - {slurm.conf::OPT_AcctGatherInterconnectType} + + {slurm.conf#OPT_AcctGatherInterconnectType} accounting_gather_filesystem_type (str): Plugin used for filesystem traffic accounting. 
- {slurm.conf::OPT_AcctGatherFilesystemType} + + {slurm.conf#OPT_AcctGatherFilesystemType} accounting_gather_profile_type (str): Plugin used for detailed job profiling. - {slurm.conf::OPT_AcctGatherProfileType} + + {slurm.conf#OPT_AcctGatherProfileType} allow_spec_resource_usage (bool): Whether Slurm allows jobs to override the nodes configured `CoreSpecCount` - {slurm.conf::OPT_AllowSpecResourcesUsage} + + {slurm.conf#OPT_AllowSpecResourcesUsage} auth_alt_types (list[str]): List of alternative authentication plugins the slurmctld permits. - {slurm.conf::OPT_AuthAltTypes} + + {slurm.conf#OPT_AuthAltTypes} auth_alt_parameters (dict[str, str]: Options for the alternative authentication plugins. - {slurm.conf::OPT_AuthAltParameters} + + {slurm.conf#OPT_AuthAltParameters} auth_info (list[str]): List of additional information used for authentication of communication between Slurm daemons. - {slurm.conf::OPT_AuthInfo} + + {slurm.conf#OPT_AuthInfo} auth_type (str): Primary authentication method for communications between Slurm components. - {slurm.conf::OPT_AuthType} + + {slurm.conf#OPT_AuthType} batch_start_timeout (int): The maximum time (in seconds) that a batch job is permitted for launching before being considered missing and releasing the allocation. - {slurm.conf::OPT_BatchStartTimeout} + + {slurm.conf#OPT_BatchStartTimeout} bcast_exclude_paths (list[str]): List of absolute directory paths to be excluded when autodetecting and broadcasting executable shared object dependencies through `sbcast` or `srun --bcast`. - {slurm.conf::OPT_BcastExclude} + + {slurm.conf#OPT_BcastExclude} bcast_parameters (dict[str, str]: Options for `sbcast` and `srun --bcast` behaviour. - {slurm.conf::OPT_BcastParameters} + + {slurm.conf#OPT_BcastParameters} burst_buffer_type (str): Plugin used to manage burst buffers. - {slurm.conf::OPT_BurstBufferType} + + {slurm.conf#OPT_BurstBufferType} slurmctld_boot_time (int): Timestamp of when the slurmctld last booted. 
certmgr_parameters (str): @@ -167,453 +191,578 @@ cdef class Config: Plugin used for certmgr mechanism. cli_filter_plugins (list[str]): List of CLI Filter plugins to use. - {slurm.conf::OPT_CliFilterPlugins} + + {slurm.conf#OPT_CliFilterPlugins} cluster_name (str): Name of the Cluster. - {slurm.conf::OPT_ClusterName} + + {slurm.conf#OPT_ClusterName} communication_parameters (dict[str, Union[str, int]]): Communication options for Cluster daemons. - {slurm.conf::OPT_CommunicationParameters} + + {slurm.conf#OPT_CommunicationParameters} complete_wait_time (int): The time to wait, in seconds, when any job is in the COMPLETING state before any additional jobs are scheduled. - {slurm.conf::OPT_CompleteWait} + + {slurm.conf#OPT_CompleteWait} default_cpu_frequency_governor (str): Default CPU governor to use when a Job has not specified the `--cpu-freq` option. - {slurm.conf::OPT_CpuFreqDef} + + {slurm.conf#OPT_CpuFreqDef} cpu_frequency_governors (list[str]): List of CPU Governors allowed to be set on Job submission. - {slurm.conf::OPT_CpuFreqGovernors} + + {slurm.conf#OPT_CpuFreqGovernors} credential_type (str): Cryptographic signature tool to be used when creating job step credentials. - {slurm.conf::OPT_CredType} + + {slurm.conf#OPT_CredType} data_parser_parameters (str): Default value to apply for `data_parser` plugin parameters. - {slurm.conf::OPT_DataParserParameters} + + {slurm.conf#OPT_DataParserParameters} debug_flags (list[str]): List of DebugFlags currently set for Daemons. - {slurm.conf::OPT_DebugFlags} + + {slurm.conf#OPT_DebugFlags} default_memory_per_cpu (int): Default real memory size available per allocated CPU in Mebibytes. - {slurm.conf::OPT_DefMemPerCPU} + + {slurm.conf#OPT_DefMemPerCPU} default_memory_per_node (int): Default real memory size available per allocated Node in Mebibytes. - {slurm.conf::OPT_DefMemPerNode} + + {slurm.conf#OPT_DefMemPerNode} dependency_parameters (list[str]): List of parameters for dependencies. 
- {slurm.conf::OPT_DependencyParameters} + + {slurm.conf#OPT_DependencyParameters} disable_root_jobs (bool): Whether root can submit Jobs or not. - {slurm.conf::OPT_DisableRootJobs} + + {slurm.conf#OPT_DisableRootJobs} eio_timeout (int): The number of seconds srun waits for slurmstepd to close the TCP/IP connection used to relay data between the user application and srun when the user application terminates. - {slurm.conf::OPT_EioTimeout} + + {slurm.conf#OPT_EioTimeout} enforce_partition_limits (str): Controls which Limits are enforced on Partition level. - {slurm.conf::OPT_EnforcePartLimits} + + {slurm.conf#OPT_EnforcePartLimits} epilog (list[str]): List of Epilog scripts in use that are executed as root on every node when a Job completes. - {slurm.conf::OPT_Epilog} + + {slurm.conf#OPT_Epilog} epilog_msg_time (int): The number of microseconds that the slurmctld daemon requires to process an epilog completion message from the slurmd daemons. - {slurm.conf::OPT_EpilogMsgTime} + + {slurm.conf#OPT_EpilogMsgTime} epilog_slurmctld (list[str]): List of Epilog scripts in use that are executed by slurmctld at job allocation. - {slurm.conf::OPT_EpilogSlurmctld} + + {slurm.conf#OPT_EpilogSlurmctld} fair_share_dampening_factor (int): Dampen the effect of exceeding a user or group's fair share of allocated resources. - {slurm.conf::OPT_FairShareDampeningFactor} + + {slurm.conf#OPT_FairShareDampeningFactor} federation_parameters (list[str]): Options for Federations - {slurm.conf::OPT_FederationParameters} + + {slurm.conf#OPT_FederationParameters} first_job_id (int): The job id to be used for the first job submitted. - {slurm.conf::OPT_FirstJobId} + + {slurm.conf#OPT_FirstJobId} get_environment_timeout (int): How long a Job waits (in seconds) to load the Users environment before attempting to load it from a cache file. - {slurm.conf::OPT_GetEnvTimeout} + + {slurm.conf#OPT_GetEnvTimeout} gres_types (list[str]): List of generic resources to be managed. 
- {slurm.conf::OPT_GresTypes} + + {slurm.conf#OPT_GresTypes} group_update_force (bool): Whether user group membership information is updated periodically, even if there are no changes to `/etc/group`. - {slurm.conf::OPT_GroupUpdateForce} + + {slurm.conf#OPT_GroupUpdateForce} group_update_time (int): How frequently information about user group membership is updated, and how longer it is cached (in seconds). - {slurm.conf::OPT_GroupUpdateTime} + + {slurm.conf#OPT_GroupUpdateTime} default_gpu_frequency (str): Default GPU frequency to use when running a job step if it has not been explicitly set using the --gpu-freq option. - {slurm.conf::OPT_GpuFreqDef} + + {slurm.conf#OPT_GpuFreqDef} hash_plugin (str): Type of hash plugin used for network communication. - {slurm.conf::OPT_HashPlugin} + + {slurm.conf#OPT_HashPlugin} hash_value (str): Current configuration hash value (hex). health_check_interval (int): Interval in seconds between executions of `HealthCheckProgram` - {slurm.conf::OPT_HealthCheckInterval} + + {slurm.conf#OPT_HealthCheckInterval} health_check_node_state (list[str]): List of node states which are eligible to execute - `HealthCheckProgram` - {slurm.conf::OPT_HealthCheckNodeState} + `HealthCheckProgram`. + + {slurm.conf#OPT_HealthCheckNodeState} health_check_program (str): Pathname of a script that is periodally executed as root user on all compute nodes. - {slurm.conf::OPT_HealthCheckProgram} + + {slurm.conf#OPT_HealthCheckProgram} inactive_limit (int): The interval, in seconds, after which a non-responsive job allocation command (e.g. `srun` or `salloc`) will result in the job being terminated. - {slurm.conf::OPT_InactiveLimit} + + {slurm.conf#OPT_InactiveLimit} interactive_step_options (str): When `LaunchParameters=use_interactive_step` is enabled, launching salloc will automatically start an srun process with `interactive_step_options` to launch a terminal on a node in the job allocation. 
- {slurm.conf::OPT_InteractiveStepOptions} + + {slurm.conf#OPT_InteractiveStepOptions} job_accounting_gather_type (str): The job accounting gather plugin used to collect usage information about Jobs. - {slurm.conf::OPT_JobAcctGatherType} + + {slurm.conf#OPT_JobAcctGatherType} job_accounting_gather_frequency (dict[str, int]): The job accounting and profiling sampling intervals. - {slurm.conf::OPT_JobAcctGatherFrequency} + + {slurm.conf#OPT_JobAcctGatherFrequency} job_accounting_gather_parameters (list[str]): Arbitrary paramerers for `job_accounting_gather_type` - {slurm.conf::OPT_JobAcctGatherParams} + + {slurm.conf#OPT_JobAcctGatherParams} job_completion_host (str): Name of the machine hosting the job completion database. - {slurm.conf::OPT_JobCompHost} + + {slurm.conf#OPT_JobCompHost} job_completion_location (str): Sets a string which has different meaning depending on `job_completion_type` - {slurm.conf::OPT_JobCompLoc} + + {slurm.conf#OPT_JobCompLoc} job_completion_parameters (list[str]): Arbitrary text passed to the Job completion plugin. - {slurm.conf::OPT_JobCompParams} + + {slurm.conf#OPT_JobCompParams} job_completion_port (int): The listening port of the job completion database server. - {slurm.conf::OPT_JobCompPort} + + {slurm.conf#OPT_JobCompPort} job_completion_type (str): Job completion logging mechanism type - {slurm.conf::OPT_JobCompType} + + {slurm.conf#OPT_JobCompType} job_completion_user (str): User account user fo accessing the job completion database. - {slurm.conf::OPT_JobCompUser} + + {slurm.conf#OPT_JobCompUser} job_container_type (str): Plugin used for job isolation through Linux namespaces. - {slurm.conf::OPT_JobContainerType} + + {slurm.conf#OPT_JobContainerType} job_file_append (bool): This option controls what to do if a job's output or error file exist when the job is started. If `True`, then append to the existing file. `False`, which is the default, means any existing files are truncated. 
- {slurm.conf::OPT_JobFileAppend} + + {slurm.conf#OPT_JobFileAppend} job_requeue (bool): Whether jobs are requeuable by default - {slurm.conf::OPT_JobRequeue} + + {slurm.conf#OPT_JobRequeue} job_submit_plugins (list[str]): Site specific list of plugins used for setting default job parameters and/or logging events - {slurm.conf::OPT_JobSubmitPlugins} + + {slurm.conf#OPT_JobSubmitPlugins} kill_on_bad_exit (bool): Whether a step will be terminated immediately if any task is crashed or aborted. - {slurm.conf::OPT_KillOnBadExit} + + {slurm.conf#OPT_KillOnBadExit} kill_wait_time (int): The interval, in seconds, given to a job's processes between the `SIGTERM` and `SIGKILL` signals upon reaching its time limit. - {slurm.conf::OPT_KillWait} + + {slurm.conf#OPT_KillWait} launch_parameters (list[str]) Options for the job launch plugin. - {slurm.conf::OPT_LaunchParameters} + + {slurm.conf#OPT_LaunchParameters} licenses (dict[str, int]): Licenses that can be allocated to jobs. - {slurm.conf::OPT_Licenses} + + {slurm.conf#OPT_Licenses} log_time_format (str): Format of the timestamp in slurmctld and slurmd log-files. - {slurm.conf::OPT_LogTimeFormat} + + {slurm.conf#OPT_LogTimeFormat} mail_domain (str): Domain name to qualify usernames if email address is not explicity given with the `--mail-user` option. - {slurm.conf::OPT_MailDomain} + + {slurm.conf#OPT_MailDomain} mail_program (str): Pathname to the program used to send emails per user request - {slurm.conf::OPT_MailProg} + + {slurm.conf#OPT_MailProg} max_array_size (int): Maximum job array task index value allowed. - {slurm.conf::OPT_MaxArraySize} + + {slurm.conf#OPT_MaxArraySize} max_batch_requeue (int): Maximum number of times a batch job may be automatically requeued before being marked as `JobHeldAdmin`. - {slurm.conf::OPT_MaxBatchRequeue} + + {slurm.conf#OPT_MaxBatchRequeue} max_dbd_msgs (int): Maximum number of messages the Slurm controllers queues before starting to drop them when the slurmdbd is down. 
- {slurm.conf::OPT_MaxDBDMsgs} + + {slurm.conf#OPT_MaxDBDMsgs} max_job_count (int): Maximum number of jobs slurmctld can have in memory at one time. - {slurm.conf::OPT_MaxJobCount} + + {slurm.conf#OPT_MaxJobCount} max_job_id (int): Highest job ID possible for Jobs that will be assigned automatically on submission. - {slurm.conf::OPT_MaxJobId} + + {slurm.conf#OPT_MaxJobId} max_memory_per_cpu (int): Maximum real memory size avialable per allocated CPU in Mebibytes. - {slurm.conf::OPT_MaxMemPerCPU} + + {slurm.conf#OPT_MaxMemPerCPU} max_memory_per_node (int): Maximum real memory size avialable per allocated Node in Mebibytes. - {slurm.conf::OPT_MaxMemPerNode} + + {slurm.conf#OPT_MaxMemPerNode} max_node_count (int): Maximum count of nodes which may exist in the slurmctld. - {slurm.conf::OPT_MaxNodeCount} + + {slurm.conf#OPT_MaxNodeCount} max_step_count (int): Maximum number of Steps that any Job can initiate. - {slurm.conf::OPT_MaxStepCount} + + {slurm.conf#OPT_MaxStepCount} max_tasks_per_node (int): Maximum number of tasks Slurm will allow for a job step to spawn on a single node. - {slurm.conf::OPT_MaxTasksPerNode} + + {slurm.conf#OPT_MaxTasksPerNode} mcs_plugin (str): Associate a security label to jobs, for resource sharing among jobs with the same label. - {slurm.conf::OPT_MCSPlugin} + + {slurm.conf#OPT_MCSPlugin} mcs_parameters (list[str]): Parameters for the MCS Plugin. - {slurm.conf::OPT_MCSParameters} + + {slurm.conf#OPT_MCSParameters} min_job_age (int): Minimum age (in seconds) of a completed Job before its record is cleared from slurmctlds memory. - {slurm.conf::OPT_MinJobAge} + + {slurm.conf#OPT_MinJobAge} mpi_default (str): Default type of MPI that will be used. - {slurm.conf::OPT_MpiDefault} + + {slurm.conf#OPT_MpiDefault} mpi_parameters (list[str]): Parameters for MPI. + + {slurm.conf#OPT_MpiParams} message_timeout (int): Time permitted for a round-trip communication to complete in seconds. 
- {slurm.conf::OPT_MessageTimeout} + + {slurm.conf#OPT_MessageTimeout} next_job_id (int): Next Job-ID that will be assigned. node_features_plugins (list[str]): Plugins to be used for support of node features which can change through time. - {slurm.conf::OPT_NodeFeaturesPlugins} + + {slurm.conf#OPT_NodeFeaturesPlugins} over_time_limit (int): Number of minutes by which a job can exceed its time limit before being canceled. - {slurm.conf::OPT_OverTimeLimit} + + {slurm.conf#OPT_OverTimeLimit} plugin_dirs (list[str]): List of paths where Slurm looks for plugins. - {slurm.conf::OPT_PluginDir} + + {slurm.conf#OPT_PluginDir} plugin_stack_config (str): Location of the config file for Slurm stackable plugins. - {slurm.conf::OPT_PlugStackConfig} + + {slurm.conf#OPT_PlugStackConfig} preempt_exempt_time (int): Minimum run time for all jobs before they can be considered for preemption. - {slurm.conf::OPT_PreemptExemptTime} + + {slurm.conf#OPT_PreemptExemptTime} preempt_mode (str): Mechanism used to preempt jobs or enable gang scheduling. - {slurm.conf::OPT_PreemptMode} + + {slurm.conf#OPT_PreemptMode} preempt_parameters (list[str]): Options for the Preempt Plugin. - {slurm.conf::OPT_PreemptParameters} + + {slurm.conf#OPT_PreemptParameters} preempt_type (str): Plugin used to identify which jobs can be preempted. - {slurm.conf::OPT_PreemptMode} + + {slurm.conf#OPT_PreemptType} prep_parameters (list[str]): Parameters passed to the PrEpPlugins. - {slurm.conf::OPT_PrEpParamrters} + + {slurm.conf#OPT_PrEpParameters} prep_plugins (list[str]): List of PrEp Plugins to be used. - {slurm.conf::OPT_PrEpPlugins} + + {slurm.conf#OPT_PrEpPlugins} priority_decay_half_life (int): Controls how long (in seconds) prior resource use is considered in determining how over- or under-serviced an association is. - {slurm.conf::OPT_PriorityDecayHalfLife} + + {slurm.conf#OPT_PriorityDecayHalfLife} priority_calc_period (int): Period (in minutes) in which the half-life decay will be re-calculated.
- {slurm.conf::OPT_PriorityCalcPeriod} + + {slurm.conf#OPT_PriorityCalcPeriod} priority_favor_small (bool): Whether small jobs should be given preferential scheduling priority. - {slurm.conf::OPT_PriorityFavorSmall} + + {slurm.conf#OPT_PriorityFavorSmall} priority_flags (list[str]): List of flags that modify priority behaviour. - {slurm.conf::OPT_PriorityFlags} + + {slurm.conf#OPT_PriorityFlags} priority_max_age (int): Job age that is needed before receiving the maximum age factor in computing priority. - {slurm.conf::OPT_PriorityMaxAge} + + {slurm.conf#OPT_PriorityMaxAge} priority_parameters (str): Arbitrary string used by the `priority_type` plugin. - {slurm.conf::OPT_PriorityParameters} + + {slurm.conf#OPT_PriorityParameters} priority_usage_reset_period (str): At this interval the usage of associations will be reset to 0. - {slurm.conf::OPT_PriorityUsageResetPeriod} + + {slurm.conf#OPT_PriorityUsageResetPeriod} priority_type (str): Specifies the plugin to be used in establishing a job's scheduling priority. - {slurm.conf::OPT_PriorityType} + + {slurm.conf#OPT_PriorityType} priority_weight_age (int): An integer value that sets the degree to which the queue wait time component contributes to the job's priority. - {slurm.conf::OPT_PriorityWeightAge} + + {slurm.conf#OPT_PriorityWeightAge} priority_weight_assoc (int): An integer value that sets the degree to which the association component contributes to the job's priority. - {slurm.conf::OPT_PriorityWeightAssoc} + + {slurm.conf#OPT_PriorityWeightAssoc} priority_weight_fair_share (int): An integer value that sets the degree to which the fair-share component contributes to the job's priority. - {slurm.conf::OPT_PriorityWeightFairShare} + + {slurm.conf#OPT_PriorityWeightFairShare} priority_weight_job_size (int): An integer value that sets the degree to which the job size component contributes to the job's priority. 
- {slurm.conf::OPT_PriorityWeightJobSize} + + {slurm.conf#OPT_PriorityWeightJobSize} priority_weight_partition (int): Partition factor used by priority/multifactor plugin in calculating job priority. - {slurm.conf::OPT_PriorityWeightPartition} + + {slurm.conf#OPT_PriorityWeightPartition} priority_weight_qos (int): An integer value that sets the degree to which the Quality Of Service component contributes to the job's priority - {slurm.conf::OPT_PriorityWeightQOS} + + {slurm.conf#OPT_PriorityWeightQOS} priority_weight_tres (dict[str, int]): TRES Types and weights that sets the degree that each TRES Type contributes to the job's priority. - {slurm.conf::OPT_PriorityWeightTRES} + + {slurm.conf#OPT_PriorityWeightTRES} private_data (list[str]): Defines what type of information is hidden from regular users. - {slurm.conf::OPT_PrivateData} + + {slurm.conf#OPT_PrivateData} proctrack_type (str): Identifies the plugin to be used for process tracking on a job step basis. - {slurm.conf::OPT_ProctrackType} + + {slurm.conf#OPT_ProctrackType} prolog (list[str]): List of pathnames of programs for the slurmd to execute whenever it is asked to run a job step from a new job allocation. - {slurm.conf::OPT_Prolog} + + {slurm.conf#OPT_Prolog} prolog_epilog_timeout (int): The interval in seconds Slurm waits for Prolog and Epilog before terminating them. - {slurm.conf::OPT_PrologEpilogTimeout} + + {slurm.conf#OPT_PrologEpilogTimeout} prolog_slurmctld (list[str]): List of pathnames of programs for the slurmctld daemon to execute before granting a new job allocation. - {slurm.conf::OPT_PrologSlurmctld} + + {slurm.conf#OPT_PrologSlurmctld} propagate_prio_process (int): Controls the scheduling priority (nice value) of user spawned tasks. - {slurm.conf::OPT_PropagatePrioProcess} + + {slurm.conf#OPT_PropagatePrioProcess} prolog_flags (list[str]): Flags to control the Prolog behavior. 
- {slurm.conf::OPT_PrologFlags} + + {slurm.conf#OPT_PrologFlags} propagate_resource_limits (list[str]): List of resource limit names that are propagated to the Job environment. - {slurm.conf::OPT_PropagateResourceLimits} + + {slurm.conf#OPT_PropagateResourceLimits} propagate_resource_limits_except (list[str]): List of resource limit names that are excluded from propagation to the Job environment. - {slurm.conf::OPT_PropagateResourceLimitsExcept} + + {slurm.conf#OPT_PropagateResourceLimitsExcept} reboot_program (str): Program to be executed on each compute node to reboot it. - {slurm.conf::OPT_RebootProgram} + + {slurm.conf#OPT_RebootProgram} reconfig_flags (lisr[str]): List of flags to control various actions that may be taken when a reconfigure command is issued (for example with `scontrol reconfig`). - {slurm.conf::OPT_ReconfigFlags} + + {slurm.conf#OPT_ReconfigFlags} requeue_exit (str): Enables automatic requeue for batch jobs which exit with the specified values. - {slurm.conf::OPT_RequeueExit} + + {slurm.conf#OPT_RequeueExit} requeue_exit_hold (str): Enables automatic requeue for batch jobs which exit with the specified values, with these jobs being held until released manually by the user. - {slurm.conf::OPT_RequeueExitHold} + + {slurm.conf#OPT_RequeueExitHold} resume_fail_program (str): The program that will be executed when nodes fail to resume to by `resume_timeout`. - {slurm.conf::OPT_ResumeFailProgram} + + {slurm.conf#OPT_ResumeFailProgram} resume_program (str): Program that will be executed when a node in power save mode is assigned work to perform. - {slurm.conf::OPT_ResumeProgram} + + {slurm.conf#OPT_ResumeProgram} resume_rate (int): Number of nodes per minute that will be restored from power save mode to normal operation by `resume_program`. 
- {slurm.conf::OPT_ResumeRate} + + {slurm.conf#OPT_ResumeRate} resume_timeout (int): Maximum time permitted (in seconds) between when a node resume request is issued and when the node is actually available for use. - {slurm.conf::OPT_ResumeTimeout} + + {slurm.conf#OPT_ResumeTimeout} reservation_epilog (str): Pathname of a program for the slurmctld to execute when a reservation ends. - {slurm.conf::OPT_ResvEpilog} + + {slurm.conf#OPT_ResvEpilog} reservation_over_run (int): Describes how long (in minutes) a job already running in a reservation should be permitted to execute after the end time of the reservation has been reached - {slurm.conf::OPT_ResvOverRun} + + {slurm.conf#OPT_ResvOverRun} reservation_prolog (str): Pathname of a program for the slurmctld to execute when a reservation begins. - {slurm.conf::OPT_ResvProlog} + + {slurm.conf#OPT_ResvProlog} return_to_service (int): Controls when a `DOWN` node will be returned to service - {slurm.conf::OPT_ReturnToService} + + {slurm.conf#OPT_ReturnToService} scheduler_log_file (str): - pathname of the scheduling event logging file. - {slurm.conf::OPT_SlurmSchedLogFile} + Pathname of the scheduling event logging file. + + {slurm.conf#OPT_SlurmSchedLogFile} scheduler_logging_enabled (bool): The initial level of scheduling event logging. - {slurm.conf::OPT_SlurmSchedLogLevel} + + {slurm.conf#OPT_SlurmSchedLogLevel} scheduler_parameters (list[str]): List of options for the `scheduler_type` plugin. - {slurm.conf::OPT_SchedulerParameters} + + {slurm.conf#OPT_SchedulerParameters} scheduler_time_slice (int): Number of seconds in each time slice when gang scheduling is enabled. - {slurm.conf::OPT_SchedulerTimeSlice} + + {slurm.conf#OPT_SchedulerTimeSlice} scheduler_type (str): Identifies the type of scheduler to be used. - {slurm.conf::OPT_SchedulerType} + + {slurm.conf#OPT_SchedulerType} scron_parameters (list[str]): Parameters for scron. 
- {slurm.conf::OPT_ScronParameters} + + {slurm.conf#OPT_ScronParameters} select_type (str): Identifies the type of resource selection algorithm to be used. - {slurm.conf::OPT_SelectType} + + {slurm.conf#OPT_SelectType} select_type_parameters (list[str]): Parameters passed to the `select_type` plugin. - {slurm.conf::OPT_SelectTypeParameters} + + {slurm.conf#OPT_SelectTypeParameters} priority_site_factor_plugin (str): Specifies an optional plugin to be used alongside "priority/multifactor", which is meant to initially set and continuously update the SiteFactor priority factor. - {slurm.conf::OPT_PrioritySiteFactorPlugin} + + {slurm.conf#OPT_PrioritySiteFactorPlugin} priority_site_factor_parameters (str): Arbitrary string used by the PrioritySiteFactorPlugin plugin. - {slurm.conf::OPT_PrioritySiteFactorParameters} + + {slurm.conf#OPT_PrioritySiteFactorParameters} slurm_conf_path (str): Path of the current slurm.conf file used. slurm_user_id (int): @@ -626,185 +775,231 @@ cdef class Config: Name of the User slurmd runs as. slurmctld_log_level (str): The level of detail to provide `slurmctld` daemon's logs. - {slurm.conf::OPT_SlurmctldDebug} + + {slurm.conf#OPT_SlurmctldDebug} slurmctld_log_file (str): Pathname of a file into which the `slurmctld` daemon's logs are written. - {slurm.conf::OPT_SlurmctldLogFile} + + {slurm.conf#OPT_SlurmctldLogFile} slurmctld_pid_file (str): Pathname of a file into which the `slurmctld` daemon may write its process id. - {slurm.conf::OPT_SlurmctldPidFile} + + {slurm.conf#OPT_SlurmctldPidFile} slurmctld_port (str): Port number where `slurmctld` listens to for work. Note that this can also be a port range. - {slurm.conf::OPT_SlurmctldPort} + + {slurm.conf#OPT_SlurmctldPort} slurmctld_primary_off_program (str): This program is executed when a `slurmctld` daemon running as the primary server becomes a backup server. 
- {slurm.conf::OPT_SlurmctldPrimaryOffProg} + + {slurm.conf#OPT_SlurmctldPrimaryOffProg} slurmctld_primary_on_program (str): This program is executed when a `slurmctld` daemon running as a backup server becomes the primary server. - {slurm.conf::OPT_SlurmctldPrimaryOnProg} + + {slurm.conf#OPT_SlurmctldPrimaryOnProg} slurmctld_syslog_level (str): Level of detail that the `slurmctld` logs to the syslog. - {slurm.conf::OPT_SlurmctldSyslogDebug} + + {slurm.conf#OPT_SlurmctldSyslogDebug} slurmctld_timeout (int): The interval, in seconds, that the backup controller waits for the primary controller to respond before assuming control. - {slurm.conf::OPT_SlurmctldTimeout} + + {slurm.conf#OPT_SlurmctldTimeout} slurmctld_parameters (dict[str, Union[str, int, bool]]): Options set for the `slurmctld`. If a value in this dict is `True`, it means this parameter does not have any additional options specified, and is just an "enabled" option. - {slurm.conf::OPT_SlurmctldParameters} + + {slurm.conf#OPT_SlurmctldParameters} slurmd_log_level (str): Level of detail `slurmd` is logging. - {slurm.conf::OPT_SlurmdDebug} + + {slurm.conf#OPT_SlurmdDebug} slurmd_log_file (str): Pathname of the file where `slurmd` writes logs to. - {slurm.conf::OPT_SlurmdLogFile} + + {slurm.conf#OPT_SlurmdLogFile} slurmd_parameters (list[str]): Parameters for the `slurmd`. - {slurm.conf::OPT_SlurmdParameters} + + {slurm.conf#OPT_SlurmdParameters} slurmd_pid_file (str): Pathname of a file into which the `slurmd` daemon may write its process id. - {slurm.conf::OPT_SlurmdPidFile} + + {slurm.conf#OPT_SlurmdPidFile} slurmd_port (int): Port number where `slurmd` listens to for work. - {slurm.conf::OPT_SlurmdPort} + + {slurm.conf#OPT_SlurmdPort} slurmd_spool_directory (str): Pathname of a directory into which the `slurmd` daemon's state information and batch job script information are written. 
- {slurm.conf::OPT_SlurmdSpoolDir} + + {slurm.conf#OPT_SlurmdSpoolDir} slurmd_syslog_level (str): Level of detail that the `slurmd` logs to the syslog. - {slurm.conf::OPT_SlurmdSyslogDebug} + + {slurm.conf#OPT_SlurmdSyslogDebug} slurmd_timeout (int): The interval, in seconds, that `slurmctld` waits for `slurmd` to respond before configuring that node's state to `DOWN`. - {slurm.conf::OPT_SlurmdTimeout} + + {slurm.conf#OPT_SlurmdTimeout} srun_epilog (str): Pathname of an executable to be run by `srun` following the completion of a job step. - {slurm.conf::OPT_SrunEpilog} + + {slurm.conf#OPT_SrunEpilog} srun_port_range (str): Ports `srun` creates to communicate with the `slurmctld`, the `slurmstepd` and to handle the application I/O. - {slurm.conf::OPT_SrunPortRange} + + {slurm.conf#OPT_SrunPortRange} srun_prolog (str): Pathname of an executable to be run by `srun` prior to the launch of a job step. - {slurm.conf::OPT_SrunProlog} + + {slurm.conf#OPT_SrunProlog} state_save_location (str): Pathname of a directory where `slurmctld` saves its state. - {slurm.conf::OPT_StateSaveLocation} + + {slurm.conf#OPT_StateSaveLocation} suspend_exclude_nodes (str): Specifies the nodes which are to not be placed in power save mode, even if the node remains idle for an extended period of time. - {slurm.conf::OPT_SuspendExcNodes} + + {slurm.conf#OPT_SuspendExcNodes} suspend_exclude_partitions (str): Specifies the partitions whose nodes are to not be placed in power save mode, even if the node remains idle for an extended period of time. - {slurm.conf::OPT_SuspendExcParts} + + {slurm.conf#OPT_SuspendExcParts} suspend_exclude_states (list[str]): Specifies node states that are not to be powered down automatically. - {slurm.conf::OPT_SuspendExcStates} + + {slurm.conf#OPT_SuspendExcStates} suspend_program (str): Program that will be executed when a node remains idle for an extended period of time. 
- {slurm.conf::OPT_SuspendProgram} + + {slurm.conf#OPT_SuspendProgram} suspend_rate (int): Number of nodes per minute that are placed into power save mode. - {slurm.conf::OPT_SuspendRate} + + {slurm.conf#OPT_SuspendRate} suspend_time (int): Nodes which remain idle or down for this number of seconds will be placed into power save mode. - {slurm.conf::OPT_SuspendTime} + + {slurm.conf#OPT_SuspendTime} suspend_timeout (int): Maximum time permitted (in seconds) between when a node suspend request is issued and when the node is shutdown. - {slurm.conf::OPT_SuspendTimeout} + + {slurm.conf#OPT_SuspendTimeout} switch_type (str): Identifies the type of switch or interconnect used for application communications. - {slurm.conf::OPT_SwitchType} + + {slurm.conf#OPT_SwitchType} switch_parameters (list[str]): Optional parameters for the switch plugin. - {slurm.conf::OPT_SwitchParameters} + + {slurm.conf#OPT_SwitchParameters} task_epilog (str): Pathname of a program to be executed as the slurm job's owner after termination of each task. - {slurm.conf::OPT_TaskEpilog} + + {slurm.conf#OPT_TaskEpilog} task_plugin (str): Identifies the type of task launch plugin, typically used to provide resource management within a node. - {slurm.conf::OPT_TaskPlugin} + + {slurm.conf#OPT_TaskPlugin} task_plugin_parameters (list[str]): Optional Parameters for `task_plugin`. - {slurm.conf::OPT_TaskPluginParam} + + {slurm.conf#OPT_TaskPluginParam} task_prolog (str): Pathname of a program to be executed as the slurm job's owner prior to initiation of each task. - {slurm.conf::OPT_TaskProlog} + + {slurm.conf#OPT_TaskProlog} tls_parameters (list[str]): Parameters for `tls_type`. tls_type (str): TLS Plugin used. tcp_timeout (int): Time permitted for TCP connection to be established. - {slurm.conf::OPT_TCPTimeout} + + {slurm.conf#OPT_TCPTimeout} temporary_filesystem (str): Pathname of the file system available to user jobs for temporary storage. 
- {slurm.conf::OPT_TmpFS} + + {slurm.conf#OPT_TmpFS} topology_parameters (list[str]): List of network topology options - {slurm.conf::OPT_TopologyParam} + + {slurm.conf#OPT_TopologyParam} topology_plugin (str): Identifies the plugin to be used for determining the network topology and optimizing job allocations to minimize network contention. - {slurm.conf::OPT_TopologyPlugin} + + {slurm.conf#OPT_TopologyPlugin} tree_width (int): Specifies the width of the virtual network tree `slurmd` uses for communication. - {slurm.conf::OPT_TreeWidth} + + {slurm.conf#OPT_TreeWidth} unkillable_step_program (str): Program that will be executed when the processes in a job step are determined unkillable. - {slurm.conf::OPT_UnkillableStepProgram} + + {slurm.conf#OPT_UnkillableStepProgram} unkillable_step_timeout (int): The length of time, in seconds, that Slurm will wait before deciding that processes in a job step are unkillable. - {slurm.conf::OPT_UnkillableStepTimeout} + + {slurm.conf#OPT_UnkillableStepTimeout} track_wckey (bool): Whether WCKeys are tracked or not. - {slurm.conf::OPT_TrackWCKey} + + {slurm.conf#OPT_TrackWCKey} use_pam (bool): Whether PAM (Pluggable Authentication Modules for Linux) will be enabled or not. - {slurm.conf::OPT_UsePAM} + + {slurm.conf#OPT_UsePAM} version (str): Version as returned by the `slurmctld`. virtual_memory_size_factor (int): Specifies the job's or job step's virtual memory limit as a percentage of its real memory limit. - {slurm.conf::OPT_VSizeFactor} + + {slurm.conf#OPT_VSizeFactor} default_job_wait_time (int): Specifies how many seconds the srun command should by default wait after the first task terminates before terminating all remaining tasks. - {slurm.conf::OPT_WaitTime} + + {slurm.conf#OPT_WaitTime} x11_parameters (list[str]): Parameters for Slurm's built-in X11 forwarding implementation. 
- {slurm.conf::OPT_X11Parameters} + + {slurm.conf#OPT_X11Parameters} """ cdef slurm_conf_t *ptr @@ -829,45 +1024,57 @@ cdef class MPIConfig: Attributes: pmix_cli_tmp_dir_base (str): Directory to have PMIx use for temporary files. - {mpi.conf::OPT_PMIxCliTmpDirBase} + + {mpi.conf#OPT_PMIxCliTmpDirBase} pmix_coll_fence (str): Defines the type of fence to use for collecting inter-node data. - {mpi.conf::OPT_PMIxCollFence} + + {mpi.conf#OPT_PMIxCollFence} pmix_debug (bool): Whether debug logging for the PMIx Plugin is enabled or not. - {mpi.conf::OPT_PMIxDebug} + + {mpi.conf#OPT_PMIxDebug} pmix_direct_conn (bool): Whether direct launching of tasks is enabled or not. - {mpi.conf::OPT_PMIxDirectConn} + + {mpi.conf#OPT_PMIxDirectConn} pmix_direct_conn_early (bool): Whether early connection to a parent node are allowed or not. - {mpi.conf::OPT_PMIxDirectConnEarly} + + {mpi.conf#OPT_PMIxDirectConnEarly} pmix_direct_conn_ucx (bool): Whether PMIx is allowed to use UCX for communication. - {mpi.conf::OPT_PMIxDirectConnUCX} + + {mpi.conf#OPT_PMIxDirectConnUCX} pmix_direct_same_arch (bool): Whether additional communication optimizations are enabled when `pmix_direct_conn` is also set to `True`, also assuming all nodes of the job have the same architecture. - {mpi.conf::OPT_PMIxDirectSameArch} + + {mpi.conf#OPT_PMIxDirectSameArch} pmix_environment (dict[str, Union[str, int]): Environment variables to bet set in the Job environment, used by PMIx. - {mpi.conf::OPT_PMIxEnv} + + {mpi.conf#OPT_PMIxEnv} pmix_fence_barrier (bool): Whether to fence inter-node communication for data collection. - {mpi.conf::OPT_PMIxFenceBarrier} + + {mpi.conf#OPT_PMIxFenceBarrier} pmix_net_devices_ucx (str): Type of network device to use for communication. - {mpi.conf::OPT_PMIxNetDevicesUCX} + + {mpi.conf#OPT_PMIxNetDevicesUCX} pmix_timeout (int): The maximum time (in seconds) allowed for communication between hosts to take place. 
- {mpi.conf::OPT_PMIxTimeout} + + {mpi.conf#OPT_PMIxTimeout} pmix_tls_ucx (list[str]): List of values for the UCX_TLS variable which restrict the transports to use. - {mpi.conf::OPT_PMIxTlsUCX} + + {mpi.conf#OPT_PMIxTlsUCX} """ cdef public: pmix_cli_tmp_dir_base @@ -905,74 +1112,91 @@ cdef class CgroupConfig: mountpoint (str): Specifies the PATH under which cgroup controllers should be mounted. - {cgroup.conf::OPT_CgroupMountpoint) + + {cgroup.conf#OPT_CgroupMountpoint} plugin (str): Specifies the plugin to be used when interacting with the cgroup subsystem. - {cgroup.conf::OPT_CgroupPlugin) + + {cgroup.conf#OPT_CgroupPlugin} systemd_timeout (int): Maximum time (in milliseconds) that Slurm will wait for the slurmd scope to be ready before failing. - {cgroup.conf::OPT_SystemdTimeout) + + {cgroup.conf#OPT_SystemdTimeout} ignore_systemd (bool): If `True`, it will avoid any call to dbus and contact with systemd, and cgroup hierarchy preparation is done manually. Only for `cgroup/v2` - {cgroup.conf::OPT_IgnoreSystemd) + + {cgroup.conf#OPT_IgnoreSystemd} ignore_systemd_on_failure (bool): Similar to `ignore_systemd`, but only in the case that a dbus call does not succeed. Only for `cgroup/v2`. - {cgroup.conf::OPT_IgnoreSystemdOnFailure) + + {cgroup.conf#OPT_IgnoreSystemdOnFailure} enable_controllers (bool): When enabled, `slurmd` gets the available controllers from root`s cgroup.controllers file located in `mountpoint`. - {cgroup.conf::OPT_EnableControllers) + + {cgroup.conf#OPT_EnableControllers} allowed_ram_space (int): Constrains the job/step cgroup RAM to this percentage of the allocated memory. - {cgroup.conf::OPT_AllowedRAMSpace) + + {cgroup.conf#OPT_AllowedRAMSpace} allowed_swap_space (float): Constrain the job cgroup swap space to this percentage of the allocated memory. - {cgroup.conf::OPT_AllowedSwapSpace) + + {cgroup.conf#OPT_AllowedSwapSpace} constrain_cores (bool): When `True`, then constrain allowed cores to the subset of allocated resources. 
- {cgroup.conf::OPT_ConstrainCores) + + {cgroup.conf#OPT_ConstrainCores} constrain_devices (bool): When `True`, then constrain the job's allowed devices based on GRES allocated resources. - {cgroup.conf::OPT_ConstrainDevices) + + {cgroup.conf#OPT_ConstrainDevices} constrain_ram_space (bool): When `True`, then constrain the job's RAM usage by setting the memory soft limit to the allocated memory and the hard limit to the allocated memory * `allowed_ram_space`. - {cgroup.conf::OPT_ConstrainRAMSpace) + + {cgroup.conf#OPT_ConstrainRAMSpace} constrain_swap_space (bool): When `True`, then constrain the job's swap space usage. - {cgroup.conf::OPT_ConstrainSwapSpace) + + {cgroup.conf#OPT_ConstrainSwapSpace} max_ram_percent (float): Upper bound in percent of total RAM (configured RealMemory of the node) on the RAM constraint for a job. - {cgroup.conf::OPT_MaxRAMPercent) + + {cgroup.conf#OPT_MaxRAMPercent} max_swap_percent (float): Upper bound (in percent of total RAM, configured RealMemory of the node) on the amount of RAM+Swap that may be used for a job. - {cgroup.conf::OPT_MaxSwapPercent) + + {cgroup.conf#OPT_MaxSwapPercent} memory_swappiness (float): Configures the kernel's priority for swapping out anonymous pages verses file cache pages for the job cgroup. Only for `cgroup/v1`. A value of `-1.0` means that the kernel's default swappiness value will be used. - {cgroup.conf::OPT_MemorySwappiness) + + {cgroup.conf#OPT_MemorySwappiness} min_ram_space (int): Lower bound (in Mebibytes) on the memory limits defined by `allowed_ram_space` and `allowed_swap_space`. - {cgroup.conf::OPT_MinRAMSpace) + + {cgroup.conf#OPT_MinRAMSpace} signal_children_processes (bool): When `True`, then send signals (for cancelling, suspending, resuming, etc.) to all children processes in a job/step. 
- {cgroup.conf::OPT_SignalChildrenProcesses) + + {cgroup.conf#OPT_SignalChildrenProcesses} """ cdef public: mountpoint @@ -1016,68 +1240,85 @@ cdef class AccountingGatherConfig: energy_ipmi_frequency (int): Number of seconds between BMC access samples or XCC samples, depending on the plugin used. - {acct_gather.conf::OPT_EnergyIPMIFrequency} + + {acct_gather.conf#OPT_EnergyIPMIFrequency} energy_ipmi_calc_adjustment (bool): When `True`, the consumption between the last BMC access sample and a step consumption update is approximated to get more accurate task consumption. - {acct_gather.conf::OPT_EnergyIPMICalcAdjustment} + + {acct_gather.conf#OPT_EnergyIPMICalcAdjustment} energy_ipmi_power_sensors (str): IDs of the sensors to used. - {acct_gather.conf::OPT_EnergyIPMIPowerSensors} + + {acct_gather.conf#OPT_EnergyIPMIPowerSensors} energy_ipmi_user_name (str): BMC Username - {acct_gather.conf::OPT_EnergyIPMIUsername} + + {acct_gather.conf#OPT_EnergyIPMIUsername} energy_ipmi_password (str): BMC Password - {acct_gather.conf::OPT_EnergyIPMIPassword} + + {acct_gather.conf#OPT_EnergyIPMIPassword} energy_ipmi_timeout (int): Timeout, in seconds, for initializing the IPMI XCC context for a new gathering thread. Default is 10 seconds. - {acct_gather.conf::OPT_EnergyIPMITimeout} + + {acct_gather.conf#OPT_EnergyIPMITimeout} profile_hdf5_dir (str): Path to the shared folder into which the `acct_gather_profile` plugin will write detailed data. - {acct_gather.conf::OPT_ProfileHDF5Dir} + + {acct_gather.conf#OPT_ProfileHDF5Dir} profile_hdf5_default (list[str]): List of data types to be collected for each job submission. - {acct_gather.conf::OPT_ProfileHDF5Default} + + {acct_gather.conf#OPT_ProfileHDF5Default} profile_influxdb_database (str): InfluxDB v1.x database name where profiling information is to be written. InfluxDB v2.x bucket name where profiling information is to be written. 
- {acct_gather.conf::OPT_ProfileInfluxDBDatabase} + + {acct_gather.conf#OPT_ProfileInfluxDBDatabase} profile_influxdb_default (list[str]): List of data types to be collected for each job submission. - {acct_gather.conf::OPT_ProfileInfluxDBDefault} + + {acct_gather.conf#OPT_ProfileInfluxDBDefault} profile_influxdb_host (str): The hostname of the machine where the InfluxDB instance is executed and the port used by the HTTP API. - {acct_gather.conf::OPT_ProfileInfluxDBHost} + + {acct_gather.conf#OPT_ProfileInfluxDBHost} profile_influxdb_password (str): Password for `profile_influxdb_user` - {acct_gather.conf::OPT_ProfileInfluxDBPass} + + {acct_gather.conf#OPT_ProfileInfluxDBPass} profile_influxdb_rtpolicy (str): The InfluxDB v1.x retention policy name for the database configured in ProfileInfluxDBDatabase option. The InfluxDB v2.x retention policy bucket name for the database configured in ProfileInfluxDBDatabase option. - {acct_gather.conf::OPT_ProfileInfluxDBRTPolicy} + + {acct_gather.conf#OPT_ProfileInfluxDBRTPolicy} profile_influxdb_user (str): InfluxDB username that should be used to gain access to the database configured in `profile_influxdb_database`. - {acct_gather.conf::OPT_ProfileInfluxDBRTUser} + + {acct_gather.conf#OPT_ProfileInfluxDBUser} profile_influxdb_timeout (int): The maximum time in seconds that an HTTP query to the InfluxDB server can take. - {acct_gather.conf::OPT_ProfileInfluxDBTimeout} + + {acct_gather.conf#OPT_ProfileInfluxDBTimeout} infiniband_ofed_port (int): Represents the port number of the local Infiniband card that we are willing to monitor. - {acct_gather.conf::OPT_InfinibandOFEDPort} + + {acct_gather.conf#OPT_InfinibandOFEDPort} sysfs_interfaces (list[str]): List of interface names to collect statistics from.
- {acct_gather.conf::OPT_SysfsInterfaces} + + {acct_gather.conf#OPT_SysfsInterfaces} """ cdef public: energy_ipmi_frequency From 4fd6443bca70702ee85b00f6eaec7a212f8cd479 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Fri, 17 Jan 2025 19:16:33 +0100 Subject: [PATCH 16/31] mkdocs.yml: add pymdownx.magiclink --- mkdocs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/mkdocs.yml b/mkdocs.yml index 723b429f..c0b6ba3a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -73,6 +73,7 @@ markdown_extensions: - pymdownx.inlinehilite - pymdownx.superfences - pymdownx.details + - pymdownx.magiclink extra: version: From d0e744bb67147844ea10827b670b49ec4c4fe584 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Fri, 17 Jan 2025 19:17:27 +0100 Subject: [PATCH 17/31] docs: add griffe_exts.py script for prettier docs This allows to dynamically link to the Slurm Documentation with different Slurm versions. For example it will replace this in the docstring: {slurm.conf#OPT_MpiParams} with this in the finally rendered docs, when we are on pyslurm 24.11.x: https://slurm.schedmd.com/archive/slurm-24.11-latest/slurm.conf.html#OPT_MpiParams This way we can dynamically link to the Slurm docs. 
--- mkdocs.yml | 2 + scripts/griffe_exts.py | 98 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 scripts/griffe_exts.py diff --git a/mkdocs.yml b/mkdocs.yml index c0b6ba3a..c80f2d93 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -60,6 +60,8 @@ plugins: show_root_heading: true show_symbol_type_toc: true show_symbol_type_heading: true + extensions: + - scripts/griffe_exts.py:DynamicDocstrings markdown_extensions: - admonition diff --git a/scripts/griffe_exts.py b/scripts/griffe_exts.py new file mode 100644 index 00000000..12457c2b --- /dev/null +++ b/scripts/griffe_exts.py @@ -0,0 +1,98 @@ +######################################################################### +# scripts/griffe_exts.py - griffe extensions for documentation +######################################################################### +# Copyright (C) 2025 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +import ast +import inspect +import griffe +import pyslurm +import re + +logger = griffe.get_logger(__name__) +SLURM_VERSION = ".".join(pyslurm.__version__.split(".")[:-1]) +SLURM_DOCS_URL_BASE = "https://slurm.schedmd.com/archive" +SLURM_DOCS_URL_VERSIONED = f"{SLURM_DOCS_URL_BASE}/slurm-{SLURM_VERSION}-latest" + +config_files = ["acct_gather.conf", "slurm.conf", "cgroup.conf", "mpi.conf"] + + +def replace_with_slurm_docs_url(match): + first_part = match.group(1) + second_part = match.group(2) + ref = f"[{first_part}{second_part}]" + return f'{ref}({SLURM_DOCS_URL_VERSIONED}/{first_part}.html{second_part})' + + +pattern = re.compile( + r'\{(' + + '|'.join([re.escape(config) for config in config_files]) + + r')' # Match the first word before "#" + + r'([#][^}]+)\}' # Match "#" and everything after it until } +) + +# This class is inspired from here, with a few adaptions: +# https://github.com/mkdocstrings/griffe/blob/97f3613c5f0ae5653e8b91479c716b9ec44baacc/docs/guide/users/extending.md#full-example +# +# ISC License +# +# Copyright (c) 2021, Timothée Mazzucotelli +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+class DynamicDocstrings(griffe.Extension): + def __init__(self, object_paths: list[str] | None = None) -> None: + self.object_paths = object_paths + + def on_instance( + self, + node: ast.AST | griffe.ObjectNode, + obj: griffe.Object, + agent: griffe.Visitor | griffe.Inspector, + **kwargs, + ) -> None: + if self.object_paths and obj.path not in self.object_paths: + return + + try: + runtime_obj = griffe.dynamic_import(obj.path) + docstring = runtime_obj.__doc__ + except ImportError: + logger.debug(f"Could not get dynamic docstring for {obj.path}") + return + except AttributeError: + logger.debug(f"Object {obj.path} does not have a __doc__ attribute") + return + + if not docstring or not obj.docstring: + return + + # Update the object instance with the evaluated docstring. + fmt_docstring = pattern.sub(replace_with_slurm_docs_url, docstring) + docstring = inspect.cleandoc(fmt_docstring) + obj.docstring.value = docstring From eca500a196ba496febe4f55c2ca65b1fc14b2c64 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Fri, 17 Jan 2025 19:46:31 +0100 Subject: [PATCH 18/31] slurmctld.config: update some params and docs --- pyslurm/core/slurmctld/config.pxd | 76 ++++++++++++++++++++++++++----- pyslurm/core/slurmctld/config.pyx | 37 ++++++++------- 2 files changed, 82 insertions(+), 31 deletions(-) diff --git a/pyslurm/core/slurmctld/config.pxd b/pyslurm/core/slurmctld/config.pxd index 490894e2..241f7b7b 100644 --- a/pyslurm/core/slurmctld/config.pxd +++ b/pyslurm/core/slurmctld/config.pxd @@ -95,9 +95,13 @@ cdef class Config: Name of the machine hosting the slurm database. {slurm.conf#OPT_AccountingStorageHost} - accounting_storage_parameters (dict[str, str]): + accounting_storage_parameters (dict[str, Union[str, int, bool]]): Options for the accounting storage Plugin + If a value in this dict is `True`, it means this parameter does not + have any additional options specified, and is just an "enabled" + option. 
+ {slurm.conf#OPT_AccountingStorageParameters} accounting_storage_port (int): Listening port of the Accounting Database Server @@ -149,9 +153,13 @@ cdef class Config: List of alternative authentication plugins the slurmctld permits. {slurm.conf#OPT_AuthAltTypes} - auth_alt_parameters (dict[str, str]: + auth_alt_parameters (dict[str, Union[str, int, bool]]): Options for the alternative authentication plugins. + If a value in this dict is `True`, it means this parameter does not + have any additional options specified, and is just an "enabled" + option. + {slurm.conf#OPT_AuthAltParameters} auth_info (list[str]): List of additional information used for authentication of @@ -175,9 +183,13 @@ cdef class Config: through `sbcast` or `srun --bcast`. {slurm.conf#OPT_BcastExclude} - bcast_parameters (dict[str, str]: + bcast_parameters (dict[str, Union[str, int, bool]]): Options for `sbcast` and `srun --bcast` behaviour. + If a value in this dict is `True`, it means this parameter does not + have any additional options specified, and is just an "enabled" + option. + {slurm.conf#OPT_BcastParameters} burst_buffer_type (str): Plugin used to manage burst buffers. @@ -200,6 +212,10 @@ cdef class Config: communication_parameters (dict[str, Union[str, int]]): Communication options for Cluster daemons. + If a value in this dict is `True`, it means this parameter does not + have any additional options specified, and is just an "enabled" + option. + {slurm.conf#OPT_CommunicationParameters} complete_wait_time (int): The time to wait, in seconds, when any job is in the COMPLETING state @@ -236,9 +252,13 @@ cdef class Config: Default real memory size available per allocated Node in Mebibytes. {slurm.conf#OPT_DefMemPerNode} - dependency_parameters (list[str]): + dependency_parameters (dict[str, Union[str, int, bool]]): List of parameters for dependencies.
+ If a value in this dict is `True`, it means this parameter does not + have any additional options specified, and is just an "enabled" + option. + {slurm.conf#OPT_DependencyParameters} disable_root_jobs (bool): Whether root can submit Jobs or not. @@ -274,9 +294,13 @@ cdef class Config: allocated resources. {slurm.conf#OPT_FairShareDampeningFactor} - federation_parameters (list[str]): + federation_parameters (dict[str, Union[str, int, bool]]): Options for Federations + If a value in this dict is `True`, it means this parameter does not + have any additional options specified, and is just an "enabled" + option. + {slurm.conf#OPT_FederationParameters} first_job_id (int): The job id to be used for the first job submitted. @@ -361,9 +385,13 @@ cdef class Config: `job_completion_type` {slurm.conf#OPT_JobCompLoc} - job_completion_parameters (list[str]): + job_completion_parameters (dict[str, Union[str, int, bool]]): Arbitrary text passed to the Job completion plugin. + If a value in this dict is `True`, it means this parameter does not + have any additional options specified, and is just an "enabled" + option. + {slurm.conf#OPT_JobCompParams} job_completion_port (int): The listening port of the job completion database server. @@ -490,9 +518,13 @@ cdef class Config: Default type of MPI that will be used. {slurm.conf#OPT_MpiDefault} - mpi_parameters (list[str]): + mpi_parameters (dict[str, Union[str, int, bool]]): Parameters for MPI. + If a value in this dict is `True`, it means this parameter does not + have any additional options specified, and is just an "enabled" + option. + {slurm.conf#OPT_MpiParams} message_timeout (int): Time permitted for a round-trip communication to complete in @@ -528,9 +560,13 @@ cdef class Config: Mechanism used to preempt jobs or enable gang scheduling. {slurm.conf#OPT_PreemptMode} - preempt_parameters (list[str]): + preempt_parameters (dict[str, Union[str, int, bool]]): Options for the Preempt Plugin. 
+ If a value in this dict is `True`, it means this parameter does not + have any additional options specified, and is just an "enabled" + option. + {slurm.conf#OPT_PreemptParameters} preempt_type (str): Plugin used to identify which jobs can be preempted. @@ -728,9 +764,13 @@ cdef class Config: The initial level of scheduling event logging. {slurm.conf#OPT_SlurmSchedLogLevel} - scheduler_parameters (list[str]): + scheduler_parameters (dict[str, Union[str, int, bool]]): List of options for the `scheduler_type` plugin. + If a value in this dict is `True`, it means this parameter does not + have any additional options specified, and is just an "enabled" + option. + {slurm.conf#OPT_SchedulerParameters} scheduler_time_slice (int): Number of seconds in each time slice when gang scheduling is @@ -827,9 +867,13 @@ cdef class Config: Pathname of the file where `slurmd` writes logs to. {slurm.conf#OPT_SlurmdLogFile} - slurmd_parameters (list[str]): + slurmd_parameters (dict[str, Union[str, int, bool]]): Parameters for the `slurmd`. + If a value in this dict is `True`, it means this parameter does not + have any additional options specified, and is just an "enabled" + option. + {slurm.conf#OPT_SlurmdParameters} slurmd_pid_file (str): Pathname of a file into which the `slurmd` daemon may write its @@ -913,9 +957,13 @@ cdef class Config: communications. {slurm.conf#OPT_SwitchType} - switch_parameters (list[str]): + switch_parameters (dict[str, Union[str, int, bool]]): Optional parameters for the switch plugin. + If a value in this dict is `True`, it means this parameter does not + have any additional options specified, and is just an "enabled" + option. + {slurm.conf#OPT_SwitchParameters} task_epilog (str): Pathname of a program to be executed as the slurm job's owner after @@ -949,9 +997,13 @@ cdef class Config: storage. 
{slurm.conf#OPT_TmpFS} - topology_parameters (list[str]): + topology_parameters (dict[str, Union[str, int, bool]]): List of network topology options + If a value in this dict is `True`, it means this parameter does not + have any additional options specified, and is just an "enabled" + option. + {slurm.conf#OPT_TopologyParam} topology_plugin (str): Identifies the plugin to be used for determining the network diff --git a/pyslurm/core/slurmctld/config.pyx b/pyslurm/core/slurmctld/config.pyx index 54455525..b41b3f45 100644 --- a/pyslurm/core/slurmctld/config.pyx +++ b/pyslurm/core/slurmctld/config.pyx @@ -268,7 +268,8 @@ cdef class Config: @property def accounting_storage_parameters(self): - return cstr.to_dict(self.ptr.accounting_storage_params) + return cstr.to_dict(self.ptr.accounting_storage_params, delim1=",", + delim2="=", def_value=True) @property def accounting_storage_port(self): @@ -388,7 +389,6 @@ cdef class Config: @property def communication_parameters(self): - # TODO: check again return cstr.to_dict(self.ptr.comm_params, delim1=",", delim2="=", def_value=True) @@ -431,8 +431,8 @@ cdef class Config: @property def dependency_parameters(self): - # TODO: check format again - return cstr.to_list(self.ptr.dependency_params) + return cstr.to_dict(self.ptr.dependency_params, delim1=",", + delim2="=", def_value=True) @property def disable_root_jobs(self): @@ -554,8 +554,8 @@ cdef class Config: @property def job_completion_parameters(self): - # TODO: maybe dict? 
- return cstr.to_list(self.ptr.job_comp_params) + return cstr.to_dict(self.ptr.job_comp_params, delim1=",", + delim2="=", def_value=True) @property def job_completion_port(self): @@ -698,8 +698,8 @@ cdef class Config: @property def mpi_parameters(self): - # TODO: check format again - return cstr.to_list(self.ptr.mpi_params) + return cstr.to_dict(self.ptr.mpi_params, delim1=",", + delim2="=", def_value=True) @property def message_timeout(self): @@ -737,8 +737,8 @@ cdef class Config: @property def preempt_parameters(self): - # TODO: check format again - return cstr.to_list(self.ptr.preempt_params) + return cstr.to_dict(self.ptr.preempt_params, delim1=",", + delim2="=", def_value=True) @property def preempt_type(self): @@ -759,7 +759,6 @@ cdef class Config: @property def priority_calc_period(self): - # TODO: seconds or minutes? return u32_parse(self.ptr.priority_calc_period) @property @@ -938,8 +937,8 @@ cdef class Config: @property def scheduler_parameters(self): - # TODO: check format again - return cstr.to_list(self.ptr.sched_params) + return cstr.to_dict(self.ptr.sched_params, delim1=",", + delim2="=", def_value=True) @property def scheduler_time_slice(self): @@ -1046,8 +1045,8 @@ cdef class Config: @property def slurmd_parameters(self): - # TODO: Check again - return cstr.to_list(self.ptr.slurmd_params) + return cstr.to_dict(self.ptr.slurmd_params, delim1=",", + delim2="=", def_value=True) @property def slurmd_pid_file(self): @@ -1124,8 +1123,8 @@ cdef class Config: @property def switch_parameters(self): - # TODO: Check format again - return cstr.to_list(self.ptr.switch_param) + return cstr.to_dict(self.ptr.switch_param, delim1=",", + delim2="=", def_value=True) @property def task_epilog(self): @@ -1167,8 +1166,8 @@ cdef class Config: @property def topology_parameters(self): - # TODO: check format again - return cstr.to_list(self.ptr.topology_param) + return cstr.to_dict(self.ptr.topology_param, delim1=",", + delim2="=", def_value=True) @property def 
topology_plugin(self): From 130478cca4c99c6824a1e5def299a6c2f2966cf6 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sat, 18 Jan 2025 11:26:40 +0100 Subject: [PATCH 19/31] slurmctld: seperate enums into different file --- pyslurm/core/slurmctld/__init__.py | 2 +- pyslurm/core/slurmctld/base.pyx | 8 +------ pyslurm/core/slurmctld/enums.pyx | 38 ++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 8 deletions(-) create mode 100644 pyslurm/core/slurmctld/enums.pyx diff --git a/pyslurm/core/slurmctld/__init__.py b/pyslurm/core/slurmctld/__init__.py index f8130aab..fe8f970b 100644 --- a/pyslurm/core/slurmctld/__init__.py +++ b/pyslurm/core/slurmctld/__init__.py @@ -4,9 +4,9 @@ AccountingGatherConfig, CgroupConfig, ) +from .enums import ShutdownMode from .base import ( PingResponse, - ShutdownMode, ping, ping_primary, ping_backup, diff --git a/pyslurm/core/slurmctld/base.pyx b/pyslurm/core/slurmctld/base.pyx index 48425bd8..e01ecb38 100644 --- a/pyslurm/core/slurmctld/base.pyx +++ b/pyslurm/core/slurmctld/base.pyx @@ -30,13 +30,7 @@ import time from enum import IntEnum from .config import Config from pyslurm.utils.uint import u16_parse - - -class ShutdownMode(IntEnum): - """Mode of operation for shutdown action""" - ALL = 0 - CORE_FILE = 1 - CONTROLLER_ONLY = 2 +from .enums import ShutdownMode cdef class PingResponse: diff --git a/pyslurm/core/slurmctld/enums.pyx b/pyslurm/core/slurmctld/enums.pyx new file mode 100644 index 00000000..df7a1ac0 --- /dev/null +++ b/pyslurm/core/slurmctld/enums.pyx @@ -0,0 +1,38 @@ +######################################################################### +# slurmctld/enums.pyx - pyslurm slurmctld enums +######################################################################### +# Copyright (C) 2025 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; 
either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from enum import IntEnum + +class ShutdownMode(IntEnum): + """Mode of operation for shutdown action.""" + ALL = 0 + CORE_FILE = 1 + CONTROLLER_ONLY = 2 + + +# A bit hacky, but it works for now. Putting the docstring under the enum value +# does not work unfortunately. +ShutdownMode.ALL.__doc__ = "Shutdown all daemons (slurmctld and slurmd)" +ShutdownMode.CORE_FILE.__doc__ = "Shutdown only slurmctld, and create a coredump" +ShutdownMode.CONTROLLER_ONLY.__doc__ = "Shutdown only slurmctld, without a coredump" From a73708dc1345f960578ae558ff400b5273516d60 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sat, 18 Jan 2025 11:27:11 +0100 Subject: [PATCH 20/31] slurmctld: more docs --- pyslurm/core/slurmctld/base.pxd | 15 ++++++++++- pyslurm/core/slurmctld/base.pyx | 45 ++++++++++++++++++++++++++----- pyslurm/core/slurmctld/config.pxd | 8 +++++- pyslurm/core/slurmctld/config.pyx | 44 ++++++++---------------------- 4 files changed, 71 insertions(+), 41 deletions(-) diff --git a/pyslurm/core/slurmctld/base.pxd b/pyslurm/core/slurmctld/base.pxd index 51a6de64..fd6f6a4d 100644 --- a/pyslurm/core/slurmctld/base.pxd +++ b/pyslurm/core/slurmctld/base.pxd @@ -40,8 +40,21 @@ from pyslurm.utils cimport cstr cdef class PingResponse: - """Slurm Controller Ping response information""" + """Slurm Controller Ping response information + Attributes: + is_primary (bool): + 
Whether this Slurm Controller is the primary Server. + is_responding (bool): + Whether this Slurm Controller actually responds to the ping. + index (int): + The index in the slurm.conf. For example, 0 means primary. + hostname (str): + Hostname of the Controller + latency (float): + The latency which the Controller responds with. This is in + milliseconds. + """ cdef public: is_primary is_responding diff --git a/pyslurm/core/slurmctld/base.pyx b/pyslurm/core/slurmctld/base.pyx index e01ecb38..b8ed0a88 100644 --- a/pyslurm/core/slurmctld/base.pyx +++ b/pyslurm/core/slurmctld/base.pyx @@ -25,11 +25,11 @@ from pyslurm.core.error import verify_rpc, RPCError from pyslurm.utils.helpers import instance_to_dict from pyslurm.utils import cstr +from pyslurm.utils.uint import u16_parse from typing import Union import time from enum import IntEnum from .config import Config -from pyslurm.utils.uint import u16_parse from .enums import ShutdownMode @@ -88,6 +88,12 @@ def ping_primary(): Returns: (pyslurm.slurmctld.PingResponse): a ping response + + Examples: + >>> from pyslurm import slurmctld + >>> resp = slurmctld.ping_primary() + >>> print(resp.hostname, resp.latency, resp.is_primary) + slurmctl 1.222 True """ return ping(0) @@ -99,6 +105,12 @@ def ping_backup(): Returns: (pyslurm.slurmctld.PingResponse): a ping response + + Examples: + >>> from pyslurm import slurmctld + >>> resp = slurmctld.ping_backup() + >>> print(resp.hostname, resp.latency, resp.is_primary) + slurmctlbackup 1.373 False """ return ping(1) @@ -140,6 +152,10 @@ def shutdown(mode: Union[ShutdownMode, int]): Raises: (pyslurm.RPCError): When shutdowning the daemons was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.shutdown(slurmctld.ShutdownMode.ALL) """ verify_rpc(slurm_shutdown(int(mode))) @@ -149,6 +165,10 @@ def reconfigure(): Raises: (pyslurm.RPCError): When reconfiguring was not successful. 
+ + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.reconfigure() """ verify_rpc(slurm_reconfigure()) @@ -168,6 +188,10 @@ def takeover(index = 1): Raises: (pyslurm.RPCError): When reconfiguring was not successful. + + Examples: + >>> from pyslurm import slurmctld + >>> slurmctld.takeover(1) """ verify_rpc(slurm_takeover(index)) @@ -176,7 +200,7 @@ def add_debug_flags(flags): """Add DebugFlags to slurmctld Args: - flags (list): + flags (list[str]): For an available list of possible values, please check the `slurm.conf` documentation under `DebugFlags`. @@ -185,7 +209,7 @@ def add_debug_flags(flags): Examples: >>> from pyslurm import slurmctld - >>> slurmctld.add_debug_flags(["CpuFrequency"]) + >>> slurmctld.add_debug_flags(["CpuFrequency", "Backfill"]) """ if not flags: return @@ -201,7 +225,7 @@ def remove_debug_flags(flags): """Remove DebugFlags from slurmctld. Args: - flags (list): + flags (list[str]): For an available list of possible values, please check the `slurm.conf` documentation under `DebugFlags`. 
@@ -231,6 +255,8 @@ def clear_debug_flags(): Examples: >>> from pyslurm import slurmctld >>> slurmctld.clear_debug_flags() + >>> print(slurmctld.get_debug_flags()) + [] """ current_flags = get_debug_flags() if not current_flags: @@ -269,6 +295,9 @@ def set_log_level(level): Examples: >>> from pyslurm import slurmctld >>> slurmctld.set_log_level("quiet") + >>> log_level = slurmctld.get_log_level() + >>> print(log_level) + quiet """ data = _log_level_str_to_int(level) verify_rpc(slurm_set_debug_level(data)) @@ -282,8 +311,8 @@ def get_log_level(): Examples: >>> from pyslurm import slurmctld - >>> level = slurmctld.get_log_level() - >>> print(level) + >>> log_level = slurmctld.get_log_level() + >>> print(log_level) quiet """ return Config.load().slurmctld_log_level @@ -298,6 +327,8 @@ def enable_scheduler_logging(): Examples: >>> from pyslurm import slurmctld >>> slurmctld.enable_scheduler_logging() + >>> print(slurmctld.is_scheduler_logging_enabled()) + True """ verify_rpc(slurm_set_schedlog_level(1)) @@ -334,6 +365,8 @@ def set_fair_share_dampening_factor(factor): Examples: >>> from pyslurm import slurmctld >>> slurmctld.set_fair_share_dampening_factor(100) + >>> print(slurmctld.get_fair_share_dampening_factor()) + 100 """ max_value = (2 ** 16) - 1 if not factor or factor >= max_value: diff --git a/pyslurm/core/slurmctld/config.pxd b/pyslurm/core/slurmctld/config.pxd index 241f7b7b..3e5246cd 100644 --- a/pyslurm/core/slurmctld/config.pxd +++ b/pyslurm/core/slurmctld/config.pxd @@ -79,6 +79,12 @@ cdef class Config: All attributes in this class are read-only. Attributes: + cgroup_config (pyslurm.slurmctld.CgroupConfig): + The CGroup Configuration data + accounting_gather_config (pyslurm.slurmctld.AccountingGatherConfig): + The Accounting Gather Configuration data. + mpi_config (pyslurm.slurmctld.MPIConfig): + The MPI Configuration data. accounting_storage_enforce (list[str]): List of enforcements on Job submissions.
@@ -1104,7 +1110,7 @@ cdef class MPIConfig: of the job have the same architecture. {mpi.conf#OPT_PMIxDirectSameArch} - pmix_environment (dict[str, Union[str, int]): + pmix_environment (dict[str, Union[str, int]]): Environment variables to bet set in the Job environment, used by PMIx. diff --git a/pyslurm/core/slurmctld/config.pyx b/pyslurm/core/slurmctld/config.pyx index b41b3f45..387418f1 100644 --- a/pyslurm/core/slurmctld/config.pyx +++ b/pyslurm/core/slurmctld/config.pyx @@ -41,7 +41,9 @@ from typing import Union cdef class MPIConfig: def __init__(self): - raise RuntimeError("Cannot instantiate class directly") + raise RuntimeError("Cannot instantiate class directly. " + "Use slurmctld.Config.load() and access the " + "mpi_config attribute there") def to_dict(self): """MPI config formatted as a dictionary. @@ -82,7 +84,9 @@ cdef class MPIConfig: cdef class CgroupConfig: def __init__(self): - raise RuntimeError("Cannot instantiate class directly") + raise RuntimeError("Cannot instantiate class directly. " + "Use slurmctld.Config.load() and access the " + "cgroup_config attribute there") def to_dict(self): """Cgroup config formatted as a dictionary. @@ -128,7 +132,9 @@ cdef class CgroupConfig: cdef class AccountingGatherConfig: def __init__(self): - raise RuntimeError("Cannot instantiate class directly") + raise RuntimeError("Cannot instantiate class directly. " + "Use slurmctld.Config.load() and access the " + "accounting_gather_config attribute there") def to_dict(self): """AccountingGather config formatted as a dictionary. @@ -152,10 +158,8 @@ cdef class AccountingGatherConfig: out.energy_ipmi_frequency = int(conf.get("EnergyIPMIFrequency", 30)) out.energy_ipmi_calc_adjustment = _yesno_to_bool( conf.get("EnergyIPMICalcAdjustment")) - - # TODO: maybe dict? 
- out.energy_ipmi_power_sensors = conf.get("EnergyIPMIPowerSensors") - + out.energy_ipmi_power_sensors = cstr.to_dict( + conf.get("EnergyIPMIPowerSensors", ""), delim1=";", delim2="=") out.energy_ipmi_user_name = conf.get("EnergyIPMIUsername") out.energy_ipmi_password = conf.get("EnergyIPMIPassword") out.energy_ipmi_timeout = int(conf.get("EnergyIPMITimeout", 10)) @@ -190,22 +194,6 @@ cdef class Config: slurm_free_ctl_conf(self.ptr) self.ptr = NULL - @staticmethod - def load_scontrol(): - cdef Config conf = Config.__new__(Config) - verify_rpc(slurm_load_ctl_conf(0, &conf.ptr)) - - out = _parse_config_key_pairs(slurm_ctl_conf_2_key_pairs(conf.ptr), - owned=True) - out["CgroupSupportConfiguration"] = _parse_config_key_pairs( - conf.ptr.cgroup_conf) - out["AccountingGatherConfiguration"] = _parse_config_key_pairs( - conf.ptr.acct_gather_conf) - out["MPIPluginsConfiguration"] = _parse_config_key_pairs( - conf.ptr.mpi_conf) - - return out - @staticmethod def load(): """Load the current Slurm configuration (slurm.conf) @@ -351,7 +339,6 @@ cdef class Config: @property def batch_start_timeout(self): - # seconds return u16_parse(self.ptr.batch_start_timeout) @property @@ -394,7 +381,6 @@ cdef class Config: @property def complete_wait_time(self): - # seconds return u16_parse(self.ptr.complete_wait) @property @@ -442,7 +428,6 @@ cdef class Config: @property def eio_timeout(self): - # seconds return u16_parse(self.ptr.eio_timeout) @property @@ -458,7 +443,6 @@ cdef class Config: @property def epilog_msg_time(self): - # ms return u32_parse(self.ptr.epilog_msg_time) @property @@ -525,7 +509,6 @@ cdef class Config: @property def inactive_limit(self): - # seconds return u16_parse(self.ptr.inactive_limit) @property @@ -598,7 +581,6 @@ cdef class Config: @property def kill_wait_time(self): - # seconds return u16_parse(self.ptr.kill_wait) @property @@ -754,7 +736,6 @@ cdef class Config: @property def priority_decay_half_life(self): - # seconds return 
u32_parse(self.ptr.priority_decay_hl) @property @@ -852,7 +833,6 @@ cdef class Config: @property def prolog_epilog_timeout(self): - # seconds return u16_parse(self.ptr.prolog_epilog_timeout) @property @@ -942,7 +922,6 @@ cdef class Config: @property def scheduler_time_slice(self): - # seconds return u16_parse(self.ptr.sched_time_slice) @property @@ -1027,7 +1006,6 @@ cdef class Config: @property def slurmctld_timeout(self): - # seconds return u16_parse(self.ptr.slurmctld_timeout) @property From c627af80a4d9203b4bc26170fc396cf93a6f97d3 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sat, 18 Jan 2025 11:27:41 +0100 Subject: [PATCH 21/31] docs: ignore UNLIMITED constant in DynamicDocstrings extension it messes with the docstring for some reason --- mkdocs.yml | 4 +++- scripts/griffe_exts.py | 11 ++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index c80f2d93..040614ec 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -61,7 +61,9 @@ plugins: show_symbol_type_toc: true show_symbol_type_heading: true extensions: - - scripts/griffe_exts.py:DynamicDocstrings + - scripts/griffe_exts.py:DynamicDocstrings: + ignore_paths: + - pyslurm.constants.UNLIMITED markdown_extensions: - admonition diff --git a/scripts/griffe_exts.py b/scripts/griffe_exts.py index 12457c2b..cf91e675 100644 --- a/scripts/griffe_exts.py +++ b/scripts/griffe_exts.py @@ -66,8 +66,11 @@ def replace_with_slurm_docs_url(match): # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
class DynamicDocstrings(griffe.Extension): - def __init__(self, object_paths: list[str] | None = None) -> None: - self.object_paths = object_paths + def __init__(self, include_paths: list[str] | None = None, + ignore_paths: list[str] | None = None) -> None: + + self.include_paths = include_paths + self.ignore_paths = ignore_paths def on_instance( self, @@ -76,7 +79,9 @@ def on_instance( agent: griffe.Visitor | griffe.Inspector, **kwargs, ) -> None: - if self.object_paths and obj.path not in self.object_paths: + + if ((self.include_paths and obj.path not in self.include_paths) + or (self.ignore_paths and obj.path in self.ignore_paths)): return try: From ee49dcc107206730363150c6479bd2ab959d2acd Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sat, 18 Jan 2025 11:28:31 +0100 Subject: [PATCH 22/31] docs: add slurmctld reference API page --- docs/reference/slurmctld.md | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 docs/reference/slurmctld.md diff --git a/docs/reference/slurmctld.md b/docs/reference/slurmctld.md new file mode 100644 index 00000000..f13419fe --- /dev/null +++ b/docs/reference/slurmctld.md @@ -0,0 +1,8 @@ +--- +title: slurmctld +--- + +::: pyslurm.slurmctld + handler: python + options: + members: yes From e4b9114f856d9dd5c65ee190cade36946ad97d87 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sat, 18 Jan 2025 11:29:22 +0100 Subject: [PATCH 23/31] docs: remove old config class documentation --- docs/reference/config.md | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 docs/reference/config.md diff --git a/docs/reference/config.md b/docs/reference/config.md deleted file mode 100644 index 62a36d68..00000000 --- a/docs/reference/config.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Config ---- - -!!! 
warning - This API is currently being completely reworked, and is subject to be - removed in the future when a replacement is introduced - -::: pyslurm.deprecated.config From e26c6405a15c90f7977a1edb915573ce2c486c7a Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sat, 18 Jan 2025 11:43:27 +0100 Subject: [PATCH 24/31] slurmctld:plugin_dirs: use named kwargs --- pyslurm/core/slurmctld/config.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyslurm/core/slurmctld/config.pyx b/pyslurm/core/slurmctld/config.pyx index 387418f1..5a8bf280 100644 --- a/pyslurm/core/slurmctld/config.pyx +++ b/pyslurm/core/slurmctld/config.pyx @@ -701,7 +701,7 @@ cdef class Config: @property def plugin_dirs(self): - return cstr.to_list(self.ptr.plugindir, None, ":") + return cstr.to_list(self.ptr.plugindir, default=None, delim=":") @property def plugin_stack_config(self): From adbdbbb0958217387e331b864a36ae7d88e0c79d Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sat, 18 Jan 2025 11:43:46 +0100 Subject: [PATCH 25/31] update common tests --- tests/unit/test_common.py | 47 +++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/tests/unit/test_common.py b/tests/unit/test_common.py index 6bc3f708..6f50cf45 100644 --- a/tests/unit/test_common.py +++ b/tests/unit/test_common.py @@ -1,7 +1,7 @@ ######################################################################### # test_common.py - common utility tests ######################################################################### -# Copyright (C) 2023 Toni Harzendorf +# Copyright (C) 2025 Toni Harzendorf # # This file is part of PySlurm # @@ -108,6 +108,25 @@ def test_str_to_dict(self): assert cstr.to_dict(input_str) == expected_dict assert cstr.to_dict("") == {} + expected_dict = {"param1": True, "param2": "opt1", "param3": True} + input_str = "param1,param2=opt1,param3" + assert cstr.to_dict(input_str, delim1=",", delim2="=", def_value=True) + + expected_dict = 
{"license1": 1, "license2": 5, "license3": 20} + input_str = "license1,license2:5,license3:20" + assert cstr.to_dict(input_str, delim1=",", delim2=":", def_value=1) + + def test_str_to_list(self): + expected_list = ["val1", "val2", "val3"] + input_str = "val1,val2,val3" + assert cstr.to_list(input_str) == expected_list + + expected_list = ["/path/to/dir1", "/path/to/dir2"] + input_str = "/path/to/dir1:/path/to/dir2" + assert cstr.to_list(input_str, default=None, delim=":") == expected_list + + assert cstr.to_list("") == [] + def test_dict_to_str(self): input_dict = {"key1": "value1", "key2": "value2"} expected_str = "key1=value1,key2=value2" @@ -255,32 +274,6 @@ def test_set_parse_bool_flag(self): assert not part.allow_root_jobs -# def _uint_bool_impl(self, arg): -# js = JobSubmitDescription() - -# setattr(js, arg, True) -# assert getattr(js, arg) == True - -# setattr(js, arg, False) -# assert getattr(js, arg) == False - -# # Set to true again to make sure toggling actually works. -# setattr(js, arg, True) -# assert getattr(js, arg) == True - -# setattr(js, arg, None) -# assert getattr(js, arg) == False - -# def test_u8_bool(self): -# self._uint_bool_impl("overcommit") - -# def test_u16_bool(self): -# self._uint_bool_impl("requires_contiguous_nodes") - -# def test_u64_bool_flag(self): -# self._uint_bool_impl("kill_on_invalid_dependency") - - class TestTime: def test_parse_minutes(self): From 0f74c9f41f12a603ad0b67033545a9c9577e9468 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sat, 18 Jan 2025 11:45:25 +0100 Subject: [PATCH 26/31] fix code spelling errors --- pyslurm/core/slurmctld/config.pxd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyslurm/core/slurmctld/config.pxd b/pyslurm/core/slurmctld/config.pxd index 3e5246cd..b138e9b2 100644 --- a/pyslurm/core/slurmctld/config.pxd +++ b/pyslurm/core/slurmctld/config.pxd @@ -408,7 +408,7 @@ cdef class Config: {slurm.conf#OPT_JobCompType} job_completion_user (str): - User account user 
fo accessing the job completion database. + User account user for accessing the job completion database. {slurm.conf#OPT_JobCompUser} job_container_type (str): @@ -454,7 +454,7 @@ cdef class Config: {slurm.conf#OPT_LogTimeFormat} mail_domain (str): - Domain name to qualify usernames if email address is not explicity + Domain name to qualify usernames if email address is not explicitly given with the `--mail-user` option. {slurm.conf#OPT_MailDomain} @@ -486,11 +486,11 @@ cdef class Config: {slurm.conf#OPT_MaxJobId} max_memory_per_cpu (int): - Maximum real memory size avialable per allocated CPU in Mebibytes. + Maximum real memory size available per allocated CPU in Mebibytes. {slurm.conf#OPT_MaxMemPerCPU} max_memory_per_node (int): - Maximum real memory size avialable per allocated Node in Mebibytes. + Maximum real memory size available per allocated Node in Mebibytes. {slurm.conf#OPT_MaxMemPerNode} max_node_count (int): From 207736ad7c1758ec4ecade473ddab0c5f07ff11f Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sat, 18 Jan 2025 12:19:40 +0100 Subject: [PATCH 27/31] fix docs --- pyslurm/core/slurmctld/config.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyslurm/core/slurmctld/config.pxd b/pyslurm/core/slurmctld/config.pxd index b138e9b2..85980f3e 100644 --- a/pyslurm/core/slurmctld/config.pxd +++ b/pyslurm/core/slurmctld/config.pxd @@ -81,7 +81,7 @@ cdef class Config: Attributes: cgroup_config (pyslurm.slurmctld.CgroupConfig): The CGroup Configuration data - accounting_gather_config (pyslurm.slurmctld.AccountingGatherConfig):. + accounting_gather_config (pyslurm.slurmctld.AccountingGatherConfig): The Accounting Gather Configuration data. mpi_config (pyslurm.slurmctld.MPIConfig): The MPI Configuration data. 
From 0b295751ed2d8d4d54844ebf61a95ad3ef623b8c Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sun, 19 Jan 2025 12:06:13 +0100 Subject: [PATCH 28/31] more doc stuff --- pyslurm/core/slurmctld/base.pyx | 16 ++++++++-------- pyslurm/core/slurmctld/config.pxd | 14 +++++++------- pyslurm/core/slurmctld/config.pyx | 7 ++----- pyslurm/core/slurmctld/enums.pyx | 6 +++--- 4 files changed, 20 insertions(+), 23 deletions(-) diff --git a/pyslurm/core/slurmctld/base.pyx b/pyslurm/core/slurmctld/base.pyx index b8ed0a88..62582078 100644 --- a/pyslurm/core/slurmctld/base.pyx +++ b/pyslurm/core/slurmctld/base.pyx @@ -197,7 +197,7 @@ def takeover(index = 1): def add_debug_flags(flags): - """Add DebugFlags to slurmctld + """Add DebugFlags to `slurmctld` Args: flags (list[str]): @@ -222,7 +222,7 @@ def add_debug_flags(flags): def remove_debug_flags(flags): - """Remove DebugFlags from slurmctld. + """Remove DebugFlags from `slurmctld`. Args: flags (list[str]): @@ -247,7 +247,7 @@ def remove_debug_flags(flags): def clear_debug_flags(): - """Remove all currently set debug flags from slurmctld. + """Remove all currently set debug flags from `slurmctld`. Raises: (pyslurm.RPCError): When removing the debug flags was not successful. @@ -267,7 +267,7 @@ def clear_debug_flags(): def get_debug_flags(): - """Get the current list of debug flags for the slurmctld. + """Get the current list of debug flags for the `slurmctld`. Raises: (pyslurm.RPCError): When getting the debug flags was not successful. @@ -282,7 +282,7 @@ def get_debug_flags(): def set_log_level(level): - """Set the logging level for slurmctld. + """Set the logging level for `slurmctld`. Args: level (str): @@ -304,7 +304,7 @@ def set_log_level(level): def get_log_level(): - """Get the current log level for the slurmctld. + """Get the current log level for the `slurmctld`. Raises: (pyslurm.RPCError): When getting the log level was not successful. 
@@ -319,7 +319,7 @@ def get_log_level(): def enable_scheduler_logging(): - """Enable scheduler logging for slurmctld. + """Enable scheduler logging for `slurmctld`. Raises: (pyslurm.RPCError): When enabling scheduler logging was not successful. @@ -334,7 +334,7 @@ def enable_scheduler_logging(): def is_scheduler_logging_enabled(): - """Check whether scheduler logging is enabled for slurmctld. + """Check whether scheduler logging is enabled for `slurmctld`. Returns: (bool): Whether scheduler logging is enabled or not. diff --git a/pyslurm/core/slurmctld/config.pxd b/pyslurm/core/slurmctld/config.pxd index 85980f3e..664b391e 100644 --- a/pyslurm/core/slurmctld/config.pxd +++ b/pyslurm/core/slurmctld/config.pxd @@ -441,7 +441,7 @@ cdef class Config: `SIGTERM` and `SIGKILL` signals upon reaching its time limit. {slurm.conf#OPT_KillWait} - launch_parameters (list[str]) + launch_parameters (list[str]): Options for the job launch plugin. {slurm.conf#OPT_LaunchParameters} @@ -606,8 +606,8 @@ cdef class Config: {slurm.conf#OPT_PriorityFlags} priority_max_age (int): - Job age that is needed before receiving the maximum age factor in - computing priority. + Job age (in seconds) that is needed before receiving the maximum + age factor in computing priority. {slurm.conf#OPT_PriorityMaxAge} priority_parameters (str): @@ -1048,7 +1048,7 @@ cdef class Config: percentage of its real memory limit. {slurm.conf#OPT_VSizeFactor} - default_job_wait_time (int): + wait_time (int): Specifies how many seconds the srun command should by default wait after the first task terminates before terminating all remaining tasks. @@ -1077,7 +1077,7 @@ cdef class Config: # # Copyright (C) 2022 SchedMD LLC. cdef class MPIConfig: - """Slurm MPI Config (mpi.conf) + """Slurm MPI Config (`mpi.conf`) Attributes: pmix_cli_tmp_dir_base (str): @@ -1164,7 +1164,7 @@ cdef class MPIConfig: # pyslurm/slurm/SLURM_DISCLAIMER). # Copyright (C) 2010-2022 SchedMD LLC. 
cdef class CgroupConfig: - """Slurm Cgroup Config (cgroup.conf) + """Slurm Cgroup Config (`cgroup.conf`) Attributes: mountpoint (str): @@ -1292,7 +1292,7 @@ cdef class CgroupConfig: # Copyright (C) 2012-2013 Bull. # Copyright (C) 2012-2022 SchedMD LLC. cdef class AccountingGatherConfig: - """Slurm Accounting Gather Config (acct_gather.conf) + """Slurm Accounting Gather Config (`acct_gather.conf`) Attributes: energy_ipmi_frequency (int): diff --git a/pyslurm/core/slurmctld/config.pyx b/pyslurm/core/slurmctld/config.pyx index 5a8bf280..f72cac75 100644 --- a/pyslurm/core/slurmctld/config.pyx +++ b/pyslurm/core/slurmctld/config.pyx @@ -752,8 +752,7 @@ cdef class Config: return cstr.to_list_free(&data) @property - def priortiy_max_age(self): - # TODO: seconds or minutes? + def priority_max_age(self): return u32_parse(self.ptr.priority_max_age) @property @@ -912,7 +911,6 @@ cdef class Config: @property def scheduler_logging_enabled(self): - # TODO: check again return u16_parse_bool(self.ptr.sched_log_level) @property @@ -1184,8 +1182,7 @@ cdef class Config: return u16_parse(self.ptr.vsize_factor) @property - def default_job_wait_time(self): - # TODO: reconsider name + def wait_time(self): return u16_parse(self.ptr.wait_time) @property diff --git a/pyslurm/core/slurmctld/enums.pyx b/pyslurm/core/slurmctld/enums.pyx index df7a1ac0..e2fe7cf7 100644 --- a/pyslurm/core/slurmctld/enums.pyx +++ b/pyslurm/core/slurmctld/enums.pyx @@ -33,6 +33,6 @@ class ShutdownMode(IntEnum): # A bit hacky, but it works for now. Putting the docstring under the enum value # does not work unfortunately. 
-ShutdownMode.ALL.__doc__ = "Shutdown all daemons (slurmctld and slurmd)" -ShutdownMode.CORE_FILE.__doc__ = "Shutdown only slurmctld, and create a coredump" -ShutdownMode.CONTROLLER_ONLY.__doc__ = "Shutdown only slurmctld, without a coredump" +ShutdownMode.ALL.__doc__ = "Shutdown all daemons (`slurmctld` and `slurmd`)" +ShutdownMode.CORE_FILE.__doc__ = "Shutdown only `slurmctld`, and create a coredump" +ShutdownMode.CONTROLLER_ONLY.__doc__ = "Shutdown only `slurmctld`, without a coredump" From 273892435c73d140853ba76139a9736f0ffdc354 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sun, 19 Jan 2025 12:06:54 +0100 Subject: [PATCH 29/31] wip --- mkdocs.yml | 4 +--- scripts/griffe_exts.py | 5 ++++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index 040614ec..c80f2d93 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -61,9 +61,7 @@ plugins: show_symbol_type_toc: true show_symbol_type_heading: true extensions: - - scripts/griffe_exts.py:DynamicDocstrings: - ignore_paths: - - pyslurm.constants.UNLIMITED + - scripts/griffe_exts.py:DynamicDocstrings markdown_extensions: - admonition diff --git a/scripts/griffe_exts.py b/scripts/griffe_exts.py index cf91e675..905f8358 100644 --- a/scripts/griffe_exts.py +++ b/scripts/griffe_exts.py @@ -97,7 +97,10 @@ def on_instance( if not docstring or not obj.docstring: return - # Update the object instance with the evaluated docstring. 
fmt_docstring = pattern.sub(replace_with_slurm_docs_url, docstring) + if fmt_docstring == docstring: + # No need to update the docstring if nothing has changed + return + docstring = inspect.cleandoc(fmt_docstring) obj.docstring.value = docstring From 692bc854fa898385a53d9c7cb2a87d32eb9e421d Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sun, 19 Jan 2025 12:07:29 +0100 Subject: [PATCH 30/31] remove the setup.py clean step from builddocs.sh --- scripts/builddocs.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/builddocs.sh b/scripts/builddocs.sh index 9b1aeaea..23e37ce3 100755 --- a/scripts/builddocs.sh +++ b/scripts/builddocs.sh @@ -17,7 +17,6 @@ done shift $((OPTIND-1)) -python setup.py clean pip install -r doc_requirements.txt scripts/build.sh -j${OPT_JOBS} -d mkdocs build From 82f1376d9c5cf5d5a8422d51f594346a70928685 Mon Sep 17 00:00:00 2001 From: Toni Harzendorf Date: Sun, 19 Jan 2025 12:39:55 +0100 Subject: [PATCH 31/31] more doc fixes --- pyslurm/core/slurmctld/base.pyx | 4 ++-- pyslurm/core/slurmctld/config.pxd | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyslurm/core/slurmctld/base.pyx b/pyslurm/core/slurmctld/base.pyx index 62582078..efdd14d3 100644 --- a/pyslurm/core/slurmctld/base.pyx +++ b/pyslurm/core/slurmctld/base.pyx @@ -177,10 +177,10 @@ def takeover(index = 1): """Let a Backup Slurm Controller take over as the Primary. Args: - index (int, optional = 1): + index (int, optional=1): Index of the Backup Controller that should take over. By default, the `index` is `1`, meaning the next Controller configured after - the Primary in slurm.conf (second `SlurmctlHost` entry) will be + the Primary in `slurm.conf` (second `SlurmctldHost` entry) will be asked to take over operation. 
If you have more than one backup controller configured, you can for diff --git a/pyslurm/core/slurmctld/config.pxd b/pyslurm/core/slurmctld/config.pxd index 664b391e..75efd735 100644 --- a/pyslurm/core/slurmctld/config.pxd +++ b/pyslurm/core/slurmctld/config.pxd @@ -705,7 +705,7 @@ cdef class Config: Program to be executed on each compute node to reboot it. {slurm.conf#OPT_RebootProgram} - reconfig_flags (lisr[str]): + reconfig_flags (list[str]): List of flags to control various actions that may be taken when a reconfigure command is issued (for example with `scontrol reconfig`).