Source code for adaptive_scheduler._scheduler.slurm

"""SLURM for Adaptive Scheduler."""

from __future__ import annotations

import copy
import getpass
import re
import subprocess
import textwrap
from distutils.spawn import find_executable
from functools import cached_property, lru_cache
from typing import TYPE_CHECKING, TypeVar

from adaptive_scheduler._scheduler.base_scheduler import BaseScheduler
from adaptive_scheduler._scheduler.common import run_submit

if TYPE_CHECKING:
    from pathlib import Path
    from typing import Any

    from adaptive_scheduler.utils import EXECUTOR_TYPES


T = TypeVar("T")


def _maybe_as_tuple(
    x: T | tuple[T, ...] | None,
    n: int | None,
    *,
    check_type: type | None = None,
) -> tuple[T, ...] | T | None:
    if x is None:
        return x
    if check_type is not None and not isinstance(x, (check_type, tuple)):
        msg = f"Expected `{check_type}` or `tuple[{check_type}, ...]`, got `{type(x)}`"
        raise TypeError(msg)
    if n is None:
        return x
    if isinstance(x, tuple):
        assert len(x) == n
        return x
    return tuple(copy.deepcopy(x) for _ in range(n))


def _tuple_lengths(*maybe_tuple: tuple[Any, ...] | Any) -> int | None:
    """Get the length of the items that are in tuples."""
    length = None
    for y in maybe_tuple:
        if isinstance(y, tuple):
            if length is None:
                length = len(y)
            elif length != len(y):
                msg = "All tuples should have the same length."
                raise ValueError(msg)
    return length



[docs]
class SLURM(BaseScheduler):
    """Base object for a Scheduler.

    ``cores``, ``nodes``, ``cores_per_node``, ``extra_scheduler``,
    ``executor_type``, ``extra_script``, ``exclusive``, ``extra_env_vars``,
    ``num_threads`` and ``partition`` can be either a single value or a tuple of
    values. If a tuple is given, then the length of the tuple should be the same
    as the number of learners (jobs) that are run. This allows for different
    resources for different jobs.

    Parameters
    ----------
    cores
        Number of cores per job (so per learner.)
        Either use `cores` or `nodes` and `cores_per_node`.
    nodes
        Number of nodes per job (so per learner.)
        Either `nodes` and `cores_per_node` or use `cores`.
    cores_per_node
        Number of cores per node.
        Either `nodes` and `cores_per_node` or use `cores`.
    partition
        The SLURM partition to submit the job to.
    exclusive
        Whether to use exclusive nodes (e.g., if SLURM it adds ``--exclusive`` as option).
    log_folder
        The folder in which to put the log-files.
    mpiexec_executable
        ``mpiexec`` executable. By default `mpiexec` will be
        used (so probably from ``conda``).
    executor_type
        The executor that is used, by default `concurrent.futures.ProcessPoolExecutor` is used.
        One can use ``"ipyparallel"``, ``"dask-mpi"``, ``"mpi4py"``,
        ``"loky"``, ``"sequential"``, or ``"process-pool"``.
    num_threads
        ``MKL_NUM_THREADS``, ``OPENBLAS_NUM_THREADS``, ``OMP_NUM_THREADS``, and
        ``NUMEXPR_NUM_THREADS`` will be set to this number.
    extra_scheduler
        Extra ``#SLURM`` (depending on scheduler type)
        arguments, e.g. ``["--exclusive=user", "--time=1"]`` or a tuple of lists,
        e.g. ``(["--time=10"], ["--time=20"]])`` for two jobs.
    extra_env_vars
        Extra environment variables that are exported in the job
        script. e.g. ``["TMPDIR='/scratch'", "PYTHONPATH='my_dir:$PYTHONPATH'"]``.
    extra_script
        Extra script that will be executed after any environment variables are set,
        but before the main scheduler is run.

    """

    # Attributes that all schedulers need to have
    _ext = ".sbatch"
    _submit_cmd = "sbatch"
    _JOB_ID_VARIABLE = "${SLURM_JOB_ID}"
    _options_flag = "SBATCH"
    _cancel_cmd = "scancel"

    def __init__(  # noqa: PLR0912, PLR0915, C901
        self,
        *,
        cores: int | tuple[int, ...] | None = None,
        nodes: int | tuple[int, ...] | None = None,
        cores_per_node: int | tuple[int, ...] | None = None,
        partition: str | tuple[str, ...] | None = None,
        exclusive: bool | tuple[bool, ...] = True,
        python_executable: str | None = None,
        log_folder: str | Path = "",
        mpiexec_executable: str | None = None,
        executor_type: EXECUTOR_TYPES | tuple[EXECUTOR_TYPES, ...] = "process-pool",
        num_threads: int | tuple[int, ...] = 1,
        extra_scheduler: list[str] | tuple[list[str], ...] | None = None,
        extra_env_vars: list[str] | tuple[list[str], ...] | None = None,
        extra_script: str | tuple[str, ...] | None = None,
        batch_folder: str | Path = "",
    ) -> None:
        """Initialize the scheduler."""
        # Store the original values
        self._cores = cores
        self._nodes = nodes
        self._cores_per_node = cores_per_node
        self._partition = partition
        self._executor_type = executor_type
        self._num_threads = num_threads
        self._exclusive = exclusive
        self.__extra_scheduler = extra_scheduler
        self.__extra_env_vars = extra_env_vars
        self.__extra_script = extra_script

        msg = "Specify either `nodes` and `cores_per_node`, or only `cores`, not both."
        if cores is None:
            if nodes is None or cores_per_node is None:
                raise ValueError(msg)
        elif nodes is not None or cores_per_node is not None:
            raise ValueError(msg)

        if extra_scheduler is None:
            extra_scheduler = []
        if extra_env_vars is None:
            extra_env_vars = []
        if extra_script is None:
            extra_script = ""

        # If any is a tuple, then all should be a tuple
        n = _tuple_lengths(
            cores,
            nodes,
            cores_per_node,
            partition,
            executor_type,
            num_threads,
            exclusive,
            extra_scheduler,
            extra_env_vars,
            extra_script,
        )
        single_job_script = n is None
        cores = _maybe_as_tuple(cores, n, check_type=int)
        self.nodes = nodes = _maybe_as_tuple(nodes, n, check_type=int)
        self.cores_per_node = _maybe_as_tuple(cores_per_node, n, check_type=int)
        self.partition = partition = _maybe_as_tuple(partition, n, check_type=str)
        executor_type = _maybe_as_tuple(executor_type, n, check_type=str)  # type: ignore[assignment]
        num_threads = _maybe_as_tuple(num_threads, n, check_type=int)  # type: ignore[assignment]
        self.exclusive = _maybe_as_tuple(exclusive, n, check_type=bool)
        extra_scheduler = _maybe_as_tuple(extra_scheduler, n, check_type=list)
        extra_env_vars = _maybe_as_tuple(extra_env_vars, n, check_type=list)
        extra_script = _maybe_as_tuple(extra_script, n, check_type=str)

        if self.cores_per_node is not None:
            if single_job_script:
                assert isinstance(self.cores_per_node, int)
                assert isinstance(nodes, int)
                assert isinstance(extra_scheduler, list)
                extra_scheduler.append(f"--ntasks-per-node={self.cores_per_node}")
                cores = self.cores_per_node * nodes
            else:
                assert isinstance(self.cores_per_node, tuple)
                assert isinstance(nodes, tuple)
                assert isinstance(extra_scheduler, tuple)
                for lst, cpn in zip(extra_scheduler, self.cores_per_node):
                    assert isinstance(lst, list)
                    lst.append(f"--ntasks-per-node={cpn}")
                cores = tuple(cpn * n for cpn, n in zip(self.cores_per_node, nodes))

        if partition is not None:
            if single_job_script:
                assert isinstance(partition, str)
                assert isinstance(extra_scheduler, list)
                if partition not in self.partitions:
                    msg = f"Invalid partition: {partition}, only {self.partitions} are available."
                    raise ValueError(msg)
                extra_scheduler.append(f"--partition={partition}")
            else:
                if any(p not in self.partitions for p in partition):
                    msg = f"Invalid partition: {partition}, only {self.partitions} are available."
                    raise ValueError(msg)
                assert isinstance(extra_scheduler, tuple)
                for lst, p in zip(extra_scheduler, partition):
                    assert isinstance(lst, list)
                    lst.append(f"--partition={p}")

        if single_job_script:
            assert isinstance(extra_scheduler, list)
            assert isinstance(exclusive, bool)
            if self.exclusive:
                extra_scheduler.append("--exclusive")
        else:
            assert isinstance(extra_scheduler, tuple)
            assert isinstance(self.exclusive, tuple)
            for _ex, lst in zip(self.exclusive, extra_scheduler):
                assert isinstance(lst, list)
                if _ex:
                    lst.append("--exclusive")

        assert cores is not None
        super().__init__(
            cores,
            python_executable=python_executable,
            log_folder=log_folder,
            mpiexec_executable=mpiexec_executable,
            executor_type=executor_type,
            num_threads=num_threads,
            extra_scheduler=extra_scheduler,
            extra_env_vars=extra_env_vars,
            extra_script=extra_script,
            batch_folder=batch_folder,
        )
        # SLURM specific
        self.mpiexec_executable = mpiexec_executable or "srun --mpi=pmi2"

    def __getstate__(self) -> dict[str, Any]:
        """Get the state of the SLURM scheduler."""
        state = super().__getstate__()
        state["cores"] = self._cores
        state["nodes"] = self._nodes
        state["cores_per_node"] = self._cores_per_node
        state["partition"] = self._partition
        state["executor_type"] = self._executor_type
        state["num_threads"] = self._num_threads
        state["exclusive"] = self._exclusive
        state["extra_scheduler"] = self.__extra_scheduler
        state["extra_env_vars"] = self.__extra_env_vars
        state["extra_script"] = self.__extra_script
        return state

    def __setstate__(self, state: dict[str, Any]) -> None:
        """Set the state of the SLURM scheduler."""
        self.__init__(**state)  # type: ignore[misc]

    def _ipyparallel(self, *, index: int | None = None) -> tuple[str, tuple[str, ...]]:
        cores = self._get_cores(index=index)
        job_id = self._JOB_ID_VARIABLE
        profile = "${profile}"
        # We need to reserve one core for the controller
        if self.nodes is not None and self.partition is not None and self.exclusive:
            if self.single_job_script:
                partition = self.partition
                nodes = self.nodes
            else:
                assert isinstance(self.partition, list)
                assert isinstance(self.nodes, list)
                assert index is not None
                partition = self.partition[index]
                nodes = self.nodes[index]
            assert isinstance(partition, str)
            assert isinstance(nodes, int)
            # Limit the number of cores to the maximum number of cores per node
            max_cores_per_node = self.partitions[partition]
            tot_cores = nodes * max_cores_per_node
            cores = min(cores, tot_cores - 1)
        else:  # noqa: PLR5501
            if self.single_job_script:
                assert isinstance(self.cores, int)
                cores = self.cores - 1
            else:
                assert isinstance(self.cores, tuple)
                assert index is not None
                cores = self.cores[index] - 1

        start = textwrap.dedent(
            f"""\
            profile=adaptive_scheduler_{job_id}

            echo "Creating profile {profile}"
            ipython profile create {profile}

            echo "Launching controller"
            ipcontroller --ip="*" --profile={profile} --log-to-file &
            sleep 10

            echo "Launching engines"
            srun --ntasks {cores} ipengine \\
                --profile={profile} \\
                --cluster-id='' \\
                --log-to-file &

            echo "Starting the Python script"
            srun --ntasks 1 {self.python_executable} {self.launcher} \\
            """,
        )
        custom = (f"    --profile {profile}",)
        return start, custom


[docs]
    def job_script(self, options: dict[str, Any], *, index: int | None = None) -> str:
        """Get a jobscript in string form.

        Returns
        -------
        job_script
            A job script that can be submitted to SLURM.
        index
            The index of the job that is being run. This is used when
            specifying different resources for different jobs.

        """
        cores = self._get_cores(index=index)
        job_script = textwrap.dedent(
            f"""\
            #!/bin/bash
            #SBATCH --ntasks {cores}
            #SBATCH --no-requeue
            {{extra_scheduler}}

            {{extra_env_vars}}

            {{extra_script}}

            {{executor_specific}}
            """,
        )

        return job_script.format(
            extra_scheduler=self.extra_scheduler(index=index),
            extra_env_vars=self.extra_env_vars(index=index),
            extra_script=self.extra_script(index=index),
            executor_specific=self._executor_specific("${NAME}", options, index=index),
        )



[docs]
    def start_job(self, name: str, *, index: int | None = None) -> None:
        """Writes a job script and submits it to the scheduler."""
        if self.single_job_script:
            name_prefix = name.rsplit("-", 1)[0]
        else:
            name_prefix = name
            assert index is not None
            options = self._multi_job_script_options(index)
            self.write_job_script(name_prefix, options, index=index)

        (output_fname,) = self.output_fnames(name)
        output_str = str(output_fname).replace(self._JOB_ID_VARIABLE, "%A")
        output_opt = f"--output {output_str}"
        name_opt = f"--job-name {name}"
        submit_cmd = (
            f"{self.submit_cmd} {name_opt} {output_opt} {self.batch_fname(name_prefix)}"
        )
        run_submit(submit_cmd, name)



[docs]
    @staticmethod
    def queue(*, me_only: bool = True) -> dict[str, dict[str, str]]:
        """Get the queue of jobs."""
        python_format = {
            "JobID": 100,
            "Name": 100,
            "state": 100,
            "NumNodes": 100,
            "NumTasks": 100,
            "ReasonList": 4000,
            "SubmitTime": 100,
            "StartTime": 100,
            "UserName": 100,
            "Partition": 100,
        }  # (key -> length) mapping

        slurm_format = ",".join(f"{k}:{v}" for k, v in python_format.items())
        squeue_executable = find_executable("squeue")
        assert isinstance(squeue_executable, str)
        cmd = [
            squeue_executable,
            rf'--Format=",{slurm_format},"',
            "--noheader",
            "--array",
        ]
        if me_only:
            username = getpass.getuser()
            cmd.append(f"--user={username}")
        proc = subprocess.run(cmd, text=True, capture_output=True, check=False)
        output = proc.stdout

        if (
            "squeue: error" in output
            or "slurm_load_jobs error" in output
            or proc.returncode != 0
        ):
            msg = "SLURM is not responding."
            raise RuntimeError(msg)

        def line_to_dict(line: str) -> dict[str, str]:
            chars = list(line)
            info = {}
            for k, v in python_format.items():
                info[k] = "".join(chars[:v]).strip()
                chars = chars[v:]
            return info

        squeue = [line_to_dict(line) for line in output.split("\n")]
        states = ("PENDING", "RUNNING", "CONFIGURING")
        squeue = [info for info in squeue if info["state"] in states]
        running = {info.pop("JobID"): info for info in squeue}
        for info in running.values():
            info["job_name"] = info.pop("Name")
        return running


    @cached_property
    def partitions(self) -> dict[str, int]:
        """Get the partitions of the SLURM scheduler."""
        return slurm_partitions()  # type: ignore[return-value]


[docs]
    @staticmethod
    def cancel_jobs(name: str, *, dry: bool = False) -> None:
        """Cancel jobs with names matching the pattern '{name}-{i}' where i is an integer.

        Parameters
        ----------
        name
            The base name of the jobs to cancel. Jobs with names that start with '{name}-'
            followed by an integer will be canceled.
        dry
            If True, perform a dry run and print the job IDs that would be canceled without
            actually canceling them. Default is False.

        Raises
        ------
        RuntimeError
            If there is an error while canceling the jobs.

        Examples
        --------
        >>> SLURM.cancel_jobs("my_job")
        # Cancels all running jobs with names like "my_job-1", "my_job-2", etc.

        >>> SLURM.cancel_jobs("my_job", dry=True)
        # Prints the job IDs that would be canceled without actually canceling them.

        """
        running_jobs = SLURM.queue()
        job_ids_to_cancel = []

        for job_id, job_info in running_jobs.items():
            job_name = job_info["job_name"]
            if job_name.startswith(f"{name}-"):
                suffix = job_name[len(name) + 1 :]
                if suffix.isdigit():
                    job_ids_to_cancel.append(job_id)

        if job_ids_to_cancel:
            job_ids_str = ",".join(job_ids_to_cancel)
            cmd = f"{SLURM._cancel_cmd} {job_ids_str}"
            if dry:
                print(f"Dry run: would cancel jobs with IDs: {job_ids_str}")
            else:
                try:
                    subprocess.run(cmd.split(), check=True)
                except subprocess.CalledProcessError as e:
                    msg = f"Failed to cancel jobs with name {name}. Error: {e}"
                    raise RuntimeError(
                        msg,
                    ) from e
        else:
            print(f"No running jobs found with name pattern '{name}-<integer>'")




def _get_ncores(partition: str) -> int | None:
    numbers = re.findall(r"\d+", partition)
    if not numbers:
        return None
    return int(numbers[0])



[docs]
@lru_cache(maxsize=1)
def slurm_partitions(
    *,
    timeout: int = 5,
    with_ncores: bool = True,
) -> list[str] | dict[str, int | None]:
    """Get the available slurm partitions, raises subprocess.TimeoutExpired after timeout."""
    output = subprocess.run(
        ["sinfo", "-ahO", "partition"],
        capture_output=True,
        timeout=timeout,
        check=False,
    )
    lines = output.stdout.decode("utf-8").split("\n")
    partitions = sorted(partition for line in lines if (partition := line.strip()))
    # Sort partitions alphabetically, but put the default partition first
    partitions = sorted(partitions, key=lambda s: ("*" not in s, s))
    # Remove asterisk, which is used for default partition
    partitions = [partition.replace("*", "") for partition in partitions]
    if not with_ncores:
        return partitions

    return {partition: _get_ncores(partition) for partition in partitions}