Source code for lst_auto_rta.Auto_RTA

#!/usr/bin/env python

"""Automatically start/stop RTA reconstruction pipeline for new runs during a observation night

New runs are found by querying the TCU pymongo database at regular intervals.
This script:
- copies the conda environment and reconstruction model to the RAM of the slurm nodes
- regularly queries the TCU pymongo database for new runs; for each new run, it:
  - stops the r0->dl1 daemons for the previous runs
  - starts the r0->dl1 daemons for the new run, using a static configuration (CDB configuration)
    from disk, and writing the dynamic configuration to disk as well.
  - starts the "engineering gui" scripts allowing to monitor data processing for the run.
- stops at a fixed hour according to its configuration file
- cleans the RAM of slurm worker nodes
"""

import argparse
import datetime
import json
import logging
import shlex
import signal
import subprocess as sp
import time
from pathlib import Path
from subprocess import CalledProcessError
from threading import Thread
from typing import Dict, List, NamedTuple

from annotated_types import Gt, Le
from pymongo.errors import PyMongoError
from typing_extensions import Annotated

from lst_auto_rta.config.configuration import (
    AutoRTAConfiguration,
    DataStreamConnectionConfiguration,
    ObservationParameters,
)
from lst_auto_rta.observation_data import ObsInfo, get_current_run_info
from lst_auto_rta.paths import RecoPathStructure
from lst_auto_rta.utils.logging import LOGGING_LEVELS_DICT, init_logging
from lst_auto_rta.utils.slurm import (
    job_statistics_from_squeue_output,
    parse_slurm_job_ID,
    subprocess_run_and_raise_exception_on_error,
)


class ConnectionJobInfo(NamedTuple):
    """Mapping between a telescope data server connection and the slurm node running its r0->dl1 job."""

    tel_id: Annotated[int, Gt(0)]
    hostname: str
    port: Annotated[int, Gt(0), Le(65535)]
    slurm_reservation: str
    slurm_node: str


def assign_worker_to_data_connection(
    slurm_nodes: Dict[str, List[str]],
    tel_ids_to_data_servers: Dict[Annotated[int, Gt(0)], List[DataStreamConnectionConfiguration]],
) -> List[ConnectionJobInfo]:
    """Assign a worker node from `slurm_nodes` to each data server connection in `tel_ids_to_data_servers`

    Parameters
    ----------
    slurm_nodes : Dict[str, List[str]]
        Mapping from slurm reservation to slurm nodes, see AutoRTAConfiguration.slurm_nodes field.
    tel_ids_to_data_servers : Dict[Annotated[int, Gt(0)], List[DataStreamConnectionConfiguration]]
        Mapping from telescope ID to data server connections, see AutoRTAConfiguration.tel_ids_to_data_servers

    Returns
    -------
    List[ConnectionJobInfo]
        One ConnectionJobInfo per data server connection, mapping (tel_id, hostname, port)
        to (slurm_reservation, slurm_node).
    """
    # "Flatten" the slurm nodes information into a list of tuples (reservation, node)
    node_list = [(reservation, node) for reservation, nodes in slurm_nodes.items() for node in nodes]
    tel_to_node_map = []
    node_idx = 0
    # Now make the tuples from telescope connections and available nodes.
    for tel_id, tel_data_server_connections in tel_ids_to_data_servers.items():
        for connection_idx, connection in enumerate(tel_data_server_connections):
            tel_to_node_map.append(
                ConnectionJobInfo(
                    tel_id=tel_id,
                    hostname=connection.hostname,
                    port=connection.port,
                    slurm_reservation=node_list[node_idx + connection_idx][0],
                    slurm_node=node_list[node_idx + connection_idx][1],
                )
            )
        node_idx += len(tel_data_server_connections)
    return tel_to_node_map
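

# For illustration of the assignment logic above (hypothetical names and values): with
# slurm_nodes = {"rta-reservation": ["cp01", "cp02", "cp03"]} and
# tel_ids_to_data_servers = {1: [conn_a, conn_b]}, the two connections of telescope 1
# are assigned nodes in declaration order:
#   conn_a -> ("rta-reservation", "cp01")
#   conn_b -> ("rta-reservation", "cp02")
# node_idx then advances by the number of connections, so the next telescope's
# connections would start at cp03.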


def srun_cmd_worker_nodes(
    connection_jobs_info: List[ConnectionJobInfo],
    cmd: str,
    additional_slurm_params: List[str] | None = None,
    error_level=LOGGING_LEVELS_DICT["CRITICAL"],
):
    """Submit a slurm command with srun on all worker nodes in `connection_jobs_info`

    Parameters
    ----------
    connection_jobs_info : List[ConnectionJobInfo]
        List of connection job information: `cmd` will be executed for each node entry in this list.
    cmd : str
        Command to run with srun.
    additional_slurm_params : List[str] or None
        List of additional slurm job parameters, separated between args and values, for instance ["--mem", "20G"].
        Optional, default is None.
    error_level : int
        Logging level used when reporting a failed command. Default is CRITICAL.

    Returns
    -------
    List[subprocess.CompletedProcess]
        List of completed processes.
    """
    completed_processes = []
    for job_info in connection_jobs_info:
        srun_cmd = " ".join(
            [
                "srun",
                *(additional_slurm_params if additional_slurm_params is not None else []),
                "--reservation={}".format(job_info.slurm_reservation),
                "--nodelist={}".format(job_info.slurm_node),
                cmd,
            ]
        )
        logging.info("Running {}".format(srun_cmd))
        completed_processes.append(
            subprocess_run_and_raise_exception_on_error(
                shlex.split(srun_cmd),
                success_log_string="Success on node {}".format(job_info.slurm_node),
                failure_log_string="Failure on node {} with {}".format(job_info.slurm_node, srun_cmd),
                error_level=error_level,
                log_level=LOGGING_LEVELS_DICT["DEBUG"],
            )
        )
    return completed_processes
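

# For illustration: with additional_slurm_params=["--mem", "20G"] and a job assigned to node
# "cp01" of reservation "rta-reservation" (hypothetical names), the command built above is:
#   srun --mem 20G --reservation=rta-reservation --nodelist=cp01 <cmd>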


def scancel_jobs(job_ids: List[int], signal: signal.Signals, delay_s: float | None = None, ignore_error: bool = False):
    """Run scancel on `job_ids`, sending `signal` after `delay_s` seconds.

    Use the --quiet argument of scancel to not raise an error if the jobs are already stopped.

    Parameters
    ----------
    job_ids : List[int]
        List of job ids to scancel.
    signal : signal.Signals
        Signal to send with scancel.
    delay_s : float, optional
        Amount of time in seconds to wait before performing the scancel.
    ignore_error : bool, optional
        If True, the scancel is run directly with subprocess.run, and any error happening in the subprocess
        is simply ignored. This is useful when running scancel -s KILL after a scancel -s INT: if the SIGINT
        already stopped the jobs, the SIGKILL would have exit code 1 even with --quiet, but we want to ignore
        the error in this case.
    """
    # note: --full or -f is required for r0_dl1 daemons to receive the signal
    # note: --quiet suppresses the error if the specified jobs are already completed
    scancel_cmd = " ".join(["scancel", "--full", "--quiet", "-s", str(signal), *[str(id) for id in job_ids]])
    if delay_s is not None:
        time.sleep(delay_s)
    if ignore_error:
        try:
            sp.run(shlex.split(scancel_cmd), capture_output=True, text=True, check=True)
            logging.debug("Stopping jobs with {}".format(scancel_cmd))
        except CalledProcessError as error:
            logging.info(
                "Ignoring error of {} caused by jobs already being stopped. Error info:\nstdout: {}\nstderr: {}".format(
                    scancel_cmd, error.stdout, error.stderr
                )
            )
    else:
        subprocess_run_and_raise_exception_on_error(
            shlex.split(scancel_cmd),
            success_log_string="Stopping job with {}".format(scancel_cmd),
            failure_log_string="FAILURE to stop job with {}".format(scancel_cmd),
            error_level=LOGGING_LEVELS_DICT["ERROR"],
            log_level=LOGGING_LEVELS_DICT["DEBUG"],
        )
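

# For illustration: with job_ids=[1234, 5678] (hypothetical) and signal.SIGINT, the command
# built above is roughly:
#   scancel --full --quiet -s 2 1234 5678
# (on Python 3.11+, which this module requires for datetime.UTC, str() of a signal.Signals
# member yields its integer value, which scancel accepts).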


def stop_rta(slurm_reservations: List[str], slurm_account: str, r0dl1_job_name: str) -> Thread | None:
    """Stop the r0dl1 daemons

    The r0dl1 jobs are immediately sent a SIGINT signal, which should tell them to gracefully shut down.
    A SIGKILL is also scheduled to run 10 seconds later to ensure the jobs are indeed stopped.

    Parameters
    ----------
    slurm_reservations : List[str]
        List of slurm reservations to search for r0dl1 daemons.
    slurm_account : str
        Slurm account to use when searching for the r0dl1 daemons.
    r0dl1_job_name : str
        Name of the r0dl1 jobs in the CDB configuration.

    Returns
    -------
    stop_thread : Thread or None
        Started thread that will SIGKILL the r0dl1 jobs after 10 seconds, or None if no jobs were found.
    """
    job_ids = parse_slurm_job_ID(slurm_reservations, slurm_account, r0dl1_job_name)
    logging.info("Found r0dl1 job ids {} to stop.".format(job_ids))
    if job_ids:
        # immediately send the SIGINT
        scancel_jobs(job_ids, signal.SIGINT, None)
        # start a detached thread to send the SIGKILL in 10 seconds.
        # this allows autorta to continue and start the new run immediately, while jobs are shutting down.
        stop_thread = Thread(target=scancel_jobs, args=(job_ids, signal.SIGKILL, 10.0, True))
        stop_thread.start()
        return stop_thread
    return None
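

# Usage sketch (hypothetical arguments): the call returns as soon as the SIGINT is sent,
# and the SIGKILL fires 10 s later in the detached thread:
#   kill_thread = stop_rta(["rta-reservation"], "rta-account", "r0_dl1")
#   ...  # start processing the new run immediately
#   if kill_thread is not None:
#       kill_thread.join()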


def nuke_rta(slurm_account: str):
    """Hard stop of all RTA jobs: scancel all jobs of `slurm_account`

    Parameters
    ----------
    slurm_account : str
        Slurm account.
    """
    nuke_rta_cmd = " ".join(["scancel", "-u", slurm_account])
    subprocess_run_and_raise_exception_on_error(
        shlex.split(nuke_rta_cmd),
        "Stopped RTA with {}".format(nuke_rta_cmd),
        "Could not stop RTA with {}".format(nuke_rta_cmd),
        error_level=LOGGING_LEVELS_DICT["CRITICAL"],
        log_level=LOGGING_LEVELS_DICT["WARNING"],
    )


def write_reco_manager_observation_config(
    obs_info: ObsInfo,
    obs_dir: Path,
    night_path_structure: RecoPathStructure,
    auto_rta_config: AutoRTAConfiguration,
    output_path: Path,
):
    """Write the observation configuration for the reco-manager

    Parameters
    ----------
    obs_info : ObsInfo
        Observation parameters from the observation DB.
    obs_dir : Path
        Path to the observation data directory.
    night_path_structure : RecoPathStructure
        Path structure of the night.
    auto_rta_config : AutoRTAConfiguration
        Configuration of Auto RTA.
    output_path : Path
        Path where to write the hiperta_stream_start configuration.
    """
    hiperta_obs_config = ObservationParameters.model_validate(
        {
            "sb_id": 1,  # no scheduling block in LST
            "obs_id": obs_info.obs_id,
            "tel_id": 1,  # only 1 tel
            "RA_pointing": obs_info.RA,
            "DEC_pointing": obs_info.DEC,
            "dl1_dir": str(night_path_structure.dl1_dir(obs_dir)),
            "dl2_dir": str(night_path_structure.dl2_dir(obs_dir)),
            "dl3_dir": str(night_path_structure.dl3_dir(obs_dir)),
            "log_dir": str(night_path_structure.log_dir(obs_dir)),
            "reco_manager_log_file": str(night_path_structure.log_dir(obs_dir) / "hiperta_stream_start.log"),
            "data_stream_connections": auto_rta_config.tel_ids_to_data_servers[1],  # only do tel ID 1 for LST 1
            "slurm_nodelists": auto_rta_config.slurm_nodes,
        }
    )
    with open(output_path, "w") as obs_config_f:
        obs_config_f.write(hiperta_obs_config.model_dump_json(indent=4))
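

# For illustration, the file written above is the JSON dump of ObservationParameters,
# along the lines of (hypothetical values and placeholder paths):
#   {
#       "sb_id": 1,
#       "obs_id": 12345,
#       "tel_id": 1,
#       "RA_pointing": 83.63,
#       "DEC_pointing": 22.01,
#       "dl1_dir": "<night_dir>/<obs_dir>/...",
#       ...
#   }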


def main():
    """Entrypoint of Auto_RTA:

    - parse the available slurm nodes
    - check that the slurm nodes are in "connected" network mode
    - copy the conda environment to the slurm nodes (usually in /dev/shm/ (RAM))
    - query the database for the current run, and for each new run:
        - stop the previous RTA reconstruction slurm jobs
        - start new R0-DL1 jobs for the new run, with the static configuration (CDB config)
          and a newly written dynamic config
        - start the engineering GUI plotting scripts
    - stop at a fixed hour set in the configuration, after cleaning the slurm nodes memory.
    """
    start_time = datetime.datetime.now(datetime.UTC)
    # initially write the log where the script is called (home if cron job)
    # so that errors can be logged if we can't parse the config
    init_logging(log_level="DEBUG", log_filename="LST_AUTO_RTA.log")

    # Load configuration
    parser = argparse.ArgumentParser(
        description="Automatic starting of the RTA reconstruction for an observation night",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-c", "--config", dest="config", type=str, required=True, help="LST auto RTA configuration file."
    )
    args = parser.parse_args()
    with open(args.config, "r") as config_file:
        config = AutoRTAConfiguration.model_validate_json(config_file.read())

    # Create the night data directory
    path_structure = RecoPathStructure(config.data_dir, start_time)
    path_structure.create_night_data_dir()
    # now update logging to write the log in the night's directory
    init_logging(
        log_level=config.log_level,
        log_filename=path_structure.night_data_dir / "LST_AUTO_RTA.log",
    )

    # get the stop time during today
    stop_time = start_time.replace(
        hour=config.stop_time_UTC_hours, minute=config.stop_time_UTC_minutes, second=0, microsecond=0
    )
    # If we are past it, it is actually tomorrow
    if stop_time < start_time:
        stop_time += datetime.timedelta(days=1)

    logging.info("Start RTA at " + str(start_time))
    logging.info("Found worker nodes: {}".format(config.slurm_nodes))
    connection_jobs_info = assign_worker_to_data_connection(config.slurm_nodes, config.tel_ids_to_data_servers)
    for job_info in connection_jobs_info:
        logging.info(
            'Will run tel {} connection {{"hostname": {}, "port": {}}} r0->dl1 job on {} (reservation {})'.format(
                job_info.tel_id, job_info.hostname, job_info.port, job_info.slurm_node, job_info.slurm_reservation
            )
        )

    logging.info("Reading R0-DL1 job names from CDB configuration")
    # load the CDB configuration, checking it exists, and parsing r0_dl1_job_name
    with open(config.hiperta_CDB_config_file, "r") as CDB_f:
        CDB_config = json.load(CDB_f)
    r0_dl1_job_name = CDB_config["r0_dl1_params"]["r0_dl1_job_name"]
    logging.info("Found job name: {}".format(r0_dl1_job_name))

    if config.copy_env:
        # For all srun commands here: error level is critical because the RTA can not run if the environment can not be copied
        logging.info("Loading the environment on worker nodes for the night")
        logging.info("Cleaning previous content in {}".format(config.env_archive_extraction_path))
        srun_cmd_worker_nodes(
            connection_jobs_info,
            "rm -rf {}".format(config.env_archive_extraction_path),
            error_level=LOGGING_LEVELS_DICT["CRITICAL"],
        )
        logging.info("Cleaning previous content in {}".format(config.models_archive_copy_path))
        srun_cmd_worker_nodes(
            connection_jobs_info,
            "rm -rf {}".format(config.models_archive_copy_path),
            error_level=LOGGING_LEVELS_DICT["CRITICAL"],
        )
        logging.info(
            "Extracting environment from {} to {}".format(config.env_archive, config.env_archive_extraction_path)
        )
        srun_cmd_worker_nodes(
            connection_jobs_info,
            "bash -c 'mkdir {} && tar -xzf {} -C {}'".format(
                config.env_archive_extraction_path, config.env_archive, config.env_archive_extraction_path
            ),
            ["--mem", "20G"],
            error_level=LOGGING_LEVELS_DICT["CRITICAL"],
        )
        logging.info(
            "Copying reconstruction models from {} to {}".format(
                config.models_archive_path, config.models_archive_copy_path
            )
        )
        srun_cmd_worker_nodes(
            connection_jobs_info,
            "cp -rf {} {}".format(config.models_archive_path, config.models_archive_copy_path),
            ["--mem", "20G"],
            error_level=LOGGING_LEVELS_DICT["CRITICAL"],
        )

    if config.check_node_connection:
        comp_proc_ib0_cat = srun_cmd_worker_nodes(connection_jobs_info, "cat /sys/class/net/ib0/mode")
        # If a node is not connected, raise an error
        if not all("connected" in process.stdout.strip() for process in comp_proc_ib0_cat):
            raise RuntimeError(
                "Not all RTA slurm nodes are connected to ib0! Found {}".format(
                    ", ".join(
                        [
                            "{}: {}".format(job_info.slurm_node, process.stdout.strip())
                            for job_info, process in zip(connection_jobs_info, comp_proc_ib0_cat)
                        ]
                    )
                )
            )

    logging.info("RTA ready for the night !")

    current_obs_info = ObsInfo(None, None, None, None, None, None)
    while not (datetime.datetime.now(datetime.UTC) > stop_time):
        # query the TCU DB for the current observation
        try:
            obs_info = get_current_run_info(config.db_hostname, 10)
        except PyMongoError:
            logging.error("Error retrieving observation information from DB, ignoring...", exc_info=True)
            obs_info = ObsInfo(None, None, None, None, None, None)

        # Check observation data
        obs_info_is_none = obs_info.RA is None or obs_info.DEC is None
        obs_recent_enough = True  # default value if we couldn't get an obs_info
        if obs_info_is_none:
            logging.warning("Queried observation information had no RA DEC.")
        elif config.ignore_old_observation:
            obs_info_tstart_datetime = datetime.datetime.fromtimestamp(obs_info.time_start_camera, datetime.UTC)
            obs_info_time_delta = datetime.datetime.now(datetime.UTC) - obs_info_tstart_datetime
            obs_recent_enough = obs_info_time_delta < datetime.timedelta(hours=4)
            if not obs_recent_enough:
                logging.info(
                    "Queried observation {} has start time {}, timedelta wrt now: {}. Too old to start RTA".format(
                        obs_info.obs_id, obs_info_tstart_datetime, obs_info_time_delta
                    )
                )

        # Start RTA
        if current_obs_info.obs_id != obs_info.obs_id and (not obs_info_is_none) and obs_recent_enough:
            # We got a new observation!
            logging.info(
                "Got new observation! ID: {} - RA: {} - DEC: {} - SOURCE.RA: {} - SOURCE.DEC: {}".format(
                    obs_info.obs_id, obs_info.RA, obs_info.DEC, obs_info.source_RA, obs_info.source_DEC
                )
            )
            try:
                logging.info("Stopping RTA")
                stop_rta(config.slurm_reservations, config.slurm_account, r0_dl1_job_name)
            except Exception:
                logging.error("Could not stop RTA. NEXT RUN DATA MAY BE ACQUIRED BY PREVIOUS RUN DAEMONS !")
                # Note: we could nuke_rta, but it would also kill DQ, SCI jobs, etc...

            obs_dir = path_structure.create_observation_data_dirs(str(obs_info.obs_id), True)
            logging.info("Created directories for obs {} at {}".format(obs_info.obs_id, obs_dir))

            reco_manager_obs_config_path = (
                path_structure.log_dir(obs_dir) / "hiperta_stream_start_observation_config.json"
            )
            logging.info("Writing reco-manager observation configuration at {}".format(reco_manager_obs_config_path))
            write_reco_manager_observation_config(
                obs_info,
                obs_dir,
                path_structure,
                config,
                reco_manager_obs_config_path,
            )

            # Note: hiperta_stream_start has to be started on a worker node as well, because it
            # reads the training r0dl1 configuration from the path set in the CDB config, which
            # points to the model archive copied to /dev/shm on the workers.
            # Note 2: to run several commands with srun, they must be wrapped with bash -c '...'
            hiperta_stream_start_cmd = " ".join(
                [
                    "bash -c '"
                    "export PATH={}/bin/:$PATH ; {}/bin/hiperta_stream_start".format(
                        config.env_archive_extraction_path, config.env_archive_extraction_path
                    ),
                    "-c",
                    config.hiperta_CDB_config_file,
                    "-d",
                    str(reco_manager_obs_config_path),
                    "'",
                ]
            )
            try:
                # use the 1st worker node to start hiperta_stream
                hiperta_stream_job_info = connection_jobs_info[0]
                hiperta_stream_srun_cmd = " ".join(
                    [
                        "srun",
                        "--reservation={}".format(hiperta_stream_job_info.slurm_reservation),
                        "--nodelist={}".format(hiperta_stream_job_info.slurm_node),
                        hiperta_stream_start_cmd,
                    ]
                )
                logging.info("Starting hiperta_stream_start with {}".format(hiperta_stream_srun_cmd))
                subprocess_run_and_raise_exception_on_error(
                    shlex.split(hiperta_stream_srun_cmd),
                    success_log_string="hiperta_stream started on {}".format(hiperta_stream_job_info.slurm_node),
                    failure_log_string="Failed to start hiperta_stream on {} with {}".format(
                        hiperta_stream_job_info.slurm_node, hiperta_stream_start_cmd
                    ),
                    # ERROR and not CRITICAL: we might want to continue even if we can't start the reco-manager
                    error_level=LOGGING_LEVELS_DICT["ERROR"],
                    log_level=LOGGING_LEVELS_DICT["DEBUG"],
                )
                # Only if we could start the RTA do we update the current obs_info.
                # Otherwise we would loop, see a new observation again, stop RTA and re-start, etc ...
                current_obs_info = obs_info
            except sp.SubprocessError:
                logging.error("Failed to start reco-manager with {}".format(hiperta_stream_start_cmd))
                # continue anyway; obs_info is not updated so we will try to start again

        # If not starting RTA: query squeue for some statistics on the running jobs
        else:
            try:
                running_jobs_info = subprocess_run_and_raise_exception_on_error(
                    shlex.split('squeue -u {} --format="%T,%R"'.format(config.slurm_account)),
                    failure_log_string="Could not parse slurm info while waiting for next observation",
                    error_level=LOGGING_LEVELS_DICT["ERROR"],
                    log_level=LOGGING_LEVELS_DICT["DEBUG"],
                ).stdout
                n_jobs, n_running, n_pending, n_request_node_not_available = job_statistics_from_squeue_output(
                    running_jobs_info
                )
                logging.info(
                    "{} job statistics: nb_jobs: {} - nb_running: {} - nb_pending: {} - nb_node_not_available: {}".format(
                        config.slurm_account, n_jobs, n_running, n_pending, n_request_node_not_available
                    )
                )
            except sp.SubprocessError:
                pass  # continue the loop anyway

        # sleep until we get an observation
        logging.info("RTA waiting for next observation, current observation is {}".format(obs_info.obs_id))
        time.sleep(1.0)

    logging.info("End of the night: stop the RTA")
    try:
        logging.info("Stopping RTA")
        sigkill_thread = stop_rta(config.slurm_reservations, config.slurm_account, r0_dl1_job_name)
        if sigkill_thread is not None:  # it is None if there are no jobs to kill
            sigkill_thread.join(timeout=180.0)  # 3 minute timeout, but the jobs should end right after the 10 s delay
        else:
            logging.warning("Found no r0_dl1 jobs to stop !")
    except Exception:
        logging.error("Could not stop RTA normally, nuking all {} jobs".format(config.slurm_account))
        nuke_rta(config.slurm_account)

    # Clean up node environment
    logging.info("Cleaning nodes copied files")
    logging.info("Cleaning content in {}".format(config.env_archive_extraction_path))
    srun_cmd_worker_nodes(
        connection_jobs_info,
        "rm -rf {}".format(config.env_archive_extraction_path),
        error_level=LOGGING_LEVELS_DICT["CRITICAL"],
    )
    logging.info("Cleaning content in {}".format(config.models_archive_copy_path))
    srun_cmd_worker_nodes(
        connection_jobs_info,
        "rm -rf {}".format(config.models_archive_copy_path),
        error_level=LOGGING_LEVELS_DICT["CRITICAL"],
    )
    logging.info("Done cleaning nodes.")
    logging.info("RTA Done for the night. Good day !")
if __name__ == "__main__": main()