Source code for lst_auto_rta.plot_line_delay

"""Script plotting event_ids differences and r0dl1 rates for all observations of a night."""

import argparse
import logging
import pickle
import re
import subprocess as sp
import sys
import time
from multiprocessing import Pool
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import tables
from matplotlib import pyplot as plt
from scipy import ndimage
from tqdm import tqdm


def log_uncaught_exceptions():
    """Make all uncaught exceptions be logged by the default logger.

    KeyboardInterrupt and its subclasses are not logged, so the program can still be killed with Ctrl+C.
    """

    def handle_exception(exc_type, exc_value, exc_traceback):
        if not issubclass(exc_type, KeyboardInterrupt):
            logging.critical("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return

    sys.excepthook = handle_exception


def find_nigth_run_dirs(night_path):
    """Find the observation folders in a night's directory.

    In LST, a night directory contains a folder for each observation, plus a calibration and a logs folder.

    Parameters
    ----------
    night_path : pathlib.Path
        Path to the night's data folder containing all subruns.

    Returns
    -------
    List of Paths to all the observation folders in the night's directory.
    """
    return [f for f in night_path.iterdir() if f.is_dir() and "calibration" not in f.stem and "logs" not in f.stem]


def parse_run_worker_nodes(run_data_path):
    """Parse the worker nodes of the R0->DL1 jobs of an observation from the hiperta_stream_start log file.

    The hiperta_stream_start log file is parsed to read the slurm job ids of the R0->DL1 jobs.
    The worker node names are then found by querying sacct.

    Parameters
    ----------
    run_data_path : pathlib.Path
        Path to the observation folder, containing the dl1, dl2 and dl3 directories and the
        hiperta_stream_start logs.

    Returns
    -------
    List of string: the names of the worker nodes, in the order of the lines.
    """
    # job id file template: "jobids_hiperta_stream_{hiperta_start_timestamp}.txt"
    # timestamp format: 2023-11-23_05-45-51
    # There is another one for the altaz job, which must not be selected.
    job_id_files = [f for f in run_data_path.iterdir() if f.is_file() and "jobids" in f.stem and "altaz" not in f.stem]
    if len(job_id_files) != 1:
        raise ValueError(
            "Couldn't find jobid file in run dir {}, got {}".format(str(run_data_path), str(job_id_files))
        )
    with open(job_id_files[0], "r") as job_file:
        job_ids = job_file.read().strip()
    sacct_process = sp.run(["sacct", "-j", job_ids, "--format", "NodeList"], capture_output=True, check=True)
    return [node_str.strip() for node_str in sacct_process.stdout.decode().split("\n")[2:]][:-1:2]


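# Illustrative sketch (hypothetical values): how the slicing above turns sacct output produced with
# ``sacct -j <job_ids> --format NodeList`` into one node name per R0->DL1 line. The node names are
# made up, and the "two rows per job" layout (presumably one row for the job and one for its batch
# step) is an assumption inferred from the [:-1:2] slice.
def _example_sacct_node_list_slicing():
    fake_sacct_stdout = (
        "       NodeList \n"
        "--------------- \n"
        "           cp11 \n"
        "           cp11 \n"
        "           cp12 \n"
        "           cp12 \n"
    )
    # [2:] drops the header and separator lines; the trailing "" left by split("\n") is dropped by
    # the :-1 stop, and the ::2 step keeps one row per job.
    nodes = [node_str.strip() for node_str in fake_sacct_stdout.split("\n")[2:]][:-1:2]
    assert nodes == ["cp11", "cp12"]
    return nodes

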
def parse_line_subrun_dl1_events(
    line_subrun_dl1_filename, run_id, line_idx, worker_node, nb_retries, re_try_wait_time
):
    """Read a DL1 file to load the timestamp and event id information.

    The separate timestamp_s and timestamp_qns columns are put back together into a single datetime
    object (ns precision). The event id diffs are also computed here, as the difference between the
    event id of an event and that of the previous one. The first event is set to have an
    event_id_diff of 0. The "events from the future" are given an event_id_diff of 0, as well as the
    first event following an event from the future (because the event ids of events from the future
    are unreliable).

    This function returns None if the file could not be read.

    Parameters
    ----------
    line_subrun_dl1_filename : pathlib.Path
        Path to the dl1 file.
    run_id : int
        Run ID of the events in the file. Written as a field in the df.
    line_idx : int
        Line number of the process that wrote this file. Written as a field in the df.
    worker_node : str
        Name of the worker node that processed this DL1 file.
    nb_retries : int
        Number of times to re-try opening a file in case of error.
    re_try_wait_time : float
        Amount of time, in seconds, to wait before attempting to open a file again.

    Returns
    -------
    Pandas dataframe with the subrun event ids and timestamps (and line index, worker node).
    """
    # Re-try mechanism because of fefs: sometimes we get errors even if the file is there.
    for _ in range(nb_retries):
        try:
            with tables.open_file(line_subrun_dl1_filename, "r") as dl1_f:
                event_ids = dl1_f.root.dl1.event.subarray.trigger.cols.event_id[:]
                timestamp_s = dl1_f.root.dl1.event.subarray.trigger.cols.timestamp_s[:]
                timestamp_qns = dl1_f.root.dl1.event.subarray.trigger.cols.timestamp_qns[:]
            break
        except Exception:
            time.sleep(re_try_wait_time)
    else:
        logging.warning("Could not open file {} !".format(line_subrun_dl1_filename))
        return None

    # Convert to a single datetime object.
    trigger_time = (
        pd.to_datetime(timestamp_s, unit="s") + pd.to_timedelta(timestamp_qns // 4, unit="ns")
    ).to_series()

    # "Future events" have unreliable event ids, so set their event id diffs to 0.
    # Also set the event id diff of the event following a future event to 0, for the same reason.
    event_id_diffs = np.diff(event_ids, prepend=event_ids[0])
    event_id_diffs[ndimage.binary_dilation(trigger_time > pd.to_datetime("21000101"), np.array([0, 1, 1]))] = 0

    return pd.DataFrame(
        {
            "timestamp_s": timestamp_s,
            "timestamp_qns": timestamp_qns,
            "trigger_time": trigger_time,
            "event_id": event_ids,
            "event_id_diffs": event_id_diffs,
            "run_id": run_id,
            "line_idx": line_idx,
            "worker_node": worker_node,
        }
    )


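# Illustrative sketch (made-up values): how the timestamp reconstruction and the event-id-diff
# masking above behave. timestamp_qns is assumed to be in quarter-nanoseconds (hence the integer
# division by 4), and the dilation structure [0, 1, 1] marks each "future" event together with the
# event that follows it.
def _example_future_event_masking():
    timestamp_s = np.array([1_700_000_000, 1_700_000_000, 4_200_000_000, 1_700_000_001])
    timestamp_qns = np.array([400, 800, 0, 1200])
    event_ids = np.array([100, 104, 9_000_000, 108])

    trigger_time = (
        pd.to_datetime(timestamp_s, unit="s") + pd.to_timedelta(timestamp_qns // 4, unit="ns")
    ).to_series()

    # diff with prepend=event_ids[0], so that the first event gets a diff of 0.
    event_id_diffs = np.diff(event_ids, prepend=event_ids[0])
    # The third event is far in the future (~year 2103): dilation flags it and its successor.
    future_mask = ndimage.binary_dilation(trigger_time > pd.to_datetime("21000101"), np.array([0, 1, 1]))
    event_id_diffs[future_mask] = 0
    assert event_id_diffs.tolist() == [0, 4, 0, 0]
    return trigger_time, event_id_diffs

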
def parse_run_dl1_events(dl1_dir_path, run_id, worker_nodes, nb_process, nb_retries, re_try_wait_time):
    """Parse an observation's DL1 files to load event ids and timestamps in dataframes.

    Parameters
    ----------
    dl1_dir_path : pathlib.Path
        Path to the folder containing the night's DL1 files (all runs are in the same folder).
    run_id : int
        Run id of the run to analyse.
    worker_nodes : list of str
        List of the worker node names of the R0->DL1 jobs of this run.
    nb_process : int
        Number of processes to use to parse the run's files.
    nb_retries : int
        Number of times to re-try reading a DL1 file.
    re_try_wait_time : float
        Amount of time, in seconds, to wait before re-trying to open a file.

    Returns
    -------
    List of pandas dataframes with the event ids and timestamps data per subrun.
    """
    files_dfs = []
    for line_idx in [0, 1, 2, 3]:
        with Pool(nb_process) as pool:
            line_dfs = pool.starmap(
                parse_line_subrun_dl1_events,
                [
                    (
                        line_subrun_filename,
                        run_id,
                        line_idx,
                        worker_nodes[line_idx],
                        nb_retries,
                        re_try_wait_time,
                    )
                    # For files in LST created by LSTAutoRTA:
                    # DL1 filename template: "dl1_${obs_id}_${run_id}_${tel_id}_${processIndex}_${threadIndex}_${fileIndex}.h5"
                    # obs_id = 0, tel_id = 1, threadIndex = 0 in LST when started by AutoRTA.
                    for line_subrun_filename in dl1_dir_path.glob("dl1_0_{}_1_{}*.h5".format(run_id, line_idx))
                ],
            )
        files_dfs.extend([df for df in line_dfs if df is not None])
    return files_dfs


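# Illustrative sketch (made-up run id and file names): which DL1 file names the glob pattern above
# selects for a given run and line, following the LSTAutoRTA filename template.
def _example_dl1_filename_glob():
    import fnmatch

    run_id, line_idx = 12345, 2
    pattern = "dl1_0_{}_1_{}*.h5".format(run_id, line_idx)  # same pattern as the glob above
    candidates = [
        "dl1_0_12345_1_2_0_0.h5",   # run 12345, line 2, file index 0 -> selected
        "dl1_0_12345_1_2_0_41.h5",  # run 12345, line 2, file index 41 -> selected
        "dl1_0_12345_1_0_0_0.h5",   # other line -> ignored
        "dl1_0_12399_1_2_0_0.h5",   # other run -> ignored
    ]
    selected = fnmatch.filter(candidates, pattern)
    assert selected == ["dl1_0_12345_1_2_0_0.h5", "dl1_0_12345_1_2_0_41.h5"]
    return selected

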
def parse_run_log_files(logs_dir, run_id, worker_nodes):
    """Parse the log files of the R0->DL1 processes of a run to read the processing rates.

    Parameters
    ----------
    logs_dir : pathlib.Path
        Path to the folder containing the R0->DL1 jobs logs.
    run_id : int
        Run id of the run to parse.
    worker_nodes : list of str
        List of the worker node names of the R0->DL1 jobs of this run.

    Returns
    -------
    List of pandas dataframes with the R0->DL1 processing rates per line during the run.
    """
    pattern = re.compile(
        r"\[([0-9]+)\] : update_timer_message : "
        r"nbBadMessage = ([0-9]+) \(size in \[[^\]]*\]\), "
        r"nbEvent = ([0-9]+), "
        r"nbGoodEvent = ([0-9]+)"
    )
    logs_dfs = []
    for line_idx in [0, 1, 2, 3]:
        # log filename template: "r0_dl1_{run_id}_{line_id}.log"
        for log_file in logs_dir.glob("r0_dl1_{}_{}.log".format(run_id, line_idx)):
            timestamps = []
            nb_events = []
            nb_good_events = []
            event_per_sec = []
            good_event_per_s = []
            with open(log_file, "r") as log_f:
                for log_line in log_f.readlines():
                    results = pattern.search(log_line)
                    if results:
                        timestamps.append(int(results.group(1)))
                        nb_events.append(int(results.group(3)))
                        nb_good_events.append(int(results.group(4)))
            logs_dfs.append(
                pd.DataFrame(
                    {
                        "timestamp": pd.to_datetime(timestamps, unit="s"),
                        "nbEvent": nb_events,
                        "run_id": run_id,
                        "line_idx": line_idx,
                        "worker_node": worker_nodes[line_idx],
                    }
                )
            )
    return logs_dfs


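# Illustrative sketch (made-up numbers): a log line of the shape matched by the regular expression
# in parse_run_log_files. The timestamp, nbEvent and nbGoodEvent groups are the ones extracted
# above; group 2 (nbBadMessage) is matched but not stored.
def _example_rate_log_line_parsing():
    pattern = re.compile(
        r"\[([0-9]+)\] : update_timer_message : "
        r"nbBadMessage = ([0-9]+) \(size in \[[^\]]*\]\), "
        r"nbEvent = ([0-9]+), "
        r"nbGoodEvent = ([0-9]+)"
    )
    fake_log_line = (
        "[1700000000] : update_timer_message : nbBadMessage = 0 (size in [0, 0]), "
        "nbEvent = 3500, nbGoodEvent = 3487"
    )
    results = pattern.search(fake_log_line)
    assert (int(results.group(1)), int(results.group(3)), int(results.group(4))) == (1700000000, 3500, 3487)
    return results

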
def save_line_delays_plot(event_df, log_df, fig_pickle_path, rate_aggregation_str):
    """Plot and save the event id diffs and the R0->DL1 processing rates with respect to time for an entire night.

    Parameters
    ----------
    event_df : pd.DataFrame
        DataFrame with events data.
    log_df : pd.DataFrame
        DataFrame with log files data (processing rate).
    fig_pickle_path : pathlib.Path
        Path to where the plot figure should be pickled.
    rate_aggregation_str : str
        Aggregation string in pandas time grouper syntax. For example, for 10 seconds: "10S".
    """
    sns.set_theme()
    worker_nodes = sorted(log_df["worker_node"].unique().tolist())
    run_ids = sorted(log_df["run_id"].unique())
    run_starts = [log_df.loc[log_df["run_id"] == run_id, "timestamp"].min() for run_id in run_ids]
    fig, ax = plt.subplots(
        2,
        1,
        figsize=(15, 10),
        sharex=True,
    )
    sns.scatterplot(
        x="trigger_time",
        y="event_id_diffs",
        hue="worker_node",
        style="worker_node",
        data=event_df[
            (event_df["event_id_diffs"] != 4)  # don't plot events with normal event id diffs (too many of them)
            & (
                event_df["event_id_diffs"] != 0
            )  # those are: 1st events in subrun line file, or events after future events
            & (event_df["trigger_time"] < pd.to_datetime("21000101"))  # don't plot events from the future
            & (event_df["event_id_diffs"] < 1e18)  # huge event ids when lines caught events of start of next run
        ],
        ax=ax[0],
        s=12,
        style_order=worker_nodes,
        hue_order=worker_nodes,
    )
    ax[0].set_yscale("log")
    sns.lineplot(
        x="timestamp",
        y="nbEvent",
        data=log_df.groupby([pd.Grouper(key="timestamp", freq=rate_aggregation_str), "worker_node"]).mean(),
        hue="worker_node",
        style="worker_node",
        units="run_id",
        markers=True,
        dashes=False,
        markersize=4,
        linewidth=1,
        sort=False,
        ax=ax[1],
        hue_order=worker_nodes,
        markeredgewidth=0.0,
    )
    for run_id, run_start in zip(run_ids, run_starts):
        ax[1].annotate(
            str(run_id),
            (run_start, 0),
            xytext=(0, 2),
            textcoords="offset fontsize",
            arrowprops=dict(arrowstyle="-|>", color="k"),
            fontsize="xx-small",
        )
    fig.tight_layout()
    with open(fig_pickle_path, "wb") as plot_fig_pickle:
        pickle.dump(fig, plot_fig_pickle)


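# Illustrative sketch (hypothetical file name): re-open the pickled figure written by
# save_line_delays_plot for interactive viewing, as suggested in the command line help below.
def _example_load_pickled_figure(fig_pickle_path="visu_delay_20231123.pickle"):
    with open(fig_pickle_path, "rb") as plot_fig_pickle:
        fig = pickle.load(plot_fig_pickle)
    fig.show()
    return fig

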
def process_night(
    night,
    nb_process,
    force_reload,
    data_base_path,
    obs_relative_dl1_folder,
    obs_relative_data_folder,
    relative_logs_folder,
    output_file_prefix,
    rate_aggregation_str,
    nb_retries,
    re_try_wait_time,
):
    """Parse the DL1 and log files of an entire night to plot the event ids and processing rates with respect to time.

    Parameters
    ----------
    night : str
        Night date in format "YYYYMMDD".
    nb_process : int
        Number of processes to use to parse the files.
    force_reload : bool
        If True, the data and log files are re-processed even if the dataframe is already saved
        at the output location.
    data_base_path : pathlib.Path
        Path to the folder containing the nights' directories.
    obs_relative_dl1_folder : pathlib.Path
        Relative path to the folder containing dl1 files from the night directory.
    obs_relative_data_folder : pathlib.Path
        Relative path to the folder containing the data (dl1, dl2, dl3) subfolders from the night
        directory.
    relative_logs_folder : pathlib.Path
        Relative path to the folder containing the night's logs from the night directory.
    output_file_prefix : str
        Prefix to prepend to the DataFrame and Figure files to save. The night string will be
        appended to make the full output file names.
    rate_aggregation_str : str
        Aggregation string in pandas time grouper syntax. For example, for 10 seconds: "10S".
    nb_retries : int
        Number of times to re-try reading a DL1 file.
    re_try_wait_time : float
        Amount of time, in seconds, to wait before re-trying to open a file.
    """
    night_dir = data_base_path / night
    night_df_path = night_dir / Path(f"{output_file_prefix}{night}.h5")
    if night_df_path.exists() and not force_reload:
        event_df = pd.read_hdf(night_df_path, key="event_df", mode="r")
        log_df = pd.read_hdf(night_df_path, key="log_df", mode="r")
    else:
        events_dfs = []
        logs_dfs = []
        for run_dir_data_path in tqdm(find_nigth_run_dirs(night_dir)):
            try:
                run_id = int(run_dir_data_path.stem)
                worker_nodes = parse_run_worker_nodes(run_dir_data_path / obs_relative_data_folder)
                events_dfs.extend(
                    parse_run_dl1_events(
                        run_dir_data_path / obs_relative_dl1_folder,
                        run_id,
                        worker_nodes,
                        nb_process,
                        nb_retries,
                        re_try_wait_time,
                    )
                )
                logs_dfs.extend(parse_run_log_files(night_dir / relative_logs_folder, run_id, worker_nodes))
            except Exception:
                logging.error(f"Can not process run {run_dir_data_path}", exc_info=True)
        event_df = pd.concat(events_dfs, ignore_index=True)
        log_df = pd.concat(logs_dfs, ignore_index=True)
        event_df.to_hdf(night_df_path, key="event_df", mode="w")
        log_df.to_hdf(night_df_path, key="log_df", mode="a")
    save_line_delays_plot(event_df, log_df, night_dir / f"{output_file_prefix}{night}.pickle", rate_aggregation_str)


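# Illustrative sketch (hypothetical night and paths, default output_file_prefix): re-load the
# dataframes cached by process_night for ad-hoc inspection, e.g. the mean R0->DL1 rate per worker
# node.
def _example_load_cached_night_dataframes(night_dir=Path("/fefs/onsite/pipeline/rta/data/20231123")):
    night_df_path = night_dir / "visu_delay_20231123.h5"
    event_df = pd.read_hdf(night_df_path, key="event_df", mode="r")
    log_df = pd.read_hdf(night_df_path, key="log_df", mode="r")
    mean_rate_per_node = log_df.groupby("worker_node")["nbEvent"].mean()
    return event_df, mean_rate_per_node

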
def main():
    log_uncaught_exceptions()
    parser = argparse.ArgumentParser(
        prog="plot_line_delays.py",
        description="Plotting script to visualize delays in events per line. "
        "This script will first build two pandas dataframes while loading the night's data, "
        "and save them to the night's data folder (unless a df is already saved there). "
        "It will then plot the lines' unwanted event ids and pickle the figure to the night's data folder as well. "
        "To view the figure: import pickle; figx = pickle.load(open('Figure.pickle', 'rb')); figx.show()",
        epilog="Suitable environment for this script: conda create -n visu -c conda-forge pytables pandas seaborn",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--night", "-n", type=str, required=True, help='Night to plot, in format "YYYYMMDD".')
    parser.add_argument(
        "--nb_process", "-p", type=int, default=1, help="Number of processes to use to aggregate the night's data."
    )
    parser.add_argument(
        "--force_reload",
        "-f",
        action="store_true",
        default=False,
        help="Forces the re-parsing of the night's data "
        "even if a visualization dataframe is already saved in the night folder.",
    )
    parser.add_argument(
        "--data_base_path",
        "-d",
        type=str,
        default="/fefs/onsite/pipeline/rta/data/",
        help="Path to the folder containing the reco nights outputs.",
    )
    parser.add_argument(
        "--obs_relative_dl1_folder",
        "-fdl1",
        type=str,
        default="reco/dl1/",
        help="Path to the folder containing the DL1 hdf5 files, relative to the observation folder.",
    )
    parser.add_argument(
        "--obs_relative_data_folder",
        "-fdata",
        type=str,
        default="reco/",
        help="Path to the top level data folder for an observation, relative to the observation folder. "
        "It is used to find the hiperta_stream_start log of the slurm jobs.",
    )
    parser.add_argument(
        "--relative_logs_folder",
        "-flogs",
        type=str,
        default="logs/RECO/r0_dl1",
        help="Path to the folder containing the r0->dl1 logs, relative to the night folder. "
        "(All observations' logs of a night are stored together in LST.)",
    )
    parser.add_argument(
        "--output_file_prefix",
        "-o",
        type=str,
        default="visu_delay_",
        help="Prefix string to prepend to the dataframe and figure files. "
        "The night string will be appended to this prefix.",
    )
    parser.add_argument(
        "--rate_aggregation_str",
        "-a",
        type=str,
        default="1S",
        help="Time range to use to aggregate rate values in the plot, in pandas grouper syntax. "
        "For instance, for 10 seconds: '10S'. No grouping is '1S'.",
    )
    parser.add_argument(
        "--nb_retries",
        "-r",
        type=int,
        default=3,
        help="Number of times to re-try opening a file (in case it fails due to fefs).",
    )
    parser.add_argument(
        "--re_try_wait_time",
        "-t",
        type=float,
        default=1.0,
        help="Amount of time to wait before re-trying to open a data file.",
    )
    parser.add_argument(
        "--log_file",
        "-l",
        type=str,
        help="Path to the file to write this script's logs. If not provided, logs are printed to the console.",
    )
    args = parser.parse_args()

    if args.log_file is not None:
        logging.basicConfig(filename=args.log_file)

    data_base_path = Path(args.data_base_path)
    obs_relative_dl1_folder = Path(args.obs_relative_dl1_folder)
    obs_relative_data_folder = Path(args.obs_relative_data_folder)
    relative_logs_folder = Path(args.relative_logs_folder)
    nb_process = args.nb_process if args.nb_process >= 1 else 1

    process_night(
        night=args.night,
        nb_process=nb_process,
        force_reload=args.force_reload,
        data_base_path=data_base_path,
        obs_relative_dl1_folder=obs_relative_dl1_folder,
        obs_relative_data_folder=obs_relative_data_folder,
        relative_logs_folder=relative_logs_folder,
        output_file_prefix=args.output_file_prefix,
        rate_aggregation_str=args.rate_aggregation_str,
        nb_retries=args.nb_retries,
        re_try_wait_time=args.re_try_wait_time,
    )


if __name__ == "__main__": main()