import time
from pathlib import Path
from typing import Any, Literal
import pandas as pd
import tables
[docs]
def pd_read_hdf_with_retry(
path_or_buf: str | Path | pd.HDFStore,
key: str | Any | None = None,
mode: Literal["r", "r+", "a"] = "r",
nb_tries: int = 20,
retry_wait_time_s: float = 0.5,
retry_on_os_error: bool = False,
*args,
**kwargs,
) -> pd.DataFrame:
"""Read a table in a hdf5 object into a pandas DataFrame
This is a wrapper around `pandas.read_hdf`, that will try to open the file several time in case a
HDF5 "Resource temporary unavailable" error is raised, which happens if the file is already opened
in write mode by another process.
Optionally, it can also re-try to open the file if an OSError is raised. This can be useful with shared
file systems: when the file system is overburdened, the metadata may take to long to be loaded and an
existing file can be reported as not present.
Parameters
----------
path_or_buf : Path
Any valid string path is acceptable. Only supports the local file system, remote URLs and file-like objects
are not supported.
If you want to pass in a path object, pandas accepts any os.PathLike.
Alternatively, pandas accepts an open pandas.HDFStore object.
key : str | None, optional
The group identifier in the store. Can be omitted if the HDF file contains a single pandas object, by
default None
mode : str, optional
Mode to use when opening the file. Ignored if path_or_buf is a pandas.HDFStore. Default is 'r'.
nb_tries : int, optional
Number of times to attempt opening the file, by default 20
retry_wait_time_s : float, optional
Amount of time to wait, in seconds, between each opening attempt, by default 0.5.
retry_on_os_error : bool, optional
If True, the retry strategy will be used as well if opening the files fails with an OSError.
If False, only the tables.exceptions.HDF5ExtError (raised when ressource is unavailable) are caught.
By default False.
Returns
-------
pd.DataFrame
A DataFrame containing the table content.
"""
for _ in range(nb_tries - 1):
try:
# Don't return right away or the else clause will execute !
df = pd.read_hdf(path_or_buf, key=key, mode=mode, *args, **kwargs)
break
except tables.exceptions.HDF5ExtError:
time.sleep(retry_wait_time_s)
except OSError:
if retry_on_os_error:
time.sleep(retry_wait_time_s)
else:
raise
else:
# try 1 last time, let error raise if failing again
df = pd.read_hdf(path_or_buf, key=key, mode=mode, *args, **kwargs)
return df