# ria-toolkit-oss/src/ria_toolkit_oss/data/datasets/h5helpers.py

import os

import h5py
import numpy as np


def copy_dataset_entry_by_index(
    source: str | os.PathLike, destination: str | os.PathLike, dataset_path: str, idx: int
) -> None:
    """
    Appends one entry of a dataset in the source HDF5 file to the dataset at the same path in the
    destination HDF5 file.

    The entry at ``idx`` is read from ``dataset_path`` in the source file; the dataset at ``dataset_path``
    in the destination file is then grown by one along its first axis and the entry is written into the new
    last position. The destination dataset is looked up, not created, so it must already exist.

    :param source: The name of the original HDF5 file. Opened read-only.
    :type source: str or os.PathLike
    :param destination: The name of the new HDF5 file. Opened in append mode.
    :type destination: str or os.PathLike
    :param dataset_path: The path of the dataset from the root of the file. Used for both files.
    :type dataset_path: str
    :param idx: The index of the entry to copy within the source dataset.
    :type idx: int

    :return: None
    """
    # TODO: Generalize so that source and destination can be file objects or strings
    with h5py.File(source, "r") as src_file, h5py.File(destination, "a") as dst_file:
        # The source entry is read before the destination is touched.
        copied = src_file[dataset_path][idx]

        target = dst_file[dataset_path]
        grown_length = target.shape[0] + 1
        target.resize(grown_length, axis=0)
        target[-1] = copied


def copy_over_example(source: str | os.PathLike, destination: str | os.PathLike, idx: int) -> None:
    """
    Copies over an example and its corresponding metadata located at the given index to a new file.
        It appends the new example to the end of the new file.

    The example is read from the ``data`` dataset and its metadata from the ``metadata/metadata`` dataset of
    the source file. Both are read before the destination datasets are resized, so a failed read (for
    example an out-of-range ``idx``) does not leave blank trailing rows in the destination file.

    :param source: The name of the original HDF5 file. Opened read-only.
    :type source: str or os.PathLike
    :param destination: The name of the new HDF5 file. Opened in append mode; its ``data`` and
      ``metadata/metadata`` datasets must already exist.
    :type destination: str or os.PathLike
    :param idx: The index of the example within the dataset.
    :type idx: int

    :return: None
    """

    with h5py.File(source, "r") as original_file, h5py.File(destination, "a") as new_file:
        ds, md = original_file["data"], original_file["metadata/metadata"]
        new_ds, new_md = new_file["data"], new_file["metadata/metadata"]

        # Read from the source first: if idx is invalid this raises before the destination has been grown.
        example, example_md = ds[idx], md[idx]

        new_ds.resize(new_ds.shape[0] + 1, axis=0)
        new_md.resize(new_md.shape[0] + 1, axis=0)

        new_ds[-1], new_md[-1] = example, example_md


def append_entry_inplace(source: str | os.PathLike, dataset_path: str, entry: np.ndarray) -> None:
    """
    Grows the given dataset of an HDF5 file by one along its first axis and stores ``entry`` in the new
    last position. The file is modified inplace.

    :param source: The name of the source HDF5 file. Opened in append mode.
    :type source: str or os.PathLike
    :param dataset_path: The path of the dataset from the root of the file. The dataset is looked up, not
      created.
    :type dataset_path: str
    :param entry: The entry to store at the end of the dataset.
    :type entry: np.ndarray

    :return: None
    """
    # TODO: Generalize so that source can be file object or string
    with h5py.File(source, "a") as h5_file:
        target = h5_file[dataset_path]
        grown_length = target.shape[0] + 1
        target.resize(grown_length, axis=0)
        target[-1] = entry


def duplicate_entry_inplace(source: str | os.PathLike, dataset_path: str, idx: int) -> None:
    """
    Reads the entry at ``idx`` and writes a copy of it to the end of the same dataset, growing the dataset
    by one along its first axis. The file is modified inplace.

    Because the dataset is resized, it has to have been created as a resizable dataset (``chunks=True`` and
    a ``maxshape`` of ``None`` along the first axis).

    :param source: The name of the source HDF5 file. Opened in append mode.
    :type source: str or os.PathLike
    :param dataset_path: The path of the dataset from the root of the file. This dataset is usually
      'data' or 'metadata/metadata'.
    :type dataset_path: str
    :param idx: The index of the entry to duplicate within the dataset.
    :type idx: int

    :return: None
    """
    with h5py.File(source, "a") as h5_file:
        dataset = h5_file[dataset_path]
        # Read the entry before resizing so the copy is taken from the dataset as it currently is.
        duplicate = dataset[idx]
        grown_length = dataset.shape[0] + 1
        dataset.resize(grown_length, axis=0)
        dataset[-1] = duplicate


def copy_file(original_source: str | os.PathLike, new_source: str | os.PathLike) -> None:
    """Copies every top-level object of an HDF5 file into a new HDF5 file.

    The original file is opened read-only. The new file is opened in ``"w"`` mode, so an existing file at
    ``new_source`` is replaced.

    :param original_source: The name of the original HDF5 source file.
    :type original_source: str or os.PathLike
    :param new_source: The name of the HDF5 file that receives the copy.
    :type new_source: str or os.PathLike

    :return: None
    """
    with h5py.File(original_source, "r") as src_file, h5py.File(new_source, "w") as dst_file:
        # Iterating an h5py file yields the names of its top-level members.
        for member_name in src_file:
            src_file.copy(member_name, dst_file)


def make_empty_clone(original_source: str | os.PathLike, new_source: str | os.PathLike, example_length: int) -> None:
    """Creates a new HDF5 file with the same structure but will leave metadata and dataset empty for operations.

    For each top-level member of the original file:

    - ``data`` is recreated as an empty, resizable dataset of shape ``(0, channels, example_length)``, where
      ``channels`` is the size of the second axis of the original ``data`` dataset, with the original dtype.
    - ``metadata`` is recreated as a group containing only an empty, resizable ``metadata`` dataset with
      the dtype of the original ``metadata/metadata``. Other members of the original ``metadata`` group are
      not carried over.
    - Any other member is copied as is.

    The original file is opened before the new file is created, so a missing or unreadable original does
    not create or truncate the file at ``new_source``.

    :param original_source: The name of the original HDF5 source file.
    :type original_source: str or os.PathLike
    :param new_source: The name of the new HDF5 source file. An existing file is replaced.
    :type new_source: str or os.PathLike
    :param example_length: The desired length of an example in the new file.
    :type example_length: int

    :return: None
    """

    # Source first, destination second: opening the destination in "w" mode truncates it, which must only
    # happen once the source is known to be readable.
    with h5py.File(original_source, "r") as original_file, h5py.File(new_source, "w") as new_file:
        for key in original_file:
            if key == "data":
                ds = original_file["data"]
                channels = ds.shape[1]
                new_file.create_dataset(
                    "data",
                    shape=(0, channels, example_length),
                    chunks=True,
                    maxshape=(None, None, None),  # Unlimited along every axis so examples can be appended later
                    dtype=ds.dtype,
                )
            elif key == "metadata":
                new_metadata_group = new_file.create_group("metadata")
                new_metadata_group.create_dataset(
                    "metadata",
                    shape=(0,),
                    chunks=True,
                    maxshape=(None,),
                    dtype=original_file["metadata/metadata"].dtype,
                )
            else:
                original_file.copy(key, new_file)


def delete_example_inplace(source: str | os.PathLike, idx: int) -> None:
    """Deletes an example and its corresponding metadata located at the given index.
        This deletion is done by creating a temporary dataset and copying all contents
        to the temporary dataset except for the example at idx. This operation is inplace.

    If anything fails while the temporary datasets are being built, the temporary datasets created by this
    call are removed again before the error propagates. The original ``data`` and ``metadata/metadata``
    datasets are only deleted after the copy has completed, and a later call is not blocked by leftover
    ``*.temp`` datasets.

    The rebuilt datasets are created with only the shape, dtype, chunking and unlimited ``maxshape`` given
    here; attributes and other creation options of the original datasets are not copied.

    :param source: The name of the source HDF5 file.
    :type source: str or os.PathLike
    :param idx: The index of the example and metadata to be deleted. Negative indices are rejected.
    :type idx: int

    :raises IndexError: If ``idx`` is outside ``[0, number of examples - 1]``.
    :raises ValueError: If the ``data`` and ``metadata/metadata`` datasets differ in length.

    :return: None
    """

    with h5py.File(source, "a") as f:
        ds, md = f["data"], f["metadata/metadata"]
        m, c, n = ds.shape  # The unpacking requires ``data`` to be 3-dimensional; m is the number of examples
        if not (0 <= idx <= m - 1):
            raise IndexError(f"Index {idx} out of range [0, {m - 1}]")
        if len(ds) != len(md):
            raise ValueError("Data and metadata array lengths do not match")

        # Names of the temporary datasets created by this call, so that only those are cleaned up on failure.
        created = []
        try:
            new_ds = f.create_dataset(
                "data.temp",
                shape=(m - 1, c, n),
                chunks=True,
                dtype=ds.dtype,
                maxshape=(None, None, None),  # Required to allow future mutations which expand the shape
            )
            created.append("data.temp")
            new_md = f.create_dataset(
                "metadata/metadata.temp", shape=(m - 1,), chunks=True, dtype=md.dtype, maxshape=(None,)
            )
            created.append("metadata/metadata.temp")

            # Rows before idx keep their position.
            for row in range(idx):
                new_ds[row], new_md[row] = ds[row], md[row]

            # Rows after idx shift down by one to close the gap.
            for row in range(idx + 1, m):
                new_ds[row - 1], new_md[row - 1] = ds[row], md[row]
        except BaseException:
            # Always re-raised below; caught broadly so that an interrupt also leaves no temporaries behind.
            for name in created:
                del f[name]
            raise

        del f["data"]
        del f["metadata/metadata"]

        f.move("data.temp", "data")
        f.move("metadata/metadata.temp", "metadata/metadata")


def overwrite_file(source: str | os.PathLike, new_data: np.ndarray, dataset_path: str | None = None) -> None:
    """
    Overwrites data in an HDF5 file with new data.

    The targeted member is deleted and a new dataset holding ``new_data`` is created under the same name.
    The new dataset takes its shape and dtype from ``new_data``; nothing else from the deleted member is
    carried over.

    :param source: The copy of the HDF5 source file. Opened in ``"r+"`` mode, so it must already exist.
    :type source: str or os.PathLike
    :param new_data: The updated copy of the data that should be stored.
    :type new_data: np.ndarray
    :param dataset_path: The path of the dataset to overwrite, from the root of the file. If None (the
      default), the first top-level member listed by the file is overwritten, which is the behaviour this
      function had before the parameter existed.
    :type dataset_path: str, optional

    :raises IndexError: If ``dataset_path`` is None and the file has no top-level members.
    :raises KeyError: If ``dataset_path`` does not exist in the file.

    :return: None
    """

    with h5py.File(source, "r+") as f:
        if dataset_path is None:
            # NOTE(review): presumably relies on h5py listing members in name order, so that 'data' comes
            # before 'metadata' - confirm, or pass dataset_path explicitly.
            ds_name = next(iter(f), None)
            if ds_name is None:
                raise IndexError(f"Cannot overwrite data: HDF5 file {source!r} has no top-level members")
        else:
            ds_name = dataset_path

        del f[ds_name]
        f.create_dataset(ds_name, data=new_data)