# forked from qoherent/modrec-workflow
import os

import h5py
import numpy as np


def generate_dataset(path_to_recordings, output_path, dataset_name="data"):
    """
    Generate a dataset from a folder of .npy recordings and save it to an HDF5 file.

    Parameters:
        path_to_recordings (str): Path to the folder containing .npy files.
        output_path (str): Path to the output HDF5 file.
        dataset_name (str): Name of the dataset in the HDF5 file (default: "data").

    Returns:
        output_path (str): Path to the written HDF5 file.
    """
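
    # Illustrative layout of the output (hypothetical numbers): 500 recordings
    # of 1024 IQ samples stored as complex64 become a single gzip-compressed
    # dataset:  /data  shape=(500, 1024)  dtype=complex64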

    # ensure the output directory exists before h5py creates the file
    parent = os.path.dirname(output_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # we assume the recordings are in .npy format; filter out anything else
    # and sort for a deterministic ordering across runs
    files = sorted(f for f in os.listdir(path_to_recordings) if f.endswith(".npy"))
    if not files:
        raise ValueError("No .npy files found in the specified directory.")

    # the first recording fixes the per-recording shape and dtype;
    # all recordings are assumed to share them
    sample = np.load(os.path.join(path_to_recordings, files[0]))
    shape = sample.shape
    dtype = sample.dtype

    with h5py.File(output_path, "w") as hf:
        # preallocate one slot per recording: (num_recordings, *recording_shape)
        dset = hf.create_dataset(
            dataset_name, shape=(len(files),) + shape, dtype=dtype, compression="gzip"
        )

        # load recordings one at a time so only a single array is held in memory
        for idx, fname in enumerate(files):
            data = np.load(os.path.join(path_to_recordings, fname))
            dset[idx, ...] = data

    # the h5py.Dataset handle is invalid once the file closes, so return the path
    return output_path


if __name__ == "__main__":
    print(generate_dataset("recordings", "data/dataset.h5"))
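
    # A minimal round-trip check, a sketch assuming the paths used above:
    # reopen the file read-only and confirm the stored shape and dtype.
    with h5py.File("data/dataset.h5", "r") as hf:
        print(hf["data"].shape, hf["data"].dtype)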