# modrec-workflow/produce_dataset.py
# (file-viewer listing metadata: 44 lines, 1.3 KiB, Python)

import os, h5py, numpy as np
def generate_dataset(path_to_recordings, output_path, dataset_name="data"):
    """
    Generates a dataset from a folder of .npy files and saves it to an HDF5 file.

    Each ``.npy`` entry directly inside ``path_to_recordings`` becomes one row
    of the dataset. Entries are processed in lexicographically sorted filename
    order so the row order is reproducible; entries whose name does not end in
    ``.npy`` are ignored. The first recording (in sorted order) defines the
    per-row shape and the dataset dtype.

    Parameters:
        path_to_recordings (str): Path to the folder containing .npy files
        output_path (str): Path to the output HDF5 file. The file is opened in
            "w" mode, so an existing file at this path is overwritten.
        dataset_name (str): Name of the dataset in the HDF5 file (default: "data")

    Returns:
        dset (h5py.Dataset): The created dataset object. NOTE: the HDF5 file is
            closed before this function returns, so the returned handle refers
            to a closed file; reopen ``output_path`` to read the data back.

    Raises:
        ValueError: If the directory contains no .npy files, or if a recording's
            shape differs from the shape of the first recording.
    """
    # os.listdir returns entries in arbitrary order and may contain unrelated
    # files; sort for a deterministic row order and keep only .npy entries.
    files = sorted(
        name
        for name in os.listdir(path_to_recordings)
        if name.lower().endswith(".npy")
    )
    if not files:
        raise ValueError("No .npy files found in the specified directory.")

    # Only create the output directory once the input has been validated.
    parent = os.path.dirname(output_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # The first recording fixes the per-row shape and the dataset dtype.
    sample = np.load(os.path.join(path_to_recordings, files[0]))
    shape = sample.shape
    dtype = sample.dtype

    with h5py.File(output_path, "w") as hf:
        dset = hf.create_dataset(
            dataset_name, shape=(len(files),) + shape, dtype=dtype, compression="gzip"
        )
        for idx, fname in enumerate(files):
            # Reuse the already-loaded sample instead of reading it twice.
            if idx == 0:
                data = sample
            else:
                data = np.load(os.path.join(path_to_recordings, fname))
            # Without this check a smaller-but-broadcastable array would be
            # silently broadcast into the row instead of being rejected.
            if data.shape != shape:
                raise ValueError(
                    f"Shape mismatch in {fname!r}: expected {shape}, got {data.shape}."
                )
            dset[idx, ...] = data
    return dset
if __name__ == "__main__":
    # Script entry point: build data/dataset.h5 from the "recordings" folder
    # and print the object returned by generate_dataset.
    result = generate_dataset("recordings", "data/dataset.h5")
    print(result)