# forked from qoherent/modrec-workflow
import os

import h5py
import numpy as np


def generate_dataset(path_to_recordings, output_path, dataset_name="data"):
    """
    Generate a dataset from a folder of .npy recordings and save it to an HDF5 file.

    Parameters:
        path_to_recordings (str): Path to the folder containing .npy files.
        output_path (str): Path to the output HDF5 file.
        dataset_name (str): Name of the dataset in the HDF5 file (default: "data").

    Returns:
        output_path (str): Path to the written HDF5 file.
    """
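
    # Illustrative layout of the output (hypothetical numbers): 500 recordings
    # of 1024 IQ samples stored as complex64 become a single gzip-compressed
    # dataset:  /data  shape=(500, 1024)  dtype=complex64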

    # ensure the output directory exists before h5py creates the file
    parent = os.path.dirname(output_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # we assume the recordings are in .npy format; filter out anything else
    # and sort for a deterministic ordering across runs
    files = sorted(f for f in os.listdir(path_to_recordings) if f.endswith(".npy"))
    if not files:
        raise ValueError("No .npy files found in the specified directory.")

    # the first recording fixes the per-recording shape and dtype;
    # all recordings are assumed to share them
    sample = np.load(os.path.join(path_to_recordings, files[0]))
    shape = sample.shape
    dtype = sample.dtype

    with h5py.File(output_path, "w") as hf:
        # preallocate one slot per recording: (num_recordings, *recording_shape)
        dset = hf.create_dataset(
            dataset_name, shape=(len(files),) + shape, dtype=dtype, compression="gzip"
        )

        # load recordings one at a time so only a single array is held in memory
        for idx, fname in enumerate(files):
            data = np.load(os.path.join(path_to_recordings, fname))
            dset[idx, ...] = data

    # the h5py.Dataset handle is invalid once the file closes, so return the path
    return output_path


if __name__ == "__main__":
    print(generate_dataset("recordings", "data/dataset.h5"))
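
    # A minimal round-trip check, a sketch assuming the paths used above:
    # reopen the file read-only and confirm the stored shape and dtype.
    with h5py.File("data/dataset.h5", "r") as hf:
        print(hf["data"].shape, hf["data"].dtype)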