diff --git a/.gitignore b/.gitignore
index 7d082af..6b57304 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,5 @@ __pycache__/
 *.ckpt
 *.ipynb
 *.onnx
-*.json
\ No newline at end of file
+*.json
+*.h5
\ No newline at end of file
diff --git a/.riahub/workflows/workflow.yaml b/.riahub/workflows/workflow.yaml
index b9f4748..21780e8 100644
--- a/.riahub/workflows/workflow.yaml
+++ b/.riahub/workflows/workflow.yaml
@@ -37,12 +37,12 @@ jobs:
           python -m pip install --upgrade pip
           pip install -r requirements.txt
 
-      # - name: 1. Build HDF5 Dataset
-      #   run: |
-      #     mkdir -p data/dataset
-      #     PYTHONPATH=. python data/scripts/produce_dataset.py
-      #     echo "datasets produced successfully"
-      #   shell: bash
+      - name: 1. Build HDF5 Dataset
+        run: |
+          mkdir -p data/dataset
+          PYTHONPATH=. python data/scripts/produce_dataset.py
+          echo "datasets produced successfully"
+        shell: bash
 
       - name: Upload Dataset Artifacts
         uses: actions/upload-artifact@v3
diff --git a/data/scripts/produce_dataset.py b/data/scripts/produce_dataset.py
index 17ca878..2ff76e9 100644
--- a/data/scripts/produce_dataset.py
+++ b/data/scripts/produce_dataset.py
@@ -49,12 +49,9 @@ def write_hdf5_file(records, output_path, dataset_name="data"):
         shape, dtype = sample.shape, sample.dtype
 
     with h5py.File(output_path, "w") as hf:
-        dset = hf.create_dataset(
-            dataset_name, shape=(len(records),) + shape, dtype=dtype, compression="gzip"
-        )
+        data_arr = np.stack([rec[0] for rec in records])
+        dset = hf.create_dataset(dataset_name, data=data_arr, compression="gzip")
 
-        for idx, (snip, md) in enumerate(records):
-            dset[idx, ...] = snip
 
         mg = hf.create_group("metadata")
         mg.create_dataset("metadata", data=meta_arr, compression="gzip")
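
For reference, a minimal self-contained sketch of what write_hdf5_file ends up doing after this change, reassembled from the hunks above. It is not part of the diff; the construction of meta_arr (JSON-encoded metadata stored as variable-length strings) is an assumption, since the diff only shows meta_arr being written, not built.

# Sketch only: rewritten write_hdf5_file as implied by the diff above.
import json

import h5py
import numpy as np


def write_hdf5_file(records, output_path, dataset_name="data"):
    # Stack every snippet (rec[0]) into one (N, *shape) array and let h5py
    # write it in a single create_dataset call, instead of pre-allocating the
    # dataset and filling it row by row as the removed loop did.
    data_arr = np.stack([rec[0] for rec in records])

    # Assumed metadata encoding: serialise each record's metadata dict to a
    # JSON string stored as a variable-length string dataset.
    meta_arr = np.array(
        [json.dumps(md) for _, md in records], dtype=h5py.string_dtype()
    )

    with h5py.File(output_path, "w") as hf:
        hf.create_dataset(dataset_name, data=data_arr, compression="gzip")
        mg = hf.create_group("metadata")
        mg.create_dataset("metadata", data=meta_arr, compression="gzip")


# Example call mirroring what the re-enabled workflow step exercises:
# write_hdf5_file([(np.zeros((4, 4), np.float32), {"id": 0})], "data/dataset/example.h5")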