optimized script

Liyu Xiao 2025-05-26 12:04:20 -04:00
parent 4f5101bd7e
commit a092b92174
3 changed files with 10 additions and 12 deletions

.gitignore

@@ -6,3 +6,4 @@ __pycache__/
 *.ipynb
 *.onnx
 *.json
+*.h5


@@ -37,12 +37,12 @@ jobs:
           python -m pip install --upgrade pip
           pip install -r requirements.txt
-      # - name: 1. Build HDF5 Dataset
-      #   run: |
-      #     mkdir -p data/dataset
-      #     PYTHONPATH=. python data/scripts/produce_dataset.py
-      #     echo "datasets produced successfully"
-      #   shell: bash
+      - name: 1. Build HDF5 Dataset
+        run: |
+          mkdir -p data/dataset
+          PYTHONPATH=. python data/scripts/produce_dataset.py
+          echo "datasets produced successfully"
+        shell: bash
       - name: Upload Dataset Artifacts
         uses: actions/upload-artifact@v3
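
For reference, the re-enabled CI step can be reproduced outside the workflow with a short Python driver. This is only a sketch: it assumes the repository root is the current working directory and mirrors the step's PYTHONPATH=. invocation.

import os
import subprocess

# Recreate the output directory the workflow step expects.
os.makedirs("data/dataset", exist_ok=True)
# Run the dataset script with the repo root on PYTHONPATH, as the CI step does.
subprocess.run(
    ["python", "data/scripts/produce_dataset.py"],
    env={**os.environ, "PYTHONPATH": "."},
    check=True,
)
print("datasets produced successfully")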


@@ -49,12 +49,9 @@ def write_hdf5_file(records, output_path, dataset_name="data"):
-    shape, dtype = sample.shape, sample.dtype
     with h5py.File(output_path, "w") as hf:
-        dset = hf.create_dataset(
-            dataset_name, shape=(len(records),) + shape, dtype=dtype, compression="gzip"
-        )
+        data_arr = np.stack([rec[0] for rec in records])
+        dset = hf.create_dataset(dataset_name, data=data_arr, compression="gzip")
-        for idx, (snip, md) in enumerate(records):
-            dset[idx, ...] = snip
         mg = hf.create_group("metadata")
         mg.create_dataset("metadata", data=meta_arr, compression="gzip")
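
For context, a minimal sketch of what the optimized writer plausibly looks like after this change, assuming records is a list of (array, metadata) tuples. The construction of meta_arr falls outside the visible hunk, so the string serialization shown here is an assumption, not the commit's actual code.

import h5py
import numpy as np

def write_hdf5_file(records, output_path, dataset_name="data"):
    # Stack every snippet into one contiguous array so h5py writes the
    # dataset in a single call instead of one indexed write per record.
    data_arr = np.stack([rec[0] for rec in records])
    # Assumption: meta_arr holds per-record metadata as variable-length
    # strings; its real construction is outside the visible hunk.
    meta_arr = np.array([str(rec[1]) for rec in records],
                        dtype=h5py.string_dtype())
    with h5py.File(output_path, "w") as hf:
        hf.create_dataset(dataset_name, data=data_arr, compression="gzip")
        mg = hf.create_group("metadata")
        mg.create_dataset("metadata", data=meta_arr, compression="gzip")

Passing data= lets h5py infer the shape and dtype and write the whole array at once. The old per-index writes into a gzip-compressed dataset force repeated read-modify-write cycles on compressed chunks, which is the likely slowdown this commit removes.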