optimized script

Liyu Xiao 2025-05-26 12:04:20 -04:00
parent 4f5101bd7e
commit a092b92174
3 changed files with 10 additions and 12 deletions

.gitignore

@@ -5,4 +5,5 @@ __pycache__/
 *.ckpt
 *.ipynb
 *.onnx
 *.json
+*.h5


@@ -37,12 +37,12 @@ jobs:
           python -m pip install --upgrade pip
           pip install -r requirements.txt
-      # - name: 1. Build HDF5 Dataset
-      #   run: |
-      #     mkdir -p data/dataset
-      #     PYTHONPATH=. python data/scripts/produce_dataset.py
-      #     echo "datasets produced successfully"
-      #   shell: bash
+      - name: 1. Build HDF5 Dataset
+        run: |
+          mkdir -p data/dataset
+          PYTHONPATH=. python data/scripts/produce_dataset.py
+          echo "datasets produced successfully"
+        shell: bash
       - name: Upload Dataset Artifacts
         uses: actions/upload-artifact@v3


@@ -49,12 +49,9 @@ def write_hdf5_file(records, output_path, dataset_name="data"):
     shape, dtype = sample.shape, sample.dtype
     with h5py.File(output_path, "w") as hf:
-        dset = hf.create_dataset(
-            dataset_name, shape=(len(records),) + shape, dtype=dtype, compression="gzip"
-        )
-        for idx, (snip, md) in enumerate(records):
-            dset[idx, ...] = snip
+        data_arr = np.stack([rec[0] for rec in records])
+        dset = hf.create_dataset(dataset_name, data=data_arr, compression="gzip")
         mg = hf.create_group("metadata")
         mg.create_dataset("metadata", data=meta_arr, compression="gzip")