optimized script

Liyu Xiao 2025-05-26 12:04:20 -04:00
parent 4f5101bd7e
commit a092b92174
3 changed files with 10 additions and 12 deletions

.gitignore

@@ -6,3 +6,4 @@ __pycache__/
 *.ipynb
 *.onnx
 *.json
+*.h5


@@ -37,12 +37,12 @@ jobs:
           python -m pip install --upgrade pip
           pip install -r requirements.txt
-      # - name: 1. Build HDF5 Dataset
-      #   run: |
-      #     mkdir -p data/dataset
-      #     PYTHONPATH=. python data/scripts/produce_dataset.py
-      #     echo "datasets produced successfully"
-      #   shell: bash
+      - name: 1. Build HDF5 Dataset
+        run: |
+          mkdir -p data/dataset
+          PYTHONPATH=. python data/scripts/produce_dataset.py
+          echo "datasets produced successfully"
+        shell: bash
       - name: Upload Dataset Artifacts
         uses: actions/upload-artifact@v3
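
For reference, the re-enabled CI step can be reproduced outside the workflow with a short Python driver. This is only a sketch: it assumes the repository root is the current working directory and mirrors the step's PYTHONPATH=. invocation.

import os
import subprocess

# Recreate the output directory the workflow step expects.
os.makedirs("data/dataset", exist_ok=True)
# Run the dataset script with the repo root on PYTHONPATH, as the CI step does.
subprocess.run(
    ["python", "data/scripts/produce_dataset.py"],
    env={**os.environ, "PYTHONPATH": "."},
    check=True,
)
print("datasets produced successfully")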


@@ -49,12 +49,9 @@ def write_hdf5_file(records, output_path, dataset_name="data"):
-    shape, dtype = sample.shape, sample.dtype
     with h5py.File(output_path, "w") as hf:
-        dset = hf.create_dataset(
-            dataset_name, shape=(len(records),) + shape, dtype=dtype, compression="gzip"
-        )
+        data_arr = np.stack([rec[0] for rec in records])
+        dset = hf.create_dataset(dataset_name, data=data_arr, compression="gzip")
-        for idx, (snip, md) in enumerate(records):
-            dset[idx, ...] = snip
         mg = hf.create_group("metadata")
         mg.create_dataset("metadata", data=meta_arr, compression="gzip")
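
For context, a minimal sketch of what the optimized writer plausibly looks like after this change, assuming records is a list of (array, metadata) tuples. The construction of meta_arr falls outside the visible hunk, so the string serialization shown here is an assumption, not the commit's actual code.

import h5py
import numpy as np

def write_hdf5_file(records, output_path, dataset_name="data"):
    # Stack every snippet into one contiguous array so h5py writes the
    # dataset in a single call instead of one indexed write per record.
    data_arr = np.stack([rec[0] for rec in records])
    # Assumption: meta_arr holds per-record metadata as variable-length
    # strings; its real construction is outside the visible hunk.
    meta_arr = np.array([str(rec[1]) for rec in records],
                        dtype=h5py.string_dtype())
    with h5py.File(output_path, "w") as hf:
        hf.create_dataset(dataset_name, data=data_arr, compression="gzip")
        mg = hf.create_group("metadata")
        mg.create_dataset("metadata", data=meta_arr, compression="gzip")

Passing data= lets h5py infer the shape and dtype and write the whole array at once. The old per-index writes into a gzip-compressed dataset force repeated read-modify-write cycles on compressed chunks, which is the likely slowdown this commit removes.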