Added the split datagen script

2025-05-16 11:26:33 -04:00 · 2025-05-16 11:26:33 -04:00 · ad2ef9509f
commit ad2ef9509f
parent e3a6b231f9
8 changed files with 201 additions and 44 deletions
--- a/.riahub/workflows/workflow.yaml
+++ b/.riahub/workflows/workflow.yaml
@ -41,7 +41,7 @@ jobs:
        run: |
          mkdir -p data
          python produce_dataset.py
-          echo "dataset produced successfully"
+          echo "datasets produced successfully"

      - name: 2. Train Model
        run: |
--- a/data/dataset.h5
+++ b/data/dataset.h5
--- a/data/train.h5
+++ b/data/train.h5
--- a/data/val.h5
+++ b/data/val.h5
--- a/produce_dataset.py
+++ b/produce_dataset.py
@ -1,43 +0,0 @@
-import os, h5py, numpy as np
-
-
-def generate_dataset(path_to_recordings, output_path, dataset_name="data"):
-    """
-    Generates a dataset from a folder of .npy files and saves it to an HDF5 file
-
-    Parameters:
-        path_to_recordings (str): Path to the folder containing .npy files
-        output_path (str): Path to the output HDF5 file
-        dataset_name (str): Name of the dataset in the HDF5 file (default: "data")
-
-    Returns:
-        dset (h5py.Dataset): The created dataset object
-    """
-
-    parent = os.path.dirname(output_path)
-    if parent:
-        os.makedirs(parent, exist_ok=True)
-
-    # we assume the recordings are in .npy format
-    files = os.listdir(path_to_recordings)
-    if not files:
-        raise ValueError("No files found in the specified directory.")
-
-    sample = np.load(os.path.join(path_to_recordings, files[0]))
-    shape = sample.shape
-    dtype = sample.dtype
-
-    with h5py.File(output_path, "w") as hf:
-        dset = hf.create_dataset(
-            dataset_name, shape=(len(files),) + shape, dtype=dtype, compression="gzip"
-        )
-
-        for idx, fname in enumerate(files):
-            data = np.load(os.path.join(path_to_recordings, fname))
-            dset[idx, ...] = data
-
-    return dset
-
-
-if __name__ == "__main__":
-    print(generate_dataset("recordings", "data/dataset.h5"))
--- a/scripts/data_gen.py
+++ b/scripts/data_gen.py
--- a/scripts/produce_dataset.py
+++ b/scripts/produce_dataset.py
@ -0,0 +1,151 @@
+import os, h5py, numpy as np
+from utils.io import from_npy
+from split_dataset import split
+
+# Structured dtype for one metadata row per snippet; write_hdf5_file fills it
+# from each record's metadata dict and stores it as "metadata/metadata".
+# String fields are fixed-width byte strings (256 / 32 bytes).
+meta_dtype = np.dtype(
+    [
+        ("rec_id", "S256"),
+        ("snippet_idx", np.int32),
+        ("modulation", "S32"),
+        ("snr", np.int32),
+        ("beta", np.float32),
+        ("sps", np.int32),
+    ]
+)
+
+# Structured dtype for the single-row "metadata/dataset_info" summary that
+# write_hdf5_file stores alongside the per-snippet metadata.
+info_dtype = np.dtype(
+    [
+        ("num_records", np.int32),
+        ("dataset_name", "S64"),  # up to 64‐byte UTF-8 strings
+        ("creator", "S64"),
+    ]
+)
+
+def write_hdf5_file(records, output_path, dataset_name="data"):
+    """
+    Writes a list of (snippet, metadata) records to an HDF5 file.
+
+    File layout:
+        <dataset_name>         gzip-compressed dataset, one snippet per row
+        metadata/metadata      one ``meta_dtype`` row per snippet
+        metadata/dataset_info  a single ``info_dtype`` row (count, name, creator)
+
+    Parameters:
+        records (list): Non-empty list of (data, md) tuples. ``data`` must expose
+            ``.shape`` and ``.dtype``; every snippet is written into a dataset
+            sized from the first one, so all snippets must share its shape.
+            ``md`` is a dict with the keys "rec_id" and "modulation" (str),
+            "snippet_idx", "snr", "beta" and "sps".
+        output_path (str): Path to the output HDF5 file (overwritten if present)
+        dataset_name (str): Name of the dataset in the HDF5 file (default: "data")
+
+    Returns:
+        str: Path to the created HDF5 file
+
+    Raises:
+        ValueError: If ``records`` is empty
+    """
+    # fail early with a clear message instead of an IndexError on records[0]
+    if not records:
+        raise ValueError("Cannot write an HDF5 file from an empty list of records.")
+
+    meta_arr = np.empty(len(records), dtype=meta_dtype)
+    for i, (_, md) in enumerate(records):
+        meta_arr[i] = (
+            md["rec_id"].encode("utf-8"),
+            md["snippet_idx"],
+            md["modulation"].encode("utf-8"),
+            int(md["snr"]),
+            float(md["beta"]),
+            int(md["sps"]),
+        )
+
+    # the first snippet defines the per-row shape and dtype of the dataset
+    sample, _ = records[0]
+    shape, dtype = sample.shape, sample.dtype
+
+    with h5py.File(output_path, "w") as hf:
+        dset = hf.create_dataset(
+            dataset_name, shape=(len(records),) + shape, dtype=dtype, compression="gzip"
+        )
+
+        for idx, (snip, _) in enumerate(records):
+            dset[idx, ...] = snip
+
+        mg = hf.create_group("metadata")
+        mg.create_dataset("metadata", data=meta_arr, compression="gzip")
+
+        print(dset.shape, f"snippets created in {dataset_name}")
+
+        info_arr = np.array(
+            [
+                (
+                    len(records),
+                    dataset_name.encode("utf-8"),
+                    b"generate_dataset.py",  # already bytes
+                )
+            ],
+            dtype=info_dtype,
+        )
+
+        mg.create_dataset("dataset_info", data=info_arr)
+
+    return output_path
+
+def split_recording(recording_list, num_snippets=8):
+    """
+    Splits each recording into equal-length, non-overlapping snippets.
+
+    Each recording's data is cut along its second axis into ``num_snippets``
+    pieces of ``N // num_snippets`` samples; any trailing samples that do not
+    fill a whole snippet are dropped.
+
+    Parameters:
+        recording_list (list): List of (data, md) tuples, where ``data`` is a
+            2-D array and ``md`` is that recording's metadata dict
+        num_snippets (int): Number of snippets per recording (default: 8)
+
+    Returns:
+        list: List of (snippet, snippet_md) tuples. ``snippet_md`` is a shallow
+            copy of the recording's metadata with "snippet_idx" added; the
+            input metadata dicts are not modified.
+
+    Raises:
+        ValueError: If ``num_snippets`` is smaller than 1
+    """
+    if num_snippets < 1:
+        raise ValueError("num_snippets must be at least 1.")
+
+    snippet_list = []
+
+    for data, md in recording_list:
+        # unpacking two values keeps the requirement that data is 2-D
+        _, num_samples = data.shape
+        length = num_samples // num_snippets
+        for i in range(num_snippets):
+            snippet = data[:, i * length:(i + 1) * length]
+            # copy the metadata, adding a snippet index
+            snippet_md = md.copy()
+            snippet_md["snippet_idx"] = i
+            snippet_list.append((snippet, snippet_md))
+    return snippet_list
+
+
+def generate_datasets(path_to_recordings, output_path, dataset_name="data"):
+    """
+    Generates training and validation datasets from a folder of recordings.
+
+    Every file in ``path_to_recordings`` is loaded with ``from_npy``, split into
+    snippets by ``split_recording``, divided into train/validation sets by
+    ``split`` (80/20, seed 42) and written to ``train.h5`` and ``val.h5``
+    inside ``output_path``.
+
+    Parameters:
+        path_to_recordings (str): Path to the folder containing .npy files
+        output_path (str): Directory that receives train.h5 and val.h5
+            (created if it does not exist)
+        dataset_name (str): Unused; kept for backward compatibility. The
+            datasets are always named "training_data" and "validation_data".
+
+    Returns:
+        tuple: (train_path, val_path), the paths of the two HDF5 files written
+
+    Raises:
+        ValueError: If ``path_to_recordings`` contains no files
+    """
+
+    # output_path is a directory, so create it (and its parents) itself
+    if output_path:
+        os.makedirs(output_path, exist_ok=True)
+
+    # we assume the recordings are in .npy format
+    # sorted: os.listdir order is arbitrary, which would make the seeded
+    # train/val split differ between machines
+    files = sorted(os.listdir(path_to_recordings))
+    if not files:
+        raise ValueError("No files found in the specified directory.")
+
+    records = []
+    for fname in files:
+        rec = from_npy(os.path.join(path_to_recordings, fname))
+
+        md = rec.metadata  # pull metadata from the recording
+        # downstream code groups and encodes md["rec_id"] as a string, so fall
+        # back to the (unique) file name when the recording does not carry one
+        md.setdefault("rec_id", fname)
+        records.append((rec.data, md))
+
+    # split each recording into 8 snippets each
+    records = split_recording(records)
+
+    train_records, val_records = split(records, train_frac=0.8, seed=42)
+
+    train_path = os.path.join(output_path, "train.h5")
+    val_path = os.path.join(output_path, "val.h5")
+
+    write_hdf5_file(train_records, train_path, "training_data")
+    write_hdf5_file(val_records, val_path, "validation_data")
+
+    return train_path, val_path
+
+
+# Script entry point: read the "recordings" folder, write train.h5 / val.h5
+# under "data", and print the two resulting paths.
+if __name__ == "__main__":
+    print(generate_datasets("recordings", "data"))
--- a/scripts/split_dataset.py
+++ b/scripts/split_dataset.py
@ -0,0 +1,49 @@
+import random
+from collections import defaultdict
+
+
+def split(dataset, train_frac=0.8, seed=42):
+    """
+    Splits a dataset into training and validation records, keeping all
+    records that share a "rec_id" on the same side of the split.
+
+    Recording ids are shuffled with a private, seeded random generator and
+    added to the training set greedily: a recording is taken only if all of
+    its records still fit within ``int(len(dataset) * train_frac)``. A
+    recording that does not fit is skipped, and later ones are still tried.
+    Everything not selected goes to the validation set, so the training set
+    may be smaller than the target (or empty when no recording fits).
+
+    Parameters:
+        dataset (list): List of (data, md) tuples; ``md["rec_id"]`` identifies
+            the source recording and must be hashable.
+        train_frac (float): Target fraction of records for training (default: 0.8)
+        seed (int): Seed for the shuffle of recording ids (default: 42)
+
+    Returns:
+        tuple: (train_records, val_records), each a list of the original
+            (data, md) tuples in their original relative order.
+    """
+    N = len(dataset)
+    target = int(N * train_frac)
+
+    # group record indices by the recording they came from
+    by_rec = defaultdict(list)
+    for i, (_, md) in enumerate(dataset):
+        by_rec[md['rec_id']].append(i)
+
+    rec_ids = list(by_rec)
+    # a dedicated generator gives the same shuffle as random.seed(seed) would,
+    # without resetting the process-wide random state for the caller
+    random.Random(seed).shuffle(rec_ids)
+
+    train_set = set()
+    count = 0
+    for rec_id in rec_ids:
+        index = by_rec[rec_id]
+        if count + len(index) <= target:
+            train_set.update(index)
+            count += len(index)
+
+    validation_set = set(range(N)) - train_set
+
+    print(f"Train set :{len(train_set)}")
+    print(f"val set :{len(validation_set)}")
+
+    train_records = [dataset[i] for i in sorted(train_set)]
+    val_records = [dataset[i] for i in sorted(validation_set)]
+
+    return train_records, val_records
+
+