LFS'd the recordings folder, added data_gen.py (generates modulated .npy recordings) and produce_dataset.py (builds an .h5 dataset file under the data folder), modified the workflow file to run the script on every push/PR, and uploads the dataset as a workflow artifact

This commit is contained in:
Liyu Xiao 2025-05-15 10:47:54 -04:00
parent 8f28f5db0f
commit 12b920c88d
33 changed files with 114 additions and 1 deletion

1
.gitattributes vendored Normal file

@@ -0,0 +1 @@
recordings/** filter=lfs diff=lfs merge=lfs -text


@@ -37,7 +37,7 @@ jobs:
      - name: 1. Build Dataset
        run: |
          echo "building dataset"
-          # Placeholder: implement conversion from raw .npy recordings → train/val sets
+          python produce_dataset.py

BIN
data/dataset.h5 Normal file

Binary file not shown.

69
data_gen.py Normal file

@@ -0,0 +1,69 @@
from utils.data import Recording
import numpy as np
from utils.signal import block_generator

mods = {
"bpsk": {"num_bits_per_symbol": 1, "constellation_type": "psk"},
"qpsk": {"num_bits_per_symbol": 2, "constellation_type": "psk"},
"qam16": {"num_bits_per_symbol": 4, "constellation_type": "qam"},
"qam64": {"num_bits_per_symbol": 6, "constellation_type": "qam"},
}


def generate_modulated_signals():
for modulation in ["bpsk", "qpsk", "qam16", "qam64"]:
        for snr in np.arange(-6, 13, 3):  # SNR sweep: -6 to 12 dB in 3 dB steps
recording_length = 1024
beta = 0.3 # the rolloff factor, can be changed to add variety
sps = 4 # samples per symbol, or the relative bandwidth of the digital signal. Can also be changed.
# blocks don't directly take the string 'qpsk' so we use the dict 'mods' to get parameters
constellation_type = mods[modulation]["constellation_type"]
num_bits_per_symbol = mods[modulation]["num_bits_per_symbol"]
# construct the digital modulation blocks with these parameters
# we have bit source -> mapper -> upsampling -> pulse shaping
bit_source = block_generator.RandomBinarySource()
mapper = block_generator.Mapper(
constellation_type=constellation_type,
num_bits_per_symbol=num_bits_per_symbol,
)
upsampler = block_generator.Upsampling(factor=sps)
pulse_shaping_filter = block_generator.RaisedCosineFilter(
upsampling_factor=sps, beta=beta
)
pulse_shaping_filter.connect_input([upsampler])
upsampler.connect_input([mapper])
mapper.connect_input([bit_source])
modulation_recording = pulse_shaping_filter.record(
num_samples=recording_length
)
            # add noise: measure the modulated signal's power, then generate AWGN
            # sized by the target SNR (dB -> linear power ratio, with the noise
            # power split across the real and imaginary components)
            signal_power = np.mean(np.abs(modulation_recording.data[0]) ** 2)
            awgn_source = block_generator.AWGNSource(
                variance=(signal_power / 2) * (10 ** (-snr / 10))
            )
noise = awgn_source.record(num_samples=recording_length)
samples_with_noise = modulation_recording.data + noise.data
output_recording = Recording(data=samples_with_noise)
# add metadata for ML later
output_recording.add_to_metadata(key="modulation", value=modulation)
output_recording.add_to_metadata(key="snr", value=int(snr))
output_recording.add_to_metadata(key="beta", value=beta)
output_recording.add_to_metadata(key="sps", value=sps)
# view if you want
# output_recording.view()
# save to file
output_recording.to_npy() # optionally add path and filename parameters


if __name__ == "__main__":
    generate_modulated_signals()
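
A quick sanity check of the noise calibration above (an illustrative sketch, not part of the commit): with the SNR given in dB, the AWGN power should come out to signal_power * 10**(-snr/10), split evenly between the real and imaginary components. All names and numbers below are made up for the check.

# sanity-check the AWGN calibration used in data_gen.py (illustrative sketch)
import numpy as np

snr_db = 6
signal_power = 1.0  # assume a unit-power modulated signal for the check

# SNR (dB) -> linear power ratio; noise power = signal_power / 10**(snr/10)
noise_power = signal_power * 10 ** (-snr_db / 10)

# complex AWGN: half of the noise power in each of the real/imaginary parts
rng = np.random.default_rng(0)
scale = np.sqrt(noise_power / 2)
noise = rng.normal(scale=scale, size=100_000) + 1j * rng.normal(scale=scale, size=100_000)

measured_power = np.mean(np.abs(noise) ** 2)
print(f"target noise power: {noise_power:.4f}, measured: {measured_power:.4f}")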

43
produce_dataset.py Normal file

@@ -0,0 +1,43 @@
import os

import h5py
import numpy as np


def generate_dataset(path_to_recordings, output_path, dataset_name="data"):
"""
Generates a dataset from a folder of .npy files and saves it to an HDF5 file
Parameters:
path_to_recordings (str): Path to the folder containing .npy files
output_path (str): Path to the output HDF5 file
dataset_name (str): Name of the dataset in the HDF5 file (default: "data")
Returns:
dset (h5py.Dataset): The created dataset object
"""
parent = os.path.dirname(output_path)
if parent:
os.makedirs(parent, exist_ok=True)
    # we assume the recordings are in .npy format; sort for a deterministic order
    files = sorted(f for f in os.listdir(path_to_recordings) if f.endswith(".npy"))
    if not files:
        raise ValueError("No .npy files found in the specified directory.")

    # use the first recording to fix the per-item shape and dtype
    sample = np.load(os.path.join(path_to_recordings, files[0]))
    shape = sample.shape
    dtype = sample.dtype
    with h5py.File(output_path, "w") as hf:
        dset = hf.create_dataset(
            dataset_name, shape=(len(files),) + shape, dtype=dtype, compression="gzip"
        )
        for idx, fname in enumerate(files):
            data = np.load(os.path.join(path_to_recordings, fname))
            dset[idx, ...] = data

    # the h5py dataset handle is invalid once the file is closed, so return the path
    return output_path


if __name__ == "__main__":
    print(generate_dataset("recordings", "data/dataset.h5"))
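
For reference, a minimal read-back of the generated file (a sketch; it assumes the default dataset name "data" used by produce_dataset.py and that data/dataset.h5 already exists):

import h5py

with h5py.File("data/dataset.h5", "r") as hf:
    dset = hf["data"]
    print(dset.shape, dset.dtype)  # (num_recordings, ...per-recording shape)
    first = dset[0]  # slicing reads a single recording into memory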

BIN: 28 binary recording files (the LFS-tracked recordings; binary files not shown)