Updated so that the data generation now works

This commit is contained in:
Liyu Xiao — 2025-06-13 14:17:13 -04:00
parent b49fa4c2b7
commit e4542e76c4
5 changed files with 18 additions and 16 deletions

View File

@ -40,7 +40,7 @@ jobs:
- name: 1. Generate Recordings - name: 1. Generate Recordings
run: | run: |
mkdir -p data/recordings mkdir -p data/recordings
PYTHONPATH=. python scripts/dataset_building/data_gen.py PYTHONPATH=. python scripts/generate_modulated_signals.py --output-dir data/recordings
echo "recordings produced successfully" echo "recordings produced successfully"
- name: Upload Recordings - name: Upload Recordings

View File

@ -3,9 +3,6 @@ general:
run_mode: prod run_mode: prod
dataset: dataset:
#where to read the recordings from to produce the data set
input_dir: data/recordings
#number of slices you want to split each recording into #number of slices you want to split each recording into
num_slices: 8 num_slices: 8
@ -19,8 +16,6 @@ dataset:
#multiple modulations to contain in the dataset #multiple modulations to contain in the dataset
modulation_types: [bpsk, qpsk, qam16, qam64] modulation_types: [bpsk, qpsk, qam16, qam64]
#where to output the datasets
output_dir: data/dataset
training: training:
#number of training samples being processed together before model updates its weights #number of training samples being processed together before model updates its weights
@ -40,5 +35,6 @@ inference:
num_classes: 4 num_classes: 4
app: app:
build_dir: dist build_dir: dist

View File

@ -12,13 +12,11 @@ class GeneralConfig:
@dataclass @dataclass
class DataSetConfig: class DataSetConfig:
input_dir: str
num_slices: int num_slices: int
train_split: float train_split: float
seed: int seed: int
modulation_types: list modulation_types: list
val_split: float val_split: float
output_dir: str
@dataclass @dataclass

View File

@ -1,6 +1,7 @@
from utils.data import Recording from utils.data import Recording
import numpy as np import numpy as np
from utils.signal import block_generator from utils.signal import block_generator
import argparse
mods = { mods = {
"bpsk": {"num_bits_per_symbol": 1, "constellation_type": "psk"}, "bpsk": {"num_bits_per_symbol": 1, "constellation_type": "psk"},
@ -10,7 +11,7 @@ mods = {
} }
def generate_modulated_signals(): def generate_modulated_signals(output_dir):
for modulation in ["bpsk", "qpsk", "qam16", "qam64"]: for modulation in ["bpsk", "qpsk", "qam16", "qam64"]:
for snr in np.arange(-6, 13, 3): for snr in np.arange(-6, 13, 3):
for i in range(100): for i in range(100):
@ -66,4 +67,11 @@ def generate_modulated_signals():
if __name__ == "__main__": if __name__ == "__main__":
generate_modulated_signals() p = argparse.ArgumentParser(description="Generate modulated signal .npy files")
p.add_argument(
"--output-dir",
default=".",
help="Folder where .npy files will be saved"
)
args = p.parse_args()
generate_modulated_signals(args.output_dir)

View File

@ -98,18 +98,18 @@ def generate_datasets(cfg):
dset (h5py.Dataset): The created dataset object dset (h5py.Dataset): The created dataset object
""" """
parent = os.path.dirname(cfg.output_dir) parent = os.path.dirname("data/dataset")
if not parent: if not parent:
os.makedirs(cfg.output_dir, exist_ok=True) os.makedirs("data/dataset", exist_ok=True)
# we assume the recordings are in .npy format # we assume the recordings are in .npy format
files = os.listdir(cfg.input_dir) files = os.listdir("data/recordings")
if not files: if not files:
raise ValueError("No files found in the specified directory.") raise ValueError("No files found in the specified directory.")
records = [] records = []
for fname in files: for fname in files:
rec = from_npy(os.path.join(cfg.input_dir, fname)) rec = from_npy(os.path.join("data/recordings", fname))
data = rec.data # here data is a numpy array with the shape (1, N) data = rec.data # here data is a numpy array with the shape (1, N)
@ -125,8 +125,8 @@ def generate_datasets(cfg):
train_records, val_records = split(records, cfg.train_split, cfg.seed) train_records, val_records = split(records, cfg.train_split, cfg.seed)
train_path = os.path.join(cfg.output_dir, "train.h5") train_path = os.path.join("data/dataset", "train.h5")
val_path = os.path.join(cfg.output_dir, "val.h5") val_path = os.path.join("data/dataset", "val.h5")
write_hdf5_file(train_records, train_path, "training_data") write_hdf5_file(train_records, train_path, "training_data")
write_hdf5_file(val_records, val_path, "validation_data") write_hdf5_file(val_records, val_path, "validation_data")