removed data/ from the git ignore
All checks were successful
RIA Hub Workflow Demo / ria-demo (push) Successful in 36s
This commit is contained in:
parent b49f351a4c
commit 2b2766524c
2  .gitignore  vendored
@@ -2,5 +2,3 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 
-
-data/
57  data/dataset/modulation_dataset.py  Normal file
@@ -0,0 +1,57 @@
import sys, os

sys.path.insert(0, os.path.abspath("../.."))  # or ".." if needed

import numpy as np
import torch
from torch.utils.data import Dataset
import h5py

from helpers.app_settings import get_app_settings

settings = get_app_settings()
dataset = settings.dataset.modulation_types


class ModulationH5Dataset(Dataset):
    def __init__(self, hdf5_path, label_name, data_key="training_data", label_encoder=None, transform=None):
        self.hdf5_path = hdf5_path
        self.data_key = data_key
        self.label_name = label_name
        self.label_encoder = label_encoder
        self.transform = transform

        with h5py.File(hdf5_path, 'r') as f:
            self.length = f[data_key].shape[0]
            self.metadata = f["metadata"]["metadata"][:]

        settings = get_app_settings()
        dataset_cfg = settings.dataset
        all_labels = dataset_cfg.modulation_types

        if self.label_encoder is None:
            from sklearn.preprocessing import LabelEncoder
            self.label_encoder = LabelEncoder()
            self.label_encoder.fit(all_labels)

        # Get per-sample labels from metadata
        raw_labels = [row["modulation"].decode("utf-8") for row in self.metadata]
        self.encoded_labels = self.label_encoder.transform(raw_labels)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        with h5py.File(self.hdf5_path, 'r') as f:
            x = f[self.data_key][idx]  # shape (1, 128) or similar

        # Normalize
        mean = np.mean(x, axis=-1, keepdims=True)
        std = np.std(x, axis=-1, keepdims=True)
        x = (x - mean) / (std + 1e-6)
        x = torch.tensor(x, dtype=torch.float32)

        label = torch.tensor(self.encoded_labels[idx], dtype=torch.long)
        return x, label
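For context, a hypothetical usage sketch (not part of this commit): wrapping the new dataset class in a PyTorch DataLoader. The path and the default "training_data" key match the train.h5 file added below; the batch size is arbitrary.

from torch.utils.data import DataLoader

train_ds = ModulationH5Dataset("data/dataset/train.h5", label_name="modulation")
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)

x, y = next(iter(train_loader))
print(x.shape, y.shape)  # e.g. torch.Size([64, 2, 128]), torch.Size([64])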
BIN  data/dataset/train.h5  Normal file
Binary file not shown.
BIN  data/dataset/val.h5  Normal file
Binary file not shown.
58  data/models/cm_plotter.py  Normal file
@@ -0,0 +1,58 @@
import numpy as np
from typing import Optional
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(
    y_true: np.array,
    y_pred: np.array,
    classes: list,
    normalize: bool = True,
    title: Optional[str] = None,
    text: bool = True,
    rotate_x_text: int = 90,
    figsize: tuple = (16, 9),
    cmap: plt.cm = plt.cm.Blues,
):
    """Function to help plot confusion matrices

    https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    if not title:
        if normalize:
            title = "Normalized confusion matrix"
        else:
            title = "Confusion matrix, without normalization"

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation="none", cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    ax.set(
        xticks=np.arange(cm.shape[1]),
        yticks=np.arange(cm.shape[0]),
        xticklabels=classes,
        yticklabels=classes,
        title=title,
        ylabel="True label",
        xlabel="Predicted label",
    )
    ax.set_xticklabels(classes, rotation=rotate_x_text)
    ax.figure.set_size_inches(figsize)

    # Loop over data dimensions and create text annotations.
    fmt = ".2f" if normalize else "d"
    thresh = cm.max() / 2.0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            if text:
                ax.text(
                    j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black",
                )
    if len(classes) == 2:
        plt.axis([-0.5, 1.5, 1.5, -0.5])
    fig.tight_layout()

    return ax
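For context, a hypothetical usage sketch (not part of this commit): plotting a small confusion matrix. The class names are illustrative.

import numpy as np

y_true = np.array([0, 0, 1, 1, 2, 2])
y_pred = np.array([0, 1, 1, 1, 2, 0])
ax = plot_confusion_matrix(y_true, y_pred, classes=["bpsk", "qpsk", "qam16"])
ax.figure.savefig("confusion_matrix.png")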
3428  data/models/interference_recognition.ipynb  Normal file
File diff suppressed because it is too large
206  data/models/mobilenetv3.py  Normal file
@@ -0,0 +1,206 @@
import numpy as np
import torch
import timm
from torch import nn

sizes = [
    'mobilenetv3_large_075',
    'mobilenetv3_large_100',
    'mobilenetv3_rw',
    'mobilenetv3_small_050',
    'mobilenetv3_small_075',
    'mobilenetv3_small_100',
    'tf_mobilenetv3_large_075',
    'tf_mobilenetv3_large_100',
    'tf_mobilenetv3_large_minimal_100',
    'tf_mobilenetv3_small_075',
    'tf_mobilenetv3_small_100',
    'tf_mobilenetv3_small_minimal_100'
]


class SqueezeExcite(nn.Module):
    def __init__(
        self,
        in_chs,
        se_ratio=0.25,
        reduced_base_chs=None,
        act_layer=nn.SiLU,
        gate_fn=torch.sigmoid,
        divisor=1,
        **_,
    ):
        super(SqueezeExcite, self).__init__()
        reduced_chs = reduced_base_chs
        self.conv_reduce = nn.Conv1d(in_chs, reduced_chs, 1, bias=True)
        self.act1 = act_layer(inplace=True)
        self.conv_expand = nn.Conv1d(reduced_chs, in_chs, 1, bias=True)
        self.gate_fn = gate_fn

    def forward(self, x):
        x_se = x.mean((2,), keepdim=True)
        x_se = self.conv_reduce(x_se)
        x_se = self.act1(x_se)
        x_se = self.conv_expand(x_se)
        return x * self.gate_fn(x_se)


class FastGlobalAvgPool1d(nn.Module):
    def __init__(self, flatten=False):
        super(FastGlobalAvgPool1d, self).__init__()
        self.flatten = flatten

    def forward(self, x):
        if self.flatten:
            in_size = x.size()
            return x.view((in_size[0], in_size[1], -1)).mean(dim=2)
        else:
            return x.view(x.size(0), x.size(1), -1).mean(-1).view(x.size(0), x.size(1), 1)


class GBN(torch.nn.Module):
    """
    Ghost Batch Normalization
    https://arxiv.org/abs/1705.08741
    """

    def __init__(self, input_dim, drop, act, virtual_batch_size=32, momentum=0.1):
        super(GBN, self).__init__()

        self.input_dim = input_dim
        self.virtual_batch_size = virtual_batch_size
        self.bn = nn.BatchNorm1d(self.input_dim, momentum=momentum)
        self.drop = drop
        self.act = act

    def forward(self, x):
        # Ghost-batch variant, currently disabled:
        # chunks = x.chunk(int(np.ceil(x.shape[0] / self.virtual_batch_size)), 0)
        # res = [self.bn(x_) for x_ in chunks]
        # return self.drop(self.act(torch.cat(res, dim=0)))
        return self.drop(self.act(self.bn(x)))


def replace_bn(parent):
    for n, m in parent.named_children():
        if type(m) is timm.layers.norm_act.BatchNormAct2d:
            setattr(
                parent,
                n,
                GBN(m.num_features, m.drop, m.act),
            )
        else:
            replace_bn(m)


def replace_se(parent):
    for n, m in parent.named_children():
        if type(m) is timm.models._efficientnet_blocks.SqueezeExcite:
            setattr(
                parent,
                n,
                SqueezeExcite(
                    m.conv_reduce.in_channels,
                    reduced_base_chs=m.conv_reduce.out_channels,
                ),
            )
        else:
            replace_se(m)


def replace_conv(parent, ds_rate):
    for n, m in parent.named_children():
        if type(m) is nn.Conv2d:
            if ds_rate == 2:
                setattr(
                    parent,
                    n,
                    nn.Conv1d(
                        m.in_channels,
                        m.out_channels,
                        kernel_size=m.kernel_size[0],
                        stride=m.stride[0],
                        padding=m.padding[0],
                        bias=m.bias is not None,  # bias expects a bool
                        groups=m.groups,
                    ),
                )
            else:
                setattr(
                    parent,
                    n,
                    nn.Conv1d(
                        m.in_channels,
                        m.out_channels,
                        kernel_size=m.kernel_size[0] if m.kernel_size[0] == 1 else 5,
                        stride=m.stride[0] if m.stride[0] == 1 else ds_rate,
                        padding=m.padding[0] if m.padding[0] == 0 else 2,
                        bias=m.bias is not None,  # bias expects a bool
                        groups=m.groups,
                    ),
                )
        else:
            replace_conv(m, ds_rate)


def create_mobilenetv3(network, ds_rate=2, in_chans=2):
    replace_se(network)
    replace_bn(network)
    replace_conv(network, ds_rate)
    network.global_pool = FastGlobalAvgPool1d()

    network.conv_stem = nn.Conv1d(
        in_channels=in_chans,
        out_channels=network.conv_stem.out_channels,
        kernel_size=network.conv_stem.kernel_size,
        stride=network.conv_stem.stride,
        padding=network.conv_stem.padding,
        bias=network.conv_stem.bias is not None,  # bias expects a bool
        groups=network.conv_stem.groups,
    )

    return network


def mobilenetv3(
    model_size='mobilenetv3_small_050',
    num_classes: int = 10,
    drop_rate: float = 0,
    drop_path_rate: float = 0,
    in_chans=2,
):
    mdl = create_mobilenetv3(
        timm.create_model(
            model_size,
            num_classes=num_classes,
            in_chans=in_chans,
            drop_path_rate=drop_path_rate,
            drop_rate=drop_rate,
            exportable=True,
        ),
        in_chans=in_chans,
    )
    return mdl


import torch.nn as nn


class Simple1DCNN(nn.Module):
    def __init__(self, in_chans=2, num_classes=4):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv1d(in_chans, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        return self.net(x)  # x shape: [B, 2, 128]


def simple_cnn(in_chans=2, num_classes=4):
    return Simple1DCNN(in_chans, num_classes)
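For context, a hypothetical smoke-test sketch (not part of this commit): building the 1D-converted model and running a dummy batch through it. The shapes are illustrative.

import torch

model = mobilenetv3(model_size='mobilenetv3_small_050', num_classes=4, in_chans=2)
model.eval()
with torch.no_grad():
    x = torch.randn(8, 2, 128)  # a batch of 2-channel IQ snippets
    logits = model(x)
print(logits.shape)  # expected: torch.Size([8, 4])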
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_0264b4a.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_0b3b80f.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_1effc4c.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_37a73db.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_3d557a9.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_442fcb9.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_491c457.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_4fff84f.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_6676600.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_6d35ff9.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_6d85f3e.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_85a8c83.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_940988e.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_9f88dc2.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_a4a6ba6.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_a60964b.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_ad350fe.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_ae5224a.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_b68f080.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_c00477b.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_cca57ca.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_db8a5b4.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_dd021f7.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_e0cc41d.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_e61d9bf.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_f024082.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_f2013fa.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
BIN  data/recordings/rec_0Hz_2025-05-15_09-45-10_f2ae593.npy  (Stored with Git LFS)  Normal file  Binary file not shown.
69  data/scripts/data_gen.py  Normal file
@@ -0,0 +1,69 @@
from utils.data import Recording
import numpy as np
from utils.signal import block_generator

mods = {
    "bpsk": {"num_bits_per_symbol": 1, "constellation_type": "psk"},
    "qpsk": {"num_bits_per_symbol": 2, "constellation_type": "psk"},
    "qam16": {"num_bits_per_symbol": 4, "constellation_type": "qam"},
    "qam64": {"num_bits_per_symbol": 6, "constellation_type": "qam"},
}


def generate_modulated_signals():
    for modulation in ["bpsk", "qpsk", "qam16", "qam64"]:
        for snr in np.arange(-6, 13, 3):

            recording_length = 1024
            beta = 0.3  # the rolloff factor; can be changed to add variety
            sps = 4  # samples per symbol, i.e. the relative bandwidth of the digital signal; can also be changed

            # blocks don't directly take the string 'qpsk', so we use the dict 'mods' to get parameters
            constellation_type = mods[modulation]["constellation_type"]
            num_bits_per_symbol = mods[modulation]["num_bits_per_symbol"]

            # construct the digital modulation blocks with these parameters:
            # bit source -> mapper -> upsampling -> pulse shaping
            bit_source = block_generator.RandomBinarySource()
            mapper = block_generator.Mapper(
                constellation_type=constellation_type,
                num_bits_per_symbol=num_bits_per_symbol,
            )
            upsampler = block_generator.Upsampling(factor=sps)
            pulse_shaping_filter = block_generator.RaisedCosineFilter(
                upsampling_factor=sps, beta=beta
            )

            pulse_shaping_filter.connect_input([upsampler])
            upsampler.connect_input([mapper])
            mapper.connect_input([bit_source])

            modulation_recording = pulse_shaping_filter.record(
                num_samples=recording_length
            )

            # add noise: calculate the power of the modulation recording and generate AWGN from the snr parameter
            signal_power = np.mean(np.abs(modulation_recording.data[0]) ** 2)
            awgn_source = block_generator.AWGNSource(
                variance=(signal_power / 2) * (10 ** ((-1 * snr) / 20))
            )
            noise = awgn_source.record(num_samples=recording_length)
            samples_with_noise = modulation_recording.data + noise.data
            output_recording = Recording(data=samples_with_noise)

            # add metadata for ML later
            output_recording.add_to_metadata(key="modulation", value=modulation)
            output_recording.add_to_metadata(key="snr", value=int(snr))
            output_recording.add_to_metadata(key="beta", value=beta)
            output_recording.add_to_metadata(key="sps", value=sps)

            # view if you want
            # output_recording.view()

            # save to file
            output_recording.to_npy()  # optionally add path and filename parameters


if __name__ == "__main__":
    generate_modulated_signals()
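For context, a hypothetical inspection sketch (not part of this commit): loading one of the committed recordings and checking the metadata written above. from_npy is the helper used by produce_dataset.py below; the file name is one of the recordings added in this commit, and the shapes are illustrative.

from utils.io import from_npy

rec = from_npy("data/recordings/rec_0Hz_2025-05-15_09-45-10_0264b4a.npy")
print(rec.metadata)    # e.g. {'modulation': 'qpsk', 'snr': -6, 'beta': 0.3, 'sps': 4}
print(rec.data.shape)  # (1, 1024) complex IQ samples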
152  data/scripts/produce_dataset.py  Normal file
@@ -0,0 +1,152 @@
import os, h5py, numpy as np
from utils.io import from_npy
from split_dataset import split, split_recording
from helpers.app_settings import get_app_settings

meta_dtype = np.dtype(
    [
        ("rec_id", "S256"),
        ("snippet_idx", np.int32),
        ("modulation", "S32"),
        ("snr", np.int32),
        ("beta", np.float32),
        ("sps", np.int32),
    ]
)

info_dtype = np.dtype(
    [
        ("num_records", np.int32),
        ("dataset_name", "S64"),  # up to 64-byte UTF-8 strings
        ("creator", "S64"),
    ]
)


def write_hdf5_file(records, output_path, dataset_name="data"):
    """
    Writes a list of records to an HDF5 file.

    Parameters:
        records (list): List of records to be written to the file
        output_path (str): Path to the output HDF5 file
        dataset_name (str): Name of the dataset in the HDF5 file (default: "data")

    Returns:
        str: Path to the created HDF5 file
    """
    meta_arr = np.empty(len(records), dtype=meta_dtype)
    for i, (_, md) in enumerate(records):
        meta_arr[i] = (
            md["rec_id"].encode("utf-8"),
            md["snippet_idx"],
            md["modulation"].encode("utf-8"),
            int(md["snr"]),
            float(md["beta"]),
            int(md["sps"]),
        )

    first_rec, _ = records[0]  # records[0] is a tuple of (data, md)
    sample = first_rec
    shape, dtype = sample.shape, sample.dtype

    with h5py.File(output_path, "w") as hf:
        dset = hf.create_dataset(
            dataset_name, shape=(len(records),) + shape, dtype=dtype, compression="gzip"
        )

        for idx, (snip, md) in enumerate(records):
            dset[idx, ...] = snip

        mg = hf.create_group("metadata")
        mg.create_dataset("metadata", data=meta_arr, compression="gzip")

        print(dset.shape, f"snippets created in {dataset_name}")

        info_arr = np.array(
            [
                (
                    len(records),
                    dataset_name.encode("utf-8"),
                    b"generate_dataset.py",  # already bytes
                )
            ],
            dtype=info_dtype,
        )

        mg.create_dataset("dataset_info", data=info_arr)

    return output_path


def complex_to_channel(data):
    """
    Convert complex-valued IQ data of shape (1, N) to a 2-channel real array of shape (2, N).
    """
    assert np.iscomplexobj(data)  # check that the data is in the form a+bi
    real = np.real(data[0])  # (N,)
    imag = np.imag(data[0])  # (N,)
    stacked = np.stack([real, imag], axis=0)  # shape (2, N)
    return stacked.astype(np.float32)


def generate_datasets(cfg):
    """
    Generates train and validation datasets from a folder of .npy files and saves them to HDF5 files.

    Parameters:
        cfg: Dataset config with input_dir, output_dir, num_slices, train_split, and seed

    Returns:
        tuple: Paths to the created train and validation HDF5 files
    """

    os.makedirs(cfg.output_dir, exist_ok=True)

    # we assume the recordings are in .npy format
    files = os.listdir(cfg.input_dir)
    if not files:
        raise ValueError("No files found in the specified directory.")

    records = []
    for fname in files:
        rec = from_npy(os.path.join(cfg.input_dir, fname))

        data = rec.data  # here data is a numpy array with the shape (1, N)
        data = complex_to_channel(data)  # convert to 2-channel real array

        md = rec.metadata  # pull metadata from the recording
        md.setdefault("recid", len(records))
        records.append((data, md))

    # split each recording into <num_slices> snippets each
    records = split_recording(records, cfg.num_slices)

    train_records, val_records = split(records, cfg.train_split, cfg.seed)

    train_path = os.path.join(cfg.output_dir, "train.h5")
    val_path = os.path.join(cfg.output_dir, "val.h5")

    write_hdf5_file(train_records, train_path, "training_data")
    write_hdf5_file(val_records, val_path, "validation_data")

    return train_path, val_path


def main():
    settings = get_app_settings()
    dataset_cfg = settings.dataset
    train_path, val_path = generate_datasets(dataset_cfg)
    print(f"✅ Train: {train_path}\n✅ Val: {val_path}")


if __name__ == "__main__":
    main()
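For context, a hypothetical verification sketch (not part of this commit): reading back the HDF5 layout that write_hdf5_file produces. Shapes are illustrative.

import h5py

with h5py.File("data/dataset/train.h5", "r") as f:
    print(f["training_data"].shape)  # (num_snippets, 2, N // num_slices)
    meta = f["metadata"]["metadata"][:]
    print(meta.dtype.names)          # ('rec_id', 'snippet_idx', 'modulation', 'snr', 'beta', 'sps')
    print(meta["modulation"][:5])    # byte strings, e.g. b'qpsk'
    print(f["metadata"]["dataset_info"][0])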
79  data/scripts/split_dataset.py  Normal file
@@ -0,0 +1,79 @@
import random
from collections import defaultdict


def split(dataset, train_frac=0.8, seed=42, label_key="modulation"):
    """
    Splits a dataset into train and validation sets, stratified by label.

    Parameters:
        dataset (list): The dataset to be split, as (data, metadata) tuples.
        train_frac (float): Fraction of recordings assigned to the train set.
        seed (int): Random seed for the shuffle.
        label_key (str): Metadata key used for stratification.

    Returns:
        tuple: The train and validation datasets.
    """
    rec_buckets = defaultdict(list)
    for data, md in dataset:
        rec_buckets[md["recid"]].append((data, md))

    rec_labels = {}  # store labels for each recording
    for rec_id, group in rec_buckets.items():
        label = group[0][1][label_key]
        if isinstance(label, bytes):  # if the label is a byte string
            label = label.decode("utf-8")
        rec_labels[rec_id] = label

    label_rec_ids = defaultdict(list)  # group rec_ids by label
    for rec_id, label in rec_labels.items():
        label_rec_ids[label].append(rec_id)

    random.seed(seed)
    train_recs, val_recs = set(), set()

    for label, rec_ids in label_rec_ids.items():
        random.shuffle(rec_ids)
        split_idx = int(len(rec_ids) * train_frac)
        # pull train_frac of rec_ids per label, guaranteeing all modulations are represented
        train_recs.update(rec_ids[:split_idx])
        val_recs.update(rec_ids[split_idx:])

    # add the assigned recordings to the train and val datasets
    train_dataset, val_dataset = [], []
    for rec_id, group in rec_buckets.items():
        if rec_id in train_recs:
            train_dataset.extend(group)
        elif rec_id in val_recs:
            val_dataset.extend(group)

    return train_dataset, val_dataset


def split_recording(recording_list, num_snippets):
    """
    Splits a list of recordings into smaller chunks.

    Parameters:
        recording_list (list): List of recordings to be split

    Returns:
        list: List of split recordings
    """
    snippet_list = []

    for data, md in recording_list:
        C, N = data.shape
        L = N // num_snippets
        for i in range(num_snippets):
            start = i * L
            end = (i + 1) * L
            snippet = data[:, start:end]

            # copy the metadata, adding a snippet index
            snippet_md = md.copy()
            snippet_md["snippet_idx"] = i
            snippet_list.append((snippet, snippet_md))
    return snippet_list
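For context, a hypothetical usage sketch (not part of this commit): slicing fake recordings into snippets and splitting them per label. The recording contents and counts are illustrative.

import numpy as np

# four fake 2-channel recordings, two per modulation
records = [
    (np.random.randn(2, 1024).astype(np.float32), {"recid": i, "modulation": mod})
    for i, mod in enumerate(["bpsk", "bpsk", "qpsk", "qpsk"])
]

snippets = split_recording(records, num_snippets=4)  # 16 snippets of shape (2, 256)
train, val = split(snippets, train_frac=0.8, seed=42)
print(len(snippets), len(train), len(val))  # 16 8 8: one recording per label in each split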