class Annotation:
    """A labelled time-frequency bounding box within a recording.

    Signal annotations are labels or additional information associated with specific data points or segments within
    a signal. These annotations could be used for tasks like supervised learning, where the goal is to train a model
    to recognize patterns or characteristics in the signal associated with these annotations.

    Annotations can be used to label interesting points in your recording.

    :param sample_start: The index of the starting sample of the annotation.
    :type sample_start: int
    :param sample_count: The number of samples covered by the annotation, counted from ``sample_start``.
    :type sample_count: int
    :param freq_lower_edge: The lower frequency of the annotation.
    :type freq_lower_edge: float
    :param freq_upper_edge: The upper frequency of the annotation.
    :type freq_upper_edge: float
    :param label: The label that will be displayed with the bounding box in compatible viewers including IQEngine.
        Defaults to an empty string.
    :type label: str, optional
    :param comment: A human-readable comment. Defaults to an empty string.
    :type comment: str, optional
    :param detail: A dictionary of user defined annotation-specific metadata. Defaults to None, in which case an
        empty dictionary is stored.
    :type detail: dict, optional

    :raises ValueError: If ``detail`` is not JSON serializable.
    """

    def __init__(
        self,
        sample_start: int,
        sample_count: int,
        freq_lower_edge: float,
        freq_upper_edge: float,
        label: Optional[str] = "",
        comment: Optional[str] = "",
        detail: Optional[dict] = None,
    ):
        """Initialize a new Annotation instance."""
        # Coerce eagerly so that later arithmetic and JSON export see plain Python types.
        self.sample_start = int(sample_start)
        self.sample_count = int(sample_count)
        self.freq_lower_edge = float(freq_lower_edge)
        self.freq_upper_edge = float(freq_upper_edge)
        self.label = str(label)
        self.comment = str(comment)

        if detail is None:
            self.detail = {}
        elif not _is_jsonable(detail):
            raise ValueError(f"Detail object is not json serializable: {detail}")
        else:
            self.detail = detail

    def is_valid(self) -> bool:
        """
        Check that the annotation sample count is > 0 and that freq_lower_edge < freq_upper_edge.

        :returns: True if both conditions hold, False otherwise.
        """
        return self.sample_count > 0 and self.freq_lower_edge < self.freq_upper_edge

    def overlap(self, other: "Annotation") -> float:
        """
        Quantify how much the bounding box in this annotation overlaps with another annotation.

        :param other: The other annotation.
        :type other: Annotation

        :returns: The area of the overlap in samples*frequency, or 0 if they do not overlap.
        """
        # The sample extent of an annotation is the half-open range [sample_start, sample_start + sample_count).
        sample_overlap_start = max(self.sample_start, other.sample_start)
        sample_overlap_end = min(self.sample_start + self.sample_count, other.sample_start + other.sample_count)

        freq_overlap_start = max(self.freq_lower_edge, other.freq_lower_edge)
        freq_overlap_end = min(self.freq_upper_edge, other.freq_upper_edge)

        # Boxes that merely touch along an edge have zero overlap.
        if freq_overlap_start >= freq_overlap_end or sample_overlap_start >= sample_overlap_end:
            return 0

        return (sample_overlap_end - sample_overlap_start) * (freq_overlap_end - freq_overlap_start)

    def area(self) -> float:
        """
        The 'area' of the bounding box, samples*frequency.
        Useful to quantify annotation size.

        :returns: sample length multiplied by bandwidth.
        """
        return self.sample_count * (self.freq_upper_edge - self.freq_lower_edge)

    def __eq__(self, other: object) -> bool:
        """Two annotations are equal when all of their attributes are equal.

        Returns ``NotImplemented`` for non-Annotation operands so that ``annotation == 5`` evaluates to False
        instead of raising ``AttributeError``. Because ``__eq__`` is defined without ``__hash__``, instances are
        unhashable.
        """
        if not isinstance(other, Annotation):
            return NotImplemented
        return self.__dict__ == other.__dict__

    def to_sigmf_format(self):
        """
        Returns a JSON dictionary representing this annotation formatted to be saved in a .sigmf-meta file.

        :raises ValueError: If the resulting dictionary is not JSON serializable.
        """
        annotation_dict = {SigMFFile.START_INDEX_KEY: self.sample_start, SigMFFile.LENGTH_INDEX_KEY: self.sample_count}

        annotation_dict["metadata"] = {
            SigMFFile.LABEL_KEY: self.label,
            SigMFFile.COMMENT_KEY: self.comment,
            SigMFFile.FHI_KEY: self.freq_upper_edge,
            SigMFFile.FLO_KEY: self.freq_lower_edge,
            "ria:detail": self.detail,
        }

        # ``detail`` is mutable, so it may have been changed since it was validated in ``__init__``.
        if not _is_jsonable(annotation_dict):
            raise ValueError("Annotation dictionary was not json serializable.")

        return annotation_dict


def _is_jsonable(x: Any) -> bool:
    """
    :return: True if x is JSON serializable, False otherwise.
    """
    try:
        json.dumps(x)
        return True
    except (TypeError, OverflowError):
        return False
class DatasetBuilder(ABC):
    """Abstract interface for radio dataset builders. These builders produce radio datasets for common and project
    datasets related to radio science.

    This class should not be instantiated directly. Instead, subclass it to define specific builders for different
    datasets.

    The checksum attributes (``_SHA256``, ``_MD5``) and ``_latest_version`` are optional and default to None, so the
    corresponding properties and :attr:`info` are always safe to read.
    """

    # NOTE(review): attributes assigned ``abstract_attribute()`` are presumably required overrides in subclasses;
    # confirm against the helper's implementation.
    _url: str = abstract_attribute()
    _SHA256: Optional[str] = None  # SHA256 checksum, or None if not set.
    _MD5: Optional[str] = None  # MD5 checksum, or None if not set.
    _name: str = abstract_attribute()
    _author: str = abstract_attribute()
    _license: DatasetLicense = abstract_attribute()
    _version: Version = abstract_attribute()
    _latest_version: Optional[Version] = None

    def __init__(self):
        super().__init__()

    @property
    def name(self) -> str:
        """
        :return: The name of the dataset.
        :type: str
        """
        return self._name

    @property
    def author(self) -> str:
        """
        :return: The author of the dataset.
        :type: str
        """
        return self._author

    @property
    def url(self) -> str:
        """
        :return: The URL where the dataset was accessed.
        :type: str
        """
        return self._url

    @property
    def sha256(self) -> Optional[str]:
        """
        :return: The SHA256 checksum, or None if not set.
        :type: str or None
        """
        return self._SHA256

    @property
    def md5(self) -> Optional[str]:
        """
        :return: The MD5 checksum, or None if not set.
        :type: str or None
        """
        return self._MD5

    @property
    def version(self) -> Version:
        """
        :return: The version identifier of the dataset.
        :type: Version Identifier
        """
        return self._version

    @property
    def latest_version(self) -> Optional[Version]:
        """
        :return: The version identifier of the latest available version of the dataset, or None if not set.
        :type: Version Identifier or None
        """
        return self._latest_version

    @property
    def license(self) -> DatasetLicense:
        """
        :return: The dataset license information.
        :type: DatasetLicense
        """
        return self._license

    @property
    def info(self) -> dict[str, Any]:
        """
        :return: Information about the dataset including the name, author, and version of the dataset.
        :rtype: dict
        """
        # TODO: We should increase the amount of information that's included here. See the information included in
        #  tfds.core.DatasetInfo for more: https://www.tensorflow.org/datasets/api_docs/python/tfds/core/DatasetInfo.
        return {
            "name": self.name,
            "author": self.author,
            "url": self.url,
            "sha256": self.sha256,
            "md5": self.md5,
            "version": self.version,
            "license": self.license,
            "latest_version": self.latest_version,
        }

    @abstractmethod
    def download_and_prepare(self) -> None:
        """Download and prepare the dataset for use as an HDF5 source file.

        Once an HDF5 source file has been prepared, the downloaded files are deleted.
        """
        pass

    @abstractmethod
    def as_dataset(self, backend: str) -> RadioDataset:
        """A factory method to manage the creation of radio datasets.

        :param backend: Backend framework to use ("pytorch" or "tensorflow").
        :type backend: str

        Note: Depending on your installation, not all backends may be available.

        :return: A new RadioDataset based on the signal representation and specified backend.
        :rtype: RadioDataset
        """
        pass
def append_entry_inplace(source: str | os.PathLike, dataset_path: str, entry: np.ndarray) -> None:
    """
    Grow the given dataset of an HDF5 file by one row and store ``entry`` in that new last row. The file is
    modified inplace.

    :param source: The name of the source HDF5 file.
    :type source: str or os.PathLike
    :param dataset_path: The path of the dataset from the root of the file.
    :type dataset_path: str
    :param entry: The entry to append.
    :type entry: np.ndarray

    :return: None
    """
    # TODO: Generalize so that source can be file object or string
    with h5py.File(source, "a") as h5_file:
        target = h5_file[dataset_path]
        # The current length is also the index of the row created by the resize below.
        new_row = target.shape[0]
        target.resize(new_row + 1, axis=0)
        target[new_row] = entry
def copy_file(original_source: str | os.PathLike, new_source: str | os.PathLike) -> None:
    """Copies contents of source HDF5 file to a new HDF5 file.

    Both files are managed by a single ``with`` statement, so the source handle is released even if the copy
    fails part-way.

    NOTE: Only the top-level members of the source file are copied; attributes attached to the root group itself
    are not.

    :param original_source: The name of the original HDF5 source file.
    :type original_source: str or os.PathLike
    :param new_source: The copy of the HDF5 source file. Any existing file at this path is overwritten.
    :type new_source: str or os.PathLike

    :return: None
    """
    with h5py.File(original_source, "r") as original_file, h5py.File(new_source, "w") as new_file:
        for key in original_file.keys():
            original_file.copy(key, new_file)
def delete_example_inplace(source: str | os.PathLike, idx: int) -> None:
    """Deletes an example and its corresponding metadata located at the given index.
    This deletion is done by creating a temporary dataset and copying all contents
    to the temporary dataset except for the example at idx. This operation is inplace.

    :param source: The name of the source HDF5 file.
    :type source: str or os.PathLike
    :param idx: The index of the example and metadata to be deleted.
    :type idx: int

    :raises IndexError: If idx does not refer to an existing example.
    :raises ValueError: If the data and metadata datasets do not have the same length.

    :return: None
    """

    with h5py.File(source, "a") as f:
        ds, md = f["data"], f["metadata/metadata"]
        m, c, n = ds.shape

        # Explicit checks rather than ``assert``, which is stripped when Python runs with -O.
        if not 0 <= idx <= m - 1:
            raise IndexError(f"Example index {idx} is out of range for a dataset with {m} examples.")
        if len(ds) != len(md):
            raise ValueError(f"Dataset has {len(ds)} examples but {len(md)} metadata entries.")

        new_ds = f.create_dataset(
            "data.temp",
            shape=(m - 1, c, n),
            chunks=True,
            dtype=ds.dtype,
            maxshape=(None, None, None),  # Required to allow future mutations which expand the shape
        )
        new_md = f.create_dataset(
            "metadata/metadata.temp", shape=(m - 1,), chunks=True, dtype=md.dtype, maxshape=(None,)
        )

        # Rows before idx keep their position ...
        for row in range(idx):
            new_ds[row], new_md[row] = ds[row], md[row]

        # ... and rows after idx shift up by one.
        for row in range(idx + 1, m):
            new_ds[row - 1], new_md[row - 1] = ds[row], md[row]

        # Swap the temporary datasets in under the original names.
        del f["data"]
        del f["metadata/metadata"]

        f.move("data.temp", "data")
        f.move("metadata/metadata.temp", "metadata/metadata")
class IQDataset(RadioDataset, ABC):
    """An ``IQDataset`` is a ``RadioDataset`` tailored for machine learning tasks that involve processing
    radiofrequency (RF) signals represented as In-phase (I) and Quadrature (Q) samples.

    For machine learning tasks that involve processing spectrograms, please use
    utils.data.datasets.SpectDataset instead.

    This is an abstract interface defining common properties and behaviour of IQDatasets. Therefore, this class
    should not be instantiated directly. Instead, it is subclassed to define custom interfaces for specific machine
    learning backends.

    :param source: Path to the dataset source file. For more information on dataset source files
        and their format, see :doc:`radio_datasets`.
    :type source: str or os.PathLike
    """

    def __init__(self, source: str | os.PathLike):
        """Create a new IQDataset."""
        super().__init__(source=source)

    @property
    def shape(self) -> tuple[int]:
        """IQ datasets are M x C x N, where M is the number of examples, C is the number of channels, N is the length
        of the signals.

        :return: The shape of the dataset. The elements of the shape tuple give the lengths of the corresponding
            dataset dimensions.
        :type: tuple of ints
        """
        return super().shape

    def trim_examples(
        self, trim_length: int, keep: Optional[str] = "start", inplace: Optional[bool] = False
    ) -> IQDataset | None:
        """Trims all examples in a dataset to a desired length.

        :param trim_length: The desired length of the trimmed examples.
        :type trim_length: int
        :param keep: Specifies the part of the example to keep. Defaults to "start".
            The options are:
            - "start"
            - "end"
            - "middle"
            - "random"
        :type keep: str, optional
        :param inplace: If True, the operation modifies the existing source file directly and returns None.
            If False, the operation creates a new dataset object and corresponding source file, leaving the original
            dataset unchanged. Default is False.
        :type inplace: bool

        :raises ValueError: If trim_length is greater than or equal to the length of the examples.
        :raises ValueError: If value of keep is not recognized.
        :raises ValueError: If specified trim length is invalid for middle index.

        :return: The dataset that is composed of shorter examples, or None if ``inplace`` is True.
        :rtype: IQDataset or None

        **Examples:**

        >>> from ria.dataset_manager.builders import AWGN_Builder
        >>> builder = AWGN_Builder()
        >>> builder.download_and_prepare()
        >>> ds = builder.as_dataset()
        >>> ds.shape
        (5, 1, 3)
        >>> new_ds = ds.trim_examples(2)
        >>> new_ds.shape
        (5, 1, 2)
        """

        keep = keep.lower()

        _, example_length = np.shape(self[0])

        if trim_length >= example_length:
            raise ValueError(f"Trim length must be less than {example_length}")

        if keep not in {"start", "end", "middle", "random"}:
            raise ValueError('keep must be "start", "end", "middle", or "random"')

        start = None
        if keep == "middle":
            # NOTE(review): the window *starts* at the midpoint rather than being centred on it, so any
            # trim_length larger than half the example is rejected. Kept as-is because the ValueError is part of
            # the documented contract; confirm whether a centred window was intended.
            start = example_length // 2
            if start + trim_length > example_length:
                raise ValueError(f"Trim length of {trim_length} is invalid for middle index of: {start} ")

        elif keep == "random":
            # A single random offset is drawn and shared by every example.
            start = np.random.randint(0, example_length - trim_length + 1)

        ds = None if inplace else self._create_next_dataset(example_length=trim_length)

        # The source is only written to when trimming inplace; otherwise open it read-only.
        with h5py.File(self.source, "a" if inplace else "r") as f:
            data = f["data"]
            for idx in range(len(self)):

                trimmed_example = generate_trimmed_example(
                    example=data[idx],
                    keep=keep,
                    trim_length=trim_length,
                    start=start,
                )

                if inplace:
                    # Rows still have the original length until the final resize, so zero-pad on the right.
                    data[idx] = np.pad(
                        trimmed_example, ((0, 0), (0, example_length - trim_length)), "constant", constant_values=0
                    )
                else:
                    append_entry_inplace(source=ds.source, dataset_path="data", entry=trimmed_example)
                    copy_dataset_entry_by_index(
                        source=self.source, destination=ds.source, dataset_path="metadata/metadata", idx=idx
                    )

            if inplace:
                # Drop the padding. This has to happen while the file is still open: the dataset handle is
                # invalid once the ``with`` block exits.
                data.resize(trim_length, axis=2)

        return ds

    def split_examples(
        self, split_factor: Optional[int] = None, example_length: Optional[int] = None, inplace: Optional[bool] = False
    ) -> IQDataset | None:
        """If the current example length is not evenly divisible by the provided example_length, excess samples are
        discarded. Excess examples are always at the end of the slice. If the split factor results in non-integer
        example lengths for the new example chunks, it rounds down.

        Requires either split_factor or example_length to be specified but not both. If both are provided,
        split factor will be used by default, and a warning will be issued.

        .. note:: This method is not implemented yet and always raises ``NotImplementedError``.

        :param split_factor: the number of new example chunks produced from each original example, defaults to None.
        :type split_factor: int, optional
        :param example_length: the example length of the new example chunks, defaults to None.
        :type example_length: int, optional
        :param inplace: If True, the operation modifies the existing source file directly and returns None.
            If False, the operation creates a new dataset object and corresponding source file, leaving the original
            dataset unchanged. Default is False.
        :type inplace: bool, optional

        :raises NotImplementedError: Always.

        :return: A dataset with more examples that are shorter.
        :rtype: IQDataset

        **Examples:**

        If the dataset has 100 examples of length 1024 and the split factor is 2, the resulting dataset
        will have 200 examples of 512. No samples have been discarded.

        If the example dataset has 100 examples of length 1024 and the example length is 100, the resulting dataset
        will have 1000 examples of length 100. The remaining 24 samples from each example have been discarded.
        """

        if split_factor is not None and example_length is not None:
            # Issue a real warning (as documented) instead of raising ``Warning`` as an exception.
            import warnings  # Local import: this module does not import ``warnings`` at file level.

            warnings.warn("split_factor and example_length should not both be specified.", stacklevel=2)

        raise NotImplementedError("split_examples is not implemented yet.")


def generate_trimmed_example(
    example: np.ndarray, keep: str, trim_length: int, start: Optional[int] = None
) -> np.ndarray:
    """Takes in an IQ example as input and returns a trimmed example.

    The example is sliced along its second axis; the first axis is left untouched.

    :param example: The example to be trimmed.
    :type example: np.ndarray
    :param keep: The position the trimming occurs from. "start" keeps the first ``trim_length`` samples, "end"
        keeps the last ``trim_length`` samples, and any other value keeps ``trim_length`` samples beginning at
        ``start``.
    :type keep: str
    :param trim_length: The desired length of the trimmed example.
    :type trim_length: int
    :param start: The starting index if keep = "middle" or "random"
    :type start: int, optional

    :raises ValueError: If ``start`` is required for the given ``keep`` but was not provided.

    :return: The trimmed example
    :rtype: np.ndarray
    """

    if keep == "start":
        return example[:, :trim_length]

    if keep == "end":
        # Use an explicit start index: ``example[:, -0:]`` would return the whole example, not an empty one.
        return example[:, max(example.shape[1] - trim_length, 0) :]

    if start is None:
        raise ValueError(f'start must be provided when keep is "{keep}"')

    return example[:, start : start + trim_length]
+ +""" + +__all__ = [ + "DatasetLicense", + "PUBLIC_DOMAIN", + "CC_0", + "CC_BY", + "CC_BY_NC", + "CC_BY_NC_ND", + "CC_BY_NC_SA", + "CC_BY_ND", + "CC_BY_SA", + "ODC_BY", + "ODC_PDDL", + "ODC_ODbL", + "RESTRICTED", +] + +from .dataset_license import DatasetLicense + +PUBLIC_DOMAIN = DatasetLicense( + name="Public Domain (No License)", + identifier=None, + description="Technically not a license, the public domain mark relinquishes all rights to a dataset and " + "dedicates the dataset to the public domain.", + licence="https://creativecommons.org/public-domain/pdm/", +) +""" +`Public Domain `_: Technically not a license, the public domain mark +relinquishes all rights to a dataset and dedicates the dataset to the public domain. +""" + + +CC_0 = DatasetLicense( + name="Creative Commons Public Domain Dedication", + identifier="CC0-1.0", + description="A Creative Commons license and is like a public domain dedication. The copyright holder " + "surrenders rights in a dataset using this license.", + licence="https://creativecommons.org/publicdomain/zero/1.0/", +) +""" +`Creative Commons Public Domain Dedication `_: A Creative Commons +license and is like a public domain dedication. The copyright holder surrenders rights in a dataset using this license. +""" + + +ODC_PDDL = DatasetLicense( + name="Open Data Commons Public Domain Dedication and License", + identifier="PDDL-1.0", + description="This license is one of the Open Data Commons licenses and is like a public domain dedication. " + "The copyright holder surrenders rights in a dataset using this license.", + licence="https://opendatacommons.org/licenses/pddl/", +) +""" +`Open Data Commons Public Domain Dedication and License `_: This license +is one of the Open Data Commons licenses and is like a public domain dedication. The copyright holder surrenders rights +in a dataset using this license. 
+""" + + +CC_BY = DatasetLicense( + name="Creative Commons Attribution 4.0 International", + identifier="CC-BY-4.0", + description="This license is one of the open Creative Commons licenses and allows users to share and adapt " + "the dataset so long as they give credit to the copyright holder.", + licence="https://creativecommons.org/licenses/by/4.0/", +) +""" +`Creative Commons Attribution 4.0 International `_: This license is one +of the open Creative Commons licenses and allows users to share and adapt the dataset so long as they give credit to +the copyright holder. +""" + + +ODC_BY = DatasetLicense( + name="Open Data Commons Attribution License", + identifier="ODC-By-1.0", + description="This license is one of the Open Data Commons licenses and allows users to share and adapt the " + "dataset as long as they give credit to the copyright holder.", + licence="https://opendatacommons.org/licenses/by/", +) +""" +`Open Data Commons Attribution License `_: This license is one of the Open +Data Commons licenses and allows users to share and adapt the dataset as long as they give credit to the copyright +holder. +""" + + +CC_BY_SA = DatasetLicense( + name="Creative Commons Attribution-ShareAlike 4.0 International", + identifier="CC-BY-SA-4.0", + description="This license is one of the open Creative Commons licenses and allows users to share and adapt " + "the dataset as long as they give credit to the copyright holder and distribute any additions, " + "transformations or changes to the dataset under this same license.", + licence="https://creativecommons.org/licenses/by-sa/4.0/", +) +""" +`Creative Commons Attribution-ShareAlike 4.0 International `_: This +license is one of the open Creative Commons licenses and allows users to share and adapt the dataset as long as they +give credit to the copyright holder and distribute any additions, transformations or changes to the dataset under +this same license. 
+"""
+
+
+ODC_ODbL = DatasetLicense(
+    name="Open Data Commons Open Database License",
+    identifier="ODbL-1.0",
+    description="This license is one of the Open Data Commons licenses and allows users to share and adapt the "
+    "dataset as long as they give credit to the copyright holder and distribute any additions, "
+    "transformation or changes to the dataset.",
+    licence="https://opendatacommons.org/licenses/odbl/",
+)
+"""
+`Open Data Commons Open Database License <https://opendatacommons.org/licenses/odbl/>`_: This license is one of the
+Open Data Commons licenses and allows users to share and adapt the dataset as long as they give credit to the copyright
+holder and distribute any additions, transformation or changes to the dataset.
+"""
+
+
+CC_BY_NC = DatasetLicense(
+    name="Creative Commons Attribution-NonCommercial 4.0 International",
+    identifier="CC-BY-NC-4.0",
+    description="This license is one of the Creative Commons licenses and allows users to share and adapt the "
+    "dataset if they give credit to the copyright holder and do not use the dataset for any "
+    "commercial purposes.",
+    licence="https://creativecommons.org/licenses/by-nc/4.0/",
+)
+"""
+`Creative Commons Attribution-NonCommercial 4.0 International <https://creativecommons.org/licenses/by-nc/4.0/>`_: This
+license is one of the Creative Commons licenses and allows users to share and adapt the dataset if they give credit to
+the copyright holder and do not use the dataset for any commercial purposes.
+"""
+
+
+CC_BY_ND = DatasetLicense(
+    name="Creative Commons Attribution-NoDerivatives 4.0 International",
+    identifier="CC-BY-ND-4.0",
+    description="This license is one of the Creative Commons licenses and allows users to share the dataset if "
+    "they give credit to copyright holder, but they cannot make any additions, transformations or "
+    "changes to the dataset under this license.",
+    licence="https://creativecommons.org/licenses/by-nd/4.0/",
+)
+"""
+`Creative Commons Attribution-NoDerivatives 4.0 International <https://creativecommons.org/licenses/by-nd/4.0/>`_: This
+license is one of the Creative Commons licenses and allows users to share the dataset if they give credit to copyright
+holder, but they cannot make any additions, transformations or changes to the dataset under this license.
+"""
+
+
+CC_BY_NC_SA = DatasetLicense(
+    name="Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International",
+    identifier="CC-BY-NC-SA-4.0",
+    description="This license is one of the Creative Commons licenses and allows users to share the dataset only "
+    "if they (1) give credit to the copyright holder, (2) do not use the dataset for any commercial "
+    "purposes, and (3) distribute any additions, transformations or changes to the dataset under this "
+    "same license.",
+    licence="https://creativecommons.org/licenses/by-nc-sa/4.0/",
+)
+"""
+`Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
+<https://creativecommons.org/licenses/by-nc-sa/4.0/>`_: This license is one of the Creative Commons licenses and allows
+users to share the dataset only if they (1) give credit to the copyright holder, (2) do not use the dataset for any
+commercial purposes, and (3) distribute any additions, transformations or changes to the dataset under this same
+license.
+"""
+
+
+CC_BY_NC_ND = DatasetLicense(
+    name="Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International",
+    identifier="CC-BY-NC-ND-4.0",
+    description="This license is one of the Creative Commons licenses and allows users to use only your "
+    "unmodified dataset if they give credit to the copyright holder and do not share it for "
+    "commercial purposes. Users cannot make any additions, transformations or changes to the dataset "
+    "under this license.",
+    licence="https://creativecommons.org/licenses/by-nc-nd/4.0/",
+)
+"""
+`Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International
+<https://creativecommons.org/licenses/by-nc-nd/4.0/>`_: This license is one of the Creative Commons licenses and allows
+users to use only your unmodified dataset if they give credit to the copyright holder and do not share it for
+commercial purposes. Users cannot make any additions, transformations or changes to the dataset under this license.
+"""
+
+
+RESTRICTED = DatasetLicense(
+    name="Restricted (All Rights Reserved)",
+    identifier="Restricted",
+    description="All rights reserved. No permissions granted for use, modification, or distribution of the dataset.",
+    licence="Restricted (All Rights Reserved)",
+)
+"""
+Restricted (All Rights Reserved): No permissions granted for use, modification, or distribution of the dataset.
+"""
diff --git a/src/ria_toolkit/adt/datasets/license/dataset_license.py b/src/ria_toolkit/adt/datasets/license/dataset_license.py
new file mode 100644
index 0000000..75b5b0e
--- /dev/null
+++ b/src/ria_toolkit/adt/datasets/license/dataset_license.py
@@ -0,0 +1,17 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass
+class DatasetLicense:
+    """
+    Represents a dataset license.
+
+    The ``licence`` field keeps its British spelling; it is part of the constructor's keyword interface.
+    """
+
+    name: str  #: The name or title of the license.
+    identifier: str | None  #: SPDX short identifier, or None if one does not exist.
+    description: str  #: A description of the license.
+    licence: str  #: Full license text or URL if the license is available online.
diff --git a/src/ria_toolkit/adt/datasets/radio_dataset.py b/src/ria_toolkit/adt/datasets/radio_dataset.py new file mode 100644 index 0000000..f8c1b0d --- /dev/null +++ b/src/ria_toolkit/adt/datasets/radio_dataset.py @@ -0,0 +1,1081 @@ +from __future__ import annotations + +import os +import pathlib +import re +from abc import ABC, abstractmethod +from collections import Counter +from typing import Iterator, Optional + +import h5py +import numpy as np +import pandas as pd +from numpy.typing import ArrayLike + +from utils.data.datasets.h5helpers import ( + append_entry_inplace, + copy_file, + copy_over_example, + delete_example_inplace, + duplicate_entry_inplace, + make_empty_clone, +) + + +class RadioDataset(ABC): + """A radio dataset is an iterable dataset designed for machine learning applications in radio signal + processing and analysis. They are a structured collections of examples in a machine learning-ready format, + with associated metadata. + + This is an abstract interface defining common properties and behavior of radio datasets. Therefore, this class + should not be instantiated directly. Instead, it should be subclassed to define specific interfaces for different + types of radio datasets. For example, see utils.data.datasets.IQDataset, which is a radio dataset + subclass tailored for tasks involving the processing of radio signals represented as IQ (In-phase and Quadrature) + samples. + + :param source: Path to the dataset source file. For more information on dataset source files + and their format, see :doc:`radio_datasets`. + :type source: str or os.PathLike + """ + + def __init__(self, source: str | os.PathLike): + """Create a new RadioDataset.""" + if not h5py.is_hdf5(source): + raise ValueError(f"Dataset source must be HDF5, {source} is not.") + + # TODO: Check to see if source is RIA dataset, and let them know otherwise they should use dataset builder + # utilities to generate a dataset compatible with the RIA framework. 
+ + self._source = pathlib.Path(source) + self._index = 0 + + @property + def source(self) -> pathlib.Path: + """ + :return: Path to the dataset source file. + :type: pathlib.Path + """ + return self._source + + @property + def shape(self) -> tuple[int]: + """ + :return: The shape of the dataset. The elements of the shape tuple give the lengths of the corresponding + dataset dimensions. + :type: tuple of ints + """ + with h5py.File(self.source, "r") as f: + return f["data"].shape + + @property + def data(self) -> np.ndarray: + """Retrieve the data from the source file. + + .. note:: + + Accessing this property reads all the data from the source file into memory as a NumPy array, which can + consume significant amounts of memory and potentially degrade performance. Instead, use the + ``RadioDataset`` class methods to process and manipulate the dataset source file. You can read individual + examples into memory as NumPy arrays by indexing the dataset: ``RadioDataset[idx]``. + + :return: The dataset examples as a single NumPy array. + :type: np.ndarray + """ + with h5py.File(self.source, "r") as f: + return f["data"][:] + + @property + def metadata(self) -> pd.DataFrame: + """Retrieve the metadata from the source file. + + .. note:: + + Accessing this property reads all the metadata from the source file into memory as a Pandas DataFrame. + + :return: The dataset metadata as a Pandas DataFrame. + :type: pd.DataFrame + """ + with h5py.File(self.source, "r") as f: + return pd.DataFrame(f["metadata/metadata"][:]).map(decode_bytes) + + @property + def labels(self) -> list[str]: + """Retrieves the metadata labels from the dataset file. + + :return: A list of metadata column headers. 
+ :rtype: list of strings + + **Examples:** + + >>> awgn_builder = AWGN_Builder() + >>> awgn_builder.download_and_prepare() + >>> ds = awgn_builder.as_dataset(backend="pytorch") + >>> print(ds.labels) + ['rec_id', 'modulation', 'snr'] + """ + with h5py.File(self.source, "r") as f: + return [name for name, _ in f["metadata/metadata"].dtype.fields.items()] + + @abstractmethod + def inspect(self): + """ + .. todo:: This method is not yet fully conceptualized. Likely, it will wrap some of the functionality in the + Dataset Inspector package (dataset_manager.inspector) to produce an image or visualization. However, + the Dataset Inspector package is not yet implemented. + """ + # TODO: Implement in subclass based on https://github.com/qoherent/QDM/blob/main/inspection_utils/inspector.py + # Consider removing moving into the dataset builder. + pass + + @abstractmethod + def default_augmentations(self) -> list[callable]: + """Returns a list of default augmentations. + + :return: A list of default augmentations. + :rtype: list of callable + """ + pass + + def augment( # noqa: C901 # TODO: Simplify function + self, + class_key: str, + augmentations: Optional[callable | list[callable]] = None, + level: Optional[float | list[float]] = 1.0, + target_size: Optional[int | list[int]] = None, + classes_to_augment: Optional[str | list[str]] = None, + inplace: Optional[bool] = False, + ) -> RadioDataset | None: + """Supplement the dataset with new examples by applying various transformations to the pre-existing examples + in the dataset. + + .. todo:: This method is currently under construction, and may produce unexpected results. + + The process of supplementing a dataset to artificially increases the diversity of the examples is called + augmentation. In many cases, training on augmented data can enhance the generalization and robustness of + deep machine learning models. 
For more information on the benefits and limitations of data + augmentation, please refer to this tutorial by Abid Ali Awan: `A Complete Guide to Data Augmentation + `_. + + The metadata for each new example will be identical to the metadata of the pre-existing example from + which it was generated. However, the metadata will be extended to include a 'augmentation' column, which will + be populated for each new example with the string representation of the transform used to generate it, and left + empty for all the pre-existing examples. + + Please note that augmented data should only be utilized for model training, not for testing or validation. + + Unless specified, augmentations are applied equally across classes, maintaining the original class + distribution. + + In the case where target_size is not equal to the sum of the original class sizes scaled by an integer + multiple, it is not possible to maintain the original class distribution, so the distribution gets slightly + skewed to satisfy target_size. To do this, each class size gets divided by the total size and then + multiplied by target_size, then these values all get rounded to the nearest integers. If the target_size is + not equal to the sum of the rounded sizes, the sizes get sorted based on their decimal portions and then + values are adjusted one by one until the target_size is reached. + + :param class_key: Class name that is used to augment from and calculate class distribution. + :type class_key: str + + :param augmentations: A function or a list of functions that take as input an example from the + dataset and return a transformed version of that example. If no augmentations are specified, the default + augmentations returned by the ``default_augmentations()`` method will be applied. 
+ :type augmentations: callable or list of callables, optional + + :param level: The level or extent of data augmentation to apply, ranging from 0.0 (no augmentation) to + 1.0 (full augmentation, where each augmentation is applied to each pre-existing example). + |br| |br| If ``classes_to_augment`` is specified, this can be either: + * A single float: + All classes are augmented evenly to this level, maintaining the original class distribution. + * A list of floats: + Each element corresponds to the augmentation level target for the corresponding class. + The default is 1.0. + :type level: float or list of floats, optional + + :param target_size: Target size of the augmented dataset. If specified, ``level`` is ignored, and augmentations + are applied to expand the dataset to contain the specified number of examples. + If ``classes_to_augment`` is specified, this can be either: + * A single float: + All classes are augmented proportional to their relative frequency until the dataset reaches the + target size, maintaining the original class distribution. + * A list of floats: + Each element in the list corresponds to the target size for the corresponding class. + Defaults to None. + :type target_size: int or list of ints, optional + + :param classes_to_augment: List of the metadata keys of the classes to augment. If specified, only these + classes will be augmented. Defaults to None. + :type classes_to_augment: string or list of strings, optional + + :param inplace: If True, the augmentation is performed inplace and ``None`` is returned. Defaults to False. + :type inplace: bool, optional + + :raises ValueError: If level has any values that are not in the range (0,1]. + :raises ValueError: If target_size of dataset is already sufficed. + :raises ValueError: If a class name in classes_to_augment does not exist in the specified class_key. + + :return: The augmented dataset or None if ``inplace=True``. 
+ :rtype: RadioDataset or None + + **Examples:** + + >>> from ria.dataset_manager.builders import AWGN_Builder() + >>> builder = AWGN_Builder() + >>> builder.download_and_prepare() + >>> ds = builder.as_dataset() + >>> ds.get_class_sizes(class_key='col') + {a:100, b:500, c:300} + >>> new_ds = ds.augment(class_key='col', classes_to_augment=['a', 'b'], target_size=1200) + >>> new_ds.get_class_sizes(class_key='col') + {a:150 b:750, c:300} + + >>> from ria.dataset_manager.builders import AWGN_Builder() + >>> builder = AWGN_Builder() + >>> builder.download_and_prepare() + >>> ds = builder.as_dataset() + >>> ds.get_class_sizes(class_key='col') + {a:50, b:20, c:130} + >>> new_ds = ds.augment(class_key='col', level=0.5) + >>> new_ds.get_class_sizes(class_key='col') + {a:75 b:30, c:195} + """ + + if augmentations is None: + augmentations = self.default_augmentations() + + if not isinstance(augmentations, list): + augmentations = [augmentations] + + if isinstance(level, list): + for i in level: + if i <= 0 or i > 1: + raise ValueError("level must be in this range: (0,1]") + else: + if level <= 0 or level > 1: + raise ValueError("level must be in this range: (0,1]") + + class_sizes = self.get_class_sizes(class_key=class_key) + + if isinstance(target_size, int) and target_size <= sum(class_sizes.values()): + raise ValueError("target_size must be greater than the total sum of the current class sizes.") + + # Encode class names to byte strings and check if all class names exist in class key + if classes_to_augment is not None: + if isinstance(classes_to_augment, list): + classes_to_augment = [cls_name.encode("utf-8") for cls_name in classes_to_augment] + for i in classes_to_augment: + if i not in class_sizes: + raise ValueError(f"class name of {i} does not belong to the class key of {class_key}") + else: + classes_to_augment = classes_to_augment.encode("utf-8") + if classes_to_augment not in class_sizes: + raise ValueError(f"class name of {i} does not belong to the class key of 
{class_key}") + + result_sizes = get_result_sizes( + level=level, target_size=target_size, classes_to_augment=classes_to_augment, class_sizes=class_sizes + ) + + if "augmentations" not in self.metadata.columns: + # Add metadata column to metadata + raise NotImplementedError + + # Create new dataset object in not inplace + if not inplace: + new_source = self._get_next_file_name() + copy_file(original_source=self.source, new_source=new_source) + ds = self.__class__(source=new_source) + else: + ds = self + + # Create a dict where each pair is the class name and a list of all indices of the examples of that class + indices_to_add = dict() + with h5py.File(ds.source, "a") as f: + class_labels = f["metadata/metadata"][class_key] + + for i in range(len(class_labels)): + current_class = class_labels[i] + if class_sizes[current_class] < result_sizes[current_class] and current_class not in indices_to_add: + indices_to_add[current_class] = [] + + if class_sizes[current_class] < result_sizes[current_class] and current_class in indices_to_add: + indices_to_add[current_class].append(i) + + for key in class_sizes: + if class_sizes[key] < result_sizes[key]: + # Generate a sublist which holds the indices of examples to be augmented + rand_idxs = np.random.choice(indices_to_add[key], result_sizes[key] - class_sizes[key], replace=True) + + aug_idx = 0 + + with h5py.File(ds.source, "a") as f: + data = f["data"] + metadata = f["metadata/metadata"] + for idx in rand_idxs: + rand_example = data[idx] + augmented_example = augmentations[aug_idx](rand_example) + + # Update corresponding metadata entry to contain the augmentation that was applied + original_metadata_entry = metadata[idx] + augmented_metadata_entry = original_metadata_entry.copy() + augmented_metadata_entry["augmentations"] = augmentations[aug_idx].__name__ + + # Update augmentation index after adding name of augmentation to metadata column + if aug_idx < len(augmentations) - 1: + aug_idx += 1 + else: + aug_idx = 0 + + 
append_entry_inplace(source=ds.source, dataset_path="data", entry=augmented_example) + append_entry_inplace( + source=ds.source, dataset_path="metadata/metadata", entry=augmented_metadata_entry + ) + + if not inplace: + return ds + + def subsample(self, class_key: str, percentage: float, inplace: Optional[bool] = False) -> RadioDataset | None: + """Reduces the number of examples in all classes of a dataset by randomly subsampling each class according + to a specified percentage. This function reduces the number of examples per class to the specified + percentage without affecting the overall class distribution. + + :param class_key: The name of the class to subsample. + :type class_key: str + :param percentage: The percentage of the original class sizes to keep. + :type percentage: float + :param inplace: If True, the operation modifies the existing source file directly and returns None. + If False, the operation creates a new dataset object and corresponding source file, leaving the original + dataset unchanged. Default is False. + :type inplace: bool, optional + + :raises ValueError: If the target size of the class with the lowest frequency goes to 0. + + :return: The subsampled dataset. 
+ :rtype: RadioDataset or None + + **Examples:** + + >>> from ria.dataset_manager.builders import AWGN_Builder() + >>> builder = AWGN_Builder() + >>> builder.download_and_prepare() + >>> ds = builder.as_dataset() + >>> ds.get_class_sizes(class_key="col") + {a:100, b:200, c:300} + >>> new_ds = ds.subsample(percentage=0.80, class_key="col") + >>> new_ds.get_class_sizes(class_key="col") + {a:80, b:160, c:240} + """ + class_sizes = self.get_class_sizes(class_key=class_key) + + channels, example_length = np.shape(self[0]) + target_sizes = dict() + for key in class_sizes: + target_sizes[key] = target_sizes.get(key, int(class_sizes[key] * percentage)) + + if min(target_sizes.values()) <= 0: + raise ValueError("Subsampling can not be performed on dataset because class size will equal 0") + + if not inplace: + ds = self._create_next_dataset(example_length=example_length) + + masks = dict() + for key in class_sizes: + masks[key] = masks.get( + key, np.array([1] * target_sizes[key] + [0] * (class_sizes[key] - target_sizes[key])) + ) + np.random.shuffle(masks[key]) + + counters = dict() + for key in class_sizes: + counters[key] = counters.get(key, 0) + + idx = 0 + with h5py.File(self.source, "a") as f: + while idx < len(self): + labels = f["metadata/metadata"][class_key] + current_class = labels[idx] + current_mask = masks[current_class] + current_mask_value = current_mask[counters[current_class]] + + counters[current_class] += 1 + + if not inplace and current_mask_value == 1: + copy_over_example(self.source, ds.source, idx) + + elif inplace and current_mask_value == 0: + delete_example_inplace(self.source, idx) + continue + + idx += 1 + + if not inplace: + return ds + + def resample(self, quantity_target: int, class_key: str, inplace: Optional[bool] = False) -> RadioDataset | None: + """Adjusts an unsampled dataset by changing the number of examples per class to a user-specified quantity. 
+ + For each class: + - If there are excess examples, it randomly subsamples the class to the quantity target. + - If there are less examples, it randomly duplicates examples to reach the quantity target. + + :param quantity_target: The number of examples each class should have. + :type quantity_target: int + :param class_key: The label of the class to resample. + :type class_key: str + :param inplace: If True, the operation modifies the existing source file directly and returns None. + If False, the operation creates a new dataset object and corresponding source file, leaving the original + dataset unchanged. Default is False. + :type inplace: bool, optional + + :return: The resampled dataset. + :rtype: RadioDataset or None + + **Examples:** + + >>> from ria.dataset_manager.builders import AWGN_Builder() + >>> builder = AWGN_Builder() + >>> builder.download_and_prepare() + >>> ds = builder.as_dataset() + >>> ds.get_class_sizes(class_key="col") + {a:100, b:200, c:300} + >>> new_ds = ds.resample(quantity_target=250, class_key="col") + >>> new_ds.get_class_sizes(class_key="col") + {a:250, b:250, c:250} + """ + + if not inplace: + ds = self.homogenize(class_key=class_key, example_limit=quantity_target) + else: + self.homogenize(class_key=class_key, example_limit=quantity_target, inplace=True) + ds = self + + class_sizes = ds.get_class_sizes(class_key=class_key) + + indices_to_add = dict() + with h5py.File(ds.source, "a") as f: + labels = f["metadata/metadata"][class_key] + + for i in range(len(labels)): + current_class = labels[i] + if class_sizes[current_class] < quantity_target and current_class not in indices_to_add: + indices_to_add[current_class] = [] + + if class_sizes[current_class] < quantity_target and current_class in indices_to_add: + indices_to_add[current_class].append(i) + + for key in indices_to_add.keys(): + rand_idxs = np.random.choice(indices_to_add[key], quantity_target - class_sizes[key], replace=True) + for idx in rand_idxs: + 
duplicate_entry_inplace(ds.source, "data", idx) + duplicate_entry_inplace(ds.source, "metadata/metadata", idx) + + if not inplace: + return ds + + def homogenize( + self, class_key: str, example_limit: Optional[int] = None, inplace: Optional[bool] = False + ) -> RadioDataset | None: + """Discards excess samples by randomly subsampling all classes within a dataset that have more than a + user-specified limit of examples. If the user doesn't specify a limit, the class the with the + fewest examples is selected as the limit. + + :param class_key: The label of the class to homogenize. + :type class_key: str + :param example_limit: The class size limit to which all classes are subsampled. If not specified, + the class with the fewest examples is used as the limit. Default is None. + :type example_limit: int, optional + :param inplace: If True, the operation modifies the existing source file directly and returns None. + If False, the operation creates a new dataset cbject and corresponding source file, leaving the original + dataset unchanged. Default is False. + :type inplace: bool, optional + + :return: The homogenized dataset. 
+ :rtype: RadioDataset or None + + **Examples:** + + >>> from ria.dataset_manager.builders import AWGN_Builder() + >>> builder = AWGN_Builder() + >>> builder.download_and_prepare() + >>> ds = builder.as_dataset() + >>> ds.get_class_sizes(class_key="col") + {a:1000, b:5000, c:1500, d:900} + >>> new_ds = ds.homogenize(example_limit=1000, class_key="col") + >>> new_ds.get_class_sizes(class_key="col") + {a:1000, b:1000, c:1000, d:900} + + >>> from ria.dataset_manager.builders import AWGN_Builder() + >>> builder = AWGN_Builder() + >>> builder.download_and_prepare() + >>> ds = builder.as_dataset() + >>> ds.get_class_sizes(class_key="col") + {a:1000, b:5000, c:1500, d:900} + >>> new_ds = ds.homogenize(class_key="col") + >>> new_ds.get_class_sizes(class_key="col") + {a:900, b:900, c:900, d:900} + """ + + class_sizes = self.get_class_sizes(class_key=class_key) + + if example_limit is None: + example_limit = min(class_sizes.values()) + + channels, example_length = np.shape(self[0]) + + if not inplace: + ds = self._create_next_dataset(example_length=example_length) + + masks, counters = get_masks_and_counters(class_sizes, example_limit) + + idx = 0 + + with h5py.File(self.source, "a") as f: + while idx < len(self): + labels = f["metadata/metadata"][class_key] + current_class = labels[idx] + current_mask = masks[current_class] + if current_mask is None and not inplace: + copy_over_example(self.source, ds.source, idx) + + if current_mask is not None: + current_mask_value = current_mask[counters[current_class]] + counters[current_class] += 1 + + if not inplace and current_mask_value == 1: + copy_over_example(self.source, ds.source, idx) + + elif inplace and current_mask_value == 0: + delete_example_inplace(self.source, idx) + continue + + idx += 1 + + if not inplace: + return ds + + def drop_class(self, class_key: str, class_value: str, inplace: Optional[bool] = False) -> RadioDataset | None: + """Removes an entire class from the dataset. 
+ + :param class_key: Class that will have a value dropped from it. Example: 'signal_type' + :type class_key: str + :param class_value: Value of the class to be dropped. Example: 'LTE', 'NR' + :type class_value: str + :param inplace: If True, the operation modifies the existing source file directly and returns None. + If False, the operation creates a new dataset cbject and corresponding source file, leaving the original + dataset unchanged. Defaults to False. + :type inplace: bool, optional + + :raises ValueError: If the entered class name does not exist in the dataset. + + :return: The dataset without the removed class. + :rtype: RadioDataset or None + + **Examples:** + + >>> from ria.dataset_manager.builders import AWGN_Builder() + >>> builder = AWGN_Builder() + >>> builder.download_and_prepare() + >>> ds = builder.as_dataset() + >>> ds.get_class_sizes() + {a:100, b:500, c:300} + >>> new_ds = ds.drop_class('a') + >>> new_ds.get_class_sizes() + {b:500, c:300} + """ + class_sizes = self.get_class_sizes(class_key=class_key) + + if class_value.encode("utf-8") not in class_sizes.keys(): + raise ValueError(f"{class_value} is not a class of this dataset.") + + channels, example_length = np.shape(self[0]) + + if not inplace: + ds = self._create_next_dataset(example_length=example_length) + + idx = 0 + with h5py.File(self.source, "a") as f: + while idx < len(self): + labels = f["metadata/metadata"][class_key] + current_label = labels[idx].decode("utf-8") + if current_label == class_value and inplace: + delete_example_inplace(self.source, idx) + continue + + elif current_label != class_value and not inplace: + copy_over_example(self.source, ds.source, idx) + + idx += 1 + + if not inplace: + return ds + + def add_label(self, column_name: str, data: ArrayLike, inplace: Optional[bool] = False) -> RadioDataset | None: + """Add a new metadata label to the dataset. + + .. todo:: This method is not yet implemented. + + :param column_name: Name of the new metadata column header. 
+ :type inplace: str + :param data: The contents of the new metadata column. + :type inplace: np.typing.ArrayLike + :param inplace: If True, the label is added inplace and ``None`` is returned. Defaults to False. + :type inplace: bool, optional + + :raises ValueError: If the length of ``data`` is not equal to the length of the dataset. + + :return: The augmented dataset or None if ``inplace=True``. + :rtype: RadioDataset or None + + **Examples:** + + .. todo:: Usage examples coming soon. + """ + raise NotImplementedError + + def get_class_sizes(self, class_key: str) -> dict[str, int]: + """Returns a dictionary containing the sizes of each class in the dataset at the provided key. + + :param class_key: The class label. + :type class_key: str + + :raises ValueError: If the specified key is not found in the dataset labels. + + :return: A dictionary where each key is a distinct class label, and it's value is the class size. + :rtype: A dictionary where the keys are strings and the values are integers + + **Examples:** + + >>> from ria.dataset_manager.builders import AWGN_Builder() + >>> spectrogram_sensing_builder = AWGN_Builder() + >>> spectrogram_sensing_builder.download_and_prepare() + >>> ds = spectrogram_sensing_builder.as_dataset(backend="pytorch") + >>> ds.get_class_sizes(class_key='signal_type') + {'LTE': 900, 'NR': 900, 'LTE_NR': 900} + """ + with h5py.File(self.source, "r") as f: + labels = f["metadata/metadata"][class_key] + return dict(Counter(labels)) + + def delete_example(self, idx: int, inplace: Optional[bool] = False) -> RadioDataset | None: + """Deletes an example and it's corresponding metadata from the dataset. + + :param idx: The index of the example to be deleted. + :type idx: int + + :param inplace: If True, the deletion is performed inplace and ``None`` is returned. Defaults to False. + :type inplace: bool, optional + + :return: The new dataset or None if ``inplace=True``. 
+ :rtype: RadioDataset or None + + **Examples:** + + >>> from ria.dataset_manager.builders import AWGN_Builder() + >>> spectrogram_sensing_builder = AWGN_Builder() + >>> spectrogram_sensing_builder.download_and_prepare() + >>> ds = spectrogram_sensing_builder.as_dataset(backend="pytorch") + >>> len(ds) + 2700 + >>> ds = ds.delete_example(idx=34) + >>> len(ds) + 2699 + """ + + if inplace: + delete_example_inplace(source=self.source, idx=idx) + return None + + else: + # The deletion is performed by 1. creating a new source file, 2. copying all contents to the new source + # file, and 3. deleting the example at idx inplace. + new_source = self._get_next_file_name() + copy_file(original_source=self.source, new_source=new_source) + delete_example_inplace(source=new_source, idx=idx) + return self.__class__(source=new_source) + + def append(self, example: ArrayLike, metadata: dict) -> None: + """Append a single example to the end of the dataset. This operation is performed inplace. + + .. todo:: This method is not yet implemented. + + :param example: The example to append. + :type example: np.typing.ArrayLike + :param metadata: The corresponding metadata dictionary. + :type metadata: dict + + :raises ValueError: If example does not the same shape and type as rest of the examples in the dataset. + + :return: None. + + **Examples:** + + .. todo:: Usage examples coming soon. + """ + raise NotImplementedError + + def join(self, ds: RadioDataset) -> RadioDataset: + """Join or merge together two radio datasets. + + .. todo:: This method is not yet implemented. + + - Duplicate entries are not removed; they are included. + - The examples are not shuffled; examples from ``ds`` are appended at the end. + - Metadata will be expanded to contain all columns. + + :param ds: The dataset to merge together with self. Examples from both datasets must have the same shape. + :type ds: bool + + :return: The combined dataset. + :rtype: RadioDataset + + **Examples:** + + .. 
todo:: Usage examples coming soon. + """ + raise NotImplementedError + + def filter(self, mask: ArrayLike, inplace: Optional[bool] = False) -> RadioDataset: + """Filter the dataset using the provided mask. + + .. todo:: This method is not yet implemented. + + :param mask: A boolean mask. Where True, keep the corresponding examples. Where False, discard keep the + corresponding examples. The filtering mask is often the result of applying a condition across the elements + of the dataset. + :type mask: array_like + + :param inplace: If True, the filter operation is performed inplace and ``None`` is returned. Defaults to False. + :type inplace: bool, optional + + :return: The filtered dataset or None if ``inplace=True``. + :rtype: RadioDataset or None + + Examples: + + .. todo:: Usage examples coming soon! + + """ + raise NotImplementedError + + def _get_next_file_name(self) -> str: + """As we manipulate a dataset, we create new source files. That is, unless inplace==True. Each new source + file needs a new name, and so we count up. This function computes and returns the next file name. + + If the file has not been manipulated before, it will add `.001` to the end of the file name before the + extension. If there is already a number at the end of the file name, it will update the current number to the + next consecutive number. + + For example: + + >>> from ria.dataset_manager.builders import AWGN_Builder() + >>> builder = AWGN_Builder() + >>> builder.download_and_prepare() + >>> ds = builder.as_dataset() # my_dataset.hdf5 + >>> ds = ds.subsample() # my_dataset.001.hdf5 + >>> ds = ds.augment() # my_dataset.002.hdf5 + + :raises ValueError: If the number at the end of the file name exceeds 999. 
+ + :return: The name of the next file, including the file extension + :rtype: str + """ + + name, ext = os.path.splitext(str(self.source)) + end_of_name = name[-3:] + + if re.match(r"^\d+$", end_of_name): + operation_number = int(end_of_name) + operation_number += 1 + + if operation_number < 10: + operation_number_as_string = f"00{operation_number}" # 1 digits + + elif operation_number < 100: + operation_number_as_string = f"0{operation_number}" # 2 digits + + elif operation_number < 1000: + operation_number_as_string = f"{operation_number}" # 3 digits + + else: + # We assume the maximum number of dataset manipulations will not exceed 999. + raise ValueError("The maximum allowed number of dataset manipulations is 999.") + + return f"{name[:-3]}{operation_number_as_string}{ext}" + + else: + return f"{name}.001{ext}" + + def _create_next_dataset(self, example_length: int) -> RadioDataset: + """Creates a new empty dataset with a new source file, but with the same file structure as self.source. + + :param example_length: The length of the examples in the new dataset. + :type example_length: int + + :return: A new dataset with empty data and labels. + :rtype: RadioDataset + """ + new_source = self._get_next_file_name() + make_empty_clone(self.source, new_source, example_length=example_length) + return self.__class__(source=new_source) + + def __iter__(self) -> Iterator: + self._index = 0 + return self + + def __next__(self) -> np.ndarray: + if self._index < len(self): + with h5py.File(self.source, "r") as f: + dataset = f["data"] + result = dataset[self._index] + self._index += 1 + return result + else: + raise StopIteration + + def __eq__(self, other: RadioDataset) -> bool: + """Two RadioDatasets are equal iff they share the same source file.""" + return self._source == other._source + + def __len__(self) -> int: + """ + :return: The number of examples in a dataset. 
def __getitem__(self, key: int | slice | ArrayLike) -> np.ndarray | RadioDataset:
    """Index into the dataset.

    If key is an integer (Python ``int`` or NumPy integer), the example at that index is read from the ``data``
    entry of the source file and returned.

    If key is the full slice ``[:]``, ``self.data`` is returned. Any other slice is not yet supported.

    Otherwise, key is converted to an array and interpreted as a boolean mask, and the result of
    ``self.filter(mask=...)`` is returned.

    :param key: Integer index, slice, or array-like boolean mask.
    :type key: int or slice or array_like

    :raises NotImplementedError: If key is a slice other than ``[:]``.
    :raises ValueError: If key cannot be converted to an array, or if the resulting array is not boolean.

    :return: A single example, the full data, or the filtered dataset.
    :rtype: np.ndarray or RadioDataset
    """
    # NumPy integers (e.g. from np.arange or argmax) are valid indices too.
    if isinstance(key, (int, np.integer)):
        with h5py.File(self.source, "r") as file:
            return file["data"][key]

    if isinstance(key, slice):
        if key == slice(None):
            return self.data

        # Create and return a new dataset, initialized from a new source file, with the data/metadata at slice.
        raise NotImplementedError("Dataset slicing not yet implemented.")

    # Keep the try body minimal so that errors raised by the mask check or by filter() are not rewrapped.
    try:
        mask = np.asarray(key)
    except (TypeError, ValueError) as err:
        raise ValueError(f"Indexing with key of type {type(key)} is not supported.") from err

    if mask.dtype != bool:
        raise ValueError("Array-like mask must be of boolean type.")

    return self.filter(mask=mask)
This is a helper function specifically + used by the augment method. + + :param level: The level or extent of data augmentation to apply, ranging from 0.0 (no augmentation) to + 1.0 (full augmentation, where each augmentation is applied to each pre-existing example). + :type level: float or list of floats + + :param target_size: Target size of the augmented dataset. If specified, ``level`` is ignored, and augmentations + are applied to expand the dataset to contain the specified number of examples. + :type target_size: int or list of ints or None + + :param classes_to_augment: List of the classes to augment. + :type classes_to_augment: string or list of strings or None + + :param class_sizes: A dictionary where each key-value pair is the class label and the class size. + :type class_sizes: dict + + :raises ValueError: If level is a list when classes_to_augment is None. + :raises ValueError: If classes_to_augment and level are lists, but they have different sizes. + :raises ValueError: If target_size is a list when classes_to_augment is None. + :raises ValueError: If classes_to_augment and target_size are lists, but they have different sizes. + :raises ValueError: If classes_to_augment and target_size are lists, but the target_size of a class is already met. + + :return: A dictionary where each key is a distinct class label, and it's value is the desired class size. 
+ :rtype: A dictionary where the keys are strings and the values are integers + """ + result_sizes = dict(class_sizes) + + if target_size is None: + # Calculate off of level + if classes_to_augment is None: + # Apply to entire dataset, if classes_to_augment is None + if isinstance(level, list): + raise ValueError("Since classes_to_augment is None, level must be a single float value.") + + for key in result_sizes: + result_sizes[key] = round(result_sizes[key] + class_sizes[key] * level) + else: + if not isinstance(classes_to_augment, list): + classes_to_augment = [classes_to_augment] + + if isinstance(level, list): + if len(level) != len(classes_to_augment): + raise ValueError("If level is a list, there must be one value for each class you wish to augment.") + + for index, class_name in enumerate(classes_to_augment): + result_sizes[class_name] = round(result_sizes[class_name] + class_sizes[class_name] * level[index]) + + else: + for class_name in classes_to_augment: + result_sizes[class_name] = round(result_sizes[class_name] + class_sizes[class_name] * level) + else: + # Calculate off of target_size + if classes_to_augment is None: + # apply to entire dataset, if classes_to_augment is None + if isinstance(target_size, list): + raise ValueError("Since classes_to_augment is None, target_size must be a single int value.") + + result_sizes = calculate_size_with_original_distribution( + class_sizes=class_sizes, target_size=target_size, classes_to_augment=classes_to_augment + ) + + else: + # user specified classes to augment + + # if user provides only 1 class convert it to a list + if not isinstance(classes_to_augment, list): + classes_to_augment = [classes_to_augment] + + if isinstance(target_size, list): + if len(target_size) != len(classes_to_augment): + raise ValueError( + "If target_size is a list, there must be one value for each class you wish to augment." 
def calculate_size_with_original_distribution(  # noqa: C901
    class_sizes: dict, target_size: int, classes_to_augment: list[str] | None
) -> dict:
    """Return the desired size of each class when ``target_size`` drives the resulting class sizes.

    The classes being augmented keep their original relative proportions. Classes that are not augmented keep
    their current size, so only the remainder of ``target_size`` is distributed among the augmented classes.
    Specifically used as a helper by ``get_result_sizes``.

    Sizes are apportioned with the largest-remainder method using exact integer arithmetic: every class first
    receives the floor of its proportional share, then the units still missing are handed out, one each, to the
    classes with the largest fractional remainders. The result therefore always sums to ``target_size`` and no
    class is pushed below the floor of its share.

    :param class_sizes: A dictionary where each key-value pair is the class label and the class size.
    :type class_sizes: dict

    :param target_size: Target size of the augmented dataset.
    :type target_size: int

    :param classes_to_augment: List of the classes to augment, or None to scale every class.
    :type classes_to_augment: list of strings or None

    :raises ZeroDivisionError: If the classes selected for augmentation have a combined size of zero.

    :return: A dictionary where each key is a distinct class label, and its value is the desired class size.
    :rtype: dict
    """
    if classes_to_augment is None:
        selected = dict(class_sizes)
    else:
        selected = {cls: size for cls, size in class_sizes.items() if cls in classes_to_augment}

    # Classes that are not augmented keep their size and consume part of the overall target.
    untouched_total = sum(size for cls, size in class_sizes.items() if cls not in selected)
    budget = target_size - untouched_total
    selected_total = sum(selected.values())

    result, remainders = {}, {}
    for cls, size in selected.items():
        # divmod on integers gives the exact floor of the proportional share plus its exact remainder.
        result[cls], remainders[cls] = divmod(size * budget, selected_total)

    # Hand the units lost to flooring to the classes with the largest remainders. The sort is stable, so ties
    # are resolved in the original class order.
    leftover = budget - sum(result.values())
    for cls in sorted(remainders, key=remainders.get, reverse=True)[:leftover]:
        result[cls] += 1

    # Put classes that were not chosen to be augmented back into the resultant size dictionary.
    for cls, size in class_sizes.items():
        result.setdefault(cls, size)

    return result
class SpectDataset(RadioDataset, ABC):
    """A ``SpectDataset`` is a ``RadioDataset`` tailored for machine learning tasks that involve processing
    radiofrequency (RF) signals represented as spectrograms.

    For machine learning tasks that involve processing on IQ samples, please use
    utils.data.datasets.IQDataset instead.

    This is an abstract interface defining common properties and behaviour of spectrogram datasets. Therefore, this
    class should not be instantiated directly. Instead, it is subclassed to define custom interfaces for specific
    machine learning backends.

    :param source: Path to the dataset source file. For more information on dataset source files
        and their format, see :doc:`radio_datasets`.
    :type source: str or os.PathLike
    """

    def __init__(self, source: str | os.PathLike):
        """Create a new SpectDataset.

        All initialization is delegated to ``RadioDataset.__init__``.
        """
        super().__init__(source=source)

    @property
    def shape(self) -> tuple[int, ...]:
        """Spectrogram datasets are M x C x H x W, where M is the number of examples, C is the number of image
        channels, H is the height of the spectrogram, and W is the width of the spectrogram.

        The value itself is whatever the parent class reports; this override only documents the axis convention.

        :return: The shape of the dataset. The elements of the shape tuple give the lengths of the corresponding
            dataset dimensions.
        :type: tuple of ints
        """
        return super().shape

    def default_augmentations(self) -> list[callable]:
        """Returns the list of default augmentations for spectrogram datasets.

        .. todo:: This method is not yet implemented.

        :raises NotImplementedError: Always, until default augmentations are defined.

        :return: A list of default augmentations.
        :rtype: list[callable]
        """
        # Consider the following list of default augmentations:
        # #. horizontal_flip
        # #. vertical_flip
        # #. sharpen
        # #. darken
        # #. lighten
        # #. linear_rotate
        raise NotImplementedError
+ + For each slice in the dataset, the metadata should include the unique ID of the recording from which the example + was cut ('rec_id'). To avoid leakage, all examples with the same 'rec_id' are assigned only to one of the new + datasets. This ensures, for example, that slices cut from the same recording do not appear in both the training + and test datasets. + + This restriction makes it challenging to generate datasets with the exact lengths specified. To get as close as + possible, this method uses a greedy algorithm, which assigns the recordings with the most slices first, working + down to those with the fewest. This may not always provide a perfect split, but it works well in most practical + cases. + + This function is deterministic, meaning it will always produce the same split. For a random split, see + utils.data.datasets.random_split. + + :param dataset: Dataset to be split. + :type dataset: RadioDataset + :param: lengths: Lengths or fractions of splits to be produced. If given a list of fractions, the list should + sum up to 1. The lengths will be computed automatically as ``floor(frac * len(dataset))`` for each fraction + provided, and any remainders will be distributed in round-robin fashion. + :type lengths: list of ints (lengths) or floats (fractions) + + :return: List of radio datasets. The number of returned datasets will correspond to the length of the provided + 'lengths' list. + :rtype: list of RadioDataset + + **Examples:** + + >>> import random + >>> import string + >>> import numpy as np + >>> import pandas as pd + >>> from utils.data.datasets import split + + First, let's generate some random data: + + >>> shape = (24, 1, 1024) # 24 examples, each of length 1024 + >>> real_part, imag_part = np.random.randint(0, 12, size=shape), np.random.randint(0, 79, size=shape) + >>> data = real_part + 1j * imag_part + + Then, a list of recording IDs. 
Let's pretend this data was cut from 4 separate recordings: + + >>> rec_id_options = [''.join(random.choices(string.ascii_lowercase + string.digits, k=256)) for _ in range(4)] + >>> rec_id = [np.random.choice(rec_id_options) for _ in range(shape[0])] + + Using this data and metadata, let's initialize a dataset: + + >>> metadata = pd.DataFrame(data={"rec_id": rec_id}).to_records(index=False) + >>> fid = os.path.join(os.getcwd(), "source_file.hdf5") + >>> ds = RadioDataset(source=fid) + + Finally, let's do an 80/20 train-test split: + + >>> train_ds, test_ds = split(ds, lengths=[0.8, 0.2]) + """ + if not isinstance(dataset, RadioDataset): + raise ValueError(f"'dataset' must be RadioDataset or one of its subclasses, got {type(dataset)}.") + + lengths_ = _validate_lengths(dataset=dataset, lengths=lengths) + + if "rec_id" not in dataset.metadata or not isinstance(dataset.metadata["rec_id"][0], str): + raise ValueError("Dataset missing string field 'rec_id'.") + + rec_ids = dict(Counter(dataset.metadata["rec_id"])) + + if len(rec_ids) < len(lengths_): + raise ValueError(f"Not enough Recordings IDs in the dataset for a {len(lengths_)}-way split.") + + # Sort the rec_ids in descending order by frequency. + ids, freqs = list(rec_ids.keys()), list(rec_ids.values()) + sorted_indices = np.flip(np.argsort(freqs)) + sorted_rec_ids = [ids[x] for x in sorted_indices] + sorted_freqs = [freqs[x] for x in sorted_indices] + + # Preallocate keys, which we'll use to track which recordings are assigned to which subsets. + split_key_ids = [[] for _ in range(len(lengths_))] + split_key_freqs = [[] for _ in range(len(lengths_))] + + for i in range(len(rec_ids)): + # Find the subset whose current length is farthest from its target length. + current_lengths = [sum(subkey) for subkey in split_key_freqs] + diffs = [lengths_[j] - current_lengths[j] for j in range(len(lengths_))] + index = np.argmax(diffs) + + # Add the 'rec_id' with the highest frequency to the subset farthest from its target. 
def random_split(
    dataset: RadioDataset, lengths: list[int | float], generator: Optional[Generator] = None
) -> list[RadioDataset]:
    """Randomly split a radio dataset into non-overlapping new datasets of given lengths.

    Every example carries the ID of the recording it was cut from ('rec_id'). To avoid leakage, all examples that
    share a 'rec_id' end up in the same new dataset, so slices of one recording never appear in, say, both the
    training and the test dataset.

    Because whole recordings are assigned at once, the produced datasets will usually only approximate the
    requested lengths. If the closest possible split matters more than randomness, consider using
    utils.data.datasets.split instead.

    :param dataset: Dataset to be split.
    :type dataset: RadioDataset
    :param lengths: Lengths or fractions of splits to be produced. If given a list of fractions, the list should
        sum up to 1.
    :type lengths: list of ints (lengths) or floats (fractions)
    :param generator: Random generator. Defaults to None, in which case a new generator is seeded from NumPy's
        global random state.
    :type generator: NumPy Generator Object, optional.

    :raises ValueError: If dataset is not a RadioDataset, if the string field 'rec_id' is missing from the
        metadata, or if there are fewer distinct recording IDs than requested subsets.

    :return: List of radio datasets, one per entry of 'lengths'.
    :rtype: list of RadioDataset

    See Also:
        utils.data.datasets.split: Usage is the same as for ``random_split()``.
    """
    if not isinstance(dataset, RadioDataset):
        raise ValueError(f"'dataset' must be RadioDataset or one of its subclasses, got {type(dataset)}.")

    targets = _validate_lengths(dataset=dataset, lengths=lengths)

    rng = generator if generator is not None else np.random.default_rng(np.random.randint(0, np.iinfo(np.int32).max))

    has_rec_id = "rec_id" in dataset.metadata
    if not has_rec_id or not isinstance(dataset.metadata["rec_id"][0], str):
        raise ValueError("Dataset missing string field 'rec_id'.")

    slices_per_recording = dict(Counter(dataset.metadata["rec_id"]))
    n = len(targets)

    if len(slices_per_recording) < n:
        raise ValueError(f"Not enough Recordings IDs in the dataset for a {len(targets)}-way split.")

    # Rank the recordings from most to fewest slices.
    ids = list(slices_per_recording.keys())
    freqs = list(slices_per_recording.values())
    ranking = np.flip(np.argsort(freqs))
    ranked_ids = [ids[pos] for pos in ranking]
    ranked_freqs = [freqs[pos] for pos in ranking]

    # One list of recording IDs (and matching slice counts) per subset.
    split_key_ids = [[] for _ in range(n)]
    split_key_freqs = [[] for _ in range(n)]

    # Seed every subset with one of the n smallest recordings, in random order. This guarantees that no subset
    # ends up empty and adds to the randomness of the result.
    head_ids, tail_ids = ranked_ids[:-n], ranked_ids[-n:]
    head_freqs, tail_freqs = ranked_freqs[:-n], ranked_freqs[-n:]
    shuffled = rng.permutation(x=np.asarray(range(n)))

    for subset, pick in enumerate(shuffled):
        split_key_freqs[subset].append(tail_freqs[pick])
        split_key_ids[subset].append(tail_ids[pick])

    # Assign the remaining recordings, largest first. A subset is chosen with probability proportional to how far
    # it still is from its target length; full or overfull subsets get probability zero.
    for rec_id, freq in zip(head_ids, head_freqs):
        filled = np.array([sum(counts) for counts in split_key_freqs])
        shortfall = np.maximum(np.asarray(targets) - filled, 0)
        probabilities = shortfall / shortfall.sum()

        chosen = rng.choice(range(n), p=probabilities)

        split_key_freqs[chosen].append(freq)
        split_key_ids[chosen].append(rec_id)

    _validate_sublists(list_of_lists=split_key_ids, ids=ids)

    return _split_datasets(dataset=dataset, key=split_key_ids, generator=rng)
def _validate_sublists(list_of_lists: list[list[str]], ids: list[str]) -> None:
    """Ensure that each ID is present in one and only one sublist.

    This is an internal consistency check on the result of the splitting algorithms.

    :param list_of_lists: The recording IDs assigned to each subset.
    :type list_of_lists: list of lists of str
    :param ids: All recording IDs that must be assigned.
    :type ids: list of str

    :raises AssertionError: If an ID appears more than once across the sublists, or if the assigned IDs are not
        exactly the IDs in ``ids``.

    :return: None
    """
    assigned = [item for sublist in list_of_lists for item in sublist]
    unique_assigned = set(assigned)

    # Raise explicitly rather than using the assert statement, which is stripped when Python runs with -O.
    if len(assigned) != len(unique_assigned):
        raise AssertionError("A recording ID was assigned to more than one subset.")

    # Compare as sets. list.sort() returns None, so comparing the results of two .sort() calls is always True.
    if unique_assigned != set(ids):
        raise AssertionError("The assigned recording IDs do not match the recording IDs in the dataset.")
+ :type generator: NumPy Generator Object, optional. + + :return: Non-overlapping datasets + :rtype: list of RadioDataset + """ + if generator is None: + rng = np.random.default_rng(np.random.randint(0, np.iinfo(np.int32).max)) + else: + rng = generator + + new_source_filenames = _generate_split_source_filenames( + parent_dataset=dataset, n_new_datasets=len(key), generator=rng + ) + + for new_source in new_source_filenames: + make_empty_clone(original_source=dataset.source, new_source=new_source, example_length=len(dataset.data[0, 0])) + + new_datasets = [dataset.__class__(source=new_source) for new_source in new_source_filenames] + + rec_ids = list(dataset.metadata["rec_id"]) + + for i, sublist in enumerate(key): + for rec_id in sublist: + # The examples at these indices are part of the corresponding new dataset. + indices = [index for index, value in enumerate(rec_ids) if value == rec_id] + for idx in indices: + copy_over_example(source=dataset.source, destination=new_datasets[i].source, idx=idx) + + return new_datasets diff --git a/src/ria_toolkit/adt/recording.py b/src/ria_toolkit/adt/recording.py new file mode 100644 index 0000000..beffcea --- /dev/null +++ b/src/ria_toolkit/adt/recording.py @@ -0,0 +1,763 @@ +from __future__ import annotations + +import copy +import datetime +import hashlib +import json +import os +import re +import time +import warnings +from typing import Any, Iterator, Optional + +import numpy as np +from numpy.typing import ArrayLike +from quantiphy import Quantity + +from utils.data.annotation import Annotation + +PROTECTED_KEYS = ["rec_id", "timestamp"] + + +class Recording: + """Tape of complex IQ (in-phase and quadrature) samples with associated metadata and annotations. + + Recording data is a complex array of shape C x N, where C is the number of channels + and N is the number of samples in each channel. + + Metadata is stored in a dictionary of key value pairs, + to include information such as sample_rate and center_frequency. 
+ + Annotations are a list of :ref:`Annotation `, + defining bounding boxes in time and frequency with labels and metadata. + + Here, signal data is represented as a NumPy array. This class is then extended in the RIA Backends to provide + support for different data structures, such as Tensors. + + Recordings are long-form tapes can be obtained either from a software-defined radio (SDR) or generated + synthetically. Then, machine learning datasets are curated from collection of recordings by segmenting these + longer-form tapes into shorter units called slices. + + All recordings are assigned a unique 64-character recording ID, ``rec_id``. If this field is missing from the + provided metadata, a new ID will be generated upon object instantiation. + + :param data: Signal data as a tape IQ samples, either C x N complex, where C is the number of + channels and N is number of samples in the signal. If data is a one-dimensional array of complex samples with + length N, it will be reshaped to a two-dimensional array with dimensions 1 x N. + :type data: array_like + + :param metadata: Additional information associated with the recording. + :type metadata: dict, optional + :param annotations: A collection of ``Annotation`` objects defining bounding boxes. + :type annotations: list of Annotations, optional + + :param dtype: Explicitly specify the data-type of the complex samples. Must be a complex NumPy type, such as + ``np.complex64`` or ``np.complex128``. Default is None, in which case the type is determined implicitly. If + ``data`` is a NumPy array, the Recording will use the dtype of ``data`` directly without any conversion. + :type dtype: numpy dtype object, optional + :param timestamp: The timestamp when the recording data was generated. If provided, it should be a float or integer + representing the time in seconds since epoch (e.g., ``time.time()``). Only used if the `timestamp` field is not + present in the provided metadata. 
+ :type dtype: float or int, optional + + :raises ValueError: If data is not complex 1xN or CxN. + :raises ValueError: If metadata is not a python dict. + :raises ValueError: If metadata is not json serializable. + :raises ValueError: If annotations is not a list of valid annotation objects. + + **Examples:** + + >>> import numpy + >>> from utils.data import Recording, Annotation + + >>> # Create an array of complex samples, just 1s in this case. + >>> samples = numpy.ones(10000, dtype=numpy.complex64) + + >>> # Create a dictionary of relevant metadata. + >>> sample_rate = 1e6 + >>> center_frequency = 2.44e9 + >>> metadata = { + ... "sample_rate": sample_rate, + ... "center_frequency": center_frequency, + ... "author": "me", + ... } + + >>> # Create an annotation for the annotations list. + >>> annotations = [ + ... Annotation( + ... sample_start=0, + ... sample_count=1000, + ... freq_lower_edge=center_frequency - (sample_rate / 2), + ... freq_upper_edge=center_frequency + (sample_rate / 2), + ... label="example", + ... ) + ... ] + + >>> # Store samples, metadata, and annotations together in a convenient object. 
def __init__(  # noqa C901
    self,
    data: ArrayLike | list[list],
    metadata: Optional[dict[str, any]] = None,
    dtype: Optional[np.dtype] = None,
    timestamp: Optional[float | int] = None,
    annotations: Optional[list[Annotation]] = None,
):
    """Initialize a new Recording.

    The provided ``metadata`` dictionary and ``annotations`` list are shallow-copied, so the recording never
    writes into, or is changed through, the caller's containers.

    :raises ValueError: If data is not complex 1xN or CxN, or if dtype is not a complex type.
    :raises ValueError: If metadata is not a python dict or is not JSON serializable.
    :raises ValueError: If a timestamp is given (directly or in metadata) that is not an int or float.
    :raises ValueError: If annotations is not a list of Annotation objects.
    """
    data_arr = np.asarray(data)

    if not np.iscomplexobj(data_arr):
        raise ValueError("Input data must be complex.")

    # Expect C x N
    if data_arr.ndim == 1:
        self._data = np.expand_dims(data_arr, axis=0)  # N -> 1 x N
    elif data_arr.ndim == 2:
        self._data = data_arr
    else:
        raise ValueError("Complex data must be C x N.")

    if dtype is not None:
        self._data = self._data.astype(dtype)

    # Validate explicitly: an assert statement would be stripped when Python runs with -O.
    if not np.iscomplexobj(self._data):
        raise ValueError("dtype must be a complex number scalar type.")

    if metadata is None:
        self._metadata = {}
    elif isinstance(metadata, dict):
        # Copy so that the 'timestamp' and 'rec_id' entries added below do not leak into the caller's dict.
        # Otherwise, reusing one dict for two recordings would give both the same rec_id.
        self._metadata = dict(metadata)
    else:
        raise ValueError(f"Metadata must be a python dict, but was {type(metadata)}.")

    if not _is_jsonable(metadata):
        raise ValueError("Value must be JSON serializable.")

    if "timestamp" not in self._metadata:
        if timestamp is not None:
            if not isinstance(timestamp, (int, float)):
                raise ValueError(f"timestamp must be int or float, not {type(timestamp)}")
            self._metadata["timestamp"] = timestamp
        else:
            self._metadata["timestamp"] = time.time()
    elif not isinstance(self._metadata["timestamp"], (int, float)):
        raise ValueError(f"timestamp must be int or float, not {type(self._metadata['timestamp'])}")

    if "rec_id" not in self._metadata:
        self._metadata["rec_id"] = generate_recording_id(data=self.data, timestamp=self._metadata["timestamp"])

    if annotations is None:
        self._annotations = []
    elif isinstance(annotations, list):
        # Copy for the same reason as metadata: later changes to the caller's list must not affect the recording.
        self._annotations = list(annotations)
    else:
        raise ValueError("Annotations must be a list or None.")

    if not all(isinstance(annotation, Annotation) for annotation in self._annotations):
        raise ValueError("All elements in self._annotations must be of type Annotation.")

    self._index = 0
@property
def sample_rate(self) -> float | None:
    """
    :return: Sample rate of the recording, or None if 'sample_rate' is not in metadata.
    :type: float or int
    """
    return self.metadata.get("sample_rate")


@sample_rate.setter
def sample_rate(self, sample_rate: float | int) -> None:
    """Set the sample rate of the recording.

    Works both when no sample rate has been stored yet and when one is being replaced.

    :param sample_rate: The sample rate of the recording.
    :type sample_rate: float or int

    :raises ValueError: If the value is rejected by ``add_to_metadata`` or ``update_metadata``.

    :return: None
    """
    if "sample_rate" in self._metadata:
        # add_to_metadata() refuses keys that already exist, so a plain call to it would make the
        # property impossible to reassign. Existing values go through update_metadata() instead.
        self.update_metadata(key="sample_rate", value=sample_rate)
    else:
        self.add_to_metadata(key="sample_rate", value=sample_rate)
def update_metadata(self, key: str, value: Any) -> None:
    """Update the value of an existing metadata key, or add the key-value pair if it does not already exist.

    All checks run before the metadata is touched, so a rejected call leaves the recording unchanged.

    :param key: Metadata key to update or create.
    :type key: str
    :param value: New value to enter at key.
    :type value: any

    :raises ValueError: If key is protected.
    :raises ValueError: If value is not JSON serializable.
    :raises ValueError: If key is new and is not a valid metadata key.

    :return: None.

    **Examples:**

    Create a recording and update metadata:

    >>> import numpy
    >>> from utils.data import Recording

    >>> samples = numpy.ones(10000, dtype=numpy.complex64)
    >>> metadata = {
    ...     "sample_rate": 1e6,
    ...     "center_frequency": 2.44e9,
    ...     "author": "me"
    ... }

    >>> recording = Recording(data=samples, metadata=metadata)
    >>> recording.update_metadata(key="author", value="you")
    >>> print(recording.metadata["author"])
    you
    """
    # Check protection first: previously a new key was inserted before this check ran.
    if key in PROTECTED_KEYS:
        raise ValueError(f"Key {key} is protected and cannot be modified or removed.")

    if key not in self._metadata:
        # add_to_metadata() validates both the key format and the value, then stores the pair.
        self.add_to_metadata(key=key, value=value)
        return

    if not _is_jsonable(value):
        raise ValueError("Value must be JSON serializable.")

    self._metadata[key] = value
def view(self, output_path: Optional[str] = "images/signal.png", **kwargs) -> None:
    """Plot this recording by delegating to ``utils.view.view_sig``.

    :param output_path: The output image path, forwarded to ``view_sig``. Defaults to "images/signal.png".
    :type output_path: str, optional
    :param kwargs: Additional keyword arguments forwarded unchanged to ``utils.view.view_sig``.
    :type: dict of keyword arguments

    :return: None

    **Examples:**

    Create a recording and view it:

    >>> import numpy
    >>> from utils.data import Recording

    >>> samples = numpy.ones(10000, dtype=numpy.complex64)
    >>> recording = Recording(data=samples, metadata={"sample_rate": 1e6, "center_frequency": 2.44e9})
    >>> recording.view()
    """
    # Resolved at call time rather than at module import.
    from utils.view import view_sig as render_signal

    render_signal(recording=self, output_path=output_path, **kwargs)
def to_npy(self, filename: Optional[str] = None, path: Optional[os.PathLike | str] = None) -> str:
    """Write recording to ``.npy`` binary file.

    Delegates to ``utils.io.recording.to_npy`` and returns the path it reports.

    :param filename: The name of the file where the recording is to be saved. Defaults to auto generated filename.
    :type filename: os.PathLike or str, optional
    :param path: The directory path to where the recording is to be saved. Defaults to recordings/.
    :type path: os.PathLike or str, optional

    :raises IOError: If there is an issue encountered during the file writing process.

    :return: Path where the file was saved.
    :rtype: str

    **Examples:**

    Create a recording and save it to a .npy file:

    >>> import numpy
    >>> from utils.data import Recording

    >>> samples = numpy.ones(10000, dtype=numpy.complex64)
    >>> metadata = {
    ...     "sample_rate": 1e6,
    ...     "center_frequency": 2.44e9,
    ... }

    >>> recording = Recording(data=samples, metadata=metadata)
    >>> saved_path = recording.to_npy()
    """
    from utils.io.recording import to_npy

    # Pass the helper's result back: the signature and docstring promise the saved path.
    return to_npy(recording=self, filename=filename, path=path)
def normalize(self) -> Recording:
    """Scale the recording data, relative to its maximum value, so that the magnitude of the maximum sample is 1.

    A recording whose samples are all zero (or that holds no samples) has no peak to scale against. It is
    returned as an unscaled copy rather than being filled with NaNs.

    :return: Recording where the maximum sample amplitude is 1.
    :rtype: Recording

    **Examples:**

    Create a recording with maximum amplitude 0.5 and normalize to a maximum amplitude of 1:

    >>> import numpy
    >>> from utils.data import Recording

    >>> samples = numpy.ones(10000, dtype=numpy.complex64) * 0.5
    >>> metadata = {
    ...     "sample_rate": 1e6,
    ...     "center_frequency": 2.44e9,
    ... }

    >>> recording = Recording(data=samples, metadata=metadata)
    >>> print(numpy.max(numpy.abs(recording.data)))
    0.5

    >>> normalized_recording = recording.normalize()
    >>> print(numpy.max(numpy.abs(normalized_recording.data)))
    1.0
    """
    samples = self._data

    # np.max raises on an empty array, so treat "no samples" like "no signal".
    peak = np.max(np.abs(samples)) if samples.size else 0.0

    if peak == 0:
        # Dividing by a zero peak would turn every sample into NaN.
        scaled_data = samples.copy()
    else:
        scaled_data = samples / peak

    return Recording(data=scaled_data, metadata=self.metadata, annotations=self.annotations)
def __eq__(self, other: object) -> bool:
    """Two Recordings are equal if all data, metadata, and annotations are the same.

    Annotation lists are compared element by element, so their order matters.

    :param other: The object to compare against.

    :return: True or False for another Recording. ``NotImplemented`` for any other type, which lets Python
        try the reflected comparison and otherwise evaluate ``==`` to False instead of raising.
    """
    if not isinstance(other, Recording):
        return NotImplemented

    return (
        np.array_equal(self.data, other.data)
        and self.metadata == other.metadata
        and self.annotations == other.annotations
    )
+ + For arrays with 1,024 or fewer samples, return a copy of the recording data. For larger arrays, return a + read-only view. This prevents mutation at a distance while maintaining performance. + """ + if isinstance(key, (int, tuple, slice)): + v = self._data[key] + if isinstance(v, np.complexfloating): + return v + elif v.size > 1024: + v.setflags(write=False) # Make view read-only. + return v + else: + return v.copy() + + else: + raise ValueError(f"Key must be an integer, tuple, or slice but was {type(key)}.") + + def __setitem__(self, *args, **kwargs) -> None: + """Raise an error if an attempt is made to assign to the recording.""" + raise ValueError("Assignment to Recording is not allowed.") + + +def generate_recording_id(data: np.ndarray, timestamp: Optional[float | int] = None) -> str: + """Generate unique 64-character recording ID. The recording ID is generated by hashing the recording data with + the datetime that the recording data was generated. If no datatime is provided, the current datatime is used. + + :param data: Tape of IQ samples, as a NumPy array. + :type data: np.ndarray + :param timestamp: Unix timestamp in seconds. Defaults to None. + :type timestamp: float or int, optional + + :return: 256-character hash, to be used as the recording ID. + :rtype: str + """ + if timestamp is None: + timestamp = time.time() + + byte_sequence = data.tobytes() + str(timestamp).encode("utf-8") + sha256_hash = hashlib.sha256(byte_sequence) + + return sha256_hash.hexdigest() + + +def _is_jsonable(x: Any) -> bool: + """ + :return: True if x is JSON serializable, False otherwise. + """ + try: + json.dumps(x) + return True + except (TypeError, OverflowError): + return False + + +def _is_valid_metadata_key(key: Any) -> bool: + """ + :return: True if key is a valid metadata key, False otherwise. 
+ """ + if isinstance(key, str) and key.islower() and re.match(pattern=r"^[a-z_]+$", string=key) is not None: + return True + + else: + return False diff --git a/src/ria_toolkit/io/__init__.py b/src/ria_toolkit/io/__init__.py new file mode 100644 index 0000000..2032b92 --- /dev/null +++ b/src/ria_toolkit/io/__init__.py @@ -0,0 +1,22 @@ +""" +The IO package contains utilities for input and output operations, such as loading and saving recordings to and from +file. +""" + +__all__ = [ + # Common: + "exists", + "copy", + "move", + "validate", + # Recording: + "save_recording", + "load_recording", + "to_sigmf", + "from_sigmf", + "to_npy", + "from_npy", +] + +from .common import copy, exists, move, validate +from .recording import from_npy, from_sigmf, load_recording, to_npy, to_sigmf diff --git a/src/ria_toolkit/io/recording.py b/src/ria_toolkit/io/recording.py new file mode 100644 index 0000000..cac8c1b --- /dev/null +++ b/src/ria_toolkit/io/recording.py @@ -0,0 +1,331 @@ +""" +Utilities for input/output operations on the utils.data.Recording object. +""" + +import datetime as dt +import os +from datetime import timezone +from typing import Optional + +import numpy as np +import sigmf +from sigmf import SigMFFile, sigmffile +from sigmf.utils import get_data_type_str + +from utils.data import Annotation +from utils.data.recording import Recording + + +def load_rec(file: os.PathLike) -> Recording: + """Load a recording from file. + + :param file: The directory path to the file(s) to load, **with** the file extension. + To loading from SigMF, the file extension must be one of *sigmf*, *sigmf-data*, or *sigmf-meta*, + either way both the SigMF data and meta files must be present for a successful read. + :type file: os.PathLike + + :raises IOError: If there is an issue encountered during the file reading process. + + :raises ValueError: If the inferred file extension is not supported. + + :return: The recording, as initialized from file(s). 
def convert_to_serializable(obj):
    """Recursively convert a structure into one built only from plain JSON-compatible Python values.

    Conversions applied:

    - NumPy integers, floats and booleans become Python ``int``, ``float`` and ``bool``.
    - NumPy arrays, lists, tuples and sets become lists, with every element converted recursively.
    - Dict values are converted recursively (keys are left untouched).
    - Non-finite floats (``inf``, ``-inf``, ``nan``), Python or NumPy, become ``None``.

    :param obj: The value to convert.

    :raises TypeError: If obj, or anything nested inside it, has a type that cannot be converted
        (for example a complex number).

    :return: The converted value.
    """
    if obj is None or isinstance(obj, str):
        return obj

    # bool must be tested before int because bool is a subclass of int.
    if isinstance(obj, (bool, np.bool_)):
        return bool(obj)

    if isinstance(obj, (int, np.integer)):
        return int(obj)

    if isinstance(obj, (float, np.floating)):
        value = float(obj)
        # Treat every non-finite value alike, whether it arrived as a Python or a NumPy float.
        return value if np.isfinite(value) else None

    if isinstance(obj, np.ndarray):
        # tolist() yields nested Python lists; recurse so the elements are sanitised too.
        return convert_to_serializable(obj.tolist())

    if isinstance(obj, (list, tuple, set)):
        return [convert_to_serializable(item) for item in obj]

    if isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}

    raise TypeError(f"Value of type {type(obj)} is not JSON serializable: {obj}")
+ + :return: None + + **Examples:** + + >>> from utils.sdr import Synth + >>> from utils.data import Recording + >>> from utils.io import to_sigmf + >>> sdr = Synth() + >>> rec = sdr.record(center_frequency=2.4e9, sample_rate=20e6) + >>> to_sigmf(recording=rec, file="sample_recording") + """ + + if filename is not None: + filename, _ = os.path.splitext(filename) + else: + filename = recording.generate_filename() + + if path is None: + path = "recordings" + + if not os.path.exists(path): + os.makedirs(path) + + multichannel_samples = recording.data + metadata = recording.metadata + annotations = recording.annotations + + if multichannel_samples.shape[0] > 1: + raise NotImplementedError("SigMF File Saving Not Implemented for Multichannel Recordings") + else: + # extract single channel + samples = multichannel_samples[0] + + data_file_path = os.path.join(path, f"{filename}.sigmf-data") + + samples.tofile(data_file_path) + global_info = { + SigMFFile.DATATYPE_KEY: get_data_type_str(samples), + SigMFFile.VERSION_KEY: sigmf.__version__, + SigMFFile.RECORDER_KEY: "RIA", + } + + converted_metadata = { + sigmf_key: metadata[metadata_key] + for sigmf_key, metadata_key in SIGMF_KEY_CONVERSION.items() + if metadata_key in metadata + } + + # Merge dictionaries, giving priority to sigmf_meta + global_info = {**converted_metadata, **global_info} + + ria_metadata = {f"ria:{key}": value for key, value in metadata.items()} + ria_metadata = convert_to_serializable(ria_metadata) + global_info.update(ria_metadata) + + sigMF_metafile = SigMFFile( + data_file=data_file_path, + global_info=global_info, + ) + + for annotation_object in annotations: + annotation_dict = annotation_object.to_sigmf_format() + annotation_dict = convert_to_serializable(annotation_dict) + sigMF_metafile.add_annotation( + start_index=annotation_dict[SigMFFile.START_INDEX_KEY], + length=annotation_dict[SigMFFile.LENGTH_INDEX_KEY], + metadata=annotation_dict["metadata"], + ) + + sigMF_metafile.add_capture( + 0, + 
def from_sigmf(file: os.PathLike | str) -> Recording:
    """Load a recording from a set of SigMF files.

    :param file: Path to the SigMF recording, either without a file extension or ending in ``.sigmf``,
        ``.sigmf-data`` or ``.sigmf-meta``. Any other path is treated as a bare file stem and ``.sigmf-data``
        is appended before it is handed to ``sigmffile.fromfile``.
        Both the data and meta files must be present for a successful read.
    :type file: str or os.PathLike

    :raises IOError: If there is an issue encountered during the file reading process.

    :return: The recording, as initialized from the SigMF files.
    :rtype: utils.data.Recording
    """
    # Accept os.PathLike as advertised; the string operations below need a plain path string.
    file = os.fspath(file)

    # Inspect the real extension instead of slicing at fixed offsets, which mishandled short names
    # and the ".sigmf" extension.
    _, extension = os.path.splitext(file)
    if extension.lower() not in (".sigmf", ".sigmf-data", ".sigmf-meta"):
        file = file + ".sigmf-data"

    sigmf_file = sigmffile.fromfile(file)

    data = sigmf_file.read_samples()
    global_metadata = sigmf_file.get_global_info()

    processed_metadata = {}
    for key, value in global_metadata.items():
        if key in SIGMF_KEY_CONVERSION:
            # The table is keyed by the SigMF constants exactly as to_sigmf() writes them.
            converted_key = SIGMF_KEY_CONVERSION[key]
        elif key.startswith("core:"):
            base_key = key[len("core:"):]
            converted_key = SIGMF_KEY_CONVERSION.get(base_key, base_key)
        elif key.startswith("ria:"):
            converted_key = key[len("ria:"):]
        else:
            # Load non-core/ria keys as is
            converted_key = key

        processed_metadata[converted_key] = value

    # Annotation() converts both frequency edges with float(), so an annotation that omits them needs a
    # stand-in: the full recorded band when it is known, otherwise zero.
    center = processed_metadata.get("center_frequency")
    rate = processed_metadata.get("sample_rate")
    if isinstance(center, (int, float)) and isinstance(rate, (int, float)):
        default_lower = center - rate / 2
        default_upper = center + rate / 2
    else:
        default_lower = default_upper = 0.0

    annotations = []
    for entry in sigmf_file.get_annotations():
        lower = entry.get(SigMFFile.FLO_KEY)
        upper = entry.get(SigMFFile.FHI_KEY)
        label = entry.get(SigMFFile.LABEL_KEY)
        comment = entry.get(SigMFFile.COMMENT_KEY)

        annotations.append(
            Annotation(
                sample_start=entry[SigMFFile.START_INDEX_KEY],
                sample_count=entry[SigMFFile.LENGTH_INDEX_KEY],
                freq_lower_edge=default_lower if lower is None else lower,
                freq_upper_edge=default_upper if upper is None else upper,
                # Annotation() applies str() to these; passing None would store the text "None".
                label="" if label is None else label,
                comment="" if comment is None else comment,
                detail=entry.get("ria:detail"),
            )
        )

    return Recording(data=data, metadata=processed_metadata, annotations=annotations)
+ + :param file: The directory path to the recording file, with or without the ``.npy`` file extension. + :type file: str or os.PathLike + + :raises IOError: If there is an issue encountered during the file reading process. + + :return: The recording, as initialized from the ``.npy`` file. + :rtype: utils.data.Recording + """ + + filename, extension = os.path.splitext(file) + if extension != ".npy" and extension != "": + raise ValueError("Cannot use from_npy if file extension is not .npy") + + # Rebuild with .npy extension. + filename = str(filename) + ".npy" + + with open(file=filename, mode="rb") as f: + data = np.load(f, allow_pickle=True) + metadata = np.load(f, allow_pickle=True) + metadata = metadata.tolist() + try: + annotations = list(np.load(f, allow_pickle=True)) + except EOFError: + annotations = [] + + recording = Recording(data=data, metadata=metadata, annotations=annotations) + return recording diff --git a/src/ria_toolkit/transforms/__init__.py b/src/ria_toolkit/transforms/__init__.py new file mode 100644 index 0000000..1d8098b --- /dev/null +++ b/src/ria_toolkit/transforms/__init__.py @@ -0,0 +1,8 @@ +""" +The transforms package houses a collection of functions to manipulate and transform radio data. + +This package contains various functions that operate on NumPy arrays. These functions are utilized within the machine +learning backends to build transforms and functions that seamlessly integrate with those from the respective backend. + +All the transforms in this package expect data in the complex 1xN format. 
def generate_awgn(signal: ArrayLike | Recording, snr: Optional[float] = 1) -> np.ndarray | Recording:
    """Generates additive white gaussian noise (AWGN) relative to the signal-to-noise ratio (SNR) of the
    provided `signal` array or `Recording`.

    The noise is scaled so that ``10 * log10(signal_power / noise_power)`` equals ``snr`` on average, where
    power is the mean squared magnitude. Each noise sample has a normally distributed amplitude and a
    uniformly distributed phase.

    :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N
        is the length of the IQ examples.
    :type signal: array_like or utils.data.Recording
    :param snr: The signal-to-noise ratio in dB. Default is 1.
    :type snr: float, optional

    :raises ValueError: If `signal` is not CxN complex.

    :return: A numpy array of noise with the same shape as `signal`. If `signal` is a Recording, returns a
        Recording object with its `data` attribute containing the generated noise array and the metadata of
        `signal`.
    :rtype: np.ndarray or utils.data.Recording

    The output is random; the values below are only illustrative.

    >>> rec = Recording(data=[[2 + 5j, 1 + 8j]])
    >>> new_rec = generate_awgn(rec)
    >>> new_rec.data.shape
    (1, 2)
    """
    is_recording = isinstance(signal, Recording)
    data = signal.data if is_recording else np.asarray(signal)

    if data.ndim != 2 or not np.iscomplexobj(data):
        raise ValueError("signal must be CxN complex.")

    # snr is in dB, so this is a ratio of powers.
    snr_linear = 10 ** (snr / 10)

    # RMS is an amplitude (the square root of power), so it scales with the square root of the power
    # ratio. Dividing by snr_linear directly would double the realised SNR in dB.
    signal_rms = np.sqrt(np.mean(np.abs(data) ** 2))
    noise_rms = signal_rms / np.sqrt(snr_linear)

    # Zero-mean normal amplitude with standard deviation noise_rms gives a mean power of noise_rms ** 2.
    magnitude = np.random.normal(loc=0, scale=noise_rms, size=data.shape)
    phase = np.random.uniform(low=0, high=2 * np.pi, size=data.shape)
    complex_awgn = magnitude * np.exp(1j * phase)

    if is_recording:
        return Recording(data=complex_awgn, metadata=signal.metadata)
    return complex_awgn
def time_reversal(signal: ArrayLike | Recording) -> np.ndarray | Recording:
    """Reverse the IQ samples of the provided `signal` array or `Recording` along the time axis.

    Every channel is reversed independently, so the sample that was last becomes the first.

    :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N
        is the length of the IQ examples.
    :type signal: array_like or utils.data.Recording

    :raises ValueError: If `signal` is not CxN complex.

    :return: A new CxN numpy array with the samples in reverse order. If `signal` is a `Recording`, returns a
        `Recording` carrying the reversed array as its data and the metadata of `signal`.
    :rtype: np.ndarray or utils.data.Recording

    >>> rec = Recording(data=[[1+2j, 3+4j, 5+6j]])
    >>> new_rec = time_reversal(rec)
    >>> new_rec.data
    array([[5.+6.j, 3.+4.j, 1.+2.j]])
    """
    if isinstance(signal, Recording):
        data = signal.data
    else:
        data = np.asarray(signal)

    if data.ndim != 2 or not np.iscomplexobj(data):
        raise ValueError("signal must be CxN complex.")

    # Axis 1 is time. Copy so the result is an independent, contiguous array rather than a negative-stride
    # view onto the caller's data.
    reversed_data = data[:, ::-1].copy()

    if isinstance(signal, Recording):
        return Recording(data=reversed_data, metadata=signal.metadata)
    return reversed_data
def spectral_inversion(signal: ArrayLike | Recording) -> np.ndarray | Recording:
    """Negate the imaginary (Q, quadrature) component of every sample in `signal`.

    This is the complex conjugate of the signal, which mirrors its spectrum about 0 Hz. The operation is
    element-wise, so any number of channels is supported.

    :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N
        is the length of the IQ examples.
    :type signal: array_like or utils.data.Recording

    :raises ValueError: If `signal` is not CxN complex.

    :return: A new CxN numpy array holding the original I and the negated Q samples. If `signal` is a
        `Recording`, returns a `Recording` carrying the inverted array as its data and the metadata of `signal`.
    :rtype: np.ndarray or utils.data.Recording

    >>> rec = Recording(data=[[0+45j, 2-10j]])
    >>> new_rec = spectral_inversion(rec)
    >>> new_rec.data
    array([[0.-45.j, 2.+10.j]])
    """
    if isinstance(signal, Recording):
        data = signal.data
    else:
        data = np.asarray(signal)

    if data.ndim != 2 or not np.iscomplexobj(data):
        raise ValueError("signal must be CxN complex.")

    # I - jQ is exactly the complex conjugate; np.conj returns a new array and leaves the input untouched.
    inverted = np.conj(data)

    if isinstance(signal, Recording):
        return Recording(data=inverted, metadata=signal.metadata)
    return inverted
def channel_swap(signal: ArrayLike | Recording) -> np.ndarray | Recording:
    """Swap the I (in-phase) and Q (quadrature) components of every sample in `signal`.

    Each sample ``I + jQ`` becomes ``Q + jI``. The operation is element-wise, so any number of channels is
    supported.

    :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N
        is the length of the IQ examples.
    :type signal: array_like or utils.data.Recording

    :raises ValueError: If `signal` is not CxN complex.

    :return: A new CxN numpy array with I and Q exchanged. If `signal` is a `Recording`, returns a `Recording`
        carrying the swapped array as its data and the metadata of `signal`.
    :rtype: np.ndarray or utils.data.Recording

    >>> rec = Recording(data=[[10+20j, 7+35j]])
    >>> new_rec = channel_swap(rec)
    >>> new_rec.data
    array([[20.+10.j, 35.+7.j]])
    """
    if isinstance(signal, Recording):
        data = signal.data
    else:
        data = np.asarray(signal)

    if data.ndim != 2 or not np.iscomplexobj(data):
        raise ValueError("signal must be CxN complex.")

    # The old quadrature becomes the new in-phase component and vice versa.
    swapped = data.imag + 1j * data.real

    if isinstance(signal, Recording):
        return Recording(data=swapped, metadata=signal.metadata)
    return swapped
def amplitude_reversal(signal: ArrayLike | Recording) -> np.ndarray | Recording:
    """Negate both the I (in-phase) and Q (quadrature) components of every sample in `signal`.

    Each sample ``I + jQ`` becomes ``-I - jQ``. The operation is element-wise, so any number of channels is
    supported.

    :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N
        is the length of the IQ examples.
    :type signal: array_like or utils.data.Recording

    :raises ValueError: If `signal` is not CxN complex.

    :return: A new CxN numpy array holding the negated samples. If `signal` is a `Recording`, returns a
        `Recording` carrying the negated array as its data and the metadata of `signal`.
    :rtype: np.ndarray or utils.data.Recording

    >>> rec = Recording(data=[[4-3j, -5-2j, -9+1j]])
    >>> new_rec = amplitude_reversal(rec)
    >>> new_rec.data
    array([[-4.+3.j, 5.+2.j, 9.-1.j]])
    """
    if isinstance(signal, Recording):
        data = signal.data
    else:
        data = np.asarray(signal)

    if data.ndim != 2 or not np.iscomplexobj(data):
        raise ValueError("signal must be CxN complex.")

    # Negating a complex number flips the sign of both its real and imaginary parts.
    negated = np.negative(data)

    if isinstance(signal, Recording):
        return Recording(data=negated, metadata=signal.metadata)
    return negated
def drop_samples(  # noqa: C901  # TODO: Simplify function
    signal: ArrayLike | Recording, max_section_size: Optional[int] = 2, fill_type: Optional[str] = "zeros"
) -> np.ndarray | Recording:
    """Randomly drop sections of IQ samples from `signal` and replace them with a fill value.

    Random, non-overlapping sections of at most `max_section_size` samples are chosen from left to right; the
    samples of each section are overwritten with a single value determined by `fill_type`. At least one section
    is always dropped. The input is never modified: all work is done on a copy.

    :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N
        is the length of the IQ examples. Only C == 1 is currently supported.
    :type signal: array_like or utils.data.Recording
    :param max_section_size: Maximum allowable size of a dropped section. Default is 2.
    :type max_section_size: int, optional
    :param fill_type: Fill option used to replace a dropped section. Default is "zeros".

        "back-fill": replace the section with the sample immediately before it (or its own first sample if
        the section starts at index 0).

        "front-fill": replace the section with the sample immediately after it (or its own last sample if
        the section ends at index N-1).

        "mean": replace the section with the mean of the entire original signal.

        "zeros": replace the section with the constant 0+0j.
    :type fill_type: str, optional

    :raises ValueError: If `signal` is not CxN complex.
    :raises ValueError: If `max_section_size` is less than 1 or greater than or equal to the length of `signal`.
    :raises NotImplementedError: If `signal` has more than one channel.
    :raises ValueError: If `fill_type` is not one of the options above.

    :return: A new CxN numpy array with the dropped sections replaced. If `signal` is a `Recording`, returns a
        `Recording` carrying that array as its data and the metadata of `signal`.
    :rtype: np.ndarray or utils.data.Recording

    >>> rec = Recording(data=[[2+5j, 1+8j, 6+4j, 3+7j, 4+9j]])
    >>> new_rec = drop_samples(rec)
    >>> new_rec.data.shape
    (1, 5)
    """
    if isinstance(signal, Recording):
        data = signal.data
    else:
        data = np.asarray(signal)

    if data.ndim != 2 or not np.iscomplexobj(data):
        raise ValueError("signal must be CxN complex.")
    c, n = data.shape

    if max_section_size < 1 or max_section_size >= n:
        raise ValueError("max_section_size must be at least 1 and must be less than the length of signal.")

    if c != 1:
        raise NotImplementedError

    # Reject an unknown fill type before any random numbers are drawn or any sample is touched.
    if fill_type not in {"back-fill", "front-fill", "mean", "zeros"}:
        raise ValueError(f"fill_type {fill_type} not recognized.")

    # Work on a private, writable copy: np.asarray / np.squeeze only return views, so writing through them
    # would silently modify the caller's array and fails outright on read-only data.
    samples = np.array(data[0])
    mean = np.mean(samples) if fill_type == "mean" else None

    # start/end are inclusive sample indices of the current section; each new section begins after the last.
    start = end = -1
    while start < n:
        # The first draw is bounded so that the first section always fits, i.e. at least one drop occurs.
        start = np.random.randint(end + 1, end + n - max_section_size + 2)
        end = np.random.randint(start, start + max_section_size)

        if end > n - 1:  # The section would run past the end of the signal: stop.
            break

        if fill_type == "back-fill":
            fill = samples[start - 1] if start > 0 else samples[start]
        elif fill_type == "front-fill":
            fill = samples[end + 1] if end < n - 1 else samples[end]
        elif fill_type == "mean":
            fill = mean
        else:
            fill = 0 + 0j

        samples[start : end + 1] = fill

    dropped = samples.reshape(c, n)

    if isinstance(signal, Recording):
        return Recording(data=dropped, metadata=signal.metadata)
    return dropped
def quantize_tape(
    signal: ArrayLike | Recording, bin_number: Optional[int] = 4, rounding_type: Optional[str] = "floor"
) -> np.ndarray | Recording:
    """Quantize all IQ samples of `signal` onto a small number of evenly spaced levels.

    The range between the smallest and largest I/Q value (taken jointly over both components) is divided into
    `bin_number` equal bins, and every I and Q value is replaced by the lower or upper edge of the bin it falls
    in. This emulates a coarse analog-to-digital converter; ``log2(bin_number)`` is the equivalent bit depth.

    :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N
        is the length of the IQ examples. Only C == 1 is currently supported.
    :type signal: array_like or utils.data.Recording
    :param bin_number: The number of bins the value range is divided into. Default is 4.
    :type bin_number: int, optional
    :param rounding_type: "floor" maps a value to the lower edge of its bin, "ceiling" to the upper edge.
        Default is "floor".
    :type rounding_type: str, optional

    :raises ValueError: If `signal` is not CxN complex.
    :raises UserWarning: If `rounding_type` is neither "floor" nor "ceiling". The warning is raised as an
        exception, so no quantization takes place in that case.
    :raises NotImplementedError: If `signal` has more than one channel.

    :return: A 1xN numpy array of quantized samples. If `signal` is a `Recording`, returns a `Recording` built
        from the quantized samples and the metadata of `signal`.
    :rtype: np.ndarray or utils.data.Recording

    >>> rec = Recording(data=[[1+1j, 4+4j, 1+2j, 1+4j]])
    >>> new_rec = quantize_tape(rec)
    >>> new_rec.data
    array([[1.+1.j, 3.25+3.25j, 1.+1.75j, 1.+3.25j]])
    """
    data = signal.data if isinstance(signal, Recording) else np.asarray(signal)

    if not (data.ndim == 2 and np.iscomplexobj(data)):
        raise ValueError("signal must be CxN complex.")
    c, n = data.shape

    if rounding_type not in {"ceiling", "floor"}:
        raise UserWarning('rounding_type must be either "floor" or "ceiling", floor has been selected by default')

    if c != 1:
        raise NotImplementedError

    # Row 0 holds I, row 1 holds Q; both share one set of bin edges spanning the joint value range.
    iq_rows = convert_to_2xn(data)
    highest, lowest = iq_rows.max(), iq_rows.min()
    edges = np.linspace(lowest, highest, bin_number + 1)

    # With right=True only values equal to the minimum land in bin 0; fold those into the first real bin.
    bin_index = np.maximum(np.digitize(iq_rows, edges, right=True), 1)

    # Bin k spans edges[k - 1] .. edges[k]: the upper edge for "ceiling", the lower edge for "floor".
    step_down = 0 if rounding_type == "ceiling" else 1
    levels = edges[bin_index - step_down]

    new_data = levels[0] + 1j * levels[1]

    if isinstance(signal, Recording):
        return Recording(data=new_data, metadata=signal.metadata)
    return new_data.reshape(c, n)
def quantize_parts(
    signal: ArrayLike | Recording,
    max_section_size: Optional[int] = 2,
    bin_number: Optional[int] = 4,
    rounding_type: Optional[str] = "floor",
) -> np.ndarray | Recording:
    """Quantize random sections of the IQ samples of `signal` onto a small number of evenly spaced levels.

    The quantization levels are derived exactly as for a full quantization: the range between the smallest and
    largest I/Q value is divided into `bin_number` equal bins. Random, non-overlapping sections of at most
    `max_section_size` samples are then chosen from left to right, and only the samples inside those sections
    are replaced by the lower or upper edge of their bin. ``log2(bin_number)`` is the equivalent bit depth.

    :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N
        is the length of the IQ examples. Only C == 1 is currently supported.
    :type signal: array_like or utils.data.Recording
    :param max_section_size: Maximum allowable size of a quantized section. Default is 2.
    :type max_section_size: int, optional
    :param bin_number: The number of bins the value range is divided into. Default is 4.
    :type bin_number: int, optional
    :param rounding_type: "floor" maps a value to the lower edge of its bin, "ceiling" to the upper edge.
        Default is "floor".
    :type rounding_type: str, optional

    :raises ValueError: If `signal` is not CxN complex.
    :raises UserWarning: If `rounding_type` is neither "floor" nor "ceiling". The warning is raised as an
        exception, so no quantization takes place in that case.
    :raises NotImplementedError: If `signal` has more than one channel.

    :return: A 1xN numpy array with the selected sections quantized. If `signal` is a `Recording`, returns a
        `Recording` built from those samples and the metadata of `signal`.
    :rtype: np.ndarray or utils.data.Recording

    >>> rec = Recording(data=[[2+5j, 1+8j, 6+4j, 3+7j, 4+9j]])
    >>> new_rec = quantize_parts(rec)
    >>> new_rec.data.shape
    (1, 5)
    """
    data = signal.data if isinstance(signal, Recording) else np.asarray(signal)

    if not (data.ndim == 2 and np.iscomplexobj(data)):
        raise ValueError("signal must be CxN complex.")
    c, n = data.shape

    if rounding_type not in {"ceiling", "floor"}:
        raise UserWarning('rounding_type must be either "floor" or "ceiling", floor has been selected by default')

    if c != 1:
        raise NotImplementedError

    # Row 0 holds I, row 1 holds Q; both share one set of bin edges spanning the joint value range.
    iq_rows = convert_to_2xn(data)
    highest, lowest = iq_rows.max(), iq_rows.min()
    edges = np.linspace(lowest, highest, bin_number + 1)

    # With right=True only values equal to the minimum land in bin 0; fold those into the first real bin.
    bin_index = np.maximum(np.digitize(iq_rows, edges, right=True), 1)

    # Quantized value of every sample, computed from the untouched input; sections are copied from here.
    step_down = 0 if rounding_type == "ceiling" else 1
    levels = edges[bin_index - step_down]

    # first/last are inclusive sample indices of the current section; each new section begins after the last.
    first = last = -1
    while first < n:
        first = np.random.randint(last + 1, last + n - max_section_size + 2)
        last = np.random.randint(first, first + max_section_size)

        if last > n - 1:  # The section would run past the end of the signal: stop.
            break

        iq_rows[:, first : last + 1] = levels[:, first : last + 1]

    quantized_data = iq_rows[0] + 1j * iq_rows[1]

    if isinstance(signal, Recording):
        return Recording(data=quantized_data, metadata=signal.metadata)
    return quantized_data.reshape(c, n)
def magnitude_rescale(
    signal: ArrayLike | Recording,
    starting_bounds: Optional[tuple] = None,
    max_magnitude: Optional[int] = 1,
) -> np.ndarray | Recording:
    """Multiply the tail of `signal` by a random constant, starting from a random sample index.

    A starting index is drawn uniformly from `starting_bounds` (inclusive) and a scale factor is drawn
    uniformly from ``[0, max_magnitude)``. Every sample from the starting index to the end of the signal is
    multiplied by that factor; the samples before it are left unchanged.

    :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N
        is the length of the IQ examples. Only C == 1 is currently supported.
    :type signal: array_like or utils.data.Recording
    :param starting_bounds: Inclusive ``(lowest, highest)`` indices between which the starting index is drawn.
        Default is None, in which case the bounds become ``(random index, N - 1)``.
    :type starting_bounds: tuple, optional
    :param max_magnitude: Upper limit (exclusive) of the random scale factor. Default is 1.
    :type max_magnitude: int, optional

    :raises ValueError: If `signal` is not CxN complex.
    :raises ValueError: If `starting_bounds` reaches below index 0 or above index N - 1.
    :raises NotImplementedError: If `signal` has more than one channel.

    :return: A 1xN numpy array with the samples after the starting index rescaled. If `signal` is a
        `Recording`, returns a `Recording` built from those samples and the metadata of `signal`.
    :rtype: np.ndarray or utils.data.Recording

    >>> rec = Recording(data=[[2+5j, 1+8j, 6+4j, 3+7j, 4+9j]])
    >>> new_rec = magnitude_rescale(rec)
    >>> new_rec.data.shape
    (1, 5)
    """
    data = signal.data if isinstance(signal, Recording) else np.asarray(signal)

    if not (data.ndim == 2 and np.iscomplexobj(data)):
        raise ValueError("signal must be CxN complex.")
    c, n = data.shape

    # Without explicit bounds, the lower bound is itself random and the upper bound is the last sample.
    if starting_bounds is None:
        starting_bounds = (np.random.randint(0, n), n - 1)

    lowest, highest = starting_bounds[0], starting_bounds[1]
    if not (lowest >= 0 and highest <= n - 1):
        raise ValueError("starting_bounds must be valid indices for the dataset.")

    if c != 1:
        raise NotImplementedError

    flat = np.squeeze(data)
    cut = np.random.randint(lowest, highest + 1)  # +1 because randint excludes its upper limit
    gain = np.random.rand() * max_magnitude

    # Untouched head followed by the rescaled tail.
    rescaled_data = np.concatenate((flat[:cut], flat[cut:] * gain))

    if isinstance(signal, Recording):
        return Recording(data=rescaled_data, metadata=signal.metadata)
    return rescaled_data.reshape(c, n)
def cut_out(  # noqa: C901  # TODO: Simplify function
    signal: ArrayLike | Recording, max_section_size: Optional[int] = 3, fill_type: Optional[str] = "ones"
) -> np.ndarray | Recording:
    """Cut random sections out of `signal` and replace them with constants or with noise.

    Random, non-overlapping sections of at most `max_section_size` samples are chosen from left to right and
    overwritten with either 0s, 1s, or additive white Gaussian noise (AWGN) generated at a low, average, or
    high signal-to-noise ratio (SNR) relative to the section being replaced. At least one section is always
    cut out. The input is never modified: all work is done on a copy.

    :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N
        is the length of the IQ examples. Only C == 1 is currently supported.
    :type signal: array_like or utils.data.Recording
    :param max_section_size: Maximum allowable size of a cut-out section. Default is 3.
    :type max_section_size: int, optional
    :param fill_type: Fill option used to replace a cut-out section. Default is "ones".

        "zeros": replace the section with 0+0j.

        "ones": replace the section with 1+1j.

        "low-snr": replace the section with AWGN at an SNR of 0.5.

        "avg-snr": replace the section with AWGN at an SNR of 1.

        "high-snr": replace the section with AWGN at an SNR of 2.
    :type fill_type: str, optional

    :raises ValueError: If `signal` is not CxN complex.
    :raises UserWarning: If `fill_type` is not one of the options above. The warning is raised as an exception,
        so nothing is cut out in that case.
    :raises ValueError: If `max_section_size` is less than 1 or greater than or equal to the length of `signal`.
    :raises NotImplementedError: If `signal` has more than one channel.

    :return: A new CxN numpy array with the cut-out sections replaced according to `fill_type`. If `signal` is
        a `Recording`, returns a `Recording` carrying that array as its data and the metadata of `signal`.
    :rtype: np.ndarray or utils.data.Recording

    >>> rec = Recording(data=[[2+5j, 1+8j, 6+4j, 3+7j, 4+9j]])
    >>> new_rec = cut_out(rec)
    >>> new_rec.data.shape
    (1, 5)
    """
    if isinstance(signal, Recording):
        data = signal.data
    else:
        data = np.asarray(signal)

    if data.ndim != 2 or not np.iscomplexobj(data):
        raise ValueError("signal must be CxN complex.")
    c, n = data.shape

    if fill_type not in {"zeros", "ones", "low-snr", "avg-snr", "high-snr"}:
        raise UserWarning(
            """fill_type must be "zeros", "ones", "low-snr", "avg-snr", or "high-snr",
            "ones" has been selected by default"""
        )

    if max_section_size < 1 or max_section_size >= n:
        raise ValueError("max_section_size must be at least 1 and must be less than the length of signal.")

    if c != 1:
        raise NotImplementedError

    # SNR passed to generate_awgn for each of the noise fill types.
    noise_snr = {"low-snr": 0.5, "avg-snr": 1, "high-snr": 2}

    # Work on a private, writable copy: np.asarray / np.squeeze only return views, so writing through them
    # would silently modify the caller's array and fails outright on read-only data.
    samples = np.array(data[0])

    # start/end are inclusive sample indices of the current section; each new section begins after the last.
    start = end = -1
    while start < n:
        # The first draw is bounded so that the first section always fits, i.e. at least one cut occurs.
        start = np.random.randint(end + 1, end + n - max_section_size + 2)
        end = np.random.randint(start, start + max_section_size)

        if end > n - 1:  # The section would run past the end of the signal: stop.
            break

        if fill_type == "zeros":
            fill = 0 + 0j
        elif fill_type == "ones":
            fill = 1 + 1j
        else:
            # Noise is scaled relative to the samples it replaces; generate_awgn expects and returns a 1xK
            # array, so wrap the section in a list and take the single row back out.
            fill = generate_awgn([samples[start : end + 1]], noise_snr[fill_type])[0]

        samples[start : end + 1] = fill

    result = samples.reshape(c, n)

    if isinstance(signal, Recording):
        return Recording(data=result, metadata=signal.metadata)
    return result
def patch_shuffle(signal: ArrayLike | Recording, max_patch_size: Optional[int] = 3) -> np.ndarray | Recording:
    """Shuffle the IQ samples inside randomly chosen patches of `signal`.

    Random, non-overlapping patches of at most `max_patch_size` samples are chosen from left to right. Inside
    each patch the I (real) values and the Q (imaginary) values are shuffled independently of each other, so
    samples in a patch may end up with new I/Q pairings. The input is never modified: all work is done on a
    copy.

    :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N
        is the length of the IQ examples. Only C == 1 is currently supported.
    :type signal: array_like or utils.data.Recording
    :param max_patch_size: Maximum allowable size of a shuffled patch. Default is 3.
    :type max_patch_size: int, optional

    :raises ValueError: If `signal` is not CxN complex.
    :raises ValueError: If `max_patch_size` is less than or equal to 1 or greater than the length of `signal`.
    :raises NotImplementedError: If `signal` has more than one channel.

    :return: A new CxN numpy array with randomly shuffled patches. If `signal` is a `Recording`, returns a
        `Recording` carrying the shuffled array as its data and the metadata of `signal`.
    :rtype: np.ndarray or utils.data.Recording

    >>> rec = Recording(data=[[2+5j, 1+8j, 6+4j, 3+7j, 4+9j]])
    >>> new_rec = patch_shuffle(rec)
    >>> new_rec.data.shape
    (1, 5)
    """
    # np.array always copies. The copy is needed for both input kinds: Recording data cannot be shuffled in
    # place because it is read-only, and a plain array passed by the caller must not be modified behind
    # their back.
    if isinstance(signal, Recording):
        data = np.array(signal.data)
    else:
        data = np.array(signal)

    if data.ndim != 2 or not np.iscomplexobj(data):
        raise ValueError("signal must be CxN complex.")
    c, n = data.shape

    if max_patch_size > n or max_patch_size <= 1:
        raise ValueError("max_patch_size must be less than or equal to the length of signal and greater than 1.")

    if c != 1:
        raise NotImplementedError

    samples = data[0]  # a view: shuffling it updates `data` in place

    # start/end are inclusive sample indices of the current patch; each new patch begins after the last.
    start = end = -1
    while start < n:
        start = np.random.randint(end + 1, end + n - max_patch_size + 2)
        end = np.random.randint(start, start + max_patch_size)

        if end > n - 1:  # The patch would run past the end of the signal: stop.
            break

        # .real and .imag are views onto the complex buffer, so the in-place shuffles write straight through.
        np.random.shuffle(samples.real[start : end + 1])
        np.random.shuffle(samples.imag[start : end + 1])

    if isinstance(signal, Recording):
        return Recording(data=data, metadata=signal.metadata)
    return data
+""" + +from typing import Optional + +import numpy as np +from numpy.typing import ArrayLike +from scipy.signal import resample_poly + +from utils.data import Recording +from utils.transforms import iq_augmentations + + +def add_awgn_to_signal(signal: ArrayLike | Recording, snr: Optional[float] = 1) -> np.ndarray | Recording: + """Generates additive white gaussian noise (AWGN) relative to the signal-to-noise ratio (SNR) of the + provided `signal` array or `Recording`. + + This function calculates the root mean squared (RMS) power of `signal` and then finds the RMS power of the noise + which matches the specified SNR. Then, the AWGN is generated after calculating the variance and randomly + calculating the amplitude and phase of the noise. Then, this generated AWGN is added to the original signal and + returned. + + :param signal: Input IQ data as a complex ``C x N`` array or `Recording`, where ``C`` is the number of channels + and ``N`` is the length of the IQ examples. + :type signal: array_like or utils.data.Recording + :param snr: The signal-to-noise ratio in dB. Default is 1. + :type snr: float, optional + + :raises ValueError: If `signal` is not CxN complex. + + :return: A numpy array which is the sum of the noise (which matches the SNR) and the original signal. If `signal` + is a `Recording`, returns a `Recording object` with its `data` attribute containing the noisy signal array. 
def add_awgn_to_signal(signal: ArrayLike | Recording, snr: Optional[float] = 1) -> np.ndarray | Recording:
    """Add additive white Gaussian noise (AWGN) to `signal` at the given signal-to-noise ratio (SNR).

    The noise is produced by ``iq_augmentations.generate_awgn`` relative to the power of `signal` and then
    summed with the original samples.

    :param signal: Input IQ data as a complex ``C x N`` array or `Recording`, where ``C`` is the number of
        channels and ``N`` is the length of the IQ examples.
    :type signal: array_like or utils.data.Recording
    :param snr: The signal-to-noise ratio in dB. Default is 1.
    :type snr: float, optional

    :raises ValueError: If `signal` is not CxN complex.

    :return: A numpy array which is the sum of the original signal and the generated noise. If `signal` is a
        `Recording`, returns a `Recording` carrying the noisy array as its data and the metadata of `signal`.
    :rtype: np.ndarray or utils.data.Recording

    >>> rec = Recording(data=[[1+1j, 2+2j]])
    >>> new_rec = add_awgn_to_signal(rec)
    >>> new_rec.data.shape
    (1, 2)
    """
    if isinstance(signal, Recording):
        data = signal.data
    else:
        data = np.asarray(signal)

    if data.ndim != 2 or not np.iscomplexobj(data):
        raise ValueError("signal must be CxN complex.")

    # Pass the raw array (not the Recording) so that a plain array of noise comes back.
    noise = iq_augmentations.generate_awgn(signal=data, snr=snr)
    noisy_signal = data + noise

    if isinstance(signal, Recording):
        return Recording(data=noisy_signal, metadata=signal.metadata)
    return noisy_signal
def time_shift(signal: ArrayLike | Recording, shift: Optional[int] = 1) -> np.ndarray | Recording:
    """Apply a time shift to a signal.

    Samples are moved `shift` positions along the time axis: to the right for a positive shift, to the left
    for a negative one. Samples pushed past either end are discarded and the vacated positions are filled with
    zeros, so the output has the same shape as the input. A shift of 0 returns an unchanged copy. Every
    channel is shifted by the same amount.

    :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N
        is the length of the IQ examples.
    :type signal: array_like or utils.data.Recording
    :param shift: The number of indices to shift by. Default is 1.
    :type shift: int, optional

    :raises ValueError: If `signal` is not CxN complex.
    :raises UserWarning: If `shift` is greater than the length of `signal`.

    :return: A new CxN numpy array holding the time-shifted signal. If `signal` is a `Recording`, returns a
        `Recording` carrying the shifted array as its data and the metadata of `signal`.
    :rtype: np.ndarray or utils.data.Recording

    >>> rec = Recording(data=[[1+1j, 2+2j, 3+3j, 4+4j, 5+5j]])
    >>> new_rec = time_shift(rec, -2)
    >>> new_rec.data
    array([[3.+3.j, 4.+4.j, 5.+5.j, 0.+0.j, 0.+0.j]])
    """
    if isinstance(signal, Recording):
        data = signal.data
    else:
        data = np.asarray(signal)

    if data.ndim != 2 or not np.iscomplexobj(data):
        raise ValueError("signal must be CxN complex.")
    n = data.shape[1]

    if shift > n:
        raise UserWarning("shift is greater than signal length")

    shifted_data = np.zeros_like(data)

    if shift > 0:
        # Shift right: drop the last `shift` samples. The stop index is written as n - shift rather than
        # -shift so that it stays well-defined at the boundaries.
        shifted_data[:, shift:] = data[:, : n - shift]
    elif shift < 0:
        # Shift left: drop the first |shift| samples. Both slices are empty once |shift| >= n, which leaves
        # the all-zero result.
        shifted_data[:, :shift] = data[:, -shift:]
    else:
        # shift == 0 needs its own branch: data[:, :-0] is data[:, :0], an empty slice, not the whole row.
        shifted_data[...] = data

    if isinstance(signal, Recording):
        return Recording(data=shifted_data, metadata=signal.metadata)
    return shifted_data
def frequency_shift(signal: ArrayLike | Recording, shift: Optional[float] = 0.5) -> np.ndarray | Recording:
    """Apply a frequency shift to a signal.

    Sample ``k`` of every channel is rotated by the phase ``2 * pi * shift * k``, which moves the whole
    spectrum by `shift` times the sample rate.

    .. note::

        The frequency shift is applied relative to the sample rate.

    :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N
        is the length of the IQ examples.
    :type signal: array_like or utils.data.Recording
    :param shift: The frequency shift relative to the sample rate. Must be in the range ``[-0.5, 0.5]``.
        Default is 0.5.
    :type shift: float, optional

    :raises ValueError: If the provided frequency shift is not in the range ``[-0.5, 0.5]``.
    :raises ValueError: If `signal` is not CxN complex.

    :return: A new CxN numpy array holding the frequency-shifted signal. If `signal` is a `Recording`, returns
        a `Recording` carrying the shifted array as its data and the metadata of `signal`.
    :rtype: np.ndarray or utils.data.Recording

    >>> rec = Recording(data=[[1+1j, 2+2j, 3+3j, 4+4j]])
    >>> new_rec = frequency_shift(rec, -0.4)
    >>> new_rec.data
    array([[1+1j, -0.44246348-2.79360449j, -1.92611857+3.78022053j, 5.04029404-2.56815809j]])
    """
    if shift > 0.5 or shift < -0.5:
        raise ValueError("Frequency shift must be in the range [-0.5, 0.5]")

    if isinstance(signal, Recording):
        data = signal.data
    else:
        data = np.asarray(signal)

    if data.ndim != 2 or not np.iscomplexobj(data):
        raise ValueError("signal must be CxN complex.")
    n = data.shape[1]

    # One phase ramp over the N sample indices; it broadcasts across all C channels below.
    ramp = 2.0 * np.pi * shift * np.arange(n)
    cos_ramp, sin_ramp = np.cos(ramp), np.sin(ramp)

    # (I + jQ) * (cos + j sin), written out per component.
    shifted_data = np.zeros_like(data)
    shifted_data.real = data.real * cos_ramp - data.imag * sin_ramp
    shifted_data.imag = data.real * sin_ramp + data.imag * cos_ramp

    if isinstance(signal, Recording):
        return Recording(data=shifted_data, metadata=signal.metadata)
    return shifted_data
def phase_shift(signal: ArrayLike | Recording, phase: Optional[float] = np.pi) -> np.ndarray | Recording:
    """Apply a phase shift to a signal.

    Every sample of every channel is multiplied by ``exp(1j * phase)``, i.e. rotated by `phase` radians in the
    IQ plane.

    :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N
        is the length of the IQ examples.
    :type signal: array_like or utils.data.Recording
    :param phase: The phase angle by which to rotate the IQ samples, in radians. Must be in the range
        ``[-π, π]``. Default is π.
    :type phase: float, optional

    :raises ValueError: If the provided phase rotation is not in the range ``[-π, π]``.
    :raises ValueError: If `signal` is not CxN complex.

    :return: A new CxN numpy array holding the phase-shifted signal. If `signal` is a `Recording`, returns a
        `Recording` carrying the shifted array as its data and the metadata of `signal`.
    :rtype: np.ndarray or utils.data.Recording

    >>> rec = Recording(data=[[1+1j, 2+2j, 3+3j, 4+4j]])
    >>> new_rec = phase_shift(rec, np.pi/2)
    >>> new_rec.data
    array([[-1+1j, -2+2j, -3+3j, -4+4j]])
    """
    if phase > np.pi or phase < -np.pi:
        raise ValueError("Phase rotation must be in the range [-π, π]")

    if isinstance(signal, Recording):
        data = signal.data
    else:
        data = np.asarray(signal)

    if data.ndim != 2 or not np.iscomplexobj(data):
        raise ValueError("signal must be CxN complex.")

    # A single complex rotation factor applies to every element, whatever the number of channels.
    shifted_data = data * np.exp(1j * phase)

    if isinstance(signal, Recording):
        return Recording(data=shifted_data, metadata=signal.metadata)
    return shifted_data
def iq_imbalance(
    signal: ArrayLike | Recording,
    amplitude_imbalance: Optional[float] = 1.5,
    phase_imbalance: Optional[float] = np.pi,
    dc_offset: Optional[float] = 1.5,
) -> np.ndarray | Recording:
    """Apply an IQ imbalance to a signal.

    Three stages are applied in order. First the I component is scaled up and the Q component scaled down by
    half of `amplitude_imbalance` (in dB) each. Then I and Q are each rotated by half of `phase_imbalance`, in
    opposite directions. Finally a copy of the result scaled by ``10 ** (dc_offset / 20)`` is added to it.

    .. note::

        Based on MathWorks' I/Q Imbalance block.

    NOTE(review): the last stage adds a scaled copy of the signal, which amounts to an overall gain of
    ``1 + 10 ** (dc_offset / 20)`` rather than a constant DC offset — confirm this is the intended model.

    :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N
        is the length of the IQ examples. Only C == 1 is currently supported.
    :type signal: array_like or utils.data.Recording
    :param amplitude_imbalance: The IQ amplitude imbalance to apply, in dB. Default is 1.5.
    :type amplitude_imbalance: float, optional
    :param phase_imbalance: The IQ phase imbalance to apply, in radians. Must be in the range ``[-π, π]``.
        Default is π.
    :type phase_imbalance: float, optional
    :param dc_offset: The IQ DC offset to apply, in dB. Default is 1.5.
    :type dc_offset: float, optional

    :raises ValueError: If the phase imbalance is not in the range ``[-π, π]``.
    :raises ValueError: If `signal` is not CxN complex.
    :raises NotImplementedError: If `signal` has more than one channel.

    :return: A numpy array holding the signal with the IQ imbalance applied. If `signal` is a `Recording`,
        returns a `Recording` carrying that array as its data and the metadata of `signal`.
    :rtype: np.ndarray or utils.data.Recording

    >>> rec = Recording(data=[[2+18j, -34+2j, 3+9j]])
    >>> new_rec = iq_imbalance(rec, 1, np.pi, 2)
    >>> new_rec.data
    array([[-38.38613587-4.78555031j, -4.26512621+81.35435535j, -19.19306793-7.17832547j]])
    """
    if phase_imbalance > np.pi or phase_imbalance < -np.pi:
        raise ValueError("Phase imbalance must be in the range [-π, π].")

    data = signal.data if isinstance(signal, Recording) else np.asarray(signal)

    if not (data.ndim == 2 and np.iscomplexobj(data)):
        raise ValueError("signal must be CxN complex.")
    c, n = data.shape

    if c != 1:
        raise NotImplementedError

    # Stage 1 - amplitude: half of the imbalance (in dB) boosts I, the other half attenuates Q.
    half_imbalance_db = 0.5 * amplitude_imbalance / 20.0
    i_gain = 10**half_imbalance_db
    q_gain = 10 ** (-half_imbalance_db)
    scaled = i_gain * data.real + 1j * q_gain * data.imag

    # Stage 2 - phase: I is rotated by -phase/2, Q (already 90 degrees ahead) by +phase/2.
    i_rotation = np.exp(-1j * phase_imbalance / 2.0)
    q_rotation = np.exp(1j * (np.pi / 2.0 + phase_imbalance / 2.0))
    rotated = i_rotation * scaled.real + q_rotation * scaled.imag

    # Stage 3 - "DC offset": add a copy of the signal scaled by the linear value of dc_offset.
    dc_gain = 10 ** (dc_offset / 20.0)
    imbalanced_data = rotated + (dc_gain * rotated.real + 1j * dc_gain * rotated.imag)

    if isinstance(signal, Recording):
        return Recording(data=imbalanced_data, metadata=signal.metadata)
    return imbalanced_data
+ + Uses scipy.signal.resample_poly to upsample the signal by the + factor *up*, apply a zero-phase low-pass FIR filter, and downsample the + signal by the factor *down*. + + :param signal: Input IQ data as a complex CxN array or `Recording`, where C is the number of channels and N + is the length of the IQ examples. + :type signal: array_like or utils.data.Recording + :param up: The upsampling factor. Default is 4. + :type up: int, optional + :param down: The downsampling factor. Default is 2. + :type down: int, optional + + :raises ValueError: If `signal` is not CxN complex. + + :return: A numpy array which represents the resampled signal If `signal` is a `Recording`, + returns a `Recording object` with its `data` attribute containing the resampled array. + :rtype: np.ndarray or utils.data.Recording + + >>> rec = Recording(data=[[1+1j, 2+2j]]) + >>> new_rec = resample(rec, 2, 1) + >>> new_rec.data + array([[1.00051747+1.00051747j, 1.90020207+1.90020207j]]) + """ + # TODO: Additional info needs to be added to docstring description + + if isinstance(signal, Recording): + data = signal.data + else: + data = np.asarray(signal) + + if data.ndim == 2 and np.iscomplexobj(data): + c, n = data.shape + else: + raise ValueError("signal must be CxN complex.") + + if c == 1: + data = np.squeeze(data) + resampled_iqdata = resample_poly(x=data, up=up, down=down) + + # Reshape array so that slicing operations work on resampled data + resampled_iqdata = np.reshape(resampled_iqdata, newshape=(1, len(resampled_iqdata))) + + if resampled_iqdata.shape[1] > n: + resampled_iqdata = resampled_iqdata[:, :n] + + else: + empty_array = np.zeros(resampled_iqdata.shape, dtype=resampled_iqdata.dtype) + empty_array[:, : resampled_iqdata.shape[1]] = resampled_iqdata + else: + raise NotImplementedError + + if isinstance(signal, Recording): + return Recording(data=resampled_iqdata, metadata=signal.metadata) + else: + return resampled_iqdata diff --git a/src/ria_toolkit/utils/__init__.py 
"""
IQ data represents the in-phase (I) and quadrature (Q) components of a signal. There are two ways to represent
single-channel IQ signals:

#. **Complex 1xN Format:** In the complex 1xN format, the IQ data is represented as a 2D array of complex numbers with
   shape 1xN. In this format, the real part of each complex number represents the in-phase component, while the
   imaginary part represents the quadrature component.
#. **Real 2xN Format:** In the real 2xN format, the IQ data is represented as a 2D array of real numbers with shape
   2xN. In this format, the first row contains the in-phase components, while the second row contains the quadrature
   components.

This submodule provides functions to verify and convert between these two formats.
"""

import numpy as np
from numpy.typing import ArrayLike


def convert_to_2xn(arr: np.ndarray) -> np.ndarray:
    """Convert arr to the real 2xN format. If arr is already real 2xN, then you'll get back a copy.

    :param arr: Array of IQ samples, in the complex 1xN format.
    :type arr: array_like

    :raises ValueError: If arr is neither complex 1xN nor real 2xN.

    :return: The provided signal, in the real 2xN format.
    :rtype: np.ndarray
    """
    # Normalise once so nested lists and ndarrays take the same path.
    a = np.asarray(arr)

    if is_1xn(a):
        # Row 0 holds the complex samples; split it into an I row and a Q row.
        return np.vstack((np.real(a[0]), np.imag(a[0])))

    if is_2xn(a):
        return np.copy(a)

    raise ValueError("arr is neither complex 1xN nor real 2xN.")


def convert_to_1xn(arr: np.ndarray) -> np.ndarray:
    """Convert arr to the complex 1xN format. If arr is already complex 1xN, then you'll get back a copy.

    :param arr: Array of IQ samples, in the real 2xN format.
    :type arr: array_like

    :raises ValueError: If arr is neither complex 1xN nor real 2xN.

    :return: The provided signal, in the complex 1xN format.
    :rtype: np.ndarray
    """
    # Convert before indexing: is_2xn accepts any array-like, but a nested list does not support the
    # two-axis indexing used below.
    a = np.asarray(arr)

    if is_2xn(a):
        # Row 0 is I, row 1 is Q; combine them and restore the leading channel axis.
        return np.expand_dims(a=a[0, :] + 1j * a[1, :], axis=0)

    if is_1xn(a):
        return np.copy(a)

    raise ValueError("arr is neither complex 1xN nor real 2xN.")


def is_1xn(arr: ArrayLike) -> bool:
    """Check whether arr is in the complex 1xN format.

    :param arr: The array to check.
    :type arr: array_like

    :return: True if arr is complex 1xN, False otherwise.
    :rtype: bool
    """
    a = np.asarray(arr)
    return bool(a.ndim == 2 and a.shape[0] == 1 and np.iscomplexobj(a))


def is_2xn(arr: ArrayLike) -> bool:
    """Check whether arr is in the real 2xN format.

    :param arr: The array to check.
    :type arr: array_like

    :return: True if arr is real 2xN, False otherwise.
    :rtype: bool
    """
    a = np.asarray(arr)
    return bool(a.ndim == 2 and a.shape[0] == 2 and not np.iscomplexobj(a))
import numpy as np
import plotly.graph_objects as go
import scipy.signal as signal
from plotly.graph_objs import Figure
from scipy.fft import fft, fftshift

from utils.data import Recording


def _scale_to_unit_range(values: np.ndarray) -> np.ndarray:
    """Linearly map ``values`` onto [0, 1]; a constant input maps to all zeros instead of dividing by zero."""
    shifted = values - np.min(values)
    span = np.max(shifted)
    if span == 0:
        return np.zeros_like(shifted)
    return shifted / span


def spectrogram(rec: Recording, thumbnail: bool = False) -> Figure:
    """Create a spectrogram for the recording.

    Only the first channel of the recording is plotted. The sample rate is read from the ``sample_rate``
    metadata entry and defaults to 1 when absent.

    :param rec: Signal to plot.
    :type rec: utils.data.Recording
    :param thumbnail: Whether to return a small thumbnail version or full plot.
    :type thumbnail: bool

    :return: Spectrogram, as a Plotly figure.
    """
    complex_signal = rec.data[0]
    sample_rate = int(rec.metadata.get("sample_rate", 1))
    plot_length = len(complex_signal)

    # Determine FFT size: longer signals get finer frequency resolution.
    if plot_length < 2000:
        fft_size = 64
    elif plot_length < 10000:
        fft_size = 256
    elif plot_length < 1000000:
        fft_size = 1024
    else:
        fft_size = 2048

    # Clamp the segment length to the signal length so very short signals keep noverlap < nperseg.
    # For signals of at least 64 samples this equals fft_size, and noverlap equals fft_size // 8.
    nperseg = min(fft_size, plot_length)

    frequencies, times, Sxx = signal.spectrogram(
        complex_signal,
        fs=sample_rate,
        nfft=fft_size,
        nperseg=nperseg,
        noverlap=nperseg // 8,
        scaling="density",
        mode="complex",
        return_onesided=False,
    )

    # Convert complex values to amplitude and then to log scale for visualization.
    # The small constant keeps log10 finite for empty bins.
    Sxx_log = np.log10(np.abs(Sxx) + 1e-6)

    # Normalize spectrogram values between 0 and 1 for plotting
    Sxx_log_norm = _scale_to_unit_range(Sxx_log)

    # Shift frequency bins and spectrogram rows so frequencies run from negative to positive
    frequencies_shifted = np.fft.fftshift(frequencies)
    Sxx_shifted = np.fft.fftshift(Sxx_log_norm, axes=0)

    fig = go.Figure(
        data=go.Heatmap(
            z=Sxx_shifted,
            # Segment times are already in seconds (fs is in Hz), matching the "Time [s]" axis title.
            x=times,
            y=frequencies_shifted,
            colorscale="Viridis",
            zmin=0,
            zmax=1,
            reversescale=False,
            showscale=False,
        )
    )

    if thumbnail:
        fig.update_xaxes(showticklabels=False)
        fig.update_yaxes(showticklabels=False)
        fig.update_layout(
            template="plotly_dark",
            width=200,
            height=100,
            margin=dict(l=5, r=5, t=5, b=5),
            xaxis=dict(scaleanchor=None),
            yaxis=dict(scaleanchor=None),
        )
    else:
        fig.update_layout(
            title="Spectrogram",
            xaxis_title="Time [s]",
            yaxis_title="Frequency [Hz]",
            template="plotly_dark",
            height=300,
            width=800,
        )

    return fig


def iq_time_series(rec: Recording) -> Figure:
    """Create a time series plot of the real and imaginary parts of signal.

    Only the first channel of the recording is plotted. The time axis is the sample index divided by the
    ``sample_rate`` metadata entry (default 1).

    :param rec: Signal to plot.
    :type rec: utils.data.Recording

    :return: Time series plot as a Plotly figure.
    """
    complex_signal = rec.data[0]
    sample_rate = int(rec.metadata.get("sample_rate", 1))
    plot_length = len(complex_signal)
    t = np.arange(0, plot_length, 1) / sample_rate

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=t, y=complex_signal.real, mode="lines", name="I (In-phase)", line=dict(width=0.6)))
    fig.add_trace(go.Scatter(x=t, y=complex_signal.imag, mode="lines", name="Q (Quadrature)", line=dict(width=0.6)))

    fig.update_layout(
        title="IQ Time Series",
        xaxis_title="Time [s]",
        yaxis_title="Amplitude",
        template="plotly_dark",
        height=300,
        width=800,
        showlegend=True,
    )

    return fig


def frequency_spectrum(rec: Recording) -> Figure:
    """Create a frequency spectrum plot from the recording.

    Only the first channel of the recording is plotted. The magnitude spectrum is log-scaled and normalised
    to [0, 1]; the frequency axis is centred on the ``center_frequency`` metadata entry (default 0).

    :param rec: Input signal to plot.
    :type rec: utils.data.Recording

    :return: Frequency spectrum as a Plotly figure.
    """
    complex_signal = rec.data[0]
    center_frequency = int(rec.metadata.get("center_frequency", 0))
    sample_rate = int(rec.metadata.get("sample_rate", 1))

    epsilon = 1e-10  # keeps log10 finite for empty bins
    spectrum = np.abs(fftshift(fft(complex_signal)))

    # Use the true FFT bin centres. An inclusive linspace over [-fs/2, fs/2] has a slightly different
    # spacing and drifts away from the bins towards the upper edge.
    freqs = fftshift(np.fft.fftfreq(len(complex_signal))) * sample_rate + center_frequency

    log_spectrum = np.log10(spectrum + epsilon)
    scaled_log_spectrum = _scale_to_unit_range(log_spectrum)

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=freqs, y=scaled_log_spectrum, mode="lines", name="Spectrum", line=dict(width=0.4)))

    # NOTE(review): the y data is already log-scaled and normalised, and the axis is additionally set to
    # "log" here; confirm the double log scaling is intended.
    fig.update_layout(
        title="Frequency Spectrum",
        xaxis_title="Frequency [Hz]",
        yaxis_title="Magnitude",
        yaxis_type="log",
        template="plotly_dark",
        height=300,
        width=800,
        showlegend=False,
    )

    return fig


def constellation(rec: Recording) -> Figure:
    """Create a constellation plot from the recording.

    Only the first channel of the recording is plotted.

    :param rec: Input signal to plot.
    :type rec: utils.data.Recording

    :return: Constellation as a Plotly figure.
    """
    complex_signal = rec.data[0]

    # Downsample the IQ samples to a target number of points
    # This reduces the amount of data plotted, improving performance and interactivity
    # without losing significant detail in the constellation visualization.
    target_number_of_points = 5000
    step = max(1, len(complex_signal) // target_number_of_points)
    i_ds = complex_signal.real[::step]
    q_ds = complex_signal.imag[::step]

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=i_ds, y=q_ds, mode="lines", name="Constellation", line=dict(width=0.2)))

    # The fixed axis range presumably assumes samples normalised to unit amplitude - TODO confirm.
    fig.update_layout(
        title="Constellation",
        xaxis_title="In-phase (I)",
        yaxis_title="Quadrature (Q)",
        template="plotly_dark",
        height=400,
        width=400,
        showlegend=False,
        xaxis=dict(range=[-1.1, 1.1]),
        yaxis=dict(range=[-1.1, 1.1]),
    )

    return fig