modrec-workflow/scripts/application_packager/profile_onnx.py

import json
import os
import time

import numpy as np
import onnxruntime as ort


def profile_onnx_model(
    path_to_onnx: str, num_runs: int = 100, warmup_runs: int = 5
) -> None:
"""
Profiles an ONNX model by running inference multiple times and collecting performance data.
Prints session initialization time, provider used, average inference time (excluding warm-up),
and parses the ONNX Runtime JSON trace to show the most expensive operation.
Parameters:
path_to_onnx (str): Path to the ONNX model file.
num_runs (int): Number of inference runs (including warm-ups).
warmup_runs (int): Number of warm-up runs to skip from timing.
"""
    # Session setup: enable the built-in profiler and add session config hints
    options = ort.SessionOptions()
    options.enable_profiling = True
    options.add_session_config_entry("session.enable_quant_qdq_cleanup", "1")
    options.add_session_config_entry("ep.dynamic.workload_type", "Efficient")
    # Try the CUDA execution provider first, then fall back to CPU
    start_time = time.time()
    try:
        session = ort.InferenceSession(
            path_to_onnx, sess_options=options, providers=["CUDAExecutionProvider"]
        )
        print("Running on the GPU")
    except Exception:
        session = ort.InferenceSession(
            path_to_onnx, sess_options=options, providers=["CPUExecutionProvider"]
        )
        print("Could not create a CUDA session, running on the CPU")
    end_time = time.time()

    print(f"[Timing] Model load + session init time: {end_time - start_time:.4f} sec")
    print("Session providers:", session.get_providers())
    # Prepare a dummy input, replacing any dynamic (symbolic or non-positive) dimensions with 1
    input_name = session.get_inputs()[0].name
    input_shape = [
        dim if isinstance(dim, int) and dim > 0 else 1
        for dim in session.get_inputs()[0].shape
    ]
    input_data = np.random.randn(*input_shape).astype(np.float32)
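    # Random input data is sufficient for latency profiling, but models with
    # data-dependent control flow (e.g. If/Loop nodes) may take different execution
    # paths on real data than they do here.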
    # Time repeated inferences, skipping the warm-up runs
    times = []
    for i in range(num_runs):
        t0 = time.time()
        session.run(None, {input_name: input_data})
        t1 = time.time()
        if i >= warmup_runs:
            times.append(t1 - t0)
    avg_time = sum(times) / len(times)
    print(
        f"[Timing] Avg inference time (excluding {warmup_runs} warm-ups): {avg_time:.6f} sec"
    )
# End profiling & parse JSON
profile_file = session.end_profiling()
print(f"[Output] Profiling trace saved to: {profile_file}")
try:
with open(profile_file, "r") as f:
trace = json.load(f)
nodes = [e for e in trace if e.get("cat") == "Node"]
print(f"[Profile] Number of nodes executed: {len(nodes)}")
if nodes:
top = max(nodes, key=lambda x: x.get("dur", 0))
print(
f"[Profile] Most expensive op: {top['name']}{top['dur'] / 1e6:.3f} ms"
)
except Exception as e:
print(f"[Warning] Failed to parse profiling JSON: {e}")


if __name__ == "__main__":
    output_path = os.path.join("onnx_files", "inference_recognition_model.onnx")
    profile_onnx_model(output_path)