From 7c33f7adad485d273977621e932dad2f33bef346 Mon Sep 17 00:00:00 2001
From: ash
Date: Thu, 28 May 2026 05:37:23 -0400
Subject: [PATCH] Training run - 2026-05-28 05:37:23 icc28 test

---
Review notes (text between the three-dash line and the diff is not part of
the commit message):
  * Line breaks and indentation were reconstructed from a whitespace-collapsed
    copy of this patch. The nesting of `evaluation.split` and of
    `task.params.selection_metric` / `selection_mode` is inferred from key
    order and naming only - confirm against the config generator.
  * `optimization.optimizer.params.eps` is written as `1.0e-08` instead of
    `1e-08`: without a decimal point, YAML 1.1 loaders such as PyYAML resolve
    the value as a string rather than a float.
  * The `index` lines still carry the abbreviated blob ids recorded when the
    patch was generated; they were not recomputed after this edit.

 .riahub/train_configs/train.yaml |  60 ++++++
 .riahub/workflows/train.yaml     | 326 +++++++++++++++++++++++++++++++
 2 files changed, 386 insertions(+)
 create mode 100644 .riahub/train_configs/train.yaml
 create mode 100644 .riahub/workflows/train.yaml

diff --git a/.riahub/train_configs/train.yaml b/.riahub/train_configs/train.yaml
new file mode 100644
index 0000000..087c52f
--- /dev/null
+++ b/.riahub/train_configs/train.yaml
@@ -0,0 +1,60 @@
+data:
+  batch_size: 32
+  dataset_params:
+    iq_key: iq_data
+    label_key: labels
+  drop_last: false
+  kind: iq_h5
+  num_workers: 0
+  persistent_workers: false
+  pin_memory: false
+  test_path: /opt/qmb/riahub/dataset/qoherent/icc-28/main/datasets/icc28-test_v1.0.0.h5
+  test_split: 0
+  train_path: /opt/qmb/riahub/dataset/qoherent/icc-28/main/datasets/icc28-train_v1.0.0.h5
+  validation_path: /opt/qmb/riahub/dataset/qoherent/icc-28/main/datasets/icc28-test_v1.0.0.h5
+  validation_split: 0
+evaluation:
+  capture_predictions: true
+  enabled: true
+  params:
+    save_confusion: true
+  split: test  # NOTE(review): nesting level inferred (could belong to params) - confirm
+export:
+  dynamic_batch: true
+  dynamic_width: false
+  enabled: true
+  file_name: model.onnx
+  opset_version: 17
+  strict: false
+  use_dynamo: true
+  use_onnxsim: false
+model:
+  name: iq_vtcnn2
+  params:
+    dropout_p: 0.6
+optimization:
+  loss:
+    name: cross_entropy
+    params: {}
+  optimizer:
+    name: adam
+    params:
+      amsgrad: false
+      eps: 1.0e-08  # decimal point keeps this a float under YAML 1.1 loaders
+      lr: 0.001
+      weight_decay: 0
+runtime:
+  amp_enabled: false
+  autocast_dtype: float32
+  checkpoint_every_n_epochs: 1
+  component_modules: []
+  device: auto
+  epochs: 1
+  progress_bar: false
+  seed: 42
+task:
+  name: classification
+  params:
+    save_artifacts: true
+    selection_metric: accuracy  # NOTE(review): selection_* assumed to sit under params - confirm
+    selection_mode: max
diff --git a/.riahub/workflows/train.yaml b/.riahub/workflows/train.yaml
new file mode 100644
index 0000000..3ac9ee1
--- /dev/null
+++ b/.riahub/workflows/train.yaml
@@ -0,0 +1,326 @@
+# QMB training pipeline: fetch the pinned ICC-28 train/test datasets, install
+# the QMB wheel and PyTorch, run training, then upload the resulting artifacts.
+name: QMB Training
+on:
+  push:
+    branches: [ "main" ]
+    paths:
+      - ".riahub/workflows/train.yaml"
+  pull_request:
+    branches: [ "main" ]
+    paths:
+      - ".riahub/workflows/train.yaml"
+
+permissions:
+  contents: read
+  actions: read
+
+jobs:
+  QMB-Training:
+    runs-on: "whitehorse-p40-qmb"
+    env:
+      RIAHUB_BASE_URL: ${{ vars.RIAHUB_BASE_URL || secrets.RIAHUB_BASE_URL || '' }}
+      QMB_OUTPUT_ROOT: "/opt/qmb/outputs"
+      QMB_TASK_REPO_ROOT: "/opt/qmb/task_repos"
+    steps:
+      - name: Display basic runner info
+        run: |
+          echo "Runner OS: ${{ runner.os }}"
+          echo "Runner Architecture: ${{ runner.arch }}"
+
+      - name: Print CPU information
+        run: |
+          lscpu
+
+      - name: Print GPU information
+        run: |
+          if command -v nvidia-smi &> /dev/null; then
+            nvidia-smi
+          else
+            echo "No NVIDIA GPU available."
+          fi
+
+      - name: Checkout Training Dataset
+        shell: bash
+        timeout-minutes: 10
+        env:
+          RIAHUB_USER: ${{ secrets.QMBDEMO_USER }}
+          RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }}
+          GIT_TERMINAL_PROMPT: "0"
+        run: |
+          set -euo pipefail
+          DEFAULT_BASE_URL='https://riahub.ai'
+          BASE_URL_SOURCE=${RIAHUB_BASE_URL:-$DEFAULT_BASE_URL}
+          BASE_URL_SOURCE="${BASE_URL_SOURCE%/}"
+
+          build_base_candidates() {
+            local raw="$1"
+            if [[ "$raw" =~ ^https?:// ]]; then
+              echo "$raw"
+              if [[ "$raw" == http://* ]]; then
+                echo "https://${raw#http://}"
+              elif [[ "$raw" == https://* ]]; then
+                echo "http://${raw#https://}"
+              fi
+              return
+            fi
+            echo "https://$raw"
+            echo "http://$raw"
+          }
+
+          AUTH_HEADER=""
+          if [[ -n "${RIAHUB_USER:-}" && -n "${RIAHUB_TOKEN:-}" ]]; then
+            AUTH_HEADER=$(printf 'Authorization: basic %s' "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')")
+          fi
+          git_auth() {
+            if [[ -n "$AUTH_HEADER" ]]; then
+              sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@"
+            else
+              sudo env GIT_TERMINAL_PROMPT=0 git "$@"
+            fi
+          }
+          REPO_PATH='/qoherent/icc-28.git'
+          DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-28/main'
+          sudo mkdir -p "$(dirname "$DEST_ROOT")"
+          if ! command -v git-lfs >/dev/null 2>&1; then
+            sudo apt-get update -y
+            sudo apt-get install -y git-lfs
+          fi
+          mapfile -t BASE_CANDIDATES < <(build_base_candidates "$BASE_URL_SOURCE")
+          MATERIALIZED=0
+          for base in "${BASE_CANDIDATES[@]}"; do
+            base="${base%/}"
+            REPO_URL="${base}${REPO_PATH}"
+            echo "Fetching dataset from $REPO_URL"
+            sudo rm -rf "$DEST_ROOT"
+            sudo mkdir -p "$DEST_ROOT"
+            sudo git -C "$DEST_ROOT" init || continue
+            sudo git -C "$DEST_ROOT" remote add origin "$REPO_URL" || continue
+            sudo git -C "$DEST_ROOT" sparse-checkout init --no-cone || continue
+            sudo git -C "$DEST_ROOT" sparse-checkout set --no-cone -- \
+              'datasets/icc28-train_v1.0.0.h5' || continue
+            if ! git_auth -C "$DEST_ROOT" fetch --depth=1 origin '1d9083f05d0538110f09e710865b078eba30964b'; then
+              continue
+            fi
+            if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then
+              continue
+            fi
+            if ! git_auth -C "$DEST_ROOT" lfs fetch origin --include='datasets/icc28-train_v1.0.0.h5' --exclude=""; then
+              echo "LFS fetch failed for candidate $base, trying next" >&2
+              continue
+            fi
+            if ! sudo git -C "$DEST_ROOT" lfs checkout; then
+              echo "LFS checkout failed for candidate $base, trying next" >&2
+              continue
+            fi
+            _LFS_REL_PATH='datasets/icc28-train_v1.0.0.h5'
+            if [[ "$(sudo head -c 9 "$DEST_ROOT/$_LFS_REL_PATH" 2>/dev/null || true)" == "version h" ]]; then
+              echo "LFS materialization left a pointer at $DEST_ROOT/$_LFS_REL_PATH for candidate $base, trying next" >&2
+              continue
+            fi
+            MATERIALIZED=1
+            break
+          done
+          if [[ "$MATERIALIZED" -ne 1 ]]; then
+            echo "Failed to materialize dataset using base URL candidates derived from: $BASE_URL_SOURCE" >&2
+            if [[ -z "$AUTH_HEADER" ]]; then
+              echo " (no credentials configured: set QMBDEMO_USER and QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2
+            fi
+            exit 1
+          fi
+
+      - name: Checkout Validation Dataset
+        shell: bash
+        timeout-minutes: 10
+        env:
+          RIAHUB_USER: ${{ secrets.QMBDEMO_USER }}
+          RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }}
+          GIT_TERMINAL_PROMPT: "0"
+        run: |
+          set -euo pipefail
+          DEFAULT_BASE_URL='https://riahub.ai'
+          BASE_URL_SOURCE=${RIAHUB_BASE_URL:-$DEFAULT_BASE_URL}
+          BASE_URL_SOURCE="${BASE_URL_SOURCE%/}"
+
+          build_base_candidates() {
+            local raw="$1"
+            if [[ "$raw" =~ ^https?:// ]]; then
+              echo "$raw"
+              if [[ "$raw" == http://* ]]; then
+                echo "https://${raw#http://}"
+              elif [[ "$raw" == https://* ]]; then
+                echo "http://${raw#https://}"
+              fi
+              return
+            fi
+            echo "https://$raw"
+            echo "http://$raw"
+          }
+
+          AUTH_HEADER=""
+          if [[ -n "${RIAHUB_USER:-}" && -n "${RIAHUB_TOKEN:-}" ]]; then
+            AUTH_HEADER=$(printf 'Authorization: basic %s' "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')")
+          fi
+          git_auth() {
+            if [[ -n "$AUTH_HEADER" ]]; then
+              sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@"
+            else
+              sudo env GIT_TERMINAL_PROMPT=0 git "$@"
+            fi
+          }
+          REPO_PATH='/qoherent/icc-28.git'
+          DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-28/main'
+          # DEST_ROOT already holds the training file fetched by the previous step,
+          # so clone into a separate staging checkout and move the validation file
+          # across; wiping DEST_ROOT here would delete the training dataset.
+          STAGE_ROOT="${DEST_ROOT}.validation-stage"
+          sudo mkdir -p "$(dirname "$DEST_ROOT")"
+          if ! command -v git-lfs >/dev/null 2>&1; then
+            sudo apt-get update -y
+            sudo apt-get install -y git-lfs
+          fi
+          mapfile -t BASE_CANDIDATES < <(build_base_candidates "$BASE_URL_SOURCE")
+          MATERIALIZED=0
+          for base in "${BASE_CANDIDATES[@]}"; do
+            base="${base%/}"
+            REPO_URL="${base}${REPO_PATH}"
+            echo "Fetching dataset from $REPO_URL"
+            sudo rm -rf "$STAGE_ROOT"
+            sudo mkdir -p "$STAGE_ROOT"
+            sudo git -C "$STAGE_ROOT" init || continue
+            sudo git -C "$STAGE_ROOT" remote add origin "$REPO_URL" || continue
+            sudo git -C "$STAGE_ROOT" sparse-checkout init --no-cone || continue
+            sudo git -C "$STAGE_ROOT" sparse-checkout set --no-cone -- \
+              'datasets/icc28-test_v1.0.0.h5' || continue
+            if ! git_auth -C "$STAGE_ROOT" fetch --depth=1 origin 'b307499b8c7150e10537d8f2f17fe108f0bc73db'; then
+              continue
+            fi
+            if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 git -C "$STAGE_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then
+              continue
+            fi
+            if ! git_auth -C "$STAGE_ROOT" lfs fetch origin --include='datasets/icc28-test_v1.0.0.h5' --exclude=""; then
+              echo "LFS fetch failed for candidate $base, trying next" >&2
+              continue
+            fi
+            if ! sudo git -C "$STAGE_ROOT" lfs checkout; then
+              echo "LFS checkout failed for candidate $base, trying next" >&2
+              continue
+            fi
+            _LFS_REL_PATH='datasets/icc28-test_v1.0.0.h5'
+            if [[ "$(sudo head -c 9 "$STAGE_ROOT/$_LFS_REL_PATH" 2>/dev/null || true)" == "version h" ]]; then
+              echo "LFS materialization left a pointer at $STAGE_ROOT/$_LFS_REL_PATH for candidate $base, trying next" >&2
+              continue
+            fi
+            sudo mkdir -p "$(dirname "$DEST_ROOT/$_LFS_REL_PATH")"
+            sudo mv -f "$STAGE_ROOT/$_LFS_REL_PATH" "$DEST_ROOT/$_LFS_REL_PATH" || continue
+            MATERIALIZED=1
+            break
+          done
+          sudo rm -rf "$STAGE_ROOT"
+          if [[ "$MATERIALIZED" -ne 1 ]]; then
+            echo "Failed to materialize dataset using base URL candidates derived from: $BASE_URL_SOURCE" >&2
+            if [[ -z "$AUTH_HEADER" ]]; then
+              echo " (no credentials configured: set QMBDEMO_USER and QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2
+            fi
+            exit 1
+          fi
+
+      - name: Checkout configs
+        uses: actions/checkout@v5
+        with:
+          sparse-checkout: .riahub/train_configs
+
+      - name: Copy configs into qmb folder
+        run: |
+          sudo mkdir -p /opt/qmb/configs/
+          sudo cp -r "${{ github.workspace }}"/.riahub/train_configs/* /opt/qmb/configs/
+
+      - name: List QMB project contents
+        run: |
+          ls -lha /opt/qmb
+          ls -lh /opt/qmb/wheel
+
+      - name: List Downloaded RIA Hub contents
+        run: |
+          ls -lh /opt/qmb/riahub || true
+          ls -lh /opt/qmb/riahub/model || true
+          ls -lh /opt/qmb/riahub/dataset || true
+
+      - name: Setup Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.13"
+
+      - name: Install Python dependencies
+        run: |
+          set -euo pipefail
+          uv pip install --system --index-url https://pypi.org/simple --upgrade /opt/qmb/wheel/*.whl
+          # Default to CPU wheels; a CUDA index is chosen below when a GPU reports its compute capability.
+          TORCH_INDEX_URL="https://download.pytorch.org/whl/cpu"
+          TORCH_REASON="no NVIDIA GPU detected"
+          if command -v nvidia-smi &> /dev/null; then
+            CAP_LINES="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null || true)"
+            if [[ -z "$CAP_LINES" ]]; then
+              CAP_LINES="$(nvidia-smi -q 2>/dev/null | awk -F: '/Compute Capability/ {print $2}')"
+            fi
+            CAP_MAX="$(echo "$CAP_LINES" | awk '{gsub(/[^0-9.]/,""); if ($0=="") next; if ($0+0>max) max=$0+0} END {if (max>0) print max}')"
+            if [[ -n "$CAP_MAX" ]]; then
+              if awk -v cap="$CAP_MAX" 'BEGIN{exit !(cap>=7.5)}'; then
+                TORCH_INDEX_URL="https://download.pytorch.org/whl/cu130"
+                TORCH_REASON="compute capability ${CAP_MAX} >= 7.5"
+              else
+                TORCH_INDEX_URL="https://download.pytorch.org/whl/cu126"
+                TORCH_REASON="compute capability ${CAP_MAX} < 7.5"
+              fi
+            fi
+          fi
+          echo "Installing PyTorch from ${TORCH_INDEX_URL} (${TORCH_REASON})."
+          uv pip install --system --index-url "$TORCH_INDEX_URL" --upgrade --force-reinstall torch torchvision
+          uv pip install --system --index-url https://pypi.org/simple --upgrade "onnxscript>=0.7.0" "onnx-ir>=0.2.1" onnx onnxruntime timm
+
+      - name: Run Training Script
+        run: |
+          cd /opt/qmb
+          export PYTHONPATH="$QMB_TASK_REPO_ROOT:${PYTHONPATH:-}"
+          #source .venv/bin/activate
+          qmb train --config /opt/qmb/configs/train.yaml
+
+      - name: Collect training artifacts
+        if: always()
+        run: |
+          set -euo pipefail
+          ARTIFACT_DIR="${{ github.workspace }}/.riahub/artifacts/training"
+          rm -rf "$ARTIFACT_DIR"
+          mkdir -p "$ARTIFACT_DIR"
+          if [[ -d "$QMB_OUTPUT_ROOT" ]]; then
+            while IFS= read -r -d '' file; do
+              rel="${file#${QMB_OUTPUT_ROOT}/}"
+              if [[ "$rel" == "$file" ]]; then
+                rel="$(basename "$file")"
+              fi
+              mkdir -p "$ARTIFACT_DIR/$(dirname "$rel")"
+              cp "$file" "$ARTIFACT_DIR/$rel"
+            done < <(
+              find "$QMB_OUTPUT_ROOT" -type f \( \
+                -path "*/checkpoints/best.pt" -o \
+                -path "*/checkpoints/best.ckpt" -o \
+                -name "*.onnx" -o \
+                -path "*/evaluation/*/confusion_matrix.png" -o \
+                -path "*/evaluation/*/parameter_sweeps/*.png" \
+              \) -print0
+            )
+          else
+            echo "QMB output root not found: $QMB_OUTPUT_ROOT"
+          fi
+          echo "Collected training artifacts:"
+          find "$ARTIFACT_DIR" -type f -print | sort || true
+
+      - name: ⬆️ Upload training artifacts
+        if: always()
+        uses: actions/upload-artifact@v3
+        with:
+          name: training-artifacts
+          path: ${{ github.workspace }}/.riahub/artifacts/training
+          if-no-files-found: warn
+
+# committed at 2026-05-28T09:37:23.399702+00:00