# QMB Training workflow.
#
# Triggered by pushes and pull requests against `main` that touch this
# workflow file. The single job:
#   1. prints runner / CPU / GPU diagnostics,
#   2. materializes the training and validation dataset files from the
#      RIA Hub `qoherent/icc-28` repository (sparse checkout + Git LFS),
#   3. copies the training configs from this repository into /opt/qmb/configs,
#   4. installs the QMB wheel plus a PyTorch build chosen from the GPU's
#      compute capability,
#   5. runs `qmb train`, then
#   6. collects and uploads checkpoints, ONNX exports and evaluation plots.
name: QMB Training

on:
  push:
    branches: [ "main" ]
    paths:
      - ".riahub/workflows/train.yaml"
  pull_request:
    branches: [ "main" ]
    paths:
      - ".riahub/workflows/train.yaml"

permissions:
  contents: read
  actions: read

jobs:
  QMB-Training:
    runs-on: "whitehorse-p40-qmb"
    env:
      # Base URL of the RIA Hub instance; a repository/organization variable
      # wins over a secret, and an empty value makes the scripts fall back to
      # their built-in default.
      RIAHUB_BASE_URL: ${{ vars.RIAHUB_BASE_URL || secrets.RIAHUB_BASE_URL || '' }}
      QMB_OUTPUT_ROOT: "/opt/qmb/outputs"
      QMB_TASK_REPO_ROOT: "/opt/qmb/task_repos"
    steps:
      - name: Display basic runner info
        run: |
          echo "Runner OS: ${{ runner.os }}"
          echo "Runner Architecture: ${{ runner.arch }}"

      - name: Print CPU information
        run: |
          lscpu

      - name: Print GPU information
        run: |
          if command -v nvidia-smi &> /dev/null; then
            nvidia-smi
          else
            echo "No NVIDIA GPU available."
          fi

      - name: Checkout Training Dataset
        shell: bash
        timeout-minutes: 10
        env:
          RIAHUB_USER: ${{ secrets.QMBDEMO_USER }}
          RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }}
          GIT_TERMINAL_PROMPT: "0"
        run: |
          set -euo pipefail

          DEFAULT_BASE_URL='https://riahub.ai'
          BASE_URL_SOURCE=${RIAHUB_BASE_URL:-$DEFAULT_BASE_URL}
          BASE_URL_SOURCE="${BASE_URL_SOURCE%/}"

          # Print the base URLs to try, one per line: the configured URL first,
          # then the same host with the other scheme. A scheme-less value is
          # tried as https first, then http.
          # NOTE(review): the http:// fallback means the Basic auth header
          # below can be sent in cleartext; confirm this is acceptable.
          build_base_candidates() {
            local raw="$1"
            if [[ "$raw" =~ ^https?:// ]]; then
              echo "$raw"
              if [[ "$raw" == http://* ]]; then
                echo "https://${raw#http://}"
              elif [[ "$raw" == https://* ]]; then
                echo "http://${raw#https://}"
              fi
              return
            fi
            echo "https://$raw"
            echo "http://$raw"
          }

          # Build an HTTP Basic auth header only when both credentials are set.
          AUTH_HEADER=""
          if [[ -n "${RIAHUB_USER:-}" && -n "${RIAHUB_TOKEN:-}" ]]; then
            AUTH_HEADER=$(printf 'Authorization: basic %s' "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')")
          fi

          # Run git as root with prompts disabled, attaching the auth header
          # when one is available.
          git_auth() {
            if [[ -n "$AUTH_HEADER" ]]; then
              sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@"
            else
              sudo env GIT_TERMINAL_PROMPT=0 git "$@"
            fi
          }

          REPO_PATH='/qoherent/icc-28.git'
          DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-28/main'
          _LFS_REL_PATH='datasets/icc28-train_v1.0.0.h5'
          sudo mkdir -p "$(dirname "$DEST_ROOT")"

          if ! command -v git-lfs >/dev/null 2>&1; then
            sudo apt-get update -y
            sudo apt-get install -y git-lfs
          fi

          mapfile -t BASE_CANDIDATES < <(build_base_candidates "$BASE_URL_SOURCE")
          MATERIALIZED=0
          for base in "${BASE_CANDIDATES[@]}"; do
            base="${base%/}"
            REPO_URL="${base}${REPO_PATH}"
            echo "Fetching dataset from $REPO_URL"

            # Start every attempt from an empty checkout directory.
            sudo rm -rf "$DEST_ROOT"
            sudo mkdir -p "$DEST_ROOT"
            sudo git -C "$DEST_ROOT" init || continue
            sudo git -C "$DEST_ROOT" remote add origin "$REPO_URL" || continue
            sudo git -C "$DEST_ROOT" sparse-checkout init --no-cone || continue
            sudo git -C "$DEST_ROOT" sparse-checkout set --no-cone -- \
              'datasets/icc28-train_v1.0.0.h5' || continue

            if ! git_auth -C "$DEST_ROOT" fetch --depth=1 origin '1d9083f05d0538110f09e710865b078eba30964b'; then
              continue
            fi
            # Check out pointers only; the LFS object is fetched explicitly below.
            if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then
              continue
            fi
            if ! git_auth -C "$DEST_ROOT" lfs fetch origin --include='datasets/icc28-train_v1.0.0.h5' --exclude=""; then
              echo "LFS fetch failed for candidate $base, trying next" >&2
              continue
            fi
            if ! sudo git -C "$DEST_ROOT" lfs checkout; then
              echo "LFS checkout failed for candidate $base, trying next" >&2
              continue
            fi

            # A missing file must not count as success: without this check the
            # pointer test below passes on an absent file.
            if ! sudo test -f "$DEST_ROOT/$_LFS_REL_PATH"; then
              echo "Dataset file missing at $DEST_ROOT/$_LFS_REL_PATH for candidate $base, trying next" >&2
              continue
            fi
            # A file still starting with "version h" is treated as an
            # unresolved LFS pointer rather than real data.
            if [[ "$(sudo head -c 9 "$DEST_ROOT/$_LFS_REL_PATH" 2>/dev/null || true)" == "version h" ]]; then
              echo "LFS materialization left a pointer at $DEST_ROOT/$_LFS_REL_PATH for candidate $base, trying next" >&2
              continue
            fi

            MATERIALIZED=1
            break
          done

          if [[ "$MATERIALIZED" -ne 1 ]]; then
            echo "Failed to materialize dataset using base URL candidates derived from: $BASE_URL_SOURCE" >&2
            if [[ -z "$AUTH_HEADER" ]]; then
              echo " (no credentials configured: set QMBDEMO_USER and QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2
            fi
            exit 1
          fi

      - name: Checkout Validation Dataset
        shell: bash
        timeout-minutes: 10
        env:
          RIAHUB_USER: ${{ secrets.QMBDEMO_USER }}
          RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }}
          GIT_TERMINAL_PROMPT: "0"
        run: |
          set -euo pipefail

          DEFAULT_BASE_URL='https://riahub.ai'
          BASE_URL_SOURCE=${RIAHUB_BASE_URL:-$DEFAULT_BASE_URL}
          BASE_URL_SOURCE="${BASE_URL_SOURCE%/}"

          # Print the base URLs to try, one per line: the configured URL first,
          # then the same host with the other scheme. A scheme-less value is
          # tried as https first, then http.
          # NOTE(review): the http:// fallback means the Basic auth header
          # below can be sent in cleartext; confirm this is acceptable.
          build_base_candidates() {
            local raw="$1"
            if [[ "$raw" =~ ^https?:// ]]; then
              echo "$raw"
              if [[ "$raw" == http://* ]]; then
                echo "https://${raw#http://}"
              elif [[ "$raw" == https://* ]]; then
                echo "http://${raw#https://}"
              fi
              return
            fi
            echo "https://$raw"
            echo "http://$raw"
          }

          # Build an HTTP Basic auth header only when both credentials are set.
          AUTH_HEADER=""
          if [[ -n "${RIAHUB_USER:-}" && -n "${RIAHUB_TOKEN:-}" ]]; then
            AUTH_HEADER=$(printf 'Authorization: basic %s' "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')")
          fi

          # Run git as root with prompts disabled, attaching the auth header
          # when one is available.
          git_auth() {
            if [[ -n "$AUTH_HEADER" ]]; then
              sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@"
            else
              sudo env GIT_TERMINAL_PROMPT=0 git "$@"
            fi
          }

          REPO_PATH='/qoherent/icc-28.git'
          DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-28/main'
          _LFS_REL_PATH='datasets/icc28-test_v1.0.0.h5'

          # DEST_ROOT is the same directory the training-dataset step populated.
          # Cloning straight into it (which requires wiping it first) would
          # delete the training file, so the clone happens in a sibling staging
          # directory and only the materialized file is moved into DEST_ROOT.
          STAGE_ROOT="${DEST_ROOT}.validation-stage"
          trap 'sudo rm -rf "$STAGE_ROOT"' EXIT
          sudo mkdir -p "$(dirname "$DEST_ROOT")"

          if ! command -v git-lfs >/dev/null 2>&1; then
            sudo apt-get update -y
            sudo apt-get install -y git-lfs
          fi

          mapfile -t BASE_CANDIDATES < <(build_base_candidates "$BASE_URL_SOURCE")
          MATERIALIZED=0
          for base in "${BASE_CANDIDATES[@]}"; do
            base="${base%/}"
            REPO_URL="${base}${REPO_PATH}"
            echo "Fetching dataset from $REPO_URL"

            # Start every attempt from an empty staging directory.
            sudo rm -rf "$STAGE_ROOT"
            sudo mkdir -p "$STAGE_ROOT"
            sudo git -C "$STAGE_ROOT" init || continue
            sudo git -C "$STAGE_ROOT" remote add origin "$REPO_URL" || continue
            sudo git -C "$STAGE_ROOT" sparse-checkout init --no-cone || continue
            sudo git -C "$STAGE_ROOT" sparse-checkout set --no-cone -- \
              'datasets/icc28-test_v1.0.0.h5' || continue

            if ! git_auth -C "$STAGE_ROOT" fetch --depth=1 origin 'b307499b8c7150e10537d8f2f17fe108f0bc73db'; then
              continue
            fi
            # Check out pointers only; the LFS object is fetched explicitly below.
            if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 git -C "$STAGE_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then
              continue
            fi
            if ! git_auth -C "$STAGE_ROOT" lfs fetch origin --include='datasets/icc28-test_v1.0.0.h5' --exclude=""; then
              echo "LFS fetch failed for candidate $base, trying next" >&2
              continue
            fi
            if ! sudo git -C "$STAGE_ROOT" lfs checkout; then
              echo "LFS checkout failed for candidate $base, trying next" >&2
              continue
            fi

            # A missing file must not count as success: without this check the
            # pointer test below passes on an absent file.
            if ! sudo test -f "$STAGE_ROOT/$_LFS_REL_PATH"; then
              echo "Dataset file missing at $STAGE_ROOT/$_LFS_REL_PATH for candidate $base, trying next" >&2
              continue
            fi
            # A file still starting with "version h" is treated as an
            # unresolved LFS pointer rather than real data.
            if [[ "$(sudo head -c 9 "$STAGE_ROOT/$_LFS_REL_PATH" 2>/dev/null || true)" == "version h" ]]; then
              echo "LFS materialization left a pointer at $STAGE_ROOT/$_LFS_REL_PATH for candidate $base, trying next" >&2
              continue
            fi

            # Publish the validation file next to the training file. The
            # staging directory is a sibling of DEST_ROOT, so this is a rename
            # when both live on one filesystem.
            sudo mkdir -p "$(dirname "$DEST_ROOT/$_LFS_REL_PATH")"
            sudo mv -f "$STAGE_ROOT/$_LFS_REL_PATH" "$DEST_ROOT/$_LFS_REL_PATH"

            MATERIALIZED=1
            break
          done

          if [[ "$MATERIALIZED" -ne 1 ]]; then
            echo "Failed to materialize dataset using base URL candidates derived from: $BASE_URL_SOURCE" >&2
            if [[ -z "$AUTH_HEADER" ]]; then
              echo " (no credentials configured: set QMBDEMO_USER and QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2
            fi
            exit 1
          fi

      - name: Checkout configs
        uses: actions/checkout@v5
        with:
          sparse-checkout: .riahub/train_configs

      - name: Copy configs into qmb folder
        run: |
          mkdir -p /opt/qmb/configs/
          sudo cp -r ${{ github.workspace }}/.riahub/train_configs/* /opt/qmb/configs/

      - name: List QMB project contents
        run: |
          ls -lha /opt/qmb
          ls -lh /opt/qmb/wheel

      # Best-effort listing: missing directories must not fail the job.
      - name: List Downloaded RIA Hub contents
        run: |
          ls -lh /opt/qmb/riahub || true
          ls -lh /opt/qmb/riahub/model || true
          ls -lh /opt/qmb/riahub/dataset || true

      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.13"

      - name: Install Python dependencies
        shell: bash
        run: |
          set -euo pipefail
          uv pip install --system --index-url https://pypi.org/simple --upgrade /opt/qmb/wheel/*.whl

          # Default to the CPU wheels; switch to a CUDA index only when a GPU
          # compute capability can be read from nvidia-smi.
          TORCH_INDEX_URL="https://download.pytorch.org/whl/cpu"
          TORCH_REASON="no NVIDIA GPU detected"
          if command -v nvidia-smi &> /dev/null; then
            CAP_LINES="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null || true)"
            if [[ -z "$CAP_LINES" ]]; then
              # Fall back to parsing the verbose report when the query above
              # produced no output.
              CAP_LINES="$(nvidia-smi -q 2>/dev/null | awk -F: '/Compute Capability/ {print $2}')"
            fi
            # Highest compute capability across all reported GPUs.
            CAP_MAX="$(echo "$CAP_LINES" | awk '{gsub(/[^0-9.]/,""); if ($0=="") next; if ($0+0>max) max=$0+0} END {if (max>0) print max}')"
            if [[ -n "$CAP_MAX" ]]; then
              if awk -v cap="$CAP_MAX" 'BEGIN{exit !(cap>=7.5)}'; then
                TORCH_INDEX_URL="https://download.pytorch.org/whl/cu130"
                TORCH_REASON="compute capability ${CAP_MAX} >= 7.5"
              else
                TORCH_INDEX_URL="https://download.pytorch.org/whl/cu126"
                TORCH_REASON="compute capability ${CAP_MAX} < 7.5"
              fi
            fi
          fi

          echo "Installing PyTorch from ${TORCH_INDEX_URL} (${TORCH_REASON})."
          uv pip install --system --index-url "$TORCH_INDEX_URL" --upgrade --force-reinstall torch torchvision
          uv pip install --system --index-url https://pypi.org/simple --upgrade "onnxscript>=0.7.0" "onnx-ir>=0.2.1" onnx onnxruntime timm

      - name: Run Training Script
        run: |
          cd /opt/qmb
          export PYTHONPATH="$QMB_TASK_REPO_ROOT:${PYTHONPATH:-}"
          #source .venv/bin/activate
          qmb train --config /opt/qmb/configs/train.yaml

      # Runs even when training failed so partial outputs are still uploaded.
      - name: Collect training artifacts
        if: always()
        shell: bash
        run: |
          set -euo pipefail
          ARTIFACT_DIR="${{ github.workspace }}/.riahub/artifacts/training"
          rm -rf "$ARTIFACT_DIR"
          mkdir -p "$ARTIFACT_DIR"

          if [[ -d "$QMB_OUTPUT_ROOT" ]]; then
            # Copy each selected file, preserving its path relative to the
            # output root.
            while IFS= read -r -d '' file; do
              # Quote the prefix so it is stripped literally, not as a glob.
              rel="${file#"${QMB_OUTPUT_ROOT}"/}"
              if [[ "$rel" == "$file" ]]; then
                rel="$(basename "$file")"
              fi
              mkdir -p "$ARTIFACT_DIR/$(dirname "$rel")"
              cp "$file" "$ARTIFACT_DIR/$rel"
            done < <(
              find "$QMB_OUTPUT_ROOT" -type f \( \
                -path "*/checkpoints/best.pt" -o \
                -path "*/checkpoints/best.ckpt" -o \
                -name "*.onnx" -o \
                -path "*/evaluation/*/confusion_matrix.png" -o \
                -path "*/evaluation/*/parameter_sweeps/*.png" \
              \) -print0
            )
          else
            echo "QMB output root not found: $QMB_OUTPUT_ROOT"
          fi

          echo "Collected training artifacts:"
          find "$ARTIFACT_DIR" -type f -print | sort || true

      # NOTE(review): upload-artifact is pinned to v3; presumably required by
      # the runner platform in use. Confirm before bumping the major version.
      - name: ⬆️ Upload training artifacts
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: training-artifacts
          path: ${{ github.workspace }}/.riahub/artifacts/training
          if-no-files-found: warn

# committed at 2026-05-28T09:37:23.399702+00:00