327 lines
13 KiB
YAML
327 lines
13 KiB
YAML
|
A
|
name: QMB Training
|
||
|
|
on:
|
||
|
|
push:
|
||
|
|
branches: [ "main" ]
|
||
|
|
paths:
|
||
|
|
- ".riahub/workflows/train.yaml"
|
||
|
|
pull_request:
|
||
|
|
branches: [ "main" ]
|
||
|
|
paths:
|
||
|
|
- ".riahub/workflows/train.yaml"
|
||
|
|
|
||
|
|
permissions:
|
||
|
|
contents: read
|
||
|
|
actions: read
|
||
|
|
|
||
|
|
jobs:
|
||
|
|
QMB-Training:
|
||
|
|
runs-on: "whitehorse-p40-qmb"
|
||
|
|
env:
|
||
|
|
RIAHUB_BASE_URL: ${{ vars.RIAHUB_BASE_URL || secrets.RIAHUB_BASE_URL || '' }}
|
||
|
|
QMB_OUTPUT_ROOT: "/opt/qmb/outputs"
|
||
|
|
QMB_TASK_REPO_ROOT: "/opt/qmb/task_repos"
|
||
|
|
steps:
|
||
|
|
- name: Display basic runner info
|
||
|
|
run: |
|
||
|
|
echo "Runner OS: ${{ runner.os }}"
|
||
|
|
echo "Runner Architecture: ${{ runner.arch }}"
|
||
|
|
|
||
|
|
- name: Print CPU information
|
||
|
|
run: |
|
||
|
|
lscpu
|
||
|
|
|
||
|
|
- name: Print GPU information
|
||
|
|
run: |
|
||
|
|
# Print the nvidia-smi report when the tool is on PATH; otherwise just say so.
if ! command -v nvidia-smi > /dev/null 2>&1; then
  echo "No NVIDIA GPU available."
else
  nvidia-smi
fi
|
||
|
|
|
||
|
|
|
||
|
|
- name: Checkout Training Dataset
|
||
|
|
shell: bash
|
||
|
|
timeout-minutes: 10
|
||
|
|
env:
|
||
|
|
RIAHUB_USER: ${{ secrets.QMBDEMO_USER }}
|
||
|
|
RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }}
|
||
|
|
GIT_TERMINAL_PROMPT: "0"
|
||
|
|
run: |
|
||
|
|
set -euo pipefail

# Resolve the RIA Hub base URL: a non-empty RIAHUB_BASE_URL wins, otherwise the
# default below is used. One trailing slash is dropped so that a repository
# path starting with "/" can be appended directly.
DEFAULT_BASE_URL='https://riahub.ai'
BASE_URL_SOURCE=${RIAHUB_BASE_URL:-$DEFAULT_BASE_URL}
BASE_URL_SOURCE="${BASE_URL_SOURCE%/}"

#######################################
# Print the base URLs to try, one per line, in priority order.
#   - Input with an http(s) scheme: the input itself (scheme lower-cased),
#     followed by the same host with the other scheme.
#   - Input without a scheme: https://<input>, then http://<input>.
# The scheme is matched case-insensitively, so "HTTPS://host" is recognised
# as a URL instead of being prefixed with a second scheme.
# Arguments: $1 - base URL or bare host
# Outputs:   candidate URLs on stdout
# NOTE(review): an http:// candidate is always emitted; callers that attach
# credentials will send them to that candidate too - confirm this is intended.
#######################################
build_base_candidates() {
  local raw="$1"
  local scheme_re='^([Hh][Tt][Tt][Pp][Ss]?)://(.*)$'
  local scheme rest

  if [[ "$raw" =~ $scheme_re ]]; then
    scheme="${BASH_REMATCH[1],,}"
    rest="${BASH_REMATCH[2]}"
    printf '%s://%s\n' "$scheme" "$rest"
    if [[ "$scheme" == "http" ]]; then
      printf 'https://%s\n' "$rest"
    else
      printf 'http://%s\n' "$rest"
    fi
    return 0
  fi

  printf 'https://%s\n' "$raw"
  printf 'http://%s\n' "$raw"
}
|
||
|
|
|
||
|
|
# HTTP basic-auth header derived from RIAHUB_USER / RIAHUB_TOKEN.
# Stays empty unless both variables are non-empty.
AUTH_HEADER=""
if [[ -n "${RIAHUB_USER:-}" ]] && [[ -n "${RIAHUB_TOKEN:-}" ]]; then
  printf -v AUTH_HEADER 'Authorization: basic %s' \
    "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')"
fi

#######################################
# Run git through sudo with terminal prompts disabled. When AUTH_HEADER is
# non-empty it is supplied as an http.extraheader config override on the git
# command line.
# Globals:   AUTH_HEADER (read)
# Arguments: the git arguments, passed through unchanged
# Returns:   exit status of the sudo/git invocation
#######################################
git_auth() {
  if [[ -z "$AUTH_HEADER" ]]; then
    sudo env GIT_TERMINAL_PROMPT=0 git "$@"
    return
  fi
  sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@"
}
|
||
|
|
REPO_PATH='/qoherent/icc-28.git'
DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-28/main'
# Repository-relative path of the single LFS-tracked file this step materializes.
_LFS_REL_PATH='datasets/icc28-train_v1.0.0.h5'

sudo mkdir -p "$(dirname "$DEST_ROOT")"

# Install git-lfs on demand when it is not already on PATH.
if ! command -v git-lfs >/dev/null 2>&1; then
  sudo apt-get update -y
  sudo apt-get install -y git-lfs
fi

mapfile -t BASE_CANDIDATES < <(build_base_candidates "$BASE_URL_SOURCE")

# Try each candidate base URL in order; the first one that yields the real
# (non-pointer) file wins. Any failing stage moves on to the next candidate.
MATERIALIZED=0
for base in "${BASE_CANDIDATES[@]}"; do
  base="${base%/}"
  REPO_URL="${base}${REPO_PATH}"
  echo "Fetching dataset from $REPO_URL"

  # Every attempt starts from an empty directory. ':?' aborts the script
  # rather than letting rm run against an empty path.
  sudo rm -rf -- "${DEST_ROOT:?}"
  sudo mkdir -p "$DEST_ROOT"

  # Sparse checkout restricted to the one dataset file.
  sudo git -C "$DEST_ROOT" init || continue
  sudo git -C "$DEST_ROOT" remote add origin "$REPO_URL" || continue
  sudo git -C "$DEST_ROOT" sparse-checkout init --no-cone || continue
  sudo git -C "$DEST_ROOT" sparse-checkout set --no-cone -- \
    "$_LFS_REL_PATH" || continue

  # Shallow-fetch the pinned commit.
  if ! git_auth -C "$DEST_ROOT" fetch --depth=1 origin '1d9083f05d0538110f09e710865b078eba30964b'; then
    continue
  fi
  # Check out with LFS smudging disabled; the LFS object is fetched
  # explicitly (with credentials) right after.
  if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then
    continue
  fi
  if ! git_auth -C "$DEST_ROOT" lfs fetch origin --include="$_LFS_REL_PATH" --exclude=""; then
    echo "LFS fetch failed for candidate $base, trying next" >&2
    continue
  fi
  if ! sudo git -C "$DEST_ROOT" lfs checkout; then
    echo "LFS checkout failed for candidate $base, trying next" >&2
    continue
  fi

  # A file that is absent after checkout must not count as success: the
  # pointer probe below prints nothing for a missing file and would pass.
  if ! sudo test -f "$DEST_ROOT/$_LFS_REL_PATH"; then
    echo "Dataset file missing at $DEST_ROOT/$_LFS_REL_PATH for candidate $base, trying next" >&2
    continue
  fi
  # An unresolved LFS pointer is a small text file starting with "version h".
  if [[ "$(sudo head -c 9 "$DEST_ROOT/$_LFS_REL_PATH" 2>/dev/null || true)" == "version h" ]]; then
    echo "LFS materialization left a pointer at $DEST_ROOT/$_LFS_REL_PATH for candidate $base, trying next" >&2
    continue
  fi

  MATERIALIZED=1
  break
done

if [[ "$MATERIALIZED" -ne 1 ]]; then
  echo "Failed to materialize dataset using base URL candidates derived from: $BASE_URL_SOURCE" >&2
  if [[ -z "$AUTH_HEADER" ]]; then
    echo " (no credentials configured: set QMBDEMO_USER and QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2
  fi
  exit 1
fi
|
||
|
|
|
||
|
|
- name: Checkout Validation Dataset
|
||
|
|
shell: bash
|
||
|
|
timeout-minutes: 10
|
||
|
|
env:
|
||
|
|
RIAHUB_USER: ${{ secrets.QMBDEMO_USER }}
|
||
|
|
RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }}
|
||
|
|
GIT_TERMINAL_PROMPT: "0"
|
||
|
|
run: |
|
||
|
|
set -euo pipefail

# Resolve the RIA Hub base URL: a non-empty RIAHUB_BASE_URL wins, otherwise the
# default below is used. One trailing slash is dropped so that a repository
# path starting with "/" can be appended directly.
DEFAULT_BASE_URL='https://riahub.ai'
BASE_URL_SOURCE=${RIAHUB_BASE_URL:-$DEFAULT_BASE_URL}
BASE_URL_SOURCE="${BASE_URL_SOURCE%/}"

#######################################
# Print the base URLs to try, one per line, in priority order.
#   - Input with an http(s) scheme: the input itself (scheme lower-cased),
#     followed by the same host with the other scheme.
#   - Input without a scheme: https://<input>, then http://<input>.
# The scheme is matched case-insensitively, so "HTTPS://host" is recognised
# as a URL instead of being prefixed with a second scheme.
# Arguments: $1 - base URL or bare host
# Outputs:   candidate URLs on stdout
# NOTE(review): an http:// candidate is always emitted; callers that attach
# credentials will send them to that candidate too - confirm this is intended.
#######################################
build_base_candidates() {
  local raw="$1"
  local scheme_re='^([Hh][Tt][Tt][Pp][Ss]?)://(.*)$'
  local scheme rest

  if [[ "$raw" =~ $scheme_re ]]; then
    scheme="${BASH_REMATCH[1],,}"
    rest="${BASH_REMATCH[2]}"
    printf '%s://%s\n' "$scheme" "$rest"
    if [[ "$scheme" == "http" ]]; then
      printf 'https://%s\n' "$rest"
    else
      printf 'http://%s\n' "$rest"
    fi
    return 0
  fi

  printf 'https://%s\n' "$raw"
  printf 'http://%s\n' "$raw"
}
|
||
|
|
|
||
|
|
# HTTP basic-auth header derived from RIAHUB_USER / RIAHUB_TOKEN.
# Stays empty unless both variables are non-empty.
AUTH_HEADER=""
if [[ -n "${RIAHUB_USER:-}" ]] && [[ -n "${RIAHUB_TOKEN:-}" ]]; then
  printf -v AUTH_HEADER 'Authorization: basic %s' \
    "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')"
fi

#######################################
# Run git through sudo with terminal prompts disabled. When AUTH_HEADER is
# non-empty it is supplied as an http.extraheader config override on the git
# command line.
# Globals:   AUTH_HEADER (read)
# Arguments: the git arguments, passed through unchanged
# Returns:   exit status of the sudo/git invocation
#######################################
git_auth() {
  if [[ -z "$AUTH_HEADER" ]]; then
    sudo env GIT_TERMINAL_PROMPT=0 git "$@"
    return
  fi
  sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@"
}
|
||
|
|
REPO_PATH='/qoherent/icc-28.git'
DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-28/main'
# Repository-relative path of the single LFS-tracked file this step materializes.
_LFS_REL_PATH='datasets/icc28-test_v1.0.0.h5'

# DEST_ROOT is the same directory the training-dataset step checks out into.
# Wiping it here would delete the training file that step just produced, so
# this step clones into a throw-away staging directory and only moves the
# finished validation file into DEST_ROOT.
sudo mkdir -p "$DEST_ROOT"

# Install git-lfs on demand when it is not already on PATH.
if ! command -v git-lfs >/dev/null 2>&1; then
  sudo apt-get update -y
  sudo apt-get install -y git-lfs
fi

# Staging lives next to DEST_ROOT so the final move stays on one filesystem;
# the EXIT trap removes it on every exit path.
STAGE_ROOT="$(sudo mktemp -d "$(dirname "$DEST_ROOT")/.validation-staging.XXXXXX")"
trap 'sudo rm -rf -- "${STAGE_ROOT:?}"' EXIT

mapfile -t BASE_CANDIDATES < <(build_base_candidates "$BASE_URL_SOURCE")

# Try each candidate base URL in order; the first one that yields the real
# (non-pointer) file wins. Any failing stage moves on to the next candidate.
MATERIALIZED=0
for base in "${BASE_CANDIDATES[@]}"; do
  base="${base%/}"
  REPO_URL="${base}${REPO_PATH}"
  echo "Fetching dataset from $REPO_URL"

  # Every attempt starts from an empty staging directory.
  sudo rm -rf -- "${STAGE_ROOT:?}"
  sudo mkdir -p "$STAGE_ROOT"

  # Sparse checkout restricted to the one dataset file.
  sudo git -C "$STAGE_ROOT" init || continue
  sudo git -C "$STAGE_ROOT" remote add origin "$REPO_URL" || continue
  sudo git -C "$STAGE_ROOT" sparse-checkout init --no-cone || continue
  sudo git -C "$STAGE_ROOT" sparse-checkout set --no-cone -- \
    "$_LFS_REL_PATH" || continue

  # Shallow-fetch the pinned commit.
  if ! git_auth -C "$STAGE_ROOT" fetch --depth=1 origin 'b307499b8c7150e10537d8f2f17fe108f0bc73db'; then
    continue
  fi
  # Check out with LFS smudging disabled; the LFS object is fetched
  # explicitly (with credentials) right after.
  if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 git -C "$STAGE_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then
    continue
  fi
  if ! git_auth -C "$STAGE_ROOT" lfs fetch origin --include="$_LFS_REL_PATH" --exclude=""; then
    echo "LFS fetch failed for candidate $base, trying next" >&2
    continue
  fi
  if ! sudo git -C "$STAGE_ROOT" lfs checkout; then
    echo "LFS checkout failed for candidate $base, trying next" >&2
    continue
  fi

  # A file that is absent after checkout must not count as success: the
  # pointer probe below prints nothing for a missing file and would pass.
  if ! sudo test -f "$STAGE_ROOT/$_LFS_REL_PATH"; then
    echo "Dataset file missing at $STAGE_ROOT/$_LFS_REL_PATH for candidate $base, trying next" >&2
    continue
  fi
  # An unresolved LFS pointer is a small text file starting with "version h".
  if [[ "$(sudo head -c 9 "$STAGE_ROOT/$_LFS_REL_PATH" 2>/dev/null || true)" == "version h" ]]; then
    echo "LFS materialization left a pointer at $STAGE_ROOT/$_LFS_REL_PATH for candidate $base, trying next" >&2
    continue
  fi

  # Publish the finished file at the path under DEST_ROOT, leaving whatever
  # else is already in DEST_ROOT untouched.
  sudo mkdir -p "$(dirname "$DEST_ROOT/$_LFS_REL_PATH")"
  if ! sudo mv -f -- "$STAGE_ROOT/$_LFS_REL_PATH" "$DEST_ROOT/$_LFS_REL_PATH"; then
    echo "Could not move dataset into $DEST_ROOT/$_LFS_REL_PATH for candidate $base, trying next" >&2
    continue
  fi

  MATERIALIZED=1
  break
done

if [[ "$MATERIALIZED" -ne 1 ]]; then
  echo "Failed to materialize dataset using base URL candidates derived from: $BASE_URL_SOURCE" >&2
  if [[ -z "$AUTH_HEADER" ]]; then
    echo " (no credentials configured: set QMBDEMO_USER and QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2
  fi
  exit 1
fi
|
||
|
|
|
||
|
|
- name: Checkout configs
|
||
|
|
uses: actions/checkout@v5
|
||
|
|
with:
|
||
|
|
sparse-checkout: .riahub/train_configs
|
||
|
|
|
||
|
|
- name: Copy configs into qmb folder
  run: |
    # Create the destination with sudo, matching the sudo copy below; a plain
    # mkdir fails when /opt/qmb is not writable by the runner user.
    sudo mkdir -p /opt/qmb/configs/
    # The workspace path is quoted so a path containing spaces is not split;
    # the glob stays outside the quotes so it still expands.
    sudo cp -r "${{ github.workspace }}"/.riahub/train_configs/* /opt/qmb/configs/
|
||
|
|
|
||
|
|
|
||
|
|
- name: List QMB project contents
|
||
|
|
run: |
|
||
|
|
ls -lha /opt/qmb
|
||
|
|
ls -lh /opt/qmb/wheel
|
||
|
|
|
||
|
|
- name: List Downloaded RIA Hub contents
|
||
|
|
run: |
|
||
|
|
ls -lh /opt/qmb/riahub || true
|
||
|
|
ls -lh /opt/qmb/riahub/model || true
|
||
|
|
ls -lh /opt/qmb/riahub/dataset || true
|
||
|
|
|
||
|
|
- name: Setup Python
|
||
|
|
uses: actions/setup-python@v6
|
||
|
|
with:
|
||
|
|
python-version: "3.13"
|
||
|
|
|
||
|
|
- name: Install Python dependencies
|
||
|
|
run: |
|
||
|
|
set -euo pipefail

# Install every wheel found under /opt/qmb/wheel, resolving dependencies from PyPI.
uv pip install --system --index-url https://pypi.org/simple --upgrade /opt/qmb/wheel/*.whl

# Pick the PyTorch wheel index. CPU-only is the default; it is replaced by a
# CUDA index only when a GPU compute capability can actually be read.
TORCH_INDEX_URL="https://download.pytorch.org/whl/cpu"
TORCH_REASON="no NVIDIA GPU detected"
if command -v nvidia-smi &> /dev/null; then
  # Preferred query; older nvidia-smi builds may not know compute_cap.
  CAP_LINES="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null || true)"
  if [[ -z "$CAP_LINES" ]]; then
    # Fallback: scrape the full report. '|| true' keeps errexit + pipefail
    # from aborting the step when nvidia-smi itself fails (the CPU default
    # then stays in effect).
    CAP_LINES="$(nvidia-smi -q 2>/dev/null | awk -F: '/Compute Capability/ {print $2}' || true)"
  fi
  # Highest capability across all reported GPUs; empty when none parsed.
  CAP_MAX="$(printf '%s\n' "$CAP_LINES" | awk '{gsub(/[^0-9.]/,""); if ($0=="") next; if ($0+0>max) max=$0+0} END {if (max>0) print max}')"
  if [[ -n "$CAP_MAX" ]]; then
    # awk performs the floating-point comparison that bash cannot.
    if awk -v cap="$CAP_MAX" 'BEGIN{exit !(cap>=7.5)}'; then
      TORCH_INDEX_URL="https://download.pytorch.org/whl/cu130"
      TORCH_REASON="compute capability ${CAP_MAX} >= 7.5"
    else
      TORCH_INDEX_URL="https://download.pytorch.org/whl/cu126"
      TORCH_REASON="compute capability ${CAP_MAX} < 7.5"
    fi
  else
    # nvidia-smi exists, so "no NVIDIA GPU detected" would be misleading here.
    TORCH_REASON="nvidia-smi found but compute capability could not be determined"
  fi
fi

echo "Installing PyTorch from ${TORCH_INDEX_URL} (${TORCH_REASON})."
uv pip install --system --index-url "$TORCH_INDEX_URL" --upgrade --force-reinstall torch torchvision
uv pip install --system --index-url https://pypi.org/simple --upgrade "onnxscript>=0.7.0" "onnx-ir>=0.2.1" onnx onnxruntime timm
|
||
|
|
|
||
|
|
- name: Run Training Script
|
||
|
|
run: |
|
||
|
|
# Strict mode, as in the other multi-command steps: stop on the first failing
# command and on unset variables (QMB_TASK_REPO_ROOT must come from the job env).
set -euo pipefail
cd /opt/qmb
# Put the task repositories ahead of any PYTHONPATH that is already set.
export PYTHONPATH="$QMB_TASK_REPO_ROOT:${PYTHONPATH:-}"
#source .venv/bin/activate
qmb train --config /opt/qmb/configs/train.yaml
|
||
|
|
|
||
|
|
- name: Collect training artifacts
|
||
|
|
if: always()
|
||
|
|
run: |
|
||
|
|
set -euo pipefail

# Rebuild the artifact staging directory from scratch on every run.
ARTIFACT_DIR="${{ github.workspace }}/.riahub/artifacts/training"
rm -rf -- "$ARTIFACT_DIR"
mkdir -p "$ARTIFACT_DIR"

if [[ -d "$QMB_OUTPUT_ROOT" ]]; then
  # Copy each selected file, keeping its path relative to QMB_OUTPUT_ROOT.
  # NUL-delimited reads keep unusual file names intact.
  while IFS= read -r -d '' file; do
    # The inner expansion is quoted so the output root is stripped as a
    # literal prefix, not interpreted as a glob pattern.
    rel="${file#"${QMB_OUTPUT_ROOT}"/}"
    if [[ "$rel" == "$file" ]]; then
      # Prefix did not match; fall back to the bare file name.
      rel="$(basename "$file")"
    fi
    mkdir -p "$ARTIFACT_DIR/$(dirname "$rel")"
    cp -- "$file" "$ARTIFACT_DIR/$rel"
  done < <(
    find "$QMB_OUTPUT_ROOT" -type f \( \
      -path "*/checkpoints/best.pt" -o \
      -path "*/checkpoints/best.ckpt" -o \
      -name "*.onnx" -o \
      -path "*/evaluation/*/confusion_matrix.png" -o \
      -path "*/evaluation/*/parameter_sweeps/*.png" \
    \) -print0
  )
else
  echo "QMB output root not found: $QMB_OUTPUT_ROOT"
fi

echo "Collected training artifacts:"
find "$ARTIFACT_DIR" -type f -print | sort || true
|
||
|
|
|
||
|
|
- name: ⬆️ Upload training artifacts
|
||
|
|
if: always()
|
||
|
|
uses: actions/upload-artifact@v3
|
||
|
|
with:
|
||
|
|
name: training-artifacts
|
||
|
|
path: ${{ github.workspace }}/.riahub/artifacts/training
|
||
|
|
if-no-files-found: warn
|
||
|
|
|
||
|
|
# committed at 2026-05-28T09:37:23.399702+00:00
|