# QMB Training workflow.
# Reconstructed from the new-file diff for .ria/train.yaml (blob b3ec70d).
#
# Pipeline: print runner diagnostics, sparse-clone the pinned dataset commit,
# copy the training configs into /opt/qmb, install the QMB wheel and a PyTorch
# build chosen from the GPU's compute capability, run `qmb train`, then collect
# and upload the resulting artifacts.
name: QMB Training
on:
  # NOTE(review): both triggers only fire for changes to
  # .riahub/workflows/train.yaml, while the diff this file came from names it
  # .ria/train.yaml - confirm the filter matches the real workflow path.
  push:
    branches: [ "main" ]
    paths:
      - ".riahub/workflows/train.yaml"
  pull_request:
    branches: [ "main" ]
    paths:
      - ".riahub/workflows/train.yaml"

permissions:
  contents: read
  actions: read

jobs:
  QMB-Training:
    runs-on: "hades-4090"
    env:
      # Repository variable first, then secret; an empty value makes the
      # dataset step fall back to its built-in default base URL.
      RIAHUB_BASE_URL: ${{ vars.RIAHUB_BASE_URL || secrets.RIAHUB_BASE_URL || '' }}
      QMB_OUTPUT_ROOT: "/opt/qmb/outputs"
      QMB_TASK_REPO_ROOT: "/opt/qmb/task_repos"
    steps:
      - name: Display basic runner info
        run: |
          echo "Runner OS: ${{ runner.os }}"
          echo "Runner Architecture: ${{ runner.arch }}"

      - name: Print CPU information
        run: |
          lscpu

      - name: Print GPU information
        run: |
          if command -v nvidia-smi &> /dev/null; then
            nvidia-smi
          else
            echo "No NVIDIA GPU available."
          fi

      - name: Checkout Datasets (lswersk/library-test)
        env:
          RIAHUB_USER: ${{ secrets.QMBDEMO_USER }}
          RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }}
        run: |
          set -euo pipefail
          DEFAULT_BASE_URL="http://localhost:3000"
          BASE_URL_SOURCE=${RIAHUB_BASE_URL:-$DEFAULT_BASE_URL}
          BASE_URL_SOURCE="${BASE_URL_SOURCE%/}"

          # Print the base URLs to try, one per line: the URL as given plus the
          # same host under the other scheme, or https then http when no
          # scheme was supplied.
          build_base_candidates() {
            local raw="$1"
            if [[ "$raw" =~ ^https?:// ]]; then
              echo "$raw"
              if [[ "$raw" == http://* ]]; then
                echo "https://${raw#http://}"
              elif [[ "$raw" == https://* ]]; then
                echo "http://${raw#https://}"
              fi
              return
            fi
            echo "https://$raw"
            echo "http://$raw"
          }

          REPO_PATH="/lswersk/library-test.git"
          DEST_ROOT="/opt/qmb/riahub/dataset/lswersk/library-test/main"
          # Single source of truth for the files to materialise; used for the
          # sparse-checkout patterns, the LFS include list and the final check.
          DATASET_FILES=(
            "datasets/tiny-pluto/test.h5"
            "datasets/tiny-pluto/train.h5"
            "datasets/tiny-pluto/val.h5"
          )

          sudo mkdir -p "$(dirname "$DEST_ROOT")"
          mapfile -t BASE_CANDIDATES < <(build_base_candidates "$BASE_URL_SOURCE")
          CLONED=0
          for base in "${BASE_CANDIDATES[@]}"; do
            base="${base%/}"
            REPO_URL="${base}${REPO_PATH}"
            # Insert the credentials after the scheme with plain parameter
            # expansion, so characters such as '#', '&' or '\' in the user or
            # token are copied literally instead of being interpreted by sed.
            AUTHED_URL="${REPO_URL%%://*}://${RIAHUB_USER}:${RIAHUB_TOKEN}@${REPO_URL#*://}"
            # Log only the credential-free URL.
            echo "Cloning dataset repo from $REPO_URL"
            sudo rm -rf "$DEST_ROOT"
            if sudo git clone --filter=blob:none --no-checkout "$AUTHED_URL" "$DEST_ROOT"; then
              CLONED=1
              break
            fi
          done
          if [[ "$CLONED" -ne 1 ]]; then
            echo "Failed to clone dataset repo using base URL candidates derived from: $BASE_URL_SOURCE" >&2
            exit 1
          fi

          if ! command -v git-lfs >/dev/null 2>&1; then
            sudo apt-get update -y
            sudo apt-get install -y git-lfs
          fi
          sudo git -C "$DEST_ROOT" lfs install --local || true

          # Restrict the working tree to the dataset files, then check out the
          # pinned commit in detached-HEAD state.
          sudo git -C "$DEST_ROOT" sparse-checkout init --no-cone
          sudo git -C "$DEST_ROOT" sparse-checkout set --no-cone -- "${DATASET_FILES[@]}"
          sudo git -C "$DEST_ROOT" fetch --depth=1 origin "e4bd5193c5bb09aa23afd18e138840befefa59cd"
          sudo git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD

          # Comma-joined form of DATASET_FILES for `git lfs fetch --include`.
          LFS_INCLUDE="$(IFS=,; printf '%s' "${DATASET_FILES[*]}")"
          sudo git -C "$DEST_ROOT" lfs fetch origin --include="$LFS_INCLUDE" --exclude="" || true
          sudo git -C "$DEST_ROOT" lfs checkout || true

          # Drop the remote (its URL embeds the credentials) and any stored
          # auth header so nothing sensitive stays in the clone's config.
          sudo git -C "$DEST_ROOT" remote remove origin || true
          sudo git -C "$DEST_ROOT" config --local --unset-all http.extraheader || true

          # The LFS commands above are best-effort; warn here so a failed
          # download is visible in this step's log. Warn-only by design.
          for rel in "${DATASET_FILES[@]}"; do
            target="$DEST_ROOT/$rel"
            if ! sudo test -s "$target"; then
              echo "WARNING: dataset file missing or empty: $target" >&2
            elif sudo head -c 64 "$target" | grep -aq '^version https://git-lfs'; then
              echo "WARNING: dataset file is still a Git LFS pointer: $target" >&2
            fi
          done

      - name: Checkout configs
        uses: actions/checkout@v5
        with:
          sparse-checkout: .riahub/train_configs

      - name: Copy configs into qmb folder
        run: |
          # sudo on both commands, matching every other write under /opt/qmb.
          sudo mkdir -p /opt/qmb/configs/
          sudo cp -r "${{ github.workspace }}/.riahub/train_configs/"* /opt/qmb/configs/

      - name: List QMB project contents
        run: |
          ls -lha /opt/qmb
          ls -lh /opt/qmb/wheel

      - name: List Downloaded RIA Hub contents
        run: |
          ls -lh /opt/qmb/riahub || true
          ls -lh /opt/qmb/riahub/model || true
          ls -lh /opt/qmb/riahub/dataset || true

      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.13"

      - name: Install Python dependencies
        run: |
          set -euo pipefail
          uv pip install --system --index-url https://pypi.org/simple --upgrade /opt/qmb/wheel/*.whl

          # Default to CPU wheels; switch to a CUDA index only when a compute
          # capability can actually be read from nvidia-smi.
          TORCH_INDEX_URL="https://download.pytorch.org/whl/cpu"
          TORCH_REASON="no NVIDIA GPU detected"
          if command -v nvidia-smi &> /dev/null; then
            CAP_LINES="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null || true)"
            if [[ -z "$CAP_LINES" ]]; then
              # `|| true`: with pipefail a failing nvidia-smi would otherwise
              # abort the step here instead of falling back to the CPU index.
              CAP_LINES="$(nvidia-smi -q 2>/dev/null | awk -F: '/Compute Capability/ {print $2}' || true)"
            fi
            # Highest capability across all reported GPUs; empty if none parsed.
            CAP_MAX="$(echo "$CAP_LINES" | awk '{gsub(/[^0-9.]/,""); if ($0=="") next; if ($0+0>max) max=$0+0} END {if (max>0) print max}')"
            if [[ -n "$CAP_MAX" ]]; then
              if awk -v cap="$CAP_MAX" 'BEGIN{exit !(cap>=7.5)}'; then
                TORCH_INDEX_URL="https://download.pytorch.org/whl/cu130"
                TORCH_REASON="compute capability ${CAP_MAX} >= 7.5"
              else
                TORCH_INDEX_URL="https://download.pytorch.org/whl/cu126"
                TORCH_REASON="compute capability ${CAP_MAX} < 7.5"
              fi
            fi
          fi
          echo "Installing PyTorch from ${TORCH_INDEX_URL} (${TORCH_REASON})."
          uv pip install --system --index-url "$TORCH_INDEX_URL" --upgrade --force-reinstall torch torchvision
          uv pip install --system --index-url https://pypi.org/simple --upgrade "onnxscript>=0.7.0" "onnx-ir>=0.2.1" onnx onnxruntime timm

      - name: Run Training Script
        run: |
          set -euo pipefail
          cd /opt/qmb
          export PYTHONPATH="$QMB_TASK_REPO_ROOT:${PYTHONPATH:-}"
          #source .venv/bin/activate
          qmb train --config /opt/qmb/configs/train.yaml

      - name: Collect training artifacts
        if: always()
        run: |
          set -euo pipefail
          ARTIFACT_DIR="${{ github.workspace }}/.riahub/artifacts/training"
          rm -rf "$ARTIFACT_DIR"
          mkdir -p "$ARTIFACT_DIR"
          if [[ -d "$QMB_OUTPUT_ROOT" ]]; then
            # Copy each matching file, keeping its path relative to the
            # output root; NUL-delimited so unusual file names survive.
            while IFS= read -r -d '' file; do
              rel="${file#${QMB_OUTPUT_ROOT}/}"
              if [[ "$rel" == "$file" ]]; then
                # Prefix did not match; fall back to the bare file name.
                rel="$(basename "$file")"
              fi
              mkdir -p "$ARTIFACT_DIR/$(dirname "$rel")"
              cp "$file" "$ARTIFACT_DIR/$rel"
            done < <(
              find "$QMB_OUTPUT_ROOT" -type f \( \
                -path "*/checkpoints/best.pt" -o \
                -path "*/checkpoints/best.ckpt" -o \
                -name "*.onnx" -o \
                -path "*/evaluation/*/confusion_matrix.png" -o \
                -path "*/evaluation/*/parameter_sweeps/*.png" \
              \) -print0
            )
          else
            echo "QMB output root not found: $QMB_OUTPUT_ROOT"
          fi
          echo "Collected training artifacts:"
          find "$ARTIFACT_DIR" -type f -print | sort || true

      # NOTE(review): upload-artifact is pinned to v3, presumably for
      # compatibility with the RIA Hub runner - confirm before bumping.
      - name: ⬆️ Upload training artifacts
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: training-artifacts
          path: ${{ github.workspace }}/.riahub/artifacts/training
          if-no-files-found: warn

# committed at 2026-04-24T00:04:24.549781+00:00