Training run - 2026-05-26 22:54:53
This commit is contained in:
parent
6f4e96e49b
commit
7c274eafaa
60
.riahub/train_configs/train.yaml
Normal file
60
.riahub/train_configs/train.yaml
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
# Data-loading section of the training config.
# The train/validation/test paths all live under
# /opt/qmb/riahub/dataset/qoherent/ash_test_demo_repo/main/datasets, i.e. the
# directory the workflow's three "Checkout ... Dataset" steps clone into.
# NOTE(review): indentation was lost in this view, so which keys are nested
# under dataset_params cannot be confirmed from here.
data:
|
||||
batch_size: 32
|
||||
dataset_params:
|
||||
iq_key: iq_data
|
||||
label_key: labels
|
||||
drop_last: false
|
||||
kind: iq_h5
|
||||
num_workers: 0
|
||||
persistent_workers: false
|
||||
pin_memory: false
|
||||
test_path: /opt/qmb/riahub/dataset/qoherent/ash_test_demo_repo/main/datasets/test_demo_v1.0.0.h5
|
||||
test_split: 0
|
||||
train_path: /opt/qmb/riahub/dataset/qoherent/ash_test_demo_repo/main/datasets/train_demo_v1.0.0.h5
|
||||
validation_path: /opt/qmb/riahub/dataset/qoherent/ash_test_demo_repo/main/datasets/val_demo_v1.0.0.h5
|
||||
validation_split: 0
|
||||
# Post-training evaluation settings.
# The workflow's artifact step collects */evaluation/*/confusion_matrix.png;
# presumably save_confusion is what produces that file — TODO confirm.
# NOTE(review): whether `split` belongs to `params` or to `evaluation` itself
# is not visible here because indentation was lost.
evaluation:
|
||||
capture_predictions: true
|
||||
enabled: true
|
||||
params:
|
||||
save_confusion: true
|
||||
split: test
|
||||
# ONNX export settings.
# The workflow's artifact step collects every *.onnx file found under
# QMB_OUTPUT_ROOT; presumably demo_model.onnx is written there — TODO confirm.
export:
|
||||
dynamic_batch: true
|
||||
dynamic_width: false
|
||||
enabled: true
|
||||
file_name: demo_model.onnx
|
||||
opset_version: 17
|
||||
strict: false
|
||||
use_dynamo: true
|
||||
use_onnxsim: false
|
||||
# Model selection.
# NOTE(review): how qmb resolves `iq_tiny_cnn` and what hidden_channels
# controls are not visible from this file — confirm against qmb.
model:
|
||||
name: iq_tiny_cnn
|
||||
params:
|
||||
hidden_channels: 16
|
||||
# Loss and optimizer settings.
# NOTE(review): indentation was lost in this view, so the nesting of the keys
# below (e.g. which `params` belongs to which `name`) is not visible here.
optimization:
|
||||
loss:
|
||||
name: cross_entropy
|
||||
params: {}
|
||||
optimizer:
|
||||
name: adam
|
||||
params:
|
||||
amsgrad: false
|
||||
# Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML
# only resolve exponent notation to a float when the mantissa contains ".",
# so a bare 1e-08 would be loaded as the string "1e-08".
eps: 1.0e-08
|
||||
lr: 0.001
|
||||
weight_decay: 0
|
||||
# Run-level settings (precision, checkpointing, device, epochs, seed).
# The workflow's artifact step looks for checkpoints/best.pt or
# checkpoints/best.ckpt under QMB_OUTPUT_ROOT.
# NOTE(review): `device: auto` presumably lets qmb pick CPU or GPU; the
# workflow installs a CPU or CUDA torch build based on nvidia-smi — confirm
# the two agree.
runtime:
|
||||
amp_enabled: false
|
||||
autocast_dtype: float32
|
||||
checkpoint_every_n_epochs: 1
|
||||
component_modules: []
|
||||
device: auto
|
||||
epochs: 1
|
||||
progress_bar: false
|
||||
seed: 42
|
||||
# Task definition.
# NOTE(review): presumably the "best" checkpoint is the one with the maximum
# accuracy (selection_metric / selection_mode) — confirm against qmb. Whether
# the last three keys sit under `params` is not visible in this view.
task:
|
||||
name: classification
|
||||
params:
|
||||
save_artifacts: true
|
||||
selection_metric: accuracy
|
||||
selection_mode: max
|
||||
320
.riahub/workflows/train.yaml
Normal file
320
.riahub/workflows/train.yaml
Normal file
|
|
@ -0,0 +1,320 @@
|
|||
name: QMB Training

# Triggers: pushes to main and pull requests against main, but only when this
# workflow file itself is part of the change.
on:
  push:
    branches:
      - main
    paths:
      - .riahub/workflows/train.yaml
  pull_request:
    branches:
      - main
    paths:
      - .riahub/workflows/train.yaml

# Read-only token scopes.
permissions:
  contents: read
  actions: read

jobs:
  QMB-Training:
    runs-on: ubuntu-latest-2080
    env:
      # Repository variable first, then the secret of the same name, else
      # empty (the dataset steps substitute their own default when empty).
      RIAHUB_BASE_URL: ${{ vars.RIAHUB_BASE_URL || secrets.RIAHUB_BASE_URL || '' }}
      QMB_OUTPUT_ROOT: /opt/qmb/outputs
      QMB_TASK_REPO_ROOT: /opt/qmb/task_repos
    steps:
|
||||
# Logs the OS and CPU architecture reported by the runner context.
- name: Display basic runner info
  run: |
    printf 'Runner OS: %s\n' "${{ runner.os }}"
    printf 'Runner Architecture: %s\n' "${{ runner.arch }}"
|
||||
|
||||
# Dumps the CPU topology of the runner into the job log.
- name: Print CPU information
  run: lscpu
|
||||
|
||||
# Prints nvidia-smi output when the tool is on PATH, otherwise a notice.
- name: Print GPU information
  run: |
    if ! command -v nvidia-smi >/dev/null 2>&1; then
      echo "No NVIDIA GPU available."
    else
      nvidia-smi
    fi
|
||||
|
||||
|
||||
# Fetches datasets/train_demo_v1.0.0.h5 at a pinned commit into
# /opt/qmb/riahub/dataset/qoherent/ash_test_demo_repo/main using a blob-less,
# sparse clone. Both the https and http form of the base URL are tried.
- name: Checkout Training Dataset
  env:
    RIAHUB_USER: ${{ secrets.QMBDEMO_USER }}
    RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }}
  run: |
    set -euo pipefail
    DEFAULT_BASE_URL="https://riahub.ai"
    # Job-level RIAHUB_BASE_URL may be empty; fall back to the default then.
    BASE_URL_SOURCE=${RIAHUB_BASE_URL:-$DEFAULT_BASE_URL}
    BASE_URL_SOURCE="${BASE_URL_SOURCE%/}"

    # Prints candidate base URLs, one per line: the URL as given first, then
    # the same host with the other scheme. A scheme-less value yields https
    # followed by http.
    build_base_candidates() {
      local raw="$1"
      if [[ "$raw" =~ ^https?:// ]]; then
        echo "$raw"
        if [[ "$raw" == http://* ]]; then
          echo "https://${raw#http://}"
        elif [[ "$raw" == https://* ]]; then
          echo "http://${raw#https://}"
        fi
        return
      fi
      echo "https://$raw"
      echo "http://$raw"
    }

    REPO_PATH="/qoherent/ash_test_demo_repo.git"
    DEST_ROOT="/opt/qmb/riahub/dataset/qoherent/ash_test_demo_repo/main"
    sudo mkdir -p "$(dirname "$DEST_ROOT")"
    mapfile -t BASE_CANDIDATES < <(build_base_candidates "$BASE_URL_SOURCE")
    CLONED=0
    for base in "${BASE_CANDIDATES[@]}"; do
      base="${base%/}"
      REPO_URL="${base}${REPO_PATH}"
      # Splice the credentials in with plain string operations. The previous
      # sed substitution broke when the user or token contained '#', '&' or
      # a backslash, which sed treats as delimiter/replacement syntax.
      # NOTE(review): the http:// candidate sends the token in cleartext;
      # confirm that fallback is really needed.
      AUTHED_URL="${REPO_URL%%://*}://${RIAHUB_USER}:${RIAHUB_TOKEN}@${REPO_URL#*://}"
      echo "Cloning dataset repo from $REPO_URL"
      sudo rm -rf "$DEST_ROOT"
      if sudo git clone --filter=blob:none --no-checkout "$AUTHED_URL" "$DEST_ROOT"; then
        CLONED=1
        break
      fi
    done
    if [[ "$CLONED" -ne 1 ]]; then
      echo "Failed to clone dataset repo using base URL candidates derived from: $BASE_URL_SOURCE" >&2
      exit 1
    fi
    if ! command -v git-lfs >/dev/null 2>&1; then
      sudo apt-get update -y
      sudo apt-get install -y git-lfs
    fi
    # LFS commands are best-effort (|| true), as in the original step.
    sudo git -C "$DEST_ROOT" lfs install --local || true
    # Materialise only the one dataset file in the working tree.
    sudo git -C "$DEST_ROOT" sparse-checkout init --no-cone
    sudo git -C "$DEST_ROOT" sparse-checkout set --no-cone -- \
      "datasets/train_demo_v1.0.0.h5"
    sudo git -C "$DEST_ROOT" fetch --depth=1 origin "6f4e96e49bdf6634a5a2ceb9fee5bae379ae7021"
    sudo git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD
    sudo git -C "$DEST_ROOT" lfs fetch origin --include="datasets/train_demo_v1.0.0.h5" --exclude="" || true
    sudo git -C "$DEST_ROOT" lfs checkout || true
    # Drop the remote so the credential-bearing URL does not stay in
    # .git/config.
    sudo git -C "$DEST_ROOT" remote remove origin || true
    sudo git -C "$DEST_ROOT" config --local --unset-all http.extraheader || true
|
||||
|
||||
# Fetches datasets/val_demo_v1.0.0.h5 at a pinned commit and places it under
# /opt/qmb/riahub/dataset/qoherent/ash_test_demo_repo/main/datasets.
# The clone happens in a scratch directory: cloning straight into DEST_ROOT
# (after `rm -rf "$DEST_ROOT"`) deleted the dataset file that the preceding
# dataset step had checked out into the same directory.
- name: Checkout Validation Dataset
  env:
    RIAHUB_USER: ${{ secrets.QMBDEMO_USER }}
    RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }}
  run: |
    set -euo pipefail
    DEFAULT_BASE_URL="https://riahub.ai"
    # Job-level RIAHUB_BASE_URL may be empty; fall back to the default then.
    BASE_URL_SOURCE=${RIAHUB_BASE_URL:-$DEFAULT_BASE_URL}
    BASE_URL_SOURCE="${BASE_URL_SOURCE%/}"

    # Prints candidate base URLs, one per line: the URL as given first, then
    # the same host with the other scheme. A scheme-less value yields https
    # followed by http.
    build_base_candidates() {
      local raw="$1"
      if [[ "$raw" =~ ^https?:// ]]; then
        echo "$raw"
        if [[ "$raw" == http://* ]]; then
          echo "https://${raw#http://}"
        elif [[ "$raw" == https://* ]]; then
          echo "http://${raw#https://}"
        fi
        return
      fi
      echo "https://$raw"
      echo "http://$raw"
    }

    REPO_PATH="/qoherent/ash_test_demo_repo.git"
    DEST_ROOT="/opt/qmb/riahub/dataset/qoherent/ash_test_demo_repo/main"
    DATASET_FILE="datasets/val_demo_v1.0.0.h5"
    # Scratch clone next to DEST_ROOT; removed on exit (success or failure)
    # so the credential-bearing remote URL never outlives the step.
    WORK_DIR="${DEST_ROOT}.val_checkout"
    trap 'sudo rm -rf "$WORK_DIR"' EXIT
    sudo mkdir -p "$(dirname "$DEST_ROOT")"
    mapfile -t BASE_CANDIDATES < <(build_base_candidates "$BASE_URL_SOURCE")
    CLONED=0
    for base in "${BASE_CANDIDATES[@]}"; do
      base="${base%/}"
      REPO_URL="${base}${REPO_PATH}"
      # Splice the credentials in with plain string operations. The previous
      # sed substitution broke when the user or token contained '#', '&' or
      # a backslash, which sed treats as delimiter/replacement syntax.
      # NOTE(review): the http:// candidate sends the token in cleartext;
      # confirm that fallback is really needed.
      AUTHED_URL="${REPO_URL%%://*}://${RIAHUB_USER}:${RIAHUB_TOKEN}@${REPO_URL#*://}"
      echo "Cloning dataset repo from $REPO_URL"
      sudo rm -rf "$WORK_DIR"
      if sudo git clone --filter=blob:none --no-checkout "$AUTHED_URL" "$WORK_DIR"; then
        CLONED=1
        break
      fi
    done
    if [[ "$CLONED" -ne 1 ]]; then
      echo "Failed to clone dataset repo using base URL candidates derived from: $BASE_URL_SOURCE" >&2
      exit 1
    fi
    if ! command -v git-lfs >/dev/null 2>&1; then
      sudo apt-get update -y
      sudo apt-get install -y git-lfs
    fi
    # LFS commands are best-effort (|| true), as in the original step.
    sudo git -C "$WORK_DIR" lfs install --local || true
    # Materialise only the one dataset file in the working tree.
    sudo git -C "$WORK_DIR" sparse-checkout init --no-cone
    sudo git -C "$WORK_DIR" sparse-checkout set --no-cone -- \
      "$DATASET_FILE"
    sudo git -C "$WORK_DIR" fetch --depth=1 origin "6e347846fbf0270fc5547a4522a93b5b411925e6"
    sudo git -C "$WORK_DIR" -c advice.detachedHead=false checkout FETCH_HEAD
    sudo git -C "$WORK_DIR" lfs fetch origin --include="$DATASET_FILE" --exclude="" || true
    sudo git -C "$WORK_DIR" lfs checkout || true
    # Publish just the dataset file; whatever already exists under DEST_ROOT
    # is left in place. cp fails the step if the file was not checked out.
    sudo mkdir -p "$DEST_ROOT/$(dirname "$DATASET_FILE")"
    sudo cp -f "$WORK_DIR/$DATASET_FILE" "$DEST_ROOT/$DATASET_FILE"
|
||||
|
||||
# Fetches datasets/test_demo_v1.0.0.h5 at a pinned commit and places it under
# /opt/qmb/riahub/dataset/qoherent/ash_test_demo_repo/main/datasets.
# The clone happens in a scratch directory: cloning straight into DEST_ROOT
# (after `rm -rf "$DEST_ROOT"`) deleted the dataset files that the preceding
# dataset steps had checked out into the same directory.
- name: Checkout Test Dataset
  env:
    RIAHUB_USER: ${{ secrets.QMBDEMO_USER }}
    RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }}
  run: |
    set -euo pipefail
    DEFAULT_BASE_URL="https://riahub.ai"
    # Job-level RIAHUB_BASE_URL may be empty; fall back to the default then.
    BASE_URL_SOURCE=${RIAHUB_BASE_URL:-$DEFAULT_BASE_URL}
    BASE_URL_SOURCE="${BASE_URL_SOURCE%/}"

    # Prints candidate base URLs, one per line: the URL as given first, then
    # the same host with the other scheme. A scheme-less value yields https
    # followed by http.
    build_base_candidates() {
      local raw="$1"
      if [[ "$raw" =~ ^https?:// ]]; then
        echo "$raw"
        if [[ "$raw" == http://* ]]; then
          echo "https://${raw#http://}"
        elif [[ "$raw" == https://* ]]; then
          echo "http://${raw#https://}"
        fi
        return
      fi
      echo "https://$raw"
      echo "http://$raw"
    }

    REPO_PATH="/qoherent/ash_test_demo_repo.git"
    DEST_ROOT="/opt/qmb/riahub/dataset/qoherent/ash_test_demo_repo/main"
    DATASET_FILE="datasets/test_demo_v1.0.0.h5"
    # Scratch clone next to DEST_ROOT; removed on exit (success or failure)
    # so the credential-bearing remote URL never outlives the step.
    WORK_DIR="${DEST_ROOT}.test_checkout"
    trap 'sudo rm -rf "$WORK_DIR"' EXIT
    sudo mkdir -p "$(dirname "$DEST_ROOT")"
    mapfile -t BASE_CANDIDATES < <(build_base_candidates "$BASE_URL_SOURCE")
    CLONED=0
    for base in "${BASE_CANDIDATES[@]}"; do
      base="${base%/}"
      REPO_URL="${base}${REPO_PATH}"
      # Splice the credentials in with plain string operations. The previous
      # sed substitution broke when the user or token contained '#', '&' or
      # a backslash, which sed treats as delimiter/replacement syntax.
      # NOTE(review): the http:// candidate sends the token in cleartext;
      # confirm that fallback is really needed.
      AUTHED_URL="${REPO_URL%%://*}://${RIAHUB_USER}:${RIAHUB_TOKEN}@${REPO_URL#*://}"
      echo "Cloning dataset repo from $REPO_URL"
      sudo rm -rf "$WORK_DIR"
      if sudo git clone --filter=blob:none --no-checkout "$AUTHED_URL" "$WORK_DIR"; then
        CLONED=1
        break
      fi
    done
    if [[ "$CLONED" -ne 1 ]]; then
      echo "Failed to clone dataset repo using base URL candidates derived from: $BASE_URL_SOURCE" >&2
      exit 1
    fi
    if ! command -v git-lfs >/dev/null 2>&1; then
      sudo apt-get update -y
      sudo apt-get install -y git-lfs
    fi
    # LFS commands are best-effort (|| true), as in the original step.
    sudo git -C "$WORK_DIR" lfs install --local || true
    # Materialise only the one dataset file in the working tree.
    sudo git -C "$WORK_DIR" sparse-checkout init --no-cone
    sudo git -C "$WORK_DIR" sparse-checkout set --no-cone -- \
      "$DATASET_FILE"
    sudo git -C "$WORK_DIR" fetch --depth=1 origin "82ed23a403a6766c85229092f25a879cdf64da86"
    sudo git -C "$WORK_DIR" -c advice.detachedHead=false checkout FETCH_HEAD
    sudo git -C "$WORK_DIR" lfs fetch origin --include="$DATASET_FILE" --exclude="" || true
    sudo git -C "$WORK_DIR" lfs checkout || true
    # Publish just the dataset file; whatever already exists under DEST_ROOT
    # is left in place. cp fails the step if the file was not checked out.
    sudo mkdir -p "$DEST_ROOT/$(dirname "$DATASET_FILE")"
    sudo cp -f "$WORK_DIR/$DATASET_FILE" "$DEST_ROOT/$DATASET_FILE"
|
||||
|
||||
# Checks out this repository into the workspace, restricted by sparse
# checkout to the .riahub/train_configs directory (the later copy step reads
# the configs from there).
- name: Checkout configs
|
||||
uses: actions/checkout@v5
|
||||
with:
|
||||
sparse-checkout: .riahub/train_configs
|
||||
|
||||
# Copies the checked-out training configs to /opt/qmb/configs, the location
# the training step passes to `qmb train --config`.
- name: Copy configs into qmb folder
  run: |
    set -euo pipefail
    # Created with sudo like every other write under /opt/qmb in this job;
    # an unprivileged mkdir fails when /opt/qmb is not writable by the
    # runner user.
    sudo mkdir -p /opt/qmb/configs/
    # Workspace path is quoted so a path containing spaces is not split; the
    # glob stays outside the quotes so it still expands.
    sudo cp -r "${{ github.workspace }}"/.riahub/train_configs/* /opt/qmb/configs/
|
||||
|
||||
|
||||
# Diagnostic listing of the QMB root (including hidden entries) and of the
# wheel directory that the dependency step installs from.
- name: List QMB project contents
  run: |
    ls -l -h -a /opt/qmb
    ls -l -h /opt/qmb/wheel
|
||||
|
||||
# Diagnostic listing of the RIA Hub download directories; a missing
# directory is tolerated.
- name: List Downloaded RIA Hub contents
  run: |
    for listing_dir in /opt/qmb/riahub /opt/qmb/riahub/model /opt/qmb/riahub/dataset; do
      ls -lh "$listing_dir" || true
    done
|
||||
|
||||
# Installs the Python interpreter used by the following steps.
# The version is quoted so YAML keeps it a string rather than a float.
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: "3.13"
|
||||
|
||||
# Installs the QMB wheel, then a PyTorch build matched to the runner's GPU
# (CPU wheel when no usable GPU is found), then the ONNX tooling.
- name: Install Python dependencies
  run: |
    set -euo pipefail
    uv pip install --system --index-url https://pypi.org/simple --upgrade /opt/qmb/wheel/*.whl
    # Defaults describe the no-GPU case; overridden below when a compute
    # capability can be read from nvidia-smi.
    TORCH_INDEX_URL="https://download.pytorch.org/whl/cpu"
    TORCH_REASON="no NVIDIA GPU detected"
    if command -v nvidia-smi &> /dev/null; then
      CAP_LINES="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null || true)"
      if [[ -z "$CAP_LINES" ]]; then
        # Fallback probe. It needs `|| true` like the first one: under
        # `set -euo pipefail` a failing nvidia-smi would otherwise abort
        # the step instead of falling back to the CPU wheel.
        CAP_LINES="$(nvidia-smi -q 2>/dev/null | awk -F: '/Compute Capability/ {print $2}' || true)"
      fi
      # Highest compute capability among all reported GPUs; empty when
      # nothing numeric was found.
      CAP_MAX="$(echo "$CAP_LINES" | awk '{gsub(/[^0-9.]/,""); if ($0=="") next; if ($0+0>max) max=$0+0} END {if (max>0) print max}')"
      if [[ -n "$CAP_MAX" ]]; then
        # awk performs the floating-point comparison that bash cannot.
        if awk -v cap="$CAP_MAX" 'BEGIN{exit !(cap>=7.5)}'; then
          TORCH_INDEX_URL="https://download.pytorch.org/whl/cu130"
          TORCH_REASON="compute capability ${CAP_MAX} >= 7.5"
        else
          TORCH_INDEX_URL="https://download.pytorch.org/whl/cu126"
          TORCH_REASON="compute capability ${CAP_MAX} < 7.5"
        fi
      fi
    fi
    echo "Installing PyTorch from ${TORCH_INDEX_URL} (${TORCH_REASON})."
    uv pip install --system --index-url "$TORCH_INDEX_URL" --upgrade --force-reinstall torch torchvision
    uv pip install --system --index-url https://pypi.org/simple --upgrade "onnxscript>=0.7.0" "onnx-ir>=0.2.1" onnx onnxruntime timm
|
||||
|
||||
# Runs `qmb train` from /opt/qmb with the copied config; the task repo root
# is prepended to PYTHONPATH for that process.
- name: Run Training Script
  run: |
    cd /opt/qmb
    # Virtualenv activation is left disabled; the dependency step installs
    # with `uv pip install --system`.
    #source .venv/bin/activate
    PYTHONPATH="$QMB_TASK_REPO_ROOT:${PYTHONPATH:-}" qmb train --config /opt/qmb/configs/train.yaml
|
||||
|
||||
# Copies selected outputs (best checkpoint, ONNX files, evaluation PNGs) from
# QMB_OUTPUT_ROOT into the workspace, keeping their relative paths. Runs even
# when an earlier step failed.
- name: Collect training artifacts
  if: always()
  run: |
    set -euo pipefail
    ARTIFACT_DIR="${{ github.workspace }}/.riahub/artifacts/training"
    # Start from an empty artifact directory on every run.
    rm -rf "$ARTIFACT_DIR"
    mkdir -p "$ARTIFACT_DIR"
    if [[ ! -d "$QMB_OUTPUT_ROOT" ]]; then
      echo "QMB output root not found: $QMB_OUTPUT_ROOT"
    else
      # NUL-delimited so unusual file names survive the read loop.
      while IFS= read -r -d '' src; do
        # Path relative to the output root; use the bare file name if the
        # prefix could not be stripped.
        rel="${src#${QMB_OUTPUT_ROOT}/}"
        if [[ "$rel" == "$src" ]]; then
          rel="$(basename "$src")"
        fi
        dest="$ARTIFACT_DIR/$rel"
        mkdir -p "$(dirname "$dest")"
        cp "$src" "$dest"
      done < <(
        find "$QMB_OUTPUT_ROOT" -type f \( \
          -path "*/checkpoints/best.pt" -o \
          -path "*/checkpoints/best.ckpt" -o \
          -name "*.onnx" -o \
          -path "*/evaluation/*/confusion_matrix.png" -o \
          -path "*/evaluation/*/parameter_sweeps/*.png" \
        \) -print0
      )
    fi
    echo "Collected training artifacts:"
    find "$ARTIFACT_DIR" -type f -print | sort || true
|
||||
|
||||
# Uploads the directory filled by the "Collect training artifacts" step.
# Runs even when an earlier step failed; an empty directory only warns.
# NOTE(review): upload-artifact is pinned to v3 — presumably because this
# runs on RIA Hub rather than github.com; confirm before bumping the version.
- name: ⬆️ Upload training artifacts
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: training-artifacts
|
||||
path: ${{ github.workspace }}/.riahub/artifacts/training
|
||||
if-no-files-found: warn
|
||||
|
||||
# committed at 2026-05-27T02:54:53.931581+00:00
|
||||
Loading…
Reference in New Issue
Block a user