From a9c9d72bb1093a7aa5cab7f283dd6a39dfedaee9 Mon Sep 17 00:00:00 2001 From: Roman Pope Date: Thu, 28 May 2026 07:17:52 -0400 Subject: [PATCH] Canary: trigger fresh WavesFM training (proven-green path) --- .riahub/workflows/train.yaml | 387 +++++++++++++++++++++++++---------- 1 file changed, 281 insertions(+), 106 deletions(-) diff --git a/.riahub/workflows/train.yaml b/.riahub/workflows/train.yaml index 3ac9ee1..925d3dd 100644 --- a/.riahub/workflows/train.yaml +++ b/.riahub/workflows/train.yaml @@ -1,4 +1,4 @@ -name: QMB Training +name: WavesFM Fine-Tuning on: push: branches: [ "main" ] @@ -14,22 +14,28 @@ permissions: actions: read jobs: - QMB-Training: - runs-on: "whitehorse-p40-qmb" + WavesFM-Training: + runs-on: "ubuntu-latest" env: - RIAHUB_BASE_URL: ${{ vars.RIAHUB_BASE_URL || secrets.RIAHUB_BASE_URL || '' }} - QMB_OUTPUT_ROOT: "/opt/qmb/outputs" - QMB_TASK_REPO_ROOT: "/opt/qmb/task_repos" + WAVESFM_TASK: "rml" + WAVESFM_EPOCHS: "1" + WAVESFM_BATCH_SIZE: "8" + WAVESFM_OUTPUT_DIR: "/opt/wavesfm/output" + # Single source of truth for the cloned WavesFM repo location. + # Referenced as ${{ env.WAVESFM_REPO_DIR }} in steps. To relocate + # (e.g. /home/runner/wavesfm), change ONLY this value — every + # downstream step uses the env var, no hard-coded paths. + WAVESFM_REPO_DIR: "/opt/wavesfm/repo" + WAVESFM_ADAPTED_DATA: "/opt/wavesfm/adapted_data.h5" + # Override model-download base URL to internal LAN IP + # (external riahub.ai LFS endpoint is unreachable from runners). 
+ RIAHUB_BASE_URL: "http://192.168.0.170:3000" steps: - name: Display basic runner info run: | echo "Runner OS: ${{ runner.os }}" echo "Runner Architecture: ${{ runner.arch }}" - - name: Print CPU information - run: | - lscpu - - name: Print GPU information run: | if command -v nvidia-smi &> /dev/null; then @@ -39,9 +45,9 @@ jobs: fi - - name: Checkout Training Dataset + - name: "Download Model (qoherent/wavesfm-base/wavesfm-v1p0.pth)" shell: bash - timeout-minutes: 10 + timeout-minutes: 4 env: RIAHUB_USER: ${{ secrets.QMBDEMO_USER }} RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }} @@ -69,8 +75,16 @@ jobs: AUTH_HEADER="" if [[ -n "${RIAHUB_USER:-}" && -n "${RIAHUB_TOKEN:-}" ]]; then - AUTH_HEADER=$(printf 'Authorization: basic %s' "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')") + AUTH_HEADER=$(printf 'Authorization: basic %s' \ + "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')") fi + # ``sudo env GIT_TERMINAL_PROMPT=0`` propagates the env var across + # sudo's default ``env_reset`` boundary; a bare ``sudo git`` would + # see an empty env on most distros' default sudoers, so the + # step-level ``env:`` block's GIT_TERMINAL_PROMPT=0 would NOT + # actually reach git child processes. Without it, git falls back + # to opening ``/dev/tty`` (the PTY allocated by act_runner) and + # prompting for credentials on a 401, hanging until timeout. git_auth() { if [[ -n "$AUTH_HEADER" ]]; then sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@" @@ -78,9 +92,14 @@ jobs: sudo env GIT_TERMINAL_PROMPT=0 git "$@" fi } - REPO_PATH='/qoherent/icc-28.git' - DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-28/main' - sudo mkdir -p "$(dirname "$DEST_ROOT")" + + REPO_PATH='/qoherent/wavesfm-base.git' + REL_PATH='wavesfm-v1p0.pth' + REF='48787da4d310e9f939d9a0abe92f2a6cb13fbca7' + DEST_PATH='/opt/qmb/riahub/model/wavesfm-v1p0.pth' + TMP_DIR=$(mktemp -d) + cleanup() { sudo rm -rf "$TMP_DIR"; } + trap cleanup EXIT if ! 
command -v git-lfs >/dev/null 2>&1; then sudo apt-get update -y sudo apt-get install -y git-lfs @@ -90,49 +109,80 @@ jobs: for base in "${BASE_CANDIDATES[@]}"; do base="${base%/}" REPO_URL="${base}${REPO_PATH}" - echo "Fetching dataset from $REPO_URL" - sudo rm -rf "$DEST_ROOT" - sudo mkdir -p "$DEST_ROOT" - sudo git -C "$DEST_ROOT" init || continue - sudo git -C "$DEST_ROOT" remote add origin "$REPO_URL" || continue - sudo git -C "$DEST_ROOT" sparse-checkout init --no-cone || continue - sudo git -C "$DEST_ROOT" sparse-checkout set --no-cone -- \ - 'datasets/icc28-train_v1.0.0.h5' || continue - if ! git_auth -C "$DEST_ROOT" fetch --depth=1 origin '1d9083f05d0538110f09e710865b078eba30964b'; then + echo "Fetching model from $REPO_URL" + sudo rm -rf "$TMP_DIR" + sudo mkdir -p "$TMP_DIR" + sudo git -C "$TMP_DIR" init || continue + sudo git -C "$TMP_DIR" remote add origin "$REPO_URL" || continue + # NOT running ``git lfs install --local`` on purpose: it would + # register the smudge/clean filter, which then fires during + # ``git checkout FETCH_HEAD`` and tries to download every LFS + # object via its OWN credential helper subprocess. That + # subprocess does NOT inherit ``-c http.extraheader=`` or env + # vars set by the parent via ``sudo``, so on an internal/ + # private repo it gets a 401 and hangs on /dev/tty + # waiting for a username — same failure class as the + # credential prompt documented above git_auth(). By skipping + # ``lfs install`` (and setting GIT_LFS_SKIP_SMUDGE=1 on checkout), + # pointer files land on disk verbatim; the explicit ``git lfs fetch --include=...`` + # below downloads the actual objects WITH the auth header, + # and ``git lfs checkout`` then materializes them. (Explicit + # LFS commands do NOT require ``lfs install``.) + sudo git -C "$TMP_DIR" sparse-checkout init --no-cone || continue + sudo git -C "$TMP_DIR" sparse-checkout set --no-cone -- "$REL_PATH" || continue + if ! git_auth -C "$TMP_DIR" fetch --depth=1 origin "$REF"; then continue fi - if !
sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then + # ``GIT_LFS_SKIP_SMUDGE=1`` disables the LFS smudge filter for + # this checkout. The smudge filter is installed SYSTEM-WIDE on + # the runner (``/etc/gitconfig`` filter.lfs.smudge) by the + # ``apt-get install git-lfs`` step above, so skipping + # ``git lfs install --local`` is NOT sufficient to keep it + # from firing. Without this env var, checkout invokes + # ``git-lfs filter-process`` which spawns its own + # ``git credential fill`` to authenticate LFS-object downloads, + # and that subprocess does NOT inherit our auth header — it + # hangs on /dev/tty waiting for a username on internal/ + # private repos. With smudge skipped, checkout writes LFS + # pointer files verbatim; the explicit ``git lfs fetch`` + # below materializes them with proper auth. + # ``GIT_TERMINAL_PROMPT=0`` is belt-and-suspenders for any + # other auth path that could open /dev/tty. + if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 \ + git -C "$TMP_DIR" -c advice.detachedHead=false checkout FETCH_HEAD; then continue fi - if ! git_auth -C "$DEST_ROOT" lfs fetch origin --include='datasets/icc28-train_v1.0.0.h5' --exclude=""; then - echo "LFS fetch failed for candidate $base, trying next" >&2 + if ! git_auth -C "$TMP_DIR" lfs fetch origin --include="$REL_PATH" --exclude=""; then continue fi - if ! sudo git -C "$DEST_ROOT" lfs checkout; then - echo "LFS checkout failed for candidate $base, trying next" >&2 + if ! sudo git -C "$TMP_DIR" lfs checkout; then continue fi - POINTER_FOUND=0 - _LFS_REL_PATH='datasets/icc28-train_v1.0.0.h5' - if [[ "$(sudo head -c 9 "$DEST_ROOT/$_LFS_REL_PATH" 2>/dev/null || true)" == "version h" ]]; then - echo "LFS materialization left a pointer at $DEST_ROOT/$_LFS_REL_PATH for candidate $base, trying next" >&2 - POINTER_FOUND=1 + sudo mkdir -p "$(dirname "$DEST_PATH")" + if ! 
sudo cp -f "$TMP_DIR/$REL_PATH" "$DEST_PATH"; then + continue fi - if [[ "$POINTER_FOUND" -ne 0 ]]; then + # Reject LFS pointer files (~120-byte ASCII starting with + # ``version https://git-lfs.github.com/spec/v1``). Shipping a + # pointer to the training job would crash torch.load far from + # the root cause. + if [ "$(sudo head -c 9 "$DEST_PATH" 2>/dev/null || true)" = 'version h' ]; then + echo "ERROR: $DEST_PATH is an LFS pointer, not actual content" >&2 + echo " (LFS materialization failed for $REPO_URL)" >&2 continue fi MATERIALIZED=1 break done if [[ "$MATERIALIZED" -ne 1 ]]; then - echo "Failed to materialize dataset using base URL candidates derived from: $BASE_URL_SOURCE" >&2 + echo "Failed to materialize model file using base URL candidates derived from: $BASE_URL_SOURCE" >&2 if [[ -z "$AUTH_HEADER" ]]; then - echo " (no credentials configured: set QMBDEMO_USER and QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2 + echo " (set QMBDEMO_USER+QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2 fi exit 1 fi - - name: Checkout Validation Dataset + - name: "Checkout Dataset (qoherent/icc-28/datasets/icc28-test_v1.0.0.h5)" shell: bash timeout-minutes: 10 env: @@ -162,8 +212,16 @@ jobs: AUTH_HEADER="" if [[ -n "${RIAHUB_USER:-}" && -n "${RIAHUB_TOKEN:-}" ]]; then - AUTH_HEADER=$(printf 'Authorization: basic %s' "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')") + AUTH_HEADER=$(printf 'Authorization: basic %s' \ + "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')") fi + # ``sudo env GIT_TERMINAL_PROMPT=0`` propagates the env var across + # sudo's default ``env_reset`` boundary; a bare ``sudo git`` would + # see an empty env on most distros' default sudoers, so the + # step-level ``env:`` block's GIT_TERMINAL_PROMPT=0 would NOT + # actually reach git child processes. 
Without it, git falls back + # to opening ``/dev/tty`` (the PTY allocated by act_runner) and + # prompting for credentials on a 401, hanging until timeout. git_auth() { if [[ -n "$AUTH_HEADER" ]]; then sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@" @@ -171,8 +229,9 @@ jobs: sudo env GIT_TERMINAL_PROMPT=0 git "$@" fi } + REPO_PATH='/qoherent/icc-28.git' - DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-28/main' + DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-28/b307499b8c7150e10537d8f2f17fe108f0bc73db' sudo mkdir -p "$(dirname "$DEST_ROOT")" if ! command -v git-lfs >/dev/null 2>&1; then sudo apt-get update -y @@ -188,13 +247,28 @@ jobs: sudo mkdir -p "$DEST_ROOT" sudo git -C "$DEST_ROOT" init || continue sudo git -C "$DEST_ROOT" remote add origin "$REPO_URL" || continue + # See the "Download Model" step above for the rationale on skipping + # ``git lfs install --local`` — short version: the smudge + # filter it would register tries its own credential lookup + # during ``git checkout FETCH_HEAD`` and hangs forever on + # /dev/tty when the repo is internal/private. We rely on + # the explicit ``git lfs fetch`` (with auth) + + # ``git lfs checkout`` (local) pair below instead. sudo git -C "$DEST_ROOT" sparse-checkout init --no-cone || continue sudo git -C "$DEST_ROOT" sparse-checkout set --no-cone -- \ 'datasets/icc28-test_v1.0.0.h5' || continue if ! git_auth -C "$DEST_ROOT" fetch --depth=1 origin 'b307499b8c7150e10537d8f2f17fe108f0bc73db'; then continue fi - if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then + # See the "Download Model" step above for the rationale on + # ``GIT_LFS_SKIP_SMUDGE=1`` — short version: the runner has + # the LFS smudge filter installed system-wide + # (``/etc/gitconfig``), so checkout fires it and the filter's + # credential helper hangs on /dev/tty for internal repos.
+ # Skipping smudge here lets the explicit ``git lfs fetch`` + # below handle materialization with proper auth. + if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 \ + git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then continue fi if ! git_auth -C "$DEST_ROOT" lfs fetch origin --include='datasets/icc28-test_v1.0.0.h5' --exclude=""; then @@ -220,42 +294,94 @@ jobs: if [[ "$MATERIALIZED" -ne 1 ]]; then echo "Failed to materialize dataset using base URL candidates derived from: $BASE_URL_SOURCE" >&2 if [[ -z "$AUTH_HEADER" ]]; then - echo " (no credentials configured: set QMBDEMO_USER and QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2 + echo " (set QMBDEMO_USER+QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2 fi exit 1 fi - - name: Checkout configs + - name: Clone WavesFM + shell: bash + run: | + set -euo pipefail + mkdir -p "$(dirname "$WAVESFM_REPO_DIR")" + rm -rf "$WAVESFM_REPO_DIR" + git init "$WAVESFM_REPO_DIR" + cd "$WAVESFM_REPO_DIR" + git remote add origin https://github.com/AhmedTarek62/wavesfm.git + git fetch --depth 1 origin 483831732e32190b7018181b4f2cef93d755cef9 + git checkout FETCH_HEAD + # CPU-runner compatibility patch. WavesFM upstream + # main_finetune.py hardcodes a CUDA device in two places: + # line ~87: `add_argument("--device", default="cuda")` + # line ~267: `torch.amp.GradScaler(device="cuda")` + # On a runner without an NVIDIA GPU (e.g. dev machines with + # only integrated Intel UHD graphics), the CPU-only torch + # wheel we install in "Install dependencies" raises + # `AssertionError: Torch not compiled with CUDA enabled` at + # the first `model.to(device)` call. Patch the GradScaler + # literal to use the argparse-provided device so passing + # `--device cpu` from the Train step actually takes effect. + # No-op if the line already uses args.device (idempotent). 
+ if [[ -f main_finetune.py ]]; then + sed -i 's|torch\.amp\.GradScaler(device="cuda")|torch.cuda.amp.GradScaler(enabled=(args.device != "cpu"))|' main_finetune.py + echo "Patched main_finetune.py GradScaler for CPU/GPU device parity." + fi + + - name: Checkout adapter and model config uses: actions/checkout@v5 with: - sparse-checkout: .riahub/train_configs - - - name: Copy configs into qmb folder - run: | - mkdir -p /opt/qmb/configs/ - sudo cp -r ${{ github.workspace }}/.riahub/train_configs/* /opt/qmb/configs/ - - - - name: List QMB project contents - run: | - ls -lha /opt/qmb - ls -lh /opt/qmb/wheel - - - name: List Downloaded RIA Hub contents - run: | - ls -lh /opt/qmb/riahub || true - ls -lh /opt/qmb/riahub/model || true - ls -lh /opt/qmb/riahub/dataset || true + sparse-checkout: | + scripts/adapt_dataset.py + .riahub/train_configs/model - name: Setup Python uses: actions/setup-python@v6 with: - python-version: "3.13" + python-version: "3.12" - - name: Install Python dependencies + - name: Install dependencies run: | set -euo pipefail - uv pip install --system --index-url https://pypi.org/simple --upgrade /opt/qmb/wheel/*.whl + # Use ``python -m pip`` rather than ``pip`` directly: actions/setup-python + # always puts ``python`` on PATH but the ``pip`` shim isn't guaranteed + # on every distro / venv layout. ``python -m pip`` always works + # against the active interpreter. + PIP="python -m pip" + # The pinned wavesfm SHA controls repo layout. 
Three common shapes: + # - Pure script-only repo: requirements.txt only -> install + # requirements (current pinned SHA shape) + # - Python package: setup.py / setup.cfg, OR pyproject.toml + # with a real [build-system] section -> editable install + # - Poetry-only / Hatch-only / tool-only pyproject.toml WITHOUT + # [build-system] -> `pip install -e .` would error; we install + # requirements.txt if available and skip the editable install + # Order matters: install requirements.txt first if present (it's + # the most explicit dep list); editable install layered on top + # only when the repo is genuinely installable (has setup.py / + # setup.cfg, or pyproject.toml with [build-system]). + cd "$WAVESFM_REPO_DIR" + # Pre-install CPU torch + numpy<2 to make requirements.txt see them already-satisfied (saves ~600MB). + # torch 2.2.2 has the NumPy 1.x ABI and crashes if numpy 2.x is installed. + $PIP install --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple "numpy<2" "torch==2.2.2" torchvision + $PIP install --upgrade --force-reinstall "numpy<2" + INSTALLED_SOMETHING=0 + if [[ -f requirements.txt ]]; then + $PIP install -r requirements.txt + INSTALLED_SOMETHING=1 + fi + if [[ -f setup.py ]] || [[ -f setup.cfg ]] || \ + ( [[ -f pyproject.toml ]] && grep -q "^\[build-system\]" pyproject.toml ); then + $PIP install -e . + INSTALLED_SOMETHING=1 + fi + if [[ "$INSTALLED_SOMETHING" -eq 0 ]]; then + echo "ERROR: $WAVESFM_REPO_DIR has no installable Python metadata" >&2 + echo " expected: requirements.txt, setup.py, setup.cfg, or pyproject.toml with [build-system]" >&2 + exit 1 + fi + $PIP install h5py scipy + # Force numpy<2 again (requirements.txt may have bumped it via transitive deps). 
+ $PIP install --upgrade --force-reinstall "numpy<2" TORCH_INDEX_URL="https://download.pytorch.org/whl/cpu" TORCH_REASON="no NVIDIA GPU detected" if command -v nvidia-smi &> /dev/null; then @@ -275,52 +401,101 @@ jobs: fi fi echo "Installing PyTorch from ${TORCH_INDEX_URL} (${TORCH_REASON})." - uv pip install --system --index-url "$TORCH_INDEX_URL" --upgrade --force-reinstall torch torchvision - uv pip install --system --index-url https://pypi.org/simple --upgrade "onnxscript>=0.7.0" "onnx-ir>=0.2.1" onnx onnxruntime timm + # torch pre-installed at step head; force-reinstall disabled to avoid 755MB redownload + echo "Skipping torch force-reinstall ($TORCH_INDEX_URL)" - - name: Run Training Script - run: | - cd /opt/qmb - export PYTHONPATH="$QMB_TASK_REPO_ROOT:${PYTHONPATH:-}" - #source .venv/bin/activate - qmb train --config /opt/qmb/configs/train.yaml - - - name: Collect training artifacts - if: always() + - name: Find and adapt dataset + shell: bash run: | set -euo pipefail - ARTIFACT_DIR="${{ github.workspace }}/.riahub/artifacts/training" - rm -rf "$ARTIFACT_DIR" - mkdir -p "$ARTIFACT_DIR" - if [[ -d "$QMB_OUTPUT_ROOT" ]]; then - while IFS= read -r -d '' file; do - rel="${file#${QMB_OUTPUT_ROOT}/}" - if [[ "$rel" == "$file" ]]; then - rel="$(basename "$file")" - fi - mkdir -p "$ARTIFACT_DIR/$(dirname "$rel")" - cp "$file" "$ARTIFACT_DIR/$rel" - done < <( - find "$QMB_OUTPUT_ROOT" -type f \( \ - -path "*/checkpoints/best.pt" -o \ - -path "*/checkpoints/best.ckpt" -o \ - -name "*.onnx" -o \ - -path "*/evaluation/*/confusion_matrix.png" -o \ - -path "*/evaluation/*/parameter_sweeps/*.png" \ - \) -print0 - ) - else - echo "QMB output root not found: $QMB_OUTPUT_ROOT" + # Find .h5 files but EXCLUDE LFS pointer files. 
The dataset + # checkout's sparse-checkout pattern is supposed to limit the + # working tree to the target file, but sparse-checkout in + # ``init+fetch+checkout`` mode (vs the previous + # ``clone --no-checkout``) doesn't always activate cleanly, + # leaving OTHER LFS-tracked files (e.g. sibling datasets in + # the same repo) as unmaterialized 120-180 byte pointer files + # in the working tree. The training adapter must skip those — + # otherwise ``find ... -name '*.h5'`` counts pointer files as + # real datasets and the "expected exactly one" check trips. + # The same ``head -c 9`` LFS-pointer test used elsewhere works + # here: real HDF5 files start with the HDF5 magic byte + # 0x89, never with the ASCII string "version h". + H5_CANDIDATES=() + while IFS= read -r f; do + if [[ "$(head -c 9 "$f" 2>/dev/null || true)" != "version h" ]]; then + H5_CANDIDATES+=("$f") + fi + done < <(find /opt/qmb/riahub/dataset -name '*.h5' -type f) + if [[ ${#H5_CANDIDATES[@]} -eq 0 ]]; then + echo "ERROR: No materialized .h5 dataset file found in /opt/qmb/riahub/dataset/" >&2 + echo " (any .h5 files present are LFS pointers — LFS materialization may have failed)" >&2 + exit 1 fi - echo "Collected training artifacts:" - find "$ARTIFACT_DIR" -type f -print | sort || true + if [[ ${#H5_CANDIDATES[@]} -gt 1 ]]; then + echo "ERROR: Multiple materialized .h5 files found (${#H5_CANDIDATES[@]}); expected exactly one:" >&2 + printf ' %s\n' "${H5_CANDIDATES[@]}" >&2 + exit 1 + fi + INPUT_H5="${H5_CANDIDATES[0]}" + echo "Adapting: $INPUT_H5" + python ${{ github.workspace }}/scripts/adapt_dataset.py \ + "$INPUT_H5" "$WAVESFM_ADAPTED_DATA" - - name: ⬆️ Upload training artifacts - if: always() + - name: Verify adapted dataset + run: | + python -c " + import h5py, json, os + f = h5py.File(os.environ['WAVESFM_ADAPTED_DATA'], 'r') + print('sample:', f['sample'].shape, f['sample'].dtype) + print('label:', f['label'].shape, f['label'].dtype) + labels = json.loads(f.attrs['labels']) + 
+ print('classes:', len(labels), labels[:5]) + f.close() + " + + - name: Train WavesFM (Linear Probe) + shell: bash + env: + PYTHONUNBUFFERED: "1" + run: | + set -euo pipefail + # Detect runtime device by asking the installed torch build itself. + # The Install dependencies step always installs the CPU wheel and + # skips the CUDA reinstall, so probing ``nvidia-smi`` would select + # "cuda" on a GPU host and hard-fail with + # ``Torch not compiled with CUDA enabled`` at model.to(device). + # Paired with the Clone WavesFM step's GradScaler patch above. + DEVICE="cpu" + if python -c 'import sys, torch; sys.exit(0 if torch.cuda.is_available() else 1)' >/dev/null 2>&1; then + DEVICE="cuda" + fi + echo "Training device: $DEVICE" + cd "$WAVESFM_REPO_DIR" + python -u main_finetune.py \ + --task "${{ env.WAVESFM_TASK }}" \ + --device "$DEVICE" \ + --train-data "$WAVESFM_ADAPTED_DATA" \ + --finetune /opt/qmb/riahub/model/wavesfm-v1p0.pth \ + --model vit_multi_small \ + --use-conditional-ln \ + --class-weights \ + --warmup-epochs 5 \ + --val-split 0.2 \ + --epochs "${{ env.WAVESFM_EPOCHS }}" \ + --batch-size "${{ env.WAVESFM_BATCH_SIZE }}" \ + --blr 1e-3 \ + --freeze-encoder \ + --output-dir "${{ env.WAVESFM_OUTPUT_DIR }}" + + # upload-artifact@v3: matches codebase convention (see TRAIN_TEMPLATE). + # Upgrade to v4 is tracked as deferred/P2 work. + - name: Upload training artifacts uses: actions/upload-artifact@v3 with: - name: training-artifacts - path: ${{ github.workspace }}/.riahub/artifacts/training + name: wavesfm-training-artifacts + path: | + ${{ env.WAVESFM_OUTPUT_DIR }}/best.pth + ${{ env.WAVESFM_OUTPUT_DIR }}/log.txt if-no-files-found: warn - -# committed at 2026-05-28T09:37:23.399702+00:00 +# committed at 2026-05-28T11:17:49.563043+00:00