Trigger WavesFM Linear Probe training (canary)

Dataset: icc_canary_2026_05_28-v1.0.0.h5 (3 recs, 144 slices) Model: qoherent/wavesfm-base/wavesfm-v1p0.pth @ 48787da4 Task: rml, epochs=3, batch_size=16, device=cpu
2026-05-28 01:42:59 -04:00 · 2026-05-28 01:42:59 -04:00 · b8fd7d73d9
commit b8fd7d73d9
parent 9f87fa9fe2
1 changed files with 374 additions and 0 deletions
--- a/.riahub/workflows/train.yaml
+++ b/.riahub/workflows/train.yaml
@ -0,0 +1,374 @@
 # Workflow identity and triggers. Both triggers are path-filtered to this
 # workflow file, so a run starts only when the file itself changes on a
 # push to main or in a pull request targeting main.
 name: WavesFM Fine-Tuning
 on:
  push:
    branches:
      - "main"
    paths:
      - ".riahub/workflows/train.yaml"
  pull_request:
    branches:
      - "main"
    paths:
      - ".riahub/workflows/train.yaml"
 # Token scopes granted to the job: read-only repository contents and
 # read-only access to actions data.
 permissions:
  contents: read
  actions: read
 jobs:
  # Single job: fetch the model, clone WavesFM, adapt the dataset, train,
  # and upload the resulting artifacts.
  WavesFM-Training:
    runs-on: "ubuntu-latest"
    # Job-level environment, visible to every step both as a shell variable
    # ($NAME) and through the ${{ env.NAME }} expression context. Numeric
    # values are quoted so they stay strings.
    env:
      WAVESFM_TASK: "rml"
      WAVESFM_EPOCHS: "3"
      WAVESFM_BATCH_SIZE: "16"
      WAVESFM_OUTPUT_DIR: "/opt/wavesfm/output"
      # Location of the cloned WavesFM repo. Steps read it as the shell
      # variable $WAVESFM_REPO_DIR rather than hard-coding the path.
      # NOTE(review): WAVESFM_OUTPUT_DIR and WAVESFM_ADAPTED_DATA are set
      # independently but share the /opt/wavesfm prefix, so relocating
      # (e.g. to /home/runner/wavesfm) means updating all three values,
      # not only this one.
      WAVESFM_REPO_DIR: "/opt/wavesfm/repo"
      WAVESFM_ADAPTED_DATA: "/opt/wavesfm/adapted_data.h5"
    steps:
      - name: Display basic runner info
        # Logs the runner's OS and CPU architecture for later debugging.
        # The ${{ runner.* }} expressions are expanded by the workflow
        # engine before the script is handed to the shell.
        run: |
          printf 'Runner OS: %s\n' "${{ runner.os }}"
          printf 'Runner Architecture: %s\n' "${{ runner.arch }}"
      - name: Print GPU information
        # Pin bash explicitly so the script never runs under a plain `sh`
        # fallback, matching the other scripted steps in this job.
        shell: bash
        run: |
          # Informational only: nothing downstream consumes this output, so
          # a present-but-failing nvidia-smi is reported instead of failing
          # the job. The redirect is spelled the POSIX way rather than the
          # bash-only `&>`, which a POSIX shell would parse as "run in the
          # background" and make this condition always true.
          if command -v nvidia-smi >/dev/null 2>&1; then
            nvidia-smi || echo "nvidia-smi is installed but failed to query a GPU."
          else
            echo "No NVIDIA GPU available."
          fi
      - name: "Download Model (qoherent/wavesfm-base/wavesfm-v1p0.pth)"
        shell: bash
        timeout-minutes: 4
        env:
          RIAHUB_USER: ${{ secrets.QMBDEMO_USER }}
          RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }}
          GIT_TERMINAL_PROMPT: "0"
        run: |
          set -euo pipefail
          # Materializes one LFS-tracked model file, pinned to an exact
          # commit, at $DEST_PATH. Every base-URL candidate is tried in
          # order; the step fails only if none yields real (non-pointer)
          # content.
          DEFAULT_BASE_URL='https://riahub.ai'
          BASE_URL_SOURCE=${RIAHUB_BASE_URL:-$DEFAULT_BASE_URL}
          BASE_URL_SOURCE="${BASE_URL_SOURCE%/}"
          # Prints the candidate base URLs, one per line: the configured URL
          # first, then the same host with the other scheme. A scheme-less
          # value yields https before http.
          build_base_candidates() {
            local raw="$1"
            if [[ "$raw" =~ ^https?:// ]]; then
              echo "$raw"
              if [[ "$raw" == http://* ]]; then
                echo "https://${raw#http://}"
              elif [[ "$raw" == https://* ]]; then
                echo "http://${raw#https://}"
              fi
              return
            fi
            echo "https://$raw"
            echo "http://$raw"
          }
          # Basic auth header built from the step's secrets; left empty when
          # either secret is missing, in which case all requests are
          # anonymous.
          AUTH_HEADER=""
          if [[ -n "${RIAHUB_USER:-}" && -n "${RIAHUB_TOKEN:-}" ]]; then
            AUTH_HEADER=$(printf 'Authorization: basic %s' \
              "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')")
          fi
          # Basic credentials are only base64-encoded, so they must not be
          # replayed over cleartext http:// when the operator explicitly
          # configured an https:// base URL. SEND_AUTH is recomputed for
          # every candidate in the loop below; 0 makes git_auth run
          # anonymously. http:// and scheme-less configurations keep their
          # previous behavior.
          SEND_AUTH=1
          # ``sudo env GIT_TERMINAL_PROMPT=0`` propagates the env var across
          # sudo's default ``env_reset`` boundary; a bare ``sudo git`` would
          # see an empty env on most distros' default sudoers, so the
          # step-level ``env:`` block's GIT_TERMINAL_PROMPT=0 would NOT
          # actually reach git child processes. Without it, git falls back
          # to opening ``/dev/tty`` (the PTY allocated by act_runner) and
          # prompting for credentials on a 401, hanging until timeout.
          git_auth() {
            if [[ -n "$AUTH_HEADER" && "$SEND_AUTH" -eq 1 ]]; then
              sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@"
            else
              sudo env GIT_TERMINAL_PROMPT=0 git "$@"
            fi
          }
          REPO_PATH='/qoherent/wavesfm-base.git'
          REL_PATH='wavesfm-v1p0.pth'
          REF='48787da4d310e9f939d9a0abe92f2a6cb13fbca7'
          DEST_PATH='/opt/qmb/riahub/model/wavesfm-v1p0.pth'
          TMP_DIR=$(mktemp -d)
          # The scratch repo is recreated with sudo inside the loop, so it
          # has to be removed with sudo as well.
          cleanup() { sudo rm -rf "$TMP_DIR"; }
          trap cleanup EXIT
          if ! command -v git-lfs >/dev/null 2>&1; then
            sudo apt-get update -y
            sudo apt-get install -y git-lfs
          fi
          mapfile -t BASE_CANDIDATES < <(build_base_candidates "$BASE_URL_SOURCE")
          MATERIALIZED=0
          for base in "${BASE_CANDIDATES[@]}"; do
            base="${base%/}"
            REPO_URL="${base}${REPO_PATH}"
            SEND_AUTH=1
            if [[ -n "$AUTH_HEADER" && "$BASE_URL_SOURCE" == https://* && "$base" == http://* ]]; then
              SEND_AUTH=0
              echo "WARNING: withholding credentials for cleartext fallback $base (configured base URL is https)" >&2
            fi
            echo "Fetching model from $REPO_URL"
            # Start every attempt from an empty scratch repository.
            sudo rm -rf "$TMP_DIR"
            sudo mkdir -p "$TMP_DIR"
            sudo git -C "$TMP_DIR" init || continue
            sudo git -C "$TMP_DIR" remote add origin "$REPO_URL" || continue
            # ``git lfs install --local`` is deliberately NOT run: it would
            # register the smudge/clean filter for this repo, and the
            # filter would try to download LFS objects during checkout
            # through its own credential lookup instead of the auth header
            # passed here. Skipping it is not sufficient by itself (see
            # the GIT_LFS_SKIP_SMUDGE note at the checkout below). The
            # explicit ``git lfs fetch`` / ``git lfs checkout`` commands
            # used later do not require ``lfs install``.
            sudo git -C "$TMP_DIR" sparse-checkout init --no-cone || continue
            sudo git -C "$TMP_DIR" sparse-checkout set --no-cone -- "$REL_PATH" || continue
            if ! git_auth -C "$TMP_DIR" fetch --depth=1 origin "$REF"; then
              continue
            fi
            # ``GIT_LFS_SKIP_SMUDGE=1`` disables the LFS smudge filter for
            # this checkout. The filter can be configured SYSTEM-WIDE on
            # the runner (``/etc/gitconfig`` filter.lfs.smudge) by the
            # git-lfs package, in which case it fires even though
            # ``git lfs install --local`` was skipped. Without this env
            # var, checkout invokes ``git-lfs filter-process``, which
            # spawns its own ``git credential fill`` to authenticate
            # LFS-object downloads and hangs on /dev/tty waiting for a
            # username on internal/private repos. With smudge skipped,
            # checkout writes LFS pointer files verbatim; the explicit
            # ``git lfs fetch`` below downloads the object with the auth
            # header and ``git lfs checkout`` materializes it.
            # ``GIT_TERMINAL_PROMPT=0`` is belt-and-suspenders for any
            # other auth path that could open /dev/tty.
            if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 \
                git -C "$TMP_DIR" -c advice.detachedHead=false checkout FETCH_HEAD; then
              continue
            fi
            if ! git_auth -C "$TMP_DIR" lfs fetch origin --include="$REL_PATH" --exclude=""; then
              continue
            fi
            if ! sudo git -C "$TMP_DIR" lfs checkout; then
              continue
            fi
            sudo mkdir -p "$(dirname "$DEST_PATH")"
            if ! sudo cp -f "$TMP_DIR/$REL_PATH" "$DEST_PATH"; then
              continue
            fi
            # Reject LFS pointer files (~120-byte ASCII starting with
            # ``version https://git-lfs.github.com/spec/v1``). Shipping a
            # pointer to the training job would crash torch.load far from
            # the root cause.
            if [ "$(sudo head -c 9 "$DEST_PATH" 2>/dev/null || true)" = 'version h' ]; then
              echo "ERROR: $DEST_PATH is an LFS pointer, not actual content" >&2
              echo "  (LFS materialization failed for $REPO_URL)" >&2
              continue
            fi
            MATERIALIZED=1
            break
          done
          if [[ "$MATERIALIZED" -ne 1 ]]; then
            echo "Failed to materialize model file using base URL candidates derived from: $BASE_URL_SOURCE" >&2
            if [[ -z "$AUTH_HEADER" ]]; then
              echo "  (set QMBDEMO_USER+QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2
            fi
            exit 1
          fi
      - name: Clone WavesFM
        shell: bash
        run: |
          set -euo pipefail
          # Fresh shallow checkout of WavesFM pinned to one upstream commit,
          # so every run trains with the same code.
          mkdir -p "$(dirname "$WAVESFM_REPO_DIR")"
          rm -rf "$WAVESFM_REPO_DIR"
          git init "$WAVESFM_REPO_DIR"
          cd "$WAVESFM_REPO_DIR"
          git remote add origin https://github.com/AhmedTarek62/wavesfm.git
          git fetch --depth 1 origin 483831732e32190b7018181b4f2cef93d755cef9
          git checkout FETCH_HEAD
          # CPU-runner compatibility patch. WavesFM upstream
          # main_finetune.py hardcodes a CUDA device in two places:
          #   line ~87: `add_argument("--device", default="cuda")`
          #   line ~267: `torch.amp.GradScaler(device="cuda")`
          # The argparse default is overridden by the Train step passing
          # `--device`; the GradScaler literal has to be rewritten so it
          # follows the argparse-provided device and is disabled on CPU.
          # Each outcome is reported separately so the log never claims a
          # patch that did not happen (for example after an upstream
          # refactor or a re-run on an already patched tree).
          CUDA_SCALER='torch.amp.GradScaler(device="cuda")'
          if [[ ! -f main_finetune.py ]]; then
            echo "WARNING: main_finetune.py not found; GradScaler patch skipped." >&2
          elif grep -qF "$CUDA_SCALER" main_finetune.py; then
            sed -i 's|torch\.amp\.GradScaler(device="cuda")|torch.amp.GradScaler(device=args.device, enabled=(args.device != "cpu"))|' main_finetune.py
            echo "Patched main_finetune.py GradScaler for CPU/GPU device parity."
          else
            echo "main_finetune.py has no hard-coded CUDA GradScaler; nothing to patch."
          fi
      # Sparse checkout of this repository into the workspace, limited to
      # the two paths listed below. scripts/adapt_dataset.py is executed
      # later by the "Find and adapt dataset" step.
      - name: Checkout adapter and model config
        uses: actions/checkout@v5
        with:
          # One path per line.
          # NOTE(review): no sparse-checkout-cone-mode is set, so the
          # action's default applies; confirm that default materializes the
          # single-file entry as intended. No step in this job visibly
          # reads .riahub/train_configs/model; verify it is still needed.
          sparse-checkout: |
            scripts/adapt_dataset.py
            .riahub/train_configs/model
      # Provides the `python` interpreter used by every later step
      # (dependency install, dataset adaptation, verification, training).
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          # Quoted so YAML keeps the version a string instead of a float.
          python-version: "3.12"
      - name: Install dependencies
        # The script relies on bash-only features ([[ ]], pipefail, &>), so
        # the shell is pinned like in the other scripted steps instead of
        # depending on the runner's default.
        shell: bash
        run: |
          set -euo pipefail
          # Use ``python -m pip`` rather than ``pip`` directly: actions/setup-python
          # always puts ``python`` on PATH but the ``pip`` shim isn't guaranteed
          # on every distro / venv layout. Kept as an array so the command
          # expands to exact words without relying on word splitting.
          PIP=(python -m pip)
          # The pinned wavesfm SHA controls repo layout. Three common shapes:
          #   - Pure script-only repo: requirements.txt only -> install
          #     requirements (current pinned SHA shape)
          #   - Python package: setup.py / setup.cfg, OR pyproject.toml
          #     with a real [build-system] section -> editable install
          #   - Poetry-only / Hatch-only / tool-only pyproject.toml WITHOUT
          #     [build-system] -> `pip install -e .` would error; we install
          #     requirements.txt if available and skip the editable install
          # Order matters: install requirements.txt first if present (it's
          # the most explicit dep list); editable install layered on top
          # only when the repo is genuinely installable (has setup.py /
          # setup.cfg, or pyproject.toml with [build-system]).
          cd "$WAVESFM_REPO_DIR"
          INSTALLED_SOMETHING=0
          if [[ -f requirements.txt ]]; then
            "${PIP[@]}" install -r requirements.txt
            INSTALLED_SOMETHING=1
          fi
          if [[ -f setup.py ]] || [[ -f setup.cfg ]] || \
             ( [[ -f pyproject.toml ]] && grep -q "^\[build-system\]" pyproject.toml ); then
            "${PIP[@]}" install -e .
            INSTALLED_SOMETHING=1
          fi
          if [[ "$INSTALLED_SOMETHING" -eq 0 ]]; then
            echo "ERROR: $WAVESFM_REPO_DIR has no installable Python metadata" >&2
            echo "  expected: requirements.txt, setup.py, setup.cfg, or pyproject.toml with [build-system]" >&2
            exit 1
          fi
          # Extra packages installed on top of the WavesFM dependencies;
          # h5py is imported by the "Verify adapted dataset" step.
          "${PIP[@]}" install h5py scipy
          # Pick the PyTorch wheel index. Default is the CPU build; a CUDA
          # build is selected only when nvidia-smi reports a compute
          # capability (>= 7.5 -> cu130, otherwise cu126). If nvidia-smi
          # exists but reports nothing usable, the CPU default is kept.
          TORCH_INDEX_URL="https://download.pytorch.org/whl/cpu"
          TORCH_REASON="no NVIDIA GPU detected"
          if command -v nvidia-smi &> /dev/null; then
            CAP_LINES="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null || true)"
            if [[ -z "$CAP_LINES" ]]; then
              # Fallback: scrape the verbose report when the query flag
              # returned nothing.
              CAP_LINES="$(nvidia-smi -q 2>/dev/null | awk -F: '/Compute Capability/ {print $2}')"
            fi
            # Highest capability across all reported GPUs; empty when no
            # line contains a number.
            CAP_MAX="$(echo "$CAP_LINES" | awk '{gsub(/[^0-9.]/,""); if ($0=="") next; if ($0+0>max) max=$0+0} END {if (max>0) print max}')"
            if [[ -n "$CAP_MAX" ]]; then
              if awk -v cap="$CAP_MAX" 'BEGIN{exit !(cap>=7.5)}'; then
                TORCH_INDEX_URL="https://download.pytorch.org/whl/cu130"
                TORCH_REASON="compute capability ${CAP_MAX} >= 7.5"
              else
                TORCH_INDEX_URL="https://download.pytorch.org/whl/cu126"
                TORCH_REASON="compute capability ${CAP_MAX} < 7.5"
              fi
            fi
          fi
          echo "Installing PyTorch from ${TORCH_INDEX_URL} (${TORCH_REASON})."
          # Force-reinstall so the selected build replaces any torch that
          # requirements.txt may already have pulled in.
          "${PIP[@]}" install --index-url "$TORCH_INDEX_URL" --upgrade --force-reinstall torch torchvision
      - name: Find and adapt dataset
        shell: bash
        run: |
          set -euo pipefail
          # Locates the single materialized .h5 dataset under $DATASET_DIR
          # and converts it with scripts/adapt_dataset.py into
          # $WAVESFM_ADAPTED_DATA. Fails when zero or several real datasets
          # are present.
          DATASET_DIR='/opt/qmb/riahub/dataset'
          # ``find`` runs inside a process substitution below, so its exit
          # status is not seen by ``set -e``; check the directory up front
          # to report a missing directory as exactly that.
          if [[ ! -d "$DATASET_DIR" ]]; then
            echo "ERROR: dataset directory $DATASET_DIR does not exist" >&2
            echo "  (nothing has populated it on this runner)" >&2
            exit 1
          fi
          # Collect .h5 files but EXCLUDE LFS pointer files: sibling
          # LFS-tracked datasets can be left in the working tree as small
          # unmaterialized pointer files, and counting them would trip the
          # "expected exactly one" check. Real HDF5 files start with the
          # HDF5 magic byte 0x89, never with the ASCII string "version h".
          # NUL bytes are stripped so bash does not warn about them inside
          # the command substitution when the file is binary.
          H5_CANDIDATES=()
          POINTER_COUNT=0
          while IFS= read -r f; do
            if [[ "$(head -c 9 "$f" 2>/dev/null | tr -d '\0' || true)" != "version h" ]]; then
              H5_CANDIDATES+=("$f")
            else
              POINTER_COUNT=$((POINTER_COUNT + 1))
            fi
          done < <(find "$DATASET_DIR" -name '*.h5' -type f)
          if [[ ${#H5_CANDIDATES[@]} -eq 0 ]]; then
            echo "ERROR: No materialized .h5 dataset file found in $DATASET_DIR/" >&2
            if [[ "$POINTER_COUNT" -gt 0 ]]; then
              echo "  ($POINTER_COUNT .h5 file(s) present are LFS pointers — LFS materialization may have failed)" >&2
            fi
            exit 1
          fi
          if [[ ${#H5_CANDIDATES[@]} -gt 1 ]]; then
            echo "ERROR: Multiple materialized .h5 files found (${#H5_CANDIDATES[@]}); expected exactly one:" >&2
            printf '  %s\n' "${H5_CANDIDATES[@]}" >&2
            exit 1
          fi
          INPUT_H5="${H5_CANDIDATES[0]}"
          echo "Adapting: $INPUT_H5"
          # The workspace path comes from the runner-provided environment
          # variable and is quoted, so a path containing spaces cannot
          # split the command line.
          python "${GITHUB_WORKSPACE}/scripts/adapt_dataset.py" \
            "$INPUT_H5" "$WAVESFM_ADAPTED_DATA"
      - name: Verify adapted dataset
        shell: bash
        run: |
          set -euo pipefail
          # Prints shape/dtype of the `sample` and `label` datasets and the
          # class list stored as JSON in the file's `labels` attribute. Any
          # missing key raises, which fails the step. The quoted heredoc
          # keeps the shell from expanding anything inside the Python
          # source, and the context manager closes the file even when a
          # lookup raises.
          python - <<'PY'
          import json
          import os

          import h5py

          with h5py.File(os.environ["WAVESFM_ADAPTED_DATA"], "r") as f:
              print("sample:", f["sample"].shape, f["sample"].dtype)
              print("label:", f["label"].shape, f["label"].dtype)
              labels = json.loads(f.attrs["labels"])
              print("classes:", len(labels), labels[:5])
          PY
      - name: Train WavesFM (Linear Probe)
        shell: bash
        env:
          PYTHONUNBUFFERED: "1"
        run: |
          set -euo pipefail
          # Pick the runtime device by asking the torch build that the
          # "Install dependencies" step actually installed. Probing for the
          # nvidia-smi binary is not equivalent: when nvidia-smi exists but
          # reports no compute capability, the CPU wheel is installed and
          # `--device cuda` would hard-fail with ``Torch not compiled with
          # CUDA enabled`` at model.to(device). WavesFM's main_finetune.py
          # defaults --device to "cuda", so the flag is always passed
          # explicitly. Paired with the Clone WavesFM step's GradScaler
          # patch.
          DEVICE="$(python -c 'import torch; print("cuda" if torch.cuda.is_available() else "cpu")')"
          echo "Training device: $DEVICE"
          cd "$WAVESFM_REPO_DIR"
          # Job-level env values are read as shell variables, like
          # $WAVESFM_ADAPTED_DATA, instead of being spliced into the script
          # text by the workflow engine.
          # NOTE(review): --warmup-epochs (5) exceeds WAVESFM_EPOCHS (3), so
          # the warmup likely never completes in this run; confirm this is
          # acceptable for the canary.
          python -u main_finetune.py \
            --task "$WAVESFM_TASK" \
            --device "$DEVICE" \
            --train-data "$WAVESFM_ADAPTED_DATA" \
            --finetune /opt/qmb/riahub/model/wavesfm-v1p0.pth \
            --model vit_multi_small \
            --use-conditional-ln \
            --class-weights \
            --warmup-epochs 5 \
            --val-split 0.2 \
            --epochs "$WAVESFM_EPOCHS" \
            --batch-size "$WAVESFM_BATCH_SIZE" \
            --blr 1e-3 \
            --freeze-encoder \
            --output-dir "$WAVESFM_OUTPUT_DIR"
      # upload-artifact@v3: matches codebase convention (see TRAIN_TEMPLATE).
      # Upgrade to v4 is tracked as deferred/P2 work.
      # Publishes best.pth and log.txt from WAVESFM_OUTPUT_DIR as one
      # artifact named wavesfm-training-artifacts.
      # NOTE(review): the step has no `if:` condition, so it presumably
      # does not run when an earlier step fails, and log.txt from a failed
      # training run is then not uploaded. Confirm that is intended.
      - name: Upload training artifacts
        uses: actions/upload-artifact@v3
        with:
          name: wavesfm-training-artifacts
          path: |
            ${{ env.WAVESFM_OUTPUT_DIR }}/best.pth
            ${{ env.WAVESFM_OUTPUT_DIR }}/log.txt
          # A missing file only produces a warning; it does not fail the
          # step.
          if-no-files-found: warn
 # committed at 2026-05-28T05:41:59.835552+00:00