# WavesFM fine-tuning workflow.
#
# Pipeline: download the pinned base checkpoint and dataset from RIAHub
# (git + git-lfs, sparse checkout), clone the pinned WavesFM sources, install
# a CPU build of PyTorch plus the repo requirements, adapt the dataset to the
# layout expected by WavesFM, run a linear-probe fine-tune, and upload the
# resulting checkpoint and log as artifacts.
name: WavesFM Fine-Tuning

on:
  push:
    branches: [ "main" ]
    paths:
      - ".riahub/workflows/train.yaml"
  pull_request:
    branches: [ "main" ]
    paths:
      - ".riahub/workflows/train.yaml"

permissions:
  contents: read
  actions: read

jobs:
  WavesFM-Training:
    runs-on: "ubuntu-latest"
    env:
      WAVESFM_TASK: "rml"
      WAVESFM_EPOCHS: "3"
      WAVESFM_BATCH_SIZE: "16"
      WAVESFM_OUTPUT_DIR: "/opt/wavesfm/output"
      # Single source of truth for the cloned WavesFM repo location.
      # Steps read it as the shell variable $WAVESFM_REPO_DIR. To relocate
      # (e.g. /home/runner/wavesfm), change ONLY this value; every
      # downstream step uses the env var, no hard-coded paths.
      WAVESFM_REPO_DIR: "/opt/wavesfm/repo"
      WAVESFM_ADAPTED_DATA: "/opt/wavesfm/adapted_data.h5"
      # NOTE(review): plain-http base URL; the checkout steps also try the
      # https variant of the same host as a second candidate.
      RIAHUB_BASE_URL: "http://192.168.0.170:3000"
    steps:
      - name: Display basic runner info
        run: |
          echo "Runner OS: ${{ runner.os }}"
          echo "Runner Architecture: ${{ runner.arch }}"

      - name: Print GPU information
        run: |
          if command -v nvidia-smi &> /dev/null; then
            nvidia-smi
          else
            echo "No NVIDIA GPU available."
          fi

      - name: "Download Model (qoherent/wavesfm-base/wavesfm-v1p0.pth)"
        shell: bash
        timeout-minutes: 4
        env:
          RIAHUB_USER: ${{ secrets.QMBDEMO_USER }}
          RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }}
          GIT_TERMINAL_PROMPT: "0"
        run: |
          set -euo pipefail
          DEFAULT_BASE_URL='https://riahub.ai'
          BASE_URL_SOURCE=${RIAHUB_BASE_URL:-$DEFAULT_BASE_URL}
          BASE_URL_SOURCE="${BASE_URL_SOURCE%/}"

          # Print the base URLs to try, one per line: the configured URL
          # first, then the same host with the other scheme (http <-> https).
          # A scheme-less value yields https first, then http.
          build_base_candidates() {
            local raw="$1"
            if [[ "$raw" =~ ^https?:// ]]; then
              echo "$raw"
              if [[ "$raw" == http://* ]]; then
                echo "https://${raw#http://}"
              elif [[ "$raw" == https://* ]]; then
                echo "http://${raw#https://}"
              fi
              return
            fi
            echo "https://$raw"
            echo "http://$raw"
          }

          # Build an HTTP Basic auth header only when both secrets are set;
          # ``tr -d '\n'`` removes base64's line wrapping.
          AUTH_HEADER=""
          if [[ -n "${RIAHUB_USER:-}" && -n "${RIAHUB_TOKEN:-}" ]]; then
            AUTH_HEADER=$(printf 'Authorization: basic %s' \
              "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')")
          fi

          # ``sudo env GIT_TERMINAL_PROMPT=0`` propagates the env var across
          # sudo's default ``env_reset`` boundary; a bare ``sudo git`` would
          # see an empty env on most distros' default sudoers, so the
          # step-level ``env:`` block's GIT_TERMINAL_PROMPT=0 would NOT
          # actually reach git child processes. Without it, git falls back
          # to opening ``/dev/tty`` (the PTY allocated by act_runner) and
          # prompting for credentials on a 401, hanging until timeout.
          git_auth() {
            if [[ -n "$AUTH_HEADER" ]]; then
              sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@"
            else
              sudo env GIT_TERMINAL_PROMPT=0 git "$@"
            fi
          }

          REPO_PATH='/qoherent/wavesfm-base.git'
          REL_PATH='wavesfm-v1p0.pth'
          REF='48787da4d310e9f939d9a0abe92f2a6cb13fbca7'
          DEST_PATH='/opt/qmb/riahub/model/wavesfm-v1p0.pth'

          # Scratch clone; removed on any exit path.
          TMP_DIR=$(mktemp -d)
          cleanup() { sudo rm -rf "$TMP_DIR"; }
          trap cleanup EXIT

          if ! command -v git-lfs >/dev/null 2>&1; then
            sudo apt-get update -y
            sudo apt-get install -y git-lfs
          fi

          mapfile -t BASE_CANDIDATES < <(build_base_candidates "$BASE_URL_SOURCE")
          MATERIALIZED=0
          for base in "${BASE_CANDIDATES[@]}"; do
            base="${base%/}"
            REPO_URL="${base}${REPO_PATH}"
            echo "Fetching model from $REPO_URL"
            # Start every candidate from an empty repository.
            sudo rm -rf "$TMP_DIR"
            sudo mkdir -p "$TMP_DIR"
            sudo git -C "$TMP_DIR" init || continue
            sudo git -C "$TMP_DIR" remote add origin "$REPO_URL" || continue
            # NOT running ``git lfs install --local`` on purpose: it would
            # register the smudge/clean filter, which then fires during
            # ``git checkout FETCH_HEAD`` and tries to download every LFS
            # object via its OWN credential helper subprocess. That
            # subprocess does NOT inherit ``-c http.extraheader=`` or env
            # vars set by the parent via ``sudo``, so on an internal/
            # private repo it gets a 401 and hangs on /dev/tty waiting for
            # a username. By skipping ``lfs install``, checkout writes LFS
            # pointer files verbatim to disk; the explicit
            # ``git lfs fetch --include=...`` below downloads the actual
            # objects WITH the auth header, and ``git lfs checkout`` then
            # materializes them. (Explicit LFS commands do NOT require
            # ``lfs install``.)
            sudo git -C "$TMP_DIR" sparse-checkout init --no-cone || continue
            sudo git -C "$TMP_DIR" sparse-checkout set --no-cone -- "$REL_PATH" || continue
            if ! git_auth -C "$TMP_DIR" fetch --depth=1 origin "$REF"; then
              continue
            fi
            # ``GIT_LFS_SKIP_SMUDGE=1`` disables the LFS smudge filter for
            # this checkout. The smudge filter is installed SYSTEM-WIDE on
            # the runner (``/etc/gitconfig`` filter.lfs.smudge) by the
            # ``apt-get install git-lfs`` step above, so skipping
            # ``git lfs install --local`` is NOT sufficient to keep it
            # from firing. Without this env var, checkout invokes
            # ``git-lfs filter-process`` which spawns its own
            # ``git credential fill`` to authenticate LFS-object downloads,
            # and that subprocess does NOT inherit our auth header, so it
            # hangs on /dev/tty waiting for a username on internal/
            # private repos. With smudge skipped, checkout writes LFS
            # pointer files verbatim; the explicit ``git lfs fetch``
            # below materializes them with proper auth.
            # ``GIT_TERMINAL_PROMPT=0`` is belt-and-suspenders for any
            # other auth path that could open /dev/tty.
            if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 \
                git -C "$TMP_DIR" -c advice.detachedHead=false checkout FETCH_HEAD; then
              continue
            fi
            if ! git_auth -C "$TMP_DIR" lfs fetch origin --include="$REL_PATH" --exclude=""; then
              echo "LFS fetch failed for candidate $base, trying next" >&2
              continue
            fi
            if ! sudo git -C "$TMP_DIR" lfs checkout; then
              echo "LFS checkout failed for candidate $base, trying next" >&2
              continue
            fi
            sudo mkdir -p "$(dirname "$DEST_PATH")"
            if ! sudo cp -f "$TMP_DIR/$REL_PATH" "$DEST_PATH"; then
              continue
            fi
            # Reject LFS pointer files (~120-byte ASCII starting with
            # ``version https://git-lfs.github.com/spec/v1``). Shipping a
            # pointer to the training job would crash torch.load far from
            # the root cause.
            if [ "$(sudo head -c 9 "$DEST_PATH" 2>/dev/null || true)" = 'version h' ]; then
              echo "ERROR: $DEST_PATH is an LFS pointer, not actual content" >&2
              echo "  (LFS materialization failed for $REPO_URL)" >&2
              # Do not leave the bogus pointer behind at the destination.
              sudo rm -f "$DEST_PATH"
              continue
            fi
            MATERIALIZED=1
            break
          done

          if [[ "$MATERIALIZED" -ne 1 ]]; then
            echo "Failed to materialize model file using base URL candidates derived from: $BASE_URL_SOURCE" >&2
            if [[ -z "$AUTH_HEADER" ]]; then
              echo "  (set QMBDEMO_USER+QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2
            fi
            exit 1
          fi

      - name: "Checkout Dataset (qoherent/icc-demo/icc_canary_2026_05_28-v1.0.0.h5)"
        shell: bash
        timeout-minutes: 10
        env:
          RIAHUB_USER: ${{ secrets.QMBDEMO_USER }}
          RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }}
          GIT_TERMINAL_PROMPT: "0"
        run: |
          set -euo pipefail
          DEFAULT_BASE_URL='https://riahub.ai'
          BASE_URL_SOURCE=${RIAHUB_BASE_URL:-$DEFAULT_BASE_URL}
          BASE_URL_SOURCE="${BASE_URL_SOURCE%/}"

          # Print the base URLs to try, one per line: the configured URL
          # first, then the same host with the other scheme (http <-> https).
          # A scheme-less value yields https first, then http.
          build_base_candidates() {
            local raw="$1"
            if [[ "$raw" =~ ^https?:// ]]; then
              echo "$raw"
              if [[ "$raw" == http://* ]]; then
                echo "https://${raw#http://}"
              elif [[ "$raw" == https://* ]]; then
                echo "http://${raw#https://}"
              fi
              return
            fi
            echo "https://$raw"
            echo "http://$raw"
          }

          # Build an HTTP Basic auth header only when both secrets are set;
          # ``tr -d '\n'`` removes base64's line wrapping.
          AUTH_HEADER=""
          if [[ -n "${RIAHUB_USER:-}" && -n "${RIAHUB_TOKEN:-}" ]]; then
            AUTH_HEADER=$(printf 'Authorization: basic %s' \
              "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')")
          fi

          # ``sudo env GIT_TERMINAL_PROMPT=0`` propagates the env var across
          # sudo's default ``env_reset`` boundary; a bare ``sudo git`` would
          # see an empty env on most distros' default sudoers, so the
          # step-level ``env:`` block's GIT_TERMINAL_PROMPT=0 would NOT
          # actually reach git child processes. Without it, git falls back
          # to opening ``/dev/tty`` (the PTY allocated by act_runner) and
          # prompting for credentials on a 401, hanging until timeout.
          git_auth() {
            if [[ -n "$AUTH_HEADER" ]]; then
              sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@"
            else
              sudo env GIT_TERMINAL_PROMPT=0 git "$@"
            fi
          }

          REPO_PATH='/qoherent/icc-demo.git'
          DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-demo/9f87fa9fe2badd314ad81379064e236ea494e89d'
          sudo mkdir -p "$(dirname "$DEST_ROOT")"

          if ! command -v git-lfs >/dev/null 2>&1; then
            sudo apt-get update -y
            sudo apt-get install -y git-lfs
          fi

          mapfile -t BASE_CANDIDATES < <(build_base_candidates "$BASE_URL_SOURCE")
          MATERIALIZED=0
          for base in "${BASE_CANDIDATES[@]}"; do
            base="${base%/}"
            REPO_URL="${base}${REPO_PATH}"
            echo "Fetching dataset from $REPO_URL"
            # Start every candidate from an empty repository.
            sudo rm -rf "$DEST_ROOT"
            sudo mkdir -p "$DEST_ROOT"
            sudo git -C "$DEST_ROOT" init || continue
            sudo git -C "$DEST_ROOT" remote add origin "$REPO_URL" || continue
            # ``git lfs install --local`` is skipped on purpose: the smudge
            # filter it would register tries its own credential lookup
            # during ``git checkout FETCH_HEAD`` and hangs forever on
            # /dev/tty when the repo is internal/private. We rely on
            # the explicit ``git lfs fetch`` (with auth) +
            # ``git lfs checkout`` (local) pair below instead.
            sudo git -C "$DEST_ROOT" sparse-checkout init --no-cone || continue
            sudo git -C "$DEST_ROOT" sparse-checkout set --no-cone -- \
              'icc_canary_2026_05_28-v1.0.0.h5' || continue
            if ! git_auth -C "$DEST_ROOT" fetch --depth=1 origin '9f87fa9fe2badd314ad81379064e236ea494e89d'; then
              continue
            fi
            # ``GIT_LFS_SKIP_SMUDGE=1``: the runner has the LFS smudge
            # filter installed system-wide (``/etc/gitconfig``), so
            # checkout fires it and the filter's credential helper hangs
            # on /dev/tty for internal repos. Skipping smudge here lets
            # the explicit ``git lfs fetch`` below handle materialization
            # with proper auth.
            if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 \
                git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then
              continue
            fi
            if ! git_auth -C "$DEST_ROOT" lfs fetch origin --include='icc_canary_2026_05_28-v1.0.0.h5' --exclude=""; then
              echo "LFS fetch failed for candidate $base, trying next" >&2
              continue
            fi
            if ! sudo git -C "$DEST_ROOT" lfs checkout; then
              echo "LFS checkout failed for candidate $base, trying next" >&2
              continue
            fi
            # A file that still starts with "version h" is an LFS pointer,
            # not dataset content; treat this candidate as failed.
            POINTER_FOUND=0
            _LFS_REL_PATH='icc_canary_2026_05_28-v1.0.0.h5'
            if [[ "$(sudo head -c 9 "$DEST_ROOT/$_LFS_REL_PATH" 2>/dev/null || true)" == "version h" ]]; then
              echo "LFS materialization left a pointer at $DEST_ROOT/$_LFS_REL_PATH for candidate $base, trying next" >&2
              POINTER_FOUND=1
            fi
            if [[ "$POINTER_FOUND" -ne 0 ]]; then
              continue
            fi
            MATERIALIZED=1
            break
          done

          if [[ "$MATERIALIZED" -ne 1 ]]; then
            echo "Failed to materialize dataset using base URL candidates derived from: $BASE_URL_SOURCE" >&2
            if [[ -z "$AUTH_HEADER" ]]; then
              echo "  (set QMBDEMO_USER+QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2
            fi
            exit 1
          fi

      - name: Clone WavesFM
        shell: bash
        run: |
          set -euo pipefail
          # The parent directory also hosts the adapted dataset and the
          # training output. Other steps need ``sudo`` to write under /opt,
          # so fall back to sudo + chown when a plain mkdir fails or yields
          # a directory this user cannot write to. No-op where the plain
          # mkdir already works.
          REPO_PARENT="$(dirname "$WAVESFM_REPO_DIR")"
          if ! mkdir -p "$REPO_PARENT" 2>/dev/null || [[ ! -w "$REPO_PARENT" ]]; then
            sudo mkdir -p "$REPO_PARENT"
            sudo chown "$(id -u):$(id -g)" "$REPO_PARENT"
          fi
          rm -rf "$WAVESFM_REPO_DIR"
          git init "$WAVESFM_REPO_DIR"
          cd "$WAVESFM_REPO_DIR"
          git remote add origin https://github.com/AhmedTarek62/wavesfm.git
          # Shallow-fetch exactly the pinned commit.
          git fetch --depth 1 origin 483831732e32190b7018181b4f2cef93d755cef9
          git checkout FETCH_HEAD
          # CPU-runner compatibility patch. WavesFM upstream
          # main_finetune.py hardcodes a CUDA device in two places:
          #   line ~87:  `add_argument("--device", default="cuda")`
          #   line ~267: `torch.amp.GradScaler(device="cuda")`
          # On a runner without an NVIDIA GPU (e.g. dev machines with
          # only integrated Intel UHD graphics), the CPU-only torch
          # wheel we install in "Install dependencies" raises
          # `AssertionError: Torch not compiled with CUDA enabled`.
          # Replace the GradScaler literal with a scaler that is enabled
          # only when the argparse-provided device is not "cpu", so passing
          # `--device cpu` from the Train step actually takes effect.
          # The patch is applied (and reported) only when the literal is
          # still present, so re-running is harmless.
          if [[ -f main_finetune.py ]]; then
            if grep -q 'torch\.amp\.GradScaler(device="cuda")' main_finetune.py; then
              sed -i 's|torch\.amp\.GradScaler(device="cuda")|torch.cuda.amp.GradScaler(enabled=(args.device != "cpu"))|' main_finetune.py
              echo "Patched main_finetune.py GradScaler for CPU/GPU device parity."
            else
              echo "main_finetune.py: GradScaler(device=\"cuda\") literal not found; no patch applied."
            fi
          fi

      - name: Checkout adapter and model config
        uses: actions/checkout@v5
        with:
          sparse-checkout: |
            scripts/adapt_dataset.py
            .riahub/train_configs/model

      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - name: Install dependencies
        shell: bash
        run: |
          set -euo pipefail
          # Use ``python -m pip`` rather than ``pip`` directly: actions/setup-python
          # always puts ``python`` on PATH but the ``pip`` shim isn't guaranteed
          # on every distro / venv layout. ``python -m pip`` always works
          # against the active interpreter.
          PIP="python -m pip"
          # The pinned wavesfm SHA controls repo layout. Three common shapes:
          #   - Pure script-only repo: requirements.txt only -> install
          #     requirements (current pinned SHA shape)
          #   - Python package: setup.py / setup.cfg, OR pyproject.toml
          #     with a real [build-system] section -> editable install
          #   - Poetry-only / Hatch-only / tool-only pyproject.toml WITHOUT
          #     [build-system] -> `pip install -e .` would error; we install
          #     requirements.txt if available and skip the editable install
          # Order matters: install requirements.txt first if present (it's
          # the most explicit dep list); editable install layered on top
          # only when the repo is genuinely installable (has setup.py /
          # setup.cfg, or pyproject.toml with [build-system]).
          cd "$WAVESFM_REPO_DIR"
          # FAST-PATH: install CPU torch from pytorch.org/whl/cpu FIRST (~200MB).
          # This makes torch==X already-satisfied so requirements.txt does not
          # pull the 755MB manylinux wheel with bundled CUDA from PyPI default.
          $PIP install --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple "numpy<2" "torch==2.2.2" torchvision
          # Re-assert numpy<2 before the requirements.txt install below.
          $PIP install --upgrade --force-reinstall "numpy<2"
          INSTALLED_SOMETHING=0
          if [[ -f requirements.txt ]]; then
            $PIP install -r requirements.txt
            INSTALLED_SOMETHING=1
          fi
          if [[ -f setup.py ]] || [[ -f setup.cfg ]] || \
             ( [[ -f pyproject.toml ]] && grep -q "^\[build-system\]" pyproject.toml ); then
            $PIP install -e .
            INSTALLED_SOMETHING=1
          fi
          if [[ "$INSTALLED_SOMETHING" -eq 0 ]]; then
            echo "ERROR: $WAVESFM_REPO_DIR has no installable Python metadata" >&2
            echo "       expected: requirements.txt, setup.py, setup.cfg, or pyproject.toml with [build-system]" >&2
            exit 1
          fi
          $PIP install h5py scipy
          # After requirements.txt, force numpy back to <2 (torch 2.2.2 has
          # NumPy 1.x ABI; transitive deps in requirements.txt would
          # otherwise leave numpy 2.x in place and crash at runtime).
          $PIP install --upgrade --force-reinstall "numpy<2"

          # Informational only: work out which PyTorch wheel index would
          # match this runner's GPU. Nothing is installed from it; the CPU
          # wheel installed at the top of this step stays in place.
          TORCH_INDEX_URL="https://download.pytorch.org/whl/cpu"
          TORCH_REASON="no NVIDIA GPU detected"
          if command -v nvidia-smi &> /dev/null; then
            CAP_LINES="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null || true)"
            if [[ -z "$CAP_LINES" ]]; then
              # ``|| true``: with ``pipefail`` a failing nvidia-smi (binary
              # present, driver missing) would otherwise abort the step.
              CAP_LINES="$(nvidia-smi -q 2>/dev/null | awk -F: '/Compute Capability/ {print $2}' || true)"
            fi
            # Highest compute capability across all reported GPUs.
            CAP_MAX="$(echo "$CAP_LINES" | awk '{gsub(/[^0-9.]/,""); if ($0=="") next; if ($0+0>max) max=$0+0} END {if (max>0) print max}')"
            if [[ -n "$CAP_MAX" ]]; then
              if awk -v cap="$CAP_MAX" 'BEGIN{exit !(cap>=7.5)}'; then
                TORCH_INDEX_URL="https://download.pytorch.org/whl/cu130"
                TORCH_REASON="compute capability ${CAP_MAX} >= 7.5"
              else
                TORCH_INDEX_URL="https://download.pytorch.org/whl/cu126"
                TORCH_REASON="compute capability ${CAP_MAX} < 7.5"
              fi
            fi
          fi
          echo "Matching PyTorch index for this runner: ${TORCH_INDEX_URL} (${TORCH_REASON})."
          # torch was pre-installed at the top of this step; no force-reinstall needed.
          echo "Skipping torch force-reinstall (CPU wheel already installed at step head); not installing from: $TORCH_INDEX_URL"

      - name: Find and adapt dataset
        shell: bash
        run: |
          set -euo pipefail
          # Find .h5 files but EXCLUDE LFS pointer files. The dataset
          # checkout's sparse-checkout pattern is supposed to limit the
          # working tree to the target file, but sparse-checkout in
          # ``init+fetch+checkout`` mode (vs the previous
          # ``clone --no-checkout``) doesn't always activate cleanly,
          # leaving OTHER LFS-tracked files (e.g. sibling datasets in
          # the same repo) as unmaterialized 120-180 byte pointer files
          # in the working tree. The training adapter must skip those;
          # otherwise ``find ... -name '*.h5'`` counts pointer files as
          # real datasets and the "expected exactly one" check trips.
          # The same ``head -c 9`` LFS-pointer test used elsewhere works
          # here: real HDF5 files start with the HDF5 magic byte
          # 0x89, never with the ASCII string "version h".
          H5_CANDIDATES=()
          while IFS= read -r f; do
            if [[ "$(head -c 9 "$f" 2>/dev/null || true)" != "version h" ]]; then
              H5_CANDIDATES+=("$f")
            fi
          done < <(find /opt/qmb/riahub/dataset -name '*.h5' -type f)
          if [[ ${#H5_CANDIDATES[@]} -eq 0 ]]; then
            echo "ERROR: No materialized .h5 dataset file found in /opt/qmb/riahub/dataset/" >&2
            echo "  (any .h5 files present are LFS pointers — LFS materialization may have failed)" >&2
            exit 1
          fi
          if [[ ${#H5_CANDIDATES[@]} -gt 1 ]]; then
            echo "ERROR: Multiple materialized .h5 files found (${#H5_CANDIDATES[@]}); expected exactly one:" >&2
            printf '  %s\n' "${H5_CANDIDATES[@]}" >&2
            exit 1
          fi
          INPUT_H5="${H5_CANDIDATES[0]}"
          echo "Adapting: $INPUT_H5"
          # Quoted so a workspace path containing whitespace is passed as
          # a single argument.
          python "${{ github.workspace }}/scripts/adapt_dataset.py" \
            "$INPUT_H5" "$WAVESFM_ADAPTED_DATA"

      - name: Verify adapted dataset
        run: |
          # Sanity-print the adapted file: the ``sample`` and ``label``
          # datasets plus the JSON-encoded ``labels`` attribute.
          python -c "
          import h5py, json, os
          f = h5py.File(os.environ['WAVESFM_ADAPTED_DATA'], 'r')
          print('sample:', f['sample'].shape, f['sample'].dtype)
          print('label:', f['label'].shape, f['label'].dtype)
          labels = json.loads(f.attrs['labels'])
          print('classes:', len(labels), labels[:5])
          f.close()
          "

      - name: Train WavesFM (Linear Probe)
        shell: bash
        env:
          PYTHONUNBUFFERED: "1"
        run: |
          set -euo pipefail
          # Pick the runtime device by asking the INSTALLED torch build.
          # The "Install dependencies" step installs the CPU-only wheel
          # even on GPU runners, so probing ``nvidia-smi`` here would pick
          # "cuda" and hard-fail with ``Torch not compiled with CUDA
          # enabled``. ``torch.cuda.is_available()`` is False for a CPU
          # wheel and for a CUDA wheel without a usable GPU.
          # WavesFM's main_finetune.py defaults --device to "cuda", so the
          # flag is always passed explicitly. Paired with the Clone WavesFM
          # step's GradScaler patch.
          DEVICE="$(python -c 'import torch; print("cuda" if torch.cuda.is_available() else "cpu")')"
          echo "Training device: $DEVICE"
          cd "$WAVESFM_REPO_DIR"
          # NOTE(review): --warmup-epochs (5) exceeds WAVESFM_EPOCHS (3);
          # confirm against main_finetune.py's LR schedule that this is
          # intended for the short demo run.
          python -u main_finetune.py \
            --task "${{ env.WAVESFM_TASK }}" \
            --device "$DEVICE" \
            --train-data "$WAVESFM_ADAPTED_DATA" \
            --finetune /opt/qmb/riahub/model/wavesfm-v1p0.pth \
            --model vit_multi_small \
            --use-conditional-ln \
            --class-weights \
            --warmup-epochs 5 \
            --val-split 0.2 \
            --epochs "${{ env.WAVESFM_EPOCHS }}" \
            --batch-size "${{ env.WAVESFM_BATCH_SIZE }}" \
            --blr 1e-3 \
            --freeze-encoder \
            --output-dir "${{ env.WAVESFM_OUTPUT_DIR }}"

      # upload-artifact@v3: matches codebase convention (see TRAIN_TEMPLATE).
      # Upgrade to v4 is tracked as deferred/P2 work.
      - name: Upload training artifacts
        uses: actions/upload-artifact@v3
        with:
          name: wavesfm-training-artifacts
          path: |
            ${{ env.WAVESFM_OUTPUT_DIR }}/best.pth
            ${{ env.WAVESFM_OUTPUT_DIR }}/log.txt
          if-no-files-found: warn
# committed at 2026-05-28T06:39:26.910514+00:00