Push Tracker
icc-demo-0/.riahub/workflows/train.yaml

476 lines
22 KiB
YAML
Raw Normal View History

# Workflow identity and triggers. Runs only when this workflow file itself
# changes on main, either by direct push or by a pull request targeting main.
name: WavesFM Fine-Tuning
on:
  push:
    branches:
      - 'main'
    paths:
      - '.riahub/workflows/train.yaml'
  pull_request:
    branches:
      - 'main'
    paths:
      - '.riahub/workflows/train.yaml'
# Read-only token scopes.
permissions:
  contents: read
  actions: read
jobs:
  WavesFM-Training:
    runs-on: 'ubuntu-24.04'
    # Job-level environment; exported to every step's shell.
    env:
      # Training hyper-parameters forwarded to main_finetune.py by the
      # Train step. Kept as quoted strings so YAML does not retype them.
      WAVESFM_TASK: 'rml'
      WAVESFM_EPOCHS: '10'
      WAVESFM_BATCH_SIZE: '2048'
      WAVESFM_OUTPUT_DIR: '/opt/wavesfm/output'
      # Single source of truth for where the WavesFM repo is cloned.
      # Steps read it as the shell variable $WAVESFM_REPO_DIR; to relocate
      # the clone (e.g. /home/runner/wavesfm) change ONLY this value.
      WAVESFM_REPO_DIR: '/opt/wavesfm/repo'
      # Output of scripts/adapt_dataset.py and input to the Train step.
      WAVESFM_ADAPTED_DATA: '/opt/wavesfm/adapted_data.h5'
      # Base URL used by the model and dataset download steps. Points at an
      # internal LAN address because the external riahub.ai LFS endpoint is
      # unreachable from the runners.
      RIAHUB_BASE_URL: 'http://192.168.0.170:3000'
    steps:
- name: Display basic runner info
  # Informational only: writes the runner context values to the job log.
  run: |
    printf '%s\n' \
      "Runner OS: ${{ runner.os }}" \
      "Runner Architecture: ${{ runner.arch }}"
- name: Print GPU information
  run: |
    # A missing nvidia-smi binary is reported, not treated as an error.
    if ! command -v nvidia-smi >/dev/null 2>&1; then
      echo "No NVIDIA GPU available."
    else
      nvidia-smi
    fi
- name: "Download Model (qoherent/wavesfm-base/wavesfm-v1p0.pth)"
  shell: bash
  timeout-minutes: 4
  env:
    RIAHUB_USER: ${{ secrets.QMBDEMO_USER }}
    RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }}
    GIT_TERMINAL_PROMPT: "0"
  run: |
    set -euo pipefail

    # Base URL: job-level RIAHUB_BASE_URL if set, public host otherwise;
    # one trailing slash is trimmed.
    BASE_URL_SOURCE="${RIAHUB_BASE_URL:-https://riahub.ai}"
    BASE_URL_SOURCE="${BASE_URL_SOURCE%/}"

    # Prints the base URLs to try, one per line: the given URL first, then
    # the same host with the other scheme. A scheme-less value yields
    # https first, then http.
    build_base_candidates() {
      local raw="$1"
      case "$raw" in
        http://*)
          echo "$raw"
          echo "https://${raw#http://}"
          ;;
        https://*)
          echo "$raw"
          echo "http://${raw#https://}"
          ;;
        *)
          echo "https://$raw"
          echo "http://$raw"
          ;;
      esac
    }

    # HTTP Basic header, built only when both secrets are present.
    AUTH_HEADER=""
    if [[ -n "${RIAHUB_USER:-}" && -n "${RIAHUB_TOKEN:-}" ]]; then
      basic_cred="$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')"
      AUTH_HEADER="Authorization: basic ${basic_cred}"
    fi

    # Runs git under sudo, attaching the auth header when one exists.
    # GIT_TERMINAL_PROMPT=0 is passed through ``sudo env`` explicitly: the
    # step-level ``env:`` value is not expected to survive sudo's
    # environment reset, and without it git could prompt on the runner's
    # TTY after a 401 and hang until the step timeout.
    git_auth() {
      if [[ -z "$AUTH_HEADER" ]]; then
        sudo env GIT_TERMINAL_PROMPT=0 git "$@"
      else
        sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@"
      fi
    }

    REPO_PATH='/qoherent/wavesfm-base.git'
    REL_PATH='wavesfm-v1p0.pth'
    REF='48787da4d310e9f939d9a0abe92f2a6cb13fbca7'
    DEST_PATH='/opt/qmb/riahub/model/wavesfm-v1p0.pth'

    # Scratch clone location, removed on any exit path.
    work_dir=$(mktemp -d)
    trap 'sudo rm -rf "$work_dir"' EXIT

    if ! command -v git-lfs >/dev/null 2>&1; then
      sudo apt-get update -y
      sudo apt-get install -y git-lfs
    fi

    mapfile -t candidates < <(build_base_candidates "$BASE_URL_SOURCE")
    got_model=0
    for candidate in "${candidates[@]}"; do
      candidate="${candidate%/}"
      REPO_URL="${candidate}${REPO_PATH}"
      echo "Fetching model from $REPO_URL"

      # Fresh scratch repo per candidate; any failing stage moves on to
      # the next candidate.
      sudo rm -rf "$work_dir"
      sudo mkdir -p "$work_dir"
      sudo git -C "$work_dir" init || continue
      sudo git -C "$work_dir" remote add origin "$REPO_URL" || continue
      sudo git -C "$work_dir" sparse-checkout init --no-cone || continue
      sudo git -C "$work_dir" sparse-checkout set --no-cone -- "$REL_PATH" || continue
      git_auth -C "$work_dir" fetch --depth=1 origin "$REF" || continue

      # LFS handling. ``git lfs install --local`` is deliberately not run,
      # and GIT_LFS_SKIP_SMUDGE=1 is set for the checkout because the
      # smudge filter may already be registered system-wide by the git-lfs
      # package. A smudge-driven download would authenticate through its
      # own credential lookup, which does not receive the auth header and
      # can block on the TTY for internal/private repos. With smudge
      # skipped, checkout writes pointer files verbatim; the explicit
      # ``git lfs fetch`` below carries the auth header and
      # ``git lfs checkout`` materializes the object.
      sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 \
        git -C "$work_dir" -c advice.detachedHead=false checkout FETCH_HEAD || continue
      git_auth -C "$work_dir" lfs fetch origin --include="$REL_PATH" --exclude="" || continue
      sudo git -C "$work_dir" lfs checkout || continue

      sudo mkdir -p "$(dirname "$DEST_PATH")"
      sudo cp -f "$work_dir/$REL_PATH" "$DEST_PATH" || continue

      # Reject an unmaterialized LFS pointer (small ASCII file beginning
      # ``version https://git-lfs.github.com/spec/v1``) so the failure
      # surfaces here rather than later inside the training job.
      if [[ "$(sudo head -c 9 "$DEST_PATH" 2>/dev/null || true)" == 'version h' ]]; then
        echo "ERROR: $DEST_PATH is an LFS pointer, not actual content" >&2
        echo " (LFS materialization failed for $REPO_URL)" >&2
        continue
      fi

      got_model=1
      break
    done

    if (( got_model != 1 )); then
      echo "Failed to materialize model file using base URL candidates derived from: $BASE_URL_SOURCE" >&2
      if [[ -z "$AUTH_HEADER" ]]; then
        echo " (set QMBDEMO_USER+QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2
      fi
      exit 1
    fi
- name: Checkout Training Dataset
  shell: bash
  timeout-minutes: 10
  env:
    RIAHUB_USER: ${{ secrets.QMBDEMO_USER }}
    RIAHUB_TOKEN: ${{ secrets.QMBDEMO_TOKEN }}
    GIT_TERMINAL_PROMPT: "0"
  run: |
    set -euo pipefail
    DEFAULT_BASE_URL='https://riahub.ai'
    BASE_URL_SOURCE=${RIAHUB_BASE_URL:-$DEFAULT_BASE_URL}
    BASE_URL_SOURCE="${BASE_URL_SOURCE%/}"

    # Prints the base URLs to try, one per line: the configured URL first,
    # then the same host with the opposite scheme. A scheme-less value
    # yields https first, then http.
    build_base_candidates() {
      local raw="$1"
      if [[ "$raw" =~ ^https?:// ]]; then
        echo "$raw"
        if [[ "$raw" == http://* ]]; then
          echo "https://${raw#http://}"
        elif [[ "$raw" == https://* ]]; then
          echo "http://${raw#https://}"
        fi
        return
      fi
      echo "https://$raw"
      echo "http://$raw"
    }

    # HTTP Basic header, built only when both secrets are present.
    AUTH_HEADER=""
    if [[ -n "${RIAHUB_USER:-}" && -n "${RIAHUB_TOKEN:-}" ]]; then
      AUTH_HEADER=$(printf 'Authorization: basic %s' "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')")
    fi

    # Runs git under sudo with prompting disabled, attaching the auth
    # header when one exists.
    git_auth() {
      if [[ -n "$AUTH_HEADER" ]]; then
        sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@"
      else
        sudo env GIT_TERMINAL_PROMPT=0 git "$@"
      fi
    }

    REPO_PATH='/qoherent/icc-demo.git'
    # Dataset file and pinned commit, named once instead of being repeated
    # as literals at every use.
    REL_PATH='datasets/icc28-train_v1.0.0.h5'
    REF='ef0a0e430f8e5019dec52794fccab958b3a3c2b7'
    DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-demo/main'
    sudo mkdir -p "$(dirname "$DEST_ROOT")"

    if ! command -v git-lfs >/dev/null 2>&1; then
      sudo apt-get update -y
      sudo apt-get install -y git-lfs
    fi

    mapfile -t BASE_CANDIDATES < <(build_base_candidates "$BASE_URL_SOURCE")
    MATERIALIZED=0
    for base in "${BASE_CANDIDATES[@]}"; do
      base="${base%/}"
      REPO_URL="${base}${REPO_PATH}"
      echo "Fetching dataset from $REPO_URL"

      # The destination is rebuilt from scratch for every candidate.
      sudo rm -rf "$DEST_ROOT"
      sudo mkdir -p "$DEST_ROOT"
      sudo git -C "$DEST_ROOT" init || continue
      sudo git -C "$DEST_ROOT" remote add origin "$REPO_URL" || continue
      sudo git -C "$DEST_ROOT" sparse-checkout init --no-cone || continue
      sudo git -C "$DEST_ROOT" sparse-checkout set --no-cone -- "$REL_PATH" || continue
      if ! git_auth -C "$DEST_ROOT" fetch --depth=1 origin "$REF"; then
        continue
      fi
      # Smudge is skipped so checkout writes LFS pointers verbatim; the
      # explicit, authenticated ``git lfs fetch`` below downloads the
      # object instead.
      if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then
        continue
      fi
      if ! git_auth -C "$DEST_ROOT" lfs fetch origin --include="$REL_PATH" --exclude=""; then
        echo "LFS fetch failed for candidate $base, trying next" >&2
        continue
      fi
      if ! sudo git -C "$DEST_ROOT" lfs checkout; then
        echo "LFS checkout failed for candidate $base, trying next" >&2
        continue
      fi

      target="$DEST_ROOT/$REL_PATH"
      # A missing or zero-length file must not count as success. The
      # pointer test below cannot catch it: ``head`` on an absent file
      # yields an empty string, which is not "version h".
      if ! sudo test -s "$target"; then
        echo "Dataset file missing or empty at $target for candidate $base, trying next" >&2
        continue
      fi
      # Reject an unmaterialized LFS pointer.
      if [[ "$(sudo head -c 9 "$target" 2>/dev/null || true)" == "version h" ]]; then
        echo "LFS materialization left a pointer at $target for candidate $base, trying next" >&2
        continue
      fi

      MATERIALIZED=1
      break
    done

    if [[ "$MATERIALIZED" -ne 1 ]]; then
      echo "Failed to materialize dataset using base URL candidates derived from: $BASE_URL_SOURCE" >&2
      if [[ -z "$AUTH_HEADER" ]]; then
        echo " (no credentials configured: set QMBDEMO_USER and QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2
      fi
      exit 1
    fi
- name: Clone WavesFM
  shell: bash
  run: |
    set -euo pipefail
    # Shallow-fetch exactly the pinned upstream commit into a fresh
    # directory at $WAVESFM_REPO_DIR.
    mkdir -p "$(dirname "$WAVESFM_REPO_DIR")"
    rm -rf "$WAVESFM_REPO_DIR"
    git init "$WAVESFM_REPO_DIR"
    cd "$WAVESFM_REPO_DIR"
    git remote add origin https://github.com/AhmedTarek62/wavesfm.git
    git fetch --depth 1 origin 483831732e32190b7018181b4f2cef93d755cef9
    git -c advice.detachedHead=false checkout FETCH_HEAD

    # CPU-runner compatibility patch. Upstream main_finetune.py is
    # reported to hardcode CUDA in two places:
    #   ~line 87:  add_argument("--device", default="cuda")
    #   ~line 267: torch.amp.GradScaler(device="cuda")
    # The first is handled by the Train step passing --device explicitly.
    # The second is rewritten here to
    #   torch.cuda.amp.GradScaler(enabled=(args.device != "cpu"))
    # so the scaler is disabled when training runs with --device cpu.
    # NOTE(review): assumes ``args`` is in scope at that call site in the
    # pinned commit - confirm against upstream.
    # Re-running is harmless: once rewritten, the literal no longer matches.
    GRADSCALER_LITERAL='torch.amp.GradScaler(device="cuda")'
    if [[ -f main_finetune.py ]]; then
      if grep -qF "$GRADSCALER_LITERAL" main_finetune.py; then
        sed -i 's|torch\.amp\.GradScaler(device="cuda")|torch.cuda.amp.GradScaler(enabled=(args.device != "cpu"))|' main_finetune.py
        echo "Patched main_finetune.py GradScaler for CPU/GPU device parity."
      else
        # Say so instead of claiming success when sed would have matched
        # nothing (upstream layout changed, or already patched).
        echo "NOTICE: GradScaler CUDA literal not found in main_finetune.py; nothing patched." >&2
      fi
    fi
# Sparse checkout limited to the two listed paths: the dataset adapter
# script (run later from the workspace by the "Find and adapt dataset"
# step) and the model training-config directory.
# NOTE(review): cone mode is left at the action's default while the first
# entry is a file path, not a directory - confirm this selects the file as
# intended.
- name: Checkout adapter and model config
  uses: actions/checkout@v5
  with:
    sparse-checkout: |
      scripts/adapt_dataset.py
      .riahub/train_configs/model
# Provides the ``python`` interpreter used by every later step.
- name: Setup Python
  uses: actions/setup-python@v6
  with:
    # Quoted so YAML keeps the version a string rather than a float.
    python-version: "3.12"
- name: Install dependencies
  shell: bash
  run: |
    set -euo pipefail
    # ``python -m pip`` instead of a bare ``pip``: it always targets the
    # interpreter that actions/setup-python put on PATH.
    PIP="python -m pip"

    # The pinned WavesFM commit decides the repo layout. Handled shapes:
    #   - script-only repo: requirements.txt only -> install requirements
    #   - Python package: setup.py / setup.cfg, or pyproject.toml with a
    #     [build-system] table -> editable install
    #   - tool-only pyproject.toml without [build-system]: an editable
    #     install would error, so only requirements.txt is installed
    # requirements.txt goes first (most explicit dependency list); the
    # editable install is layered on top only when the repo is installable.
    cd "$WAVESFM_REPO_DIR"

    # Pre-install the CPU torch wheel and numpy<2 so requirements.txt finds
    # them already satisfied. torch 2.2.2 is built against the NumPy 1.x
    # ABI and crashes if numpy 2.x is installed.
    $PIP install --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple "numpy<2" "torch==2.2.2" torchvision

    INSTALLED_SOMETHING=0
    if [[ -f requirements.txt ]]; then
      $PIP install -r requirements.txt
      INSTALLED_SOMETHING=1
    fi
    if [[ -f setup.py ]] || [[ -f setup.cfg ]] || \
       ( [[ -f pyproject.toml ]] && grep -q "^\[build-system\]" pyproject.toml ); then
      $PIP install -e .
      INSTALLED_SOMETHING=1
    fi
    if [[ "$INSTALLED_SOMETHING" -eq 0 ]]; then
      echo "ERROR: $WAVESFM_REPO_DIR has no installable Python metadata" >&2
      echo " expected: requirements.txt, setup.py, setup.cfg, or pyproject.toml with [build-system]" >&2
      exit 1
    fi

    # Extra packages installed on top of the repo's own requirements.
    $PIP install h5py scipy

    # Pin numpy<2 last: the installs above may have pulled numpy 2.x in
    # through transitive dependencies.
    $PIP install --upgrade --force-reinstall "numpy<2"

    # No GPU-specific torch wheel is installed by this step. The former
    # compute-capability probe only fed a log line naming a CUDA wheel
    # index that was never installed from, so it is removed. Report what
    # is actually installed; this also fails early if the numpy/torch
    # pair cannot be imported.
    python -c "import numpy, torch; print('numpy', numpy.__version__, '| torch', torch.__version__, '| cuda available:', torch.cuda.is_available())"
- name: Find and adapt dataset
  shell: bash
  run: |
    set -euo pipefail
    # Collect .h5 files under the dataset root, skipping LFS pointer files.
    # The dataset checkout's sparse pattern is meant to leave only the
    # target file in the working tree, but other LFS-tracked files can end
    # up present as small unmaterialized pointers. Counting those would
    # trip the "expected exactly one" check below. A pointer starts with
    # the ASCII text "version h"; a real HDF5 file starts with the byte
    # 0x89 instead.
    H5_CANDIDATES=()
    while IFS= read -r f; do
      # NUL bytes are stripped before capture so bash does not warn about
      # them in the command substitution; the comparison is unaffected.
      magic="$(head -c 9 "$f" 2>/dev/null | tr -d '\0' || true)"
      if [[ "$magic" != "version h" ]]; then
        H5_CANDIDATES+=("$f")
      fi
    done < <(find /opt/qmb/riahub/dataset -name '*.h5' -type f)

    if [[ ${#H5_CANDIDATES[@]} -eq 0 ]]; then
      echo "ERROR: No materialized .h5 dataset file found in /opt/qmb/riahub/dataset/" >&2
      echo " (any .h5 files present are LFS pointers — LFS materialization may have failed)" >&2
      exit 1
    fi
    if [[ ${#H5_CANDIDATES[@]} -gt 1 ]]; then
      echo "ERROR: Multiple materialized .h5 files found (${#H5_CANDIDATES[@]}); expected exactly one:" >&2
      printf ' %s\n' "${H5_CANDIDATES[@]}" >&2
      exit 1
    fi

    INPUT_H5="${H5_CANDIDATES[0]}"
    echo "Adapting: $INPUT_H5"
    # The workspace path comes from the runner-provided environment
    # variable and is quoted, so a path with spaces or shell
    # metacharacters stays a single argument and is never spliced into
    # the script text.
    python "${GITHUB_WORKSPACE}/scripts/adapt_dataset.py" \
      "$INPUT_H5" "$WAVESFM_ADAPTED_DATA"
- name: Verify adapted dataset
  run: |
    # Smoke-check the adapted file: opening it and reading the 'sample' and
    # 'label' datasets plus the JSON 'labels' attribute fails the step if
    # any of them is missing. The context manager closes the file even
    # when one of the lookups raises.
    python -c "
    import h5py, json, os
    with h5py.File(os.environ['WAVESFM_ADAPTED_DATA'], 'r') as f:
        print('sample:', f['sample'].shape, f['sample'].dtype)
        print('label:', f['label'].shape, f['label'].dtype)
        labels = json.loads(f.attrs['labels'])
        print('classes:', len(labels), labels[:5])
    "
- name: Train WavesFM (Linear Probe)
  shell: bash
  env:
    PYTHONUNBUFFERED: "1"
  run: |
    set -euo pipefail
    # Pick the device from what the installed torch build can use, not
    # from whether nvidia-smi exists. The Install dependencies step
    # installs the CPU-only torch wheel, so on a GPU host an nvidia-smi
    # probe would select "cuda" and training would abort with
    # ``Torch not compiled with CUDA enabled`` at model.to(device).
    # --device is always passed explicitly because main_finetune.py
    # defaults it to "cuda".
    DEVICE="$(python -c 'import torch; print("cuda" if torch.cuda.is_available() else "cpu")')"
    echo "Training device: $DEVICE"

    cd "$WAVESFM_REPO_DIR"
    # Hyper-parameters come from the job-level env block, read as shell
    # variables like WAVESFM_ADAPTED_DATA. The checkpoint path is the one
    # the Download Model step writes to.
    python -u main_finetune.py \
      --task "$WAVESFM_TASK" \
      --device "$DEVICE" \
      --train-data "$WAVESFM_ADAPTED_DATA" \
      --finetune /opt/qmb/riahub/model/wavesfm-v1p0.pth \
      --model vit_multi_small \
      --use-conditional-ln \
      --class-weights \
      --warmup-epochs 5 \
      --val-split 0.2 \
      --epochs "$WAVESFM_EPOCHS" \
      --batch-size "$WAVESFM_BATCH_SIZE" \
      --blr 1e-3 \
      --freeze-encoder \
      --output-dir "$WAVESFM_OUTPUT_DIR"
# upload-artifact@v3: matches codebase convention (see TRAIN_TEMPLATE).
# Upgrade to v4 is tracked as deferred/P2 work.
# NOTE(review): confirm v3 is still supported by the runner platform in use.
# Uploads the two listed files from the training output directory.
- name: Upload training artifacts
  uses: actions/upload-artifact@v3
  with:
    name: wavesfm-training-artifacts
    path: |
      ${{ env.WAVESFM_OUTPUT_DIR }}/best.pth
      ${{ env.WAVESFM_OUTPUT_DIR }}/log.txt
    # Missing files produce a warning rather than failing the step.
    if-no-files-found: warn
2026-05-28 07:57:55 -04:00
# committed at 2026-05-28T11:57:55.094537+00:00