Training run - 2026-05-28 06:47:45
Some checks failed
WavesFM Fine-Tuning / WavesFM-Training (push) Failing after 42s

This commit is contained in:
A ash 2026-05-28 06:47:45 -04:00
parent 9cb3f35225
commit ceb2c3fc56
2 changed files with 16 additions and 49 deletions

View File

@ -0,0 +1,5 @@
# Training-run configuration.
# NOTE(review): the tool that consumes this file is not visible from here;
# confirm the expected schema with that consumer before adding or renaming keys.
source: wavesfm
# presumably a human-readable label for the run; verify against the consumer
name: WavesFM Linear Probe
# task / epochs / batch_size carry the same values as the training workflow's
# WAVESFM_TASK, WAVESFM_EPOCHS and WAVESFM_BATCH_SIZE env entries.
# NOTE(review): nothing visible here keeps the two in sync — confirm which one
# is authoritative.
task: rml
epochs: 10
batch_size: 2048

View File

@ -15,11 +15,11 @@ permissions:
jobs:
WavesFM-Training:
runs-on: "ubuntu-latest"
runs-on: "ubuntu-24.04"
env:
WAVESFM_TASK: "rml"
WAVESFM_EPOCHS: "3"
WAVESFM_BATCH_SIZE: "16"
WAVESFM_EPOCHS: "10"
WAVESFM_BATCH_SIZE: "2048"
WAVESFM_OUTPUT_DIR: "/opt/wavesfm/output"
# Single source of truth for the cloned WavesFM repo location.
# Referenced as ${{ env.WAVESFM_REPO_DIR }} in steps. To relocate
@ -27,7 +27,6 @@ jobs:
# downstream step uses the env var, no hard-coded paths.
WAVESFM_REPO_DIR: "/opt/wavesfm/repo"
WAVESFM_ADAPTED_DATA: "/opt/wavesfm/adapted_data.h5"
RIAHUB_BASE_URL: "http://192.168.0.170:3000"
steps:
- name: Display basic runner info
run: |
@ -42,7 +41,6 @@ jobs:
echo "No NVIDIA GPU available."
fi
- name: "Download Model (qoherent/wavesfm-base/wavesfm-v1p0.pth)"
shell: bash
timeout-minutes: 4
@ -179,8 +177,7 @@ jobs:
fi
exit 1
fi
- name: "Checkout Dataset (qoherent/icc-demo/icc_canary_2026_05_28-v1.0.0.h5)"
- name: Checkout Training Dataset
shell: bash
timeout-minutes: 10
env:
@ -210,16 +207,8 @@ jobs:
AUTH_HEADER=""
if [[ -n "${RIAHUB_USER:-}" && -n "${RIAHUB_TOKEN:-}" ]]; then
AUTH_HEADER=$(printf 'Authorization: basic %s' \
"$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')")
AUTH_HEADER=$(printf 'Authorization: basic %s' "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')")
fi
# ``sudo env GIT_TERMINAL_PROMPT=0`` propagates the env var across
# sudo's default ``env_reset`` boundary; a bare ``sudo git`` would
# see an empty env on most distros' default sudoers, so the
# step-level ``env:`` block's GIT_TERMINAL_PROMPT=0 would NOT
# actually reach git child processes. Without it, git falls back
# to opening ``/dev/tty`` (the PTY allocated by act_runner) and
# prompting for credentials on a 401, hanging until timeout.
git_auth() {
if [[ -n "$AUTH_HEADER" ]]; then
sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@"
@ -227,9 +216,8 @@ jobs:
sudo env GIT_TERMINAL_PROMPT=0 git "$@"
fi
}
REPO_PATH='/qoherent/icc-demo.git'
DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-demo/9f87fa9fe2badd314ad81379064e236ea494e89d'
DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-demo/main'
sudo mkdir -p "$(dirname "$DEST_ROOT")"
if ! command -v git-lfs >/dev/null 2>&1; then
sudo apt-get update -y
@ -245,28 +233,13 @@ jobs:
sudo mkdir -p "$DEST_ROOT"
sudo git -C "$DEST_ROOT" init || continue
sudo git -C "$DEST_ROOT" remote add origin "$REPO_URL" || continue
# See ``_render_model_checkout`` for the rationale on skipping
# ``git lfs install --local`` — short version: the smudge
# filter it would register tries its own credential lookup
# during ``git checkout FETCH_HEAD`` and hangs forever on
# /dev/tty when the repo is internal/private. We rely on
# the explicit ``git lfs fetch`` (with auth) +
# ``git lfs checkout`` (local) pair below instead.
sudo git -C "$DEST_ROOT" sparse-checkout init --no-cone || continue
sudo git -C "$DEST_ROOT" sparse-checkout set --no-cone -- \
'icc_canary_2026_05_28-v1.0.0.h5' || continue
if ! git_auth -C "$DEST_ROOT" fetch --depth=1 origin '9f87fa9fe2badd314ad81379064e236ea494e89d'; then
continue
fi
# See ``_render_model_checkout`` for the rationale on
# ``GIT_LFS_SKIP_SMUDGE=1`` — short version: the runner has
# the LFS smudge filter installed system-wide
# (``/etc/gitconfig``), so checkout fires it and the filter's
# credential helper hangs on /dev/tty for internal repos.
# Skipping smudge here lets the explicit ``git lfs fetch``
# below handle materialization with proper auth.
if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 \
git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then
if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then
continue
fi
if ! git_auth -C "$DEST_ROOT" lfs fetch origin --include='icc_canary_2026_05_28-v1.0.0.h5' --exclude=""; then
@ -292,7 +265,7 @@ jobs:
if [[ "$MATERIALIZED" -ne 1 ]]; then
echo "Failed to materialize dataset using base URL candidates derived from: $BASE_URL_SOURCE" >&2
if [[ -z "$AUTH_HEADER" ]]; then
echo " (set QMBDEMO_USER+QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2
echo " (no credentials configured: set QMBDEMO_USER and QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2
fi
exit 1
fi
@ -321,7 +294,7 @@ jobs:
# `--device cpu` from the Train step actually takes effect.
# No-op if the line already uses args.device (idempotent).
if [[ -f main_finetune.py ]]; then
sed -i 's|torch\.amp\.GradScaler(device="cuda")|torch.cuda.amp.GradScaler(enabled=(args.device != "cpu"))|' main_finetune.py
sed -i 's|torch\.amp\.GradScaler(device="cuda")|torch.amp.GradScaler(device=args.device, enabled=(args.device != "cpu"))|' main_finetune.py
echo "Patched main_finetune.py GradScaler for CPU/GPU device parity."
fi
@ -358,12 +331,6 @@ jobs:
# only when the repo is genuinely installable (has setup.py /
# setup.cfg, or pyproject.toml with [build-system]).
cd "$WAVESFM_REPO_DIR"
# FAST-PATH: install the CPU-only torch build from pytorch.org/whl/cpu FIRST (~200MB).
# With torch==X already satisfied, the requirements.txt install does not
# pull the 755MB manylinux wheel with bundled CUDA from the default PyPI index.
$PIP install --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple "numpy<2" "torch==2.2.2" torchvision
# Also ensure numpy<2 is pinned for the requirements.txt install below
$PIP install --upgrade --force-reinstall "numpy<2"
INSTALLED_SOMETHING=0
if [[ -f requirements.txt ]]; then
$PIP install -r requirements.txt
@ -380,10 +347,6 @@ jobs:
exit 1
fi
$PIP install h5py scipy
# After requirements.txt, force numpy back to <2 (torch 2.2.2 has
# NumPy 1.x ABI; transitive deps in requirements.txt would
# otherwise leave numpy 2.x in place and crash at runtime).
$PIP install --upgrade --force-reinstall "numpy<2"
TORCH_INDEX_URL="https://download.pytorch.org/whl/cpu"
TORCH_REASON="no NVIDIA GPU detected"
if command -v nvidia-smi &> /dev/null; then
@ -403,8 +366,7 @@ jobs:
fi
fi
echo "Installing PyTorch from ${TORCH_INDEX_URL} (${TORCH_REASON})."
# torch was pre-installed at the top of this step; no force-reinstall needed.
echo "Skipping torch force-reinstall (already installed at step head): $TORCH_INDEX_URL"
$PIP install --index-url "$TORCH_INDEX_URL" --upgrade --force-reinstall torch torchvision
- name: Find and adapt dataset
shell: bash
@ -500,4 +462,4 @@ jobs:
${{ env.WAVESFM_OUTPUT_DIR }}/best.pth
${{ env.WAVESFM_OUTPUT_DIR }}/log.txt
if-no-files-found: warn
# committed at 2026-05-28T06:39:26.910514+00:00
# committed at 2026-05-28T10:47:45.318818+00:00