Training run - 2026-05-28 06:47:45
Some checks failed
WavesFM Fine-Tuning / WavesFM-Training (push) Failing after 42s
Some checks failed
WavesFM Fine-Tuning / WavesFM-Training (push) Failing after 42s
This commit is contained in:
parent
9cb3f35225
commit
ceb2c3fc56
5
.riahub/train_configs/model/model.yaml
Normal file
5
.riahub/train_configs/model/model.yaml
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
source: wavesfm
|
||||
name: WavesFM Linear Probe
|
||||
task: rml
|
||||
epochs: 10
|
||||
batch_size: 2048
|
||||
|
|
@ -15,11 +15,11 @@ permissions:
|
|||
|
||||
jobs:
|
||||
WavesFM-Training:
|
||||
runs-on: "ubuntu-latest"
|
||||
runs-on: "ubuntu-24.04"
|
||||
env:
|
||||
WAVESFM_TASK: "rml"
|
||||
WAVESFM_EPOCHS: "3"
|
||||
WAVESFM_BATCH_SIZE: "16"
|
||||
WAVESFM_EPOCHS: "10"
|
||||
WAVESFM_BATCH_SIZE: "2048"
|
||||
WAVESFM_OUTPUT_DIR: "/opt/wavesfm/output"
|
||||
# Single source of truth for the cloned WavesFM repo location.
|
||||
# Referenced as ${{ env.WAVESFM_REPO_DIR }} in steps. To relocate
|
||||
|
|
@ -27,7 +27,6 @@ jobs:
|
|||
# downstream step uses the env var, no hard-coded paths.
|
||||
WAVESFM_REPO_DIR: "/opt/wavesfm/repo"
|
||||
WAVESFM_ADAPTED_DATA: "/opt/wavesfm/adapted_data.h5"
|
||||
RIAHUB_BASE_URL: "http://192.168.0.170:3000"
|
||||
steps:
|
||||
- name: Display basic runner info
|
||||
run: |
|
||||
|
|
@ -42,7 +41,6 @@ jobs:
|
|||
echo "No NVIDIA GPU available."
|
||||
fi
|
||||
|
||||
|
||||
- name: "Download Model (qoherent/wavesfm-base/wavesfm-v1p0.pth)"
|
||||
shell: bash
|
||||
timeout-minutes: 4
|
||||
|
|
@ -179,8 +177,7 @@ jobs:
|
|||
fi
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: "Checkout Dataset (qoherent/icc-demo/icc_canary_2026_05_28-v1.0.0.h5)"
|
||||
- name: Checkout Training Dataset
|
||||
shell: bash
|
||||
timeout-minutes: 10
|
||||
env:
|
||||
|
|
@ -210,16 +207,8 @@ jobs:
|
|||
|
||||
AUTH_HEADER=""
|
||||
if [[ -n "${RIAHUB_USER:-}" && -n "${RIAHUB_TOKEN:-}" ]]; then
|
||||
AUTH_HEADER=$(printf 'Authorization: basic %s' \
|
||||
"$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')")
|
||||
AUTH_HEADER=$(printf 'Authorization: basic %s' "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')")
|
||||
fi
|
||||
# ``sudo env GIT_TERMINAL_PROMPT=0`` propagates the env var across
|
||||
# sudo's default ``env_reset`` boundary; a bare ``sudo git`` would
|
||||
# see an empty env on most distros' default sudoers, so the
|
||||
# step-level ``env:`` block's GIT_TERMINAL_PROMPT=0 would NOT
|
||||
# actually reach git child processes. Without it, git falls back
|
||||
# to opening ``/dev/tty`` (the PTY allocated by act_runner) and
|
||||
# prompting for credentials on a 401, hanging until timeout.
|
||||
git_auth() {
|
||||
if [[ -n "$AUTH_HEADER" ]]; then
|
||||
sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@"
|
||||
|
|
@ -227,9 +216,8 @@ jobs:
|
|||
sudo env GIT_TERMINAL_PROMPT=0 git "$@"
|
||||
fi
|
||||
}
|
||||
|
||||
REPO_PATH='/qoherent/icc-demo.git'
|
||||
DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-demo/9f87fa9fe2badd314ad81379064e236ea494e89d'
|
||||
DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-demo/main'
|
||||
sudo mkdir -p "$(dirname "$DEST_ROOT")"
|
||||
if ! command -v git-lfs >/dev/null 2>&1; then
|
||||
sudo apt-get update -y
|
||||
|
|
@ -245,28 +233,13 @@ jobs:
|
|||
sudo mkdir -p "$DEST_ROOT"
|
||||
sudo git -C "$DEST_ROOT" init || continue
|
||||
sudo git -C "$DEST_ROOT" remote add origin "$REPO_URL" || continue
|
||||
# See ``_render_model_checkout`` for the rationale on skipping
|
||||
# ``git lfs install --local`` — short version: the smudge
|
||||
# filter it would register tries its own credential lookup
|
||||
# during ``git checkout FETCH_HEAD`` and hangs forever on
|
||||
# /dev/tty when the repo is internal/private. We rely on
|
||||
# the explicit ``git lfs fetch`` (with auth) +
|
||||
# ``git lfs checkout`` (local) pair below instead.
|
||||
sudo git -C "$DEST_ROOT" sparse-checkout init --no-cone || continue
|
||||
sudo git -C "$DEST_ROOT" sparse-checkout set --no-cone -- \
|
||||
'icc_canary_2026_05_28-v1.0.0.h5' || continue
|
||||
if ! git_auth -C "$DEST_ROOT" fetch --depth=1 origin '9f87fa9fe2badd314ad81379064e236ea494e89d'; then
|
||||
continue
|
||||
fi
|
||||
# See ``_render_model_checkout`` for the rationale on
|
||||
# ``GIT_LFS_SKIP_SMUDGE=1`` — short version: the runner has
|
||||
# the LFS smudge filter installed system-wide
|
||||
# (``/etc/gitconfig``), so checkout fires it and the filter's
|
||||
# credential helper hangs on /dev/tty for internal repos.
|
||||
# Skipping smudge here lets the explicit ``git lfs fetch``
|
||||
# below handle materialization with proper auth.
|
||||
if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 \
|
||||
git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then
|
||||
if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then
|
||||
continue
|
||||
fi
|
||||
if ! git_auth -C "$DEST_ROOT" lfs fetch origin --include='icc_canary_2026_05_28-v1.0.0.h5' --exclude=""; then
|
||||
|
|
@ -292,7 +265,7 @@ jobs:
|
|||
if [[ "$MATERIALIZED" -ne 1 ]]; then
|
||||
echo "Failed to materialize dataset using base URL candidates derived from: $BASE_URL_SOURCE" >&2
|
||||
if [[ -z "$AUTH_HEADER" ]]; then
|
||||
echo " (set QMBDEMO_USER+QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2
|
||||
echo " (no credentials configured: set QMBDEMO_USER and QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2
|
||||
fi
|
||||
exit 1
|
||||
fi
|
||||
|
|
@ -321,7 +294,7 @@ jobs:
|
|||
# `--device cpu` from the Train step actually takes effect.
|
||||
# No-op if the line already uses args.device (idempotent).
|
||||
if [[ -f main_finetune.py ]]; then
|
||||
sed -i 's|torch\.amp\.GradScaler(device="cuda")|torch.cuda.amp.GradScaler(enabled=(args.device != "cpu"))|' main_finetune.py
|
||||
sed -i 's|torch\.amp\.GradScaler(device="cuda")|torch.amp.GradScaler(device=args.device, enabled=(args.device != "cpu"))|' main_finetune.py
|
||||
echo "Patched main_finetune.py GradScaler for CPU/GPU device parity."
|
||||
fi
|
||||
|
||||
|
|
@ -358,12 +331,6 @@ jobs:
|
|||
# only when the repo is genuinely installable (has setup.py /
|
||||
# setup.cfg, or pyproject.toml with [build-system]).
|
||||
cd "$WAVESFM_REPO_DIR"
|
||||
# FAST-PATH: install CPU torch from pytorch.org/whl/cpu FIRST (~200MB).
|
||||
# This makes torch==X already-satisfied so requirements.txt does not
|
||||
# pull the 755MB manylinux wheel with bundled CUDA from PyPI default.
|
||||
$PIP install --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple "numpy<2" "torch==2.2.2" torchvision
|
||||
# Also ensure numpy<2 is pinned for the requirements.txt install below
|
||||
$PIP install --upgrade --force-reinstall "numpy<2"
|
||||
INSTALLED_SOMETHING=0
|
||||
if [[ -f requirements.txt ]]; then
|
||||
$PIP install -r requirements.txt
|
||||
|
|
@ -380,10 +347,6 @@ jobs:
|
|||
exit 1
|
||||
fi
|
||||
$PIP install h5py scipy
|
||||
# After requirements.txt, force numpy back to <2 (torch 2.2.2 has
|
||||
# NumPy 1.x ABI; transitive deps in requirements.txt would
|
||||
# otherwise leave numpy 2.x in place and crash at runtime).
|
||||
$PIP install --upgrade --force-reinstall "numpy<2"
|
||||
TORCH_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
||||
TORCH_REASON="no NVIDIA GPU detected"
|
||||
if command -v nvidia-smi &> /dev/null; then
|
||||
|
|
@ -403,8 +366,7 @@ jobs:
|
|||
fi
|
||||
fi
|
||||
echo "Installing PyTorch from ${TORCH_INDEX_URL} (${TORCH_REASON})."
|
||||
# torch was pre-installed at the top of this step; no force-reinstall needed.
|
||||
echo "Skipping torch force-reinstall (already installed at step head): $TORCH_INDEX_URL"
|
||||
$PIP install --index-url "$TORCH_INDEX_URL" --upgrade --force-reinstall torch torchvision
|
||||
|
||||
- name: Find and adapt dataset
|
||||
shell: bash
|
||||
|
|
@ -500,4 +462,4 @@ jobs:
|
|||
${{ env.WAVESFM_OUTPUT_DIR }}/best.pth
|
||||
${{ env.WAVESFM_OUTPUT_DIR }}/log.txt
|
||||
if-no-files-found: warn
|
||||
# committed at 2026-05-28T06:39:26.910514+00:00
|
||||
# committed at 2026-05-28T10:47:45.318818+00:00
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user