From ceb2c3fc569c01170bbccbe9cdcc18cbc85c7081 Mon Sep 17 00:00:00 2001 From: ash Date: Thu, 28 May 2026 06:47:45 -0400 Subject: [PATCH] Training run - 2026-05-28 06:47:45 --- .riahub/train_configs/model/model.yaml | 5 +++ .riahub/workflows/train.yaml | 60 +++++--------------------- 2 files changed, 16 insertions(+), 49 deletions(-) create mode 100644 .riahub/train_configs/model/model.yaml diff --git a/.riahub/train_configs/model/model.yaml b/.riahub/train_configs/model/model.yaml new file mode 100644 index 0000000..4183b59 --- /dev/null +++ b/.riahub/train_configs/model/model.yaml @@ -0,0 +1,5 @@ +source: wavesfm +name: WavesFM Linear Probe +task: rml +epochs: 10 +batch_size: 2048 diff --git a/.riahub/workflows/train.yaml b/.riahub/workflows/train.yaml index af8db83..e1982d1 100644 --- a/.riahub/workflows/train.yaml +++ b/.riahub/workflows/train.yaml @@ -15,11 +15,11 @@ permissions: jobs: WavesFM-Training: - runs-on: "ubuntu-latest" + runs-on: "ubuntu-24.04" env: WAVESFM_TASK: "rml" - WAVESFM_EPOCHS: "3" - WAVESFM_BATCH_SIZE: "16" + WAVESFM_EPOCHS: "10" + WAVESFM_BATCH_SIZE: "2048" WAVESFM_OUTPUT_DIR: "/opt/wavesfm/output" # Single source of truth for the cloned WavesFM repo location. # Referenced as ${{ env.WAVESFM_REPO_DIR }} in steps. To relocate @@ -27,7 +27,6 @@ jobs: # downstream step uses the env var, no hard-coded paths. WAVESFM_REPO_DIR: "/opt/wavesfm/repo" WAVESFM_ADAPTED_DATA: "/opt/wavesfm/adapted_data.h5" - RIAHUB_BASE_URL: "http://192.168.0.170:3000" steps: - name: Display basic runner info run: | @@ -42,7 +41,6 @@ jobs: echo "No NVIDIA GPU available." fi - - name: "Download Model (qoherent/wavesfm-base/wavesfm-v1p0.pth)" shell: bash timeout-minutes: 4 @@ -179,8 +177,7 @@ jobs: fi exit 1 fi - - - name: "Checkout Dataset (qoherent/icc-demo/icc_canary_2026_05_28-v1.0.0.h5)" + - name: Checkout Training Dataset shell: bash timeout-minutes: 10 env: @@ -210,16 +207,8 @@ jobs: AUTH_HEADER="" if [[ -n "${RIAHUB_USER:-}" && -n "${RIAHUB_TOKEN:-}" ]]; then - AUTH_HEADER=$(printf 'Authorization: basic %s' \ - "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')") + AUTH_HEADER=$(printf 'Authorization: basic %s' "$(printf '%s:%s' "$RIAHUB_USER" "$RIAHUB_TOKEN" | base64 | tr -d '\n')") fi - # ``sudo env GIT_TERMINAL_PROMPT=0`` propagates the env var across - # sudo's default ``env_reset`` boundary; a bare ``sudo git`` would - # see an empty env on most distros' default sudoers, so the - # step-level ``env:`` block's GIT_TERMINAL_PROMPT=0 would NOT - # actually reach git child processes. Without it, git falls back - # to opening ``/dev/tty`` (the PTY allocated by act_runner) and - # prompting for credentials on a 401, hanging until timeout. git_auth() { if [[ -n "$AUTH_HEADER" ]]; then sudo env GIT_TERMINAL_PROMPT=0 git -c "http.extraheader=$AUTH_HEADER" "$@" @@ -227,9 +216,8 @@ jobs: sudo env GIT_TERMINAL_PROMPT=0 git "$@" fi } - REPO_PATH='/qoherent/icc-demo.git' - DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-demo/9f87fa9fe2badd314ad81379064e236ea494e89d' + DEST_ROOT='/opt/qmb/riahub/dataset/qoherent/icc-demo/main' sudo mkdir -p "$(dirname "$DEST_ROOT")" if ! command -v git-lfs >/dev/null 2>&1; then sudo apt-get update -y @@ -245,28 +233,13 @@ jobs: sudo mkdir -p "$DEST_ROOT" sudo git -C "$DEST_ROOT" init || continue sudo git -C "$DEST_ROOT" remote add origin "$REPO_URL" || continue - # See ``_render_model_checkout`` for the rationale on skipping - # ``git lfs install --local`` — short version: the smudge - # filter it would register tries its own credential lookup - # during ``git checkout FETCH_HEAD`` and hangs forever on - # /dev/tty when the repo is internal/private. We rely on - # the explicit ``git lfs fetch`` (with auth) + - # ``git lfs checkout`` (local) pair below instead. sudo git -C "$DEST_ROOT" sparse-checkout init --no-cone || continue sudo git -C "$DEST_ROOT" sparse-checkout set --no-cone -- \ 'icc_canary_2026_05_28-v1.0.0.h5' || continue if ! git_auth -C "$DEST_ROOT" fetch --depth=1 origin '9f87fa9fe2badd314ad81379064e236ea494e89d'; then continue fi - # See ``_render_model_checkout`` for the rationale on - # ``GIT_LFS_SKIP_SMUDGE=1`` — short version: the runner has - # the LFS smudge filter installed system-wide - # (``/etc/gitconfig``), so checkout fires it and the filter's - # credential helper hangs on /dev/tty for internal repos. - # Skipping smudge here lets the explicit ``git lfs fetch`` - # below handle materialization with proper auth. - if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 \ - git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then + if ! sudo env GIT_TERMINAL_PROMPT=0 GIT_LFS_SKIP_SMUDGE=1 git -C "$DEST_ROOT" -c advice.detachedHead=false checkout FETCH_HEAD; then continue fi if ! git_auth -C "$DEST_ROOT" lfs fetch origin --include='icc_canary_2026_05_28-v1.0.0.h5' --exclude=""; then @@ -292,7 +265,7 @@ jobs: if [[ "$MATERIALIZED" -ne 1 ]]; then echo "Failed to materialize dataset using base URL candidates derived from: $BASE_URL_SOURCE" >&2 if [[ -z "$AUTH_HEADER" ]]; then - echo " (set QMBDEMO_USER+QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2 + echo " (no credentials configured: set QMBDEMO_USER and QMBDEMO_TOKEN repo secrets for internal/private repos)" >&2 fi exit 1 fi @@ -321,7 +294,7 @@ jobs: # `--device cpu` from the Train step actually takes effect. # No-op if the line already uses args.device (idempotent). if [[ -f main_finetune.py ]]; then - sed -i 's|torch\.amp\.GradScaler(device="cuda")|torch.cuda.amp.GradScaler(enabled=(args.device != "cpu"))|' main_finetune.py + sed -i 's|torch\.amp\.GradScaler(device="cuda")|torch.amp.GradScaler(device=args.device, enabled=(args.device != "cpu"))|' main_finetune.py echo "Patched main_finetune.py GradScaler for CPU/GPU device parity." fi @@ -358,12 +331,6 @@ jobs: # only when the repo is genuinely installable (has setup.py / # setup.cfg, or pyproject.toml with [build-system]). cd "$WAVESFM_REPO_DIR" - # FAST-PATH: install CPU torch from pytorch.org/whl/cpu FIRST (~200MB). - # This makes torch==X already-satisfied so requirements.txt does not - # pull the 755MB manylinux wheel with bundled CUDA from PyPI default. - $PIP install --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple "numpy<2" "torch==2.2.2" torchvision - # Also ensure numpy<2 is pinned for the requirements.txt install below - $PIP install --upgrade --force-reinstall "numpy<2" INSTALLED_SOMETHING=0 if [[ -f requirements.txt ]]; then $PIP install -r requirements.txt @@ -380,10 +347,6 @@ jobs: exit 1 fi $PIP install h5py scipy - # After requirements.txt, force numpy back to <2 (torch 2.2.2 has - # NumPy 1.x ABI; transitive deps in requirements.txt would - # otherwise leave numpy 2.x in place and crash at runtime). - $PIP install --upgrade --force-reinstall "numpy<2" TORCH_INDEX_URL="https://download.pytorch.org/whl/cpu" TORCH_REASON="no NVIDIA GPU detected" if command -v nvidia-smi &> /dev/null; then @@ -403,8 +366,7 @@ jobs: fi fi echo "Installing PyTorch from ${TORCH_INDEX_URL} (${TORCH_REASON})." - # torch was pre-installed at the top of this step; no force-reinstall needed. - echo "Skipping torch force-reinstall (already installed at step head): $TORCH_INDEX_URL" + $PIP install --index-url "$TORCH_INDEX_URL" --upgrade --force-reinstall torch torchvision - name: Find and adapt dataset shell: bash @@ -500,4 +462,4 @@ jobs: ${{ env.WAVESFM_OUTPUT_DIR }}/best.pth ${{ env.WAVESFM_OUTPUT_DIR }}/log.txt if-no-files-found: warn -# committed at 2026-05-28T06:39:26.910514+00:00 +# committed at 2026-05-28T10:47:45.318818+00:00