diff --git a/.riahub/workflows/train.yaml b/.riahub/workflows/train.yaml index e1982d1..ce5bada 100644 --- a/.riahub/workflows/train.yaml +++ b/.riahub/workflows/train.yaml @@ -15,7 +15,7 @@ permissions: jobs: WavesFM-Training: - runs-on: "ubuntu-24.04" + runs-on: "ubuntu-latest-2080" env: WAVESFM_TASK: "rml" WAVESFM_EPOCHS: "10" @@ -27,6 +27,9 @@ jobs: # downstream step uses the env var, no hard-coded paths. WAVESFM_REPO_DIR: "/opt/wavesfm/repo" WAVESFM_ADAPTED_DATA: "/opt/wavesfm/adapted_data.h5" + # Override model-download base URL to internal LAN IP + # (external riahub.ai LFS endpoint is unreachable from runners). + RIAHUB_BASE_URL: "http://192.168.0.170:3000" steps: - name: Display basic runner info run: | @@ -294,7 +297,7 @@ jobs: # `--device cpu` from the Train step actually takes effect. # No-op if the line already uses args.device (idempotent). if [[ -f main_finetune.py ]]; then - sed -i 's|torch\.amp\.GradScaler(device="cuda")|torch.amp.GradScaler(device=args.device, enabled=(args.device != "cpu"))|' main_finetune.py + sed -i 's|torch\.amp\.GradScaler(device="cuda")|torch.cuda.amp.GradScaler(enabled=(args.device != "cpu"))|' main_finetune.py echo "Patched main_finetune.py GradScaler for CPU/GPU device parity." fi @@ -331,6 +334,10 @@ jobs: # only when the repo is genuinely installable (has setup.py / # setup.cfg, or pyproject.toml with [build-system]). cd "$WAVESFM_REPO_DIR" + # Pre-install CPU-only torch 2.2.2 and numpy<2 so the requirements.txt install below finds them already satisfied (saves ~600MB). + # torch 2.2.2 is built against the NumPy 1.x ABI and crashes if numpy 2.x is installed. 
+ $PIP install --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple "numpy<2" "torch==2.2.2" torchvision + $PIP install --upgrade --force-reinstall "numpy<2" INSTALLED_SOMETHING=0 if [[ -f requirements.txt ]]; then $PIP install -r requirements.txt @@ -347,6 +354,8 @@ jobs: exit 1 fi $PIP install h5py scipy + # Force numpy<2 again (requirements.txt or the h5py/scipy install above may have pulled in numpy 2.x via transitive deps). + $PIP install --upgrade --force-reinstall "numpy<2" TORCH_INDEX_URL="https://download.pytorch.org/whl/cpu" TORCH_REASON="no NVIDIA GPU detected" if command -v nvidia-smi &> /dev/null; then @@ -366,7 +375,8 @@ jobs: fi fi echo "Installing PyTorch from ${TORCH_INDEX_URL} (${TORCH_REASON})." - $PIP install --index-url "$TORCH_INDEX_URL" --upgrade --force-reinstall torch torchvision + # torch is pre-installed from the CPU wheel index at the head of this step; the force-reinstall is disabled to avoid a 755MB redownload. NOTE(review): TORCH_INDEX_URL appears to be only logged from here on, so the CPU build stays installed even when the nvidia-smi branch above selects another index; confirm this is intended for this runner. + echo "Skipping torch force-reinstall ($TORCH_INDEX_URL)" - name: Find and adapt dataset shell: bash @@ -462,4 +472,4 @@ jobs: ${{ env.WAVESFM_OUTPUT_DIR }}/best.pth ${{ env.WAVESFM_OUTPUT_DIR }}/log.txt if-no-files-found: warn -# committed at 2026-05-28T10:47:45.318818+00:00 +# committed at 2026-05-28T11:01:57.936314+00:00