"""ria upload — stream large files to a RIA Hub Project via the LFS API. How it works ------------ 1. The file is hashed locally (SHA-256 + size) — this is the LFS object ID. 2. A single POST to the repo's LFS batch endpoint returns an upload URL (and headers) for any object the server does not already have. 3. The file is streamed to that URL in fixed-size chunks — nothing is ever fully loaded into memory, so files of any size work. 4. A commit is created via the Gitea contents API that records the LFS pointer (a small text file) so the file appears in the repo tree. No server-side changes are required — this uses the same authenticated LFS protocol that `git lfs push` uses internally. """ import base64 import hashlib import http.client import json import math import os import sys import urllib.error import urllib.parse import urllib.request import click # Read buffer for hashing and streaming — 8 MB keeps memory use flat # for arbitrarily large files. _CHUNK = 8 * 1024 * 1024 LFS_MEDIA_TYPE = "application/vnd.git-lfs+json" # --------------------------------------------------------------------------- # Credential helpers (reused from setup_repo pattern) # --------------------------------------------------------------------------- def _get_stored_credentials(hub_url: str) -> tuple[str | None, str | None]: import subprocess parsed = urllib.parse.urlparse(hub_url) payload = f"protocol={parsed.scheme}\nhost={parsed.netloc}\n\n" try: result = subprocess.run( ["git", "credential", "fill"], input=payload, capture_output=True, text=True, timeout=5, ) creds = {} for line in result.stdout.splitlines(): k, sep, v = line.partition("=") if sep: creds[k.strip()] = v return creds.get("username"), creds.get("password") except Exception: return None, None def _resolve_credentials(hub: str) -> tuple[str, str]: username, password = _get_stored_credentials(hub) if username and password: return username, password click.echo(f"No stored credentials found for {hub}.") username = click.prompt("RIA Hub username") password = click.prompt("Password / personal access token", hide_input=True) return username, password def _basic_auth(username: str, password: str) -> str: return "Basic " + base64.b64encode(f"{username}:{password}".encode()).decode() # --------------------------------------------------------------------------- # File helpers # --------------------------------------------------------------------------- def _hash_file(path: str) -> tuple[str, int]: """Return (sha256_hex, byte_size) by streaming the file.""" h = hashlib.sha256() size = 0 with open(path, "rb") as f: while True: chunk = f.read(_CHUNK) if not chunk: break h.update(chunk) size += len(chunk) return h.hexdigest(), size def _lfs_pointer_text(oid: str, size: int) -> str: return f"version https://git-lfs.github.com/spec/v1\noid sha256:{oid}\nsize {size}\n" # --------------------------------------------------------------------------- # LFS batch API # --------------------------------------------------------------------------- def _lfs_batch( hub: str, owner: str, repo: str, objects: list[dict], username: str, password: str, ) -> dict: """ POST to /{owner}/{repo}.git/info/lfs/objects/batch. Returns the parsed JSON response. Raises on HTTP error or JSON decode failure. """ url = ( f"{hub.rstrip('/')}" f"/{urllib.parse.quote(owner, safe='')}" f"/{urllib.parse.quote(repo, safe='')}" f".git/info/lfs/objects/batch" ) body = json.dumps( { "operation": "upload", "transfers": ["basic"], "objects": objects, } ).encode() req = urllib.request.Request(url, data=body, method="POST") req.add_header("Content-Type", LFS_MEDIA_TYPE) req.add_header("Accept", LFS_MEDIA_TYPE) req.add_header("Authorization", _basic_auth(username, password)) try: with urllib.request.urlopen(req, timeout=30) as resp: return json.loads(resp.read()) except urllib.error.HTTPError as e: body_text = e.read().decode(errors="replace") raise RuntimeError(f"LFS batch request failed (HTTP {e.code}): {body_text}") from e # --------------------------------------------------------------------------- # Streaming PUT upload # --------------------------------------------------------------------------- def _stream_upload(href: str, headers: dict, file_path: str, size: int) -> None: """ PUT file_path content to href, streaming in _CHUNK-sized pieces. Uses http.client directly so Content-Length is set without buffering the whole file in memory. Works for files of any size. """ parsed = urllib.parse.urlparse(href) host = parsed.netloc path = parsed.path if parsed.query: path += "?" + parsed.query if parsed.scheme == "https": conn = http.client.HTTPSConnection(host, timeout=300) else: conn = http.client.HTTPConnection(host, timeout=300) all_headers = dict(headers or {}) all_headers.setdefault("Content-Type", "application/octet-stream") all_headers["Content-Length"] = str(size) try: conn.connect() conn.putrequest("PUT", path) for k, v in all_headers.items(): conn.putheader(k, v) conn.endheaders() with open(file_path, "rb") as f: while True: chunk = f.read(_CHUNK) if not chunk: break conn.send(chunk) resp = conn.getresponse() resp.read() # drain if resp.status not in (200, 201): raise RuntimeError(f"LFS object upload failed: HTTP {resp.status}") finally: conn.close() # --------------------------------------------------------------------------- # Gitea contents API — create / update a file to record the LFS pointer # --------------------------------------------------------------------------- def _get_file_sha( hub: str, owner: str, repo: str, path: str, branch: str, username: str, password: str, ) -> str | None: """Return the blob SHA of an existing file, or None if it doesn't exist.""" url = ( f"{hub.rstrip('/')}/api/v1" f"/repos/{urllib.parse.quote(owner, safe='')}/{urllib.parse.quote(repo, safe='')}" f"/contents/{urllib.parse.quote(path)}" f"?ref={urllib.parse.quote(branch)}" ) req = urllib.request.Request(url) req.add_header("Authorization", _basic_auth(username, password)) try: with urllib.request.urlopen(req, timeout=15) as resp: return json.loads(resp.read()).get("sha") except urllib.error.HTTPError as e: if e.code == 404: return None raise def _commit_lfs_pointer( hub: str, owner: str, repo: str, remote_path: str, pointer_text: str, branch: str, message: str, username: str, password: str, ) -> None: """Create or update a file in the repo containing the LFS pointer.""" url = ( f"{hub.rstrip('/')}/api/v1" f"/repos/{urllib.parse.quote(owner, safe='')}/{urllib.parse.quote(repo, safe='')}" f"/contents/{urllib.parse.quote(remote_path)}" ) existing_sha = _get_file_sha(hub, owner, repo, remote_path, branch, username, password) body: dict = { "message": message, "content": base64.b64encode(pointer_text.encode()).decode(), "branch": branch, } if existing_sha: body["sha"] = existing_sha method = "PUT" if existing_sha else "POST" req = urllib.request.Request(url, data=json.dumps(body).encode(), method=method) req.add_header("Content-Type", "application/json") req.add_header("Authorization", _basic_auth(username, password)) try: with urllib.request.urlopen(req, timeout=30) as resp: resp.read() except urllib.error.HTTPError as e: body_text = e.read().decode(errors="replace") raise RuntimeError(f"Failed to commit LFS pointer for '{remote_path}' (HTTP {e.code}): {body_text}") from e # --------------------------------------------------------------------------- # Per-file upload logic # --------------------------------------------------------------------------- def _upload_single_file( hub: str, owner: str, repo_name: str, username: str, password: str, file_path: str, remote_dir: str, message: str | None, branch: str, ) -> None: """Hash, upload (if needed), and commit the LFS pointer for one file.""" filename = os.path.basename(file_path) file_size = os.path.getsize(file_path) size_mb = file_size / (1024 * 1024) click.echo(f"\n {filename} ({size_mb:.1f} MB)") click.echo(" Hashing...", nl=False) oid, size = _hash_file(file_path) click.echo(f" sha256:{oid[:12]}...") try: batch = _lfs_batch(hub, owner, repo_name, [{"oid": oid, "size": size}], username, password) except RuntimeError as e: click.echo(f"\n Error: {e}", err=True) sys.exit(1) objects = batch.get("objects", []) if not objects: click.echo(" Already in LFS — skipping upload.") else: obj = objects[0] if "error" in obj: err_msg = obj["error"].get("message", "unknown error") err_code = obj["error"].get("code", 0) if err_code == 413 or "quota" in err_msg.lower() or "limit" in err_msg.lower(): click.echo( f"\n Error: storage quota exceeded for this repo.\n Server: {err_msg}", err=True, ) else: click.echo(f"\n Error from server: {err_msg}", err=True) sys.exit(1) upload_action = obj.get("actions", {}).get("upload") if not upload_action: click.echo(" Already in LFS — skipping upload.") else: href = upload_action["href"] up_headers = upload_action.get("header", {}) chunks = math.ceil(size / _CHUNK) click.echo(f" Uploading ({size_mb:.1f} MB, {chunks} chunk{'s' if chunks != 1 else ''})...") try: _stream_upload_progress(href, up_headers, file_path, size) except RuntimeError as e: click.echo(f"\n Upload failed: {e}", err=True) sys.exit(1) click.echo(" Upload complete.") verify_action = obj.get("actions", {}).get("verify") if verify_action: try: vreq = urllib.request.Request( verify_action["href"], data=json.dumps({"oid": oid, "size": size}).encode(), method="POST", ) vreq.add_header("Content-Type", LFS_MEDIA_TYPE) vreq.add_header("Accept", LFS_MEDIA_TYPE) for k, v in verify_action.get("header", {}).items(): vreq.add_header(k, v) with urllib.request.urlopen(vreq, timeout=15): pass except Exception: pass # verify is optional; non-fatal on failure pointer = _lfs_pointer_text(oid, size) remote_path = (f"{remote_dir.rstrip('/')}/{filename}").lstrip("/") if remote_dir else filename commit_msg = message or f"chore: upload {filename} via ria" click.echo(f" Committing pointer → {remote_path}...", nl=False) try: _commit_lfs_pointer(hub, owner, repo_name, remote_path, pointer, branch, commit_msg, username, password) click.echo(" done.") except RuntimeError as e: click.echo(f"\n Error: {e}", err=True) sys.exit(1) def _stream_upload_progress(href: str, headers: dict, file_path: str, size: int) -> None: """Stream file_path to href with a click progress bar.""" parsed = urllib.parse.urlparse(href) host = parsed.netloc path_q = parsed.path + (f"?{parsed.query}" if parsed.query else "") if parsed.scheme == "https": conn = http.client.HTTPSConnection(host, timeout=300) else: conn = http.client.HTTPConnection(host, timeout=300) all_headers = dict(headers) all_headers.setdefault("Content-Type", "application/octet-stream") all_headers["Content-Length"] = str(size) with click.progressbar( length=size, label=" ", width=40, show_eta=True, show_percent=True, fill_char="█", empty_char="░", ) as bar: conn.connect() conn.putrequest("PUT", path_q) for k, v in all_headers.items(): conn.putheader(k, v) conn.endheaders() with open(file_path, "rb") as f: while True: chunk = f.read(_CHUNK) if not chunk: break conn.send(chunk) bar.update(len(chunk)) resp = conn.getresponse() resp.read() conn.close() if resp.status not in (200, 201): raise RuntimeError(f"HTTP {resp.status}") # --------------------------------------------------------------------------- # Command # --------------------------------------------------------------------------- @click.command("upload") @click.argument("files", nargs=-1, required=True) @click.option( "--repo", required=True, metavar="OWNER/NAME", help="Target repository on RIA Hub (e.g. benchinnery/my-dataset)." ) @click.option("--hub", default="http://localhost:3000", show_default=True, metavar="URL", help="RIA Hub base URL.") @click.option("--branch", default="main", show_default=True, help="Branch to commit the files to.") @click.option( "--path", "remote_dir", default="", metavar="DIR", help="Remote directory path inside the repo (default: repo root).", ) @click.option("--message", "-m", default=None, help="Commit message (default: 'chore: upload via ria').") def upload( files: tuple[str], repo: str, hub: str, branch: str, remote_dir: str, message: str | None, ) -> None: """Upload large files to a RIA Hub Project via Git LFS. Files are streamed directly to the repo's LFS object store — nothing is buffered into memory, so files of any size work. Each file creates one commit recording the LFS pointer. \b Examples: ria upload recording.sigmf-data --repo benchinnery/my-recordings ria upload *.npy --repo benchinnery/my-recordings --branch main ria upload big.pt --repo benchinnery/models --path weights/ """ # Validate repo argument if "/" not in repo: click.echo("Error: --repo must be in the form OWNER/NAME.", err=True) sys.exit(1) owner, repo_name = repo.split("/", 1) # Expand and validate files resolved = [] for pattern in files: if not os.path.isfile(pattern): click.echo(f"Error: '{pattern}' is not a file or does not exist.", err=True) sys.exit(1) resolved.append(os.path.abspath(pattern)) # Credentials hub = hub.rstrip("/") username, password = _resolve_credentials(hub) click.echo(f"Uploading {len(resolved)} file(s) to {owner}/{repo_name} on {hub}...") for file_path in resolved: _upload_single_file(hub, owner, repo_name, username, password, file_path, remote_dir, message, branch) click.echo(f"\nAll done. {len(resolved)} file(s) uploaded to {owner}/{repo_name}.")