chore: initialize monorepo with project scaffolding, configs, and CI setup

This commit is contained in:
oib
2025-09-27 06:05:25 +02:00
commit c1926136fb
171 changed files with 13708 additions and 0 deletions

27
apps/miner-node/README.md Normal file
View File

@ -0,0 +1,27 @@
# Miner Node
## Purpose & Scope
Worker daemon responsible for executing compute jobs on CPU/GPU hardware, reporting telemetry, and submitting proofs back to the coordinator. See `docs/bootstrap/miner_node.md` for the detailed implementation roadmap.
## Development Setup
- Create a Python virtual environment under `apps/miner-node/.venv`.
- Install dependencies: `httpx`, `pydantic`, `pydantic-settings`, and `psutil` are required; FastAPI is optional and only needed for the health endpoint.
- Implement the package structure described in the bootstrap guide.
## Production Deployment (systemd)
1. Copy the project to `/opt/aitbc/apps/miner-node/` on the target host.
2. Create a virtual environment and install dependencies as needed.
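3. Populate `.env` with the coordinator settings, e.g. `COORDINATOR_BASE_URL` and `AUTH_TOKEN` (variable names match the `MinerSettings` fields and are case-insensitive).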
4. Run the installer script from repo root:
```bash
sudo scripts/ops/install_miner_systemd.sh
```
This installs `configs/systemd/aitbc-miner.service`, reloads systemd, and enables the service.
5. Check status/logs:
```bash
sudo systemctl status aitbc-miner
journalctl -u aitbc-miner -f
```

View File

@ -0,0 +1,30 @@
[tool.poetry]
name = "aitbc-miner-node"
version = "0.1.0"
description = "AITBC miner node daemon"
authors = ["AITBC Team"]
packages = [
{ include = "aitbc_miner", from = "src" }
]
[tool.poetry.dependencies]
python = "^3.11"
httpx = "^0.27.0"
pydantic = "^2.7.0"
# config.py imports `pydantic_settings`, which is distributed separately from pydantic v2.
pydantic-settings = "^2.2.0"
pyyaml = "^6.0.1"
psutil = "^5.9.8"
aiosignal = "^1.3.1"
uvloop = { version = "^0.19.0", optional = true }
# NOTE: the PyPI `asyncio` backport was removed; asyncio is part of the standard library on Python 3.11+.
rich = "^13.7.1"
[tool.poetry.extras]
uvloop = ["uvloop"]
[tool.poetry.group.dev.dependencies]
pytest = "^8.2.0"
pytest-asyncio = "^0.23.0"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

View File

@ -0,0 +1 @@
"""AITBC miner node package."""

View File

@ -0,0 +1 @@
"""Control loop and background tasks for the miner node."""

View File

@ -0,0 +1,127 @@
from __future__ import annotations
import asyncio
import json
from collections.abc import Callable
from typing import Optional
from ..config import settings
from ..logging import get_logger
from ..coordinator import CoordinatorClient
from ..util.probe import collect_capabilities, collect_runtime_metrics
from ..util.backoff import compute_backoff
from ..util.fs import ensure_workspace, write_json
from ..runners import get_runner
logger = get_logger(__name__)
class MinerControlLoop:
    """Register the miner with the coordinator and drive its background loops.

    ``start`` registers the node and spawns two tasks (heartbeat and job
    polling); ``stop`` cancels both and closes the coordinator client.
    """

    def __init__(self) -> None:
        self._tasks: list[asyncio.Task[None]] = []
        self._stop_event = asyncio.Event()
        self._coordinator = CoordinatorClient()
        self._capabilities_snapshot = collect_capabilities(settings.max_concurrent_cpu, settings.max_concurrent_gpu)
        # Mirrors the delay currently used by the poll loop.
        self._current_backoff = settings.poll_interval_seconds

    async def start(self) -> None:
        """Register with the coordinator, then launch the heartbeat and poll tasks.

        Raises:
            Exception: whatever ``_register`` raised; no task is started in that case.
        """
        logger.info("Starting miner control loop", extra={"node_id": settings.node_id})
        await self._register()
        self._tasks.append(asyncio.create_task(self._heartbeat_loop()))
        self._tasks.append(asyncio.create_task(self._poll_loop()))

    async def stop(self) -> None:
        """Signal the loops to stop, cancel their tasks and close the HTTP client."""
        logger.info("Stopping miner control loop")
        self._stop_event.set()
        for task in self._tasks:
            task.cancel()
        # return_exceptions=True keeps the expected CancelledError from propagating.
        await asyncio.gather(*self._tasks, return_exceptions=True)
        await self._coordinator.aclose()

    async def _register(self) -> None:
        """Send the capability snapshot to the coordinator; log and re-raise on failure."""
        payload = {
            "capabilities": self._capabilities_snapshot.capabilities,
            "concurrency": self._capabilities_snapshot.concurrency,
            "region": settings.region,
        }
        try:
            resp = await self._coordinator.register(payload)
            logger.info("Registered miner", extra={"resp": resp})
        except Exception as exc:
            logger.exception("Failed to register miner", exc_info=exc)
            raise

    async def _heartbeat_loop(self) -> None:
        """Post a heartbeat every ``heartbeat_interval_seconds`` until stopped.

        A failed heartbeat is logged and the loop carries on.
        """
        interval = settings.heartbeat_interval_seconds
        while not self._stop_event.is_set():
            payload = {
                "inflight": 0,
                "status": "ONLINE",
                "metadata": collect_runtime_metrics(),
            }
            try:
                await self._coordinator.heartbeat(payload)
                logger.debug("heartbeat sent")
            except Exception as exc:
                logger.warning("heartbeat failed", exc_info=exc)
            await asyncio.sleep(interval)

    def _next_poll_interval(self, current: float) -> float:
        """Return the backed-off poll delay following ``current``, capped at the configured maximum."""
        delay = min(
            compute_backoff(current, 2.0, settings.heartbeat_jitter_pct, settings.max_backoff_seconds),
            settings.max_backoff_seconds,
        )
        self._current_backoff = delay
        return delay

    async def _poll_loop(self) -> None:
        """Poll the coordinator for jobs, backing off while idle or failing.

        The delay grows after every empty or failed poll and drops back to
        ``poll_interval_seconds`` as soon as a job is received.
        """
        interval = settings.poll_interval_seconds
        while not self._stop_event.is_set():
            payload = {"max_wait_seconds": interval}
            try:
                job = await self._coordinator.poll(payload)
                if job:
                    logger.info("received job", extra={"job_id": job.get("job_id")})
                    # Work is flowing again: reset the delay actually used by this
                    # loop, not only the bookkeeping attribute.
                    interval = settings.poll_interval_seconds
                    self._current_backoff = interval
                    await self._handle_job(job)
                else:
                    interval = self._next_poll_interval(interval)
                    logger.debug("no job; next poll interval=%s", interval)
            except Exception as exc:
                logger.warning("poll failed", exc_info=exc)
                interval = self._next_poll_interval(interval)
            await asyncio.sleep(interval)

    async def _report_failure(self, job_id: str, payload: dict) -> None:
        """Submit a failure report; a submission error is logged instead of raised."""
        try:
            await self._coordinator.submit_failure(job_id, payload)
        except Exception as exc:
            logger.warning("failed to submit failure", extra={"job_id": job_id}, exc_info=exc)

    async def _handle_job(self, job: dict) -> None:
        """Run ``job`` with the runner it names and report the outcome.

        The runner kind defaults to ``"noop"`` when the job does not specify one.
        A runner exception is reported as ``RUNTIME_ERROR``; a successful result
        is written to ``result.json`` in the workspace before being submitted.
        """
        job_id = job.get("job_id", "unknown")
        workspace = ensure_workspace(settings.workspace_root, job_id)
        # ``or {}`` tolerates an explicit ``"runner": null`` in the job document.
        runner_kind = (job.get("runner") or {}).get("kind", "noop")
        runner = get_runner(runner_kind)
        try:
            result = await runner.run(job, workspace)
        except Exception as exc:
            logger.exception("runner crashed", extra={"job_id": job_id, "runner": runner_kind})
            await self._report_failure(
                job_id,
                {
                    "error_code": "RUNTIME_ERROR",
                    "error_message": str(exc),
                    "metrics": {},
                },
            )
            return
        if not result.ok:
            await self._report_failure(
                job_id,
                {
                    "error_code": result.output.get("error_code", "FAILED"),
                    "error_message": result.output.get("error_message", "Job failed"),
                    "metrics": result.output.get("metrics", {}),
                },
            )
            return
        write_json(workspace / "result.json", result.output)
        try:
            await self._coordinator.submit_result(
                job_id,
                {
                    "result": result.output,
                    "metrics": {"workspace": str(workspace)},
                },
            )
        except Exception as exc:
            logger.warning("failed to submit result", extra={"job_id": job_id}, exc_info=exc)

View File

@ -0,0 +1,40 @@
from __future__ import annotations
from pathlib import Path
from typing import Optional
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class MinerSettings(BaseSettings):
    """Miner node configuration read from the environment and an optional ``.env`` file."""

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
    )

    # Identity and coordinator access.
    node_id: str = "node-dev-1"
    coordinator_base_url: str = "http://127.0.0.1:8011/v1"
    # NOTE(review): the default looks like a development key; confirm it is overridden in production.
    auth_token: str = "miner_dev_key_1"
    region: Optional[str] = None

    # On-disk locations.
    workspace_root: Path = Path("/var/lib/aitbc/miner/jobs")
    cache_root: Path = Path("/var/lib/aitbc/miner/cache")

    # Heartbeat and polling cadence.
    heartbeat_interval_seconds: int = 15
    heartbeat_jitter_pct: int = 10
    heartbeat_timeout_seconds: int = 60
    poll_interval_seconds: int = 3
    max_backoff_seconds: int = 60

    # Concurrency limits and runner switches.
    max_concurrent_cpu: int = 1
    max_concurrent_gpu: int = 1
    enable_cli_runner: bool = True
    enable_python_runner: bool = True
    allowlist_dir: Path = Path("/etc/aitbc/miner/allowlist.d")

    # Logging.
    log_level: str = "INFO"
    log_path: Optional[Path] = None


# Module-level instance shared by the rest of the package; built at import time.
settings = MinerSettings()

View File

@ -0,0 +1,76 @@
from __future__ import annotations
import asyncio
from typing import Any, Dict, Optional
import httpx
from .config import MinerSettings, settings
from .logging import get_logger
logger = get_logger(__name__)
class CoordinatorClient:
    """Async HTTP client for interacting with the coordinator API."""

    def __init__(self, cfg: MinerSettings | None = None) -> None:
        self.cfg = cfg or settings
        # Created lazily by the ``client`` property.
        self._client: Optional[httpx.AsyncClient] = None

    @property
    def client(self) -> httpx.AsyncClient:
        """Return the shared ``httpx.AsyncClient``, creating it on first use."""
        if self._client is None:
            headers = {
                "Authorization": f"Bearer {self.cfg.auth_token}",
                "User-Agent": f"aitbc-miner/{self.cfg.node_id}",
            }
            timeout = httpx.Timeout(connect=5.0, read=30.0, write=10.0, pool=None)
            self._client = httpx.AsyncClient(
                base_url=self.cfg.coordinator_base_url.rstrip("/"),
                headers=headers,
                timeout=timeout,
            )
        return self._client

    async def aclose(self) -> None:
        """Close the underlying HTTP client if one was created."""
        if self._client:
            await self._client.aclose()
            self._client = None

    async def _post_json(self, path: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """POST ``payload`` as JSON to ``path`` and return the decoded JSON body.

        Raises:
            httpx.HTTPStatusError: for 4xx/5xx responses (via ``raise_for_status``).
        """
        resp = await self.client.post(path, json=payload)
        resp.raise_for_status()
        return resp.json()

    async def register(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Register this miner; returns the coordinator's JSON response."""
        logger.debug("registering miner", extra={"payload": payload})
        return await self._post_json("/miners/register", payload)

    async def heartbeat(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Send a heartbeat; returns the coordinator's JSON response."""
        return await self._post_json("/miners/heartbeat", payload)

    async def poll(self, payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Ask for a job; returns ``None`` when the coordinator answers 204."""
        resp = await self.client.post("/miners/poll", json=payload)
        if resp.status_code == 204:
            logger.debug("no job available")
            return None
        resp.raise_for_status()
        return resp.json()

    async def submit_result(self, job_id: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Submit a successful job result for ``job_id``."""
        return await self._post_json(f"/miners/{job_id}/result", payload)

    async def submit_failure(self, job_id: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Submit a failure report for ``job_id``."""
        return await self._post_json(f"/miners/{job_id}/fail", payload)

    async def __aenter__(self) -> "CoordinatorClient":
        # Touch the property so the HTTP client exists once the context is entered.
        _ = self.client
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        await self.aclose()
async def backoff(base: float, max_seconds: float) -> float:
    """Sleep for ``base`` seconds, then return the next delay.

    The next delay is ``base`` doubled, capped at ``max_seconds``.
    """
    await asyncio.sleep(base)
    doubled = base * 2
    # Same selection rule as min(doubled, max_seconds).
    return max_seconds if max_seconds < doubled else doubled

View File

@ -0,0 +1,25 @@
from __future__ import annotations
import logging
from typing import Optional
from .config import settings
def configure_logging(level: Optional[str] = None, log_path: Optional[str] = None) -> None:
    """Configure the root logger with a stream handler and an optional file handler.

    Args:
        level: Level name (case-insensitive). Falls back to ``settings.log_level``
            when omitted, and to ``INFO`` when the name is not a logging level.
        log_path: Optional file to log to; missing parent directories are created.

    Does nothing when the root logger already has handlers, which is also what
    ``logging.basicConfig`` would do; returning early avoids opening a log file
    that would never be attached.
    """
    from pathlib import Path

    if logging.getLogger().handlers:
        return

    level_name = (level or settings.log_level).upper()
    log_level = getattr(logging, level_name, logging.INFO)
    # Some upper-case attributes of ``logging`` (e.g. BASIC_FORMAT) are not levels.
    if not isinstance(log_level, int):
        log_level = logging.INFO

    handlers: list[logging.Handler] = [logging.StreamHandler()]
    if log_path:
        Path(log_path).parent.mkdir(parents=True, exist_ok=True)
        handlers.append(logging.FileHandler(log_path))
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(name)s :: %(message)s",
        handlers=handlers,
    )
def get_logger(name: str) -> logging.Logger:
    """Return the logger called ``name``, configuring logging first if nothing has yet.

    Logging counts as unconfigured when the root logger has no handlers; in that
    case ``configure_logging`` is invoked with the level and path from ``settings``.
    """
    root_logger = logging.getLogger()
    if root_logger.handlers:
        return logging.getLogger(name)
    file_target = settings.log_path.as_posix() if settings.log_path else None
    configure_logging(settings.log_level, file_target)
    return logging.getLogger(name)

View File

@ -0,0 +1,51 @@
from __future__ import annotations
import asyncio
import signal
from contextlib import asynccontextmanager
from typing import AsyncIterator
from .config import settings
from .logging import get_logger
logger = get_logger(__name__)
class MinerApplication:
    """Top-level application object; at present it only waits until asked to stop."""

    def __init__(self) -> None:
        self._stop_event = asyncio.Event()

    async def start(self) -> None:
        """Log startup and block until ``stop`` has been called."""
        logger.info("Miner node starting", extra={"node_id": settings.node_id})
        # TODO: initialize capability probe, register with coordinator, start heartbeat and poll loops
        stop_requested = self._stop_event
        await stop_requested.wait()

    async def stop(self) -> None:
        """Log shutdown and release anyone waiting in ``start``."""
        logger.info("Miner node shutting down")
        self._stop_event.set()
@asynccontextmanager
async def miner_app() -> AsyncIterator[MinerApplication]:
    """Yield a fresh ``MinerApplication`` and always call its ``stop`` on exit."""
    application = MinerApplication()
    try:
        yield application
    finally:
        # Runs on normal exit and on error alike.
        await application.stop()
def run() -> None:
    """Run the miner application on a dedicated event loop until it is stopped.

    SIGINT and SIGTERM each schedule ``app.stop()``. The loop is always closed
    on the way out so its resources are released.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    async def _run() -> None:
        async with miner_app() as app:
            for signum in (signal.SIGINT, signal.SIGTERM):
                loop.add_signal_handler(signum, lambda: asyncio.create_task(app.stop()))
            await app.start()

    try:
        loop.run_until_complete(_run())
    finally:
        try:
            loop.run_until_complete(loop.shutdown_asyncgens())
        finally:
            asyncio.set_event_loop(None)
            loop.close()


if __name__ == "__main__":  # pragma: no cover
    run()

View File

@ -0,0 +1,18 @@
from __future__ import annotations
from typing import Dict
from .base import BaseRunner
from .cli.simple import CLIRunner
from .python.noop import PythonNoopRunner
# Registry of runner instances keyed by the ``kind`` named in a job.
_RUNNERS: Dict[str, BaseRunner] = {
    "cli": CLIRunner(),
    "python": PythonNoopRunner(),
    "noop": PythonNoopRunner(),
}


def get_runner(kind: str) -> BaseRunner:
    """Return the runner registered for ``kind``; unknown kinds get the noop runner."""
    try:
        return _RUNNERS[kind]
    except KeyError:
        return _RUNNERS["noop"]

View File

@ -0,0 +1,17 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict
@dataclass
class RunnerResult:
    """Outcome of a runner invocation."""

    # True when the job succeeded.
    ok: bool
    # Result payload on success, error details on failure.
    output: Dict[str, Any]
    # Optional mapping of artifact label to file path.
    artifacts: Dict[str, Path] | None = None
class BaseRunner:
    """Interface for job runners; subclasses override ``run``."""

    async def run(self, job: Dict[str, Any], workspace: Path) -> RunnerResult:
        """Execute ``job`` inside ``workspace``. The base implementation always raises."""
        raise NotImplementedError

View File

@ -0,0 +1,62 @@
from __future__ import annotations
import asyncio
from pathlib import Path
from typing import Any, Dict, List
from ..base import BaseRunner, RunnerResult
class CLIRunner(BaseRunner):
    """Run ``job["runner"]["command"]`` as a subprocess inside the job workspace.

    NOTE(review): the command is executed as received; no allowlist check is
    visible in this block. Confirm it is enforced elsewhere.
    """

    @staticmethod
    def _failure(code: str, message: str, metrics: Dict[str, Any] | None = None) -> RunnerResult:
        """Build a failed ``RunnerResult`` in the shape the control loop reports."""
        return RunnerResult(
            ok=False,
            output={
                "error_code": code,
                "error_message": message,
                "metrics": metrics if metrics is not None else {},
            },
        )

    async def run(self, job: Dict[str, Any], workspace: Path) -> RunnerResult:
        """Execute the job's command and capture stdout/stderr to log files.

        Args:
            job: Job document; ``job["runner"]["command"]`` must be a non-empty
                list of strings (argv form, no shell).
            workspace: Directory used as the working directory and for the logs.

        Returns:
            ``ok=True`` with exit code and log names when the command exits 0;
            otherwise ``ok=False`` with ``INVALID_COMMAND``, ``SPAWN_FAILED`` or
            ``PROCESS_FAILED``.
        """
        runner_cfg = job.get("runner") or {}
        command = runner_cfg.get("command") or []
        if not command:
            return self._failure("INVALID_COMMAND", "runner.command is required for CLI jobs")
        # A bare string would be splatted into one argument per character.
        if not isinstance(command, (list, tuple)) or not all(isinstance(arg, str) for arg in command):
            return self._failure("INVALID_COMMAND", "runner.command must be a list of strings")

        stdout_path = workspace / "stdout.log"
        stderr_path = workspace / "stderr.log"
        try:
            process = await asyncio.create_subprocess_exec(
                *command,
                cwd=str(workspace),
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        except OSError as exc:
            # Missing executable, permission denied, bad working directory, ...
            return self._failure("SPAWN_FAILED", f"failed to start command: {exc}")

        try:
            stdout_bytes, stderr_bytes = await process.communicate()
        except asyncio.CancelledError:
            # Do not leave the child running when the surrounding task is cancelled.
            try:
                process.kill()
            except ProcessLookupError:
                pass
            await process.wait()
            raise

        stdout_path.write_bytes(stdout_bytes)
        stderr_path.write_bytes(stderr_bytes)
        artifacts = {
            "stdout": stdout_path,
            "stderr": stderr_path,
        }
        if process.returncode == 0:
            return RunnerResult(
                ok=True,
                output={
                    "exit_code": 0,
                    "stdout": stdout_path.name,
                    "stderr": stderr_path.name,
                },
                artifacts=artifacts,
            )
        failed = self._failure(
            "PROCESS_FAILED",
            f"command exited with code {process.returncode}",
            {
                "exit_code": process.returncode,
                "stderr": stderr_path.name,
            },
        )
        # The logs exist on disk for failed runs too, so expose them.
        failed.artifacts = artifacts
        return failed

View File

@ -0,0 +1,20 @@
from __future__ import annotations
import asyncio
from pathlib import Path
from typing import Any, Dict
from ..base import BaseRunner, RunnerResult
class PythonNoopRunner(BaseRunner):
    """Runner that does no work and echoes the job payload back."""

    async def run(self, job: Dict[str, Any], workspace: Path) -> RunnerResult:
        """Yield to the event loop once, then return a successful echo result."""
        await asyncio.sleep(0)
        echoed = job.get("payload", {})
        response: Dict[str, Any] = {
            "echo": echoed,
            "message": "python noop runner executed",
        }
        return RunnerResult(ok=True, output=response)

View File

@ -0,0 +1,19 @@
from __future__ import annotations
import asyncio
import random
def compute_backoff(base: float, factor: float, jitter_pct: float, max_seconds: float) -> float:
    """Return the next backoff delay with random jitter, never above ``max_seconds``.

    Args:
        base: Current delay in seconds.
        factor: Multiplier applied to ``base``.
        jitter_pct: Jitter as a percentage of the capped delay (10 means +/-10%).
        max_seconds: Upper bound for the returned delay.

    Returns:
        A delay in ``[0, max_seconds]``. The cap is applied again after jitter so
        that upward jitter cannot exceed ``max_seconds``.
    """
    backoff = min(base * factor, max_seconds)
    jitter = backoff * (jitter_pct / 100.0)
    jittered = random.uniform(backoff - jitter, backoff + jitter)
    return max(0.0, min(jittered, float(max_seconds)))
def next_backoff(current: float, factor: float, jitter_pct: float, max_seconds: float) -> float:
    """Return the delay that follows ``current``; thin alias for ``compute_backoff``."""
    following = compute_backoff(current, factor, jitter_pct, max_seconds)
    return following
async def sleep_with_backoff(delay: float, factor: float, jitter_pct: float, max_seconds: float) -> float:
    """Sleep for ``delay`` seconds, then return the delay to use next time."""
    await asyncio.sleep(delay)
    upcoming = next_backoff(delay, factor, jitter_pct, max_seconds)
    return upcoming

View File

@ -0,0 +1,15 @@
from __future__ import annotations
from pathlib import Path
def ensure_workspace(root: Path, job_id: str) -> Path:
    """Create (if needed) and return the workspace directory ``root / job_id``.

    Args:
        root: Directory under which all job workspaces live.
        job_id: Job identifier; must be a single path component.

    Raises:
        ValueError: if ``job_id`` is empty, ``"."``, ``".."``, absolute, or contains
            a path separator. Any of these would resolve to ``root`` itself or to
            a location outside it.
    """
    # Path(job_id).name equals job_id only for a single, plain path component.
    if not job_id or job_id in {".", ".."} or Path(job_id).name != job_id:
        raise ValueError(f"invalid job id for workspace: {job_id!r}")
    path = root / job_id
    path.mkdir(parents=True, exist_ok=True)
    return path
def write_json(path: Path, data: dict) -> None:
    """Serialize ``data`` as 2-space-indented JSON and write it to ``path`` as UTF-8."""
    import json

    # Serialize before touching the file, so a serialization error leaves it untouched.
    serialized = json.dumps(data, indent=2)
    path.write_text(serialized, encoding="utf-8")

View File

@ -0,0 +1,91 @@
from __future__ import annotations
import platform
import shutil
import subprocess
import time
from dataclasses import dataclass
from typing import Any, Dict, List
import psutil
@dataclass
class CapabilitySnapshot:
    """Capabilities of this node as captured at one point in time."""

    # Nested description of the host, as assembled by collect_capabilities.
    capabilities: Dict[str, Any]
    # Number of jobs the node advertises it can run at once.
    concurrency: int
    # Optional region label.
    region: str | None = None
def collect_capabilities(max_cpu_concurrency: int, max_gpu_concurrency: int) -> CapabilitySnapshot:
    """Probe the host and return its capabilities plus the advertised concurrency.

    Concurrency is the largest of 1, ``max_cpu_concurrency`` and, only when at
    least one GPU was detected, ``max_gpu_concurrency``.
    """
    logical_cores = psutil.cpu_count(logical=True) or 1
    memory_bytes = psutil.virtual_memory().total
    detected_gpus = _detect_gpus()

    report: Dict[str, Any] = {}
    report["node"] = platform.node()
    report["python_version"] = platform.python_version()
    report["platform"] = platform.platform()
    report["cpu"] = {
        "logical_cores": logical_cores,
        "model": platform.processor(),
    }
    report["memory"] = {
        "total_bytes": memory_bytes,
        "total_gb": round(memory_bytes / (1024**3), 2),
    }
    report["runners"] = {
        "cli": True,
        "python": True,
    }
    if detected_gpus:
        report["gpus"] = detected_gpus

    # The GPU limit only counts when a GPU is actually present.
    gpu_slots = max_gpu_concurrency if detected_gpus else 0
    slots = max(1, max_cpu_concurrency, gpu_slots)
    return CapabilitySnapshot(capabilities=report, concurrency=slots)
def collect_runtime_metrics() -> Dict[str, Any]:
    """Return a snapshot of CPU, load-average and memory usage with a timestamp."""
    memory = psutil.virtual_memory()
    # Fall back to zeros when this psutil build has no getloadavg.
    read_load = getattr(psutil, "getloadavg", None)
    load_avg = read_load() if read_load is not None else (0, 0, 0)
    metrics: Dict[str, Any] = {
        "cpu_percent": psutil.cpu_percent(interval=None),
        "load_avg": load_avg,
        "memory_percent": memory.percent,
        "timestamp": time.time(),
    }
    return metrics
def _detect_gpus() -> List[Dict[str, Any]]:
nvidia_smi = shutil.which("nvidia-smi")
if not nvidia_smi:
return []
try:
output = subprocess.check_output(
[
nvidia_smi,
"--query-gpu=name,memory.total",
"--format=csv,noheader"
],
stderr=subprocess.DEVNULL,
text=True,
)
except (subprocess.CalledProcessError, FileNotFoundError):
return []
gpus: List[Dict[str, Any]] = []
for line in output.strip().splitlines():
parts = [p.strip() for p in line.split(",")]
if not parts:
continue
name = parts[0]
mem_mb = None
if len(parts) > 1 and parts[1].lower().endswith(" mib"):
try:
mem_mb = int(float(parts[1].split()[0]))
except ValueError:
mem_mb = None
gpus.append({"name": name, "memory_mb": mem_mb})
return gpus

View File

@ -0,0 +1,37 @@
import asyncio
from pathlib import Path
import pytest
from aitbc_miner.runners.cli.simple import CLIRunner
from aitbc_miner.runners.python.noop import PythonNoopRunner
@pytest.mark.asyncio
async def test_python_noop_runner(tmp_path: Path):
    """The noop runner succeeds and echoes the job payload back."""
    payload = {"value": 42}
    job = {"payload": payload}
    outcome = await PythonNoopRunner().run(job, tmp_path)
    assert outcome.ok
    assert outcome.output["echo"] == job["payload"]
@pytest.mark.asyncio
async def test_cli_runner_success(tmp_path: Path):
    """A zero-exit command succeeds and its stdout is captured as an artifact."""
    job = {"runner": {"command": ["echo", "hello"]}}
    outcome = await CLIRunner().run(job, tmp_path)
    assert outcome.ok
    assert outcome.artifacts is not None
    captured = outcome.artifacts["stdout"]
    assert captured.exists()
    assert captured.read_text().strip() == "hello"
@pytest.mark.asyncio
async def test_cli_runner_invalid_command(tmp_path: Path):
    """A CLI job without a command is rejected with INVALID_COMMAND."""
    job_without_command = {"runner": {}}
    outcome = await CLIRunner().run(job_without_command, tmp_path)
    assert not outcome.ok
    assert outcome.output["error_code"] == "INVALID_COMMAND"