chore: initialize monorepo with project scaffolding, configs, and CI setup
This commit is contained in:
27
apps/miner-node/README.md
Normal file
27
apps/miner-node/README.md
Normal file
@ -0,0 +1,27 @@
|
||||
# Miner Node
|
||||
|
||||
## Purpose & Scope
|
||||
|
||||
Worker daemon responsible for executing compute jobs on CPU/GPU hardware, reporting telemetry, and submitting proofs back to the coordinator. See `docs/bootstrap/miner_node.md` for the detailed implementation roadmap.
|
||||
|
||||
## Development Setup
|
||||
|
||||
- Create a Python virtual environment under `apps/miner-node/.venv`.
|
||||
- Install dependencies: `httpx`, `pydantic`, and `psutil` (FastAPI is optional and only needed for the health endpoint).
|
||||
- Implement the package structure described in the bootstrap guide.
|
||||
|
||||
## Production Deployment (systemd)
|
||||
|
||||
1. Copy the project to `/opt/aitbc/apps/miner-node/` on the target host.
|
||||
2. Create a virtual environment and install dependencies as needed.
|
||||
3. Populate `.env` with coordinator URL/API token settings.
|
||||
4. Run the installer script from repo root:
|
||||
```bash
|
||||
sudo scripts/ops/install_miner_systemd.sh
|
||||
```
|
||||
This installs `configs/systemd/aitbc-miner.service`, reloads systemd, and enables the service.
|
||||
5. Check status/logs:
|
||||
```bash
|
||||
sudo systemctl status aitbc-miner
|
||||
journalctl -u aitbc-miner -f
|
||||
```
|
||||
30
apps/miner-node/pyproject.toml
Normal file
30
apps/miner-node/pyproject.toml
Normal file
@ -0,0 +1,30 @@
|
||||
[tool.poetry]
|
||||
name = "aitbc-miner-node"
|
||||
version = "0.1.0"
|
||||
description = "AITBC miner node daemon"
|
||||
authors = ["AITBC Team"]
|
||||
packages = [
|
||||
{ include = "aitbc_miner", from = "src" }
|
||||
]
|
||||
|
||||
[tool.poetry.dependencies]
python = "^3.11"
httpx = "^0.27.0"
pydantic = "^2.7.0"
# Required by aitbc_miner.config, which imports BaseSettings/SettingsConfigDict
# from the separate pydantic_settings distribution (split out of pydantic in v2).
pydantic-settings = "^2.2.0"
pyyaml = "^6.0.1"
psutil = "^5.9.8"
aiosignal = "^1.3.1"
uvloop = { version = "^0.19.0", optional = true }
# asyncio is part of the standard library on Python 3.11; the PyPI "asyncio"
# distribution is an obsolete backport and is intentionally not declared here.
rich = "^13.7.1"
|
||||
|
||||
[tool.poetry.extras]
|
||||
uvloop = ["uvloop"]
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = "^8.2.0"
|
||||
pytest-asyncio = "^0.23.0"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core>=1.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
1
apps/miner-node/src/aitbc_miner/__init__.py
Normal file
1
apps/miner-node/src/aitbc_miner/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""AITBC miner node package."""
|
||||
1
apps/miner-node/src/aitbc_miner/agent/__init__.py
Normal file
1
apps/miner-node/src/aitbc_miner/agent/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Control loop and background tasks for the miner node."""
|
||||
127
apps/miner-node/src/aitbc_miner/agent/control.py
Normal file
127
apps/miner-node/src/aitbc_miner/agent/control.py
Normal file
@ -0,0 +1,127 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from collections.abc import Callable
|
||||
from typing import Optional
|
||||
|
||||
from ..config import settings
|
||||
from ..logging import get_logger
|
||||
from ..coordinator import CoordinatorClient
|
||||
from ..util.probe import collect_capabilities, collect_runtime_metrics
|
||||
from ..util.backoff import compute_backoff
|
||||
from ..util.fs import ensure_workspace, write_json
|
||||
from ..runners import get_runner
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class MinerControlLoop:
    """Register the miner, then run heartbeat and job-poll loops as tasks.

    ``start()`` registers with the coordinator and spawns the two background
    loops; ``stop()`` cancels them and closes the coordinator client.
    """

    def __init__(self) -> None:
        self._tasks: list[asyncio.Task[None]] = []
        self._stop_event = asyncio.Event()
        self._coordinator = CoordinatorClient()
        self._capabilities_snapshot = collect_capabilities(
            settings.max_concurrent_cpu, settings.max_concurrent_gpu
        )
        # Delay between polls. It grows while the queue is idle or polling
        # fails and is reset to the configured interval when a job arrives.
        self._current_backoff = settings.poll_interval_seconds

    async def start(self) -> None:
        """Register with the coordinator and launch the background loops.

        Raises:
            Exception: whatever ``_register`` re-raises when registration fails.
        """
        logger.info("Starting miner control loop", extra={"node_id": settings.node_id})
        await self._register()
        self._tasks.append(asyncio.create_task(self._heartbeat_loop()))
        self._tasks.append(asyncio.create_task(self._poll_loop()))

    async def stop(self) -> None:
        """Cancel the background tasks, wait for them, and close the client."""
        logger.info("Stopping miner control loop")
        self._stop_event.set()
        for task in self._tasks:
            task.cancel()
        # return_exceptions=True keeps the CancelledError of each task from
        # propagating out of stop().
        await asyncio.gather(*self._tasks, return_exceptions=True)
        self._tasks.clear()
        await self._coordinator.aclose()

    async def _register(self) -> None:
        """Send the capability snapshot to the coordinator; re-raise on failure."""
        payload = {
            "capabilities": self._capabilities_snapshot.capabilities,
            "concurrency": self._capabilities_snapshot.concurrency,
            "region": settings.region,
        }
        try:
            resp = await self._coordinator.register(payload)
            logger.info("Registered miner", extra={"resp": resp})
        except Exception as exc:
            logger.exception("Failed to register miner", exc_info=exc)
            raise

    async def _heartbeat_loop(self) -> None:
        """Send a heartbeat every ``heartbeat_interval_seconds`` until stopped.

        A failed heartbeat is logged and the loop continues.
        """
        interval = settings.heartbeat_interval_seconds
        while not self._stop_event.is_set():
            payload = {
                "inflight": 0,
                "status": "ONLINE",
                "metadata": collect_runtime_metrics(),
            }
            try:
                await self._coordinator.heartbeat(payload)
                logger.debug("heartbeat sent")
            except Exception as exc:
                logger.warning("heartbeat failed", exc_info=exc)
            await asyncio.sleep(interval)

    def _grow_backoff(self) -> None:
        """Increase the poll delay, capped at ``settings.max_backoff_seconds``."""
        self._current_backoff = min(
            compute_backoff(
                self._current_backoff,
                2.0,
                settings.heartbeat_jitter_pct,
                settings.max_backoff_seconds,
            ),
            settings.max_backoff_seconds,
        )

    async def _poll_loop(self) -> None:
        """Poll the coordinator for jobs until stopped.

        The delay between polls backs off while no job is available or the
        poll fails, and returns to ``poll_interval_seconds`` as soon as a job
        is received.
        """
        while not self._stop_event.is_set():
            payload = {"max_wait_seconds": self._current_backoff}
            try:
                job = await self._coordinator.poll(payload)
                if job:
                    logger.info("received job", extra={"job_id": job.get("job_id")})
                    # Work is available again: drop any accumulated backoff.
                    self._current_backoff = settings.poll_interval_seconds
                    await self._handle_job(job)
                else:
                    self._grow_backoff()
                    logger.debug("no job; next poll interval=%s", self._current_backoff)
            except Exception as exc:
                logger.warning("poll failed", exc_info=exc)
                self._grow_backoff()
            await asyncio.sleep(self._current_backoff)

    async def _report_failure(self, job_id: str, payload: dict) -> None:
        """Submit a failure report; log instead of raising if that fails.

        Keeping the error here prevents a reporting problem from being
        treated by the poll loop as a failed poll.
        """
        try:
            await self._coordinator.submit_failure(job_id, payload)
        except Exception as exc:
            logger.warning("failed to submit failure", extra={"job_id": job_id}, exc_info=exc)

    async def _handle_job(self, job: dict) -> None:
        """Run one job with the selected runner and report the outcome.

        Args:
            job: job description from the coordinator. ``job_id`` defaults to
                ``"unknown"`` and ``runner.kind`` defaults to ``"noop"``.
        """
        job_id = job.get("job_id", "unknown")
        runner_kind = job.get("runner", {}).get("kind", "noop")

        try:
            # Workspace and runner setup sit inside the try so that a failure
            # there is reported to the coordinator like a runner crash.
            workspace = ensure_workspace(settings.workspace_root, job_id)
            runner = get_runner(runner_kind)
            result = await runner.run(job, workspace)
        except Exception as exc:
            logger.exception("runner crashed", extra={"job_id": job_id, "runner": runner_kind})
            await self._report_failure(
                job_id,
                {
                    "error_code": "RUNTIME_ERROR",
                    "error_message": str(exc),
                    "metrics": {},
                },
            )
            return

        if result.ok:
            write_json(workspace / "result.json", result.output)
            try:
                await self._coordinator.submit_result(
                    job_id,
                    {
                        "result": result.output,
                        "metrics": {"workspace": str(workspace)},
                    },
                )
            except Exception as exc:
                logger.warning("failed to submit result", extra={"job_id": job_id}, exc_info=exc)
        else:
            await self._report_failure(
                job_id,
                {
                    "error_code": result.output.get("error_code", "FAILED"),
                    "error_message": result.output.get("error_message", "Job failed"),
                    "metrics": result.output.get("metrics", {}),
                },
            )
|
||||
40
apps/miner-node/src/aitbc_miner/config.py
Normal file
40
apps/miner-node/src/aitbc_miner/config.py
Normal file
@ -0,0 +1,40 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class MinerSettings(BaseSettings):
    """Configuration values for the miner node.

    The settings source is configured to read a ``.env`` file (UTF-8) with
    case-insensitive names; every field has a development default.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
    )

    # Identity and coordinator connection.
    node_id: str = "node-dev-1"
    coordinator_base_url: str = "http://127.0.0.1:8011/v1"
    auth_token: str = "miner_dev_key_1"  # NOTE(review): development default; override in production
    region: Optional[str] = None

    # Filesystem locations.
    workspace_root: Path = Path("/var/lib/aitbc/miner/jobs")
    cache_root: Path = Path("/var/lib/aitbc/miner/cache")

    # Heartbeat timing.
    heartbeat_interval_seconds: int = 15
    heartbeat_jitter_pct: int = 10
    heartbeat_timeout_seconds: int = 60

    # Job polling and backoff.
    poll_interval_seconds: int = 3
    max_backoff_seconds: int = 60

    # Concurrency limits passed to the capability probe.
    max_concurrent_cpu: int = 1
    max_concurrent_gpu: int = 1

    # Runner toggles.
    enable_cli_runner: bool = True
    enable_python_runner: bool = True

    allowlist_dir: Path = Path("/etc/aitbc/miner/allowlist.d")

    # Logging.
    log_level: str = "INFO"
    log_path: Optional[Path] = None
|
||||
|
||||
|
||||
settings = MinerSettings()
|
||||
76
apps/miner-node/src/aitbc_miner/coordinator.py
Normal file
76
apps/miner-node/src/aitbc_miner/coordinator.py
Normal file
@ -0,0 +1,76 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from .config import MinerSettings, settings
|
||||
from .logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class CoordinatorClient:
    """Async HTTP client for interacting with the coordinator API."""

    def __init__(self, cfg: MinerSettings | None = None) -> None:
        # Fall back to the module-level settings when no config is supplied.
        self.cfg = cfg or settings
        self._client: Optional[httpx.AsyncClient] = None

    @property
    def client(self) -> httpx.AsyncClient:
        """Return the underlying ``httpx.AsyncClient``, creating it on first use."""
        if self._client is not None:
            return self._client

        base_url = self.cfg.coordinator_base_url.rstrip("/")
        request_headers = {
            "Authorization": f"Bearer {self.cfg.auth_token}",
            "User-Agent": f"aitbc-miner/{self.cfg.node_id}",
        }
        limits = httpx.Timeout(connect=5.0, read=30.0, write=10.0, pool=None)
        self._client = httpx.AsyncClient(base_url=base_url, headers=request_headers, timeout=limits)
        return self._client

    async def aclose(self) -> None:
        """Close the HTTP client if one was created and forget it."""
        if not self._client:
            return
        await self._client.aclose()
        self._client = None

    async def _post_json(self, path: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """POST ``payload`` as JSON, raise on an error status, return the decoded body."""
        response = await self.client.post(path, json=payload)
        response.raise_for_status()
        return response.json()

    async def register(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """POST the registration payload to ``/miners/register``."""
        logger.debug("registering miner", extra={"payload": payload})
        return await self._post_json("/miners/register", payload)

    async def heartbeat(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """POST a heartbeat payload to ``/miners/heartbeat``."""
        return await self._post_json("/miners/heartbeat", payload)

    async def poll(self, payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """POST to ``/miners/poll``; return ``None`` on HTTP 204, else the JSON body."""
        response = await self.client.post("/miners/poll", json=payload)
        # 204 is checked before raise_for_status, matching the response order.
        if response.status_code == 204:
            logger.debug("no job available")
            return None
        response.raise_for_status()
        return response.json()

    async def submit_result(self, job_id: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """POST a job result to ``/miners/{job_id}/result``."""
        return await self._post_json(f"/miners/{job_id}/result", payload)

    async def submit_failure(self, job_id: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """POST a job failure to ``/miners/{job_id}/fail``."""
        return await self._post_json(f"/miners/{job_id}/fail", payload)

    async def __aenter__(self) -> "CoordinatorClient":
        # Touch the property so the HTTP client exists inside the context.
        _ = self.client
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        await self.aclose()
|
||||
|
||||
|
||||
async def backoff(base: float, max_seconds: float) -> float:
    """Sleep for ``base`` seconds, then return the doubled delay capped at ``max_seconds``."""
    await asyncio.sleep(base)
    doubled = base * 2
    return max_seconds if max_seconds < doubled else doubled
|
||||
25
apps/miner-node/src/aitbc_miner/logging.py
Normal file
25
apps/miner-node/src/aitbc_miner/logging.py
Normal file
@ -0,0 +1,25 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from .config import settings
|
||||
|
||||
|
||||
def configure_logging(level: Optional[str] = None, log_path: Optional[str] = None) -> None:
    """Configure the root logger via ``logging.basicConfig``.

    Args:
        level: level name; falls back to ``settings.log_level`` when falsy.
            Names that are not attributes of ``logging`` resolve to INFO.
        log_path: when truthy, a ``FileHandler`` for this path is added
            alongside the stream handler.
    """
    level_name = (level or settings.log_level).upper()
    resolved_level = getattr(logging, level_name, logging.INFO)

    sinks: list[logging.Handler] = [logging.StreamHandler()]
    if log_path:
        sinks.append(logging.FileHandler(log_path))

    logging.basicConfig(
        level=resolved_level,
        format="%(asctime)s %(levelname)s %(name)s :: %(message)s",
        handlers=sinks,
    )
|
||||
|
||||
|
||||
def get_logger(name: str) -> logging.Logger:
    """Return the logger called ``name``, configuring logging first if needed.

    Logging is configured from ``settings`` only when the root logger has no
    handlers yet.
    """
    root = logging.getLogger()
    if not root.handlers:
        file_target = settings.log_path.as_posix() if settings.log_path else None
        configure_logging(settings.log_level, file_target)
    return logging.getLogger(name)
|
||||
51
apps/miner-node/src/aitbc_miner/main.py
Normal file
51
apps/miner-node/src/aitbc_miner/main.py
Normal file
@ -0,0 +1,51 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import signal
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import AsyncIterator
|
||||
|
||||
from .config import settings
|
||||
from .logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class MinerApplication:
    """Application shell that blocks in ``start()`` until ``stop()`` is called."""

    def __init__(self) -> None:
        # Set by stop(); start() waits on it.
        self._stop_event = asyncio.Event()

    async def start(self) -> None:
        """Log startup and wait until the stop event is set."""
        logger.info("Miner node starting", extra={"node_id": settings.node_id})
        # TODO: initialize capability probe, register with coordinator, start heartbeat and poll loops
        await self._stop_event.wait()

    async def stop(self) -> None:
        """Log shutdown and release anything waiting in ``start()``."""
        logger.info("Miner node shutting down")
        self._stop_event.set()
|
||||
|
||||
|
||||
@asynccontextmanager
async def miner_app() -> AsyncIterator[MinerApplication]:
    """Yield a fresh ``MinerApplication`` and call its ``stop()`` on exit."""
    application = MinerApplication()
    try:
        yield application
    finally:
        # Runs whether the body finished normally or raised.
        await application.stop()
|
||||
|
||||
|
||||
def run() -> None:
    """Run the miner application on a dedicated event loop until it stops.

    SIGINT and SIGTERM schedule ``app.stop()``. The loop is always closed
    afterwards so its resources are released even when startup fails.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    # Strong references to in-flight stop tasks: the event loop only keeps
    # weak references, so an unreferenced task may be garbage-collected.
    stop_tasks: set[asyncio.Task[None]] = set()

    async def _run() -> None:
        async with miner_app() as app:

            def _request_stop() -> None:
                task = loop.create_task(app.stop())
                stop_tasks.add(task)
                task.add_done_callback(stop_tasks.discard)

            for sig in (signal.SIGINT, signal.SIGTERM):
                loop.add_signal_handler(sig, _request_stop)
            await app.start()

    try:
        loop.run_until_complete(_run())
    finally:
        try:
            loop.run_until_complete(loop.shutdown_asyncgens())
        finally:
            asyncio.set_event_loop(None)
            loop.close()
|
||||
|
||||
|
||||
if __name__ == "__main__": # pragma: no cover
|
||||
run()
|
||||
18
apps/miner-node/src/aitbc_miner/runners/__init__.py
Normal file
18
apps/miner-node/src/aitbc_miner/runners/__init__.py
Normal file
@ -0,0 +1,18 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Dict
|
||||
|
||||
from .base import BaseRunner
|
||||
from .cli.simple import CLIRunner
|
||||
from .python.noop import PythonNoopRunner
|
||||
|
||||
|
||||
# Registry of runner instances keyed by runner kind. "python" and "noop"
# each get their own PythonNoopRunner instance.
_RUNNERS: Dict[str, BaseRunner] = {}
_RUNNERS["cli"] = CLIRunner()
_RUNNERS["python"] = PythonNoopRunner()
_RUNNERS["noop"] = PythonNoopRunner()
|
||||
|
||||
|
||||
def get_runner(kind: str) -> BaseRunner:
    """Return the runner registered for ``kind``, or the "noop" runner if unknown."""
    if kind in _RUNNERS:
        return _RUNNERS[kind]
    return _RUNNERS["noop"]
|
||||
17
apps/miner-node/src/aitbc_miner/runners/base.py
Normal file
17
apps/miner-node/src/aitbc_miner/runners/base.py
Normal file
@ -0,0 +1,17 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
|
||||
@dataclass
class RunnerResult:
    """Outcome of a single runner invocation."""

    # True when the job succeeded.
    ok: bool
    # Result payload on success, or error details on failure.
    output: Dict[str, Any]
    # Optional mapping of artifact name to the file that holds it.
    artifacts: Dict[str, Path] | None = None
|
||||
|
||||
|
||||
class BaseRunner:
    """Base class for job runners; subclasses override ``run``."""

    async def run(self, job: Dict[str, Any], workspace: Path) -> RunnerResult:
        """Execute ``job`` using ``workspace``; the base implementation always raises."""
        raise NotImplementedError()
|
||||
62
apps/miner-node/src/aitbc_miner/runners/cli/simple.py
Normal file
62
apps/miner-node/src/aitbc_miner/runners/cli/simple.py
Normal file
@ -0,0 +1,62 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from ..base import BaseRunner, RunnerResult
|
||||
|
||||
|
||||
class CLIRunner(BaseRunner):
    """Runner that executes the job's ``runner.command`` argv as a subprocess."""

    @staticmethod
    def _invalid_command(message: str) -> RunnerResult:
        """Build the failure result returned for an unusable ``runner.command``."""
        return RunnerResult(
            ok=False,
            output={
                "error_code": "INVALID_COMMAND",
                "error_message": message,
                "metrics": {},
            },
        )

    async def run(self, job: Dict[str, Any], workspace: Path) -> RunnerResult:
        """Run the command in ``workspace`` and capture its output to log files.

        Args:
            job: job description; ``job["runner"]["command"]`` must be a
                non-empty list (or tuple) of strings forming the argv.
            workspace: directory used as the working directory and as the
                location of ``stdout.log`` / ``stderr.log``.

        Returns:
            A successful result with the log files as artifacts when the
            process exits with code 0; otherwise a failure result with
            ``INVALID_COMMAND`` or ``PROCESS_FAILED``.
        """
        # NOTE(review): the argv comes straight from the job payload and no
        # allowlist is applied in this block - confirm it is enforced elsewhere.
        runner_cfg = job.get("runner") or {}
        command = runner_cfg.get("command") or []
        if not command:
            return self._invalid_command("runner.command is required for CLI jobs")
        # A bare string would be unpacked character by character into argv,
        # so only accept a real sequence of string arguments.
        if not isinstance(command, (list, tuple)) or not all(
            isinstance(arg, str) for arg in command
        ):
            return self._invalid_command("runner.command must be a list of strings")

        stdout_path = workspace / "stdout.log"
        stderr_path = workspace / "stderr.log"

        process = await asyncio.create_subprocess_exec(
            *command,
            cwd=str(workspace),
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )

        stdout_bytes, stderr_bytes = await process.communicate()
        stdout_path.write_bytes(stdout_bytes)
        stderr_path.write_bytes(stderr_bytes)

        if process.returncode == 0:
            return RunnerResult(
                ok=True,
                output={
                    "exit_code": 0,
                    "stdout": stdout_path.name,
                    "stderr": stderr_path.name,
                },
                artifacts={
                    "stdout": stdout_path,
                    "stderr": stderr_path,
                },
            )

        return RunnerResult(
            ok=False,
            output={
                "error_code": "PROCESS_FAILED",
                "error_message": f"command exited with code {process.returncode}",
                "metrics": {
                    "exit_code": process.returncode,
                    "stderr": stderr_path.name,
                },
            },
        )
|
||||
20
apps/miner-node/src/aitbc_miner/runners/python/noop.py
Normal file
20
apps/miner-node/src/aitbc_miner/runners/python/noop.py
Normal file
@ -0,0 +1,20 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
from ..base import BaseRunner, RunnerResult
|
||||
|
||||
|
||||
class PythonNoopRunner(BaseRunner):
    """Runner that does no work and echoes the job payload back."""

    async def run(self, job: Dict[str, Any], workspace: Path) -> RunnerResult:
        """Yield to the event loop once, then return the payload as ``echo``."""
        await asyncio.sleep(0)
        echoed = job.get("payload", {})
        output = {
            "echo": echoed,
            "message": "python noop runner executed",
        }
        return RunnerResult(ok=True, output=output)
|
||||
19
apps/miner-node/src/aitbc_miner/util/backoff.py
Normal file
19
apps/miner-node/src/aitbc_miner/util/backoff.py
Normal file
@ -0,0 +1,19 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import random
|
||||
|
||||
|
||||
def compute_backoff(base: float, factor: float, jitter_pct: float, max_seconds: float) -> float:
    """Return the next backoff delay with jitter, never above ``max_seconds``.

    Args:
        base: current delay in seconds.
        factor: multiplier applied to ``base``.
        jitter_pct: jitter as a percentage of the capped delay; the result is
            drawn uniformly from ``delay +/- jitter``.
        max_seconds: upper bound for the returned delay.

    Returns:
        A non-negative delay no greater than ``max_seconds`` (when that bound
        is itself non-negative).
    """
    capped = min(base * factor, max_seconds)
    spread = capped * (jitter_pct / 100.0)
    jittered = random.uniform(capped - spread, capped + spread)
    # Clamp after jitter: the upward half of the jitter window would
    # otherwise push the delay past the configured maximum.
    return max(0.0, min(jittered, max_seconds))
|
||||
|
||||
|
||||
def next_backoff(current: float, factor: float, jitter_pct: float, max_seconds: float) -> float:
    """Return the delay that follows ``current``; delegates to ``compute_backoff``."""
    return compute_backoff(
        base=current,
        factor=factor,
        jitter_pct=jitter_pct,
        max_seconds=max_seconds,
    )
|
||||
|
||||
|
||||
async def sleep_with_backoff(delay: float, factor: float, jitter_pct: float, max_seconds: float) -> float:
    """Sleep for ``delay`` seconds and return the delay to use next time."""
    await asyncio.sleep(delay)
    upcoming = next_backoff(delay, factor, jitter_pct, max_seconds)
    return upcoming
|
||||
15
apps/miner-node/src/aitbc_miner/util/fs.py
Normal file
15
apps/miner-node/src/aitbc_miner/util/fs.py
Normal file
@ -0,0 +1,15 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def ensure_workspace(root: Path, job_id: str) -> Path:
    """Create and return the workspace directory for ``job_id`` under ``root``.

    Args:
        root: directory that holds all job workspaces.
        job_id: identifier used as the workspace's relative path.

    Returns:
        ``root / job_id``, created (with parents) if it did not exist.

    Raises:
        ValueError: if ``job_id`` would place the workspace at or outside
            ``root`` (empty id, ``..`` traversal, or an absolute path).
    """
    path = root / job_id
    # Compare resolved paths before touching the filesystem so that ids such
    # as "../x" or "/etc" cannot create directories outside the root.
    resolved_root = root.resolve()
    resolved_path = path.resolve()
    if resolved_path == resolved_root or not resolved_path.is_relative_to(resolved_root):
        raise ValueError(f"job_id {job_id!r} does not name a directory inside {root}")
    path.mkdir(parents=True, exist_ok=True)
    return path
|
||||
|
||||
|
||||
def write_json(path: Path, data: dict) -> None:
    """Serialize ``data`` as 2-space-indented JSON and write it to ``path`` (UTF-8)."""
    import json

    # Serialize first so a failure leaves any existing file untouched.
    serialized = json.dumps(data, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized)
|
||||
91
apps/miner-node/src/aitbc_miner/util/probe.py
Normal file
91
apps/miner-node/src/aitbc_miner/util/probe.py
Normal file
@ -0,0 +1,91 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import platform
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import psutil
|
||||
|
||||
|
||||
@dataclass
class CapabilitySnapshot:
    """Hardware/software capabilities of this node, as gathered by the probe."""

    # Nested description of the host (see collect_capabilities for the keys).
    capabilities: Dict[str, Any]
    # Number of jobs the node advertises it can run at once.
    concurrency: int
    # Optional region label; collect_capabilities leaves it unset.
    region: str | None = None
|
||||
|
||||
|
||||
def collect_capabilities(max_cpu_concurrency: int, max_gpu_concurrency: int) -> CapabilitySnapshot:
    """Probe the host and build a ``CapabilitySnapshot``.

    Args:
        max_cpu_concurrency: configured CPU job concurrency.
        max_gpu_concurrency: configured GPU job concurrency; only counted
            when at least one GPU is detected.

    Returns:
        Snapshot whose concurrency is the largest of 1, the CPU limit and
        (if GPUs are present) the GPU limit.
    """
    logical_cores = psutil.cpu_count(logical=True) or 1
    memory_bytes = psutil.virtual_memory().total
    detected_gpus = _detect_gpus()

    cpu_section = {
        "logical_cores": logical_cores,
        "model": platform.processor(),
    }
    memory_section = {
        "total_bytes": memory_bytes,
        "total_gb": round(memory_bytes / (1024**3), 2),
    }
    runner_section = {
        "cli": True,
        "python": True,
    }

    capabilities: Dict[str, Any] = {
        "node": platform.node(),
        "python_version": platform.python_version(),
        "platform": platform.platform(),
        "cpu": cpu_section,
        "memory": memory_section,
        "runners": runner_section,
    }

    # The "gpus" key is only present when something was detected.
    if detected_gpus:
        capabilities["gpus"] = detected_gpus
        gpu_slots = max_gpu_concurrency
    else:
        gpu_slots = 0

    concurrency = max(1, max_cpu_concurrency, gpu_slots)
    return CapabilitySnapshot(capabilities=capabilities, concurrency=concurrency)
|
||||
|
||||
|
||||
def collect_runtime_metrics() -> Dict[str, Any]:
    """Return a point-in-time sample of CPU, load, and memory usage.

    ``load_avg`` falls back to ``(0, 0, 0)`` when ``psutil`` has no
    ``getloadavg`` attribute.
    """
    memory = psutil.virtual_memory()
    if hasattr(psutil, "getloadavg"):
        load = psutil.getloadavg()
    else:
        load = (0, 0, 0)

    metrics: Dict[str, Any] = {
        "cpu_percent": psutil.cpu_percent(interval=None),
        "load_avg": load,
        "memory_percent": memory.percent,
        "timestamp": time.time(),
    }
    return metrics
|
||||
|
||||
|
||||
def _detect_gpus() -> List[Dict[str, Any]]:
    """List NVIDIA GPUs reported by ``nvidia-smi``.

    Returns:
        One ``{"name": ..., "memory_mb": ...}`` dict per GPU. ``memory_mb``
        is ``None`` when the memory column is missing or not in MiB. An empty
        list is returned when ``nvidia-smi`` is not on PATH, cannot be run,
        exits with an error, or does not answer within the timeout.
    """
    nvidia_smi = shutil.which("nvidia-smi")
    if not nvidia_smi:
        return []
    try:
        output = subprocess.check_output(
            [
                nvidia_smi,
                "--query-gpu=name,memory.total",
                "--format=csv,noheader",
            ],
            stderr=subprocess.DEVNULL,
            text=True,
            # A wedged driver must not hang the probe forever.
            timeout=10,
        )
    except (subprocess.SubprocessError, OSError):
        # Covers a non-zero exit, a timeout, and failures to start the binary.
        return []

    gpus: List[Dict[str, Any]] = []
    for raw_line in output.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        # The memory column is last; splitting from the right keeps any
        # comma inside the GPU name intact.
        name, sep, mem_field = line.rpartition(",")
        if not sep:
            name, mem_field = line, ""
        mem_field = mem_field.strip()
        mem_mb = None
        if mem_field.lower().endswith(" mib"):
            try:
                mem_mb = int(float(mem_field.split()[0]))
            except ValueError:
                mem_mb = None
        gpus.append({"name": name.strip(), "memory_mb": mem_mb})
    return gpus
|
||||
37
apps/miner-node/tests/test_runners.py
Normal file
37
apps/miner-node/tests/test_runners.py
Normal file
@ -0,0 +1,37 @@
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from aitbc_miner.runners.cli.simple import CLIRunner
|
||||
from aitbc_miner.runners.python.noop import PythonNoopRunner
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_python_noop_runner(tmp_path: Path):
    """The noop runner succeeds and echoes the job payload."""
    payload = {"value": 42}
    outcome = await PythonNoopRunner().run({"payload": payload}, tmp_path)
    assert outcome.ok
    assert outcome.output["echo"] == payload
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_cli_runner_success(tmp_path: Path):
    """A zero-exit command succeeds and its stdout is captured as an artifact."""
    spec = {"runner": {"command": ["echo", "hello"]}}
    outcome = await CLIRunner().run(spec, tmp_path)
    assert outcome.ok
    assert outcome.artifacts is not None
    captured = outcome.artifacts["stdout"]
    assert captured.exists()
    assert captured.read_text().strip() == "hello"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_cli_runner_invalid_command(tmp_path: Path):
    """A job without ``runner.command`` fails with INVALID_COMMAND."""
    outcome = await CLIRunner().run({"runner": {}}, tmp_path)
    assert not outcome.ok
    assert outcome.output["error_code"] == "INVALID_COMMAND"
|
||||
Reference in New Issue
Block a user