docs: update CLI command syntax across workflow documentation
Some checks failed
CLI Tests / test-cli (push) Has been cancelled
Security Scanning / security-scan (push) Has been cancelled
Integration Tests / test-service-integration (push) Has been cancelled
Python Tests / test-python (push) Has been cancelled
Documentation Validation / validate-docs (push) Has been cancelled
API Endpoint Tests / test-api-endpoints (push) Has been cancelled

- Updated marketplace commands: `marketplace --action` → `market` subcommands
- Updated wallet commands: direct flags → `wallet` subcommands
- Updated AI commands: `ai-submit`, `ai-status` → `ai submit`, `ai status`
- Updated blockchain commands: `chain` → `blockchain info`
- Standardized command structure across all workflow files
- Affected files: MULTI_NODE_MASTER_INDEX.md, TEST_MASTER_INDEX.md, multi-node-blockchain-marketplace
This commit is contained in:
aitbc
2026-04-08 12:10:21 +02:00
parent ef4a1c0e87
commit 40ddf89b9c
251 changed files with 3555 additions and 61407 deletions

View File

@@ -0,0 +1,129 @@
import json
import logging
import os
from collections import deque
from datetime import datetime, timedelta
from typing import Any
from urllib import error, request
logger = logging.getLogger(__name__)
class AlertDispatcher:
def __init__(self, cooldown_seconds: int = 300, max_history: int = 100):
self.cooldown_seconds = cooldown_seconds
self._last_sent: dict[str, datetime] = {}
self._history: deque[dict[str, Any]] = deque(maxlen=max_history)
def dispatch(self, alerts: dict[str, dict[str, Any]]) -> dict[str, Any]:
triggered = {
name: alert for name, alert in alerts.items() if alert.get("triggered")
}
results: dict[str, Any] = {
"triggered_count": len(triggered),
"sent": [],
"suppressed": [],
"failed": [],
"channel": self._channel_name(),
}
for name, alert in triggered.items():
if self._is_suppressed(name):
results["suppressed"].append(name)
self._record_alert(name, alert, delivery_status="suppressed")
continue
try:
self._deliver(name, alert)
self._last_sent[name] = datetime.utcnow()
results["sent"].append(name)
self._record_alert(name, alert, delivery_status="sent")
except Exception as exc:
logger.error("Alert delivery failed for %s: %s", name, exc)
results["failed"].append({"name": name, "error": str(exc)})
self._record_alert(name, alert, delivery_status="failed", error_message=str(exc))
return results
def get_recent_alerts(self, severity: str | None = None, limit: int = 50) -> list[dict[str, Any]]:
alerts = list(self._history)
if severity:
alerts = [alert for alert in alerts if alert["severity"] == severity]
limit = max(limit, 0)
if limit == 0:
return []
return list(reversed(alerts[-limit:]))
def reset_history(self) -> None:
self._history.clear()
def _is_suppressed(self, name: str) -> bool:
last_sent = self._last_sent.get(name)
if last_sent is None:
return False
return datetime.utcnow() - last_sent < timedelta(seconds=self.cooldown_seconds)
def _record_alert(
self,
name: str,
alert: dict[str, Any],
delivery_status: str,
error_message: str | None = None,
) -> None:
timestamp = datetime.utcnow().isoformat()
record = {
"id": f"metrics_alert_{name}_{int(datetime.utcnow().timestamp() * 1000)}",
"deployment_id": None,
"severity": alert.get("status", "critical"),
"message": f"Threshold triggered for {name}",
"timestamp": timestamp,
"resolved": False,
"source": "coordinator_metrics",
"channel": self._channel_name(),
"delivery_status": delivery_status,
"value": alert.get("value"),
"threshold": alert.get("threshold"),
}
if error_message is not None:
record["error"] = error_message
self._history.append(record)
def _deliver(self, name: str, alert: dict[str, Any]) -> None:
webhook_url = os.getenv("AITBC_ALERT_WEBHOOK_URL", "").strip()
payload = {
"name": name,
"status": alert.get("status", "critical"),
"value": alert.get("value"),
"threshold": alert.get("threshold"),
"timestamp": datetime.utcnow().isoformat(),
}
if webhook_url:
body = json.dumps(payload).encode("utf-8")
webhook_request = request.Request(
webhook_url,
data=body,
headers={"Content-Type": "application/json"},
method="POST",
)
try:
with request.urlopen(webhook_request, timeout=5) as response:
if response.status >= 400:
raise RuntimeError(f"Webhook responded with status {response.status}")
except error.URLError as exc:
raise RuntimeError(f"Webhook delivery error: {exc}") from exc
logger.warning("Alert delivered to webhook: %s", name)
return
logger.warning(
"Alert triggered without external webhook configured: %s value=%s threshold=%s",
name,
alert.get("value"),
alert.get("threshold"),
)
def _channel_name(self) -> str:
return "webhook" if os.getenv("AITBC_ALERT_WEBHOOK_URL", "").strip() else "log"
alert_dispatcher = AlertDispatcher()

View File

@@ -12,11 +12,13 @@ logger = logging.getLogger(__name__)
class CacheManager:
"""Simple in-memory cache with TTL support"""
"""Simple in-memory cache with TTL support and memory management"""
def __init__(self):
def __init__(self, max_size: int = 1000, max_memory_mb: int = 100):
self._cache: dict[str, dict[str, Any]] = {}
self._stats = {"hits": 0, "misses": 0, "sets": 0, "evictions": 0}
self.max_size = max_size
self.max_memory_mb = max_memory_mb
def get(self, key: str) -> Any | None:
"""Get value from cache"""
@@ -38,13 +40,21 @@ class CacheManager:
return cache_entry["value"]
def set(self, key: str, value: Any, ttl_seconds: int = 300) -> None:
"""Set value in cache with TTL"""
"""Set value in cache with TTL and enforce size/memory limits"""
# Check size limit
if len(self._cache) >= self.max_size:
self._evict_oldest()
expires_at = datetime.now() + timedelta(seconds=ttl_seconds)
self._cache[key] = {"value": value, "expires_at": expires_at, "created_at": datetime.now(), "ttl": ttl_seconds}
self._stats["sets"] += 1
logger.debug(f"Cache set for key: {key}, TTL: {ttl_seconds}s")
# Check memory limit periodically
if self._stats["sets"] % 100 == 0:
self._check_memory_limit()
def delete(self, key: str) -> bool:
"""Delete key from cache"""
@@ -83,11 +93,42 @@ class CacheManager:
"total_entries": len(self._cache),
"hit_rate_percent": round(hit_rate, 2),
"total_requests": total_requests,
"max_size": self.max_size,
"max_memory_mb": self.max_memory_mb,
}
def _evict_oldest(self) -> None:
    """Remove the entry with the earliest creation time; no-op on an empty cache."""
    if not self._cache:
        return
    # Oldest entry is the one with the smallest created_at timestamp.
    oldest_key, _ = min(self._cache.items(), key=lambda item: item[1]["created_at"])
    self._cache.pop(oldest_key)
    self._stats["evictions"] += 1
    logger.debug(f"Evicted oldest cache entry: {oldest_key}")
def _check_memory_limit(self) -> None:
    """Evict ~20% of entries when the estimated cache footprint exceeds max_memory_mb.

    The estimate sums the shallow sizes of the dict, each key, each entry
    wrapper, and each stored value. Nested containers are not traversed, so
    this is still an approximation — but the previous sys.getsizeof on the
    dict alone only measured its hash table and ignored every cached value,
    so the limit effectively never triggered.
    """
    import gc
    import sys

    estimated_bytes = sys.getsizeof(self._cache)
    for key, entry in self._cache.items():
        estimated_bytes += sys.getsizeof(key) + sys.getsizeof(entry) + sys.getsizeof(entry["value"])
    cache_size_mb = estimated_bytes / (1024 * 1024)
    if cache_size_mb > self.max_memory_mb:
        logger.warning(f"Cache memory limit exceeded ({cache_size_mb:.2f}MB > {self.max_memory_mb}MB), evicting entries")
        # Evict 20% of entries (at least one) to reduce memory pressure.
        evict_count = max(1, int(len(self._cache) * 0.2))
        for _ in range(evict_count):
            self._evict_oldest()
        # Reclaim the evicted objects promptly.
        gc.collect()
# Global cache manager instance
cache_manager = CacheManager()
# Global cache manager instance with optimized settings
cache_manager = CacheManager(max_size=1000, max_memory_mb=100)
def cache_key_generator(*args, **kwargs) -> str:

View File

@@ -0,0 +1,181 @@
"""
Basic Metrics Collection Module
Collects and tracks system and application metrics for monitoring
"""
import logging
import os
import resource
from datetime import datetime
from typing import Any
logger = logging.getLogger(__name__)
class MetricsCollector:
"""Basic metrics collection for system and application monitoring"""
def __init__(self):
self._metrics: dict[str, Any] = {
"api_requests": 0,
"api_errors": 0,
"api_response_times": [],
"database_queries": 0,
"database_errors": 0,
"cache_hits": 0,
"cache_misses": 0,
"active_connections": 0,
"memory_usage_mb": 0,
"cpu_usage_percent": 0.0,
}
self._start_time = datetime.utcnow()
def increment_api_requests(self) -> None:
"""Increment API request counter"""
self._metrics["api_requests"] += 1
def increment_api_errors(self) -> None:
"""Increment API error counter"""
self._metrics["api_errors"] += 1
def record_api_response_time(self, response_time: float) -> None:
"""Record API response time"""
self._metrics["api_response_times"].append(response_time)
# Keep only last 100 response times
if len(self._metrics["api_response_times"]) > 100:
self._metrics["api_response_times"] = self._metrics["api_response_times"][-100:]
def increment_database_queries(self) -> None:
"""Increment database query counter"""
self._metrics["database_queries"] += 1
def increment_database_errors(self) -> None:
"""Increment database error counter"""
self._metrics["database_errors"] += 1
def increment_cache_hits(self) -> None:
"""Increment cache hit counter"""
self._metrics["cache_hits"] += 1
def increment_cache_misses(self) -> None:
"""Increment cache miss counter"""
self._metrics["cache_misses"] += 1
def update_active_connections(self, count: int) -> None:
"""Update active connections count"""
self._metrics["active_connections"] = count
def update_memory_usage(self, usage_mb: float) -> None:
"""Update memory usage"""
self._metrics["memory_usage_mb"] = usage_mb
def update_cpu_usage(self, usage_percent: float) -> None:
"""Update CPU usage percentage"""
self._metrics["cpu_usage_percent"] = usage_percent
def update_cache_stats(self, cache_stats: dict[str, Any]) -> None:
"""Update cache metrics from cache manager stats"""
self._metrics["cache_hits"] = cache_stats.get("hits", 0)
self._metrics["cache_misses"] = cache_stats.get("misses", 0)
def capture_system_snapshot(self) -> None:
"""Capture a lightweight system resource snapshot"""
memory_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
self._metrics["memory_usage_mb"] = round(memory_kb / 1024, 2)
load_average = os.getloadavg()[0] if hasattr(os, "getloadavg") else 0.0
cpu_estimate = min(round(load_average * 100, 2), 100.0)
self._metrics["cpu_usage_percent"] = cpu_estimate
def get_metrics(self) -> dict[str, Any]:
"""Get current metrics"""
self.capture_system_snapshot()
avg_response_time = 0.0
if self._metrics["api_response_times"]:
avg_response_time = sum(self._metrics["api_response_times"]) / len(self._metrics["api_response_times"])
cache_hit_rate = 0.0
total_cache_ops = self._metrics["cache_hits"] + self._metrics["cache_misses"]
if total_cache_ops > 0:
cache_hit_rate = (self._metrics["cache_hits"] / total_cache_ops) * 100
error_rate = 0.0
if self._metrics["api_requests"] > 0:
error_rate = (self._metrics["api_errors"] / self._metrics["api_requests"]) * 100
uptime_seconds = (datetime.utcnow() - self._start_time).total_seconds()
return {
**self._metrics,
"avg_response_time_ms": avg_response_time * 1000,
"cache_hit_rate_percent": cache_hit_rate,
"error_rate_percent": error_rate,
"alerts": self.get_alert_states(),
"uptime_seconds": uptime_seconds,
"uptime_formatted": self._format_uptime(uptime_seconds),
"timestamp": datetime.utcnow().isoformat(),
}
def _format_uptime(self, seconds: float) -> str:
"""Format uptime in human-readable format"""
days = int(seconds // 86400)
hours = int((seconds % 86400) // 3600)
minutes = int((seconds % 3600) // 60)
return f"{days}d {hours}h {minutes}m"
def get_alert_states(self) -> dict[str, dict[str, str | float | bool]]:
"""Evaluate alert thresholds for key metrics"""
avg_response_time_ms = 0.0
if self._metrics["api_response_times"]:
avg_response_time_ms = (sum(self._metrics["api_response_times"]) / len(self._metrics["api_response_times"])) * 1000
total_cache_ops = self._metrics["cache_hits"] + self._metrics["cache_misses"]
cache_hit_rate = (self._metrics["cache_hits"] / total_cache_ops * 100) if total_cache_ops > 0 else 0.0
error_rate = (self._metrics["api_errors"] / self._metrics["api_requests"] * 100) if self._metrics["api_requests"] > 0 else 0.0
memory_percent_estimate = min((self._metrics["memory_usage_mb"] / 1024) * 100, 100.0)
return {
"error_rate": {"triggered": error_rate > 1.0, "value": round(error_rate, 2), "threshold": 1.0, "status": "critical" if error_rate > 1.0 else "ok"},
"avg_response_time": {"triggered": avg_response_time_ms > 500.0, "value": round(avg_response_time_ms, 2), "threshold": 500.0, "status": "critical" if avg_response_time_ms > 500.0 else "ok"},
"memory_usage": {"triggered": memory_percent_estimate > 90.0, "value": round(memory_percent_estimate, 2), "threshold": 90.0, "status": "critical" if memory_percent_estimate > 90.0 else "ok"},
"cache_hit_rate": {"triggered": total_cache_ops > 0 and cache_hit_rate < 70.0, "value": round(cache_hit_rate, 2), "threshold": 70.0, "status": "critical" if total_cache_ops > 0 and cache_hit_rate < 70.0 else "ok"},
}
def reset_metrics(self) -> None:
"""Reset all metrics"""
self._metrics = {
"api_requests": 0,
"api_errors": 0,
"api_response_times": [],
"database_queries": 0,
"database_errors": 0,
"cache_hits": 0,
"cache_misses": 0,
"active_connections": 0,
"memory_usage_mb": 0,
"cpu_usage_percent": 0.0,
}
self._start_time = datetime.utcnow()
# Global metrics collector instance
# Shared module-level singleton; the module-level helper functions delegate to it.
metrics_collector = MetricsCollector()
def build_live_metrics_payload(
    cache_stats: dict[str, Any],
    dispatcher: Any | None = None,
    collector: MetricsCollector | None = None,
) -> dict[str, Any]:
    """Assemble a live metrics payload from current cache stats.

    Falls back to the module-level collector when *collector* is not given.
    When a *dispatcher* is supplied, its dispatch() result for the payload's
    alert states is attached under "alert_delivery".
    """
    target = collector or metrics_collector
    target.update_cache_stats(cache_stats)
    payload = target.get_metrics()
    if dispatcher is not None:
        payload["alert_delivery"] = dispatcher.dispatch(payload.get("alerts", {}))
    return payload
def get_metrics() -> dict[str, Any]:
    """Get current metrics from global collector.

    Thin module-level convenience wrapper; note the underlying call also
    refreshes the system resource snapshot.
    """
    return metrics_collector.get_metrics()
def reset_metrics() -> None:
    """Reset global metrics collector.

    Thin module-level convenience wrapper; zeroes every counter and restarts
    the global collector's uptime clock.
    """
    metrics_collector.reset_metrics()