docs: update CLI command syntax across workflow documentation
Some checks failed
CLI Tests / test-cli (push) Has been cancelled
Security Scanning / security-scan (push) Has been cancelled
Integration Tests / test-service-integration (push) Has been cancelled
Python Tests / test-python (push) Has been cancelled
Documentation Validation / validate-docs (push) Has been cancelled
API Endpoint Tests / test-api-endpoints (push) Has been cancelled
- Updated marketplace commands: `marketplace --action` → `market` subcommands (see the before/after sketch below)
- Updated wallet commands: direct flags → `wallet` subcommands
- Updated AI commands: `ai-submit`, `ai-status` → `ai submit`, `ai status`
- Updated blockchain commands: `chain` → `blockchain info`
- Standardized command structure across all workflow files
- Affected files: MULTI_NODE_MASTER_INDEX.md, TEST_MASTER_INDEX.md, multi-node-blockchain-marketplace
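The pattern throughout is flag-style invocations becoming grouped subcommands. An illustrative before/after mapping — the CLI entry point and concrete arguments are placeholders, not taken from the updated docs themselves:

```
# before (flag-style)               # after (subcommand-style)
<cli> marketplace --action <op>     <cli> market <op>
<cli> --<wallet-flag> ...           <cli> wallet <subcommand>
<cli> ai-submit <args>              <cli> ai submit <args>
<cli> ai-status <args>              <cli> ai status <args>
<cli> chain                         <cli> blockchain info
```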
apps/coordinator-api/src/app/utils/alerting.py (new file, 129 lines)
@@ -0,0 +1,129 @@
```python
import json
import logging
import os
from collections import deque
from datetime import datetime, timedelta
from typing import Any
from urllib import error, request

logger = logging.getLogger(__name__)


class AlertDispatcher:
    def __init__(self, cooldown_seconds: int = 300, max_history: int = 100):
        self.cooldown_seconds = cooldown_seconds
        self._last_sent: dict[str, datetime] = {}
        self._history: deque[dict[str, Any]] = deque(maxlen=max_history)

    def dispatch(self, alerts: dict[str, dict[str, Any]]) -> dict[str, Any]:
        triggered = {
            name: alert for name, alert in alerts.items() if alert.get("triggered")
        }
        results: dict[str, Any] = {
            "triggered_count": len(triggered),
            "sent": [],
            "suppressed": [],
            "failed": [],
            "channel": self._channel_name(),
        }

        for name, alert in triggered.items():
            if self._is_suppressed(name):
                results["suppressed"].append(name)
                self._record_alert(name, alert, delivery_status="suppressed")
                continue

            try:
                self._deliver(name, alert)
                self._last_sent[name] = datetime.utcnow()
                results["sent"].append(name)
                self._record_alert(name, alert, delivery_status="sent")
            except Exception as exc:
                logger.error("Alert delivery failed for %s: %s", name, exc)
                results["failed"].append({"name": name, "error": str(exc)})
                self._record_alert(name, alert, delivery_status="failed", error_message=str(exc))

        return results

    def get_recent_alerts(self, severity: str | None = None, limit: int = 50) -> list[dict[str, Any]]:
        alerts = list(self._history)
        if severity:
            alerts = [alert for alert in alerts if alert["severity"] == severity]
        limit = max(limit, 0)
        if limit == 0:
            return []
        return list(reversed(alerts[-limit:]))

    def reset_history(self) -> None:
        self._history.clear()

    def _is_suppressed(self, name: str) -> bool:
        last_sent = self._last_sent.get(name)
        if last_sent is None:
            return False
        return datetime.utcnow() - last_sent < timedelta(seconds=self.cooldown_seconds)

    def _record_alert(
        self,
        name: str,
        alert: dict[str, Any],
        delivery_status: str,
        error_message: str | None = None,
    ) -> None:
        timestamp = datetime.utcnow().isoformat()
        record = {
            "id": f"metrics_alert_{name}_{int(datetime.utcnow().timestamp() * 1000)}",
            "deployment_id": None,
            "severity": alert.get("status", "critical"),
            "message": f"Threshold triggered for {name}",
            "timestamp": timestamp,
            "resolved": False,
            "source": "coordinator_metrics",
            "channel": self._channel_name(),
            "delivery_status": delivery_status,
            "value": alert.get("value"),
            "threshold": alert.get("threshold"),
        }
        if error_message is not None:
            record["error"] = error_message
        self._history.append(record)

    def _deliver(self, name: str, alert: dict[str, Any]) -> None:
        webhook_url = os.getenv("AITBC_ALERT_WEBHOOK_URL", "").strip()
        payload = {
            "name": name,
            "status": alert.get("status", "critical"),
            "value": alert.get("value"),
            "threshold": alert.get("threshold"),
            "timestamp": datetime.utcnow().isoformat(),
        }

        if webhook_url:
            body = json.dumps(payload).encode("utf-8")
            webhook_request = request.Request(
                webhook_url,
                data=body,
                headers={"Content-Type": "application/json"},
                method="POST",
            )
            try:
                with request.urlopen(webhook_request, timeout=5) as response:
                    if response.status >= 400:
                        raise RuntimeError(f"Webhook responded with status {response.status}")
            except error.URLError as exc:
                raise RuntimeError(f"Webhook delivery error: {exc}") from exc
            logger.warning("Alert delivered to webhook: %s", name)
            return

        logger.warning(
            "Alert triggered without external webhook configured: %s value=%s threshold=%s",
            name,
            alert.get("value"),
            alert.get("threshold"),
        )

    def _channel_name(self) -> str:
        return "webhook" if os.getenv("AITBC_ALERT_WEBHOOK_URL", "").strip() else "log"


alert_dispatcher = AlertDispatcher()
```
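A minimal usage sketch (not part of the commit; the import path is an assumption based on the file location). With `AITBC_ALERT_WEBHOOK_URL` unset, delivery falls back to the log channel, and a repeat dispatch inside the cooldown window is suppressed:

```python
from app.utils.alerting import AlertDispatcher  # path assumed from the diff above

dispatcher = AlertDispatcher(cooldown_seconds=60)
result = dispatcher.dispatch({
    # Alert dicts follow the shape produced by MetricsCollector.get_alert_states()
    "error_rate": {"triggered": True, "status": "critical", "value": 2.5, "threshold": 1.0},
    "cache_hit_rate": {"triggered": False, "status": "ok", "value": 95.0, "threshold": 70.0},
})
# Only the triggered alert is delivered; with no webhook configured it is logged:
# {"triggered_count": 1, "sent": ["error_rate"], "suppressed": [], "failed": [], "channel": "log"}

result = dispatcher.dispatch({
    "error_rate": {"triggered": True, "status": "critical", "value": 3.0, "threshold": 1.0},
})
# Within the 60 s cooldown the same alert name is suppressed, not re-sent:
# {"triggered_count": 1, "sent": [], "suppressed": ["error_rate"], "failed": [], "channel": "log"}
```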
```diff
@@ -12,11 +12,13 @@ logger = logging.getLogger(__name__)
 
 
 class CacheManager:
-    """Simple in-memory cache with TTL support"""
+    """Simple in-memory cache with TTL support and memory management"""
 
-    def __init__(self):
+    def __init__(self, max_size: int = 1000, max_memory_mb: int = 100):
         self._cache: dict[str, dict[str, Any]] = {}
         self._stats = {"hits": 0, "misses": 0, "sets": 0, "evictions": 0}
+        self.max_size = max_size
+        self.max_memory_mb = max_memory_mb
 
     def get(self, key: str) -> Any | None:
         """Get value from cache"""
@@ -38,13 +40,21 @@ class CacheManager:
         return cache_entry["value"]
 
     def set(self, key: str, value: Any, ttl_seconds: int = 300) -> None:
-        """Set value in cache with TTL"""
+        """Set value in cache with TTL and enforce size/memory limits"""
+        # Check size limit
+        if len(self._cache) >= self.max_size:
+            self._evict_oldest()
+
         expires_at = datetime.now() + timedelta(seconds=ttl_seconds)
 
         self._cache[key] = {"value": value, "expires_at": expires_at, "created_at": datetime.now(), "ttl": ttl_seconds}
 
         self._stats["sets"] += 1
         logger.debug(f"Cache set for key: {key}, TTL: {ttl_seconds}s")
 
+        # Check memory limit periodically
+        if self._stats["sets"] % 100 == 0:
+            self._check_memory_limit()
+
     def delete(self, key: str) -> bool:
         """Delete key from cache"""
@@ -83,11 +93,42 @@ class CacheManager:
             "total_entries": len(self._cache),
             "hit_rate_percent": round(hit_rate, 2),
             "total_requests": total_requests,
+            "max_size": self.max_size,
+            "max_memory_mb": self.max_memory_mb,
         }
 
+    def _evict_oldest(self) -> None:
+        """Evict the oldest cache entry"""
+        if not self._cache:
+            return
+
+        # Find oldest entry by created_at timestamp
+        oldest_key = min(self._cache.keys(), key=lambda k: self._cache[k]["created_at"])
+        del self._cache[oldest_key]
+        self._stats["evictions"] += 1
+        logger.debug(f"Evicted oldest cache entry: {oldest_key}")
+
+    def _check_memory_limit(self) -> None:
+        """Check if cache exceeds memory limit and evict if needed"""
+        import sys
+        import gc
+
+        # Estimate cache memory usage (rough approximation)
+        cache_size_mb = sys.getsizeof(self._cache) / (1024 * 1024)
+
+        if cache_size_mb > self.max_memory_mb:
+            logger.warning(f"Cache memory limit exceeded ({cache_size_mb:.2f}MB > {self.max_memory_mb}MB), evicting entries")
+            # Evict 20% of entries to reduce memory
+            evict_count = max(1, int(len(self._cache) * 0.2))
+            for _ in range(evict_count):
+                self._evict_oldest()
+
+            # Force garbage collection
+            gc.collect()
+
 
-# Global cache manager instance
-cache_manager = CacheManager()
+# Global cache manager instance with optimized settings
+cache_manager = CacheManager(max_size=1000, max_memory_mb=100)
 
 
 def cache_key_generator(*args, **kwargs) -> str:
```
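A sketch of the new eviction behaviour (import path assumed; `get()` returning None on a miss is inferred from its `Any | None` signature). With `max_size=2`, the third `set()` evicts the entry with the oldest `created_at` before inserting:

```python
from app.utils.cache import CacheManager  # path assumed

cache = CacheManager(max_size=2, max_memory_mb=10)
cache.set("a", 1)
cache.set("b", 2)
cache.set("c", 3)   # len(_cache) >= max_size, so _evict_oldest() drops "a" first

assert cache.get("a") is None       # evicted
assert cache.get("c") == 3
print(cache.get_stats()["total_entries"])  # 2
```

Note that `sys.getsizeof()` in `_check_memory_limit` measures only the dict's own footprint, not the cached values, so the memory cap is a coarse heuristic — the diff's own "rough approximation" comment acknowledges this.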
apps/coordinator-api/src/app/utils/metrics.py (new file, 181 lines)
@@ -0,0 +1,181 @@
```python
"""
Basic Metrics Collection Module
Collects and tracks system and application metrics for monitoring
"""

import logging
import os
import resource
from datetime import datetime
from typing import Any

logger = logging.getLogger(__name__)


class MetricsCollector:
    """Basic metrics collection for system and application monitoring"""

    def __init__(self):
        self._metrics: dict[str, Any] = {
            "api_requests": 0,
            "api_errors": 0,
            "api_response_times": [],
            "database_queries": 0,
            "database_errors": 0,
            "cache_hits": 0,
            "cache_misses": 0,
            "active_connections": 0,
            "memory_usage_mb": 0,
            "cpu_usage_percent": 0.0,
        }
        self._start_time = datetime.utcnow()

    def increment_api_requests(self) -> None:
        """Increment API request counter"""
        self._metrics["api_requests"] += 1

    def increment_api_errors(self) -> None:
        """Increment API error counter"""
        self._metrics["api_errors"] += 1

    def record_api_response_time(self, response_time: float) -> None:
        """Record API response time"""
        self._metrics["api_response_times"].append(response_time)
        # Keep only last 100 response times
        if len(self._metrics["api_response_times"]) > 100:
            self._metrics["api_response_times"] = self._metrics["api_response_times"][-100:]

    def increment_database_queries(self) -> None:
        """Increment database query counter"""
        self._metrics["database_queries"] += 1

    def increment_database_errors(self) -> None:
        """Increment database error counter"""
        self._metrics["database_errors"] += 1

    def increment_cache_hits(self) -> None:
        """Increment cache hit counter"""
        self._metrics["cache_hits"] += 1

    def increment_cache_misses(self) -> None:
        """Increment cache miss counter"""
        self._metrics["cache_misses"] += 1

    def update_active_connections(self, count: int) -> None:
        """Update active connections count"""
        self._metrics["active_connections"] = count

    def update_memory_usage(self, usage_mb: float) -> None:
        """Update memory usage"""
        self._metrics["memory_usage_mb"] = usage_mb

    def update_cpu_usage(self, usage_percent: float) -> None:
        """Update CPU usage percentage"""
        self._metrics["cpu_usage_percent"] = usage_percent

    def update_cache_stats(self, cache_stats: dict[str, Any]) -> None:
        """Update cache metrics from cache manager stats"""
        self._metrics["cache_hits"] = cache_stats.get("hits", 0)
        self._metrics["cache_misses"] = cache_stats.get("misses", 0)

    def capture_system_snapshot(self) -> None:
        """Capture a lightweight system resource snapshot"""
        memory_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        self._metrics["memory_usage_mb"] = round(memory_kb / 1024, 2)
        load_average = os.getloadavg()[0] if hasattr(os, "getloadavg") else 0.0
        cpu_estimate = min(round(load_average * 100, 2), 100.0)
        self._metrics["cpu_usage_percent"] = cpu_estimate

    def get_metrics(self) -> dict[str, Any]:
        """Get current metrics"""
        self.capture_system_snapshot()
        avg_response_time = 0.0
        if self._metrics["api_response_times"]:
            avg_response_time = sum(self._metrics["api_response_times"]) / len(self._metrics["api_response_times"])

        cache_hit_rate = 0.0
        total_cache_ops = self._metrics["cache_hits"] + self._metrics["cache_misses"]
        if total_cache_ops > 0:
            cache_hit_rate = (self._metrics["cache_hits"] / total_cache_ops) * 100

        error_rate = 0.0
        if self._metrics["api_requests"] > 0:
            error_rate = (self._metrics["api_errors"] / self._metrics["api_requests"]) * 100

        uptime_seconds = (datetime.utcnow() - self._start_time).total_seconds()

        return {
            **self._metrics,
            "avg_response_time_ms": avg_response_time * 1000,
            "cache_hit_rate_percent": cache_hit_rate,
            "error_rate_percent": error_rate,
            "alerts": self.get_alert_states(),
            "uptime_seconds": uptime_seconds,
            "uptime_formatted": self._format_uptime(uptime_seconds),
            "timestamp": datetime.utcnow().isoformat(),
        }

    def _format_uptime(self, seconds: float) -> str:
        """Format uptime in human-readable format"""
        days = int(seconds // 86400)
        hours = int((seconds % 86400) // 3600)
        minutes = int((seconds % 3600) // 60)
        return f"{days}d {hours}h {minutes}m"

    def get_alert_states(self) -> dict[str, dict[str, str | float | bool]]:
        """Evaluate alert thresholds for key metrics"""
        avg_response_time_ms = 0.0
        if self._metrics["api_response_times"]:
            avg_response_time_ms = (sum(self._metrics["api_response_times"]) / len(self._metrics["api_response_times"])) * 1000

        total_cache_ops = self._metrics["cache_hits"] + self._metrics["cache_misses"]
        cache_hit_rate = (self._metrics["cache_hits"] / total_cache_ops * 100) if total_cache_ops > 0 else 0.0
        error_rate = (self._metrics["api_errors"] / self._metrics["api_requests"] * 100) if self._metrics["api_requests"] > 0 else 0.0
        memory_percent_estimate = min((self._metrics["memory_usage_mb"] / 1024) * 100, 100.0)

        return {
            "error_rate": {"triggered": error_rate > 1.0, "value": round(error_rate, 2), "threshold": 1.0, "status": "critical" if error_rate > 1.0 else "ok"},
            "avg_response_time": {"triggered": avg_response_time_ms > 500.0, "value": round(avg_response_time_ms, 2), "threshold": 500.0, "status": "critical" if avg_response_time_ms > 500.0 else "ok"},
            "memory_usage": {"triggered": memory_percent_estimate > 90.0, "value": round(memory_percent_estimate, 2), "threshold": 90.0, "status": "critical" if memory_percent_estimate > 90.0 else "ok"},
            "cache_hit_rate": {"triggered": total_cache_ops > 0 and cache_hit_rate < 70.0, "value": round(cache_hit_rate, 2), "threshold": 70.0, "status": "critical" if total_cache_ops > 0 and cache_hit_rate < 70.0 else "ok"},
        }

    def reset_metrics(self) -> None:
        """Reset all metrics"""
        self._metrics = {
            "api_requests": 0,
            "api_errors": 0,
            "api_response_times": [],
            "database_queries": 0,
            "database_errors": 0,
            "cache_hits": 0,
            "cache_misses": 0,
            "active_connections": 0,
            "memory_usage_mb": 0,
            "cpu_usage_percent": 0.0,
        }
        self._start_time = datetime.utcnow()


# Global metrics collector instance
metrics_collector = MetricsCollector()


def build_live_metrics_payload(
    cache_stats: dict[str, Any],
    dispatcher: Any | None = None,
    collector: MetricsCollector | None = None,
) -> dict[str, Any]:
    active_collector = collector or metrics_collector
    active_collector.update_cache_stats(cache_stats)
    metrics = active_collector.get_metrics()
    if dispatcher is not None:
        metrics["alert_delivery"] = dispatcher.dispatch(metrics.get("alerts", {}))
    return metrics


def get_metrics() -> dict[str, Any]:
    """Get current metrics from global collector"""
    return metrics_collector.get_metrics()


def reset_metrics() -> None:
    """Reset global metrics collector"""
    metrics_collector.reset_metrics()
```
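A wiring sketch for the three pieces together, as a metrics endpoint handler might call them (import paths assumed; `cache_manager.get_stats()` is assumed to expose the raw `hits`/`misses` counters that `update_cache_stats()` reads, which otherwise default to 0). Note that `get_alert_states()` treats 1024 MB as 100% for its memory estimate:

```python
from app.utils.alerting import alert_dispatcher   # paths assumed from the diffs above
from app.utils.cache import cache_manager
from app.utils.metrics import build_live_metrics_payload

payload = build_live_metrics_payload(
    cache_manager.get_stats(),     # syncs cache_hits / cache_misses into the collector
    dispatcher=alert_dispatcher,   # any triggered alerts are dispatched immediately
)
print(payload["error_rate_percent"], payload["uptime_formatted"])
print(payload["alert_delivery"]["triggered_count"])  # present because a dispatcher was passed
```

Passing `dispatcher=None` skips delivery, which keeps threshold evaluation (the `alerts` key) available without side effects.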