feat: add foreign key constraints and metrics for blockchain node

This commit is contained in:
oib
2025-09-28 06:04:30 +02:00
parent c1926136fb
commit fb60505cdf
189 changed files with 15678 additions and 158 deletions

View File

@ -0,0 +1,43 @@
# Blockchain Node Observability
This directory contains Prometheus and Grafana assets for the devnet environment. The stack relies on the HTTP `/metrics` endpoint exposed by:
1. The blockchain node API (`http://127.0.0.1:8080/metrics`).
2. The mock coordinator/miner exporter (`http://127.0.0.1:8090/metrics`).
## Files
- `prometheus.yml` Scrapes both blockchain node and mock coordinator/miner metrics.
- `grafana-dashboard.json` Panels for block interval, RPC throughput, miner activity, coordinator receipt flow, **plus new gossip queue, subscriber, and publication rate panels**.
- `alerts.yml` Prometheus alerting rules (loaded through `rule_files` in `prometheus.yml` and delivered via Alertmanager) highlighting proposer stalls, miner errors, and coordinator receipt drop-offs.
- `gossip-recording-rules.yml` Prometheus recording rules that derive queue/subscriber gauges and publication rates from gossip metrics.
## Usage
```bash
# Launch Prometheus using the sample config
prometheus --config.file=apps/blockchain-node/observability/prometheus.yml
# Import the dashboard JSON into Grafana
grafana-cli dashboards import apps/blockchain-node/observability/grafana-dashboard.json
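# Run Alertmanager with its own configuration file.
# Note: alerts.yml is a Prometheus rule file (Prometheus loads it via rule_files in prometheus.yml); it is not an Alertmanager config.
alertmanager --config.file=/path/to/alertmanager.yml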
# Reload Prometheus and Alertmanager after tuning thresholds
kill -HUP $(pgrep prometheus)
kill -HUP $(pgrep alertmanager)
```
> **Tip:** The devnet helper `scripts/devnet_up.sh` seeds the metrics endpoints. After running it, both scrape targets will begin emitting data in under a minute.
## Gossip Observability
Recent updates instrumented the gossip broker with Prometheus counters and gauges. Key metrics surfaced via the recording rules and dashboard include:
- `gossip_publications_rate_per_sec` and `gossip_broadcast_publications_rate_per_sec` per-second publication throughput for in-memory and broadcast backends.
- `gossip_publications_topic_rate_per_sec` topic-level publication rate time series (Grafana panel “Gossip Publication Rate by Topic”).
- `gossip_queue_size_by_topic` instantaneous queue depth per topic (“Gossip Queue Depth by Topic”).
- `gossip_subscribers_by_topic`, `gossip_subscribers_total`, `gossip_broadcast_subscribers_total` subscriber counts (“Gossip Subscriber Counts”).
Use these panels to monitor convergence/back-pressure during load tests (for example with `scripts/ws_load_test.py`) when running against a Redis-backed broadcast backend.

View File

@ -0,0 +1,43 @@
# Prometheus alerting rules for the devnet blockchain node and the mock
# coordinator/miner exporter. Loaded by Prometheus through `rule_files`.
groups:
  - name: blockchain-node
    rules:
      # Windowed average block interval. Dividing rate(_sum) by rate(_count)
      # yields the mean over the last 5 minutes; the raw _sum / _count ratio
      # is the average since process start and barely moves during a stall.
      - alert: BlockProposalStalled
        expr: |
          (
            rate(block_interval_seconds_sum[5m])
            /
            rate(block_interval_seconds_count[5m])
          ) > 5
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Block production interval exceeded 5s"
          description: |
            Average block interval is {{ $value }} seconds, exceeding the expected cadence.

      # Fires on very slow production OR on a complete halt. When no
      # observations arrive the ratio above is NaN (0/0) and can never exceed
      # a threshold, so the halt case needs its own clause.
      # NOTE(review): assumes block_interval_seconds is observed once per
      # produced block - confirm against the node's instrumentation.
      - alert: BlockProposalDown
        expr: |
          (
            rate(block_interval_seconds_sum[2m])
            /
            rate(block_interval_seconds_count[2m])
          ) > 10
          or
          rate(block_interval_seconds_count[2m]) == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Block production halted"
          description: |
            Block intervals have spiked above 10 seconds for more than two minutes,
            or no blocks were observed at all during that time.
            Check proposer loop and database state.

      # Any non-zero value of the miner error gauge sustained for a minute.
      - alert: MinerErrorsDetected
        expr: miner_error_rate > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Miner mock reporting errors"
          description: |
            The miner mock error gauge is {{ $value }}. Investigate miner telemetry.

      # The attestation counter has not increased within the rate window.
      - alert: CoordinatorReceiptDrop
        expr: rate(miner_receipts_attested_total[5m]) == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "No receipts attested in 5 minutes"
          description: |
            Receipt attestations ceased during the last five minutes. Inspect coordinator connectivity.

View File

@ -0,0 +1,36 @@
# Prometheus recording rules that derive per-topic gossip series (queue depth,
# subscriber counts, publication rates) from the broker's raw metrics.
groups:
  - name: gossip_metrics
    interval: 15s
    rules:
      # Overall publication throughput for the two backends.
      - record: gossip_publications_rate_per_sec
        expr: rate(gossip_publications_total[1m])
      - record: gossip_broadcast_publications_rate_per_sec
        expr: rate(gossip_broadcast_publications_total[1m])

      # Per-topic publication rate.
      # The topic label must be extracted BEFORE rate(): rate() drops
      # __name__, so a label_replace wrapped around rate() sees an empty
      # metric name, and several topic counters collapse into identical label
      # sets (a rule evaluation error). The subquery `[1m:]` lets rate() run
      # over the relabelled series.
      # The recorded series itself matches the name regex, so it is excluded
      # explicitly to avoid feeding the rule its own output.
      - record: gossip_publications_topic_rate_per_sec
        expr: |
          rate(
            label_replace(
              {
                __name__=~"gossip_publications_topic_.*",
                __name__!="gossip_publications_topic_rate_per_sec"
              },
              "topic",
              "$1",
              "__name__",
              "gossip_publications_topic_(.*)"
            )[1m:]
          )

      # Per-topic queue depth. The output name gossip_queue_size_by_topic
      # also matches gossip_queue_size_.*, so it is excluded; otherwise every
      # previously recorded series is relabelled topic="by_topic" and the
      # rule fails with duplicate label sets.
      - record: gossip_queue_size_by_topic
        expr: |
          label_replace(
            {
              __name__=~"gossip_queue_size_.*",
              __name__!="gossip_queue_size_by_topic"
            },
            "topic",
            "$1",
            "__name__",
            "gossip_queue_size_(.*)"
          )

      # Per-topic subscriber counts. The output name does not match the
      # input regex, so no exclusion is needed here.
      - record: gossip_subscribers_by_topic
        expr: |
          label_replace(
            {__name__=~"gossip_subscribers_topic_.*"},
            "topic",
            "$1",
            "__name__",
            "gossip_subscribers_topic_(.*)"
          )

View File

@ -0,0 +1,377 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "AITBC devnet observability for blockchain node, coordinator, and miner mock.",
"editable": true,
"fiscalYearStartMonth": 0,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1727420700000,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "PROMETHEUS_DS"
},
"fieldConfig": {
"defaults": {
"custom": {},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
}
},
"targets": [
{
"expr": "block_interval_seconds_sum / block_interval_seconds_count",
"legendFormat": "avg block interval",
"refId": "A"
}
],
"title": "Block Interval (seconds)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PROMETHEUS_DS"
},
"fieldConfig": {
"defaults": {
"custom": {},
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"legend": {
"calcs": ["lastNotNull"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
}
},
"targets": [
{
"expr": "rate(rpc_send_tx_total[5m])",
"legendFormat": "sendTx",
"refId": "A"
},
{
"expr": "rate(rpc_submit_receipt_total[5m])",
"legendFormat": "submitReceipt",
"refId": "B"
},
{
"expr": "rate(rpc_get_head_total[5m])",
"legendFormat": "getHead",
"refId": "C"
}
],
"title": "RPC Throughput",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PROMETHEUS_DS"
},
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 3,
"options": {
"legend": {
"calcs": ["lastNotNull"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
}
},
"targets": [
{
"expr": "miner_active_jobs",
"legendFormat": "active jobs",
"refId": "A"
},
{
"expr": "miner_error_rate",
"legendFormat": "error gauge",
"refId": "B"
}
],
"title": "Miner Activity",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PROMETHEUS_DS"
},
"fieldConfig": {
"defaults": {
"custom": {},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"id": 4,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
}
},
"targets": [
{
"expr": "rate(miner_receipts_attested_total[5m])",
"legendFormat": "receipts attested",
"refId": "A"
},
{
"expr": "rate(miner_receipts_unknown_total[5m])",
"legendFormat": "unknown receipts",
"refId": "B"
}
],
"title": "Coordinator Receipt Flow",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PROMETHEUS_DS"
},
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 5,
"options": {
"legend": {
"calcs": ["lastNotNull"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
}
},
"targets": [
{
"expr": "gossip_queue_size_by_topic",
"legendFormat": "{{topic}}",
"refId": "A"
}
],
"title": "Gossip Queue Depth by Topic",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PROMETHEUS_DS"
},
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 6,
"options": {
"legend": {
"calcs": ["lastNotNull"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
}
},
"targets": [
{
"expr": "gossip_subscribers_by_topic",
"legendFormat": "{{topic}}",
"refId": "A"
},
{
"expr": "gossip_subscribers_total",
"legendFormat": "total subscribers",
"refId": "B"
},
{
"expr": "gossip_broadcast_subscribers_total",
"legendFormat": "broadcast subscribers",
"refId": "C"
}
],
"title": "Gossip Subscriber Counts",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PROMETHEUS_DS"
},
"fieldConfig": {
"defaults": {
"custom": {},
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"id": 7,
"options": {
"legend": {
"calcs": ["lastNotNull"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
}
},
"targets": [
{
"expr": "gossip_publications_rate_per_sec",
"legendFormat": "memory backend",
"refId": "A"
},
{
"expr": "gossip_broadcast_publications_rate_per_sec",
"legendFormat": "broadcast backend",
"refId": "B"
}
],
"title": "Gossip Publication Rate (total)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PROMETHEUS_DS"
},
"fieldConfig": {
"defaults": {
"custom": {},
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 24
},
"id": 8,
"options": {
"legend": {
"calcs": ["lastNotNull"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
}
},
"targets": [
{
"expr": "gossip_publications_topic_rate_per_sec",
"legendFormat": "{{topic}}",
"refId": "A"
}
],
"title": "Gossip Publication Rate by Topic",
"type": "timeseries"
}
],
"refresh": "10s",
"schemaVersion": 39,
"style": "dark",
"tags": [
"aitbc",
"blockchain-node"
],
"templating": {
"list": []
},
"time": {
"from": "now-30m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "AITBC Blockchain Node",
"uid": null,
"version": 1,
"weekStart": ""
}

View File

@ -0,0 +1,28 @@
# Sample Prometheus configuration for the devnet observability stack.
global:
  # How often targets are scraped and how often rules are evaluated.
  scrape_interval: 5s
  evaluation_interval: 10s

# Rule files, given relative to this configuration file.
rule_files:
  - 'alerts.yml'
  - 'gossip-recording-rules.yml'

# Alertmanager instance that receives fired alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['127.0.0.1:9093']

# Scrape targets: the blockchain node API and the mock coordinator/miner
# exporter.
scrape_configs:
  - job_name: 'blockchain-node'
    static_configs:
      - targets: ['127.0.0.1:8080']
        labels:
          service: 'blockchain-node'

  - job_name: 'mock-coordinator'
    static_configs:
      - targets: ['127.0.0.1:8090']
        labels:
          service: 'mock-coordinator'