feat(chain_sync): add retry logic with exponential backoff for RPC connections and block imports
- Add retry mechanism with exponential backoff for block broadcast loop
- Implement max 5 retries with 2s base delay for RPC connection failures
- Reset retry count on successful connection
- Add 20s wait period after max retries before resetting counter
- Add retry logic for block import with 3 attempts and 1s base delay
- Handle non-200 HTTP responses as exceptions to trigger retries
- Improve error logging
This commit is contained in:
@@ -60,6 +60,9 @@ class ChainSyncService:
|
|||||||
import aiohttp
|
import aiohttp
|
||||||
|
|
||||||
last_broadcast_height = 0
|
last_broadcast_height = 0
|
||||||
|
retry_count = 0
|
||||||
|
max_retries = 5
|
||||||
|
base_delay = 2
|
||||||
|
|
||||||
while not self._stop_event.is_set():
|
while not self._stop_event.is_set():
|
||||||
try:
|
try:
|
||||||
@@ -70,6 +73,9 @@ class ChainSyncService:
|
|||||||
head_data = await resp.json()
|
head_data = await resp.json()
|
||||||
current_height = head_data.get('height', 0)
|
current_height = head_data.get('height', 0)
|
||||||
|
|
||||||
|
# Reset retry count on successful connection
|
||||||
|
retry_count = 0
|
||||||
|
|
||||||
# Broadcast new blocks
|
# Broadcast new blocks
|
||||||
if current_height > last_broadcast_height:
|
if current_height > last_broadcast_height:
|
||||||
for height in range(last_broadcast_height + 1, current_height + 1):
|
for height in range(last_broadcast_height + 1, current_height + 1):
|
||||||
@@ -79,11 +85,22 @@ class ChainSyncService:
|
|||||||
|
|
||||||
last_broadcast_height = current_height
|
last_broadcast_height = current_height
|
||||||
logger.info(f"Broadcasted blocks up to height {current_height}")
|
logger.info(f"Broadcasted blocks up to height {current_height}")
|
||||||
|
else:
|
||||||
|
raise Exception(f"RPC returned status {resp.status}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error in block broadcast: {e}")
|
retry_count += 1
|
||||||
|
if retry_count <= max_retries:
|
||||||
|
delay = base_delay * (2 ** (retry_count - 1)) # Exponential backoff
|
||||||
|
logger.warning(f"RPC connection failed (attempt {retry_count}/{max_retries}), retrying in {delay}s: {e}")
|
||||||
|
await asyncio.sleep(delay)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
logger.error(f"RPC connection failed after {max_retries} attempts, waiting {base_delay * 10}s: {e}")
|
||||||
|
await asyncio.sleep(base_delay * 10)
|
||||||
|
retry_count = 0 # Reset retry count after long wait
|
||||||
|
|
||||||
await asyncio.sleep(2) # Check every 2 seconds
|
await asyncio.sleep(base_delay) # Check every 2 seconds when connected
|
||||||
|
|
||||||
async def _receive_blocks(self):
|
async def _receive_blocks(self):
|
||||||
"""Receive blocks from other nodes via Redis"""
|
"""Receive blocks from other nodes via Redis"""
|
||||||
@@ -142,19 +159,35 @@ class ChainSyncService:
|
|||||||
target_host = self.leader_host if self.leader_host else "127.0.0.1"
|
target_host = self.leader_host if self.leader_host else "127.0.0.1"
|
||||||
target_port = self.rpc_port
|
target_port = self.rpc_port
|
||||||
|
|
||||||
async with aiohttp.ClientSession() as session:
|
# Retry logic for import
|
||||||
async with session.post(
|
max_retries = 3
|
||||||
f"http://{target_host}:{target_port}/rpc/importBlock",
|
base_delay = 1
|
||||||
json=block_data
|
|
||||||
) as resp:
|
for attempt in range(max_retries):
|
||||||
if resp.status == 200:
|
try:
|
||||||
result = await resp.json()
|
async with aiohttp.ClientSession() as session:
|
||||||
if result.get('accepted'):
|
async with session.post(
|
||||||
logger.info(f"Imported block {block_data.get('height')} from {block_data.get('proposer')}")
|
f"http://{target_host}:{target_port}/rpc/importBlock",
|
||||||
else:
|
json=block_data
|
||||||
logger.debug(f"Rejected block {block_data.get('height')}: {result.get('reason')}")
|
) as resp:
|
||||||
|
if resp.status == 200:
|
||||||
|
result = await resp.json()
|
||||||
|
if result.get('accepted'):
|
||||||
|
logger.info(f"Imported block {block_data.get('height')} from {block_data.get('proposer')}")
|
||||||
|
else:
|
||||||
|
logger.debug(f"Rejected block {block_data.get('height')}: {result.get('reason')}")
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
raise Exception(f"HTTP {resp.status}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
delay = base_delay * (2 ** attempt)
|
||||||
|
logger.warning(f"Import failed (attempt {attempt + 1}/{max_retries}), retrying in {delay}s: {e}")
|
||||||
|
await asyncio.sleep(delay)
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Failed to import block: {resp.status}")
|
logger.error(f"Failed to import block {block_data.get('height')} after {max_retries} attempts: {e}")
|
||||||
|
return
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error importing block: {e}")
|
logger.error(f"Error importing block: {e}")
|
||||||
|
|||||||
@@ -36,6 +36,9 @@ class ChainSettings(BaseSettings):
|
|||||||
|
|
||||||
block_time_seconds: int = 2
|
block_time_seconds: int = 2
|
||||||
|
|
||||||
|
# Block production toggle (set false on followers)
|
||||||
|
enable_block_production: bool = True
|
||||||
|
|
||||||
# Block production limits
|
# Block production limits
|
||||||
max_block_size_bytes: int = 1_000_000 # 1 MB
|
max_block_size_bytes: int = 1_000_000 # 1 MB
|
||||||
max_txs_per_block: int = 500
|
max_txs_per_block: int = 500
|
||||||
|
|||||||
@@ -148,7 +148,11 @@ class BlockchainNode:
|
|||||||
max_size=settings.mempool_max_size,
|
max_size=settings.mempool_max_size,
|
||||||
min_fee=settings.min_fee,
|
min_fee=settings.min_fee,
|
||||||
)
|
)
|
||||||
self._start_proposers()
|
# Start proposers only if enabled (followers set enable_block_production=False)
|
||||||
|
if getattr(settings, "enable_block_production", True):
|
||||||
|
self._start_proposers()
|
||||||
|
else:
|
||||||
|
logger.info("Block production disabled on this node", extra={"proposer_id": settings.proposer_id})
|
||||||
await self._setup_gossip_subscribers()
|
await self._setup_gossip_subscribers()
|
||||||
try:
|
try:
|
||||||
await self._stop_event.wait()
|
await self._stop_event.wait()
|
||||||
|
|||||||
Reference in New Issue
Block a user