feat(chain_sync): add retry logic with exponential backoff for RPC connections and block imports
- Add retry mechanism with exponential backoff for block broadcast loop
- Implement max 5 retries with 2s base delay for RPC connection failures
- Reset retry count on successful connection
- Add 20s wait period after max retries before resetting counter
- Add retry logic for block import with 3 attempts and 1s base delay
- Handle non-200 HTTP responses as exceptions to trigger retries
- Improve error logging
This commit is contained in:
@@ -60,6 +60,9 @@ class ChainSyncService:
|
||||
import aiohttp
|
||||
|
||||
last_broadcast_height = 0
|
||||
retry_count = 0
|
||||
max_retries = 5
|
||||
base_delay = 2
|
||||
|
||||
while not self._stop_event.is_set():
|
||||
try:
|
||||
@@ -70,6 +73,9 @@ class ChainSyncService:
|
||||
head_data = await resp.json()
|
||||
current_height = head_data.get('height', 0)
|
||||
|
||||
# Reset retry count on successful connection
|
||||
retry_count = 0
|
||||
|
||||
# Broadcast new blocks
|
||||
if current_height > last_broadcast_height:
|
||||
for height in range(last_broadcast_height + 1, current_height + 1):
|
||||
@@ -79,11 +85,22 @@ class ChainSyncService:
|
||||
|
||||
last_broadcast_height = current_height
|
||||
logger.info(f"Broadcasted blocks up to height {current_height}")
|
||||
else:
|
||||
raise Exception(f"RPC returned status {resp.status}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in block broadcast: {e}")
|
||||
retry_count += 1
|
||||
if retry_count <= max_retries:
|
||||
delay = base_delay * (2 ** (retry_count - 1)) # Exponential backoff
|
||||
logger.warning(f"RPC connection failed (attempt {retry_count}/{max_retries}), retrying in {delay}s: {e}")
|
||||
await asyncio.sleep(delay)
|
||||
continue
|
||||
else:
|
||||
logger.error(f"RPC connection failed after {max_retries} attempts, waiting {base_delay * 10}s: {e}")
|
||||
await asyncio.sleep(base_delay * 10)
|
||||
retry_count = 0 # Reset retry count after long wait
|
||||
|
||||
await asyncio.sleep(2) # Check every 2 seconds
|
||||
await asyncio.sleep(base_delay) # Check every 2 seconds when connected
|
||||
|
||||
async def _receive_blocks(self):
|
||||
"""Receive blocks from other nodes via Redis"""
|
||||
@@ -142,19 +159,35 @@ class ChainSyncService:
|
||||
target_host = self.leader_host if self.leader_host else "127.0.0.1"
|
||||
target_port = self.rpc_port
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(
|
||||
f"http://{target_host}:{target_port}/rpc/importBlock",
|
||||
json=block_data
|
||||
) as resp:
|
||||
if resp.status == 200:
|
||||
result = await resp.json()
|
||||
if result.get('accepted'):
|
||||
logger.info(f"Imported block {block_data.get('height')} from {block_data.get('proposer')}")
|
||||
else:
|
||||
logger.debug(f"Rejected block {block_data.get('height')}: {result.get('reason')}")
|
||||
# Retry logic for import
|
||||
max_retries = 3
|
||||
base_delay = 1
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(
|
||||
f"http://{target_host}:{target_port}/rpc/importBlock",
|
||||
json=block_data
|
||||
) as resp:
|
||||
if resp.status == 200:
|
||||
result = await resp.json()
|
||||
if result.get('accepted'):
|
||||
logger.info(f"Imported block {block_data.get('height')} from {block_data.get('proposer')}")
|
||||
else:
|
||||
logger.debug(f"Rejected block {block_data.get('height')}: {result.get('reason')}")
|
||||
return
|
||||
else:
|
||||
raise Exception(f"HTTP {resp.status}")
|
||||
|
||||
except Exception as e:
|
||||
if attempt < max_retries - 1:
|
||||
delay = base_delay * (2 ** attempt)
|
||||
logger.warning(f"Import failed (attempt {attempt + 1}/{max_retries}), retrying in {delay}s: {e}")
|
||||
await asyncio.sleep(delay)
|
||||
else:
|
||||
logger.warning(f"Failed to import block: {resp.status}")
|
||||
logger.error(f"Failed to import block {block_data.get('height')} after {max_retries} attempts: {e}")
|
||||
return
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error importing block: {e}")
|
||||
|
||||
@@ -36,6 +36,9 @@ class ChainSettings(BaseSettings):
|
||||
|
||||
block_time_seconds: int = 2
|
||||
|
||||
# Block production toggle (set false on followers)
|
||||
enable_block_production: bool = True
|
||||
|
||||
# Block production limits
|
||||
max_block_size_bytes: int = 1_000_000 # 1 MB
|
||||
max_txs_per_block: int = 500
|
||||
|
||||
@@ -148,7 +148,11 @@ class BlockchainNode:
|
||||
max_size=settings.mempool_max_size,
|
||||
min_fee=settings.min_fee,
|
||||
)
|
||||
self._start_proposers()
|
||||
# Start proposers only if enabled (followers set enable_block_production=False)
|
||||
if getattr(settings, "enable_block_production", True):
|
||||
self._start_proposers()
|
||||
else:
|
||||
logger.info("Block production disabled on this node", extra={"proposer_id": settings.proposer_id})
|
||||
await self._setup_gossip_subscribers()
|
||||
try:
|
||||
await self._stop_event.wait()
|
||||
|
||||
Reference in New Issue
Block a user