- Change file mode from 644 to 755 for all project files - Add chain_id parameter to get_balance RPC endpoint with default "ait-devnet" - Rename Miner.extra_meta_data to extra_metadata for consistency
454 lines
20 KiB
Python
Executable File
454 lines
20 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
High-Performance CUDA ZK Accelerator with Optimized Kernels
|
|
Implements optimized CUDA kernels with memory coalescing, vectorization, and shared memory
|
|
"""
|
|
|
|
import ctypes
|
|
import numpy as np
|
|
from typing import List, Tuple, Optional
|
|
import os
|
|
import sys
|
|
import time
|
|
|
|
# Optimized field element structure for flat array access
|
|
class OptimizedFieldElement(ctypes.Structure):
|
|
_fields_ = [("limbs", ctypes.c_uint64 * 4)]
|
|
|
|
class HighPerformanceCUDAZKAccelerator:
|
|
"""High-performance Python interface for optimized CUDA ZK operations"""
|
|
|
|
def __init__(self, lib_path: str = None):
|
|
"""
|
|
Initialize high-performance CUDA accelerator
|
|
|
|
Args:
|
|
lib_path: Path to compiled optimized CUDA library (.so file)
|
|
"""
|
|
self.lib_path = lib_path or self._find_optimized_cuda_lib()
|
|
self.lib = None
|
|
self.initialized = False
|
|
|
|
try:
|
|
self.lib = ctypes.CDLL(self.lib_path)
|
|
self._setup_function_signatures()
|
|
self.initialized = True
|
|
print(f"✅ High-Performance CUDA ZK Accelerator initialized: {self.lib_path}")
|
|
except Exception as e:
|
|
print(f"❌ Failed to initialize CUDA accelerator: {e}")
|
|
self.initialized = False
|
|
|
|
def _find_optimized_cuda_lib(self) -> str:
|
|
"""Find the compiled optimized CUDA library"""
|
|
possible_paths = [
|
|
"./liboptimized_field_operations.so",
|
|
"./optimized_field_operations.so",
|
|
"../liboptimized_field_operations.so",
|
|
"../../liboptimized_field_operations.so",
|
|
"/usr/local/lib/liboptimized_field_operations.so"
|
|
]
|
|
|
|
for path in possible_paths:
|
|
if os.path.exists(path):
|
|
return path
|
|
|
|
raise FileNotFoundError("Optimized CUDA library not found. Please compile optimized_field_operations.cu first.")
|
|
|
|
def _setup_function_signatures(self):
|
|
"""Setup function signatures for optimized CUDA library functions"""
|
|
if not self.lib:
|
|
return
|
|
|
|
# Initialize optimized CUDA device
|
|
self.lib.init_optimized_cuda_device.argtypes = []
|
|
self.lib.init_optimized_cuda_device.restype = ctypes.c_int
|
|
|
|
# Optimized field addition with flat arrays
|
|
self.lib.gpu_optimized_field_addition.argtypes = [
|
|
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
ctypes.c_int
|
|
]
|
|
self.lib.gpu_optimized_field_addition.restype = ctypes.c_int
|
|
|
|
# Vectorized field addition
|
|
self.lib.gpu_vectorized_field_addition.argtypes = [
|
|
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"), # field_vector_t
|
|
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
ctypes.c_int
|
|
]
|
|
self.lib.gpu_vectorized_field_addition.restype = ctypes.c_int
|
|
|
|
# Shared memory field addition
|
|
self.lib.gpu_shared_memory_field_addition.argtypes = [
|
|
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
ctypes.c_int
|
|
]
|
|
self.lib.gpu_shared_memory_field_addition.restype = ctypes.c_int
|
|
|
|
def init_device(self) -> bool:
|
|
"""Initialize optimized CUDA device and check capabilities"""
|
|
if not self.initialized:
|
|
print("❌ CUDA accelerator not initialized")
|
|
return False
|
|
|
|
try:
|
|
result = self.lib.init_optimized_cuda_device()
|
|
if result == 0:
|
|
print("✅ Optimized CUDA device initialized successfully")
|
|
return True
|
|
else:
|
|
print(f"❌ CUDA device initialization failed: {result}")
|
|
return False
|
|
except Exception as e:
|
|
print(f"❌ CUDA device initialization error: {e}")
|
|
return False
|
|
|
|
def benchmark_optimized_kernels(self, max_elements: int = 10000000) -> dict:
|
|
"""
|
|
Benchmark all optimized CUDA kernels and compare performance
|
|
|
|
Args:
|
|
max_elements: Maximum number of elements to test
|
|
|
|
Returns:
|
|
Comprehensive performance benchmark results
|
|
"""
|
|
if not self.initialized:
|
|
return {"error": "CUDA accelerator not initialized"}
|
|
|
|
print(f"🚀 High-Performance CUDA Kernel Benchmark (up to {max_elements:,} elements)")
|
|
print("=" * 80)
|
|
|
|
# Test different dataset sizes
|
|
test_sizes = [
|
|
1000, # 1K elements
|
|
10000, # 10K elements
|
|
100000, # 100K elements
|
|
1000000, # 1M elements
|
|
5000000, # 5M elements
|
|
10000000, # 10M elements
|
|
]
|
|
|
|
results = {
|
|
"test_sizes": [],
|
|
"optimized_flat": [],
|
|
"vectorized": [],
|
|
"shared_memory": [],
|
|
"cpu_baseline": [],
|
|
"performance_summary": {}
|
|
}
|
|
|
|
for size in test_sizes:
|
|
if size > max_elements:
|
|
break
|
|
|
|
print(f"\n📊 Benchmarking {size:,} elements...")
|
|
|
|
# Generate test data as flat arrays for optimal memory access
|
|
a_flat, b_flat = self._generate_flat_test_data(size)
|
|
|
|
# bn128 field modulus (simplified)
|
|
modulus = [0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF]
|
|
|
|
# Benchmark optimized flat array kernel
|
|
flat_result = self._benchmark_optimized_flat_kernel(a_flat, b_flat, modulus, size)
|
|
|
|
# Benchmark vectorized kernel
|
|
vec_result = self._benchmark_vectorized_kernel(a_flat, b_flat, modulus, size)
|
|
|
|
# Benchmark shared memory kernel
|
|
shared_result = self._benchmark_shared_memory_kernel(a_flat, b_flat, modulus, size)
|
|
|
|
# Benchmark CPU baseline
|
|
cpu_result = self._benchmark_cpu_baseline(a_flat, b_flat, modulus, size)
|
|
|
|
# Store results
|
|
results["test_sizes"].append(size)
|
|
results["optimized_flat"].append(flat_result)
|
|
results["vectorized"].append(vec_result)
|
|
results["shared_memory"].append(shared_result)
|
|
results["cpu_baseline"].append(cpu_result)
|
|
|
|
# Print comparison
|
|
print(f" Optimized Flat: {flat_result['time']:.4f}s, {flat_result['throughput']:.0f} elem/s")
|
|
print(f" Vectorized: {vec_result['time']:.4f}s, {vec_result['throughput']:.0f} elem/s")
|
|
print(f" Shared Memory: {shared_result['time']:.4f}s, {shared_result['throughput']:.0f} elem/s")
|
|
print(f" CPU Baseline: {cpu_result['time']:.4f}s, {cpu_result['throughput']:.0f} elem/s")
|
|
|
|
# Calculate speedups
|
|
flat_speedup = cpu_result['time'] / flat_result['time'] if flat_result['time'] > 0 else 0
|
|
vec_speedup = cpu_result['time'] / vec_result['time'] if vec_result['time'] > 0 else 0
|
|
shared_speedup = cpu_result['time'] / shared_result['time'] if shared_result['time'] > 0 else 0
|
|
|
|
print(f" Speedups - Flat: {flat_speedup:.2f}x, Vec: {vec_speedup:.2f}x, Shared: {shared_speedup:.2f}x")
|
|
|
|
# Calculate performance summary
|
|
results["performance_summary"] = self._calculate_performance_summary(results)
|
|
|
|
# Print final summary
|
|
self._print_performance_summary(results["performance_summary"])
|
|
|
|
return results
|
|
|
|
def _benchmark_optimized_flat_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray,
|
|
modulus: List[int], num_elements: int) -> dict:
|
|
"""Benchmark optimized flat array kernel"""
|
|
try:
|
|
result_flat = np.zeros_like(a_flat)
|
|
modulus_array = np.array(modulus, dtype=np.uint64)
|
|
|
|
# Multiple runs for consistency
|
|
times = []
|
|
for run in range(3):
|
|
start_time = time.time()
|
|
success = self.lib.gpu_optimized_field_addition(
|
|
a_flat, b_flat, result_flat, modulus_array, num_elements
|
|
)
|
|
run_time = time.time() - start_time
|
|
|
|
if success == 0: # Success
|
|
times.append(run_time)
|
|
|
|
if not times:
|
|
return {"time": float('inf'), "throughput": 0, "success": False}
|
|
|
|
avg_time = sum(times) / len(times)
|
|
throughput = num_elements / avg_time if avg_time > 0 else 0
|
|
|
|
return {"time": avg_time, "throughput": throughput, "success": True}
|
|
|
|
except Exception as e:
|
|
print(f" ❌ Optimized flat kernel error: {e}")
|
|
return {"time": float('inf'), "throughput": 0, "success": False}
|
|
|
|
def _benchmark_vectorized_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray,
|
|
modulus: List[int], num_elements: int) -> dict:
|
|
"""Benchmark vectorized kernel"""
|
|
try:
|
|
# Convert flat arrays to vectorized format (uint4)
|
|
# For simplicity, we'll reuse the flat array kernel as vectorized
|
|
# In practice, would convert to proper vector format
|
|
result_flat = np.zeros_like(a_flat)
|
|
modulus_array = np.array(modulus, dtype=np.uint64)
|
|
|
|
times = []
|
|
for run in range(3):
|
|
start_time = time.time()
|
|
success = self.lib.gpu_vectorized_field_addition(
|
|
a_flat, b_flat, result_flat, modulus_array, num_elements
|
|
)
|
|
run_time = time.time() - start_time
|
|
|
|
if success == 0:
|
|
times.append(run_time)
|
|
|
|
if not times:
|
|
return {"time": float('inf'), "throughput": 0, "success": False}
|
|
|
|
avg_time = sum(times) / len(times)
|
|
throughput = num_elements / avg_time if avg_time > 0 else 0
|
|
|
|
return {"time": avg_time, "throughput": throughput, "success": True}
|
|
|
|
except Exception as e:
|
|
print(f" ❌ Vectorized kernel error: {e}")
|
|
return {"time": float('inf'), "throughput": 0, "success": False}
|
|
|
|
def _benchmark_shared_memory_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray,
|
|
modulus: List[int], num_elements: int) -> dict:
|
|
"""Benchmark shared memory kernel"""
|
|
try:
|
|
result_flat = np.zeros_like(a_flat)
|
|
modulus_array = np.array(modulus, dtype=np.uint64)
|
|
|
|
times = []
|
|
for run in range(3):
|
|
start_time = time.time()
|
|
success = self.lib.gpu_shared_memory_field_addition(
|
|
a_flat, b_flat, result_flat, modulus_array, num_elements
|
|
)
|
|
run_time = time.time() - start_time
|
|
|
|
if success == 0:
|
|
times.append(run_time)
|
|
|
|
if not times:
|
|
return {"time": float('inf'), "throughput": 0, "success": False}
|
|
|
|
avg_time = sum(times) / len(times)
|
|
throughput = num_elements / avg_time if avg_time > 0 else 0
|
|
|
|
return {"time": avg_time, "throughput": throughput, "success": True}
|
|
|
|
except Exception as e:
|
|
print(f" ❌ Shared memory kernel error: {e}")
|
|
return {"time": float('inf'), "throughput": 0, "success": False}
|
|
|
|
def _benchmark_cpu_baseline(self, a_flat: np.ndarray, b_flat: np.ndarray,
|
|
modulus: List[int], num_elements: int) -> dict:
|
|
"""Benchmark CPU baseline for comparison"""
|
|
try:
|
|
start_time = time.time()
|
|
|
|
# Simple CPU field addition
|
|
result_flat = np.zeros_like(a_flat)
|
|
for i in range(num_elements):
|
|
base_idx = i * 4
|
|
for j in range(4):
|
|
result_flat[base_idx + j] = (a_flat[base_idx + j] + b_flat[base_idx + j]) % modulus[j]
|
|
|
|
cpu_time = time.time() - start_time
|
|
throughput = num_elements / cpu_time if cpu_time > 0 else 0
|
|
|
|
return {"time": cpu_time, "throughput": throughput, "success": True}
|
|
|
|
except Exception as e:
|
|
print(f" ❌ CPU baseline error: {e}")
|
|
return {"time": float('inf'), "throughput": 0, "success": False}
|
|
|
|
def _generate_flat_test_data(self, num_elements: int) -> Tuple[np.ndarray, np.ndarray]:
|
|
"""Generate flat array test data for optimal memory access"""
|
|
# Generate flat arrays (num_elements * 4 limbs)
|
|
flat_size = num_elements * 4
|
|
|
|
# Use numpy for fast generation
|
|
a_flat = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64)
|
|
b_flat = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64)
|
|
|
|
return a_flat, b_flat
|
|
|
|
def _calculate_performance_summary(self, results: dict) -> dict:
|
|
"""Calculate performance summary statistics"""
|
|
summary = {}
|
|
|
|
# Find best performing kernel for each size
|
|
best_speedups = []
|
|
best_throughputs = []
|
|
|
|
for i, size in enumerate(results["test_sizes"]):
|
|
cpu_time = results["cpu_baseline"][i]["time"]
|
|
|
|
# Calculate speedups
|
|
flat_speedup = cpu_time / results["optimized_flat"][i]["time"] if results["optimized_flat"][i]["time"] > 0 else 0
|
|
vec_speedup = cpu_time / results["vectorized"][i]["time"] if results["vectorized"][i]["time"] > 0 else 0
|
|
shared_speedup = cpu_time / results["shared_memory"][i]["time"] if results["shared_memory"][i]["time"] > 0 else 0
|
|
|
|
best_speedup = max(flat_speedup, vec_speedup, shared_speedup)
|
|
best_speedups.append(best_speedup)
|
|
|
|
# Find best throughput
|
|
best_throughput = max(
|
|
results["optimized_flat"][i]["throughput"],
|
|
results["vectorized"][i]["throughput"],
|
|
results["shared_memory"][i]["throughput"]
|
|
)
|
|
best_throughputs.append(best_throughput)
|
|
|
|
if best_speedups:
|
|
summary["best_speedup"] = max(best_speedups)
|
|
summary["average_speedup"] = sum(best_speedups) / len(best_speedups)
|
|
summary["best_speedup_size"] = results["test_sizes"][best_speedups.index(max(best_speedups))]
|
|
|
|
if best_throughputs:
|
|
summary["best_throughput"] = max(best_throughputs)
|
|
summary["average_throughput"] = sum(best_throughputs) / len(best_throughputs)
|
|
summary["best_throughput_size"] = results["test_sizes"][best_throughputs.index(max(best_throughputs))]
|
|
|
|
return summary
|
|
|
|
def _print_performance_summary(self, summary: dict):
|
|
"""Print comprehensive performance summary"""
|
|
print(f"\n🎯 High-Performance CUDA Summary:")
|
|
print("=" * 50)
|
|
|
|
if "best_speedup" in summary:
|
|
print(f" Best Speedup: {summary['best_speedup']:.2f}x at {summary.get('best_speedup_size', 'N/A'):,} elements")
|
|
print(f" Average Speedup: {summary['average_speedup']:.2f}x across all tests")
|
|
|
|
if "best_throughput" in summary:
|
|
print(f" Best Throughput: {summary['best_throughput']:.0f} elements/s at {summary.get('best_throughput_size', 'N/A'):,} elements")
|
|
print(f" Average Throughput: {summary['average_throughput']:.0f} elements/s")
|
|
|
|
# Performance classification
|
|
if summary.get("best_speedup", 0) > 5:
|
|
print(" 🚀 Performance: EXCELLENT - Significant GPU acceleration achieved")
|
|
elif summary.get("best_speedup", 0) > 2:
|
|
print(" ✅ Performance: GOOD - Measurable GPU acceleration achieved")
|
|
elif summary.get("best_speedup", 0) > 1:
|
|
print(" ⚠️ Performance: MODERATE - Limited GPU acceleration")
|
|
else:
|
|
print(" ❌ Performance: POOR - No significant GPU acceleration")
|
|
|
|
def analyze_memory_bandwidth(self, num_elements: int = 1000000) -> dict:
|
|
"""Analyze memory bandwidth performance"""
|
|
print(f"🔍 Analyzing Memory Bandwidth Performance ({num_elements:,} elements)...")
|
|
|
|
a_flat, b_flat = self._generate_flat_test_data(num_elements)
|
|
modulus = [0xFFFFFFFFFFFFFFFF] * 4
|
|
|
|
# Test different kernels
|
|
flat_result = self._benchmark_optimized_flat_kernel(a_flat, b_flat, modulus, num_elements)
|
|
vec_result = self._benchmark_vectorized_kernel(a_flat, b_flat, modulus, num_elements)
|
|
shared_result = self._benchmark_shared_memory_kernel(a_flat, b_flat, modulus, num_elements)
|
|
|
|
# Calculate theoretical bandwidth
|
|
data_size = num_elements * 4 * 8 * 3 # 3 arrays, 4 limbs, 8 bytes
|
|
|
|
analysis = {
|
|
"data_size_gb": data_size / (1024**3),
|
|
"flat_bandwidth_gb_s": data_size / (flat_result['time'] * 1024**3) if flat_result['time'] > 0 else 0,
|
|
"vectorized_bandwidth_gb_s": data_size / (vec_result['time'] * 1024**3) if vec_result['time'] > 0 else 0,
|
|
"shared_bandwidth_gb_s": data_size / (shared_result['time'] * 1024**3) if shared_result['time'] > 0 else 0,
|
|
}
|
|
|
|
print(f" Data Size: {analysis['data_size_gb']:.2f} GB")
|
|
print(f" Flat Kernel: {analysis['flat_bandwidth_gb_s']:.2f} GB/s")
|
|
print(f" Vectorized Kernel: {analysis['vectorized_bandwidth_gb_s']:.2f} GB/s")
|
|
print(f" Shared Memory Kernel: {analysis['shared_bandwidth_gb_s']:.2f} GB/s")
|
|
|
|
return analysis
|
|
|
|
def main():
|
|
"""Main function for testing high-performance CUDA acceleration"""
|
|
print("🚀 AITBC High-Performance CUDA ZK Accelerator Test")
|
|
print("=" * 60)
|
|
|
|
try:
|
|
# Initialize high-performance accelerator
|
|
accelerator = HighPerformanceCUDAZKAccelerator()
|
|
|
|
if not accelerator.initialized:
|
|
print("❌ Failed to initialize CUDA accelerator")
|
|
return
|
|
|
|
# Initialize device
|
|
if not accelerator.init_device():
|
|
return
|
|
|
|
# Run comprehensive benchmark
|
|
results = accelerator.benchmark_optimized_kernels(10000000)
|
|
|
|
# Analyze memory bandwidth
|
|
bandwidth_analysis = accelerator.analyze_memory_bandwidth(1000000)
|
|
|
|
print("\n✅ High-Performance CUDA acceleration test completed!")
|
|
|
|
if results.get("performance_summary", {}).get("best_speedup", 0) > 1:
|
|
print(f"🚀 Optimization successful: {results['performance_summary']['best_speedup']:.2f}x speedup achieved")
|
|
else:
|
|
print("⚠️ Further optimization needed")
|
|
|
|
except Exception as e:
|
|
print(f"❌ Test failed: {e}")
|
|
|
|
if __name__ == "__main__":
|
|
main()
|