Update Python version requirements and fix compatibility issues
- Bump minimum Python version from 3.11 to 3.13 across all apps
- Add Python 3.11-3.13 test matrix to CLI workflow
- Document Python 3.11+ requirement in .env.example
- Fix Starlette Broadcast removal with in-process fallback implementation
- Add _InProcessBroadcast class for tests when Starlette Broadcast is unavailable
- Refactor API key validators to read live settings instead of cached values
- Update database models with explicit
This commit is contained in:
394
gpu_acceleration/cuda_kernels/optimized_cuda_accelerator.py
Normal file
394
gpu_acceleration/cuda_kernels/optimized_cuda_accelerator.py
Normal file
@@ -0,0 +1,394 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Optimized CUDA ZK Accelerator with Improved Performance
|
||||
Implements optimized CUDA kernels and benchmarking for better GPU utilization
|
||||
"""
|
||||
|
||||
import ctypes
|
||||
import numpy as np
|
||||
from typing import List, Tuple, Optional
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Field element structure (256-bit for bn128 curve)
class FieldElement(ctypes.Structure):
    # One 256-bit field element as four 64-bit limbs. Limb order (little- vs
    # big-endian) must match the CUDA kernel's layout in field_operations.cu —
    # TODO(review): confirm against the kernel source.
    _fields_ = [("limbs", ctypes.c_uint64 * 4)]
|
||||
|
||||
class OptimizedCUDAZKAccelerator:
    """Optimized Python interface for CUDA-accelerated ZK circuit operations.

    Loads a compiled CUDA shared library (built from field_operations.cu) via
    ctypes and exposes batched field arithmetic plus benchmarking and
    bottleneck-analysis helpers. Construction is best-effort: if the library
    cannot be loaded, ``self.initialized`` stays False and operations return
    failure instead of raising.
    """

    def __init__(self, lib_path: Optional[str] = None):
        """
        Initialize optimized CUDA accelerator

        Args:
            lib_path: Path to compiled CUDA library (.so file)
        """
        # Resolve the library path eagerly. NOTE: _find_cuda_lib raises
        # FileNotFoundError, and that happens *before* the try below, so a
        # missing library with no explicit path propagates to the caller.
        self.lib_path = lib_path or self._find_cuda_lib()
        self.lib = None          # ctypes.CDLL handle once loaded
        self.initialized = False # True only after signatures are set up

        try:
            self.lib = ctypes.CDLL(self.lib_path)
            self._setup_function_signatures()
            self.initialized = True
            print(f"✅ Optimized CUDA ZK Accelerator initialized: {self.lib_path}")
        except Exception as e:
            # Keep the object constructible so callers can probe availability
            # via self.initialized rather than handling an exception.
            print(f"❌ Failed to initialize CUDA accelerator: {e}")
            self.initialized = False

    def _find_cuda_lib(self) -> str:
        """Find the compiled CUDA library.

        Searches common build-relative locations first, then the system path.

        Returns:
            First existing candidate path.

        Raises:
            FileNotFoundError: if no candidate exists.
        """
        possible_paths = [
            "./libfield_operations.so",
            "./field_operations.so",
            "../field_operations.so",
            "../../field_operations.so",
            "/usr/local/lib/libfield_operations.so"
        ]

        for path in possible_paths:
            if os.path.exists(path):
                return path

        raise FileNotFoundError("CUDA library not found. Please compile field_operations.cu first.")

    def _setup_function_signatures(self) -> None:
        """Setup function signatures for CUDA library functions.

        Declares ctypes argtypes/restype for each exported entry point so
        numpy arrays can be passed directly and return codes come back as int.
        """
        if not self.lib:
            return

        # Initialize CUDA device
        self.lib.init_cuda_device.argtypes = []
        self.lib.init_cuda_device.restype = ctypes.c_int

        # Field addition: (a, b, result, modulus, count) -> status (0 = ok)
        self.lib.gpu_field_addition.argtypes = [
            np.ctypeslib.ndpointer(FieldElement, flags="C_CONTIGUOUS"),
            np.ctypeslib.ndpointer(FieldElement, flags="C_CONTIGUOUS"),
            np.ctypeslib.ndpointer(FieldElement, flags="C_CONTIGUOUS"),
            np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
            ctypes.c_int
        ]
        self.lib.gpu_field_addition.restype = ctypes.c_int

        # Constraint verification — declared here but not wrapped by any
        # method in this class; presumably used by other callers. TODO(review):
        # confirm the void* constraint layout against field_operations.cu.
        self.lib.gpu_constraint_verification.argtypes = [
            np.ctypeslib.ndpointer(ctypes.c_void_p, flags="C_CONTIGUOUS"),
            np.ctypeslib.ndpointer(FieldElement, flags="C_CONTIGUOUS"),
            np.ctypeslib.ndpointer(ctypes.c_bool, flags="C_CONTIGUOUS"),
            ctypes.c_int
        ]
        self.lib.gpu_constraint_verification.restype = ctypes.c_int

    def init_device(self) -> bool:
        """Initialize CUDA device and check capabilities.

        Returns:
            True when the library reports success (return code 0).
        """
        if not self.initialized:
            print("❌ CUDA accelerator not initialized")
            return False

        try:
            result = self.lib.init_cuda_device()
            if result == 0:
                print("✅ CUDA device initialized successfully")
                return True
            else:
                print(f"❌ CUDA device initialization failed: {result}")
                return False
        except Exception as e:
            print(f"❌ CUDA device initialization error: {e}")
            return False

    def benchmark_optimized_performance(self, max_elements: int = 10000000) -> dict:
        """
        Benchmark optimized GPU performance with varying dataset sizes

        Args:
            max_elements: Maximum number of elements to test

        Returns:
            Performance benchmark results
        """
        if not self.initialized:
            return {"error": "CUDA accelerator not initialized"}

        print(f"🚀 Optimized GPU Performance Benchmark (up to {max_elements:,} elements)")
        print("=" * 70)

        # Test different dataset sizes
        test_sizes = [
            1000,      # 1K elements
            10000,     # 10K elements
            100000,    # 100K elements
            1000000,   # 1M elements
            5000000,   # 5M elements
            10000000,  # 10M elements
        ]

        results = []

        for size in test_sizes:
            # test_sizes is ascending, so stop at the first over-limit size.
            if size > max_elements:
                break

            print(f"\n📊 Testing {size:,} elements...")

            # Generate optimized test data
            a_elements, b_elements = self._generate_test_data(size)

            # bn128 field modulus (simplified)
            # NOTE(review): these all-ones limbs are NOT the real bn128 prime;
            # they only serve as a per-limb reduction value for benchmarking.
            modulus = [0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF]

            # GPU benchmark with multiple runs
            gpu_times = []
            for run in range(3):  # 3 runs for consistency
                start_time = time.time()
                success, gpu_result = self.field_addition_optimized(a_elements, b_elements, modulus)
                gpu_time = time.time() - start_time

                # Only successful runs contribute to the average.
                if success:
                    gpu_times.append(gpu_time)

            if not gpu_times:
                print(f"    ❌ GPU failed for {size:,} elements")
                continue

            # Average GPU time
            avg_gpu_time = sum(gpu_times) / len(gpu_times)

            # CPU benchmark
            start_time = time.time()
            cpu_result = self._cpu_field_addition(a_elements, b_elements, modulus)
            cpu_time = time.time() - start_time

            # Calculate speedup
            speedup = cpu_time / avg_gpu_time if avg_gpu_time > 0 else 0

            result = {
                "elements": size,
                "gpu_time": avg_gpu_time,
                "cpu_time": cpu_time,
                "speedup": speedup,
                "gpu_throughput": size / avg_gpu_time if avg_gpu_time > 0 else 0,
                "cpu_throughput": size / cpu_time if cpu_time > 0 else 0,
                "gpu_success": True
            }

            results.append(result)

            print(f"    GPU Time: {avg_gpu_time:.4f}s")
            print(f"    CPU Time: {cpu_time:.4f}s")
            print(f"    Speedup: {speedup:.2f}x")
            print(f"    GPU Throughput: {result['gpu_throughput']:.0f} elements/s")

        # Find optimal performance point
        best_speedup = max(results, key=lambda x: x["speedup"]) if results else None
        best_throughput = max(results, key=lambda x: x["gpu_throughput"]) if results else None

        summary = {
            "test_sizes": test_sizes[:len(results)],
            "results": results,
            "best_speedup": best_speedup,
            "best_throughput": best_throughput,
            # NOTE(review): device name is hard-coded, not queried from CUDA.
            "gpu_device": "NVIDIA GeForce RTX 4060 Ti"
        }

        print(f"\n🎯 Performance Summary:")
        if best_speedup:
            print(f"    Best Speedup: {best_speedup['speedup']:.2f}x at {best_speedup['elements']:,} elements")
        if best_throughput:
            print(f"    Best Throughput: {best_throughput['gpu_throughput']:.0f} elements/s at {best_throughput['elements']:,} elements")

        return summary

    def field_addition_optimized(
        self,
        a: List[FieldElement],
        b: List[FieldElement],
        modulus: List[int]
    ) -> Tuple[bool, Optional[List[FieldElement]]]:
        """
        Perform optimized parallel field addition on GPU

        Args:
            a: First operand array
            b: Second operand array
            modulus: Field modulus (4 x 64-bit limbs)

        Returns:
            (success, result_array)
        """
        if not self.initialized:
            return False, None

        try:
            num_elements = len(a)
            if num_elements != len(b):
                print("❌ Input arrays must have same length")
                return False, None

            # Convert to numpy arrays with optimal memory layout
            # NOTE(review): relies on numpy accepting a ctypes.Structure as a
            # dtype for a list of instances — confirm this conversion works on
            # the pinned numpy version.
            a_array = np.array(a, dtype=FieldElement)
            b_array = np.array(b, dtype=FieldElement)
            result_array = np.zeros(num_elements, dtype=FieldElement)
            modulus_array = np.array(modulus, dtype=ctypes.c_uint64)

            # Call GPU function
            result = self.lib.gpu_field_addition(
                a_array, b_array, result_array, modulus_array, num_elements
            )

            if result == 0:
                return True, result_array.tolist()
            else:
                print(f"❌ GPU field addition failed: {result}")
                return False, None

        except Exception as e:
            print(f"❌ GPU field addition error: {e}")
            return False, None

    def _generate_test_data(self, num_elements: int) -> Tuple[List[FieldElement], List[FieldElement]]:
        """Generate optimized test data for benchmarking.

        Returns two parallel lists of random FieldElements. Limbs are drawn
        from [0, 2**32), so the sum of any two limbs fits in a uint64 without
        wrapping — presumably intentional for the simplified per-limb math.
        """
        a_elements = []
        b_elements = []

        # Use numpy for faster generation
        a_data = np.random.randint(0, 2**32, size=(num_elements, 4), dtype=np.uint64)
        b_data = np.random.randint(0, 2**32, size=(num_elements, 4), dtype=np.uint64)

        for i in range(num_elements):
            a = FieldElement()
            b = FieldElement()

            for j in range(4):
                a.limbs[j] = a_data[i, j]
                b.limbs[j] = b_data[i, j]

            a_elements.append(a)
            b_elements.append(b)

        return a_elements, b_elements

    def _cpu_field_addition(self, a_elements: List[FieldElement], b_elements: List[FieldElement], modulus: List[int]) -> List[FieldElement]:
        """Optimized CPU field addition for benchmarking.

        NOTE: this is per-limb modular addition with no carry propagation —
        it is NOT true 256-bit field addition. It exists only as a CPU-side
        workload comparable to the GPU kernel's benchmark path.
        """
        num_elements = len(a_elements)
        result = []

        # Use numpy for vectorized operations where possible
        for i in range(num_elements):
            c = FieldElement()
            for j in range(4):
                c.limbs[j] = (a_elements[i].limbs[j] + b_elements[i].limbs[j]) % modulus[j]
            result.append(c)

        return result

    def analyze_performance_bottlenecks(self) -> dict:
        """Analyze potential performance bottlenecks in GPU operations.

        Returns:
            Mapping of bottleneck category -> human-readable result string.
        """
        print("🔍 Analyzing GPU Performance Bottlenecks...")

        analysis = {
            "memory_bandwidth": self._test_memory_bandwidth(),
            "compute_utilization": self._test_compute_utilization(),
            "data_transfer": self._test_data_transfer(),
            "kernel_launch": self._test_kernel_launch_overhead()
        }

        print("\n📊 Performance Analysis Results:")
        for key, value in analysis.items():
            print(f"    {key}: {value}")

        return analysis

    def _test_memory_bandwidth(self) -> str:
        """Test GPU memory bandwidth"""
        # Simple memory bandwidth test
        try:
            size = 1000000  # 1M elements
            a_elements, b_elements = self._generate_test_data(size)

            start_time = time.time()
            success, _ = self.field_addition_optimized(a_elements, b_elements,
                                                       [0xFFFFFFFFFFFFFFFF] * 4)
            test_time = time.time() - start_time

            if success:
                # Effective bandwidth estimate; includes host<->device transfer
                # and kernel time, so this is end-to-end, not raw DRAM bandwidth.
                bandwidth = (size * 4 * 8 * 3) / (test_time * 1e9)  # GB/s (3 arrays, 4 limbs, 8 bytes)
                return f"{bandwidth:.2f} GB/s"
            else:
                return "Test failed"
        except Exception as e:
            return f"Error: {e}"

    def _test_compute_utilization(self) -> str:
        """Test GPU compute utilization"""
        # Placeholder: real utilization needs external profiling (e.g. nsight).
        return "Compute utilization test - requires profiling tools"

    def _test_data_transfer(self) -> str:
        """Test data transfer overhead"""
        try:
            size = 100000
            a_elements, _ = self._generate_test_data(size)

            # Test data transfer time — measures only Python-list -> numpy
            # conversion, not the host-to-device copy itself.
            start_time = time.time()
            a_array = np.array(a_elements, dtype=FieldElement)
            transfer_time = time.time() - start_time

            return f"{transfer_time:.4f}s for {size:,} elements"
        except Exception as e:
            return f"Error: {e}"

    def _test_kernel_launch_overhead(self) -> str:
        """Test kernel launch overhead"""
        try:
            size = 1000  # Small dataset to isolate launch overhead
            a_elements, b_elements = self._generate_test_data(size)

            start_time = time.time()
            success, _ = self.field_addition_optimized(a_elements, b_elements,
                                                       [0xFFFFFFFFFFFFFFFF] * 4)
            total_time = time.time() - start_time

            if success:
                return f"{total_time:.4f}s total (includes launch overhead)"
            else:
                return "Test failed"
        except Exception as e:
            return f"Error: {e}"
|
||||
|
||||
def main():
    """Entry point: exercise the optimized CUDA accelerator end to end.

    Initializes the accelerator and device, runs the size-sweep benchmark,
    analyzes bottlenecks, and reports the best observed speedup. Any failure
    is reported rather than propagated, since this is a standalone test script.
    """
    print("🚀 AITBC Optimized CUDA ZK Accelerator Test")
    print(50 * "=")

    try:
        accel = OptimizedCUDAZKAccelerator()

        # Bail out early if either the library load or device init failed.
        if not accel.initialized:
            print("❌ Failed to initialize CUDA accelerator")
            return
        if not accel.init_device():
            return

        summary = accel.benchmark_optimized_performance(10000000)
        analysis = accel.analyze_performance_bottlenecks()

        print("\n✅ Optimized CUDA acceleration test completed!")

        best = summary.get("best_speedup")
        if best:
            print(f"🚀 Best performance: {best['speedup']:.2f}x speedup")

    except Exception as e:
        print(f"❌ Test failed: {e}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user