chore(security): enhance environment configuration, CI workflows, and wallet daemon with security improvements
- Restructure .env.example with security-focused documentation, service-specific environment file references, and AWS Secrets Manager integration.
- Update the CLI tests workflow to a single Python 3.13 version, add the pytest-mock dependency, and consolidate test execution with coverage.
- Add comprehensive security validation to the package publishing workflow with manual approval gates, secret scanning, and release controls.
This commit is contained in:
453
gpu_acceleration/legacy/high_performance_cuda_accelerator.py
Normal file
453
gpu_acceleration/legacy/high_performance_cuda_accelerator.py
Normal file
@@ -0,0 +1,453 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
High-Performance CUDA ZK Accelerator with Optimized Kernels
|
||||
Implements optimized CUDA kernels with memory coalescing, vectorization, and shared memory
|
||||
"""
|
||||
|
||||
import ctypes
|
||||
import numpy as np
|
||||
from typing import List, Tuple, Optional
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Optimized field element structure for flat array access
class OptimizedFieldElement(ctypes.Structure):
    """ctypes mirror of the CUDA-side field element: four 64-bit limbs
    (256 bits total), stored contiguously so arrays of elements can be
    passed as flat uint64 buffers.
    """

    # NOTE(review): limb order (little- vs big-endian across limbs) is not
    # visible here — confirm against optimized_field_operations.cu.
    _fields_ = [("limbs", ctypes.c_uint64 * 4)]
||||
|
||||
class HighPerformanceCUDAZKAccelerator:
    """High-performance Python interface for optimized CUDA ZK operations.

    Loads a compiled CUDA shared library via ctypes and exposes helpers to
    benchmark three field-addition kernels (flat-array, vectorized, shared
    memory) against a NumPy CPU baseline, plus a memory-bandwidth analysis.
    """

    # Number of timed runs per kernel; successful runs are averaged.
    _BENCHMARK_RUNS = 3

    # The three GPU entry points share one C signature:
    # (a_limbs, b_limbs, result_limbs, modulus_limbs, num_elements) -> int.
    _KERNEL_NAMES = (
        "gpu_optimized_field_addition",
        "gpu_vectorized_field_addition",
        "gpu_shared_memory_field_addition",
    )

    def __init__(self, lib_path: Optional[str] = None):
        """Initialize high-performance CUDA accelerator.

        Args:
            lib_path: Path to compiled optimized CUDA library (.so file).
                When omitted, conventional locations are searched; note the
                search raises FileNotFoundError *before* the try block, so
                a missing library propagates to the caller.
        """
        self.lib_path = lib_path or self._find_optimized_cuda_lib()
        self.lib = None
        self.initialized = False

        try:
            self.lib = ctypes.CDLL(self.lib_path)
            self._setup_function_signatures()
            self.initialized = True
            print(f"✅ High-Performance CUDA ZK Accelerator initialized: {self.lib_path}")
        except Exception as e:
            # Deliberate broad catch: degrade to an uninitialized
            # accelerator instead of crashing the caller.
            print(f"❌ Failed to initialize CUDA accelerator: {e}")
            self.initialized = False

    def _find_optimized_cuda_lib(self) -> str:
        """Find the compiled optimized CUDA library.

        Searches CWD-relative locations (preserved for backward
        compatibility), the directory containing this module (robust when
        the caller's working directory is elsewhere), and the system
        library path.

        Returns:
            The first candidate path that exists.

        Raises:
            FileNotFoundError: If no candidate path exists.
        """
        module_dir = os.path.dirname(os.path.abspath(__file__))
        possible_paths = [
            "./liboptimized_field_operations.so",
            "./optimized_field_operations.so",
            "../liboptimized_field_operations.so",
            "../../liboptimized_field_operations.so",
            os.path.join(module_dir, "liboptimized_field_operations.so"),
            os.path.join(module_dir, "optimized_field_operations.so"),
            "/usr/local/lib/liboptimized_field_operations.so",
        ]

        for path in possible_paths:
            if os.path.exists(path):
                return path

        raise FileNotFoundError("Optimized CUDA library not found. Please compile optimized_field_operations.cu first.")

    def _setup_function_signatures(self):
        """Setup ctypes argtypes/restype for the CUDA library functions."""
        if not self.lib:
            return

        # Device initialization takes no arguments.
        self.lib.init_optimized_cuda_device.argtypes = []
        self.lib.init_optimized_cuda_device.restype = ctypes.c_int

        # All three kernels take four contiguous uint64 arrays plus an
        # element count; declare the shared signature in one pass instead
        # of repeating it verbatim per kernel.
        u64_ptr = np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS")
        shared_argtypes = [u64_ptr, u64_ptr, u64_ptr, u64_ptr, ctypes.c_int]
        for name in self._KERNEL_NAMES:
            kernel = getattr(self.lib, name)
            kernel.argtypes = shared_argtypes
            kernel.restype = ctypes.c_int

    def init_device(self) -> bool:
        """Initialize optimized CUDA device and check capabilities.

        Returns:
            True when the library reports success (return code 0).
        """
        if not self.initialized:
            print("❌ CUDA accelerator not initialized")
            return False

        try:
            result = self.lib.init_optimized_cuda_device()
            if result == 0:
                print("✅ Optimized CUDA device initialized successfully")
                return True
            print(f"❌ CUDA device initialization failed: {result}")
            return False
        except Exception as e:
            print(f"❌ CUDA device initialization error: {e}")
            return False

    def benchmark_optimized_kernels(self, max_elements: int = 10000000) -> dict:
        """Benchmark all optimized CUDA kernels and compare performance.

        Args:
            max_elements: Maximum number of elements to test; larger
                entries in the size ladder are skipped.

        Returns:
            Dict with per-size results for each kernel, the CPU baseline,
            and an aggregated "performance_summary".
        """
        if not self.initialized:
            return {"error": "CUDA accelerator not initialized"}

        print(f"🚀 High-Performance CUDA Kernel Benchmark (up to {max_elements:,} elements)")
        print("=" * 80)

        # Size ladder from 1K to 10M elements.
        test_sizes = [1000, 10000, 100000, 1000000, 5000000, 10000000]

        results = {
            "test_sizes": [],
            "optimized_flat": [],
            "vectorized": [],
            "shared_memory": [],
            "cpu_baseline": [],
            "performance_summary": {},
        }

        for size in test_sizes:
            if size > max_elements:
                break

            print(f"\n📊 Benchmarking {size:,} elements...")

            # Flat arrays give the kernels coalesced memory access.
            a_flat, b_flat = self._generate_flat_test_data(size)

            # NOTE(review): all-ones limbs are a placeholder, not the real
            # bn128 modulus — confirm against the CUDA source.
            modulus = [0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF]

            flat_result = self._benchmark_optimized_flat_kernel(a_flat, b_flat, modulus, size)
            vec_result = self._benchmark_vectorized_kernel(a_flat, b_flat, modulus, size)
            shared_result = self._benchmark_shared_memory_kernel(a_flat, b_flat, modulus, size)
            cpu_result = self._benchmark_cpu_baseline(a_flat, b_flat, modulus, size)

            results["test_sizes"].append(size)
            results["optimized_flat"].append(flat_result)
            results["vectorized"].append(vec_result)
            results["shared_memory"].append(shared_result)
            results["cpu_baseline"].append(cpu_result)

            print(f"   Optimized Flat: {flat_result['time']:.4f}s, {flat_result['throughput']:.0f} elem/s")
            print(f"   Vectorized: {vec_result['time']:.4f}s, {vec_result['throughput']:.0f} elem/s")
            print(f"   Shared Memory: {shared_result['time']:.4f}s, {shared_result['throughput']:.0f} elem/s")
            print(f"   CPU Baseline: {cpu_result['time']:.4f}s, {cpu_result['throughput']:.0f} elem/s")

            # Speedups vs CPU; a failed kernel has time == inf -> 0x.
            flat_speedup = cpu_result['time'] / flat_result['time'] if flat_result['time'] > 0 else 0
            vec_speedup = cpu_result['time'] / vec_result['time'] if vec_result['time'] > 0 else 0
            shared_speedup = cpu_result['time'] / shared_result['time'] if shared_result['time'] > 0 else 0

            print(f"   Speedups - Flat: {flat_speedup:.2f}x, Vec: {vec_speedup:.2f}x, Shared: {shared_speedup:.2f}x")

        results["performance_summary"] = self._calculate_performance_summary(results)
        self._print_performance_summary(results["performance_summary"])

        return results

    def _run_kernel_benchmark(self, kernel, a_flat: np.ndarray, b_flat: np.ndarray,
                              modulus: List[int], num_elements: int, label: str) -> dict:
        """Time one GPU kernel over _BENCHMARK_RUNS runs.

        Shared implementation for the three kernel benchmarks, which were
        previously three copy-paste bodies differing only in the function
        pointer and error label.

        Returns:
            Dict with averaged "time", "throughput" (elements/s), and
            "success"; runs where the kernel returns nonzero are excluded.
        """
        try:
            result_flat = np.zeros_like(a_flat)
            modulus_array = np.array(modulus, dtype=np.uint64)

            times = []
            for _ in range(self._BENCHMARK_RUNS):
                start_time = time.time()
                status = kernel(a_flat, b_flat, result_flat, modulus_array, num_elements)
                run_time = time.time() - start_time

                if status == 0:  # 0 == success by the library's convention
                    times.append(run_time)

            if not times:
                return {"time": float('inf'), "throughput": 0, "success": False}

            avg_time = sum(times) / len(times)
            throughput = num_elements / avg_time if avg_time > 0 else 0
            return {"time": avg_time, "throughput": throughput, "success": True}

        except Exception as e:
            print(f"   ❌ {label} error: {e}")
            return {"time": float('inf'), "throughput": 0, "success": False}

    def _benchmark_optimized_flat_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray,
                                         modulus: List[int], num_elements: int) -> dict:
        """Benchmark optimized flat array kernel."""
        return self._run_kernel_benchmark(self.lib.gpu_optimized_field_addition,
                                          a_flat, b_flat, modulus, num_elements,
                                          "Optimized flat kernel")

    def _benchmark_vectorized_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray,
                                     modulus: List[int], num_elements: int) -> dict:
        """Benchmark vectorized kernel.

        The flat layout is reused as-is; converting to a packed vector
        (uint4) format is left to the CUDA side.
        """
        return self._run_kernel_benchmark(self.lib.gpu_vectorized_field_addition,
                                          a_flat, b_flat, modulus, num_elements,
                                          "Vectorized kernel")

    def _benchmark_shared_memory_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray,
                                        modulus: List[int], num_elements: int) -> dict:
        """Benchmark shared memory kernel."""
        return self._run_kernel_benchmark(self.lib.gpu_shared_memory_field_addition,
                                          a_flat, b_flat, modulus, num_elements,
                                          "Shared memory kernel")

    def _benchmark_cpu_baseline(self, a_flat: np.ndarray, b_flat: np.ndarray,
                                modulus: List[int], num_elements: int) -> dict:
        """Benchmark CPU baseline for comparison.

        Vectorized with NumPy: the previous per-element Python loop took
        minutes at 10M elements and dominated total benchmark wall time.
        The per-limb arithmetic is unchanged: (a + b) mod modulus[limb].
        """
        try:
            start_time = time.time()

            modulus_row = np.array(modulus, dtype=np.uint64)
            a_limbs = a_flat[:num_elements * 4].reshape(-1, 4)
            b_limbs = b_flat[:num_elements * 4].reshape(-1, 4)
            # This computation is the timed workload; the result itself is
            # discarded, matching the original baseline.
            _ = (a_limbs + b_limbs) % modulus_row

            cpu_time = time.time() - start_time
            throughput = num_elements / cpu_time if cpu_time > 0 else 0
            return {"time": cpu_time, "throughput": throughput, "success": True}

        except Exception as e:
            print(f"   ❌ CPU baseline error: {e}")
            return {"time": float('inf'), "throughput": 0, "success": False}

    def _generate_flat_test_data(self, num_elements: int) -> Tuple[np.ndarray, np.ndarray]:
        """Generate flat array test data for optimal memory access.

        Returns:
            Two uint64 arrays of length num_elements * 4 (one element is
            four limbs). Values stay below 2**32 so limb-wise addition
            cannot wrap uint64.
        """
        flat_size = num_elements * 4

        a_flat = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64)
        b_flat = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64)

        return a_flat, b_flat

    def _calculate_performance_summary(self, results: dict) -> dict:
        """Aggregate best/average speedup and throughput across all sizes.

        Returns:
            Dict with best/average speedup and throughput plus the element
            count at which each best value was observed; empty when no
            sizes were benchmarked.
        """
        summary = {}

        best_speedups = []
        best_throughputs = []

        for i, _size in enumerate(results["test_sizes"]):
            cpu_time = results["cpu_baseline"][i]["time"]

            speedups = []
            throughputs = []
            for key in ("optimized_flat", "vectorized", "shared_memory"):
                kernel_time = results[key][i]["time"]
                # Failed kernels carry time == inf, yielding a 0x speedup.
                speedups.append(cpu_time / kernel_time if kernel_time > 0 else 0)
                throughputs.append(results[key][i]["throughput"])

            best_speedups.append(max(speedups))
            best_throughputs.append(max(throughputs))

        if best_speedups:
            summary["best_speedup"] = max(best_speedups)
            summary["average_speedup"] = sum(best_speedups) / len(best_speedups)
            summary["best_speedup_size"] = results["test_sizes"][best_speedups.index(max(best_speedups))]

        if best_throughputs:
            summary["best_throughput"] = max(best_throughputs)
            summary["average_throughput"] = sum(best_throughputs) / len(best_throughputs)
            summary["best_throughput_size"] = results["test_sizes"][best_throughputs.index(max(best_throughputs))]

        return summary

    def _print_performance_summary(self, summary: dict):
        """Print comprehensive performance summary to stdout."""
        print(f"\n🎯 High-Performance CUDA Summary:")
        print("=" * 50)

        if "best_speedup" in summary:
            print(f"   Best Speedup: {summary['best_speedup']:.2f}x at {summary.get('best_speedup_size', 'N/A'):,} elements")
            print(f"   Average Speedup: {summary['average_speedup']:.2f}x across all tests")

        if "best_throughput" in summary:
            print(f"   Best Throughput: {summary['best_throughput']:.0f} elements/s at {summary.get('best_throughput_size', 'N/A'):,} elements")
            print(f"   Average Throughput: {summary['average_throughput']:.0f} elements/s")

        # Qualitative classification by best observed speedup.
        if summary.get("best_speedup", 0) > 5:
            print("   🚀 Performance: EXCELLENT - Significant GPU acceleration achieved")
        elif summary.get("best_speedup", 0) > 2:
            print("   ✅ Performance: GOOD - Measurable GPU acceleration achieved")
        elif summary.get("best_speedup", 0) > 1:
            print("   ⚠️ Performance: MODERATE - Limited GPU acceleration")
        else:
            print("   ❌ Performance: POOR - No significant GPU acceleration")

    def analyze_memory_bandwidth(self, num_elements: int = 1000000) -> dict:
        """Analyze memory bandwidth performance of each kernel.

        Args:
            num_elements: Number of field elements pushed through each kernel.

        Returns:
            Dict with total data size (GB) and per-kernel effective
            bandwidth (GB/s).
        """
        print(f"🔍 Analyzing Memory Bandwidth Performance ({num_elements:,} elements)...")

        a_flat, b_flat = self._generate_flat_test_data(num_elements)
        modulus = [0xFFFFFFFFFFFFFFFF] * 4

        flat_result = self._benchmark_optimized_flat_kernel(a_flat, b_flat, modulus, num_elements)
        vec_result = self._benchmark_vectorized_kernel(a_flat, b_flat, modulus, num_elements)
        shared_result = self._benchmark_shared_memory_kernel(a_flat, b_flat, modulus, num_elements)

        # Bytes moved: 3 arrays (a, b, result) x 4 limbs x 8 bytes each.
        data_size = num_elements * 4 * 8 * 3

        analysis = {
            "data_size_gb": data_size / (1024**3),
            "flat_bandwidth_gb_s": data_size / (flat_result['time'] * 1024**3) if flat_result['time'] > 0 else 0,
            "vectorized_bandwidth_gb_s": data_size / (vec_result['time'] * 1024**3) if vec_result['time'] > 0 else 0,
            "shared_bandwidth_gb_s": data_size / (shared_result['time'] * 1024**3) if shared_result['time'] > 0 else 0,
        }

        print(f"   Data Size: {analysis['data_size_gb']:.2f} GB")
        print(f"   Flat Kernel: {analysis['flat_bandwidth_gb_s']:.2f} GB/s")
        print(f"   Vectorized Kernel: {analysis['vectorized_bandwidth_gb_s']:.2f} GB/s")
        print(f"   Shared Memory Kernel: {analysis['shared_bandwidth_gb_s']:.2f} GB/s")

        return analysis
||||
def main():
    """Main function for testing high-performance CUDA acceleration.

    Drives the full demo: construct the accelerator, initialize the CUDA
    device, run the kernel benchmark suite, and analyze memory bandwidth.
    All failures are reported to stdout; nothing propagates to the caller.
    """
    print("🚀 AITBC High-Performance CUDA ZK Accelerator Test")
    print("=" * 60)

    try:
        # Initialize high-performance accelerator (library lookup may raise
        # FileNotFoundError, which the outer except handles).
        accelerator = HighPerformanceCUDAZKAccelerator()

        if not accelerator.initialized:
            print("❌ Failed to initialize CUDA accelerator")
            return

        # Abort quietly on device failure; init_device prints its own error.
        if not accelerator.init_device():
            return

        # Run comprehensive benchmark.
        results = accelerator.benchmark_optimized_kernels(10000000)

        # Analyze memory bandwidth; the method prints its own report, so
        # the previously unused local binding was dropped.
        accelerator.analyze_memory_bandwidth(1000000)

        print("\n✅ High-Performance CUDA acceleration test completed!")

        if results.get("performance_summary", {}).get("best_speedup", 0) > 1:
            print(f"🚀 Optimization successful: {results['performance_summary']['best_speedup']:.2f}x speedup achieved")
        else:
            print("⚠️ Further optimization needed")

    except Exception as e:
        # Top-level boundary for a CLI script: report and exit cleanly.
        print(f"❌ Test failed: {e}")
||||
# Script entry point: run the full accelerator demo when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user