chore(security): enhance environment configuration, CI workflows, and wallet daemon with security improvements
- Restructure .env.example with security-focused documentation, service-specific environment file references, and AWS Secrets Manager integration.
- Update the CLI tests workflow to a single Python 3.13 version, add the pytest-mock dependency, and consolidate test execution with coverage.
- Add comprehensive security validation to the package publishing workflow with manual approval gates, secret scanning, and release controls.
This commit is contained in:
453
gpu_acceleration/legacy/high_performance_cuda_accelerator.py
Normal file
453
gpu_acceleration/legacy/high_performance_cuda_accelerator.py
Normal file
@@ -0,0 +1,453 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
High-Performance CUDA ZK Accelerator with Optimized Kernels
|
||||
Implements optimized CUDA kernels with memory coalescing, vectorization, and shared memory
|
||||
"""
|
||||
|
||||
import ctypes
|
||||
import numpy as np
|
||||
from typing import List, Tuple, Optional
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Optimized field element structure for flat array access
class OptimizedFieldElement(ctypes.Structure):
    """ctypes mirror of the CUDA-side field element: four 64-bit limbs
    (256 bits total), stored contiguously so arrays of elements can be
    passed as flat uint64 buffers.
    """

    # NOTE(review): limb order (little- vs big-endian across limbs) is not
    # visible here — confirm against optimized_field_operations.cu.
    _fields_ = [("limbs", ctypes.c_uint64 * 4)]
||||
|
||||
class HighPerformanceCUDAZKAccelerator:
    """High-performance Python interface for optimized CUDA ZK operations.

    Loads a compiled CUDA shared library via ctypes and exposes helpers to
    benchmark three field-addition kernels (flat-array, vectorized, shared
    memory) against a NumPy CPU baseline, plus a memory-bandwidth analysis.
    """

    # Number of timed runs per kernel; successful runs are averaged.
    _BENCHMARK_RUNS = 3

    # The three GPU entry points share one C signature:
    # (a_limbs, b_limbs, result_limbs, modulus_limbs, num_elements) -> int.
    _KERNEL_NAMES = (
        "gpu_optimized_field_addition",
        "gpu_vectorized_field_addition",
        "gpu_shared_memory_field_addition",
    )

    def __init__(self, lib_path: Optional[str] = None):
        """Initialize high-performance CUDA accelerator.

        Args:
            lib_path: Path to compiled optimized CUDA library (.so file).
                When omitted, conventional locations are searched; note the
                search raises FileNotFoundError *before* the try block, so
                a missing library propagates to the caller.
        """
        self.lib_path = lib_path or self._find_optimized_cuda_lib()
        self.lib = None
        self.initialized = False

        try:
            self.lib = ctypes.CDLL(self.lib_path)
            self._setup_function_signatures()
            self.initialized = True
            print(f"✅ High-Performance CUDA ZK Accelerator initialized: {self.lib_path}")
        except Exception as e:
            # Deliberate broad catch: degrade to an uninitialized
            # accelerator instead of crashing the caller.
            print(f"❌ Failed to initialize CUDA accelerator: {e}")
            self.initialized = False

    def _find_optimized_cuda_lib(self) -> str:
        """Find the compiled optimized CUDA library.

        Searches CWD-relative locations (preserved for backward
        compatibility), the directory containing this module (robust when
        the caller's working directory is elsewhere), and the system
        library path.

        Returns:
            The first candidate path that exists.

        Raises:
            FileNotFoundError: If no candidate path exists.
        """
        module_dir = os.path.dirname(os.path.abspath(__file__))
        possible_paths = [
            "./liboptimized_field_operations.so",
            "./optimized_field_operations.so",
            "../liboptimized_field_operations.so",
            "../../liboptimized_field_operations.so",
            os.path.join(module_dir, "liboptimized_field_operations.so"),
            os.path.join(module_dir, "optimized_field_operations.so"),
            "/usr/local/lib/liboptimized_field_operations.so",
        ]

        for path in possible_paths:
            if os.path.exists(path):
                return path

        raise FileNotFoundError("Optimized CUDA library not found. Please compile optimized_field_operations.cu first.")

    def _setup_function_signatures(self):
        """Setup ctypes argtypes/restype for the CUDA library functions."""
        if not self.lib:
            return

        # Device initialization takes no arguments.
        self.lib.init_optimized_cuda_device.argtypes = []
        self.lib.init_optimized_cuda_device.restype = ctypes.c_int

        # All three kernels take four contiguous uint64 arrays plus an
        # element count; declare the shared signature in one pass instead
        # of repeating it verbatim per kernel.
        u64_ptr = np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS")
        shared_argtypes = [u64_ptr, u64_ptr, u64_ptr, u64_ptr, ctypes.c_int]
        for name in self._KERNEL_NAMES:
            kernel = getattr(self.lib, name)
            kernel.argtypes = shared_argtypes
            kernel.restype = ctypes.c_int

    def init_device(self) -> bool:
        """Initialize optimized CUDA device and check capabilities.

        Returns:
            True when the library reports success (return code 0).
        """
        if not self.initialized:
            print("❌ CUDA accelerator not initialized")
            return False

        try:
            result = self.lib.init_optimized_cuda_device()
            if result == 0:
                print("✅ Optimized CUDA device initialized successfully")
                return True
            print(f"❌ CUDA device initialization failed: {result}")
            return False
        except Exception as e:
            print(f"❌ CUDA device initialization error: {e}")
            return False

    def benchmark_optimized_kernels(self, max_elements: int = 10000000) -> dict:
        """Benchmark all optimized CUDA kernels and compare performance.

        Args:
            max_elements: Maximum number of elements to test; larger
                entries in the size ladder are skipped.

        Returns:
            Dict with per-size results for each kernel, the CPU baseline,
            and an aggregated "performance_summary".
        """
        if not self.initialized:
            return {"error": "CUDA accelerator not initialized"}

        print(f"🚀 High-Performance CUDA Kernel Benchmark (up to {max_elements:,} elements)")
        print("=" * 80)

        # Size ladder from 1K to 10M elements.
        test_sizes = [1000, 10000, 100000, 1000000, 5000000, 10000000]

        results = {
            "test_sizes": [],
            "optimized_flat": [],
            "vectorized": [],
            "shared_memory": [],
            "cpu_baseline": [],
            "performance_summary": {},
        }

        for size in test_sizes:
            if size > max_elements:
                break

            print(f"\n📊 Benchmarking {size:,} elements...")

            # Flat arrays give the kernels coalesced memory access.
            a_flat, b_flat = self._generate_flat_test_data(size)

            # NOTE(review): all-ones limbs are a placeholder, not the real
            # bn128 modulus — confirm against the CUDA source.
            modulus = [0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF]

            flat_result = self._benchmark_optimized_flat_kernel(a_flat, b_flat, modulus, size)
            vec_result = self._benchmark_vectorized_kernel(a_flat, b_flat, modulus, size)
            shared_result = self._benchmark_shared_memory_kernel(a_flat, b_flat, modulus, size)
            cpu_result = self._benchmark_cpu_baseline(a_flat, b_flat, modulus, size)

            results["test_sizes"].append(size)
            results["optimized_flat"].append(flat_result)
            results["vectorized"].append(vec_result)
            results["shared_memory"].append(shared_result)
            results["cpu_baseline"].append(cpu_result)

            print(f"   Optimized Flat: {flat_result['time']:.4f}s, {flat_result['throughput']:.0f} elem/s")
            print(f"   Vectorized: {vec_result['time']:.4f}s, {vec_result['throughput']:.0f} elem/s")
            print(f"   Shared Memory: {shared_result['time']:.4f}s, {shared_result['throughput']:.0f} elem/s")
            print(f"   CPU Baseline: {cpu_result['time']:.4f}s, {cpu_result['throughput']:.0f} elem/s")

            # Speedups vs CPU; a failed kernel has time == inf -> 0x.
            flat_speedup = cpu_result['time'] / flat_result['time'] if flat_result['time'] > 0 else 0
            vec_speedup = cpu_result['time'] / vec_result['time'] if vec_result['time'] > 0 else 0
            shared_speedup = cpu_result['time'] / shared_result['time'] if shared_result['time'] > 0 else 0

            print(f"   Speedups - Flat: {flat_speedup:.2f}x, Vec: {vec_speedup:.2f}x, Shared: {shared_speedup:.2f}x")

        results["performance_summary"] = self._calculate_performance_summary(results)
        self._print_performance_summary(results["performance_summary"])

        return results

    def _run_kernel_benchmark(self, kernel, a_flat: np.ndarray, b_flat: np.ndarray,
                              modulus: List[int], num_elements: int, label: str) -> dict:
        """Time one GPU kernel over _BENCHMARK_RUNS runs.

        Shared implementation for the three kernel benchmarks, which were
        previously three copy-paste bodies differing only in the function
        pointer and error label.

        Returns:
            Dict with averaged "time", "throughput" (elements/s), and
            "success"; runs where the kernel returns nonzero are excluded.
        """
        try:
            result_flat = np.zeros_like(a_flat)
            modulus_array = np.array(modulus, dtype=np.uint64)

            times = []
            for _ in range(self._BENCHMARK_RUNS):
                start_time = time.time()
                status = kernel(a_flat, b_flat, result_flat, modulus_array, num_elements)
                run_time = time.time() - start_time

                if status == 0:  # 0 == success by the library's convention
                    times.append(run_time)

            if not times:
                return {"time": float('inf'), "throughput": 0, "success": False}

            avg_time = sum(times) / len(times)
            throughput = num_elements / avg_time if avg_time > 0 else 0
            return {"time": avg_time, "throughput": throughput, "success": True}

        except Exception as e:
            print(f"   ❌ {label} error: {e}")
            return {"time": float('inf'), "throughput": 0, "success": False}

    def _benchmark_optimized_flat_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray,
                                         modulus: List[int], num_elements: int) -> dict:
        """Benchmark optimized flat array kernel."""
        return self._run_kernel_benchmark(self.lib.gpu_optimized_field_addition,
                                          a_flat, b_flat, modulus, num_elements,
                                          "Optimized flat kernel")

    def _benchmark_vectorized_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray,
                                     modulus: List[int], num_elements: int) -> dict:
        """Benchmark vectorized kernel.

        The flat layout is reused as-is; converting to a packed vector
        (uint4) format is left to the CUDA side.
        """
        return self._run_kernel_benchmark(self.lib.gpu_vectorized_field_addition,
                                          a_flat, b_flat, modulus, num_elements,
                                          "Vectorized kernel")

    def _benchmark_shared_memory_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray,
                                        modulus: List[int], num_elements: int) -> dict:
        """Benchmark shared memory kernel."""
        return self._run_kernel_benchmark(self.lib.gpu_shared_memory_field_addition,
                                          a_flat, b_flat, modulus, num_elements,
                                          "Shared memory kernel")

    def _benchmark_cpu_baseline(self, a_flat: np.ndarray, b_flat: np.ndarray,
                                modulus: List[int], num_elements: int) -> dict:
        """Benchmark CPU baseline for comparison.

        Vectorized with NumPy: the previous per-element Python loop took
        minutes at 10M elements and dominated total benchmark wall time.
        The per-limb arithmetic is unchanged: (a + b) mod modulus[limb].
        """
        try:
            start_time = time.time()

            modulus_row = np.array(modulus, dtype=np.uint64)
            a_limbs = a_flat[:num_elements * 4].reshape(-1, 4)
            b_limbs = b_flat[:num_elements * 4].reshape(-1, 4)
            # This computation is the timed workload; the result itself is
            # discarded, matching the original baseline.
            _ = (a_limbs + b_limbs) % modulus_row

            cpu_time = time.time() - start_time
            throughput = num_elements / cpu_time if cpu_time > 0 else 0
            return {"time": cpu_time, "throughput": throughput, "success": True}

        except Exception as e:
            print(f"   ❌ CPU baseline error: {e}")
            return {"time": float('inf'), "throughput": 0, "success": False}

    def _generate_flat_test_data(self, num_elements: int) -> Tuple[np.ndarray, np.ndarray]:
        """Generate flat array test data for optimal memory access.

        Returns:
            Two uint64 arrays of length num_elements * 4 (one element is
            four limbs). Values stay below 2**32 so limb-wise addition
            cannot wrap uint64.
        """
        flat_size = num_elements * 4

        a_flat = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64)
        b_flat = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64)

        return a_flat, b_flat

    def _calculate_performance_summary(self, results: dict) -> dict:
        """Aggregate best/average speedup and throughput across all sizes.

        Returns:
            Dict with best/average speedup and throughput plus the element
            count at which each best value was observed; empty when no
            sizes were benchmarked.
        """
        summary = {}

        best_speedups = []
        best_throughputs = []

        for i, _size in enumerate(results["test_sizes"]):
            cpu_time = results["cpu_baseline"][i]["time"]

            speedups = []
            throughputs = []
            for key in ("optimized_flat", "vectorized", "shared_memory"):
                kernel_time = results[key][i]["time"]
                # Failed kernels carry time == inf, yielding a 0x speedup.
                speedups.append(cpu_time / kernel_time if kernel_time > 0 else 0)
                throughputs.append(results[key][i]["throughput"])

            best_speedups.append(max(speedups))
            best_throughputs.append(max(throughputs))

        if best_speedups:
            summary["best_speedup"] = max(best_speedups)
            summary["average_speedup"] = sum(best_speedups) / len(best_speedups)
            summary["best_speedup_size"] = results["test_sizes"][best_speedups.index(max(best_speedups))]

        if best_throughputs:
            summary["best_throughput"] = max(best_throughputs)
            summary["average_throughput"] = sum(best_throughputs) / len(best_throughputs)
            summary["best_throughput_size"] = results["test_sizes"][best_throughputs.index(max(best_throughputs))]

        return summary

    def _print_performance_summary(self, summary: dict):
        """Print comprehensive performance summary to stdout."""
        print(f"\n🎯 High-Performance CUDA Summary:")
        print("=" * 50)

        if "best_speedup" in summary:
            print(f"   Best Speedup: {summary['best_speedup']:.2f}x at {summary.get('best_speedup_size', 'N/A'):,} elements")
            print(f"   Average Speedup: {summary['average_speedup']:.2f}x across all tests")

        if "best_throughput" in summary:
            print(f"   Best Throughput: {summary['best_throughput']:.0f} elements/s at {summary.get('best_throughput_size', 'N/A'):,} elements")
            print(f"   Average Throughput: {summary['average_throughput']:.0f} elements/s")

        # Qualitative classification by best observed speedup.
        if summary.get("best_speedup", 0) > 5:
            print("   🚀 Performance: EXCELLENT - Significant GPU acceleration achieved")
        elif summary.get("best_speedup", 0) > 2:
            print("   ✅ Performance: GOOD - Measurable GPU acceleration achieved")
        elif summary.get("best_speedup", 0) > 1:
            print("   ⚠️ Performance: MODERATE - Limited GPU acceleration")
        else:
            print("   ❌ Performance: POOR - No significant GPU acceleration")

    def analyze_memory_bandwidth(self, num_elements: int = 1000000) -> dict:
        """Analyze memory bandwidth performance of each kernel.

        Args:
            num_elements: Number of field elements pushed through each kernel.

        Returns:
            Dict with total data size (GB) and per-kernel effective
            bandwidth (GB/s).
        """
        print(f"🔍 Analyzing Memory Bandwidth Performance ({num_elements:,} elements)...")

        a_flat, b_flat = self._generate_flat_test_data(num_elements)
        modulus = [0xFFFFFFFFFFFFFFFF] * 4

        flat_result = self._benchmark_optimized_flat_kernel(a_flat, b_flat, modulus, num_elements)
        vec_result = self._benchmark_vectorized_kernel(a_flat, b_flat, modulus, num_elements)
        shared_result = self._benchmark_shared_memory_kernel(a_flat, b_flat, modulus, num_elements)

        # Bytes moved: 3 arrays (a, b, result) x 4 limbs x 8 bytes each.
        data_size = num_elements * 4 * 8 * 3

        analysis = {
            "data_size_gb": data_size / (1024**3),
            "flat_bandwidth_gb_s": data_size / (flat_result['time'] * 1024**3) if flat_result['time'] > 0 else 0,
            "vectorized_bandwidth_gb_s": data_size / (vec_result['time'] * 1024**3) if vec_result['time'] > 0 else 0,
            "shared_bandwidth_gb_s": data_size / (shared_result['time'] * 1024**3) if shared_result['time'] > 0 else 0,
        }

        print(f"   Data Size: {analysis['data_size_gb']:.2f} GB")
        print(f"   Flat Kernel: {analysis['flat_bandwidth_gb_s']:.2f} GB/s")
        print(f"   Vectorized Kernel: {analysis['vectorized_bandwidth_gb_s']:.2f} GB/s")
        print(f"   Shared Memory Kernel: {analysis['shared_bandwidth_gb_s']:.2f} GB/s")

        return analysis
||||
def main():
    """Main function for testing high-performance CUDA acceleration.

    Drives the full demo: construct the accelerator, initialize the CUDA
    device, run the kernel benchmark suite, and analyze memory bandwidth.
    All failures are reported to stdout; nothing propagates to the caller.
    """
    print("🚀 AITBC High-Performance CUDA ZK Accelerator Test")
    print("=" * 60)

    try:
        # Initialize high-performance accelerator (library lookup may raise
        # FileNotFoundError, which the outer except handles).
        accelerator = HighPerformanceCUDAZKAccelerator()

        if not accelerator.initialized:
            print("❌ Failed to initialize CUDA accelerator")
            return

        # Abort quietly on device failure; init_device prints its own error.
        if not accelerator.init_device():
            return

        # Run comprehensive benchmark.
        results = accelerator.benchmark_optimized_kernels(10000000)

        # Analyze memory bandwidth; the method prints its own report, so
        # the previously unused local binding was dropped.
        accelerator.analyze_memory_bandwidth(1000000)

        print("\n✅ High-Performance CUDA acceleration test completed!")

        if results.get("performance_summary", {}).get("best_speedup", 0) > 1:
            print(f"🚀 Optimization successful: {results['performance_summary']['best_speedup']:.2f}x speedup achieved")
        else:
            print("⚠️ Further optimization needed")

    except Exception as e:
        # Top-level boundary for a CLI script: report and exit cleanly.
        print(f"❌ Test failed: {e}")
||||
# Script entry point: run the full accelerator demo when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user