Update Python version requirements and fix compatibility issues

- Bump minimum Python version from 3.11 to 3.13 across all apps
- Add Python 3.11-3.13 test matrix to CLI workflow
- Document Python 3.11+ requirement in .env.example
- Fix Starlette Broadcast removal with in-process fallback implementation
- Add _InProcessBroadcast class for tests when Starlette Broadcast is unavailable
- Refactor API key validators to read live settings instead of cached values
- Update database models with explicit
Author: oib
Date: 2026-02-24 18:41:08 +01:00
Parent: 24b3a37733
Commit: 825f157749
270 changed files with 66674 additions and 2027 deletions
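
Two of the changes summarized above (the broadcast fallback and the validator refactor) do not appear in the diffs shown below. As a rough illustration only, an in-process publish/subscribe fallback along the lines the message describes might look like this; the publish()/subscribe() interface is assumed, not taken from the diff:

    import asyncio
    from collections import defaultdict
    from contextlib import asynccontextmanager

    class _InProcessBroadcast:
        """Minimal in-process stand-in used when the real broadcast backend is unavailable."""

        def __init__(self) -> None:
            self._subscribers: dict[str, set[asyncio.Queue]] = defaultdict(set)

        async def publish(self, channel: str, message: str) -> None:
            # Fan the message out to every queue subscribed to this channel
            for queue in self._subscribers[channel]:
                queue.put_nowait(message)

        @asynccontextmanager
        async def subscribe(self, channel: str):
            # Each subscriber gets a private queue, removed again on exit
            queue: asyncio.Queue = asyncio.Queue()
            self._subscribers[channel].add(queue)
            try:
                yield queue
            finally:
                self._subscribers[channel].discard(queue)

Likewise, "read live settings instead of cached values" amounts to reading configuration at call time rather than import time; a minimal sketch with a hypothetical get_settings() accessor:

    class Settings:
        api_key: str = "expected-key"

    def get_settings() -> Settings:
        # Hypothetical accessor standing in for the app's real settings loader
        return Settings()

    # Before: the key is captured once at import time and goes stale in tests
    API_KEY = get_settings().api_key

    def validate_cached(key: str) -> bool:
        return key == API_KEY

    # After: each call reads the live settings object
    def validate_live(key: str) -> bool:
        return key == get_settings().api_key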


@@ -0,0 +1,311 @@
#!/usr/bin/env python3
"""
CUDA Integration for ZK Circuit Acceleration
Python wrapper for GPU-accelerated field operations and constraint verification
"""
import ctypes
import numpy as np
from typing import List, Tuple, Optional
import os
# Field element structure (256-bit for bn128 curve)
class FieldElement(ctypes.Structure):
_fields_ = [("limbs", ctypes.c_uint64 * 4)]
# Constraint structure for parallel processing
class Constraint(ctypes.Structure):
_fields_ = [
("a", FieldElement),
("b", FieldElement),
("c", FieldElement),
("operation", ctypes.c_uint8) # 0: a + b = c, 1: a * b = c
]
class CUDAZKAccelerator:
"""Python interface for CUDA-accelerated ZK circuit operations"""
def __init__(self, lib_path: Optional[str] = None):
"""
Initialize CUDA accelerator
Args:
lib_path: Path to compiled CUDA library (.so file)
"""
self.lib_path = lib_path or self._find_cuda_lib()
self.lib = None
self.initialized = False
try:
self.lib = ctypes.CDLL(self.lib_path)
self._setup_function_signatures()
self.initialized = True
print(f"✅ CUDA ZK Accelerator initialized: {self.lib_path}")
except Exception as e:
print(f"❌ Failed to initialize CUDA accelerator: {e}")
self.initialized = False
def _find_cuda_lib(self) -> str:
"""Find the compiled CUDA library"""
# Look for library in common locations
possible_paths = [
"./libfield_operations.so",
"./field_operations.so",
"../field_operations.so",
"../../field_operations.so",
"/usr/local/lib/libfield_operations.so"
]
for path in possible_paths:
if os.path.exists(path):
return path
raise FileNotFoundError("CUDA library not found. Please compile field_operations.cu first.")
def _setup_function_signatures(self):
"""Setup function signatures for CUDA library functions"""
if not self.lib:
return
# Initialize CUDA device
self.lib.init_cuda_device.argtypes = []
self.lib.init_cuda_device.restype = ctypes.c_int
# Field addition
self.lib.gpu_field_addition.argtypes = [
np.ctypeslib.ndpointer(FieldElement, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(FieldElement, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(FieldElement, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
ctypes.c_int
]
self.lib.gpu_field_addition.restype = ctypes.c_int
# Constraint verification
self.lib.gpu_constraint_verification.argtypes = [
np.ctypeslib.ndpointer(Constraint, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(FieldElement, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(ctypes.c_bool, flags="C_CONTIGUOUS"),
ctypes.c_int
]
self.lib.gpu_constraint_verification.restype = ctypes.c_int
def init_device(self) -> bool:
"""Initialize CUDA device and check capabilities"""
if not self.initialized:
print("❌ CUDA accelerator not initialized")
return False
try:
result = self.lib.init_cuda_device()
if result == 0:
print("✅ CUDA device initialized successfully")
return True
else:
print(f"❌ CUDA device initialization failed: {result}")
return False
except Exception as e:
print(f"❌ CUDA device initialization error: {e}")
return False
def field_addition(
self,
a: List[FieldElement],
b: List[FieldElement],
modulus: List[int]
) -> Tuple[bool, Optional[List[FieldElement]]]:
"""
Perform parallel field addition on GPU
Args:
a: First operand array
b: Second operand array
modulus: Field modulus (4 x 64-bit limbs)
Returns:
(success, result_array)
"""
if not self.initialized:
return False, None
try:
num_elements = len(a)
if num_elements != len(b):
print("❌ Input arrays must have same length")
return False, None
# Convert to numpy arrays
a_array = np.array(a, dtype=FieldElement)
b_array = np.array(b, dtype=FieldElement)
result_array = np.zeros(num_elements, dtype=FieldElement)
modulus_array = np.array(modulus, dtype=ctypes.c_uint64)
# Call GPU function
result = self.lib.gpu_field_addition(
a_array, b_array, result_array, modulus_array, num_elements
)
if result == 0:
print(f"✅ GPU field addition completed for {num_elements} elements")
return True, result_array.tolist()
else:
print(f"❌ GPU field addition failed: {result}")
return False, None
except Exception as e:
print(f"❌ GPU field addition error: {e}")
return False, None
def constraint_verification(
self,
constraints: List[Constraint],
witness: List[FieldElement]
) -> Tuple[bool, Optional[List[bool]]]:
"""
Perform parallel constraint verification on GPU
Args:
constraints: Array of constraints to verify
witness: Witness array
Returns:
(success, verification_results)
"""
if not self.initialized:
return False, None
try:
num_constraints = len(constraints)
# Convert to numpy arrays
constraints_array = np.array(constraints, dtype=Constraint)
witness_array = np.array(witness, dtype=FieldElement)
results_array = np.zeros(num_constraints, dtype=ctypes.c_bool)
# Call GPU function
result = self.lib.gpu_constraint_verification(
constraints_array, witness_array, results_array, num_constraints
)
if result == 0:
verified_count = np.sum(results_array)
print(f"✅ GPU constraint verification: {verified_count}/{num_constraints} passed")
return True, results_array.tolist()
else:
print(f"❌ GPU constraint verification failed: {result}")
return False, None
except Exception as e:
print(f"❌ GPU constraint verification error: {e}")
return False, None
def benchmark_performance(self, num_elements: int = 10000) -> dict:
"""
Benchmark GPU vs CPU performance for field operations
Args:
num_elements: Number of elements to process
Returns:
Performance benchmark results
"""
if not self.initialized:
return {"error": "CUDA accelerator not initialized"}
print(f"🚀 Benchmarking GPU performance with {num_elements} elements...")
# Generate test data
a_elements = []
b_elements = []
for i in range(num_elements):
a = FieldElement()
b = FieldElement()
# Fill with test values
for j in range(4):
a.limbs[j] = (i + j) % (2**32)
b.limbs[j] = (i * 2 + j) % (2**32)
a_elements.append(a)
b_elements.append(b)
# bn128 field modulus (simplified)
modulus = [0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF]
# GPU benchmark
import time
start_time = time.time()
success, gpu_result = self.field_addition(a_elements, b_elements, modulus)
gpu_time = time.time() - start_time
# CPU benchmark (simplified)
start_time = time.time()
# Simple CPU field addition (per-limb arithmetic, not a true 256-bit reduction)
cpu_result = []
for i in range(num_elements):
c = FieldElement()
for j in range(4):
c.limbs[j] = (a_elements[i].limbs[j] + b_elements[i].limbs[j]) % modulus[j]
cpu_result.append(c)
cpu_time = time.time() - start_time
# Calculate speedup
speedup = cpu_time / gpu_time if gpu_time > 0 else 0
results = {
"num_elements": num_elements,
"gpu_time": gpu_time,
"cpu_time": cpu_time,
"speedup": speedup,
"gpu_success": success,
"elements_per_second_gpu": num_elements / gpu_time if gpu_time > 0 else 0,
"elements_per_second_cpu": num_elements / cpu_time if cpu_time > 0 else 0
}
print(f"📊 Benchmark Results:")
print(f" GPU Time: {gpu_time:.4f}s")
print(f" CPU Time: {cpu_time:.4f}s")
print(f" Speedup: {speedup:.2f}x")
print(f" GPU Throughput: {results['elements_per_second_gpu']:.0f} elements/s")
return results
def main():
"""Main function for testing CUDA acceleration"""
print("🚀 AITBC CUDA ZK Accelerator Test")
print("=" * 50)
try:
# Initialize accelerator
accelerator = CUDAZKAccelerator()
if not accelerator.initialized:
print("❌ Failed to initialize CUDA accelerator")
print("💡 Please compile field_operations.cu first:")
print(" nvcc -shared -o libfield_operations.so field_operations.cu")
return
# Initialize device
if not accelerator.init_device():
return
# Run benchmark
results = accelerator.benchmark_performance(10000)
if "error" not in results:
print("\n✅ CUDA acceleration test completed successfully!")
print(f"🚀 Achieved {results['speedup']:.2f}x speedup")
else:
print(f"❌ Benchmark failed: {results['error']}")
except Exception as e:
print(f"❌ Test failed: {e}")
if __name__ == "__main__":
main()
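
Note: FieldElement stores a 256-bit value as four 64-bit limbs with limb 0 least significant (the kernels propagate carries starting at limb 0). A pair of hypothetical helpers, not part of the file above, shows how Python integers map onto that layout (assumes the FieldElement class defined above is in scope):

    def int_to_field(value: int) -> FieldElement:
        fe = FieldElement()
        for i in range(4):
            fe.limbs[i] = (value >> (64 * i)) & 0xFFFFFFFFFFFFFFFF
        return fe

    def field_to_int(fe: FieldElement) -> int:
        return sum(fe.limbs[i] << (64 * i) for i in range(4))

    assert field_to_int(int_to_field(12345)) == 12345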


@@ -0,0 +1,330 @@
/**
* CUDA Kernel for ZK Circuit Field Operations
*
* Implements GPU-accelerated field arithmetic for zero-knowledge proof generation
* focusing on parallel processing of large constraint systems and witness calculations.
*/
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <device_launch_parameters.h>
#include <stdint.h>
#include <stdio.h>
// Custom 128-bit integer type for CUDA compatibility
typedef unsigned long long uint128_t __attribute__((mode(TI)));
// Field element structure (256-bit for bn128 curve)
typedef struct {
uint64_t limbs[4]; // 4 x 64-bit limbs for 256-bit field element
} field_element_t;
// Constraint structure for parallel processing
typedef struct {
field_element_t a;
field_element_t b;
field_element_t c;
uint8_t operation; // 0: a + b = c, 1: a * b = c
} constraint_t;
// CUDA kernel for parallel field addition
__global__ void field_addition_kernel(
const field_element_t* a,
const field_element_t* b,
field_element_t* result,
const uint64_t modulus[4],
int num_elements
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_elements) {
// Perform field addition with modulus reduction
uint64_t carry = 0;
for (int i = 0; i < 4; i++) {
uint128_t sum = (uint128_t)a[idx].limbs[i] + b[idx].limbs[i] + carry;
result[idx].limbs[i] = (uint64_t)sum;
carry = sum >> 64;
}
// Conditional modulus reduction: subtract the modulus when the sum
// overflowed 256 bits or compares >= modulus (most significant limb first;
// assumes both inputs were already reduced below the modulus)
bool reduce = (carry != 0);
if (!reduce) {
reduce = true; // result == modulus also reduces (to zero)
for (int i = 3; i >= 0; i--) {
if (result[idx].limbs[i] > modulus[i]) break;
if (result[idx].limbs[i] < modulus[i]) { reduce = false; break; }
}
}
if (reduce) {
uint64_t borrow = 0;
for (int i = 0; i < 4; i++) {
uint128_t diff = (uint128_t)result[idx].limbs[i] - modulus[i] - borrow;
result[idx].limbs[i] = (uint64_t)diff;
borrow = (uint64_t)(diff >> 64) & 1; // Borrow out of the 64-bit subtraction
}
}
}
}
// CUDA kernel for parallel field multiplication
__global__ void field_multiplication_kernel(
const field_element_t* a,
const field_element_t* b,
field_element_t* result,
const uint64_t modulus[4],
int num_elements
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_elements) {
// Perform schoolbook multiplication with modulus reduction
uint64_t product[8] = {0}; // Intermediate product (512 bits)
// Multiply all limbs
for (int i = 0; i < 4; i++) {
uint64_t carry = 0;
for (int j = 0; j < 4; j++) {
uint128_t partial = (uint128_t)a[idx].limbs[i] * b[idx].limbs[j] + product[i + j] + carry;
product[i + j] = (uint64_t)partial;
carry = partial >> 64;
}
product[i + 4] = carry;
}
// Montgomery reduction (simplified for demonstration)
// In practice, would use proper Montgomery reduction algorithm
for (int i = 0; i < 4; i++) {
result[idx].limbs[i] = product[i]; // Simplified - needs proper reduction
}
}
}
// CUDA kernel for parallel constraint verification
__global__ void constraint_verification_kernel(
const constraint_t* constraints,
const field_element_t* witness,
bool* results,
int num_constraints
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_constraints) {
const constraint_t* c = &constraints[idx];
field_element_t computed;
if (c->operation == 0) {
// Addition constraint: a + b = c
// Simplified field addition
uint64_t carry = 0;
for (int i = 0; i < 4; i++) {
uint128_t sum = (uint128_t)c->a.limbs[i] + c->b.limbs[i] + carry;
computed.limbs[i] = (uint64_t)sum;
carry = sum >> 64;
}
} else {
// Multiplication constraint: a * b = c
// Simplified field multiplication
computed.limbs[0] = c->a.limbs[0] * c->b.limbs[0]; // Simplified
computed.limbs[1] = 0;
computed.limbs[2] = 0;
computed.limbs[3] = 0;
}
// Check if computed equals expected
bool equal = true;
for (int i = 0; i < 4; i++) {
if (computed.limbs[i] != c->c.limbs[i]) {
equal = false;
break;
}
}
results[idx] = equal;
}
}
// CUDA kernel for parallel witness generation
__global__ void witness_generation_kernel(
const field_element_t* inputs,
field_element_t* witness,
int num_inputs,
int witness_size
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_inputs) {
// Copy inputs to witness
witness[idx] = inputs[idx];
// Generate additional witness elements (simplified)
// In practice, would implement proper witness generation algorithm
for (int i = num_inputs; i < witness_size; i++) {
if (idx == 0) { // Only first thread generates additional elements
// Simple linear combination (placeholder)
witness[i].limbs[0] = inputs[0].limbs[0] + i;
witness[i].limbs[1] = 0;
witness[i].limbs[2] = 0;
witness[i].limbs[3] = 0;
}
}
}
}
// Host wrapper functions
extern "C" {
// Initialize CUDA device and check capabilities
cudaError_t init_cuda_device() {
int deviceCount = 0;
cudaError_t error = cudaGetDeviceCount(&deviceCount);
if (error != cudaSuccess || deviceCount == 0) {
printf("No CUDA devices found\n");
return error;
}
// Select first available device
error = cudaSetDevice(0);
if (error != cudaSuccess) {
printf("Failed to set CUDA device\n");
return error;
}
// Get device properties
cudaDeviceProp prop;
error = cudaGetDeviceProperties(&prop, 0);
if (error == cudaSuccess) {
printf("CUDA Device: %s\n", prop.name);
printf("Compute Capability: %d.%d\n", prop.major, prop.minor);
printf("Global Memory: %zu MB\n", prop.totalGlobalMem / (1024 * 1024));
printf("Shared Memory per Block: %zu KB\n", prop.sharedMemPerBlock / 1024);
printf("Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
}
return error;
}
// Parallel field addition on GPU
cudaError_t gpu_field_addition(
const field_element_t* a,
const field_element_t* b,
field_element_t* result,
const uint64_t modulus[4],
int num_elements
) {
// Allocate device memory (for brevity, the early error returns below leak prior allocations; production code should free them first)
field_element_t *d_a, *d_b, *d_result;
uint64_t *d_modulus;
size_t field_size = num_elements * sizeof(field_element_t);
size_t modulus_size = 4 * sizeof(uint64_t);
cudaError_t error = cudaMalloc(&d_a, field_size);
if (error != cudaSuccess) return error;
error = cudaMalloc(&d_b, field_size);
if (error != cudaSuccess) return error;
error = cudaMalloc(&d_result, field_size);
if (error != cudaSuccess) return error;
error = cudaMalloc(&d_modulus, modulus_size);
if (error != cudaSuccess) return error;
// Copy data to device
error = cudaMemcpy(d_a, a, field_size, cudaMemcpyHostToDevice);
if (error != cudaSuccess) return error;
error = cudaMemcpy(d_b, b, field_size, cudaMemcpyHostToDevice);
if (error != cudaSuccess) return error;
error = cudaMemcpy(d_modulus, modulus, modulus_size, cudaMemcpyHostToDevice);
if (error != cudaSuccess) return error;
// Launch kernel
int threadsPerBlock = 256;
int blocksPerGrid = (num_elements + threadsPerBlock - 1) / threadsPerBlock;
printf("Launching field addition kernel: %d blocks, %d threads per block\n",
blocksPerGrid, threadsPerBlock);
field_addition_kernel<<<blocksPerGrid, threadsPerBlock>>>(
d_a, d_b, d_result, d_modulus, num_elements
);
// Check for kernel launch errors
error = cudaGetLastError();
if (error != cudaSuccess) return error;
// Copy result back to host
error = cudaMemcpy(result, d_result, field_size, cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_result);
cudaFree(d_modulus);
return error;
}
// Parallel constraint verification on GPU
cudaError_t gpu_constraint_verification(
const constraint_t* constraints,
const field_element_t* witness,
bool* results,
int num_constraints
) {
// Allocate device memory (as above, early error returns leak prior allocations)
constraint_t *d_constraints;
field_element_t *d_witness;
bool *d_results;
size_t constraint_size = num_constraints * sizeof(constraint_t);
size_t witness_size = 1000 * sizeof(field_element_t); // Assumed fixed witness size; a real API would take the witness length as a parameter
size_t result_size = num_constraints * sizeof(bool);
cudaError_t error = cudaMalloc(&d_constraints, constraint_size);
if (error != cudaSuccess) return error;
error = cudaMalloc(&d_witness, witness_size);
if (error != cudaSuccess) return error;
error = cudaMalloc(&d_results, result_size);
if (error != cudaSuccess) return error;
// Copy data to device
error = cudaMemcpy(d_constraints, constraints, constraint_size, cudaMemcpyHostToDevice);
if (error != cudaSuccess) return error;
error = cudaMemcpy(d_witness, witness, witness_size, cudaMemcpyHostToDevice);
if (error != cudaSuccess) return error;
// Launch kernel
int threadsPerBlock = 256;
int blocksPerGrid = (num_constraints + threadsPerBlock - 1) / threadsPerBlock;
printf("Launching constraint verification kernel: %d blocks, %d threads per block\n",
blocksPerGrid, threadsPerBlock);
constraint_verification_kernel<<<blocksPerGrid, threadsPerBlock>>>(
d_constraints, d_witness, d_results, num_constraints
);
// Check for kernel launch errors
error = cudaGetLastError();
if (error != cudaSuccess) return error;
// Copy result back to host
error = cudaMemcpy(results, d_results, result_size, cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_constraints);
cudaFree(d_witness);
cudaFree(d_results);
return error;
}
} // extern "C"


@@ -0,0 +1,396 @@
#!/usr/bin/env python3
"""
GPU-Aware ZK Circuit Compilation with Memory Optimization
Implements GPU-aware compilation strategies and memory management for large circuits
"""
import os
import json
import time
import hashlib
import subprocess
from typing import Dict, List, Optional, Tuple
from pathlib import Path
class GPUAwareCompiler:
"""GPU-aware ZK circuit compiler with memory optimization"""
def __init__(self, base_dir: Optional[str] = None):
self.base_dir = Path(base_dir or "/home/oib/windsurf/aitbc/apps/zk-circuits")
self.cache_dir = Path("/tmp/zk_gpu_cache")
self.cache_dir.mkdir(exist_ok=True)
# GPU memory configuration (RTX 4060 Ti: 16GB)
self.gpu_memory_config = {
"total_memory_mb": 16384,
"safe_memory_mb": 14336, # Leave 2GB for system
"circuit_memory_per_constraint": 0.001, # MB per constraint
"max_constraints_per_batch": 1000000 # 1M constraints per batch
}
print(f"🚀 GPU-Aware Compiler initialized")
print(f" Base directory: {self.base_dir}")
print(f" Cache directory: {self.cache_dir}")
print(f" GPU memory: {self.gpu_memory_config['total_memory_mb']}MB")
def estimate_circuit_memory(self, circuit_path: str) -> Dict:
"""
Estimate memory requirements for circuit compilation
Args:
circuit_path: Path to circuit file
Returns:
Memory estimation dictionary
"""
circuit_file = Path(circuit_path)
if not circuit_file.exists():
return {"error": "Circuit file not found"}
# Parse circuit to estimate constraints
try:
with open(circuit_file, 'r') as f:
content = f.read()
# Simple constraint estimation
constraint_count = content.count('<==') + content.count('===')
# Estimate memory requirements
estimated_memory = constraint_count * self.gpu_memory_config["circuit_memory_per_constraint"]
# Add overhead for compilation
compilation_overhead = estimated_memory * 2 # 2x for intermediate data
total_memory_mb = estimated_memory + compilation_overhead
return {
"circuit_path": str(circuit_file),
"estimated_constraints": constraint_count,
"estimated_memory_mb": total_memory_mb,
"compilation_overhead_mb": compilation_overhead,
"gpu_feasible": total_memory_mb < self.gpu_memory_config["safe_memory_mb"],
"recommended_batch_size": min(
self.gpu_memory_config["max_constraints_per_batch"],
int(self.gpu_memory_config["safe_memory_mb"] / self.gpu_memory_config["circuit_memory_per_constraint"])
)
}
except Exception as e:
return {"error": f"Failed to parse circuit: {e}"}
def compile_with_gpu_optimization(self, circuit_path: str, output_dir: Optional[str] = None) -> Dict:
"""
Compile circuit with GPU-aware memory optimization
Args:
circuit_path: Path to circuit file
output_dir: Output directory for compiled artifacts
Returns:
Compilation results
"""
start_time = time.time()
# Estimate memory requirements
memory_est = self.estimate_circuit_memory(circuit_path)
if "error" in memory_est:
return memory_est
print(f"🔧 Compiling {circuit_path}")
print(f" Estimated constraints: {memory_est['estimated_constraints']}")
print(f" Estimated memory: {memory_est['estimated_memory_mb']:.2f}MB")
# Check GPU feasibility
if not memory_est["gpu_feasible"]:
print("⚠️ Circuit too large for GPU, using CPU compilation")
return self.compile_cpu_fallback(circuit_path, output_dir)
# Create cache key
cache_key = self._create_cache_key(circuit_path)
cache_path = self.cache_dir / f"{cache_key}.json"
# Check cache
if cache_path.exists():
cached_result = self._load_cache(cache_path)
if cached_result:
print("✅ Using cached compilation result")
cached_result["cache_hit"] = True
cached_result["compilation_time"] = time.time() - start_time
return cached_result
# Perform GPU-aware compilation
try:
result = self._compile_circuit(circuit_path, output_dir, memory_est)
# Cache result
self._save_cache(cache_path, result)
result["compilation_time"] = time.time() - start_time
result["cache_hit"] = False
print(f"✅ Compilation completed in {result['compilation_time']:.3f}s")
return result
except Exception as e:
print(f"❌ Compilation failed: {e}")
return {"error": str(e), "compilation_time": time.time() - start_time}
def _compile_circuit(self, circuit_path: str, output_dir: str, memory_est: Dict) -> Dict:
"""
Perform actual circuit compilation with GPU optimization
"""
circuit_file = Path(circuit_path)
circuit_name = circuit_file.stem
# Set output directory
if not output_dir:
output_dir = self.base_dir / "build" / circuit_name
else:
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
# Compile with Circom
cmd = [
"circom",
str(circuit_file),
"--r1cs",
"--wasm",
"-o", str(output_dir)
]
print(f"🔄 Running: {' '.join(cmd)}")
result = subprocess.run(
cmd,
capture_output=True,
text=True,
cwd=str(self.base_dir)
)
if result.returncode != 0:
return {
"error": "Circom compilation failed",
"stderr": result.stderr,
"stdout": result.stdout
}
# Check compiled artifacts
r1cs_path = output_dir / f"{circuit_name}.r1cs"
wasm_path = output_dir / f"{circuit_name}_js" / f"{circuit_name}.wasm"
artifacts = {}
if r1cs_path.exists():
artifacts["r1cs"] = str(r1cs_path)
r1cs_size = r1cs_path.stat().st_size / (1024 * 1024) # MB
print(f" R1CS size: {r1cs_size:.2f}MB")
if wasm_path.exists():
artifacts["wasm"] = str(wasm_path)
wasm_size = wasm_path.stat().st_size / (1024 * 1024) # MB
print(f" WASM size: {wasm_size:.2f}MB")
return {
"success": True,
"circuit_name": circuit_name,
"output_dir": str(output_dir),
"artifacts": artifacts,
"memory_estimation": memory_est,
"optimization_applied": "gpu_aware_memory"
}
def compile_cpu_fallback(self, circuit_path: str, output_dir: Optional[str] = None) -> Dict:
"""Fallback CPU compilation for circuits too large for GPU"""
print("🔄 Using CPU fallback compilation")
# Use standard circom compilation
return self._compile_circuit(circuit_path, output_dir, {"gpu_feasible": False})
def batch_compile_optimized(self, circuit_paths: List[str]) -> Dict:
"""
Compile multiple circuits with GPU memory optimization
Args:
circuit_paths: List of circuit file paths
Returns:
Batch compilation results
"""
start_time = time.time()
print(f"🚀 Batch compiling {len(circuit_paths)} circuits")
# Estimate total memory requirements
total_memory = 0
memory_estimates = []
for circuit_path in circuit_paths:
est = self.estimate_circuit_memory(circuit_path)
if "error" not in est:
total_memory += est["estimated_memory_mb"]
memory_estimates.append(est)
print(f" Total estimated memory: {total_memory:.2f}MB")
# Check if batch fits in GPU memory
if total_memory > self.gpu_memory_config["safe_memory_mb"]:
print("⚠️ Batch too large for GPU, using sequential compilation")
return self.sequential_compile(circuit_paths)
# Parallel compilation (simplified - would use actual GPU parallelization)
results = []
for circuit_path in circuit_paths:
result = self.compile_with_gpu_optimization(circuit_path)
results.append(result)
total_time = time.time() - start_time
return {
"success": True,
"batch_size": len(circuit_paths),
"total_time": total_time,
"average_time": total_time / len(circuit_paths),
"results": results,
"memory_estimates": memory_estimates
}
def sequential_compile(self, circuit_paths: List[str]) -> Dict:
"""Sequential compilation fallback"""
start_time = time.time()
results = []
for circuit_path in circuit_paths:
result = self.compile_with_gpu_optimization(circuit_path)
results.append(result)
total_time = time.time() - start_time
return {
"success": True,
"batch_size": len(circuit_paths),
"compilation_type": "sequential",
"total_time": total_time,
"average_time": total_time / len(circuit_paths),
"results": results
}
def _create_cache_key(self, circuit_path: str) -> str:
"""Create cache key for circuit"""
circuit_file = Path(circuit_path)
# Use file hash and modification time
file_hash = hashlib.sha256()
try:
with open(circuit_file, 'rb') as f:
file_hash.update(f.read())
# Add modification time
mtime = circuit_file.stat().st_mtime
file_hash.update(str(mtime).encode())
return file_hash.hexdigest()[:16]
except Exception:
# Fallback to filename
return hashlib.md5(str(circuit_path).encode()).hexdigest()[:16]
def _load_cache(self, cache_path: Path) -> Optional[Dict]:
"""Load cached compilation result"""
try:
with open(cache_path, 'r') as f:
return json.load(f)
except Exception:
return None
def _save_cache(self, cache_path: Path, result: Dict):
"""Save compilation result to cache"""
try:
with open(cache_path, 'w') as f:
json.dump(result, f, indent=2)
except Exception as e:
print(f"⚠️ Failed to save cache: {e}")
def benchmark_compilation_performance(self, circuit_path: str, iterations: int = 5) -> Dict:
"""
Benchmark compilation performance
Args:
circuit_path: Path to circuit file
iterations: Number of iterations to run
Returns:
Performance benchmark results
"""
print(f"📊 Benchmarking compilation performance ({iterations} iterations)")
times = []
cache_hits = 0
successes = 0
for i in range(iterations):
print(f" Iteration {i + 1}/{iterations}")
start_time = time.time()
result = self.compile_with_gpu_optimization(circuit_path)
iteration_time = time.time() - start_time
times.append(iteration_time)
if result.get("cache_hit"):
cache_hits += 1
if result.get("success"):
successes += 1
avg_time = sum(times) / len(times)
min_time = min(times)
max_time = max(times)
return {
"circuit_path": circuit_path,
"iterations": iterations,
"success_rate": successes / iterations,
"cache_hit_rate": cache_hits / iterations,
"average_time": avg_time,
"min_time": min_time,
"max_time": max_time,
"times": times
}
def main():
"""Main function for testing GPU-aware compilation"""
print("🚀 AITBC GPU-Aware ZK Circuit Compiler")
print("=" * 50)
compiler = GPUAwareCompiler()
# Test with existing circuits
test_circuits = [
"modular_ml_components.circom",
"ml_training_verification.circom",
"ml_inference_verification.circom"
]
for circuit in test_circuits:
circuit_path = compiler.base_dir / circuit
if circuit_path.exists():
print(f"\n🔧 Testing {circuit}")
# Estimate memory
memory_est = compiler.estimate_circuit_memory(str(circuit_path))
print(f" Memory estimation: {memory_est}")
# Compile
result = compiler.compile_with_gpu_optimization(str(circuit_path))
print(f" Result: {result.get('success', False)}")
else:
print(f"⚠️ Circuit not found: {circuit_path}")
if __name__ == "__main__":
main()
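
To make the sizing logic concrete: the estimator charges 0.001 MB per constraint plus a 2x overhead for intermediate data, so a hypothetical 500k-constraint circuit budgets comfortably under the 14336 MB safe limit:

    constraints = 500_000
    per_constraint_mb = 0.001                    # from gpu_memory_config
    estimated = constraints * per_constraint_mb  # 500 MB for the circuit itself
    overhead = estimated * 2                     # 1000 MB of intermediate data
    total = estimated + overhead                 # 1500 MB -> gpu_feasible is True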


@@ -0,0 +1,453 @@
#!/usr/bin/env python3
"""
High-Performance CUDA ZK Accelerator with Optimized Kernels
Implements optimized CUDA kernels with memory coalescing, vectorization, and shared memory
"""
import ctypes
import numpy as np
from typing import List, Tuple, Optional
import os
import time
# Optimized field element structure for flat array access
class OptimizedFieldElement(ctypes.Structure):
_fields_ = [("limbs", ctypes.c_uint64 * 4)]
class HighPerformanceCUDAZKAccelerator:
"""High-performance Python interface for optimized CUDA ZK operations"""
def __init__(self, lib_path: Optional[str] = None):
"""
Initialize high-performance CUDA accelerator
Args:
lib_path: Path to compiled optimized CUDA library (.so file)
"""
self.lib_path = lib_path or self._find_optimized_cuda_lib()
self.lib = None
self.initialized = False
try:
self.lib = ctypes.CDLL(self.lib_path)
self._setup_function_signatures()
self.initialized = True
print(f"✅ High-Performance CUDA ZK Accelerator initialized: {self.lib_path}")
except Exception as e:
print(f"❌ Failed to initialize CUDA accelerator: {e}")
self.initialized = False
def _find_optimized_cuda_lib(self) -> str:
"""Find the compiled optimized CUDA library"""
possible_paths = [
"./liboptimized_field_operations.so",
"./optimized_field_operations.so",
"../liboptimized_field_operations.so",
"../../liboptimized_field_operations.so",
"/usr/local/lib/liboptimized_field_operations.so"
]
for path in possible_paths:
if os.path.exists(path):
return path
raise FileNotFoundError("Optimized CUDA library not found. Please compile optimized_field_operations.cu first.")
def _setup_function_signatures(self):
"""Setup function signatures for optimized CUDA library functions"""
if not self.lib:
return
# Initialize optimized CUDA device
self.lib.init_optimized_cuda_device.argtypes = []
self.lib.init_optimized_cuda_device.restype = ctypes.c_int
# Optimized field addition with flat arrays
self.lib.gpu_optimized_field_addition.argtypes = [
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
ctypes.c_int
]
self.lib.gpu_optimized_field_addition.restype = ctypes.c_int
# Vectorized field addition
self.lib.gpu_vectorized_field_addition.argtypes = [
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"), # field_vector_t
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
ctypes.c_int
]
self.lib.gpu_vectorized_field_addition.restype = ctypes.c_int
# Shared memory field addition
self.lib.gpu_shared_memory_field_addition.argtypes = [
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
ctypes.c_int
]
self.lib.gpu_shared_memory_field_addition.restype = ctypes.c_int
def init_device(self) -> bool:
"""Initialize optimized CUDA device and check capabilities"""
if not self.initialized:
print("❌ CUDA accelerator not initialized")
return False
try:
result = self.lib.init_optimized_cuda_device()
if result == 0:
print("✅ Optimized CUDA device initialized successfully")
return True
else:
print(f"❌ CUDA device initialization failed: {result}")
return False
except Exception as e:
print(f"❌ CUDA device initialization error: {e}")
return False
def benchmark_optimized_kernels(self, max_elements: int = 10000000) -> dict:
"""
Benchmark all optimized CUDA kernels and compare performance
Args:
max_elements: Maximum number of elements to test
Returns:
Comprehensive performance benchmark results
"""
if not self.initialized:
return {"error": "CUDA accelerator not initialized"}
print(f"🚀 High-Performance CUDA Kernel Benchmark (up to {max_elements:,} elements)")
print("=" * 80)
# Test different dataset sizes
test_sizes = [
1000, # 1K elements
10000, # 10K elements
100000, # 100K elements
1000000, # 1M elements
5000000, # 5M elements
10000000, # 10M elements
]
results = {
"test_sizes": [],
"optimized_flat": [],
"vectorized": [],
"shared_memory": [],
"cpu_baseline": [],
"performance_summary": {}
}
for size in test_sizes:
if size > max_elements:
break
print(f"\n📊 Benchmarking {size:,} elements...")
# Generate test data as flat arrays for optimal memory access
a_flat, b_flat = self._generate_flat_test_data(size)
# bn128 field modulus (simplified)
modulus = [0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF]
# Benchmark optimized flat array kernel
flat_result = self._benchmark_optimized_flat_kernel(a_flat, b_flat, modulus, size)
# Benchmark vectorized kernel
vec_result = self._benchmark_vectorized_kernel(a_flat, b_flat, modulus, size)
# Benchmark shared memory kernel
shared_result = self._benchmark_shared_memory_kernel(a_flat, b_flat, modulus, size)
# Benchmark CPU baseline
cpu_result = self._benchmark_cpu_baseline(a_flat, b_flat, modulus, size)
# Store results
results["test_sizes"].append(size)
results["optimized_flat"].append(flat_result)
results["vectorized"].append(vec_result)
results["shared_memory"].append(shared_result)
results["cpu_baseline"].append(cpu_result)
# Print comparison
print(f" Optimized Flat: {flat_result['time']:.4f}s, {flat_result['throughput']:.0f} elem/s")
print(f" Vectorized: {vec_result['time']:.4f}s, {vec_result['throughput']:.0f} elem/s")
print(f" Shared Memory: {shared_result['time']:.4f}s, {shared_result['throughput']:.0f} elem/s")
print(f" CPU Baseline: {cpu_result['time']:.4f}s, {cpu_result['throughput']:.0f} elem/s")
# Calculate speedups
flat_speedup = cpu_result['time'] / flat_result['time'] if flat_result['time'] > 0 else 0
vec_speedup = cpu_result['time'] / vec_result['time'] if vec_result['time'] > 0 else 0
shared_speedup = cpu_result['time'] / shared_result['time'] if shared_result['time'] > 0 else 0
print(f" Speedups - Flat: {flat_speedup:.2f}x, Vec: {vec_speedup:.2f}x, Shared: {shared_speedup:.2f}x")
# Calculate performance summary
results["performance_summary"] = self._calculate_performance_summary(results)
# Print final summary
self._print_performance_summary(results["performance_summary"])
return results
def _benchmark_optimized_flat_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray,
modulus: List[int], num_elements: int) -> dict:
"""Benchmark optimized flat array kernel"""
try:
result_flat = np.zeros_like(a_flat)
modulus_array = np.array(modulus, dtype=np.uint64)
# Multiple runs for consistency
times = []
for run in range(3):
start_time = time.time()
success = self.lib.gpu_optimized_field_addition(
a_flat, b_flat, result_flat, modulus_array, num_elements
)
run_time = time.time() - start_time
if success == 0: # Success
times.append(run_time)
if not times:
return {"time": float('inf'), "throughput": 0, "success": False}
avg_time = sum(times) / len(times)
throughput = num_elements / avg_time if avg_time > 0 else 0
return {"time": avg_time, "throughput": throughput, "success": True}
except Exception as e:
print(f" ❌ Optimized flat kernel error: {e}")
return {"time": float('inf'), "throughput": 0, "success": False}
def _benchmark_vectorized_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray,
modulus: List[int], num_elements: int) -> dict:
"""Benchmark vectorized kernel"""
try:
# The flat uint64 arrays already match the 4-limb vector layout expected
# by the vectorized kernel, so they are passed through without conversion
result_flat = np.zeros_like(a_flat)
modulus_array = np.array(modulus, dtype=np.uint64)
times = []
for run in range(3):
start_time = time.time()
success = self.lib.gpu_vectorized_field_addition(
a_flat, b_flat, result_flat, modulus_array, num_elements
)
run_time = time.time() - start_time
if success == 0:
times.append(run_time)
if not times:
return {"time": float('inf'), "throughput": 0, "success": False}
avg_time = sum(times) / len(times)
throughput = num_elements / avg_time if avg_time > 0 else 0
return {"time": avg_time, "throughput": throughput, "success": True}
except Exception as e:
print(f" ❌ Vectorized kernel error: {e}")
return {"time": float('inf'), "throughput": 0, "success": False}
def _benchmark_shared_memory_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray,
modulus: List[int], num_elements: int) -> dict:
"""Benchmark shared memory kernel"""
try:
result_flat = np.zeros_like(a_flat)
modulus_array = np.array(modulus, dtype=np.uint64)
times = []
for run in range(3):
start_time = time.time()
success = self.lib.gpu_shared_memory_field_addition(
a_flat, b_flat, result_flat, modulus_array, num_elements
)
run_time = time.time() - start_time
if success == 0:
times.append(run_time)
if not times:
return {"time": float('inf'), "throughput": 0, "success": False}
avg_time = sum(times) / len(times)
throughput = num_elements / avg_time if avg_time > 0 else 0
return {"time": avg_time, "throughput": throughput, "success": True}
except Exception as e:
print(f" ❌ Shared memory kernel error: {e}")
return {"time": float('inf'), "throughput": 0, "success": False}
def _benchmark_cpu_baseline(self, a_flat: np.ndarray, b_flat: np.ndarray,
modulus: List[int], num_elements: int) -> dict:
"""Benchmark CPU baseline for comparison"""
try:
start_time = time.time()
# Simple CPU field addition
result_flat = np.zeros_like(a_flat)
for i in range(num_elements):
base_idx = i * 4
for j in range(4):
result_flat[base_idx + j] = (a_flat[base_idx + j] + b_flat[base_idx + j]) % modulus[j]
cpu_time = time.time() - start_time
throughput = num_elements / cpu_time if cpu_time > 0 else 0
return {"time": cpu_time, "throughput": throughput, "success": True}
except Exception as e:
print(f" ❌ CPU baseline error: {e}")
return {"time": float('inf'), "throughput": 0, "success": False}
def _generate_flat_test_data(self, num_elements: int) -> Tuple[np.ndarray, np.ndarray]:
"""Generate flat array test data for optimal memory access"""
# Generate flat arrays (num_elements * 4 limbs)
flat_size = num_elements * 4
# Use numpy for fast generation
a_flat = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64)
b_flat = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64)
return a_flat, b_flat
def _calculate_performance_summary(self, results: dict) -> dict:
"""Calculate performance summary statistics"""
summary = {}
# Find best performing kernel for each size
best_speedups = []
best_throughputs = []
for i, size in enumerate(results["test_sizes"]):
cpu_time = results["cpu_baseline"][i]["time"]
# Calculate speedups
flat_speedup = cpu_time / results["optimized_flat"][i]["time"] if results["optimized_flat"][i]["time"] > 0 else 0
vec_speedup = cpu_time / results["vectorized"][i]["time"] if results["vectorized"][i]["time"] > 0 else 0
shared_speedup = cpu_time / results["shared_memory"][i]["time"] if results["shared_memory"][i]["time"] > 0 else 0
best_speedup = max(flat_speedup, vec_speedup, shared_speedup)
best_speedups.append(best_speedup)
# Find best throughput
best_throughput = max(
results["optimized_flat"][i]["throughput"],
results["vectorized"][i]["throughput"],
results["shared_memory"][i]["throughput"]
)
best_throughputs.append(best_throughput)
if best_speedups:
summary["best_speedup"] = max(best_speedups)
summary["average_speedup"] = sum(best_speedups) / len(best_speedups)
summary["best_speedup_size"] = results["test_sizes"][best_speedups.index(max(best_speedups))]
if best_throughputs:
summary["best_throughput"] = max(best_throughputs)
summary["average_throughput"] = sum(best_throughputs) / len(best_throughputs)
summary["best_throughput_size"] = results["test_sizes"][best_throughputs.index(max(best_throughputs))]
return summary
def _print_performance_summary(self, summary: dict):
"""Print comprehensive performance summary"""
print(f"\n🎯 High-Performance CUDA Summary:")
print("=" * 50)
if "best_speedup" in summary:
print(f" Best Speedup: {summary['best_speedup']:.2f}x at {summary.get('best_speedup_size', 'N/A'):,} elements")
print(f" Average Speedup: {summary['average_speedup']:.2f}x across all tests")
if "best_throughput" in summary:
print(f" Best Throughput: {summary['best_throughput']:.0f} elements/s at {summary.get('best_throughput_size', 'N/A'):,} elements")
print(f" Average Throughput: {summary['average_throughput']:.0f} elements/s")
# Performance classification
if summary.get("best_speedup", 0) > 5:
print(" 🚀 Performance: EXCELLENT - Significant GPU acceleration achieved")
elif summary.get("best_speedup", 0) > 2:
print(" ✅ Performance: GOOD - Measurable GPU acceleration achieved")
elif summary.get("best_speedup", 0) > 1:
print(" ⚠️ Performance: MODERATE - Limited GPU acceleration")
else:
print(" ❌ Performance: POOR - No significant GPU acceleration")
def analyze_memory_bandwidth(self, num_elements: int = 1000000) -> dict:
"""Analyze memory bandwidth performance"""
print(f"🔍 Analyzing Memory Bandwidth Performance ({num_elements:,} elements)...")
a_flat, b_flat = self._generate_flat_test_data(num_elements)
modulus = [0xFFFFFFFFFFFFFFFF] * 4
# Test different kernels
flat_result = self._benchmark_optimized_flat_kernel(a_flat, b_flat, modulus, num_elements)
vec_result = self._benchmark_vectorized_kernel(a_flat, b_flat, modulus, num_elements)
shared_result = self._benchmark_shared_memory_kernel(a_flat, b_flat, modulus, num_elements)
# Calculate theoretical bandwidth
data_size = num_elements * 4 * 8 * 3 # 3 arrays, 4 limbs, 8 bytes
analysis = {
"data_size_gb": data_size / (1024**3),
"flat_bandwidth_gb_s": data_size / (flat_result['time'] * 1024**3) if flat_result['time'] > 0 else 0,
"vectorized_bandwidth_gb_s": data_size / (vec_result['time'] * 1024**3) if vec_result['time'] > 0 else 0,
"shared_bandwidth_gb_s": data_size / (shared_result['time'] * 1024**3) if shared_result['time'] > 0 else 0,
}
print(f" Data Size: {analysis['data_size_gb']:.2f} GB")
print(f" Flat Kernel: {analysis['flat_bandwidth_gb_s']:.2f} GB/s")
print(f" Vectorized Kernel: {analysis['vectorized_bandwidth_gb_s']:.2f} GB/s")
print(f" Shared Memory Kernel: {analysis['shared_bandwidth_gb_s']:.2f} GB/s")
return analysis
def main():
"""Main function for testing high-performance CUDA acceleration"""
print("🚀 AITBC High-Performance CUDA ZK Accelerator Test")
print("=" * 60)
try:
# Initialize high-performance accelerator
accelerator = HighPerformanceCUDAZKAccelerator()
if not accelerator.initialized:
print("❌ Failed to initialize CUDA accelerator")
return
# Initialize device
if not accelerator.init_device():
return
# Run comprehensive benchmark
results = accelerator.benchmark_optimized_kernels(10000000)
# Analyze memory bandwidth
bandwidth_analysis = accelerator.analyze_memory_bandwidth(1000000)
print("\n✅ High-Performance CUDA acceleration test completed!")
if results.get("performance_summary", {}).get("best_speedup", 0) > 1:
print(f"🚀 Optimization successful: {results['performance_summary']['best_speedup']:.2f}x speedup achieved")
else:
print("⚠️ Further optimization needed")
except Exception as e:
print(f"❌ Test failed: {e}")
if __name__ == "__main__":
main()
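
The bandwidth figures reported by analyze_memory_bandwidth follow from simple arithmetic: each element moves 4 limbs x 8 bytes across 3 arrays (two inputs, one output). With illustrative numbers:

    num_elements = 1_000_000
    bytes_moved = num_elements * 4 * 8 * 3                # 96 MB per invocation
    elapsed_s = 0.010                                     # hypothetical kernel time
    bandwidth_gb_s = bytes_moved / (elapsed_s * 1024**3)  # roughly 8.9 GB/s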


@@ -0,0 +1,394 @@
#!/usr/bin/env python3
"""
Optimized CUDA ZK Accelerator with Improved Performance
Implements optimized CUDA kernels and benchmarking for better GPU utilization
"""
import ctypes
import numpy as np
from typing import List, Tuple, Optional
import os
import time
# Field element structure (256-bit for bn128 curve)
class FieldElement(ctypes.Structure):
_fields_ = [("limbs", ctypes.c_uint64 * 4)]
class OptimizedCUDAZKAccelerator:
"""Optimized Python interface for CUDA-accelerated ZK circuit operations"""
def __init__(self, lib_path: Optional[str] = None):
"""
Initialize optimized CUDA accelerator
Args:
lib_path: Path to compiled CUDA library (.so file)
"""
self.lib_path = lib_path or self._find_cuda_lib()
self.lib = None
self.initialized = False
try:
self.lib = ctypes.CDLL(self.lib_path)
self._setup_function_signatures()
self.initialized = True
print(f"✅ Optimized CUDA ZK Accelerator initialized: {self.lib_path}")
except Exception as e:
print(f"❌ Failed to initialize CUDA accelerator: {e}")
self.initialized = False
def _find_cuda_lib(self) -> str:
"""Find the compiled CUDA library"""
possible_paths = [
"./libfield_operations.so",
"./field_operations.so",
"../field_operations.so",
"../../field_operations.so",
"/usr/local/lib/libfield_operations.so"
]
for path in possible_paths:
if os.path.exists(path):
return path
raise FileNotFoundError("CUDA library not found. Please compile field_operations.cu first.")
def _setup_function_signatures(self):
"""Setup function signatures for CUDA library functions"""
if not self.lib:
return
# Initialize CUDA device
self.lib.init_cuda_device.argtypes = []
self.lib.init_cuda_device.restype = ctypes.c_int
# Field addition
self.lib.gpu_field_addition.argtypes = [
np.ctypeslib.ndpointer(FieldElement, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(FieldElement, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(FieldElement, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
ctypes.c_int
]
self.lib.gpu_field_addition.restype = ctypes.c_int
# Constraint verification
self.lib.gpu_constraint_verification.argtypes = [
np.ctypeslib.ndpointer(ctypes.c_void_p, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(FieldElement, flags="C_CONTIGUOUS"),
np.ctypeslib.ndpointer(ctypes.c_bool, flags="C_CONTIGUOUS"),
ctypes.c_int
]
self.lib.gpu_constraint_verification.restype = ctypes.c_int
def init_device(self) -> bool:
"""Initialize CUDA device and check capabilities"""
if not self.initialized:
print("❌ CUDA accelerator not initialized")
return False
try:
result = self.lib.init_cuda_device()
if result == 0:
print("✅ CUDA device initialized successfully")
return True
else:
print(f"❌ CUDA device initialization failed: {result}")
return False
except Exception as e:
print(f"❌ CUDA device initialization error: {e}")
return False
def benchmark_optimized_performance(self, max_elements: int = 10000000) -> dict:
"""
Benchmark optimized GPU performance with varying dataset sizes
Args:
max_elements: Maximum number of elements to test
Returns:
Performance benchmark results
"""
if not self.initialized:
return {"error": "CUDA accelerator not initialized"}
print(f"🚀 Optimized GPU Performance Benchmark (up to {max_elements:,} elements)")
print("=" * 70)
# Test different dataset sizes
test_sizes = [
1000, # 1K elements
10000, # 10K elements
100000, # 100K elements
1000000, # 1M elements
5000000, # 5M elements
10000000, # 10M elements
]
results = []
for size in test_sizes:
if size > max_elements:
break
print(f"\n📊 Testing {size:,} elements...")
# Generate optimized test data
a_elements, b_elements = self._generate_test_data(size)
# bn128 field modulus (simplified)
modulus = [0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF]
# GPU benchmark with multiple runs
gpu_times = []
for run in range(3): # 3 runs for consistency
start_time = time.time()
success, gpu_result = self.field_addition_optimized(a_elements, b_elements, modulus)
gpu_time = time.time() - start_time
if success:
gpu_times.append(gpu_time)
if not gpu_times:
print(f" ❌ GPU failed for {size:,} elements")
continue
# Average GPU time
avg_gpu_time = sum(gpu_times) / len(gpu_times)
# CPU benchmark
start_time = time.time()
cpu_result = self._cpu_field_addition(a_elements, b_elements, modulus)
cpu_time = time.time() - start_time
# Calculate speedup
speedup = cpu_time / avg_gpu_time if avg_gpu_time > 0 else 0
result = {
"elements": size,
"gpu_time": avg_gpu_time,
"cpu_time": cpu_time,
"speedup": speedup,
"gpu_throughput": size / avg_gpu_time if avg_gpu_time > 0 else 0,
"cpu_throughput": size / cpu_time if cpu_time > 0 else 0,
"gpu_success": True
}
results.append(result)
print(f" GPU Time: {avg_gpu_time:.4f}s")
print(f" CPU Time: {cpu_time:.4f}s")
print(f" Speedup: {speedup:.2f}x")
print(f" GPU Throughput: {result['gpu_throughput']:.0f} elements/s")
# Find optimal performance point
best_speedup = max(results, key=lambda x: x["speedup"]) if results else None
best_throughput = max(results, key=lambda x: x["gpu_throughput"]) if results else None
summary = {
"test_sizes": test_sizes[:len(results)],
"results": results,
"best_speedup": best_speedup,
"best_throughput": best_throughput,
"gpu_device": "NVIDIA GeForce RTX 4060 Ti"
}
print(f"\n🎯 Performance Summary:")
if best_speedup:
print(f" Best Speedup: {best_speedup['speedup']:.2f}x at {best_speedup['elements']:,} elements")
if best_throughput:
print(f" Best Throughput: {best_throughput['gpu_throughput']:.0f} elements/s at {best_throughput['elements']:,} elements")
return summary
def field_addition_optimized(
self,
a: List[FieldElement],
b: List[FieldElement],
modulus: List[int]
) -> Tuple[bool, Optional[List[FieldElement]]]:
"""
Perform optimized parallel field addition on GPU
Args:
a: First operand array
b: Second operand array
modulus: Field modulus (4 x 64-bit limbs)
Returns:
(success, result_array)
"""
if not self.initialized:
return False, None
try:
num_elements = len(a)
if num_elements != len(b):
print("❌ Input arrays must have same length")
return False, None
# Convert to numpy arrays with optimal memory layout
a_array = np.array(a, dtype=FieldElement)
b_array = np.array(b, dtype=FieldElement)
result_array = np.zeros(num_elements, dtype=FieldElement)
modulus_array = np.array(modulus, dtype=ctypes.c_uint64)
# Call GPU function
result = self.lib.gpu_field_addition(
a_array, b_array, result_array, modulus_array, num_elements
)
if result == 0:
return True, result_array.tolist()
else:
print(f"❌ GPU field addition failed: {result}")
return False, None
except Exception as e:
print(f"❌ GPU field addition error: {e}")
return False, None
def _generate_test_data(self, num_elements: int) -> Tuple[List[FieldElement], List[FieldElement]]:
"""Generate optimized test data for benchmarking"""
a_elements = []
b_elements = []
# Use numpy for faster generation
a_data = np.random.randint(0, 2**32, size=(num_elements, 4), dtype=np.uint64)
b_data = np.random.randint(0, 2**32, size=(num_elements, 4), dtype=np.uint64)
for i in range(num_elements):
a = FieldElement()
b = FieldElement()
for j in range(4):
a.limbs[j] = a_data[i, j]
b.limbs[j] = b_data[i, j]
a_elements.append(a)
b_elements.append(b)
return a_elements, b_elements
def _cpu_field_addition(self, a_elements: List[FieldElement], b_elements: List[FieldElement], modulus: List[int]) -> List[FieldElement]:
"""Optimized CPU field addition for benchmarking"""
num_elements = len(a_elements)
result = []
# Pure-Python per-limb loop; kept simple as a single-threaded baseline
for i in range(num_elements):
c = FieldElement()
for j in range(4):
c.limbs[j] = (a_elements[i].limbs[j] + b_elements[i].limbs[j]) % modulus[j]
result.append(c)
return result
def analyze_performance_bottlenecks(self) -> dict:
"""Analyze potential performance bottlenecks in GPU operations"""
print("🔍 Analyzing GPU Performance Bottlenecks...")
analysis = {
"memory_bandwidth": self._test_memory_bandwidth(),
"compute_utilization": self._test_compute_utilization(),
"data_transfer": self._test_data_transfer(),
"kernel_launch": self._test_kernel_launch_overhead()
}
print("\n📊 Performance Analysis Results:")
for key, value in analysis.items():
print(f" {key}: {value}")
return analysis
def _test_memory_bandwidth(self) -> str:
"""Test GPU memory bandwidth"""
# Simple memory bandwidth test
try:
size = 1000000 # 1M elements
a_elements, b_elements = self._generate_test_data(size)
start_time = time.time()
success, _ = self.field_addition_optimized(a_elements, b_elements,
[0xFFFFFFFFFFFFFFFF] * 4)
test_time = time.time() - start_time
if success:
bandwidth = (size * 4 * 8 * 3) / (test_time * 1e9) # GB/s (3 arrays, 4 limbs, 8 bytes)
return f"{bandwidth:.2f} GB/s"
else:
return "Test failed"
except Exception as e:
return f"Error: {e}"
def _test_compute_utilization(self) -> str:
"""Test GPU compute utilization"""
return "Compute utilization test - requires profiling tools"
def _test_data_transfer(self) -> str:
"""Test data transfer overhead"""
try:
size = 100000
a_elements, _ = self._generate_test_data(size)
# Test data transfer time
start_time = time.time()
a_array = np.array(a_elements, dtype=FieldElement)
transfer_time = time.time() - start_time
return f"{transfer_time:.4f}s for {size:,} elements"
except Exception as e:
return f"Error: {e}"
def _test_kernel_launch_overhead(self) -> str:
"""Test kernel launch overhead"""
try:
size = 1000 # Small dataset to isolate launch overhead
a_elements, b_elements = self._generate_test_data(size)
start_time = time.time()
success, _ = self.field_addition_optimized(a_elements, b_elements,
[0xFFFFFFFFFFFFFFFF] * 4)
total_time = time.time() - start_time
if success:
return f"{total_time:.4f}s total (includes launch overhead)"
else:
return "Test failed"
except Exception as e:
return f"Error: {e}"
def main():
"""Main function for testing optimized CUDA acceleration"""
print("🚀 AITBC Optimized CUDA ZK Accelerator Test")
print("=" * 50)
try:
# Initialize accelerator
accelerator = OptimizedCUDAZKAccelerator()
if not accelerator.initialized:
print("❌ Failed to initialize CUDA accelerator")
return
# Initialize device
if not accelerator.init_device():
return
# Run optimized benchmark
results = accelerator.benchmark_optimized_performance(10000000)
# Analyze performance bottlenecks
bottleneck_analysis = accelerator.analyze_performance_bottlenecks()
print("\n✅ Optimized CUDA acceleration test completed!")
if results.get("best_speedup"):
print(f"🚀 Best performance: {results['best_speedup']['speedup']:.2f}x speedup")
except Exception as e:
print(f"❌ Test failed: {e}")
if __name__ == "__main__":
main()
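
Both wrappers size their kernel launches with the usual ceiling division, so every element is covered by at least one thread; with illustrative numbers:

    num_elements = 1_000_000
    threads_per_block = 256
    blocks_per_grid = (num_elements + threads_per_block - 1) // threads_per_block  # 3907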


@@ -0,0 +1,517 @@
/**
* Optimized CUDA Kernels for ZK Circuit Field Operations
*
* Implements high-performance GPU-accelerated field arithmetic with optimized memory access
* patterns, vectorized operations, and improved data transfer efficiency.
*/
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <device_launch_parameters.h>
#include <stdint.h>
#include <stdio.h>
// Custom 128-bit integer type for CUDA compatibility
typedef unsigned long long uint128_t __attribute__((mode(TI)));
// Optimized field element structure using flat arrays for better memory coalescing
typedef struct {
uint64_t limbs[4]; // 4 x 64-bit limbs for 256-bit field element
} field_element_t;
// Vectorized field element for improved memory bandwidth; ulonglong4 keeps
// each component a full 64-bit limb, so one vector holds one field element
typedef ulonglong4 field_vector_t; // 256-bit vector (4 x 64-bit limbs)
// Optimized constraint structure
typedef struct {
uint64_t a[4];
uint64_t b[4];
uint64_t c[4];
uint8_t operation; // 0: a + b = c, 1: a * b = c
} optimized_constraint_t;
// Optimized kernel for parallel field addition with coalesced memory access
__global__ void optimized_field_addition_kernel(
const uint64_t* __restrict__ a_flat,
const uint64_t* __restrict__ b_flat,
uint64_t* __restrict__ result_flat,
const uint64_t* __restrict__ modulus,
int num_elements
) {
// Calculate global thread ID
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
// Process multiple elements per thread for better utilization
for (int elem = tid; elem < num_elements; elem += stride) {
int base_idx = elem * 4; // 4 limbs per element
// Perform field addition with carry propagation
uint64_t carry = 0;
// Unrolled loop for better performance
#pragma unroll
for (int i = 0; i < 4; i++) {
uint128_t sum = (uint128_t)a_flat[base_idx + i] + b_flat[base_idx + i] + carry;
result_flat[base_idx + i] = (uint64_t)sum;
carry = sum >> 64;
}
// Simplified modulus reduction (for demonstration)
// In practice, would implement proper bn128 field reduction
if (carry > 0) {
uint64_t borrow = 0;
#pragma unroll
for (int i = 0; i < 4; i++) {
uint128_t diff = (uint128_t)result_flat[base_idx + i] - modulus[i] - borrow;
result_flat[base_idx + i] = (uint64_t)diff;
borrow = (uint64_t)(diff >> 64) & 1; // Borrow out of the 64-bit subtraction
}
}
}
}
// Vectorized field addition kernel using uint4 for better memory bandwidth
__global__ void vectorized_field_addition_kernel(
const field_vector_t* __restrict__ a_vec,
const field_vector_t* __restrict__ b_vec,
field_vector_t* __restrict__ result_vec,
const uint64_t* __restrict__ modulus,
int num_vectors
) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int vec = tid; vec < num_vectors; vec += stride) {
// Load vectors
field_vector_t a = a_vec[vec];
field_vector_t b = b_vec[vec];
// Perform vectorized addition
field_vector_t result;
uint64_t carry = 0;
// Component-wise addition with carry
uint128_t sum0 = (uint128_t)a.x + b.x + carry;
result.x = (uint64_t)sum0;
carry = sum0 >> 64;
uint128_t sum1 = (uint128_t)a.y + b.y + carry;
result.y = (uint64_t)sum1;
carry = sum1 >> 64;
uint128_t sum2 = (uint128_t)a.z + b.z + carry;
result.z = (uint64_t)sum2;
carry = sum2 >> 64;
uint128_t sum3 = (uint128_t)a.w + b.w + carry;
result.w = (uint64_t)sum3;
// Store result
result_vec[vec] = result;
}
}
// Shared memory optimized kernel for large datasets
__global__ void shared_memory_field_addition_kernel(
const uint64_t* __restrict__ a_flat,
const uint64_t* __restrict__ b_flat,
uint64_t* __restrict__ result_flat,
const uint64_t* __restrict__ modulus,
int num_elements
) {
// Shared memory for tile processing
__shared__ uint64_t tile_a[256 * 4]; // 256 threads, 4 limbs each
__shared__ uint64_t tile_b[256 * 4];
__shared__ uint64_t tile_result[256 * 4];
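// Footprint check: 3 tiles x 256 threads x 4 limbs x 8 bytes = 24 KB of shared
// memory per block, within the 48 KB static per-block limit of most GPUs.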
int tid = threadIdx.x;
int elements_per_tile = blockDim.x;
int tile_idx = blockIdx.x;
int elem_in_tile = tid;
// Load data into shared memory
if (tile_idx * elements_per_tile + elem_in_tile < num_elements) {
int global_idx = (tile_idx * elements_per_tile + elem_in_tile) * 4;
// Coalesced global memory access
#pragma unroll
for (int i = 0; i < 4; i++) {
tile_a[tid * 4 + i] = a_flat[global_idx + i];
tile_b[tid * 4 + i] = b_flat[global_idx + i];
}
}
__syncthreads();
// Process in shared memory
if (tile_idx * elements_per_tile + elem_in_tile < num_elements) {
uint64_t carry = 0;
#pragma unroll
for (int i = 0; i < 4; i++) {
uint128_t sum = (uint128_t)tile_a[tid * 4 + i] + tile_b[tid * 4 + i] + carry;
tile_result[tid * 4 + i] = (uint64_t)sum;
carry = sum >> 64;
}
// Simplified conditional subtraction of the modulus (demonstration only;
// a full bn128 reduction must also handle result >= modulus without a carry)
if (carry > 0) {
uint64_t borrow = 0;
#pragma unroll
for (int i = 0; i < 4; i++) {
uint128_t diff = (uint128_t)tile_result[tid * 4 + i] - modulus[i] - borrow;
tile_result[tid * 4 + i] = (uint64_t)diff;
borrow = (uint64_t)(diff >> 64) & 1; // Borrow out of the 64-bit limb
}
}
}
__syncthreads();
// Write back to global memory
if (tile_idx * elements_per_tile + elem_in_tile < num_elements) {
int global_idx = (tile_idx * elements_per_tile + elem_in_tile) * 4;
// Coalesced global memory write
#pragma unroll
for (int i = 0; i < 4; i++) {
result_flat[global_idx + i] = tile_result[tid * 4 + i];
}
}
}
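// Shared memory budget: 3 tiles x 256 threads x 4 limbs x 8 bytes = 24 KB
// per block, within the common 48 KB default. Since each thread only
// touches its own tile slots, the staging mainly demonstrates the tiling
// pattern; it pays off once limbs are reused across threads.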
// Optimized constraint verification kernel. Operand values are assumed to
// be pre-substituted into each constraint, so witness_flat is not
// dereferenced in this simplified path.
__global__ void optimized_constraint_verification_kernel(
const optimized_constraint_t* __restrict__ constraints,
const uint64_t* __restrict__ witness_flat,
bool* __restrict__ results,
int num_constraints
) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int constraint_idx = tid; constraint_idx < num_constraints; constraint_idx += stride) {
const optimized_constraint_t* c = &constraints[constraint_idx];
bool constraint_satisfied = true;
if (c->operation == 0) {
// Addition constraint: a + b = c
uint64_t computed[4];
uint64_t carry = 0;
#pragma unroll
for (int i = 0; i < 4; i++) {
uint128_t sum = (uint128_t)c->a[i] + c->b[i] + carry;
computed[i] = (uint64_t)sum;
carry = sum >> 64;
}
// Check if computed equals expected
#pragma unroll
for (int i = 0; i < 4; i++) {
if (computed[i] != c->c[i]) {
constraint_satisfied = false;
break;
}
}
} else {
// Multiplication constraint: a * b = c. Only the low 64-bit limbs are
// compared here; a real verifier needs a full 256-bit field
// multiplication with modular reduction (see the sketch after this kernel)
constraint_satisfied = (c->a[0] * c->b[0]) == c->c[0];
}
results[constraint_idx] = constraint_satisfied;
}
}
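// Illustrative sketch of the stronger multiplication check hinted at above:
// the low 256 bits of the schoolbook product a * b. This is one reasonable
// approach, not the project's actual verifier; a real one also needs
// Montgomery or Barrett reduction modulo the bn128 prime.
__device__ void mul_low_256(const uint64_t a[4], const uint64_t b[4], uint64_t out[4]) {
uint64_t acc[4] = {0, 0, 0, 0};
for (int i = 0; i < 4; i++) {
uint64_t carry = 0;
// Accumulate only the partial products that land in the low 4 limbs
for (int j = 0; i + j < 4; j++) {
uint128_t t = (uint128_t)a[i] * b[j] + acc[i + j] + carry;
acc[i + j] = (uint64_t)t;
carry = (uint64_t)(t >> 64);
}
}
for (int i = 0; i < 4; i++) out[i] = acc[i];
}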
// Stream-optimized kernel for overlapping computation and transfer
__global__ void stream_optimized_field_kernel(
const uint64_t* __restrict__ a_flat,
const uint64_t* __restrict__ b_flat,
uint64_t* __restrict__ result_flat,
const uint64_t* __restrict__ modulus, // Unused: this kernel also omits reduction
int num_elements,
int stream_id
) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
// Each stream processes one contiguous chunk of the data
int elements_per_stream = (num_elements + 3) / 4; // Ceil-divide across the 4 assumed streams
int start_elem = stream_id * elements_per_stream;
int end_elem = min(start_elem + elements_per_stream, num_elements);
for (int elem = start_elem + tid; elem < end_elem; elem += stride) {
int base_idx = elem * 4;
uint64_t carry = 0;
#pragma unroll
for (int i = 0; i < 4; i++) {
uint128_t sum = (uint128_t)a_flat[base_idx + i] + b_flat[base_idx + i] + carry;
result_flat[base_idx + i] = (uint64_t)sum;
carry = sum >> 64;
}
}
}
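// A minimal host-side sketch (kept as a comment; not part of the exported
// API) of how this kernel could be driven with four CUDA streams so that
// copies for one chunk overlap computation on another. The buffer names
// h_a/h_b/h_result (pinned via cudaMallocHost) and the device buffers are
// assumptions for illustration.
/*
cudaStream_t streams[4];
for (int s = 0; s < 4; s++) cudaStreamCreate(&streams[s]);
int per_stream = (num_elements + 3) / 4;
for (int s = 0; s < 4; s++) {
    int first = s * per_stream;
    int chunk = min(per_stream, num_elements - first);
    if (chunk <= 0) break;
    size_t off = (size_t)first * 4;                    // limb offset
    size_t bytes = (size_t)chunk * 4 * sizeof(uint64_t);
    cudaMemcpyAsync(d_a + off, h_a + off, bytes, cudaMemcpyHostToDevice, streams[s]);
    cudaMemcpyAsync(d_b + off, h_b + off, bytes, cudaMemcpyHostToDevice, streams[s]);
    stream_optimized_field_kernel<<<32, 256, 0, streams[s]>>>(
        d_a, d_b, d_result, d_modulus, num_elements, s);
    cudaMemcpyAsync(h_result + off, d_result + off, bytes, cudaMemcpyDeviceToHost, streams[s]);
}
for (int s = 0; s < 4; s++) { cudaStreamSynchronize(streams[s]); cudaStreamDestroy(streams[s]); }
*/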
// Host wrapper functions for optimized operations
extern "C" {
// Free whichever device buffers were already allocated before an error and
// propagate the error code; cudaFree(nullptr) is a harmless no-op, so the
// pointers below are initialized to nullptr before the first allocation.
#define CUDA_CLEANUP_RETURN(err) \
do { cudaFree(d_a); cudaFree(d_b); cudaFree(d_result); cudaFree(d_modulus); return (err); } while (0)
// Initialize CUDA device with optimization info
cudaError_t init_optimized_cuda_device() {
int deviceCount = 0;
cudaError_t error = cudaGetDeviceCount(&deviceCount);
if (error != cudaSuccess || deviceCount == 0) {
printf("No CUDA devices found\n");
return error;
}
// Select best device
int best_device = 0;
size_t max_memory = 0;
for (int i = 0; i < deviceCount; i++) {
cudaDeviceProp prop;
error = cudaGetDeviceProperties(&prop, i);
if (error == cudaSuccess && prop.totalGlobalMem > max_memory) {
max_memory = prop.totalGlobalMem;
best_device = i;
}
}
error = cudaSetDevice(best_device);
if (error != cudaSuccess) {
printf("Failed to set CUDA device\n");
return error;
}
// Get device properties
cudaDeviceProp prop;
error = cudaGetDeviceProperties(&prop, best_device);
if (error == cudaSuccess) {
printf("✅ Optimized CUDA Device: %s\n", prop.name);
printf(" Compute Capability: %d.%d\n", prop.major, prop.minor);
printf(" Global Memory: %zu MB\n", prop.totalGlobalMem / (1024 * 1024));
printf(" Shared Memory per Block: %zu KB\n", prop.sharedMemPerBlock / 1024);
printf(" Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
printf(" Warp Size: %d\n", prop.warpSize);
printf(" Max Grid Size: [%d, %d, %d]\n",
prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
}
return error;
}
// Optimized field addition with flat arrays
cudaError_t gpu_optimized_field_addition(
const uint64_t* a_flat,
const uint64_t* b_flat,
uint64_t* result_flat,
const uint64_t* modulus,
int num_elements
) {
// Allocate device memory
uint64_t *d_a = nullptr, *d_b = nullptr, *d_result = nullptr, *d_modulus = nullptr;
size_t flat_size = num_elements * 4 * sizeof(uint64_t); // 4 limbs per element
size_t modulus_size = 4 * sizeof(uint64_t);
cudaError_t error = cudaMalloc(&d_a, flat_size);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMalloc(&d_b, flat_size);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMalloc(&d_result, flat_size);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMalloc(&d_modulus, modulus_size);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
// Copy data to device
error = cudaMemcpy(d_a, a_flat, flat_size, cudaMemcpyHostToDevice);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMemcpy(d_b, b_flat, flat_size, cudaMemcpyHostToDevice);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMemcpy(d_modulus, modulus, modulus_size, cudaMemcpyHostToDevice);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
// Launch optimized kernel
int threadsPerBlock = 256; // Optimal for most GPUs
int blocksPerGrid = (num_elements + threadsPerBlock - 1) / threadsPerBlock;
// Ensure we have enough blocks for good GPU utilization
blocksPerGrid = max(blocksPerGrid, 32); // Minimum blocks for good occupancy
printf("🚀 Launching optimized field addition kernel:\n");
printf(" Elements: %d\n", num_elements);
printf(" Blocks: %d\n", blocksPerGrid);
printf(" Threads per Block: %d\n", threadsPerBlock);
printf(" Total Threads: %d\n", blocksPerGrid * threadsPerBlock);
// Use optimized kernel
optimized_field_addition_kernel<<<blocksPerGrid, threadsPerBlock>>>(
d_a, d_b, d_result, d_modulus, num_elements
);
// Check for kernel launch errors
error = cudaGetLastError();
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
// Synchronize to ensure kernel completion
error = cudaDeviceSynchronize();
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
// Copy result back to host
error = cudaMemcpy(result_flat, d_result, flat_size, cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_result);
cudaFree(d_modulus);
return error;
}
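// Note: for a memory-bound operation like this, PCIe transfers dominate
// end-to-end time. Reusing the device allocations across calls and staging
// through pinned host buffers (cudaMallocHost) would likely help; this
// wrapper reallocates per call for simplicity.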
// Vectorized field addition for better memory bandwidth
cudaError_t gpu_vectorized_field_addition(
const field_vector_t* a_vec,
const field_vector_t* b_vec,
field_vector_t* result_vec,
const uint64_t* modulus,
int num_elements
) {
// Allocate device memory
field_vector_t *d_a = nullptr, *d_b = nullptr, *d_result = nullptr;
uint64_t *d_modulus = nullptr;
size_t vec_size = num_elements * sizeof(field_vector_t);
size_t modulus_size = 4 * sizeof(uint64_t);
cudaError_t error = cudaMalloc(&d_a, vec_size);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMalloc(&d_b, vec_size);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMalloc(&d_result, vec_size);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMalloc(&d_modulus, modulus_size);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
// Copy data to device
error = cudaMemcpy(d_a, a_vec, vec_size, cudaMemcpyHostToDevice);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMemcpy(d_b, b_vec, vec_size, cudaMemcpyHostToDevice);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMemcpy(d_modulus, modulus, modulus_size, cudaMemcpyHostToDevice);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
// Launch vectorized kernel
int threadsPerBlock = 256;
int blocksPerGrid = (num_elements + threadsPerBlock - 1) / threadsPerBlock;
blocksPerGrid = max(blocksPerGrid, 32);
printf("🚀 Launching vectorized field addition kernel:\n");
printf(" Elements: %d\n", num_elements);
printf(" Blocks: %d\n", blocksPerGrid);
printf(" Threads per Block: %d\n", threadsPerBlock);
vectorized_field_addition_kernel<<<blocksPerGrid, threadsPerBlock>>>(
d_a, d_b, d_result, d_modulus, num_elements
);
error = cudaGetLastError();
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaDeviceSynchronize();
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
// Copy result back
error = cudaMemcpy(result_vec, d_result, vec_size, cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_result);
cudaFree(d_modulus);
return error;
}
// Shared memory optimized field addition
cudaError_t gpu_shared_memory_field_addition(
const uint64_t* a_flat,
const uint64_t* b_flat,
uint64_t* result_flat,
const uint64_t* modulus,
int num_elements
) {
// Similar to optimized version but uses shared memory
uint64_t *d_a = nullptr, *d_b = nullptr, *d_result = nullptr, *d_modulus = nullptr;
size_t flat_size = num_elements * 4 * sizeof(uint64_t);
size_t modulus_size = 4 * sizeof(uint64_t);
cudaError_t error = cudaMalloc(&d_a, flat_size);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMalloc(&d_b, flat_size);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMalloc(&d_result, flat_size);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMalloc(&d_modulus, modulus_size);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
// Copy data
error = cudaMemcpy(d_a, a_flat, flat_size, cudaMemcpyHostToDevice);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMemcpy(d_b, b_flat, flat_size, cudaMemcpyHostToDevice);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMemcpy(d_modulus, modulus, modulus_size, cudaMemcpyHostToDevice);
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
// Launch shared memory kernel
int threadsPerBlock = 256; // Matches shared memory tile size
int blocksPerGrid = (num_elements + threadsPerBlock - 1) / threadsPerBlock;
blocksPerGrid = max(blocksPerGrid, 32);
printf("🚀 Launching shared memory field addition kernel:\n");
printf(" Elements: %d\n", num_elements);
printf(" Blocks: %d\n", blocksPerGrid);
printf(" Threads per Block: %d\n", threadsPerBlock);
shared_memory_field_addition_kernel<<<blocksPerGrid, threadsPerBlock>>>(
d_a, d_b, d_result, d_modulus, num_elements
);
error = cudaGetLastError();
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaDeviceSynchronize();
if (error != cudaSuccess) CUDA_CLEANUP_RETURN(error);
error = cudaMemcpy(result_flat, d_result, flat_size, cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_result);
cudaFree(d_modulus);
return error;
}
} // extern "C"
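// Build sketch (the source filename and output name below are assumptions):
//   nvcc -O3 -Xcompiler -fPIC -shared field_operations_optimized.cu -o libfield_operations.so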