/**
 * Optimized CUDA Kernels for ZK Circuit Field Operations
 *
 * Implements high-performance GPU-accelerated field arithmetic with optimized memory access
 * patterns, vectorized operations, and improved data transfer efficiency.
 */

#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <device_launch_parameters.h>
#include <stdint.h>
#include <stdio.h>

// Custom 128-bit integer type for carry propagation
// (unsigned __int128 needs a 64-bit host compiler and, for device code, CUDA 11.5 or newer)
typedef unsigned __int128 uint128_t;

// Optimized field element structure using flat arrays for better memory coalescing
typedef struct {
    uint64_t limbs[4]; // 4 x 64-bit limbs for a 256-bit field element
} field_element_t;

// Vectorized field element for improved memory bandwidth
// (ulonglong4 rather than uint4: the vectorized kernel below treats each component as a 64-bit limb)
typedef ulonglong4 field_vector_t; // 256-bit vector (4 x 64-bit limbs)

// Optimized constraint structure
typedef struct {
    uint64_t a[4];
    uint64_t b[4];
    uint64_t c[4];
    uint8_t operation; // 0: a + b = c, 1: a * b = c
} optimized_constraint_t;

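// Memory layout note (added for clarity): the *_flat buffers store element i's four
// limbs contiguously at indices [4*i, 4*i + 3], with limb 0 the least significant,
// so consecutive threads processing consecutive elements make coalesced 64-bit accesses:
//
//   a_flat: | e0.limb0 e0.limb1 e0.limb2 e0.limb3 | e1.limb0 e1.limb1 e1.limb2 e1.limb3 | ...
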
// Optimized kernel for parallel field addition with coalesced memory access
__global__ void optimized_field_addition_kernel(
    const uint64_t* __restrict__ a_flat,
    const uint64_t* __restrict__ b_flat,
    uint64_t* __restrict__ result_flat,
    const uint64_t* __restrict__ modulus,
    int num_elements
) {
    // Calculate global thread ID
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Process multiple elements per thread for better utilization
    for (int elem = tid; elem < num_elements; elem += stride) {
        int base_idx = elem * 4; // 4 limbs per element

        // Perform field addition with carry propagation
        uint64_t carry = 0;

        // Unrolled loop for better performance
        #pragma unroll
        for (int i = 0; i < 4; i++) {
            uint128_t sum = (uint128_t)a_flat[base_idx + i] + b_flat[base_idx + i] + carry;
            result_flat[base_idx + i] = (uint64_t)sum;
            carry = sum >> 64;
        }

        // Simplified modulus reduction (for demonstration)
        // In practice, would implement proper bn128 field reduction
        if (carry > 0) {
            #pragma unroll
            for (int i = 0; i < 4; i++) {
                uint128_t diff = (uint128_t)result_flat[base_idx + i] - modulus[i] - carry;
                result_flat[base_idx + i] = (uint64_t)diff;
                carry = diff >> 63; // Borrow
            }
        }
    }
}

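// Illustrative helper (added as a sketch; not called by the kernels in this file): a single
// full-width conditional-subtraction step with exact borrow propagation, the building block a
// real bn128 reduction would use in place of the simplified branch above.
__device__ __forceinline__ uint64_t field_sub_modulus_once(uint64_t limbs[4], const uint64_t* modulus) {
    uint64_t borrow = 0;
    #pragma unroll
    for (int i = 0; i < 4; i++) {
        uint128_t diff = (uint128_t)limbs[i] - modulus[i] - borrow;
        limbs[i] = (uint64_t)diff;
        borrow = (uint64_t)(diff >> 64) & 1; // upper half is nonzero only when the subtraction wrapped
    }
    return borrow; // 1 means the value was smaller than the modulus, so the caller should undo or skip the subtraction
}
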
// Vectorized field addition kernel using ulonglong4 for better memory bandwidth
__global__ void vectorized_field_addition_kernel(
    const field_vector_t* __restrict__ a_vec,
    const field_vector_t* __restrict__ b_vec,
    field_vector_t* __restrict__ result_vec,
    const uint64_t* __restrict__ modulus,
    int num_vectors
) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int vec = tid; vec < num_vectors; vec += stride) {
        // Load vectors
        field_vector_t a = a_vec[vec];
        field_vector_t b = b_vec[vec];

        // Perform vectorized addition
        field_vector_t result;
        uint64_t carry = 0;

        // Component-wise addition with carry propagation across the four 64-bit limbs
        uint128_t sum0 = (uint128_t)a.x + b.x + carry;
        result.x = (uint64_t)sum0;
        carry = sum0 >> 64;

        uint128_t sum1 = (uint128_t)a.y + b.y + carry;
        result.y = (uint64_t)sum1;
        carry = sum1 >> 64;

        uint128_t sum2 = (uint128_t)a.z + b.z + carry;
        result.z = (uint64_t)sum2;
        carry = sum2 >> 64;

        uint128_t sum3 = (uint128_t)a.w + b.w + carry;
        result.w = (uint64_t)sum3;

        // Store result (modulus reduction is omitted in this simplified vectorized path)
        result_vec[vec] = result;
    }
}

// Shared memory optimized kernel for large datasets
__global__ void shared_memory_field_addition_kernel(
    const uint64_t* __restrict__ a_flat,
    const uint64_t* __restrict__ b_flat,
    uint64_t* __restrict__ result_flat,
    const uint64_t* __restrict__ modulus,
    int num_elements
) {
    // Shared memory for tile processing
    __shared__ uint64_t tile_a[256 * 4]; // 256 threads, 4 limbs each
    __shared__ uint64_t tile_b[256 * 4];
    __shared__ uint64_t tile_result[256 * 4];

    int tid = threadIdx.x;
    int elements_per_tile = blockDim.x;
    int tile_idx = blockIdx.x;
    int elem_in_tile = tid;

    // Load data into shared memory
    if (tile_idx * elements_per_tile + elem_in_tile < num_elements) {
        int global_idx = (tile_idx * elements_per_tile + elem_in_tile) * 4;

        // Coalesced global memory access
        #pragma unroll
        for (int i = 0; i < 4; i++) {
            tile_a[tid * 4 + i] = a_flat[global_idx + i];
            tile_b[tid * 4 + i] = b_flat[global_idx + i];
        }
    }

    __syncthreads();

    // Process in shared memory
    if (tile_idx * elements_per_tile + elem_in_tile < num_elements) {
        uint64_t carry = 0;

        #pragma unroll
        for (int i = 0; i < 4; i++) {
            uint128_t sum = (uint128_t)tile_a[tid * 4 + i] + tile_b[tid * 4 + i] + carry;
            tile_result[tid * 4 + i] = (uint64_t)sum;
            carry = sum >> 64;
        }

        // Simplified modulus reduction
        if (carry > 0) {
            #pragma unroll
            for (int i = 0; i < 4; i++) {
                uint128_t diff = (uint128_t)tile_result[tid * 4 + i] - modulus[i] - carry;
                tile_result[tid * 4 + i] = (uint64_t)diff;
                carry = diff >> 63; // Borrow
            }
        }
    }

    __syncthreads();

    // Write back to global memory
    if (tile_idx * elements_per_tile + elem_in_tile < num_elements) {
        int global_idx = (tile_idx * elements_per_tile + elem_in_tile) * 4;

        // Coalesced global memory write
        #pragma unroll
        for (int i = 0; i < 4; i++) {
            result_flat[global_idx + i] = tile_result[tid * 4 + i];
        }
    }
}

// Optimized constraint verification kernel
// (witness_flat is accepted for API completeness but unused by this simplified version,
//  which checks the constant values stored in each constraint directly)
__global__ void optimized_constraint_verification_kernel(
    const optimized_constraint_t* __restrict__ constraints,
    const uint64_t* __restrict__ witness_flat,
    bool* __restrict__ results,
    int num_constraints
) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int constraint_idx = tid; constraint_idx < num_constraints; constraint_idx += stride) {
        const optimized_constraint_t* c = &constraints[constraint_idx];

        bool constraint_satisfied = true;

        if (c->operation == 0) {
            // Addition constraint: a + b = c
            uint64_t computed[4];
            uint64_t carry = 0;

            #pragma unroll
            for (int i = 0; i < 4; i++) {
                uint128_t sum = (uint128_t)c->a[i] + c->b[i] + carry;
                computed[i] = (uint64_t)sum;
                carry = sum >> 64;
            }

            // Check if computed equals expected
            #pragma unroll
            for (int i = 0; i < 4; i++) {
                if (computed[i] != c->c[i]) {
                    constraint_satisfied = false;
                    break;
                }
            }
        } else {
            // Multiplication constraint: a * b = c (simplified)
            // In practice, this would be a full 256-bit field multiplication
            constraint_satisfied = (c->a[0] * c->b[0]) == c->c[0]; // Simplified check on the low limbs only
        }

        results[constraint_idx] = constraint_satisfied;
    }
}

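// Illustrative host wrapper (added as a sketch; no such wrapper exists in the original file)
// showing how the constraint kernel above could be driven from the host. The function name is
// an assumption, and error handling is abbreviated in the same style as the wrappers below.
static cudaError_t example_gpu_verify_constraints(
    const optimized_constraint_t* constraints,
    bool* results,
    int num_constraints
) {
    optimized_constraint_t* d_constraints = NULL;
    bool* d_results = NULL;

    cudaError_t error = cudaMalloc(&d_constraints, num_constraints * sizeof(optimized_constraint_t));
    if (error != cudaSuccess) return error;
    error = cudaMalloc(&d_results, num_constraints * sizeof(bool));
    if (error != cudaSuccess) return error;

    error = cudaMemcpy(d_constraints, constraints,
                       num_constraints * sizeof(optimized_constraint_t), cudaMemcpyHostToDevice);
    if (error != cudaSuccess) return error;

    int threadsPerBlock = 256;
    int blocksPerGrid = (num_constraints + threadsPerBlock - 1) / threadsPerBlock;

    // The kernel ignores its witness argument in this simplified version, so NULL is passed here
    optimized_constraint_verification_kernel<<<blocksPerGrid, threadsPerBlock>>>(
        d_constraints, NULL, d_results, num_constraints
    );
    error = cudaGetLastError();
    if (error == cudaSuccess) error = cudaDeviceSynchronize();
    if (error == cudaSuccess) {
        error = cudaMemcpy(results, d_results, num_constraints * sizeof(bool), cudaMemcpyDeviceToHost);
    }

    cudaFree(d_constraints);
    cudaFree(d_results);
    return error;
}
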
// Stream-optimized kernel for overlapping computation and transfer
__global__ void stream_optimized_field_kernel(
    const uint64_t* __restrict__ a_flat,
    const uint64_t* __restrict__ b_flat,
    uint64_t* __restrict__ result_flat,
    const uint64_t* __restrict__ modulus,
    int num_elements,
    int stream_id
) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Each stream processes a chunk of the data
    int elements_per_stream = (num_elements + 3) / 4; // 4 streams
    int start_elem = stream_id * elements_per_stream;
    int end_elem = min(start_elem + elements_per_stream, num_elements);

    for (int elem = start_elem + tid; elem < end_elem; elem += stride) {
        int base_idx = elem * 4;

        uint64_t carry = 0;

        #pragma unroll
        for (int i = 0; i < 4; i++) {
            uint128_t sum = (uint128_t)a_flat[base_idx + i] + b_flat[base_idx + i] + carry;
            result_flat[base_idx + i] = (uint64_t)sum;
            carry = sum >> 64;
        }
    }
}

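// Illustrative host-side sketch (added; no such wrapper exists in this file): one way to drive
// stream_optimized_field_kernel so that host-to-device copies, kernel execution, and
// device-to-host copies overlap across four CUDA streams. The function name is an assumption,
// the host buffers are assumed to be pinned (cudaMallocHost), and error handling is abbreviated.
static cudaError_t example_stream_field_addition(
    const uint64_t* h_a_pinned,    // num_elements * 4 limbs, pinned
    const uint64_t* h_b_pinned,    // num_elements * 4 limbs, pinned
    uint64_t* h_result_pinned,     // num_elements * 4 limbs, pinned
    const uint64_t* h_modulus,     // 4 limbs
    int num_elements
) {
    const int kNumStreams = 4; // matches the chunking hard-coded in the kernel
    size_t flat_size = (size_t)num_elements * 4 * sizeof(uint64_t);

    uint64_t *d_a = NULL, *d_b = NULL, *d_result = NULL, *d_modulus = NULL;
    cudaError_t error = cudaMalloc(&d_a, flat_size);
    if (error == cudaSuccess) error = cudaMalloc(&d_b, flat_size);
    if (error == cudaSuccess) error = cudaMalloc(&d_result, flat_size);
    if (error == cudaSuccess) error = cudaMalloc(&d_modulus, 4 * sizeof(uint64_t));
    if (error == cudaSuccess)
        error = cudaMemcpy(d_modulus, h_modulus, 4 * sizeof(uint64_t), cudaMemcpyHostToDevice);

    cudaStream_t streams[kNumStreams] = { 0 };
    for (int s = 0; s < kNumStreams && error == cudaSuccess; s++) {
        error = cudaStreamCreate(&streams[s]);
    }

    int elements_per_stream = (num_elements + kNumStreams - 1) / kNumStreams;

    for (int s = 0; s < kNumStreams && error == cudaSuccess; s++) {
        int start_elem = s * elements_per_stream;
        if (start_elem >= num_elements) break;
        int chunk_elems = num_elements - start_elem;
        if (chunk_elems > elements_per_stream) chunk_elems = elements_per_stream;

        size_t offset = (size_t)start_elem * 4;
        size_t bytes = (size_t)chunk_elems * 4 * sizeof(uint64_t);

        // Async copies on one stream can overlap with kernels launched on other streams
        error = cudaMemcpyAsync(d_a + offset, h_a_pinned + offset, bytes, cudaMemcpyHostToDevice, streams[s]);
        if (error == cudaSuccess)
            error = cudaMemcpyAsync(d_b + offset, h_b_pinned + offset, bytes, cudaMemcpyHostToDevice, streams[s]);

        if (error == cudaSuccess) {
            int threadsPerBlock = 256;
            int blocksPerGrid = (chunk_elems + threadsPerBlock - 1) / threadsPerBlock;
            stream_optimized_field_kernel<<<blocksPerGrid, threadsPerBlock, 0, streams[s]>>>(
                d_a, d_b, d_result, d_modulus, num_elements, s
            );
            error = cudaGetLastError();
        }

        if (error == cudaSuccess)
            error = cudaMemcpyAsync(h_result_pinned + offset, d_result + offset, bytes, cudaMemcpyDeviceToHost, streams[s]);
    }

    for (int s = 0; s < kNumStreams; s++) {
        if (streams[s]) {
            cudaStreamSynchronize(streams[s]);
            cudaStreamDestroy(streams[s]);
        }
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_result);
    cudaFree(d_modulus);
    return error;
}
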
// Host wrapper functions for optimized operations
extern "C" {

// Initialize CUDA device with optimization info
cudaError_t init_optimized_cuda_device() {
    int deviceCount = 0;
    cudaError_t error = cudaGetDeviceCount(&deviceCount);

    if (error != cudaSuccess || deviceCount == 0) {
        printf("No CUDA devices found\n");
        return error;
    }

    // Select best device
    int best_device = 0;
    size_t max_memory = 0;

    for (int i = 0; i < deviceCount; i++) {
        cudaDeviceProp prop;
        error = cudaGetDeviceProperties(&prop, i);
        if (error == cudaSuccess && prop.totalGlobalMem > max_memory) {
            max_memory = prop.totalGlobalMem;
            best_device = i;
        }
    }

    error = cudaSetDevice(best_device);
    if (error != cudaSuccess) {
        printf("Failed to set CUDA device\n");
        return error;
    }

    // Get device properties
    cudaDeviceProp prop;
    error = cudaGetDeviceProperties(&prop, best_device);
    if (error == cudaSuccess) {
        printf("✅ Optimized CUDA Device: %s\n", prop.name);
        printf(" Compute Capability: %d.%d\n", prop.major, prop.minor);
        printf(" Global Memory: %zu MB\n", prop.totalGlobalMem / (1024 * 1024));
        printf(" Shared Memory per Block: %zu KB\n", prop.sharedMemPerBlock / 1024);
        printf(" Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
        printf(" Warp Size: %d\n", prop.warpSize);
        printf(" Max Grid Size: [%d, %d, %d]\n",
               prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    }

    return error;
}

// Optimized field addition with flat arrays
// NOTE: error handling is abbreviated; early returns below do not free buffers that were
// already allocated, so callers should treat any failure as fatal for the process
cudaError_t gpu_optimized_field_addition(
    const uint64_t* a_flat,
    const uint64_t* b_flat,
    uint64_t* result_flat,
    const uint64_t* modulus,
    int num_elements
) {
    // Allocate device memory
    uint64_t *d_a, *d_b, *d_result, *d_modulus;

    size_t flat_size = num_elements * 4 * sizeof(uint64_t); // 4 limbs per element
    size_t modulus_size = 4 * sizeof(uint64_t);

    cudaError_t error = cudaMalloc(&d_a, flat_size);
    if (error != cudaSuccess) return error;

    error = cudaMalloc(&d_b, flat_size);
    if (error != cudaSuccess) return error;

    error = cudaMalloc(&d_result, flat_size);
    if (error != cudaSuccess) return error;

    error = cudaMalloc(&d_modulus, modulus_size);
    if (error != cudaSuccess) return error;

    // Copy data to device
    error = cudaMemcpy(d_a, a_flat, flat_size, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) return error;

    error = cudaMemcpy(d_b, b_flat, flat_size, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) return error;

    error = cudaMemcpy(d_modulus, modulus, modulus_size, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) return error;

    // Launch optimized kernel
    int threadsPerBlock = 256; // Works well on most GPUs
    int blocksPerGrid = (num_elements + threadsPerBlock - 1) / threadsPerBlock;

    // Ensure we have enough blocks for good GPU utilization
    blocksPerGrid = max(blocksPerGrid, 32); // Minimum blocks for good occupancy

    printf("🚀 Launching optimized field addition kernel:\n");
    printf(" Elements: %d\n", num_elements);
    printf(" Blocks: %d\n", blocksPerGrid);
    printf(" Threads per Block: %d\n", threadsPerBlock);
    printf(" Total Threads: %d\n", blocksPerGrid * threadsPerBlock);

    // Use optimized kernel
    optimized_field_addition_kernel<<<blocksPerGrid, threadsPerBlock>>>(
        d_a, d_b, d_result, d_modulus, num_elements
    );

    // Check for kernel launch errors
    error = cudaGetLastError();
    if (error != cudaSuccess) return error;

    // Synchronize to ensure kernel completion
    error = cudaDeviceSynchronize();
    if (error != cudaSuccess) return error;

    // Copy result back to host
    error = cudaMemcpy(result_flat, d_result, flat_size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_result);
    cudaFree(d_modulus);

    return error;
}

// Vectorized field addition for better memory bandwidth
cudaError_t gpu_vectorized_field_addition(
    const field_vector_t* a_vec,
    const field_vector_t* b_vec,
    field_vector_t* result_vec,
    const uint64_t* modulus,
    int num_elements
) {
    // Allocate device memory
    field_vector_t *d_a, *d_b, *d_result;
    uint64_t *d_modulus;

    size_t vec_size = num_elements * sizeof(field_vector_t);
    size_t modulus_size = 4 * sizeof(uint64_t);

    cudaError_t error = cudaMalloc(&d_a, vec_size);
    if (error != cudaSuccess) return error;

    error = cudaMalloc(&d_b, vec_size);
    if (error != cudaSuccess) return error;

    error = cudaMalloc(&d_result, vec_size);
    if (error != cudaSuccess) return error;

    error = cudaMalloc(&d_modulus, modulus_size);
    if (error != cudaSuccess) return error;

    // Copy data to device
    error = cudaMemcpy(d_a, a_vec, vec_size, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) return error;

    error = cudaMemcpy(d_b, b_vec, vec_size, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) return error;

    error = cudaMemcpy(d_modulus, modulus, modulus_size, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) return error;

    // Launch vectorized kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (num_elements + threadsPerBlock - 1) / threadsPerBlock;
    blocksPerGrid = max(blocksPerGrid, 32);

    printf("🚀 Launching vectorized field addition kernel:\n");
    printf(" Elements: %d\n", num_elements);
    printf(" Blocks: %d\n", blocksPerGrid);
    printf(" Threads per Block: %d\n", threadsPerBlock);

    vectorized_field_addition_kernel<<<blocksPerGrid, threadsPerBlock>>>(
        d_a, d_b, d_result, d_modulus, num_elements
    );

    error = cudaGetLastError();
    if (error != cudaSuccess) return error;

    error = cudaDeviceSynchronize();
    if (error != cudaSuccess) return error;

    // Copy result back
    error = cudaMemcpy(result_vec, d_result, vec_size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_result);
    cudaFree(d_modulus);

    return error;
}

// Shared memory optimized field addition
cudaError_t gpu_shared_memory_field_addition(
    const uint64_t* a_flat,
    const uint64_t* b_flat,
    uint64_t* result_flat,
    const uint64_t* modulus,
    int num_elements
) {
    // Similar to the optimized version, but launches the shared memory kernel
    uint64_t *d_a, *d_b, *d_result, *d_modulus;

    size_t flat_size = num_elements * 4 * sizeof(uint64_t);
    size_t modulus_size = 4 * sizeof(uint64_t);

    cudaError_t error = cudaMalloc(&d_a, flat_size);
    if (error != cudaSuccess) return error;

    error = cudaMalloc(&d_b, flat_size);
    if (error != cudaSuccess) return error;

    error = cudaMalloc(&d_result, flat_size);
    if (error != cudaSuccess) return error;

    error = cudaMalloc(&d_modulus, modulus_size);
    if (error != cudaSuccess) return error;

    // Copy data
    error = cudaMemcpy(d_a, a_flat, flat_size, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) return error;

    error = cudaMemcpy(d_b, b_flat, flat_size, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) return error;

    error = cudaMemcpy(d_modulus, modulus, modulus_size, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) return error;

    // Launch shared memory kernel
    int threadsPerBlock = 256; // Matches shared memory tile size
    int blocksPerGrid = (num_elements + threadsPerBlock - 1) / threadsPerBlock;
    blocksPerGrid = max(blocksPerGrid, 32);

    printf("🚀 Launching shared memory field addition kernel:\n");
    printf(" Elements: %d\n", num_elements);
    printf(" Blocks: %d\n", blocksPerGrid);
    printf(" Threads per Block: %d\n", threadsPerBlock);

    shared_memory_field_addition_kernel<<<blocksPerGrid, threadsPerBlock>>>(
        d_a, d_b, d_result, d_modulus, num_elements
    );

    error = cudaGetLastError();
    if (error != cudaSuccess) return error;

    error = cudaDeviceSynchronize();
    if (error != cudaSuccess) return error;

    error = cudaMemcpy(result_flat, d_result, flat_size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_result);
    cudaFree(d_modulus);

    return error;
}

} // extern "C"
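
// Illustrative usage sketch (added; not part of the original file). Enable by compiling with a
// definition such as: nvcc -DFIELD_OPS_EXAMPLE_MAIN <this file> -o field_ops_example
// The macro name is an assumption, and the modulus below is a placeholder, not the real bn128 prime.
#ifdef FIELD_OPS_EXAMPLE_MAIN
int main() {
    if (init_optimized_cuda_device() != cudaSuccess) {
        return 1;
    }

    const int num_elements = 1024;
    const int num_limbs = num_elements * 4;

    uint64_t* a = new uint64_t[num_limbs];
    uint64_t* b = new uint64_t[num_limbs];
    uint64_t* result = new uint64_t[num_limbs];
    uint64_t modulus[4] = { ~0ULL, ~0ULL, ~0ULL, ~0ULL }; // placeholder modulus only

    // Deterministic inputs: element i = (i, 0, 0, 0), so element i of a + b should be (2*i, 0, 0, 0)
    for (int i = 0; i < num_elements; i++) {
        for (int j = 0; j < 4; j++) {
            a[i * 4 + j] = (j == 0) ? (uint64_t)i : 0;
            b[i * 4 + j] = (j == 0) ? (uint64_t)i : 0;
        }
    }

    cudaError_t err = gpu_optimized_field_addition(a, b, result, modulus, num_elements);
    if (err == cudaSuccess) {
        printf("element 7, low limb: %llu (expected 14)\n", (unsigned long long)result[7 * 4]);
    } else {
        printf("gpu_optimized_field_addition failed: %s\n", cudaGetErrorString(err));
    }

    delete[] a;
    delete[] b;
    delete[] result;
    return (err == cudaSuccess) ? 0 : 1;
}
#endif // FIELD_OPS_EXAMPLE_MAIN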