From 89a873307de466a4c4feeeb46d2ba70526e77d63 Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Mon, 20 Jan 2025 18:27:24 +0000 Subject: [PATCH] fix(gpu): general fixes on indexes used in multi-gpu context. - fix a bug in which the wrong GPU may be queried for the max shared memory - If multiple streams are running split through multiple GPUs, operations happening on a stream in GPU i should query GPU i about its max shared memory, - also fixes wrong indexing at rust side. --- .../cuda/include/integer/integer_utilities.h | 18 +- .../cuda/include/pbs/pbs_multibit_utilities.h | 4 +- .../cuda/include/pbs/pbs_utilities.h | 14 +- .../pbs/programmable_bootstrap_multibit.h | 2 +- .../cuda/src/pbs/bootstrapping_key.cu | 16 +- .../cuda/src/pbs/bootstrapping_key.cuh | 16 +- .../pbs/programmable_bootstrap_amortized.cuh | 10 +- .../pbs/programmable_bootstrap_cg_classic.cuh | 33 ++- .../programmable_bootstrap_cg_multibit.cuh | 33 ++- .../src/pbs/programmable_bootstrap_classic.cu | 46 ++-- .../pbs/programmable_bootstrap_classic.cuh | 9 +- .../pbs/programmable_bootstrap_multibit.cu | 37 +-- .../pbs/programmable_bootstrap_multibit.cuh | 8 +- .../programmable_bootstrap_tbc_classic.cuh | 25 +- .../programmable_bootstrap_tbc_multibit.cuh | 24 +- .../benchmarks/benchmark_pbs.cpp | 4 +- backends/tfhe-cuda-backend/src/bindings.rs | 225 +++++++++--------- tfhe/src/core_crypto/gpu/mod.rs | 16 +- tfhe/src/core_crypto/gpu/vec.rs | 32 +-- 19 files changed, 306 insertions(+), 266 deletions(-) diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h index 186b8c7ce2..c45c1d316e 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h @@ -149,7 +149,9 @@ template struct int_radix_lut { std::vector lwe_after_pbs_vec; std::vector lwe_trivial_indexes_vec; - int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t *gpu_indexes; + + int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_luts, uint32_t num_radix_blocks, bool allocate_gpu_memory) { @@ -160,6 +162,9 @@ template struct int_radix_lut { Torus lut_buffer_size = (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); + gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t)); + std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t)); + /////////////// active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -255,7 +260,7 @@ template struct int_radix_lut { } // constructor to reuse memory - int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes, + int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_luts, uint32_t num_radix_blocks, int_radix_lut *base_lut_object) { @@ -266,6 +271,9 @@ template struct int_radix_lut { Torus lut_buffer_size = (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); + gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t)); + std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t)); + // base lut object should have bigger or equal memory than current one assert(num_radix_blocks <= base_lut_object->num_blocks); // pbs @@ -332,7 +340,7 @@ template struct int_radix_lut { } // Construction for many luts - 
int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes, + int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_luts, uint32_t num_radix_blocks, uint32_t num_many_lut, bool allocate_gpu_memory) { @@ -344,6 +352,9 @@ template struct int_radix_lut { Torus lut_buffer_size = (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); + gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t)); + std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t)); + /////////////// active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -494,6 +505,7 @@ template struct int_radix_lut { void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { + free(this->gpu_indexes); for (uint i = 0; i < active_gpu_count; i++) { cuda_drop_async(lut_vec[i], streams[i], gpu_indexes[i]); cuda_drop_async(lut_indexes_vec[i], streams[i], gpu_indexes[i]); diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h index f0a54cca2f..147280628e 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h @@ -5,12 +5,12 @@ template bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( - uint32_t polynomial_size); + uint32_t polynomial_size, int max_shared_memory); template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count); + uint32_t level_count, int max_shared_memory); #if CUDA_ARCH >= 900 template diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h index 722a276235..2b4ea1cb8c 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h @@ -61,7 +61,7 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) { template bool supports_distributed_shared_memory_on_classic_programmable_bootstrap( - uint32_t polynomial_size); + uint32_t polynomial_size, int max_shared_memory); template struct pbs_buffer; @@ -77,10 +77,10 @@ template struct pbs_buffer { uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant, bool allocate_gpu_memory) { - + cudaSetDevice(gpu_index); this->pbs_variant = pbs_variant; - auto max_shared_memory = cuda_get_max_shared_memory(0); + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); if (allocate_gpu_memory) { switch (pbs_variant) { @@ -157,7 +157,7 @@ template struct pbs_buffer { bool supports_dsm = supports_distributed_shared_memory_on_classic_programmable_bootstrap< - Torus>(polynomial_size); + Torus>(polynomial_size, max_shared_memory); uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc( @@ -218,8 +218,7 @@ template struct pbs_buffer { template uint64_t get_buffer_size_programmable_bootstrap_cg( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count) { - int max_shared_memory = cuda_get_max_shared_memory(0); + uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) { uint64_t full_sm = 
get_buffer_size_full_sm_programmable_bootstrap_cg(polynomial_size); uint64_t partial_sm = @@ -245,7 +244,8 @@ template bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t num_samples); + uint32_t num_samples, + int max_shared_memory); template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h index 504c864069..4ab6491f92 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h @@ -8,7 +8,7 @@ extern "C" { bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t num_samples); + uint32_t num_samples, int max_shared_memory); void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64( void *stream, uint32_t gpu_index, void *dest, void const *src, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu index e018934cde..33f13fc106 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu @@ -106,10 +106,12 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index, int gridSize = total_polynomials; int blockSize = polynomial_size / choose_opt_amortized(polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); + double2 *buffer; switch (polynomial_size) { case 256: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -130,7 +132,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index, } break; case 512: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -151,7 +153,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index, } break; case 1024: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -172,7 +174,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index, } break; case 2048: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -193,7 +195,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index, } break; case 4096: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -214,7 +216,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index, } break; case 8192: - if (shared_memory_size <= 
cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -235,7 +237,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index, } break; case 16384: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh index 44df19c900..ba47fd56a1 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh @@ -111,10 +111,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); + double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); switch (polynomial_size) { case 256: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -132,7 +134,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 512: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -150,7 +152,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 1024: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -168,7 +170,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 2048: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -186,7 +188,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 4096: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -204,7 +206,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 8192: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -222,7 +224,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 16384: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { check_cuda_error(cudaFuncSetAttribute( 
batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh index 374bbce727..22c4587c65 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh @@ -233,9 +233,8 @@ uint64_t get_buffer_size_partial_sm_programmable_bootstrap_amortized( template uint64_t get_buffer_size_programmable_bootstrap_amortized( uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count) { + uint32_t input_lwe_ciphertext_count, int max_shared_memory) { - int max_shared_memory = cuda_get_max_shared_memory(0); uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_amortized( polynomial_size, glwe_dimension); @@ -265,7 +264,7 @@ __host__ void scratch_programmable_bootstrap_amortized( uint64_t partial_sm = get_buffer_size_partial_sm_programmable_bootstrap_amortized( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) { cudaFuncSetAttribute( device_programmable_bootstrap_amortized, @@ -284,7 +283,8 @@ __host__ void scratch_programmable_bootstrap_amortized( if (allocate_gpu_memory) { uint64_t buffer_size = get_buffer_size_programmable_bootstrap_amortized( - glwe_dimension, polynomial_size, input_lwe_ciphertext_count); + glwe_dimension, polynomial_size, input_lwe_ciphertext_count, + max_shared_memory); *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index); check_cuda_error(cudaGetLastError()); } @@ -311,7 +311,7 @@ __host__ void host_programmable_bootstrap_amortized( uint64_t DM_FULL = SM_FULL; - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); // Create a 1-dimensional grid of threads diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index c77b69b353..12073bc403 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -199,7 +199,7 @@ __host__ void scratch_programmable_bootstrap_cg( uint64_t partial_sm = get_buffer_size_partial_sm_programmable_bootstrap_cg( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) { check_cuda_error(cudaFuncSetAttribute( device_programmable_bootstrap_cg, @@ -246,7 +246,7 @@ __host__ void host_programmable_bootstrap_cg( get_buffer_size_partial_sm_programmable_bootstrap_cg( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); uint64_t full_dm = full_sm; @@ -300,7 +300,8 @@ __host__ void host_programmable_bootstrap_cg( // Verify if the grid size satisfies the cooperative group constraints template __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size( - int glwe_dimension, int level_count, int num_samples) { + int glwe_dimension, int level_count, int 
num_samples, + int max_shared_memory) { // If Cooperative Groups is not supported, no need to check anything else if (!cuda_check_support_cooperative_groups()) @@ -320,7 +321,6 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size( int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples; int max_active_blocks_per_sm; - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < partial_sm) { cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, @@ -346,30 +346,37 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size( // Verify if the grid size satisfies the cooperative group constraints template __host__ bool supports_cooperative_groups_on_programmable_bootstrap( - int glwe_dimension, int polynomial_size, int level_count, int num_samples) { + int glwe_dimension, int polynomial_size, int level_count, int num_samples, + int max_shared_memory) { switch (polynomial_size) { case 256: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 512: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 1024: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 2048: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 4096: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 8192: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 16384: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, - num_samples); + Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples, + max_shared_memory); default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" "Supported N's are powers of two" diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index 5eb59c5b78..01b8425950 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -225,7 +225,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap( get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); if (max_shared_memory < full_sm_keybundle) { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_keybundle, @@ -309,7 +309,7 @@ __host__ void execute_cg_external_product_loop( uint64_t no_dm = 0; auto lwe_chunk_size = buffer->lwe_chunk_size; - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); uint32_t keybundle_size_per_input = @@ -406,7 +406,8 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( // Verify if the grid size satisfies the cooperative group constraints template __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size( - int glwe_dimension, int level_count, int num_samples) { + int glwe_dimension, int level_count, int num_samples, + int max_shared_memory) { // If Cooperative Groups is not supported, no need to check anything else if (!cuda_check_support_cooperative_groups()) @@ -426,7 +427,6 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size( int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples; int max_active_blocks_per_sm; - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < partial_sm_cg_accumulate) { cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, @@ -457,30 +457,37 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size( // group constraints template __host__ bool supports_cooperative_groups_on_multibit_programmable_bootstrap( - int glwe_dimension, int polynomial_size, int level_count, int num_samples) { + int glwe_dimension, int polynomial_size, int level_count, int num_samples, + int max_shared_memory) { switch (polynomial_size) { case 256: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 512: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 1024: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 2048: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 4096: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<4096>>(glwe_dimension, 
level_count, num_samples); + Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 8192: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 16384: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, - num_samples); + Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples, + max_shared_memory); default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " "N's are powers of two" diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu index 17859dbdd8..11c7b40b36 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu @@ -8,46 +8,56 @@ template bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t num_samples) { + uint32_t num_samples, + int max_shared_memory) { return supports_cooperative_groups_on_programmable_bootstrap( - glwe_dimension, polynomial_size, level_count, num_samples); + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); } template bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count) { + uint32_t level_count, + int max_shared_memory) { #if CUDA_ARCH >= 900 switch (polynomial_size) { case 256: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 512: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<512>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 1024: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 2048: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 4096: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 8192: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 16384: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
Supported " "N's are powers of two" @@ -314,10 +324,11 @@ void scratch_cuda_programmable_bootstrap_32( uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); #if (CUDA_ARCH >= 900) if (has_support_to_cuda_programmable_bootstrap_tbc( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, - level_count)) + level_count, max_shared_memory)) scratch_cuda_programmable_bootstrap_tbc( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, @@ -326,7 +337,7 @@ void scratch_cuda_programmable_bootstrap_32( #endif if (has_support_to_cuda_programmable_bootstrap_cg( glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count)) + input_lwe_ciphertext_count, max_shared_memory)) scratch_cuda_programmable_bootstrap_cg( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, @@ -348,10 +359,11 @@ void scratch_cuda_programmable_bootstrap_64( uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); #if (CUDA_ARCH >= 900) if (has_support_to_cuda_programmable_bootstrap_tbc( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, - level_count)) + level_count, max_shared_memory)) scratch_cuda_programmable_bootstrap_tbc( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, @@ -360,7 +372,7 @@ void scratch_cuda_programmable_bootstrap_64( #endif if (has_support_to_cuda_programmable_bootstrap_cg( glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count)) + input_lwe_ciphertext_count, max_shared_memory)) scratch_cuda_programmable_bootstrap_cg( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, @@ -720,7 +732,7 @@ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, template bool has_support_to_cuda_programmable_bootstrap_cg( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t num_samples); + uint32_t num_samples, int max_shared_memory); template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, @@ -786,10 +798,10 @@ template void scratch_cuda_programmable_bootstrap( template bool has_support_to_cuda_programmable_bootstrap_tbc( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count); + uint32_t level_count, int max_shared_memory); template bool has_support_to_cuda_programmable_bootstrap_tbc( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count); + uint32_t level_count, int max_shared_memory); #if CUDA_ARCH >= 900 template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index 25701aca92..0826f53f8b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -262,7 +262,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) template uint64_t get_buffer_size_programmable_bootstrap( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count) { + uint32_t 
input_lwe_ciphertext_count, int max_shared_memory) { uint64_t full_sm_step_one = get_buffer_size_full_sm_programmable_bootstrap_step_one( @@ -278,7 +278,6 @@ uint64_t get_buffer_size_programmable_bootstrap( uint64_t full_dm = full_sm_step_one; uint64_t device_mem = 0; - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < partial_sm) { device_mem = full_dm * input_lwe_ciphertext_count * level_count * (glwe_dimension + 1); @@ -317,7 +316,7 @@ __host__ void scratch_programmable_bootstrap( uint64_t partial_sm = get_buffer_size_partial_sm_programmable_bootstrap(polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); // Configure step one if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) { @@ -373,7 +372,7 @@ __host__ void execute_step_one( uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) { - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); int thds = polynomial_size / params::opt; dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count); @@ -415,7 +414,7 @@ __host__ void execute_step_two( uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm, uint32_t num_many_lut, uint32_t lut_stride) { - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); int thds = polynomial_size / params::opt; dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index 488e938e67..828aecb0d7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -9,45 +9,53 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t num_samples) { + uint32_t num_samples, int max_shared_memory) { return supports_cooperative_groups_on_multibit_programmable_bootstrap< - uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples); + uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); } template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count) { + uint32_t level_count, int max_shared_memory) { #if CUDA_ARCH >= 900 switch (polynomial_size) { case 256: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 512: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<512>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 1024: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 2048: return 
supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 4096: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 8192: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 16384: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " "N's are powers of two" @@ -392,7 +400,7 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64( #if (CUDA_ARCH >= 900) if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, - level_count)) + level_count, cuda_get_max_shared_memory(gpu_index))) scratch_cuda_tbc_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, @@ -401,7 +409,8 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64( #endif if (supports_cooperative_groups_on_multibit_programmable_bootstrap< uint64_t>(glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count)) + input_lwe_ciphertext_count, + cuda_get_max_shared_memory(gpu_index))) scratch_cuda_cg_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, @@ -440,7 +449,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs, polynomial_size); int max_blocks_per_sm; - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); if (max_shared_memory < full_sm_keybundle) cudaOccupancyMaxActiveBlocksPerMultiprocessor( @@ -523,7 +532,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count); + uint32_t level_count, int max_shared_memory); #if (CUDA_ARCH >= 900) template diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index ba73d29bf7..9898adeb61 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -388,7 +388,7 @@ __host__ void scratch_multi_bit_programmable_bootstrap( uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); uint64_t full_sm_keybundle = get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( polynomial_size); @@ -506,7 +506,7 @@ __host__ void execute_compute_keybundle( uint64_t full_sm_keybundle = get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( 
polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); auto d_mem = buffer->d_mem_keybundle; @@ -550,7 +550,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, uint64_t partial_sm_accumulate_step_one = get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one< Torus>(polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); // @@ -604,7 +604,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, uint64_t full_sm_accumulate_step_two = get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); auto d_mem = buffer->d_mem_acc_step_two; diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index bbbf6f2eee..8b5aa94b1c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -201,9 +201,12 @@ __host__ void scratch_programmable_bootstrap_tbc( uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + cudaSetDevice(gpu_index); + + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); bool supports_dsm = supports_distributed_shared_memory_on_classic_programmable_bootstrap< - Torus>(polynomial_size); + Torus>(polynomial_size, max_shared_memory); uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc( polynomial_size); @@ -215,7 +218,6 @@ __host__ void scratch_programmable_bootstrap_tbc( minimum_sm_tbc = get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory >= full_sm + minimum_sm_tbc) { check_cuda_error(cudaFuncSetAttribute( @@ -262,10 +264,12 @@ __host__ void host_programmable_bootstrap_tbc( uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t num_many_lut, uint32_t lut_stride) { + cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); auto supports_dsm = supports_distributed_shared_memory_on_classic_programmable_bootstrap< - Torus>(polynomial_size); + Torus>(polynomial_size, max_shared_memory); // With SM each block corresponds to either the mask or body, no need to // duplicate data for each @@ -280,9 +284,6 @@ __host__ void host_programmable_bootstrap_tbc( get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); - cudaSetDevice(gpu_index); - uint64_t full_dm = full_sm; uint64_t partial_dm = full_dm - partial_sm; @@ -342,7 +343,8 @@ __host__ void host_programmable_bootstrap_tbc( // Verify if the grid size satisfies the cooperative group constraints template __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size( - int glwe_dimension, int level_count, int num_samples) { + int glwe_dimension, int level_count, int num_samples, + int max_shared_memory) { // If Cooperative Groups is not supported, no need to check anything else if 
(!cuda_check_support_cooperative_groups()) @@ -356,7 +358,6 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size( get_buffer_size_partial_sm_programmable_bootstrap_tbc( params::degree); - int max_shared_memory = cuda_get_max_shared_memory(0); int thds = params::degree / params::opt; // Get the maximum number of active blocks per streaming multiprocessors @@ -387,12 +388,11 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size( template bool supports_distributed_shared_memory_on_classic_programmable_bootstrap( - uint32_t polynomial_size) { + uint32_t polynomial_size, int max_shared_memory) { uint64_t minimum_sm = get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < minimum_sm) { // If we cannot store a single polynomial in a block shared memory we cannot // use TBC @@ -405,7 +405,7 @@ bool supports_distributed_shared_memory_on_classic_programmable_bootstrap( template __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count) { + uint32_t level_count, int max_shared_memory) { if (!cuda_check_support_thread_block_clusters() || num_samples > 128) return false; @@ -417,7 +417,7 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap( polynomial_size); uint64_t minimum_sm_tbc = 0; if (supports_distributed_shared_memory_on_classic_programmable_bootstrap< - Torus>(polynomial_size)) + Torus>(polynomial_size, max_shared_memory)) minimum_sm_tbc = get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( polynomial_size); @@ -440,7 +440,6 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap( * case and it will fail if we try. 
Thus, since level_count * * (glwe_dimension+1) is usually smaller than 8 at this moment, we will * disable cudaFuncAttributeNonPortableClusterSizeAllowed */ - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < partial_sm + minimum_sm_tbc) { check_cuda_error(cudaFuncSetAttribute( device_programmable_bootstrap_tbc, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index 701f80379b..6a512da7bd 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -204,10 +204,12 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap( pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); bool supports_dsm = supports_distributed_shared_memory_on_multibit_programmable_bootstrap< - Torus>(polynomial_size); + Torus>(polynomial_size, max_shared_memory); uint64_t full_sm_keybundle = get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( @@ -224,8 +226,6 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap( get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); - if (max_shared_memory < full_sm_keybundle) { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_keybundle, @@ -301,11 +301,14 @@ __host__ void execute_tbc_external_product_loop( uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut, uint32_t lut_stride) { + cudaSetDevice(gpu_index); auto lwe_chunk_size = buffer->lwe_chunk_size; + + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); auto supports_dsm = supports_distributed_shared_memory_on_multibit_programmable_bootstrap< - Torus>(polynomial_size); + Torus>(polynomial_size, max_shared_memory); uint64_t full_dm = get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap( @@ -319,9 +322,6 @@ __host__ void execute_tbc_external_product_loop( get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); - cudaSetDevice(gpu_index); - uint32_t keybundle_size_per_input = lwe_chunk_size * level_count * (glwe_dimension + 1) * (glwe_dimension + 1) * (polynomial_size / 2); @@ -426,12 +426,11 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( template bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( - uint32_t polynomial_size) { + uint32_t polynomial_size, int max_shared_memory) { uint64_t minimum_sm = get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory <= minimum_sm) { // If we cannot store a single polynomial in a block shared memory we // cannot use TBC @@ -444,7 +443,7 @@ bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( template __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count) { + uint32_t level_count, int max_shared_memory) { if 
(!cuda_check_support_thread_block_clusters()) return false; @@ -457,7 +456,7 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( polynomial_size); uint64_t minimum_sm_tbc_accumulate = 0; if (supports_distributed_shared_memory_on_multibit_programmable_bootstrap< - Torus>(polynomial_size)) + Torus>(polynomial_size, max_shared_memory)) minimum_sm_tbc_accumulate = get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap( polynomial_size); @@ -480,7 +479,6 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( * case and it will fail if we try. Thus, since level_count * * (glwe_dimension+1) is usually smaller than 8 at this moment, we will * disable cudaFuncAttributeNonPortableClusterSizeAllowed */ - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) { check_cuda_error(cudaFuncSetAttribute( @@ -520,5 +518,5 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( template bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( - uint32_t polynomial_size); + uint32_t polynomial_size, int max_shared_memory); #endif // FASTMULTIBIT_PBS_H diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp index 7b4cbe9802..5d912db23a 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp @@ -199,7 +199,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit) (benchmark::State &st) { if (!has_support_to_cuda_programmable_bootstrap_cg_multi_bit( glwe_dimension, polynomial_size, pbs_level, - input_lwe_ciphertext_count)) { + input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index))) { st.SkipWithError("Configuration not supported for fast operation"); return; } @@ -288,7 +288,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS) (benchmark::State &st) { if (!has_support_to_cuda_programmable_bootstrap_cg( glwe_dimension, polynomial_size, pbs_level, - input_lwe_ciphertext_count)) { + input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index))) { st.SkipWithError("Configuration not supported for fast operation"); return; } diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index fa16da0136..1fe30fde32 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -1,8 +1,8 @@ -/* automatically generated by rust-bindgen 0.70.1 */ +/* automatically generated by rust-bindgen 0.71.1 */ use crate::ffi; -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_ciphertext_vector_to_gpu_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -12,7 +12,7 @@ extern "C" { lwe_dimension: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_ciphertext_vector_to_cpu_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -22,7 +22,7 @@ extern "C" { lwe_dimension: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_glwe_sample_extract_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -37,7 +37,7 @@ extern "C" { pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0; pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1; pub type PBS_TYPE = ffi::c_uint; -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_compress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: 
*const u32, @@ -57,7 +57,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_decompress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -79,7 +79,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_compress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -91,7 +91,7 @@ extern "C" { mem_ptr: *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_decompress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -104,7 +104,7 @@ extern "C" { mem_ptr: *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_compress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -112,7 +112,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -141,7 +141,7 @@ pub const COMPARISON_TYPE_LE: COMPARISON_TYPE = 5; pub const COMPARISON_TYPE_MAX: COMPARISON_TYPE = 6; pub const COMPARISON_TYPE_MIN: COMPARISON_TYPE = 7; pub type COMPARISON_TYPE = ffi::c_uint; -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_apply_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -163,7 +163,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_apply_many_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -186,7 +186,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_apply_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -199,7 +199,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_apply_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -207,7 +207,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_apply_bivariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -229,7 +229,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_apply_bivariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -244,7 +244,7 @@ extern "C" { shift: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_apply_bivariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -252,7 +252,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_apply_many_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -267,7 +267,7 @@ extern "C" { lut_stride: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_full_propagation_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -287,7 +287,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_full_propagation_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -299,7 +299,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_full_propagation( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -307,7 +307,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn 
scratch_cuda_integer_mult_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -330,7 +330,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_mult_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -347,7 +347,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_mult( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -355,7 +355,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_negate_integer_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -368,7 +368,7 @@ extern "C" { carry_modulus: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -381,7 +381,7 @@ extern "C" { carry_modulus: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_logical_scalar_shift_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -404,7 +404,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_logical_scalar_shift_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -417,7 +417,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -440,7 +440,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -453,7 +453,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_logical_scalar_shift( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -461,7 +461,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_arithmetic_scalar_shift( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -469,7 +469,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_shift_and_rotate_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -493,7 +493,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_shift_and_rotate_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -506,7 +506,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_shift_and_rotate( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -514,7 +514,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_comparison_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -538,7 +538,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_comparison_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -552,7 +552,7 @@ extern "C" { lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_comparison_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -567,7 +567,7 @@ extern "C" { 
num_scalar_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_comparison( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -575,7 +575,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_bitop_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -598,7 +598,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_bitop_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -612,7 +612,7 @@ extern "C" { lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_bitop_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -628,7 +628,7 @@ extern "C" { op: BITOP_TYPE, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_bitop( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -636,7 +636,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_cmux_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -658,7 +658,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_cmux_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -673,7 +673,7 @@ extern "C" { lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_cmux( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -681,7 +681,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_scalar_rotate_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -704,7 +704,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_scalar_rotate_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -717,7 +717,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_scalar_rotate( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -725,7 +725,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -749,7 +749,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_add_and_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -773,7 +773,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -789,7 +789,7 @@ extern "C" { uses_carry: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_and_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -806,7 +806,7 @@ extern "C" { uses_carry: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_propagate_single_carry( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -814,7 +814,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_add_and_propagate_single_carry( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -822,7 +822,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } 
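All of the FFI declarations in bindings.rs change the same way: each `extern "C"` block becomes `unsafe extern "C"`, the block form accepted since Rust 1.82 and required by the Rust 2024 edition. A minimal sketch of the syntax, using a hypothetical symbol that is not part of this patch:

    // Hypothetical binding, for illustration only.
    // Undecorated items inside an `unsafe extern` block stay unsafe to call,
    // so call sites keep their explicit `unsafe { ... }` blocks.
    unsafe extern "C" {
        pub fn hypothetical_cuda_noop(gpu_index: u32);
    }

    fn demo() {
        unsafe { hypothetical_cuda_noop(0) };
    }
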
-extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_overflowing_sub_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -845,7 +845,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_overflowing_sub_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -862,7 +862,7 @@ extern "C" { uses_input_borrow: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_overflowing_sub( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -870,7 +870,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -892,7 +892,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -906,7 +906,7 @@ extern "C" { num_blocks_in_radix: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -914,7 +914,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_scalar_mul_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -935,7 +935,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -953,7 +953,7 @@ extern "C" { num_scalars: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_scalar_mul( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -961,7 +961,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_div_rem_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -984,7 +984,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_div_rem_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1000,7 +1000,7 @@ extern "C" { num_blocks_in_radix: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_div_rem( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1008,7 +1008,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1030,7 +1030,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_compute_prefix_sum_hillis_steele_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1044,7 +1044,7 @@ extern "C" { shift: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1052,7 +1052,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_reverse_blocks_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1062,7 +1062,7 @@ extern "C" { lwe_size: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, 
gpu_indexes: *const u32, @@ -1085,7 +1085,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_abs_inplace_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1098,7 +1098,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_abs_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1106,7 +1106,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_are_all_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1128,7 +1128,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_are_all_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1141,7 +1141,7 @@ extern "C" { num_radix_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_are_all_comparisons_block_true( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1149,7 +1149,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1171,7 +1171,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_is_at_least_one_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1184,7 +1184,7 @@ extern "C" { num_radix_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_is_at_least_one_comparisons_block_true( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1192,7 +1192,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_keyswitch_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1208,7 +1208,7 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_keyswitch_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1224,7 +1224,7 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_packing_keyswitch_lwe_list_to_glwe_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1236,7 +1236,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_packing_keyswitch_lwe_list_to_glwe_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1252,14 +1252,14 @@ extern "C" { num_lwes: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_packing_keyswitch_lwe_list_to_glwe( stream: *mut ffi::c_void, gpu_index: u32, fp_ks_buffer: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_negate_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1269,7 +1269,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_negate_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1279,7 +1279,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1290,7 +1290,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1301,7 +1301,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { 
+unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1312,7 +1312,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1323,7 +1323,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1334,7 +1334,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1345,7 +1345,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_plaintext_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1356,7 +1356,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_fourier_polynomial_mul( stream: *mut ffi::c_void, gpu_index: u32, @@ -1367,7 +1367,7 @@ extern "C" { total_polynomials: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_programmable_bootstrap_key_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1379,7 +1379,7 @@ extern "C" { polynomial_size: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_programmable_bootstrap_key_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1391,7 +1391,7 @@ extern "C" { polynomial_size: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_amortized_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1402,7 +1402,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_amortized_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1413,7 +1413,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1433,7 +1433,7 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1453,14 +1453,14 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_programmable_bootstrap_amortized( stream: *mut ffi::c_void, gpu_index: u32, pbs_buffer: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1472,7 +1472,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1484,7 +1484,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1506,7 +1506,7 @@ extern "C" { lut_stride: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1528,22 +1528,23 @@ extern "C" { lut_stride: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_programmable_bootstrap( stream: *mut ffi::c_void, gpu_index: u32, pbs_buffer: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn 
has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
         glwe_dimension: u32,
         polynomial_size: u32,
         level_count: u32,
         num_samples: u32,
+        max_shared_memory: ffi::c_int,
     ) -> bool;
 }
-extern "C" {
+unsafe extern "C" {
     pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
         stream: *mut ffi::c_void,
         gpu_index: u32,
@@ -1556,7 +1557,7 @@ extern "C" {
         grouping_factor: u32,
     );
 }
-extern "C" {
+unsafe extern "C" {
     pub fn scratch_cuda_multi_bit_programmable_bootstrap_64(
         stream: *mut ffi::c_void,
         gpu_index: u32,
@@ -1568,7 +1569,7 @@ extern "C" {
         allocate_gpu_memory: bool,
     );
 }
-extern "C" {
+unsafe extern "C" {
     pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
         stream: *mut ffi::c_void,
         gpu_index: u32,
@@ -1591,7 +1592,7 @@ extern "C" {
         lut_stride: u32,
     );
 }
-extern "C" {
+unsafe extern "C" {
     pub fn cleanup_cuda_multi_bit_programmable_bootstrap(
         stream: *mut ffi::c_void,
         gpu_index: u32,
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index 7ed226c3f3..07e341d981 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -332,12 +332,12 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async(
     polynomial_size: PolynomialSize,
 ) {
     let size = std::mem::size_of_val(src);
-    for (gpu_index, &stream) in streams.ptr.iter().enumerate() {
+    for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
         assert_eq!(dest.len() * std::mem::size_of::<T>(), size);
         cuda_convert_lwe_programmable_bootstrap_key_64(
-            stream,
-            streams.gpu_indexes[gpu_index].0,
-            dest.get_mut_c_ptr(gpu_index as u32),
+            stream_ptr,
+            streams.gpu_indexes[i].0,
+            dest.as_mut_c_ptr(i as u32),
             src.as_ptr().cast(),
             input_lwe_dim.0 as u32,
             glwe_dim.0 as u32,
@@ -365,12 +365,12 @@ pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async(
 ) {
     let size = std::mem::size_of_val(src);
-    for (gpu_index, &stream) in streams.ptr.iter().enumerate() {
+    for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
         assert_eq!(dest.len() * std::mem::size_of::<T>(), size);
         cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
-            stream,
-            streams.gpu_indexes[gpu_index].0,
-            dest.as_mut_c_ptr(gpu_index as u32),
+            stream_ptr,
+            streams.gpu_indexes[i].0,
+            dest.as_mut_c_ptr(i as u32),
             src.as_ptr().cast(),
             input_lwe_dim.0 as u32,
             glwe_dim.0 as u32,
diff --git a/tfhe/src/core_crypto/gpu/vec.rs b/tfhe/src/core_crypto/gpu/vec.rs
index d9a84ca205..377b30abda 100644
--- a/tfhe/src/core_crypto/gpu/vec.rs
+++ b/tfhe/src/core_crypto/gpu/vec.rs
@@ -93,18 +93,13 @@ impl<T: Numeric> CudaVec<T> {
     pub fn new_multi_gpu(len: usize, streams: &CudaStreams) -> Self {
         let size = len as u64 * std::mem::size_of::<T>() as u64;
         let mut ptrs = Vec::with_capacity(streams.len());
-        for (index, &stream) in streams.ptr.iter().enumerate() {
-            let ptr = unsafe { cuda_malloc_async(size, stream, index as u32) };
+        for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
+            let gpu_index = streams.gpu_indexes[i];
+            let ptr = unsafe { cuda_malloc_async(size, stream_ptr, gpu_index.0) };
             unsafe {
-                cuda_memset_async(
-                    ptr,
-                    0u64,
-                    size,
-                    streams.ptr[index],
-                    streams.gpu_indexes[index].0,
-                );
+                cuda_memset_async(ptr, 0u64, size, stream_ptr, gpu_index.0);
             }
-            streams.synchronize_one(index as u32);
+            streams.synchronize_one(i as u32);
             ptrs.push(ptr);
         }

@@ -203,19 +198,20 @@ impl<T: Numeric> CudaVec<T> {
     where
         T: Numeric,
     {
-        for (gpu_index, &stream) in streams.ptr.iter().enumerate() {
+        for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
             assert!(self.len() >= src.len());
             let size = std::mem::size_of_val(src);
             // We have to check that src is not empty, because Rust slice with size 0 results in an
             // invalid pointer being passed to copy_to_gpu_async
             if size > 0 {
+                let gpu_index = streams.gpu_indexes[i];
                 cuda_memcpy_async_to_gpu(
-                    self.get_mut_c_ptr(gpu_index as u32),
+                    self.get_mut_c_ptr(i as u32),
                     src.as_ptr().cast(),
                     size as u64,
-                    stream,
-                    streams.gpu_indexes[gpu_index].0,
+                    stream_ptr,
+                    gpu_index.0,
                 );
             }
         }
@@ -442,13 +438,9 @@ unsafe impl<T> Sync for CudaVec<T> where T: Sync + Numeric {}
 impl<T: Numeric> Drop for CudaVec<T> {
     /// Free memory for pointer `ptr` synchronously
     fn drop(&mut self) {
-        for (ptr, gpu_index) in self
-            .ptr
-            .iter()
-            .copied()
-            .zip(self.gpu_indexes.iter().copied())
-        {
+        for (i, &ptr) in self.ptr.iter().enumerate() {
             // Synchronizes the device to be sure no stream is still using this pointer
+            let gpu_index = self.gpu_indexes[i];
             synchronize_device(gpu_index.0);
             unsafe { cuda_drop(ptr, gpu_index.0) };
         }
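The Rust-side hunks above all apply the same convention: the loop variable `i` is the position of a stream in `streams.ptr`, per-GPU buffers and streams are selected with that position (`as_mut_c_ptr(i as u32)`, `synchronize_one(i as u32)`), while the value handed to CUDA device APIs is the device ordinal `streams.gpu_indexes[i].0`, which may differ from `i` when only a subset of the machine's GPUs is used. A standalone sketch of the distinction; the `Streams` struct below is a simplified stand-in for illustration, not the crate's `CudaStreams` type:

    // Simplified stand-in: `ptr[i]` is the stream created on device `gpu_indexes[i]`.
    struct Streams {
        ptr: Vec<usize>,       // placeholder for raw cudaStream_t handles
        gpu_indexes: Vec<u32>, // CUDA device ordinals, e.g. [2, 3]
    }

    fn main() {
        let streams = Streams { ptr: vec![0xA, 0xB], gpu_indexes: vec![2, 3] };
        for (i, &stream) in streams.ptr.iter().enumerate() {
            // `i` picks per-GPU buffers and streams (position in the list);
            // `gpu_indexes[i]` is the ordinal the CUDA runtime expects.
            let device_ordinal = streams.gpu_indexes[i];
            println!("stream #{i} (handle {stream:#x}) runs on CUDA device {device_ordinal}");
        }
    }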