Skip to content

Commit

Permalink
fix(gpu): general fixes to indexes used in a multi-GPU context.
Browse files Browse the repository at this point in the history
- Fix a bug in which the wrong GPU may be queried for the max shared memory.
- If multiple streams are split across multiple GPUs, operations happening
  on a stream on GPU i should query GPU i for its max shared memory.
- Also fixes wrong indexing on the Rust side.
  • Loading branch information
pdroalves committed Jan 21, 2025
1 parent f66805e commit 89a8733
Show file tree
Hide file tree
Showing 19 changed files with 306 additions and 266 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,9 @@ template <typename Torus> struct int_radix_lut {
std::vector<Torus *> lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec;

int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t *gpu_indexes;

int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {

Expand All @@ -160,6 +162,9 @@ template <typename Torus> struct int_radix_lut {
Torus lut_buffer_size =
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);

gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t));

///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
Expand Down Expand Up @@ -255,7 +260,7 @@ template <typename Torus> struct int_radix_lut {
}

// constructor to reuse memory
int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, int_radix_lut *base_lut_object) {

Expand All @@ -266,6 +271,9 @@ template <typename Torus> struct int_radix_lut {
Torus lut_buffer_size =
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);

gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t));

// base lut object should have bigger or equal memory than current one
assert(num_radix_blocks <= base_lut_object->num_blocks);
// pbs
Expand Down Expand Up @@ -332,7 +340,7 @@ template <typename Torus> struct int_radix_lut {
}

// Construction for many luts
int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, uint32_t num_many_lut,
bool allocate_gpu_memory) {
Expand All @@ -344,6 +352,9 @@ template <typename Torus> struct int_radix_lut {
Torus lut_buffer_size =
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);

gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t));

///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
Expand Down Expand Up @@ -494,6 +505,7 @@ template <typename Torus> struct int_radix_lut {

void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
free(this->gpu_indexes);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_drop_async(lut_vec[i], streams[i], gpu_indexes[i]);
cuda_drop_async(lut_indexes_vec[i], streams[i], gpu_indexes[i]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@

template <typename Torus>
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size);
uint32_t polynomial_size, int max_shared_memory);

template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count);
uint32_t level_count, int max_shared_memory);

#if CUDA_ARCH >= 900
template <typename Torus>
Expand Down
14 changes: 7 additions & 7 deletions backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {

template <typename Torus>
bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size);
uint32_t polynomial_size, int max_shared_memory);

template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;

Expand All @@ -77,10 +77,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
bool allocate_gpu_memory) {

cudaSetDevice(gpu_index);
this->pbs_variant = pbs_variant;

auto max_shared_memory = cuda_get_max_shared_memory(0);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

if (allocate_gpu_memory) {
switch (pbs_variant) {
Expand Down Expand Up @@ -157,7 +157,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

bool supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
Torus>(polynomial_size);
Torus>(polynomial_size, max_shared_memory);

uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
Expand Down Expand Up @@ -218,8 +218,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
template <typename Torus>
uint64_t get_buffer_size_programmable_bootstrap_cg(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
int max_shared_memory = cuda_get_max_shared_memory(0);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
uint64_t partial_sm =
Expand All @@ -245,7 +244,8 @@ template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples);
uint32_t num_samples,
int max_shared_memory);

template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ extern "C" {

bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples);
uint32_t num_samples, int max_shared_memory);

void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void const *src,
Expand Down
16 changes: 9 additions & 7 deletions backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,12 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
int gridSize = total_polynomials;
int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);

int max_shared_memory = cuda_get_max_shared_memory(gpu_index);

double2 *buffer;
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
Expand All @@ -130,7 +132,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<521>, ForwardFFT>,
Expand All @@ -151,7 +153,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
Expand All @@ -172,7 +174,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
Expand All @@ -193,7 +195,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
Expand All @@ -214,7 +216,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
Expand All @@ -235,7 +237,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
Expand Down
16 changes: 9 additions & 7 deletions backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,

cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index);

int max_shared_memory = cuda_get_max_shared_memory(gpu_index);

double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
Expand All @@ -132,7 +134,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
Expand All @@ -150,7 +152,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
Expand All @@ -168,7 +170,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
Expand All @@ -186,7 +188,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
Expand All @@ -204,7 +206,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
Expand All @@ -222,7 +224,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -233,9 +233,8 @@ uint64_t get_buffer_size_partial_sm_programmable_bootstrap_amortized(
template <typename Torus>
uint64_t get_buffer_size_programmable_bootstrap_amortized(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count) {
uint32_t input_lwe_ciphertext_count, int max_shared_memory) {

int max_shared_memory = cuda_get_max_shared_memory(0);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
Expand Down Expand Up @@ -265,7 +264,7 @@ __host__ void scratch_programmable_bootstrap_amortized(
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(0);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
cudaFuncSetAttribute(
device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
Expand All @@ -284,7 +283,8 @@ __host__ void scratch_programmable_bootstrap_amortized(
if (allocate_gpu_memory) {
uint64_t buffer_size =
get_buffer_size_programmable_bootstrap_amortized<Torus>(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
Expand All @@ -311,7 +311,7 @@ __host__ void host_programmable_bootstrap_amortized(

uint64_t DM_FULL = SM_FULL;

int max_shared_memory = cuda_get_max_shared_memory(0);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cudaSetDevice(gpu_index);

// Create a 1-dimensional grid of threads
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ __host__ void scratch_programmable_bootstrap_cg(
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(0);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
Expand Down Expand Up @@ -246,7 +246,7 @@ __host__ void host_programmable_bootstrap_cg(
get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
polynomial_size);

int max_shared_memory = cuda_get_max_shared_memory(0);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cudaSetDevice(gpu_index);

uint64_t full_dm = full_sm;
Expand Down Expand Up @@ -300,7 +300,8 @@ __host__ void host_programmable_bootstrap_cg(
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
int glwe_dimension, int level_count, int num_samples) {
int glwe_dimension, int level_count, int num_samples,
int max_shared_memory) {

// If Cooperative Groups is not supported, no need to check anything else
if (!cuda_check_support_cooperative_groups())
Expand All @@ -320,7 +321,6 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
int max_active_blocks_per_sm;

int max_shared_memory = cuda_get_max_shared_memory(0);
if (max_shared_memory < partial_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
Expand All @@ -346,30 +346,37 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus>
__host__ bool supports_cooperative_groups_on_programmable_bootstrap(
int glwe_dimension, int polynomial_size, int level_count, int num_samples) {
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
int max_shared_memory) {
switch (polynomial_size) {
case 256:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples);
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 512:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples);
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 1024:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples);
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 2048:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples);
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 4096:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples);
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 8192:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples);
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 16384:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count,
num_samples);
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
Expand Down
Loading

0 comments on commit 89a8733

Please sign in to comment.