From 89a873307de466a4c4feeeb46d2ba70526e77d63 Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Mon, 20 Jan 2025 18:27:24 +0000 Subject: [PATCH] fix(gpu): general fixes on indexes used in multi-gpu context. - fix a bug in which the wrong GPU may be queried for the max shared memory - If multiple streams are running split through multiple GPUs, operations happening on a stream in GPU i should query GPU i about its max shared memory, - also fixes wrong indexing at rust side. --- .../cuda/include/integer/integer_utilities.h | 18 +- .../cuda/include/pbs/pbs_multibit_utilities.h | 4 +- .../cuda/include/pbs/pbs_utilities.h | 14 +- .../pbs/programmable_bootstrap_multibit.h | 2 +- .../cuda/src/pbs/bootstrapping_key.cu | 16 +- .../cuda/src/pbs/bootstrapping_key.cuh | 16 +- .../pbs/programmable_bootstrap_amortized.cuh | 10 +- .../pbs/programmable_bootstrap_cg_classic.cuh | 33 ++- .../programmable_bootstrap_cg_multibit.cuh | 33 ++- .../src/pbs/programmable_bootstrap_classic.cu | 46 ++-- .../pbs/programmable_bootstrap_classic.cuh | 9 +- .../pbs/programmable_bootstrap_multibit.cu | 37 +-- .../pbs/programmable_bootstrap_multibit.cuh | 8 +- .../programmable_bootstrap_tbc_classic.cuh | 25 +- .../programmable_bootstrap_tbc_multibit.cuh | 24 +- .../benchmarks/benchmark_pbs.cpp | 4 +- backends/tfhe-cuda-backend/src/bindings.rs | 225 +++++++++--------- tfhe/src/core_crypto/gpu/mod.rs | 16 +- tfhe/src/core_crypto/gpu/vec.rs | 32 +-- 19 files changed, 306 insertions(+), 266 deletions(-) diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h index 186b8c7ce2..c45c1d316e 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h @@ -149,7 +149,9 @@ template struct int_radix_lut { std::vector lwe_after_pbs_vec; std::vector lwe_trivial_indexes_vec; - int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t *gpu_indexes; + + int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_luts, uint32_t num_radix_blocks, bool allocate_gpu_memory) { @@ -160,6 +162,9 @@ template struct int_radix_lut { Torus lut_buffer_size = (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); + gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t)); + std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t)); + /////////////// active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -255,7 +260,7 @@ template struct int_radix_lut { } // constructor to reuse memory - int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes, + int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_luts, uint32_t num_radix_blocks, int_radix_lut *base_lut_object) { @@ -266,6 +271,9 @@ template struct int_radix_lut { Torus lut_buffer_size = (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); + gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t)); + std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t)); + // base lut object should have bigger or equal memory than current one assert(num_radix_blocks <= base_lut_object->num_blocks); // pbs @@ -332,7 +340,7 @@ template struct int_radix_lut { } // Construction for many luts - 
int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes, + int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_luts, uint32_t num_radix_blocks, uint32_t num_many_lut, bool allocate_gpu_memory) { @@ -344,6 +352,9 @@ template struct int_radix_lut { Torus lut_buffer_size = (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); + gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t)); + std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t)); + /////////////// active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -494,6 +505,7 @@ template struct int_radix_lut { void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { + free(this->gpu_indexes); for (uint i = 0; i < active_gpu_count; i++) { cuda_drop_async(lut_vec[i], streams[i], gpu_indexes[i]); cuda_drop_async(lut_indexes_vec[i], streams[i], gpu_indexes[i]); diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h index f0a54cca2f..147280628e 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h @@ -5,12 +5,12 @@ template bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( - uint32_t polynomial_size); + uint32_t polynomial_size, int max_shared_memory); template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count); + uint32_t level_count, int max_shared_memory); #if CUDA_ARCH >= 900 template diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h index 722a276235..2b4ea1cb8c 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h @@ -61,7 +61,7 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) { template bool supports_distributed_shared_memory_on_classic_programmable_bootstrap( - uint32_t polynomial_size); + uint32_t polynomial_size, int max_shared_memory); template struct pbs_buffer; @@ -77,10 +77,10 @@ template struct pbs_buffer { uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant, bool allocate_gpu_memory) { - + cudaSetDevice(gpu_index); this->pbs_variant = pbs_variant; - auto max_shared_memory = cuda_get_max_shared_memory(0); + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); if (allocate_gpu_memory) { switch (pbs_variant) { @@ -157,7 +157,7 @@ template struct pbs_buffer { bool supports_dsm = supports_distributed_shared_memory_on_classic_programmable_bootstrap< - Torus>(polynomial_size); + Torus>(polynomial_size, max_shared_memory); uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc( @@ -218,8 +218,7 @@ template struct pbs_buffer { template uint64_t get_buffer_size_programmable_bootstrap_cg( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count) { - int max_shared_memory = cuda_get_max_shared_memory(0); + uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) { uint64_t full_sm = 
get_buffer_size_full_sm_programmable_bootstrap_cg(polynomial_size); uint64_t partial_sm = @@ -245,7 +244,8 @@ template bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t num_samples); + uint32_t num_samples, + int max_shared_memory); template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h index 504c864069..4ab6491f92 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h @@ -8,7 +8,7 @@ extern "C" { bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t num_samples); + uint32_t num_samples, int max_shared_memory); void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64( void *stream, uint32_t gpu_index, void *dest, void const *src, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu index e018934cde..33f13fc106 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu @@ -106,10 +106,12 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index, int gridSize = total_polynomials; int blockSize = polynomial_size / choose_opt_amortized(polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); + double2 *buffer; switch (polynomial_size) { case 256: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -130,7 +132,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index, } break; case 512: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -151,7 +153,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index, } break; case 1024: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -172,7 +174,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index, } break; case 2048: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -193,7 +195,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index, } break; case 4096: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -214,7 +216,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index, } break; case 8192: - if (shared_memory_size <= 
cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -235,7 +237,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index, } break; case 16384: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh index 44df19c900..ba47fd56a1 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh @@ -111,10 +111,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); + double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); switch (polynomial_size) { case 256: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -132,7 +134,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 512: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -150,7 +152,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 1024: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -168,7 +170,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 2048: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -186,7 +188,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 4096: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -204,7 +206,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 8192: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -222,7 +224,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 16384: - if (shared_memory_size <= cuda_get_max_shared_memory(0)) { + if (shared_memory_size <= max_shared_memory) { check_cuda_error(cudaFuncSetAttribute( 
batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh index 374bbce727..22c4587c65 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh @@ -233,9 +233,8 @@ uint64_t get_buffer_size_partial_sm_programmable_bootstrap_amortized( template uint64_t get_buffer_size_programmable_bootstrap_amortized( uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count) { + uint32_t input_lwe_ciphertext_count, int max_shared_memory) { - int max_shared_memory = cuda_get_max_shared_memory(0); uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_amortized( polynomial_size, glwe_dimension); @@ -265,7 +264,7 @@ __host__ void scratch_programmable_bootstrap_amortized( uint64_t partial_sm = get_buffer_size_partial_sm_programmable_bootstrap_amortized( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) { cudaFuncSetAttribute( device_programmable_bootstrap_amortized, @@ -284,7 +283,8 @@ __host__ void scratch_programmable_bootstrap_amortized( if (allocate_gpu_memory) { uint64_t buffer_size = get_buffer_size_programmable_bootstrap_amortized( - glwe_dimension, polynomial_size, input_lwe_ciphertext_count); + glwe_dimension, polynomial_size, input_lwe_ciphertext_count, + max_shared_memory); *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index); check_cuda_error(cudaGetLastError()); } @@ -311,7 +311,7 @@ __host__ void host_programmable_bootstrap_amortized( uint64_t DM_FULL = SM_FULL; - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); // Create a 1-dimensional grid of threads diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index c77b69b353..12073bc403 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -199,7 +199,7 @@ __host__ void scratch_programmable_bootstrap_cg( uint64_t partial_sm = get_buffer_size_partial_sm_programmable_bootstrap_cg( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) { check_cuda_error(cudaFuncSetAttribute( device_programmable_bootstrap_cg, @@ -246,7 +246,7 @@ __host__ void host_programmable_bootstrap_cg( get_buffer_size_partial_sm_programmable_bootstrap_cg( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); uint64_t full_dm = full_sm; @@ -300,7 +300,8 @@ __host__ void host_programmable_bootstrap_cg( // Verify if the grid size satisfies the cooperative group constraints template __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size( - int glwe_dimension, int level_count, int num_samples) { + int glwe_dimension, int level_count, int 
num_samples, + int max_shared_memory) { // If Cooperative Groups is not supported, no need to check anything else if (!cuda_check_support_cooperative_groups()) @@ -320,7 +321,6 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size( int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples; int max_active_blocks_per_sm; - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < partial_sm) { cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, @@ -346,30 +346,37 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size( // Verify if the grid size satisfies the cooperative group constraints template __host__ bool supports_cooperative_groups_on_programmable_bootstrap( - int glwe_dimension, int polynomial_size, int level_count, int num_samples) { + int glwe_dimension, int polynomial_size, int level_count, int num_samples, + int max_shared_memory) { switch (polynomial_size) { case 256: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 512: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 1024: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 2048: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 4096: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 8192: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 16384: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, - num_samples); + Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples, + max_shared_memory); default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" "Supported N's are powers of two" diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index 5eb59c5b78..01b8425950 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -225,7 +225,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap( get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); if (max_shared_memory < full_sm_keybundle) { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_keybundle, @@ -309,7 +309,7 @@ __host__ void execute_cg_external_product_loop( uint64_t no_dm = 0; auto lwe_chunk_size = buffer->lwe_chunk_size; - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); uint32_t keybundle_size_per_input = @@ -406,7 +406,8 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( // Verify if the grid size satisfies the cooperative group constraints template __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size( - int glwe_dimension, int level_count, int num_samples) { + int glwe_dimension, int level_count, int num_samples, + int max_shared_memory) { // If Cooperative Groups is not supported, no need to check anything else if (!cuda_check_support_cooperative_groups()) @@ -426,7 +427,6 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size( int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples; int max_active_blocks_per_sm; - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < partial_sm_cg_accumulate) { cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, @@ -457,30 +457,37 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size( // group constraints template __host__ bool supports_cooperative_groups_on_multibit_programmable_bootstrap( - int glwe_dimension, int polynomial_size, int level_count, int num_samples) { + int glwe_dimension, int polynomial_size, int level_count, int num_samples, + int max_shared_memory) { switch (polynomial_size) { case 256: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 512: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 1024: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 2048: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 4096: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<4096>>(glwe_dimension, 
level_count, num_samples); + Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 8192: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples); + Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples, + max_shared_memory); case 16384: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, - num_samples); + Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples, + max_shared_memory); default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " "N's are powers of two" diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu index 17859dbdd8..11c7b40b36 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu @@ -8,46 +8,56 @@ template bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t num_samples) { + uint32_t num_samples, + int max_shared_memory) { return supports_cooperative_groups_on_programmable_bootstrap( - glwe_dimension, polynomial_size, level_count, num_samples); + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); } template bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count) { + uint32_t level_count, + int max_shared_memory) { #if CUDA_ARCH >= 900 switch (polynomial_size) { case 256: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 512: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<512>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 1024: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 2048: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 4096: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 8192: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 16384: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
Supported " "N's are powers of two" @@ -314,10 +324,11 @@ void scratch_cuda_programmable_bootstrap_32( uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); #if (CUDA_ARCH >= 900) if (has_support_to_cuda_programmable_bootstrap_tbc( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, - level_count)) + level_count, max_shared_memory)) scratch_cuda_programmable_bootstrap_tbc( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, @@ -326,7 +337,7 @@ void scratch_cuda_programmable_bootstrap_32( #endif if (has_support_to_cuda_programmable_bootstrap_cg( glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count)) + input_lwe_ciphertext_count, max_shared_memory)) scratch_cuda_programmable_bootstrap_cg( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, @@ -348,10 +359,11 @@ void scratch_cuda_programmable_bootstrap_64( uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); #if (CUDA_ARCH >= 900) if (has_support_to_cuda_programmable_bootstrap_tbc( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, - level_count)) + level_count, max_shared_memory)) scratch_cuda_programmable_bootstrap_tbc( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, @@ -360,7 +372,7 @@ void scratch_cuda_programmable_bootstrap_64( #endif if (has_support_to_cuda_programmable_bootstrap_cg( glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count)) + input_lwe_ciphertext_count, max_shared_memory)) scratch_cuda_programmable_bootstrap_cg( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, @@ -720,7 +732,7 @@ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, template bool has_support_to_cuda_programmable_bootstrap_cg( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t num_samples); + uint32_t num_samples, int max_shared_memory); template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, @@ -786,10 +798,10 @@ template void scratch_cuda_programmable_bootstrap( template bool has_support_to_cuda_programmable_bootstrap_tbc( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count); + uint32_t level_count, int max_shared_memory); template bool has_support_to_cuda_programmable_bootstrap_tbc( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count); + uint32_t level_count, int max_shared_memory); #if CUDA_ARCH >= 900 template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index 25701aca92..0826f53f8b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -262,7 +262,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) template uint64_t get_buffer_size_programmable_bootstrap( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count) { + uint32_t 
input_lwe_ciphertext_count, int max_shared_memory) { uint64_t full_sm_step_one = get_buffer_size_full_sm_programmable_bootstrap_step_one( @@ -278,7 +278,6 @@ uint64_t get_buffer_size_programmable_bootstrap( uint64_t full_dm = full_sm_step_one; uint64_t device_mem = 0; - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < partial_sm) { device_mem = full_dm * input_lwe_ciphertext_count * level_count * (glwe_dimension + 1); @@ -317,7 +316,7 @@ __host__ void scratch_programmable_bootstrap( uint64_t partial_sm = get_buffer_size_partial_sm_programmable_bootstrap(polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); // Configure step one if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) { @@ -373,7 +372,7 @@ __host__ void execute_step_one( uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) { - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); int thds = polynomial_size / params::opt; dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count); @@ -415,7 +414,7 @@ __host__ void execute_step_two( uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm, uint32_t num_many_lut, uint32_t lut_stride) { - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); int thds = polynomial_size / params::opt; dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index 488e938e67..828aecb0d7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -9,45 +9,53 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t num_samples) { + uint32_t num_samples, int max_shared_memory) { return supports_cooperative_groups_on_multibit_programmable_bootstrap< - uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples); + uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); } template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count) { + uint32_t level_count, int max_shared_memory) { #if CUDA_ARCH >= 900 switch (polynomial_size) { case 256: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 512: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<512>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 1024: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 2048: return 
supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 4096: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 8192: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); case 16384: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, - polynomial_size, level_count); + polynomial_size, level_count, + max_shared_memory); default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " "N's are powers of two" @@ -392,7 +400,7 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64( #if (CUDA_ARCH >= 900) if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, - level_count)) + level_count, cuda_get_max_shared_memory(gpu_index))) scratch_cuda_tbc_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, @@ -401,7 +409,8 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64( #endif if (supports_cooperative_groups_on_multibit_programmable_bootstrap< uint64_t>(glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count)) + input_lwe_ciphertext_count, + cuda_get_max_shared_memory(gpu_index))) scratch_cuda_cg_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, @@ -440,7 +449,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs, polynomial_size); int max_blocks_per_sm; - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); if (max_shared_memory < full_sm_keybundle) cudaOccupancyMaxActiveBlocksPerMultiprocessor( @@ -523,7 +532,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count); + uint32_t level_count, int max_shared_memory); #if (CUDA_ARCH >= 900) template diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index ba73d29bf7..9898adeb61 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -388,7 +388,7 @@ __host__ void scratch_multi_bit_programmable_bootstrap( uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); uint64_t full_sm_keybundle = get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( polynomial_size); @@ -506,7 +506,7 @@ __host__ void execute_compute_keybundle( uint64_t full_sm_keybundle = get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( 
polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); auto d_mem = buffer->d_mem_keybundle; @@ -550,7 +550,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, uint64_t partial_sm_accumulate_step_one = get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one< Torus>(polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); // @@ -604,7 +604,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, uint64_t full_sm_accumulate_step_two = get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); cudaSetDevice(gpu_index); auto d_mem = buffer->d_mem_acc_step_two; diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index bbbf6f2eee..8b5aa94b1c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -201,9 +201,12 @@ __host__ void scratch_programmable_bootstrap_tbc( uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + cudaSetDevice(gpu_index); + + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); bool supports_dsm = supports_distributed_shared_memory_on_classic_programmable_bootstrap< - Torus>(polynomial_size); + Torus>(polynomial_size, max_shared_memory); uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc( polynomial_size); @@ -215,7 +218,6 @@ __host__ void scratch_programmable_bootstrap_tbc( minimum_sm_tbc = get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory >= full_sm + minimum_sm_tbc) { check_cuda_error(cudaFuncSetAttribute( @@ -262,10 +264,12 @@ __host__ void host_programmable_bootstrap_tbc( uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t num_many_lut, uint32_t lut_stride) { + cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); auto supports_dsm = supports_distributed_shared_memory_on_classic_programmable_bootstrap< - Torus>(polynomial_size); + Torus>(polynomial_size, max_shared_memory); // With SM each block corresponds to either the mask or body, no need to // duplicate data for each @@ -280,9 +284,6 @@ __host__ void host_programmable_bootstrap_tbc( get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); - cudaSetDevice(gpu_index); - uint64_t full_dm = full_sm; uint64_t partial_dm = full_dm - partial_sm; @@ -342,7 +343,8 @@ __host__ void host_programmable_bootstrap_tbc( // Verify if the grid size satisfies the cooperative group constraints template __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size( - int glwe_dimension, int level_count, int num_samples) { + int glwe_dimension, int level_count, int num_samples, + int max_shared_memory) { // If Cooperative Groups is not supported, no need to check anything else if 
(!cuda_check_support_cooperative_groups()) @@ -356,7 +358,6 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size( get_buffer_size_partial_sm_programmable_bootstrap_tbc( params::degree); - int max_shared_memory = cuda_get_max_shared_memory(0); int thds = params::degree / params::opt; // Get the maximum number of active blocks per streaming multiprocessors @@ -387,12 +388,11 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size( template bool supports_distributed_shared_memory_on_classic_programmable_bootstrap( - uint32_t polynomial_size) { + uint32_t polynomial_size, int max_shared_memory) { uint64_t minimum_sm = get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < minimum_sm) { // If we cannot store a single polynomial in a block shared memory we cannot // use TBC @@ -405,7 +405,7 @@ bool supports_distributed_shared_memory_on_classic_programmable_bootstrap( template __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count) { + uint32_t level_count, int max_shared_memory) { if (!cuda_check_support_thread_block_clusters() || num_samples > 128) return false; @@ -417,7 +417,7 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap( polynomial_size); uint64_t minimum_sm_tbc = 0; if (supports_distributed_shared_memory_on_classic_programmable_bootstrap< - Torus>(polynomial_size)) + Torus>(polynomial_size, max_shared_memory)) minimum_sm_tbc = get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( polynomial_size); @@ -440,7 +440,6 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap( * case and it will fail if we try. 
Thus, since level_count * * (glwe_dimension+1) is usually smaller than 8 at this moment, we will * disable cudaFuncAttributeNonPortableClusterSizeAllowed */ - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < partial_sm + minimum_sm_tbc) { check_cuda_error(cudaFuncSetAttribute( device_programmable_bootstrap_tbc, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index 701f80379b..6a512da7bd 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -204,10 +204,12 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap( pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); bool supports_dsm = supports_distributed_shared_memory_on_multibit_programmable_bootstrap< - Torus>(polynomial_size); + Torus>(polynomial_size, max_shared_memory); uint64_t full_sm_keybundle = get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( @@ -224,8 +226,6 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap( get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); - if (max_shared_memory < full_sm_keybundle) { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_keybundle, @@ -301,11 +301,14 @@ __host__ void execute_tbc_external_product_loop( uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut, uint32_t lut_stride) { + cudaSetDevice(gpu_index); auto lwe_chunk_size = buffer->lwe_chunk_size; + + int max_shared_memory = cuda_get_max_shared_memory(gpu_index); auto supports_dsm = supports_distributed_shared_memory_on_multibit_programmable_bootstrap< - Torus>(polynomial_size); + Torus>(polynomial_size, max_shared_memory); uint64_t full_dm = get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap( @@ -319,9 +322,6 @@ __host__ void execute_tbc_external_product_loop( get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); - cudaSetDevice(gpu_index); - uint32_t keybundle_size_per_input = lwe_chunk_size * level_count * (glwe_dimension + 1) * (glwe_dimension + 1) * (polynomial_size / 2); @@ -426,12 +426,11 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( template bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( - uint32_t polynomial_size) { + uint32_t polynomial_size, int max_shared_memory) { uint64_t minimum_sm = get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap( polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory <= minimum_sm) { // If we cannot store a single polynomial in a block shared memory we // cannot use TBC @@ -444,7 +443,7 @@ bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( template __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count) { + uint32_t level_count, int max_shared_memory) { if 
(!cuda_check_support_thread_block_clusters()) return false; @@ -457,7 +456,7 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( polynomial_size); uint64_t minimum_sm_tbc_accumulate = 0; if (supports_distributed_shared_memory_on_multibit_programmable_bootstrap< - Torus>(polynomial_size)) + Torus>(polynomial_size, max_shared_memory)) minimum_sm_tbc_accumulate = get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap( polynomial_size); @@ -480,7 +479,6 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( * case and it will fail if we try. Thus, since level_count * * (glwe_dimension+1) is usually smaller than 8 at this moment, we will * disable cudaFuncAttributeNonPortableClusterSizeAllowed */ - int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) { check_cuda_error(cudaFuncSetAttribute( @@ -520,5 +518,5 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( template bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( - uint32_t polynomial_size); + uint32_t polynomial_size, int max_shared_memory); #endif // FASTMULTIBIT_PBS_H diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp index 7b4cbe9802..5d912db23a 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp @@ -199,7 +199,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit) (benchmark::State &st) { if (!has_support_to_cuda_programmable_bootstrap_cg_multi_bit( glwe_dimension, polynomial_size, pbs_level, - input_lwe_ciphertext_count)) { + input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index))) { st.SkipWithError("Configuration not supported for fast operation"); return; } @@ -288,7 +288,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS) (benchmark::State &st) { if (!has_support_to_cuda_programmable_bootstrap_cg( glwe_dimension, polynomial_size, pbs_level, - input_lwe_ciphertext_count)) { + input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index))) { st.SkipWithError("Configuration not supported for fast operation"); return; } diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index fa16da0136..1fe30fde32 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -1,8 +1,8 @@ -/* automatically generated by rust-bindgen 0.70.1 */ +/* automatically generated by rust-bindgen 0.71.1 */ use crate::ffi; -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_ciphertext_vector_to_gpu_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -12,7 +12,7 @@ extern "C" { lwe_dimension: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_ciphertext_vector_to_cpu_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -22,7 +22,7 @@ extern "C" { lwe_dimension: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_glwe_sample_extract_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -37,7 +37,7 @@ extern "C" { pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0; pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1; pub type PBS_TYPE = ffi::c_uint; -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_compress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: 
*const u32, @@ -57,7 +57,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_decompress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -79,7 +79,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_compress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -91,7 +91,7 @@ extern "C" { mem_ptr: *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_decompress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -104,7 +104,7 @@ extern "C" { mem_ptr: *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_compress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -112,7 +112,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -141,7 +141,7 @@ pub const COMPARISON_TYPE_LE: COMPARISON_TYPE = 5; pub const COMPARISON_TYPE_MAX: COMPARISON_TYPE = 6; pub const COMPARISON_TYPE_MIN: COMPARISON_TYPE = 7; pub type COMPARISON_TYPE = ffi::c_uint; -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_apply_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -163,7 +163,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_apply_many_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -186,7 +186,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_apply_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -199,7 +199,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_apply_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -207,7 +207,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_apply_bivariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -229,7 +229,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_apply_bivariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -244,7 +244,7 @@ extern "C" { shift: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_apply_bivariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -252,7 +252,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_apply_many_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -267,7 +267,7 @@ extern "C" { lut_stride: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_full_propagation_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -287,7 +287,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_full_propagation_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -299,7 +299,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_full_propagation( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -307,7 +307,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn 
scratch_cuda_integer_mult_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -330,7 +330,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_mult_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -347,7 +347,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_mult( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -355,7 +355,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_negate_integer_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -368,7 +368,7 @@ extern "C" { carry_modulus: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -381,7 +381,7 @@ extern "C" { carry_modulus: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_logical_scalar_shift_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -404,7 +404,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_logical_scalar_shift_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -417,7 +417,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -440,7 +440,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -453,7 +453,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_logical_scalar_shift( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -461,7 +461,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_arithmetic_scalar_shift( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -469,7 +469,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_shift_and_rotate_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -493,7 +493,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_shift_and_rotate_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -506,7 +506,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_shift_and_rotate( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -514,7 +514,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_comparison_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -538,7 +538,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_comparison_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -552,7 +552,7 @@ extern "C" { lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_comparison_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -567,7 +567,7 @@ extern "C" { 
num_scalar_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_comparison( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -575,7 +575,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_bitop_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -598,7 +598,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_bitop_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -612,7 +612,7 @@ extern "C" { lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_bitop_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -628,7 +628,7 @@ extern "C" { op: BITOP_TYPE, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_bitop( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -636,7 +636,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_cmux_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -658,7 +658,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_cmux_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -673,7 +673,7 @@ extern "C" { lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_cmux( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -681,7 +681,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_scalar_rotate_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -704,7 +704,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_scalar_rotate_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -717,7 +717,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_scalar_rotate( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -725,7 +725,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -749,7 +749,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_add_and_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -773,7 +773,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -789,7 +789,7 @@ extern "C" { uses_carry: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_and_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -806,7 +806,7 @@ extern "C" { uses_carry: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_propagate_single_carry( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -814,7 +814,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_add_and_propagate_single_carry( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -822,7 +822,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } 
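All of the FFI declarations in bindings.rs change the same way: each `extern "C"` block becomes `unsafe extern "C"`, the block form accepted since Rust 1.82 and required by the Rust 2024 edition. A minimal sketch of the syntax, using a hypothetical symbol that is not part of this patch:

    // Hypothetical binding, for illustration only.
    // Undecorated items inside an `unsafe extern` block stay unsafe to call,
    // so call sites keep their explicit `unsafe { ... }` blocks.
    unsafe extern "C" {
        pub fn hypothetical_cuda_noop(gpu_index: u32);
    }

    fn demo() {
        unsafe { hypothetical_cuda_noop(0) };
    }
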
-extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_overflowing_sub_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -845,7 +845,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_overflowing_sub_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -862,7 +862,7 @@ extern "C" { uses_input_borrow: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_overflowing_sub( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -870,7 +870,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -892,7 +892,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -906,7 +906,7 @@ extern "C" { num_blocks_in_radix: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -914,7 +914,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_scalar_mul_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -935,7 +935,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -953,7 +953,7 @@ extern "C" { num_scalars: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_scalar_mul( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -961,7 +961,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_div_rem_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -984,7 +984,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_div_rem_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1000,7 +1000,7 @@ extern "C" { num_blocks_in_radix: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_div_rem( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1008,7 +1008,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1030,7 +1030,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_compute_prefix_sum_hillis_steele_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1044,7 +1044,7 @@ extern "C" { shift: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1052,7 +1052,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_reverse_blocks_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1062,7 +1062,7 @@ extern "C" { lwe_size: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, 
gpu_indexes: *const u32, @@ -1085,7 +1085,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_abs_inplace_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1098,7 +1098,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_abs_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1106,7 +1106,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_are_all_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1128,7 +1128,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_are_all_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1141,7 +1141,7 @@ extern "C" { num_radix_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_are_all_comparisons_block_true( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1149,7 +1149,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1171,7 +1171,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_is_at_least_one_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1184,7 +1184,7 @@ extern "C" { num_radix_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_is_at_least_one_comparisons_block_true( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1192,7 +1192,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_keyswitch_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1208,7 +1208,7 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_keyswitch_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1224,7 +1224,7 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_packing_keyswitch_lwe_list_to_glwe_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1236,7 +1236,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_packing_keyswitch_lwe_list_to_glwe_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1252,14 +1252,14 @@ extern "C" { num_lwes: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_packing_keyswitch_lwe_list_to_glwe( stream: *mut ffi::c_void, gpu_index: u32, fp_ks_buffer: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_negate_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1269,7 +1269,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_negate_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1279,7 +1279,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1290,7 +1290,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1301,7 +1301,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { 
+unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1312,7 +1312,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1323,7 +1323,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1334,7 +1334,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1345,7 +1345,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_plaintext_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1356,7 +1356,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_fourier_polynomial_mul( stream: *mut ffi::c_void, gpu_index: u32, @@ -1367,7 +1367,7 @@ extern "C" { total_polynomials: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_programmable_bootstrap_key_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1379,7 +1379,7 @@ extern "C" { polynomial_size: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_programmable_bootstrap_key_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1391,7 +1391,7 @@ extern "C" { polynomial_size: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_amortized_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1402,7 +1402,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_amortized_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1413,7 +1413,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1433,7 +1433,7 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1453,14 +1453,14 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_programmable_bootstrap_amortized( stream: *mut ffi::c_void, gpu_index: u32, pbs_buffer: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1472,7 +1472,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1484,7 +1484,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1506,7 +1506,7 @@ extern "C" { lut_stride: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1528,22 +1528,23 @@ extern "C" { lut_stride: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_programmable_bootstrap( stream: *mut ffi::c_void, gpu_index: u32, pbs_buffer: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn 
has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
         glwe_dimension: u32,
         polynomial_size: u32,
         level_count: u32,
         num_samples: u32,
+        max_shared_memory: ffi::c_int,
     ) -> bool;
 }
-extern "C" {
+unsafe extern "C" {
     pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
         stream: *mut ffi::c_void,
         gpu_index: u32,
@@ -1556,7 +1557,7 @@ extern "C" {
         grouping_factor: u32,
     );
 }
-extern "C" {
+unsafe extern "C" {
     pub fn scratch_cuda_multi_bit_programmable_bootstrap_64(
         stream: *mut ffi::c_void,
         gpu_index: u32,
@@ -1568,7 +1569,7 @@ extern "C" {
         allocate_gpu_memory: bool,
     );
 }
-extern "C" {
+unsafe extern "C" {
     pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
         stream: *mut ffi::c_void,
         gpu_index: u32,
@@ -1591,7 +1592,7 @@ extern "C" {
         lut_stride: u32,
     );
 }
-extern "C" {
+unsafe extern "C" {
     pub fn cleanup_cuda_multi_bit_programmable_bootstrap(
         stream: *mut ffi::c_void,
         gpu_index: u32,
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index 7ed226c3f3..07e341d981 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -332,12 +332,12 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async(
     polynomial_size: PolynomialSize,
 ) {
     let size = std::mem::size_of_val(src);
-    for (gpu_index, &stream) in streams.ptr.iter().enumerate() {
+    for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
         assert_eq!(dest.len() * std::mem::size_of::<T>(), size);
         cuda_convert_lwe_programmable_bootstrap_key_64(
-            stream,
-            streams.gpu_indexes[gpu_index].0,
-            dest.get_mut_c_ptr(gpu_index as u32),
+            stream_ptr,
+            streams.gpu_indexes[i].0,
+            dest.as_mut_c_ptr(i as u32),
             src.as_ptr().cast(),
             input_lwe_dim.0 as u32,
             glwe_dim.0 as u32,
@@ -365,12 +365,12 @@ pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async(
 ) {
     let size = std::mem::size_of_val(src);
-    for (gpu_index, &stream) in streams.ptr.iter().enumerate() {
+    for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
         assert_eq!(dest.len() * std::mem::size_of::<T>(), size);
         cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
-            stream,
-            streams.gpu_indexes[gpu_index].0,
-            dest.as_mut_c_ptr(gpu_index as u32),
+            stream_ptr,
+            streams.gpu_indexes[i].0,
+            dest.as_mut_c_ptr(i as u32),
             src.as_ptr().cast(),
             input_lwe_dim.0 as u32,
             glwe_dim.0 as u32,
diff --git a/tfhe/src/core_crypto/gpu/vec.rs b/tfhe/src/core_crypto/gpu/vec.rs
index d9a84ca205..377b30abda 100644
--- a/tfhe/src/core_crypto/gpu/vec.rs
+++ b/tfhe/src/core_crypto/gpu/vec.rs
@@ -93,18 +93,13 @@ impl<T: Numeric> CudaVec<T> {
     pub fn new_multi_gpu(len: usize, streams: &CudaStreams) -> Self {
         let size = len as u64 * std::mem::size_of::<T>() as u64;
         let mut ptrs = Vec::with_capacity(streams.len());
-        for (index, &stream) in streams.ptr.iter().enumerate() {
-            let ptr = unsafe { cuda_malloc_async(size, stream, index as u32) };
+        for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
+            let gpu_index = streams.gpu_indexes[i];
+            let ptr = unsafe { cuda_malloc_async(size, stream_ptr, gpu_index.0) };
             unsafe {
-                cuda_memset_async(
-                    ptr,
-                    0u64,
-                    size,
-                    streams.ptr[index],
-                    streams.gpu_indexes[index].0,
-                );
+                cuda_memset_async(ptr, 0u64, size, stream_ptr, gpu_index.0);
             }
-            streams.synchronize_one(index as u32);
+            streams.synchronize_one(i as u32);
             ptrs.push(ptr);
         }

@@ -203,19 +198,20 @@ impl<T: Numeric> CudaVec<T> {
     where
         T: Numeric,
     {
-        for (gpu_index, &stream) in streams.ptr.iter().enumerate() {
+        for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
             assert!(self.len() >= src.len());
             let size = std::mem::size_of_val(src);
             // We have to check that src is not empty, because Rust slice with size 0 results in an
             // invalid pointer being passed to copy_to_gpu_async
             if size > 0 {
+                let gpu_index = streams.gpu_indexes[i];
                 cuda_memcpy_async_to_gpu(
-                    self.get_mut_c_ptr(gpu_index as u32),
+                    self.get_mut_c_ptr(i as u32),
                     src.as_ptr().cast(),
                     size as u64,
-                    stream,
-                    streams.gpu_indexes[gpu_index].0,
+                    stream_ptr,
+                    gpu_index.0,
                 );
             }
         }
@@ -442,13 +438,9 @@ unsafe impl<T> Sync for CudaVec<T> where T: Sync + Numeric {}
 impl<T: Numeric> Drop for CudaVec<T> {
     /// Free memory for pointer `ptr` synchronously
     fn drop(&mut self) {
-        for (ptr, gpu_index) in self
-            .ptr
-            .iter()
-            .copied()
-            .zip(self.gpu_indexes.iter().copied())
-        {
+        for (i, &ptr) in self.ptr.iter().enumerate() {
             // Synchronizes the device to be sure no stream is still using this pointer
+            let gpu_index = self.gpu_indexes[i];
             synchronize_device(gpu_index.0);
             unsafe { cuda_drop(ptr, gpu_index.0) };
         }
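The Rust-side hunks above all apply the same convention: the loop variable `i` is the position of a stream in `streams.ptr`, per-GPU buffers and streams are selected with that position (`as_mut_c_ptr(i as u32)`, `synchronize_one(i as u32)`), while the value handed to CUDA device APIs is the device ordinal `streams.gpu_indexes[i].0`, which may differ from `i` when only a subset of the machine's GPUs is used. A standalone sketch of the distinction; the `Streams` struct below is a simplified stand-in for illustration, not the crate's `CudaStreams` type:

    // Simplified stand-in: `ptr[i]` is the stream created on device `gpu_indexes[i]`.
    struct Streams {
        ptr: Vec<usize>,       // placeholder for raw cudaStream_t handles
        gpu_indexes: Vec<u32>, // CUDA device ordinals, e.g. [2, 3]
    }

    fn main() {
        let streams = Streams { ptr: vec![0xA, 0xB], gpu_indexes: vec![2, 3] };
        for (i, &stream) in streams.ptr.iter().enumerate() {
            // `i` picks per-GPU buffers and streams (position in the list);
            // `gpu_indexes[i]` is the ordinal the CUDA runtime expects.
            let device_ordinal = streams.gpu_indexes[i];
            println!("stream #{i} (handle {stream:#x}) runs on CUDA device {device_ordinal}");
        }
    }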