chore(gpu): encapsulate cudaSetDevice

zama-ai · Jan 31, 2025 · 3c88574 · 3c88574
1 parent c470b71
commit 3c88574
Show file tree

Hide file tree

Showing 33 changed files with 91 additions and 85 deletions.
diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -27,6 +27,8 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
     std::abort();                                                              \
   }
 
+void cuda_set_device(uint32_t gpu_index);
+
 cudaEvent_t cuda_create_event(uint32_t gpu_index);
 
 void cuda_event_record(cudaEvent_t event, cudaStream_t stream,

diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -189,7 +189,7 @@ template <typename Torus> struct int_radix_lut {
     active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
     for (uint i = 0; i < active_gpu_count; i++) {
-      cudaSetDevice(i);
+      cuda_set_device(i);
       int8_t *gpu_pbs_buffer;
       auto num_blocks_on_gpu =
           get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
@@ -384,7 +384,7 @@ template <typename Torus> struct int_radix_lut {
     active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
     for (uint i = 0; i < active_gpu_count; i++) {
-      cudaSetDevice(i);
+      cuda_set_device(i);
       int8_t *gpu_pbs_buffer;
       auto num_blocks_on_gpu =
           get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);

diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -114,7 +114,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
              uint32_t polynomial_size, uint32_t level_count,
              uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
              PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
-    cudaSetDevice(gpu_index);
+    cuda_set_device(gpu_index);
 
     this->pbs_variant = pbs_variant;
     this->lwe_chunk_size = lwe_chunk_size;

diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -77,7 +77,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
              uint32_t polynomial_size, uint32_t level_count,
              uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
              bool allocate_gpu_memory) {
-    cudaSetDevice(gpu_index);
+    cuda_set_device(gpu_index);
     this->pbs_variant = pbs_variant;
 
     auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
@@ -11,7 +11,7 @@ void cuda_convert_lwe_ciphertext_vector_to_gpu(cudaStream_t stream,
                                                uint32_t gpu_index, T *dest,
                                                T *src, uint32_t number_of_cts,
                                                uint32_t lwe_dimension) {
-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);
   uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
   cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
 }
@@ -21,7 +21,7 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
                                                uint32_t gpu_index, T *dest,
                                                T *src, uint32_t number_of_cts,
                                                uint32_t lwe_dimension) {
-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);
   uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
   cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
 }
@@ -55,7 +55,7 @@ __host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
                                   Torus const *glwe_array_in,
                                   uint32_t const *nth_array, uint32_t num_nths,
                                   uint32_t glwe_dimension) {
-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);
 
   dim3 grid(num_nths);
   dim3 thds(params::degree / params::opt);

diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh
@@ -261,7 +261,7 @@ __host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(
 
   // Optimization of packing keyswitch when packing many LWEs
 
-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);
   check_cuda_error(cudaGetLastError());
 
   int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;

diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
@@ -57,7 +57,7 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
   if (gpu_count != 1)
     PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
           "supported yet.")
-  cudaSetDevice(gpu_indexes[0]);
+  cuda_set_device(gpu_indexes[0]);
 
   int shared_memory_size = sizeof(double) * polynomial_size;
 

diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -105,7 +105,7 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
     uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
     uint32_t num_samples) {
 
-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);
 
   constexpr int num_threads_y = 32;
   int num_blocks, num_threads_x;
@@ -160,7 +160,7 @@ __host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
     cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t num_lwes, bool allocate_gpu_memory) {
-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);
 
   int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
 

diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -110,7 +110,7 @@ template <typename Torus>
 __host__ void host_modulus_switch_inplace(cudaStream_t stream,
                                           uint32_t gpu_index, Torus *array,
                                           int size, uint32_t log_modulus) {
-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);
 
   int num_threads = 0, num_blocks = 0;
   getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);

diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -2,46 +2,50 @@
 #include <cstdint>
 #include <cuda_runtime.h>
 
-cudaEvent_t cuda_create_event(uint32_t gpu_index) {
+void cuda_set_device(uint32_t gpu_index) {
   check_cuda_error(cudaSetDevice(gpu_index));
+}
+
+cudaEvent_t cuda_create_event(uint32_t gpu_index) {
+  cuda_set_device(gpu_index);
   cudaEvent_t event;
   check_cuda_error(cudaEventCreate(&event));
   return event;
 }
 
 void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
                        uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   check_cuda_error(cudaEventRecord(event, stream));
 }
 
 void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
                             uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   check_cuda_error(cudaStreamWaitEvent(stream, event, 0));
 }
 
 void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   check_cuda_error(cudaEventDestroy(event));
 }
 
 /// Unsafe function to create a CUDA stream, must check first that GPU exists
 cudaStream_t cuda_create_stream(uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   cudaStream_t stream;
   check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
   return stream;
 }
 
 /// Unsafe function to destroy CUDA stream, must check first the GPU exists
 void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   check_cuda_error(cudaStreamDestroy(stream));
 }
 
 void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   check_cuda_error(cudaStreamSynchronize(stream));
 }
 
@@ -59,7 +63,7 @@ uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }
 /// or if there's not enough memory. A safe wrapper around it must call
 /// cuda_check_valid_malloc() first
 void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   void *ptr;
   check_cuda_error(cudaMalloc((void **)&ptr, size));
 
@@ -70,7 +74,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
 /// asynchronously.
 void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
                         uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   void *ptr;
 
 #ifndef CUDART_VERSION
@@ -93,7 +97,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
 
 /// Check that allocation is valid
 void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   size_t total_mem, free_mem;
   check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
   if (size > free_mem) {
@@ -141,7 +145,7 @@ void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
     PANIC("Cuda error: invalid device pointer in async copy to GPU.")
   }
 
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   check_cuda_error(
       cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
 }
@@ -161,7 +165,7 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
   if (attr_src.type != cudaMemoryTypeDevice) {
     PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
   }
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   if (attr_src.device == attr_dest.device) {
     check_cuda_error(
         cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
@@ -186,7 +190,7 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
   if (attr_src.type != cudaMemoryTypeDevice) {
     PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
   }
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   if (attr_src.device == attr_dest.device) {
     check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice));
   } else {
@@ -197,7 +201,7 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
 
 /// Synchronizes device
 void cuda_synchronize_device(uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   check_cuda_error(cudaDeviceSynchronize());
 }
 
@@ -210,7 +214,7 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
   if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
     PANIC("Cuda error: invalid dest device pointer in cuda memset.")
   }
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
 }
 
@@ -230,7 +234,7 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
     if (attr.type != cudaMemoryTypeDevice) {
       PANIC("Cuda error: invalid dest device pointer in cuda set value.")
     }
-    check_cuda_error(cudaSetDevice(gpu_index));
+    cuda_set_device(gpu_index);
     int block_size = 256;
     int num_blocks = (n + block_size - 1) / block_size;
 
@@ -260,7 +264,7 @@ void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
     PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
   }
 
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   check_cuda_error(
       cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
 }
@@ -274,14 +278,14 @@ int cuda_get_number_of_gpus() {
 
 /// Drop a cuda array
 void cuda_drop(void *ptr, uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
   check_cuda_error(cudaFree(ptr));
 }
 
 /// Drop a cuda array asynchronously, if supported on the device
 void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
 
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cuda_set_device(gpu_index);
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
 #elif (CUDART_VERSION >= 11020)

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -12,7 +12,7 @@ __host__ void zero_out_if(cudaStream_t const *streams,
                           int_zero_out_if_buffer<Torus> *mem_ptr,
                           int_radix_lut<Torus> *predicate, void *const *bsks,
                           Torus *const *ksks, uint32_t num_radix_blocks) {
-  cudaSetDevice(gpu_indexes[0]);
+  cuda_set_device(gpu_indexes[0]);
   auto params = mem_ptr->params;
 
   // We can't use integer_radix_apply_bivariate_lookup_table_kb since the

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -38,7 +38,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
                                     uint32_t lwe_dimension,
                                     uint32_t num_radix_blocks) {
 
-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);
   int num_blocks = 0, num_threads = 0;
   int num_entries = (lwe_dimension + 1);
   getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -50,7 +50,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
   if (array_in == array_out)
     PANIC("Cuda error: Input and output must be different");
 
-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);
   auto compression_params = mem_ptr->compression_params;
 
   auto log_modulus = mem_ptr->storage_log_modulus;
@@ -185,7 +185,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
   if (array_in == glwe_array_out)
     PANIC("Cuda error: Input and output must be different");
 
-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);
 
   auto compression_params = mem_ptr->compression_params;