Skip to content

Commit

Permalink
chore(gpu): encapsulate cudaSetDevice
Browse files Browse the repository at this point in the history
  • Loading branch information
pdroalves authored and agnesLeroy committed Jan 31, 2025
1 parent c470b71 commit 3c88574
Show file tree
Hide file tree
Showing 33 changed files with 91 additions and 85 deletions.
2 changes: 2 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
std::abort(); \
}

void cuda_set_device(uint32_t gpu_index);

cudaEvent_t cuda_create_event(uint32_t gpu_index);

void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ template <typename Torus> struct int_radix_lut {
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cudaSetDevice(i);
cuda_set_device(i);
int8_t *gpu_pbs_buffer;
auto num_blocks_on_gpu =
get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
Expand Down Expand Up @@ -384,7 +384,7 @@ template <typename Torus> struct int_radix_lut {
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cudaSetDevice(i);
cuda_set_device(i);
int8_t *gpu_pbs_buffer;
auto num_blocks_on_gpu =
get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
cuda_set_device(gpu_index);

this->pbs_variant = pbs_variant;
this->lwe_chunk_size = lwe_chunk_size;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
cuda_set_device(gpu_index);
this->pbs_variant = pbs_variant;

auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
Expand Down
6 changes: 3 additions & 3 deletions backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ void cuda_convert_lwe_ciphertext_vector_to_gpu(cudaStream_t stream,
uint32_t gpu_index, T *dest,
T *src, uint32_t number_of_cts,
uint32_t lwe_dimension) {
cudaSetDevice(gpu_index);
cuda_set_device(gpu_index);
uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
}
Expand All @@ -21,7 +21,7 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
uint32_t gpu_index, T *dest,
T *src, uint32_t number_of_cts,
uint32_t lwe_dimension) {
cudaSetDevice(gpu_index);
cuda_set_device(gpu_index);
uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
}
Expand Down Expand Up @@ -55,7 +55,7 @@ __host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
Torus const *glwe_array_in,
uint32_t const *nth_array, uint32_t num_nths,
uint32_t glwe_dimension) {
cudaSetDevice(gpu_index);
cuda_set_device(gpu_index);

dim3 grid(num_nths);
dim3 thds(params::degree / params::opt);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ __host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(

// Optimization of packing keyswitch when packing many LWEs

cudaSetDevice(gpu_index);
cuda_set_device(gpu_index);
check_cuda_error(cudaGetLastError());

int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
if (gpu_count != 1)
PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
"supported yet.")
cudaSetDevice(gpu_indexes[0]);
cuda_set_device(gpu_indexes[0]);

int shared_memory_size = sizeof(double) * polynomial_size;

Expand Down
4 changes: 2 additions & 2 deletions backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
uint32_t num_samples) {

cudaSetDevice(gpu_index);
cuda_set_device(gpu_index);

constexpr int num_threads_y = 32;
int num_blocks, num_threads_x;
Expand Down Expand Up @@ -160,7 +160,7 @@ __host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t num_lwes, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
cuda_set_device(gpu_index);

int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;

Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ template <typename Torus>
__host__ void host_modulus_switch_inplace(cudaStream_t stream,
uint32_t gpu_index, Torus *array,
int size, uint32_t log_modulus) {
cudaSetDevice(gpu_index);
cuda_set_device(gpu_index);

int num_threads = 0, num_blocks = 0;
getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
Expand Down
42 changes: 23 additions & 19 deletions backends/tfhe-cuda-backend/cuda/src/device.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,46 +2,50 @@
#include <cstdint>
#include <cuda_runtime.h>

cudaEvent_t cuda_create_event(uint32_t gpu_index) {
void cuda_set_device(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
}

cudaEvent_t cuda_create_event(uint32_t gpu_index) {
cuda_set_device(gpu_index);
cudaEvent_t event;
check_cuda_error(cudaEventCreate(&event));
return event;
}

void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
check_cuda_error(cudaEventRecord(event, stream));
}

void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
check_cuda_error(cudaStreamWaitEvent(stream, event, 0));
}

void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
check_cuda_error(cudaEventDestroy(event));
}

/// Unsafe function to create a CUDA stream, must check first that GPU exists
cudaStream_t cuda_create_stream(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
cudaStream_t stream;
check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
return stream;
}

/// Unsafe function to destroy CUDA stream, must check first the GPU exists
void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
check_cuda_error(cudaStreamDestroy(stream));
}

void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
check_cuda_error(cudaStreamSynchronize(stream));
}

Expand All @@ -59,7 +63,7 @@ uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }
/// or if there's not enough memory. A safe wrapper around it must call
/// cuda_check_valid_malloc() first
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
void *ptr;
check_cuda_error(cudaMalloc((void **)&ptr, size));

Expand All @@ -70,7 +74,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
/// asynchronously.
void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
void *ptr;

#ifndef CUDART_VERSION
Expand All @@ -93,7 +97,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream,

/// Check that allocation is valid
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
size_t total_mem, free_mem;
check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
if (size > free_mem) {
Expand Down Expand Up @@ -141,7 +145,7 @@ void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
PANIC("Cuda error: invalid device pointer in async copy to GPU.")
}

check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
}
Expand All @@ -161,7 +165,7 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
if (attr_src.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
}
check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
if (attr_src.device == attr_dest.device) {
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
Expand All @@ -186,7 +190,7 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
if (attr_src.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
}
check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
if (attr_src.device == attr_dest.device) {
check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice));
} else {
Expand All @@ -197,7 +201,7 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,

/// Synchronizes device
void cuda_synchronize_device(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
check_cuda_error(cudaDeviceSynchronize());
}

Expand All @@ -210,7 +214,7 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in cuda memset.")
}
check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
}

Expand All @@ -230,7 +234,7 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
if (attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in cuda set value.")
}
check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
int block_size = 256;
int num_blocks = (n + block_size - 1) / block_size;

Expand Down Expand Up @@ -260,7 +264,7 @@ void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
}

check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
}
Expand All @@ -274,14 +278,14 @@ int cuda_get_number_of_gpus() {

/// Drop a cuda array
void cuda_drop(void *ptr, uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
check_cuda_error(cudaFree(ptr));
}

/// Drop a cuda array asynchronously, if supported on the device
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {

check_cuda_error(cudaSetDevice(gpu_index));
cuda_set_device(gpu_index);
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ __host__ void zero_out_if(cudaStream_t const *streams,
int_zero_out_if_buffer<Torus> *mem_ptr,
int_radix_lut<Torus> *predicate, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
cuda_set_device(gpu_indexes[0]);
auto params = mem_ptr->params;

// We can't use integer_radix_apply_bivariate_lookup_table_kb since the
Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
uint32_t lwe_dimension,
uint32_t num_radix_blocks) {

cudaSetDevice(gpu_index);
cuda_set_device(gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
if (array_in == array_out)
PANIC("Cuda error: Input and output must be different");

cudaSetDevice(gpu_index);
cuda_set_device(gpu_index);
auto compression_params = mem_ptr->compression_params;

auto log_modulus = mem_ptr->storage_log_modulus;
Expand Down Expand Up @@ -185,7 +185,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
if (array_in == glwe_array_out)
PANIC("Cuda error: Input and output must be different");

cudaSetDevice(gpu_index);
cuda_set_device(gpu_index);

auto compression_params = mem_ptr->compression_params;

Expand Down
Loading

0 comments on commit 3c88574

Please sign in to comment.