Skip to content

Commit

Permalink
fix(gpu): general fixes to indexes used in a multi-GPU context.
Browse files Browse the repository at this point in the history
- Fix a bug in which the wrong GPU may be queried for the max shared memory.
- If multiple streams are split across multiple GPUs, operations happening
  on a stream on GPU i should query GPU i for its max shared memory.
- Also fixes wrong indexing on the Rust side.
  • Loading branch information
pdroalves committed Jan 21, 2025
1 parent f66805e commit 89a8733
Show file tree
Hide file tree
Showing 19 changed files with 306 additions and 266 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,9 @@ template <typename Torus> struct int_radix_lut {
std::vector<Torus *> lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec;

int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t *gpu_indexes;

int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {

Expand All @@ -160,6 +162,9 @@ template <typename Torus> struct int_radix_lut {
Torus lut_buffer_size =
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);

gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t));

///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
Expand Down Expand Up @@ -255,7 +260,7 @@ template <typename Torus> struct int_radix_lut {
}

// constructor to reuse memory
int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, int_radix_lut *base_lut_object) {

Expand All @@ -266,6 +271,9 @@ template <typename Torus> struct int_radix_lut {
Torus lut_buffer_size =
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);

gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t));

// base lut object should have bigger or equal memory than current one
assert(num_radix_blocks <= base_lut_object->num_blocks);
// pbs
Expand Down Expand Up @@ -332,7 +340,7 @@ template <typename Torus> struct int_radix_lut {
}

// Construction for many luts
int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, uint32_t num_many_lut,
bool allocate_gpu_memory) {
Expand All @@ -344,6 +352,9 @@ template <typename Torus> struct int_radix_lut {
Torus lut_buffer_size =
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);

gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t));

///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
Expand Down Expand Up @@ -494,6 +505,7 @@ template <typename Torus> struct int_radix_lut {

void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
free(this->gpu_indexes);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_drop_async(lut_vec[i], streams[i], gpu_indexes[i]);
cuda_drop_async(lut_indexes_vec[i], streams[i], gpu_indexes[i]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@

template <typename Torus>
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size);
uint32_t polynomial_size, int max_shared_memory);

template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count);
uint32_t level_count, int max_shared_memory);

#if CUDA_ARCH >= 900
template <typename Torus>
Expand Down
14 changes: 7 additions & 7 deletions backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {

template <typename Torus>
bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size);
uint32_t polynomial_size, int max_shared_memory);

template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;

Expand All @@ -77,10 +77,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
bool allocate_gpu_memory) {

cudaSetDevice(gpu_index);
this->pbs_variant = pbs_variant;

auto max_shared_memory = cuda_get_max_shared_memory(0);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

if (allocate_gpu_memory) {
switch (pbs_variant) {
Expand Down Expand Up @@ -157,7 +157,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

bool supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
Torus>(polynomial_size);
Torus>(polynomial_size, max_shared_memory);

uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
Expand Down Expand Up @@ -218,8 +218,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
template <typename Torus>
uint64_t get_buffer_size_programmable_bootstrap_cg(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
int max_shared_memory = cuda_get_max_shared_memory(0);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
uint64_t partial_sm =
Expand All @@ -245,7 +244,8 @@ template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples);
uint32_t num_samples,
int max_shared_memory);

template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ extern "C" {

bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples);
uint32_t num_samples, int max_shared_memory);

void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void const *src,
Expand Down
16 changes: 9 additions & 7 deletions backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,12 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
int gridSize = total_polynomials;
int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);

int max_shared_memory = cuda_get_max_shared_memory(gpu_index);

double2 *buffer;
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
Expand All @@ -130,7 +132,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<521>, ForwardFFT>,
Expand All @@ -151,7 +153,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
Expand All @@ -172,7 +174,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
Expand All @@ -193,7 +195,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
Expand All @@ -214,7 +216,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
Expand All @@ -235,7 +237,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
Expand Down
16 changes: 9 additions & 7 deletions backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,

cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index);

int max_shared_memory = cuda_get_max_shared_memory(gpu_index);

double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
Expand All @@ -132,7 +134,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
Expand All @@ -150,7 +152,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
Expand All @@ -168,7 +170,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
Expand All @@ -186,7 +188,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
Expand All @@ -204,7 +206,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
Expand All @@ -222,7 +224,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
if (shared_memory_size <= max_shared_memory) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -233,9 +233,8 @@ uint64_t get_buffer_size_partial_sm_programmable_bootstrap_amortized(
template <typename Torus>
uint64_t get_buffer_size_programmable_bootstrap_amortized(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count) {
uint32_t input_lwe_ciphertext_count, int max_shared_memory) {

int max_shared_memory = cuda_get_max_shared_memory(0);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
Expand Down Expand Up @@ -265,7 +264,7 @@ __host__ void scratch_programmable_bootstrap_amortized(
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(0);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
cudaFuncSetAttribute(
device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
Expand All @@ -284,7 +283,8 @@ __host__ void scratch_programmable_bootstrap_amortized(
if (allocate_gpu_memory) {
uint64_t buffer_size =
get_buffer_size_programmable_bootstrap_amortized<Torus>(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
Expand All @@ -311,7 +311,7 @@ __host__ void host_programmable_bootstrap_amortized(

uint64_t DM_FULL = SM_FULL;

int max_shared_memory = cuda_get_max_shared_memory(0);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cudaSetDevice(gpu_index);

// Create a 1-dimensional grid of threads
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ __host__ void scratch_programmable_bootstrap_cg(
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(0);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
Expand Down Expand Up @@ -246,7 +246,7 @@ __host__ void host_programmable_bootstrap_cg(
get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
polynomial_size);

int max_shared_memory = cuda_get_max_shared_memory(0);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cudaSetDevice(gpu_index);

uint64_t full_dm = full_sm;
Expand Down Expand Up @@ -300,7 +300,8 @@ __host__ void host_programmable_bootstrap_cg(
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
int glwe_dimension, int level_count, int num_samples) {
int glwe_dimension, int level_count, int num_samples,
int max_shared_memory) {

// If Cooperative Groups is not supported, no need to check anything else
if (!cuda_check_support_cooperative_groups())
Expand All @@ -320,7 +321,6 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
int max_active_blocks_per_sm;

int max_shared_memory = cuda_get_max_shared_memory(0);
if (max_shared_memory < partial_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
Expand All @@ -346,30 +346,37 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus>
__host__ bool supports_cooperative_groups_on_programmable_bootstrap(
int glwe_dimension, int polynomial_size, int level_count, int num_samples) {
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
int max_shared_memory) {
switch (polynomial_size) {
case 256:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples);
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 512:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples);
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 1024:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples);
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 2048:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples);
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 4096:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples);
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 8192:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples);
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 16384:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count,
num_samples);
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
Expand Down
Loading

0 comments on commit 89a8733

Please sign in to comment.