Skip to content

Commit

Permalink
chore(gpu): track noise level/degree in apply lut
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Jan 31, 2025
1 parent 3c88574 commit 04fb07b
Show file tree
Hide file tree
Showing 17 changed files with 442 additions and 384 deletions.
13 changes: 6 additions & 7 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
uint64_t lut_degree, bool allocate_gpu_memory);
void scratch_cuda_apply_many_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
Expand All @@ -63,12 +63,11 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t num_many_lut, bool allocate_gpu_memory);
void cuda_apply_univariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, void *output_radix_lwe,
void const *input_radix_lwe,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks);
void cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks);

void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
Expand Down
7 changes: 3 additions & 4 deletions backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ __host__ void zero_out_if(cudaStream_t const *streams,
predicate->lwe_indexes_in, params.big_lwe_dimension,
params.message_modulus, num_radix_blocks);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks,
ksks, num_radix_blocks, predicate);
}
Expand Down Expand Up @@ -68,7 +68,7 @@ __host__ void legacy_host_integer_radix_cmux_kb(
mem_false, params.big_lwe_dimension,
num_radix_blocks);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
num_radix_blocks, mem_ptr->message_extract_lut);
}
Expand Down Expand Up @@ -122,8 +122,7 @@ __host__ void host_integer_radix_cmux_kb(
mem_false);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)(lwe_array_out->ptr),
(Torus *)(added_cts->ptr), bsks, ksks, num_radix_blocks,
streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
mem_ptr->message_extract_lut);
delete mem_true;
delete mem_false;
Expand Down
22 changes: 11 additions & 11 deletions backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -148,12 +148,12 @@ __host__ void are_all_comparisons_block_true(
// Applies the LUT
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
ksks, 1, lut);
return;
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
num_chunks, lut);
}
Expand Down Expand Up @@ -219,12 +219,12 @@ __host__ void is_at_least_one_comparisons_block_true(
// Applies the LUT
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
ksks, 1, lut);
return;
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
accumulator, bsks, ksks, num_chunks, lut);
}
Expand Down Expand Up @@ -305,7 +305,7 @@ __host__ void host_compare_with_zero_equality(
}
}

integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
zero_comparison);
are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
Expand Down Expand Up @@ -371,7 +371,7 @@ __host__ void compare_radix_blocks_kb(

// Apply LUT to compare to 0
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
num_radix_blocks, is_non_zero_lut);

Expand Down Expand Up @@ -422,7 +422,7 @@ __host__ void tree_sign_reduction(
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
partial_block_count, 4);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
partial_block_count >> 1, inner_tree_leaf);

Expand Down Expand Up @@ -468,7 +468,7 @@ __host__ void tree_sign_reduction(
last_lut->broadcast_lut(streams, gpu_indexes, 0);

// Last leaf
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1,
last_lut);
}
Expand Down Expand Up @@ -514,7 +514,7 @@ __host__ void host_integer_radix_difference_check_kb(

// Clean noise
auto identity_lut = mem_ptr->identity_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
2 * packed_num_radix_blocks, identity_lut);

Expand Down Expand Up @@ -552,11 +552,11 @@ __host__ void host_integer_radix_difference_check_kb(
packed_left + packed_num_radix_blocks * big_lwe_size;
Torus *last_right_block_before_sign_block =
packed_right + packed_num_radix_blocks * big_lwe_size;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
identity_lut);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks,
1, identity_lut);
Expand Down
10 changes: 5 additions & 5 deletions backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// Shift the mask so that we will only keep bits we should
uint32_t shifted_mask = full_message_mask >> shift_amount;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
interesting_divisor.last_block(), bsks, ksks, 1,
mem_ptr->masking_luts_1[shifted_mask]);
Expand Down Expand Up @@ -314,7 +314,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// the estimated degree of the output is < msg_modulus
shifted_mask = shifted_mask & full_message_mask;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
divisor_ms_blocks.first_block(), bsks, ksks, 1,
mem_ptr->masking_luts_2[shifted_mask]);
Expand Down Expand Up @@ -481,7 +481,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto create_clean_version_of_merged_remainder =
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
cleaned_merged_interesting_remainder.data,
cleaned_merged_interesting_remainder.data, bsks, ksks,
Expand Down Expand Up @@ -595,10 +595,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
ksks, num_blocks, mem_ptr->message_extract_lut_2);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
Expand Down
23 changes: 10 additions & 13 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
uint64_t lut_degree, bool allocate_gpu_memory) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
Expand All @@ -195,7 +195,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
allocate_gpu_memory);
lut_degree, allocate_gpu_memory);
}

void scratch_cuda_apply_many_univariate_lut_kb_64(
Expand All @@ -219,19 +219,16 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
num_many_lut, allocate_gpu_memory);
}

void cuda_apply_univariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, void *output_radix_lwe,
void const *input_radix_lwe,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks) {
void cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks) {

host_apply_univariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(output_radix_lwe),
static_cast<const uint64_t *>(input_radix_lwe),
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
num_blocks);
(cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
input_radix_lwe, (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
bsks);
}

void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
Expand Down
Loading

0 comments on commit 04fb07b

Please sign in to comment.