chore(gpu): add degree information to cuda int_radix_lut

agnesLeroy committed Jan 24, 2025
1 parent 7a241aa · commit f27212b

Showing 8 changed files with 359 additions and 197 deletions.
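In short: the LUT-generation helpers in this commit now return the degree of the table they build (the largest value the encoded function takes on its input domain), the device-accumulator wrappers expose it through new uint64_t *degree and uint64_t *max_degree out-parameters, and int_radix_lut stores both per LUT index. A sketch of the updated call pattern, assuming the usual radix setup (streams, gpu_indexes, lut and the encryption parameters stand for the surrounding code, as at the call sites below):

// Illustrative only; the names mirror the call sites in this diff.
auto f = [](uint64_t x) -> uint64_t { return x >> 1; }; // any univariate LUT
generate_device_accumulator<uint64_t>(
    streams[0], gpu_indexes[0], lut->get_lut(0, 0),
    lut->get_degree(0),     // out: degree of the generated table
    lut->get_max_degree(0), // out: message_modulus * carry_modulus - 1
    glwe_dimension, polynomial_size, message_modulus, carry_modulus, f);
lut->broadcast_lut(streams, gpu_indexes, 0);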

3 changes: 0 additions & 3 deletions backends/tfhe-cuda-backend/cuda/include/device.h

@@ -42,9 +42,6 @@ void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);
 
 void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);
 
-void synchronize_streams(cudaStream_t const *streams,
-                         uint32_t const *gpu_indexes, uint32_t gpu_count);
-
 uint32_t cuda_is_available();
 
 void *cuda_malloc(uint64_t size, uint32_t gpu_index);

@@ -112,6 +112,8 @@ template <typename Torus> struct int_decompression {
 
     generate_device_accumulator_with_encoding<Torus>(
         streams[0], gpu_indexes[0], decompression_rescale_lut->get_lut(0, 0),
+        decompression_rescale_lut->get_degree(0),
+        decompression_rescale_lut->get_max_degree(0),
         encryption_params.glwe_dimension, encryption_params.polynomial_size,
         effective_compression_message_modulus,
         effective_compression_carry_modulus,
387 changes: 262 additions & 125 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h

Large diffs are not rendered by default.


2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt

@@ -1,5 +1,5 @@
 file(GLOB_RECURSE SOURCES "*.cu")
-add_library(tfhe_cuda_backend STATIC ${SOURCES} integer/radix_ciphertext.cu)
+add_library(tfhe_cuda_backend STATIC ${SOURCES})
 set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
 target_include_directories(tfhe_cuda_backend PRIVATE .)

7 changes: 5 additions & 2 deletions backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh

@@ -122,7 +122,9 @@ __host__ void are_all_comparisons_block_true(
   };
   generate_device_accumulator<Torus>(
       streams[0], gpu_indexes[0], is_max_value_lut->get_lut(0, 1),
-      glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+      is_max_value_lut->get_degree(1),
+      is_max_value_lut->get_max_degree(1), glwe_dimension,
+      polynomial_size, message_modulus, carry_modulus,
       is_equal_to_num_blocks_lut_f);
 
   Torus *h_lut_indexes = (Torus *)malloc(num_chunks * sizeof(Torus));
@@ -460,7 +462,8 @@ __host__ void tree_sign_reduction(
     f = sign_handler_f;
   }
   generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], last_lut->get_lut(0, 0), glwe_dimension,
+      streams[0], gpu_indexes[0], last_lut->get_lut(0, 0),
+      last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
       polynomial_size, message_modulus, carry_modulus, f);
   last_lut->broadcast_lut(streams, gpu_indexes, 0);
 
125 changes: 72 additions & 53 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh

@@ -644,13 +644,11 @@ void rotate_left(Torus *buffer, int mid, uint32_t array_length) {
 /// scaling is done in the output space, as there are more bits in the output
 /// space, the delta is smaller hence the apparent "division" happening.
 template <typename Torus>
-void generate_lookup_table_with_encoding(Torus *acc, uint32_t glwe_dimension,
-                                         uint32_t polynomial_size,
-                                         uint32_t input_message_modulus,
-                                         uint32_t input_carry_modulus,
-                                         uint32_t output_message_modulus,
-                                         uint32_t output_carry_modulus,
-                                         std::function<Torus(Torus)> f) {
+uint64_t generate_lookup_table_with_encoding(
+    Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_message_modulus, uint32_t input_carry_modulus,
+    uint32_t output_message_modulus, uint32_t output_carry_modulus,
+    std::function<Torus(Torus)> f) {
 
   uint32_t input_modulus_sup = input_message_modulus * input_carry_modulus;
   uint32_t output_modulus_sup = output_message_modulus * output_carry_modulus;
@@ -662,12 +660,14 @@ void generate_lookup_table_with_encoding(Torus *acc, uint32_t glwe_dimension,
   memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));
 
   auto body = &acc[glwe_dimension * polynomial_size];
+  uint64_t degree = 0;
 
   // This accumulator extracts the carry bits
   for (int i = 0; i < input_modulus_sup; i++) {
     int index = i * box_size;
     for (int j = index; j < index + box_size; j++) {
       auto f_eval = f(i);
+      degree = max(degree, f_eval);
       body[j] = f_eval * output_delta;
     }
   }
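The degree bookkeeping added above is easy to check on the host. A standalone sketch (example values of my own, not the backend's API), showing that the degree of a LUT is just the maximum output of the encoded function over its input domain; with a 64-bit torus and one padding bit the output scaling is assumed to be delta = 2^63 / output_modulus_sup, which is why larger output moduli shrink delta, the "division" mentioned in the comment above:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <functional>

// Degree of a LUT: the maximum value f takes on the input domain,
// mirroring the `degree = max(degree, f_eval)` accumulation above.
uint64_t lut_degree(const std::function<uint64_t(uint64_t)> &f,
                    uint32_t message_modulus, uint32_t carry_modulus) {
  uint64_t modulus_sup = uint64_t(message_modulus) * carry_modulus;
  uint64_t degree = 0;
  for (uint64_t i = 0; i < modulus_sup; i++)
    degree = std::max(degree, f(i));
  return degree;
}

int main() {
  // With message_modulus = carry_modulus = 4, extracting the message part
  // gives degree 3, while max_degree = 4 * 4 - 1 = 15.
  printf("%llu\n", (unsigned long long)lut_degree(
                       [](uint64_t x) { return x % 4; }, 4, 4)); // prints 3
}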
@@ -680,22 +680,23 @@ void generate_lookup_table_with_encoding(Torus *acc, uint32_t glwe_dimension,
   }
 
   rotate_left<Torus>(body, half_box_size, polynomial_size);
+  return degree;
 }
 
 template <typename Torus>
-void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
-                           uint32_t polynomial_size, uint32_t message_modulus,
-                           uint32_t carry_modulus,
-                           std::function<Torus(Torus)> f) {
-  generate_lookup_table_with_encoding(acc, glwe_dimension, polynomial_size,
-                                      message_modulus, carry_modulus,
-                                      message_modulus, carry_modulus, f);
+uint64_t generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
+                               uint32_t polynomial_size,
+                               uint32_t message_modulus, uint32_t carry_modulus,
+                               std::function<Torus(Torus)> f) {
+  return generate_lookup_table_with_encoding(
+      acc, glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+      message_modulus, carry_modulus, f);
 }
 
 template <typename Torus>
-void generate_many_lookup_table(
-    Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t message_modulus, uint32_t carry_modulus,
+uint64_t generate_many_lookup_table(
+    Torus *acc, uint64_t *degrees, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
     std::vector<std::function<Torus(Torus)>> &functions) {
 
   uint32_t modulus_sup = message_modulus * carry_modulus;
@@ -713,6 +714,10 @@ void generate_many_lookup_table(
 
   // Space used for each sub lut
   uint32_t single_function_sub_lut_size = (modulus_sup / fn_counts) * box_size;
+  uint64_t max_degree = (modulus_sup / fn_counts - 1);
+  for (int f = 0; f < fn_counts; f++) {
+    degrees[f] = 0;
+  }
 
   // This accumulator extracts the carry bits
   for (int f = 0; f < fn_counts; f++) {
@@ -721,6 +726,7 @@ void generate_many_lookup_table(
       int index = i * box_size + lut_offset;
       for (int j = index; j < index + box_size; j++) {
         auto f_eval = functions[f](i);
+        degrees[f] = max(f_eval, degrees[f]);
         body[j] = f_eval * delta;
       }
     }
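The many-LUT variant tracks one degree per packed function. Each sub-LUT only covers modulus_sup / fn_counts input values, hence the shared max_degree bound of modulus_sup / fn_counts - 1 set above, while degrees[f] records the actual maximum of functions[f]. A standalone sketch (the inner loop bound is an assumption, since part of the original loop is collapsed in this view):

#include <algorithm>
#include <cstdint>
#include <functional>
#include <vector>

// Per-function degree tracking for a many-LUT: degrees[f] accumulates
// max_i functions[f](i); the return value is the structural bound imposed
// by the reduced input range of each sub-LUT.
uint64_t many_lut_degrees(
    const std::vector<std::function<uint64_t(uint64_t)>> &functions,
    std::vector<uint64_t> &degrees, uint32_t message_modulus,
    uint32_t carry_modulus) {
  uint64_t modulus_sup = uint64_t(message_modulus) * carry_modulus;
  uint64_t inputs_per_fn = modulus_sup / functions.size(); // assumed range
  degrees.assign(functions.size(), 0);
  for (size_t f = 0; f < functions.size(); f++)
    for (uint64_t i = 0; i < inputs_per_fn; i++)
      degrees[f] = std::max(degrees[f], functions[f](i));
  return inputs_per_fn - 1; // max_degree
}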
@@ -733,14 +739,15 @@ void generate_many_lookup_table(
   }
 
   rotate_left<Torus>(body, half_box_size, polynomial_size);
+  return max_degree;
 }
 
 template <typename Torus>
-void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
-                                     uint32_t polynomial_size,
-                                     uint32_t message_modulus,
-                                     uint32_t carry_modulus,
-                                     std::function<Torus(Torus, Torus)> f) {
+uint64_t generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
+                                         uint32_t polynomial_size,
+                                         uint32_t message_modulus,
+                                         uint32_t carry_modulus,
+                                         std::function<Torus(Torus, Torus)> f) {
 
   Torus factor_u64 = message_modulus;
   auto wrapped_f = [factor_u64, message_modulus, f](Torus input) -> Torus {
@@ -750,12 +757,13 @@ void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
     return f(lhs, rhs);
   };
 
-  generate_lookup_table<Torus>(acc, glwe_dimension, polynomial_size,
-                               message_modulus, carry_modulus, wrapped_f);
+  return generate_lookup_table<Torus>(acc, glwe_dimension, polynomial_size,
+                                      message_modulus, carry_modulus,
+                                      wrapped_f);
 }
 
 template <typename Torus>
-void generate_lookup_table_bivariate_with_factor(
+uint64_t generate_lookup_table_bivariate_with_factor(
     Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t message_modulus, uint32_t carry_modulus,
     std::function<Torus(Torus, Torus)> f, int factor) {
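The bivariate helpers reuse the univariate path by packing both operands into a single plaintext before the blind rotation; the exact decode sits in lines collapsed above, so the split below (left operand scaled by the factor, right operand reduced mod message_modulus, consistent with the captured factor_u64 = message_modulus) is an assumption. A standalone sketch:

#include <cstdint>

// Assumed packing: input = lhs * factor + rhs. The wrapped univariate
// function recovers both halves and applies the bivariate f, so the degree
// returned by generate_lookup_table is the max of f over all (lhs, rhs).
uint64_t eval_packed_bivariate(uint64_t input, uint64_t factor,
                               uint32_t message_modulus,
                               uint64_t (*f)(uint64_t, uint64_t)) {
  uint64_t lhs = input / factor;          // assumed: high half
  uint64_t rhs = input % message_modulus; // assumed: low half
  return f(lhs, rhs);
}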
@@ -768,8 +776,9 @@ void generate_lookup_table_bivariate_with_factor(
     return f(lhs, rhs);
   };
 
-  generate_lookup_table<Torus>(acc, glwe_dimension, polynomial_size,
-                               message_modulus, carry_modulus, wrapped_f);
+  return generate_lookup_table<Torus>(acc, glwe_dimension, polynomial_size,
+                                      message_modulus, carry_modulus,
+                                      wrapped_f);
 }
 
 /*
@@ -782,16 +791,18 @@ void generate_lookup_table_bivariate_with_factor(
 template <typename Torus>
 void generate_device_accumulator_bivariate(
     cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
-    uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f) {
+    uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
+    std::function<Torus(Torus, Torus)> f) {
 
   // host lut
   Torus *h_lut =
       (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
 
+  *max_degree = message_modulus * carry_modulus - 1;
   // fill bivariate accumulator
-  generate_lookup_table_bivariate<Torus>(h_lut, glwe_dimension, polynomial_size,
-                                         message_modulus, carry_modulus, f);
+  *degree = generate_lookup_table_bivariate<Torus>(
+      h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+      f);
 
   // copy host lut and lut_indexes_vec to device
   cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
@@ -813,15 +824,17 @@ void generate_device_accumulator_bivariate(
 template <typename Torus>
 void generate_device_accumulator_bivariate_with_factor(
     cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
-    uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f, int factor) {
+    uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
+    std::function<Torus(Torus, Torus)> f, int factor) {
 
   // host lut
   Torus *h_lut =
       (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
 
+  *max_degree = message_modulus * carry_modulus - 1;
   // fill bivariate accumulator
-  generate_lookup_table_bivariate_with_factor<Torus>(
+  *degree = generate_lookup_table_bivariate_with_factor<Torus>(
       h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
       factor);
 
@@ -838,8 +851,8 @@ void generate_device_accumulator_bivariate_with_factor(
 
 template <typename Torus>
 void generate_device_accumulator_with_encoding(
-    cudaStream_t stream, uint32_t gpu_index, Torus *acc,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
+    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
+    uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t input_message_modulus, uint32_t input_carry_modulus,
     uint32_t output_message_modulus, uint32_t output_carry_modulus,
     std::function<Torus(Torus)> f) {
@@ -848,8 +861,9 @@ void generate_device_accumulator_with_encoding(
   Torus *h_lut =
       (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
 
+  *max_degree = input_message_modulus * input_carry_modulus - 1;
   // fill accumulator
-  generate_lookup_table_with_encoding<Torus>(
+  *degree = generate_lookup_table_with_encoding<Torus>(
       h_lut, glwe_dimension, polynomial_size, input_message_modulus,
       input_carry_modulus, output_message_modulus, output_carry_modulus, f);
 
@@ -871,15 +885,17 @@ void generate_device_accumulator_with_encoding(
 */
 template <typename Torus>
 void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
-                                 Torus *acc, uint32_t glwe_dimension,
+                                 Torus *acc, uint64_t *degree,
+                                 uint64_t *max_degree, uint32_t glwe_dimension,
                                  uint32_t polynomial_size,
                                  uint32_t message_modulus,
                                  uint32_t carry_modulus,
                                  std::function<Torus(Torus)> f) {
 
   generate_device_accumulator_with_encoding(
-      stream, gpu_index, acc, glwe_dimension, polynomial_size, message_modulus,
-      carry_modulus, message_modulus, carry_modulus, f);
+      stream, gpu_index, acc, degree, max_degree, glwe_dimension,
+      polynomial_size, message_modulus, carry_modulus, message_modulus,
+      carry_modulus, f);
 }
 
 /*
@@ -891,18 +907,19 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
 */
 template <typename Torus>
 void generate_many_lut_device_accumulator(
-    cudaStream_t stream, uint32_t gpu_index, Torus *acc,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
-    uint32_t carry_modulus,
+    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degrees,
+    uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t message_modulus, uint32_t carry_modulus,
     std::vector<std::function<Torus(Torus)>> &functions) {
 
   // host lut
   Torus *h_lut =
       (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
 
   // fill accumulator
-  generate_many_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
-                                    message_modulus, carry_modulus, functions);
+  *max_degree = generate_many_lookup_table<Torus>(
+      h_lut, degrees, glwe_dimension, polynomial_size, message_modulus,
+      carry_modulus, functions);
 
   // copy host lut and lut_indexes_vec to device
   cuda_memcpy_async_to_gpu(
@@ -1455,9 +1472,9 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
   if (num_sign_blocks > 2) {
     auto lut = diff_buffer->reduce_signs_lut;
     generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus,
-        reduce_two_orderings_function);
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
+        lut->get_max_degree(0), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, reduce_two_orderings_function);
     lut->broadcast_lut(streams, gpu_indexes, 0);
 
     while (num_sign_blocks > 2) {
@@ -1489,8 +1506,9 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
 
     auto lut = diff_buffer->reduce_signs_lut;
     generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus, final_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
+        lut->get_max_degree(0), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, final_lut_f);
     lut->broadcast_lut(streams, gpu_indexes, 0);
 
     pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
@@ -1508,8 +1526,9 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
 
   auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
   generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
-      polynomial_size, message_modulus, carry_modulus, final_lut_f);
+      streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
+      lut->get_max_degree(0), glwe_dimension, polynomial_size,
+      message_modulus, carry_modulus, final_lut_f);
   lut->broadcast_lut(streams, gpu_indexes, 0);
 
   integer_radix_apply_univariate_lookup_table_kb<Torus>(

@@ -280,10 +280,13 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
 
   // generate accumulators
   generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], message_acc, glwe_dimension, polynomial_size,
-      message_modulus, carry_modulus, lut_f_message);
+      streams[0], gpu_indexes[0], message_acc,
+      luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
+      glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+      lut_f_message);
   generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
+      streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
+      luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
       message_modulus, carry_modulus, lut_f_carry);
   luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
 

21 changes: 11 additions & 10 deletions backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh

@@ -110,10 +110,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
   };
 
   auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
-  generate_device_accumulator<Torus>(streams[0], gpu_indexes[0],
-                                     lut->get_lut(0, 0), glwe_dimension,
-                                     polynomial_size, message_modulus,
-                                     carry_modulus, scalar_last_leaf_lut_f);
+  generate_device_accumulator<Torus>(
+      streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
+      lut->get_max_degree(0), glwe_dimension, polynomial_size,
+      message_modulus, carry_modulus, scalar_last_leaf_lut_f);
   lut->broadcast_lut(streams, gpu_indexes, 0);
 
   integer_radix_apply_univariate_lookup_table_kb<Torus>(
@@ -195,9 +195,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
 
     auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
     generate_device_accumulator_bivariate<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus,
-        scalar_bivariate_last_leaf_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
+        lut->get_max_degree(0), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f);
     lut->broadcast_lut(streams, gpu_indexes, 0);
 
     integer_radix_apply_bivariate_lookup_table_kb<Torus>(
@@ -331,9 +331,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
 
     auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
     generate_device_accumulator_bivariate<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus,
-        scalar_bivariate_last_leaf_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
+        lut->get_max_degree(0), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f);
     lut->broadcast_lut(streams, gpu_indexes, 0);
 
     integer_radix_apply_bivariate_lookup_table_kb<Torus>(
@@ -426,6 +426,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
     auto signed_msb_lut = mem_ptr->signed_msb_lut;
     generate_device_accumulator_bivariate<Torus>(
         msb_streams[0], gpu_indexes[0], signed_msb_lut->get_lut(0, 0),
+        signed_msb_lut->get_degree(0), signed_msb_lut->get_max_degree(0),
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, lut_f);
     signed_msb_lut->broadcast_lut(streams, gpu_indexes, 0);
