Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(gpu): use struct to pass radix data in the ffi #1994

Merged
merged 1 commit into from
Jan 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);

void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);

void synchronize_streams(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count);

uint32_t cuda_is_available();

void *cuda_malloc(uint64_t size, uint32_t gpu_index);
Expand Down
21 changes: 16 additions & 5 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,15 @@ enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 };

extern "C" {

typedef struct {
void *ptr;
agnesLeroy marked this conversation as resolved.
Show resolved Hide resolved
uint64_t *degrees;
uint64_t *noise_levels;
uint32_t num_radix_blocks;
uint32_t lwe_dimension;
} CudaRadixCiphertextFFI;
agnesLeroy marked this conversation as resolved.
Show resolved Hide resolved

void scratch_cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
Expand Down Expand Up @@ -258,9 +267,11 @@ void scratch_cuda_integer_radix_cmux_kb_64(

void cuda_cmux_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_ciphertext_count);
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_condition,
CudaRadixCiphertextFFI const *lwe_array_true,
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
void *const *bsks, void *const *ksks);

void cleanup_cuda_integer_radix_cmux(void *const *streams,
uint32_t const *gpu_indexes,
Expand Down Expand Up @@ -439,8 +450,8 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(

void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
void *const *ksks, uint32_t num_blocks);
CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed,
void *const *bsks, void *const *ksks);

void cleanup_cuda_integer_abs_inplace(void *const *streams,
uint32_t const *gpu_indexes,
Expand Down
48 changes: 26 additions & 22 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#define CUDA_INTEGER_UTILITIES_H

#include "integer.h"
#include "integer/radix_ciphertext.cuh"
#include "integer/radix_ciphertext.h"
#include "keyswitch.h"
#include "pbs/programmable_bootstrap.cuh"
#include <cassert>
Expand Down Expand Up @@ -2963,9 +2965,9 @@ template <typename Torus> struct int_cmux_buffer {
int_radix_lut<Torus> *predicate_lut;
int_radix_lut<Torus> *message_extract_lut;

Torus *buffer_in;
Torus *buffer_out;
Torus *condition_array;
CudaRadixCiphertextFFI *buffer_in = new CudaRadixCiphertextFFI;
CudaRadixCiphertextFFI *buffer_out = new CudaRadixCiphertextFFI;
CudaRadixCiphertextFFI *condition_array = new CudaRadixCiphertextFFI;

int_radix_params params;

Expand All @@ -2978,15 +2980,15 @@ template <typename Torus> struct int_cmux_buffer {
this->params = params;

if (allocate_gpu_memory) {
Torus big_size =
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);

buffer_in =
(Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]);
buffer_out =
(Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]);
condition_array =
(Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]);
create_trivial_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], buffer_in, 2 * num_radix_blocks,
params.big_lwe_dimension);
create_trivial_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], buffer_out, 2 * num_radix_blocks,
params.big_lwe_dimension);
create_trivial_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], condition_array, 2 * num_radix_blocks,
params.big_lwe_dimension);

auto lut_f = [predicate_lut_f](Torus block, Torus condition) -> Torus {
return predicate_lut_f(condition) ? 0 : block;
Expand Down Expand Up @@ -3047,9 +3049,12 @@ template <typename Torus> struct int_cmux_buffer {
message_extract_lut->release(streams, gpu_indexes, gpu_count);
delete message_extract_lut;

cuda_drop_async(buffer_in, streams[0], gpu_indexes[0]);
cuda_drop_async(buffer_out, streams[0], gpu_indexes[0]);
cuda_drop_async(condition_array, streams[0], gpu_indexes[0]);
release_radix_ciphertext(streams[0], gpu_indexes[0], buffer_in);
delete buffer_in;
release_radix_ciphertext(streams[0], gpu_indexes[0], buffer_out);
delete buffer_out;
release_radix_ciphertext(streams[0], gpu_indexes[0], condition_array);
delete condition_array;
}
};

Expand Down Expand Up @@ -4351,7 +4356,7 @@ template <typename Torus> struct int_abs_buffer {
int_sc_prop_memory<Torus> *scp_mem;
int_bitop_buffer<Torus> *bitxor_mem;

Torus *mask;
CudaRadixCiphertextFFI *mask = new CudaRadixCiphertextFFI;
int_abs_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
Expand All @@ -4372,11 +4377,9 @@ template <typename Torus> struct int_abs_buffer {
streams, gpu_indexes, gpu_count, BITOP_TYPE::BITXOR, params,
num_radix_blocks, allocate_gpu_memory);

uint32_t lwe_size = params.big_lwe_dimension + 1;
uint32_t lwe_size_bytes = lwe_size * sizeof(Torus);

mask = (Torus *)cuda_malloc_async(num_radix_blocks * lwe_size_bytes,
streams[0], gpu_indexes[0]);
create_trivial_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
mask, num_radix_blocks,
params.big_lwe_dimension);
}
}

Expand All @@ -4390,7 +4393,8 @@ template <typename Torus> struct int_abs_buffer {
delete scp_mem;
delete bitxor_mem;

cuda_drop_async(mask, streams[0], gpu_indexes[0]);
release_radix_ciphertext(streams[0], gpu_indexes[0], mask);
delete mask;
}
};

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef CUDA_RADIX_CIPHERTEXT_H
#define CUDA_RADIX_CIPHERTEXT_H

void release_radix_ciphertext(cudaStream_t const stream,
uint32_t const gpu_index,
CudaRadixCiphertextFFI *data);

#endif
16 changes: 7 additions & 9 deletions backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef CUDA_LINALG_H_
#define CUDA_LINALG_H_

#include "integer/integer.h"
#include <stdint.h>

extern "C" {
Expand All @@ -14,16 +15,13 @@ void cuda_negate_lwe_ciphertext_vector_64(
void const *lwe_array_in, const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in_1, void const *lwe_array_in_2,
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in_1,
CudaRadixCiphertextFFI const *lwe_array_in_2);
void cuda_add_lwe_ciphertext_vector_64(
agnesLeroy marked this conversation as resolved.
Show resolved Hide resolved
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in_1, void const *lwe_array_in_2,
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);

void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in_1,
CudaRadixCiphertextFFI const *lwe_array_in_2);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *plaintext_array_in,
Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
add_library(tfhe_cuda_backend STATIC ${SOURCES} integer/radix_ciphertext.cu)
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_include_directories(tfhe_cuda_backend PRIVATE .)
7 changes: 7 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/device.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
check_cuda_error(cudaStreamSynchronize(stream));
}

void synchronize_streams(cudaStream_t const *streams,
agnesLeroy marked this conversation as resolved.
Show resolved Hide resolved
uint32_t const *gpu_indexes, uint32_t gpu_count) {
for (uint i = 0; i < gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}

// Determine if a CUDA device is available at runtime
uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }

Expand Down
9 changes: 4 additions & 5 deletions backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,14 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(

void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {
CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed,
void *const *bsks, void *const *ksks) {

auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;

host_integer_abs_kb<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
gpu_count, static_cast<uint64_t *>(ct), bsks,
(uint64_t **)(ksks), mem, is_signed,
num_blocks);
gpu_count, ct, bsks, (uint64_t **)(ksks), mem,
is_signed);
}

void cleanup_cuda_integer_abs_inplace(void *const *streams,
Expand Down
64 changes: 49 additions & 15 deletions backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,12 @@
#define TFHE_RS_ABS_CUH

#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer/bitwise_ops.cuh"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
#include "integer/negation.cuh"
#include "integer/scalar_shifts.cuh"
#include "linear_algebra.h"
#include "pbs/programmable_bootstrap.h"
#include "radix_ciphertext.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
Expand All @@ -32,16 +29,15 @@ __host__ void scratch_cuda_integer_abs_kb(
}

template <typename Torus>
__host__ void
host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *ct, void *const *bsks,
uint64_t *const *ksks, int_abs_buffer<uint64_t> *mem_ptr,
bool is_signed, uint32_t num_blocks) {
__host__ void legacy_host_integer_abs_kb_async(
agnesLeroy marked this conversation as resolved.
Show resolved Hide resolved
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *ct, void *const *bsks, uint64_t *const *ksks,
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed, uint32_t num_blocks) {
if (!is_signed)
return;

auto radix_params = mem_ptr->params;
auto mask = mem_ptr->mask;
auto mask = (Torus *)(mem_ptr->mask->ptr);

auto big_lwe_dimension = radix_params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
Expand All @@ -52,20 +48,58 @@ host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
cuda_memcpy_async_gpu_to_gpu(mask, ct, num_blocks * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);

host_integer_radix_arithmetic_scalar_shift_kb_inplace(
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, num_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
radix_params.big_lwe_dimension, num_blocks);
legacy_host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
radix_params.big_lwe_dimension, num_blocks);

uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem,
bsks, ksks, num_blocks, requested_flag, uses_carry);

host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, ct,
mem_ptr->bitxor_mem, bsks, ksks, num_blocks);
host_integer_radix_bitop_kb<Torus>(streams, gpu_indexes, gpu_count, ct, mask,
ct, mem_ptr->bitxor_mem, bsks, ksks,
num_blocks);
}

template <typename Torus>
__host__ void
host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
void *const *bsks, uint64_t *const *ksks,
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
if (!is_signed)
agnesLeroy marked this conversation as resolved.
Show resolved Hide resolved
return;

auto mask = mem_ptr->mask;

uint32_t num_bits_in_ciphertext =
(31 - __builtin_clz(mem_ptr->params.message_modulus)) *
ct->num_radix_blocks;

copy_radix_ciphertext_to_larger_output_slice_async<Torus>(
streams[0], gpu_indexes[0], mask, ct, 0);

host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)(mask->ptr),
num_bits_in_ciphertext - 1, mem_ptr->arithmetic_scalar_shift_mem, bsks,
ksks, ct->num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct);

uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)(ct->ptr), nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, ct->num_radix_blocks, requested_flag,
uses_carry);

host_integer_radix_bitop_kb<Torus>(streams, gpu_indexes, gpu_count,
(Torus *)(ct->ptr), (Torus *)(mask->ptr),
(Torus *)(ct->ptr), mem_ptr->bitxor_mem,
bsks, ksks, ct->num_radix_blocks);
}

#endif // TFHE_RS_ABS_CUH
19 changes: 8 additions & 11 deletions backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,16 @@ void scratch_cuda_integer_radix_cmux_kb_64(

void cuda_cmux_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_ciphertext_count) {
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_condition,
CudaRadixCiphertextFFI const *lwe_array_true,
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
void *const *bsks, void *const *ksks) {

host_integer_radix_cmux_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_condition),
static_cast<const uint64_t *>(lwe_array_true),
static_cast<const uint64_t *>(lwe_array_false),
(int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),

lwe_ciphertext_count);
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_condition, lwe_array_true, lwe_array_false,
(int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
}

void cleanup_cuda_integer_radix_cmux(void *const *streams,
Expand Down
Loading
Loading