diff --git a/Makefile b/Makefile index 79e85a30a0..8b2727c755 100644 --- a/Makefile +++ b/Makefile @@ -282,14 +282,14 @@ check_typos: install_typos_checker .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled clippy_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \ - --features=boolean,shortint,integer,internal-keycache,gpu \ + --features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \ --all-targets \ -p $(TFHE_SPEC) -- --no-deps -D warnings .PHONY: check_gpu # Run check on tfhe with "gpu" enabled check_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \ - --features=boolean,shortint,integer,internal-keycache,gpu \ + --features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \ --all-targets \ -p $(TFHE_SPEC) @@ -394,10 +394,10 @@ clippy_trivium: install_rs_check_toolchain .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.) clippy_all_targets: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \ - --features=boolean,shortint,integer,internal-keycache,zk-pok,strings \ + --features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats \ -p $(TFHE_SPEC) -- --no-deps -D warnings RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \ - --features=boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \ + --features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats,experimental \ -p $(TFHE_SPEC) -- --no-deps -D warnings .PHONY: clippy_tfhe_csprng # Run clippy lints on tfhe-csprng @@ -1056,35 +1056,35 @@ bench_integer: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_signed_integer # Run benchmarks for signed integer bench_signed_integer: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-signed-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend bench_integer_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression bench_integer_compression: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench glwe_packing_compression-integer-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- 
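The Makefile hunks above enable the `pbs-stats` feature wherever the integer benches are built and linted; the benchmark hunks below use that feature to size each throughput workload from the operation's PBS cost, measured once up front, rather than from the block count alone. A minimal sketch of the recurring pattern, assuming `tfhe` is compiled with `pbs-stats` (the `get_pbs_count`/`reset_pbs_count` imports appear elsewhere in this diff) and that `throughput_num_threads(num_block, pbs_count)` is the bench-utility helper this diff extends — not a definitive implementation:

use std::cmp::max;
use tfhe::{get_pbs_count, reset_pbs_count};

/// Run `op` once with the global pbs-stats counter reset and return how many PBS it
/// triggered, clamped to at least 1 (an operation might not perform any PBS at all).
fn pbs_cost_of(op: impl FnOnce()) -> u64 {
    reset_pbs_count();
    op();
    max(get_pbs_count(), 1)
}

// Hypothetical usage mirroring the benches below: measure once outside the timed loop,
// then derive the number of parallel elements passed to Criterion's Throughput::Elements.
// let pbs_count = pbs_cost_of(|| binary_op(&sks, &mut ct_0, &mut ct_1));
// let elements = throughput_num_threads(num_block, pbs_count);
// bench_group.throughput(Throughput::Elements(elements));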
.PHONY: bench_integer_compression_gpu bench_integer_compression_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench glwe_packing_compression-integer-bench \ - --features=integer,internal-keycache,gpu -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,gpu,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters bench_integer_multi_bit: install_rs_check_toolchain @@ -1092,7 +1092,7 @@ bench_integer_multi_bit: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters bench_signed_integer_multi_bit: install_rs_check_toolchain @@ -1100,7 +1100,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-signed-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters bench_integer_multi_bit_gpu: install_rs_check_toolchain @@ -1108,7 +1108,7 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_unsigned_integer_multi_bit_gpu # Run benchmarks for unsigned integer on GPU backend using multi-bit parameters bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain @@ -1116,14 +1116,14 @@ bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned + --features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- ::unsigned .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs bench_integer_zk: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench zk-pke-bench \ - --features=integer,internal-keycache,zk-pok,nightly-avx512 \ + --features=integer,internal-keycache,zk-pok,nightly-avx512,pbs-stats \ -p $(TFHE_SPEC) -- .PHONY: bench_shortint # Run benchmarks for shortint diff --git a/tfhe/benches/integer/bench.rs b/tfhe/benches/integer/bench.rs index 8fb1200035..4fea7c3db7 100644 --- a/tfhe/benches/integer/bench.rs +++ b/tfhe/benches/integer/bench.rs @@ -11,6 +11,7 @@ use crate::utilities::{ use criterion::{criterion_group, Criterion, Throughput}; use rand::prelude::*; use rayon::prelude::*; +use std::cmp::max; use std::env; use 
tfhe::integer::keycache::KEY_CACHE; use tfhe::integer::prelude::*; @@ -143,15 +144,25 @@ fn bench_server_key_binary_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + let clear_1 = gen_random_u256(&mut rng); + let mut ct_1 = cks.encrypt_radix(clear_1, num_block); + + reset_pbs_count(); + binary_op(&sks, &mut ct_0, &mut ct_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block)) .collect::>(); @@ -294,15 +305,23 @@ fn bench_server_key_unary_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + + reset_pbs_count(); + unary_fn(&sks, &mut ct_0); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block)) .collect::>(); @@ -451,15 +470,24 @@ fn bench_server_key_binary_scalar_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + let clear_1 = rng_func(&mut rng, bit_size) & max_value_for_bit_size; + + reset_pbs_count(); + binary_op(&sks, &mut ct_0, clear_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block)) .collect::>(); @@ -567,15 +595,28 @@ fn if_then_else_parallelized(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let clear_0 = gen_random_u256(&mut rng); + let true_ct = cks.encrypt_radix(clear_0, num_block); + + let clear_1 = gen_random_u256(&mut rng); + let false_ct = cks.encrypt_radix(clear_1, num_block); + + let condition = sks.create_trivial_boolean_block(rng.gen_bool(0.5)); + + reset_pbs_count(); + sks.if_then_else_parallelized(&condition, &true_ct, &false_ct); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let cts_cond = (0..elements) .map(|_| sks.create_trivial_boolean_block(rng.gen_bool(0.5))) .collect::>(); @@ -663,20 +704,34 @@ fn ciphertexts_sum_parallelized(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let nb_ctxt = bit_size.div_ceil(param.message_modulus().0.ilog2() as usize); + let cks = RadixClientKey::from((cks, nb_ctxt)); + + let clears = (0..len) + .map(|_| gen_random_u256(&mut rng) & max_for_bit_size) + .collect::>(); + let ctxts = clears + .iter() + .copied() + .map(|clear| cks.encrypt(clear)) + .collect::>(); + + reset_pbs_count(); + sks.sum_ciphertexts_parallelized(&ctxts); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!( "{bench_name}_{len}_ctxts::throughput::{param_name}::{bit_size}_bits" ); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - - let nb_ctxt = bit_size.div_ceil(param.message_modulus().0.ilog2() as usize); - let cks = RadixClientKey::from((cks, nb_ctxt)); - let cts = (0..elements) .map(|_| { let clears = (0..len) @@ -1308,19 +1363,23 @@ define_server_key_bench_default_fn!( mod cuda { use super::*; use criterion::{black_box, criterion_group}; + use std::cmp::max; use tfhe::core_crypto::gpu::CudaStreams; use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext; use tfhe::integer::gpu::server_key::CudaServerKey; + use tfhe::integer::{RadixCiphertext, ServerKey}; use tfhe_csprng::seeders::Seed; - fn bench_cuda_server_key_unary_function_clean_inputs( + fn bench_cuda_server_key_unary_function_clean_inputs( c: &mut Criterion, bench_name: &str, display_name: &str, unary_op: F, + unary_op_cpu: G, ) where F: Fn(&CudaServerKey, &mut CudaUnsignedRadixCiphertext, &CudaStreams) + Sync, + G: Fn(&ServerKey, &mut RadixCiphertext) + Sync, { let mut bench_group = c.benchmark_group(bench_name); bench_group @@ -1359,17 +1418,24 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &streams); + + let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. 
+ unary_op_cpu(&cpu_sks, &mut ct_0); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &streams); - let mut cts_0 = (0..elements) .map(|_| { let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); @@ -1402,11 +1468,12 @@ mod cuda { /// Base function to bench a server key function that is a binary operation, input ciphertext /// will contain only zero carries - fn bench_cuda_server_key_binary_function_clean_inputs( + fn bench_cuda_server_key_binary_function_clean_inputs( c: &mut Criterion, bench_name: &str, display_name: &str, binary_op: F, + binary_op_cpu: G, ) where F: Fn( &CudaServerKey, @@ -1414,6 +1481,7 @@ mod cuda { &mut CudaUnsignedRadixCiphertext, &CudaStreams, ) + Sync, + G: Fn(&ServerKey, &mut RadixCiphertext, &mut RadixCiphertext) + Sync, { let mut bench_group = c.benchmark_group(bench_name); bench_group @@ -1458,17 +1526,27 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &streams); + + // Execute the operation once to know its cost. + let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + let clear_1 = gen_random_u256(&mut rng); + let mut ct_1 = cks.encrypt_radix(clear_1, num_block); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. 
+ binary_op_cpu(&cpu_sks, &mut ct_0, &mut ct_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &streams); - let mut cts_0 = (0..elements) .map(|_| { let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); @@ -1507,15 +1585,17 @@ mod cuda { bench_group.finish() } - fn bench_cuda_server_key_binary_scalar_function_clean_inputs( + fn bench_cuda_server_key_binary_scalar_function_clean_inputs( c: &mut Criterion, bench_name: &str, display_name: &str, binary_op: F, - rng_func: G, + binary_op_cpu: G, + rng_func: H, ) where F: Fn(&CudaServerKey, &mut CudaUnsignedRadixCiphertext, ScalarType, &CudaStreams) + Sync, - G: Fn(&mut ThreadRng, usize) -> ScalarType, + G: Fn(&ServerKey, &mut RadixCiphertext, ScalarType) + Sync, + H: Fn(&mut ThreadRng, usize) -> ScalarType, { let mut bench_group = c.benchmark_group(bench_name); let mut rng = rand::thread_rng(); @@ -1565,19 +1645,28 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &streams); + + // Execute the operation once to know its cost. + let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + let clear_1 = rng_func(&mut rng, bit_size) & max_value_for_bit_size; + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. + binary_op_cpu(&cpu_sks, &mut ct_0, clear_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); bench_id = format!( "{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}" ); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &streams); - let mut cts_0 = (0..elements) .map(|_| { let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); @@ -1668,17 +1757,28 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. + let clear_0 = gen_random_u256(&mut rng); + let ct_then = cks.encrypt_radix(clear_0, num_block); + let clear_1 = gen_random_u256(&mut rng); + let ct_else = cks.encrypt_radix(clear_1, num_block); + let ct_cond = cpu_sks.create_trivial_boolean_block(rng.gen_bool(0.5)); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. 
+ cpu_sks.if_then_else_parallelized(&ct_cond, &ct_then, &ct_else); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let cts_cond = (0..elements) .map(|_| { let ct_cond = cks.encrypt_bool(rng.gen::()); @@ -1769,15 +1869,22 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &streams); + + // Execute the operation once to know its cost. + reset_pbs_count(); + cpu_sks.par_generate_oblivious_pseudo_random_unsigned_integer_bounded( + Seed(0), + bit_size as u64, + num_block as u64, + ); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); - bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &streams); - b.iter(|| { (0..elements).into_par_iter().for_each(|i| { let selected_gpu = @@ -1811,7 +1918,7 @@ mod cuda { } macro_rules! define_cuda_server_key_bench_clean_input_unary_fn ( - (method_name: $server_key_method:ident, display_name:$name:ident) => { + (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name: $name:ident) => { ::paste::paste!{ fn [](c: &mut Criterion) { bench_cuda_server_key_unary_function_clean_inputs( @@ -1820,6 +1927,9 @@ mod cuda { stringify!($name), |server_key, lhs, stream| { server_key.$server_key_method(lhs, stream); + }, + |server_key_cpu, lhs| { + server_key_cpu.$server_key_method_cpu(lhs); } ) } @@ -1827,7 +1937,7 @@ mod cuda { }); macro_rules! define_cuda_server_key_bench_clean_input_fn ( - (method_name: $server_key_method:ident, display_name:$name:ident) => { + (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => { ::paste::paste!{ fn [](c: &mut Criterion) { bench_cuda_server_key_binary_function_clean_inputs( @@ -1836,6 +1946,9 @@ mod cuda { stringify!($name), |server_key, lhs, rhs, stream| { server_key.$server_key_method(lhs, rhs, stream); + }, + |server_key_cpu, lhs, rhs| { + server_key_cpu.$server_key_method_cpu(lhs, rhs); } ) } @@ -1844,7 +1957,7 @@ mod cuda { ); macro_rules! 
define_cuda_server_key_bench_clean_input_scalar_fn ( - (method_name: $server_key_method:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => { + (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => { ::paste::paste!{ fn [](c: &mut Criterion) { bench_cuda_server_key_binary_scalar_function_clean_inputs( @@ -1854,6 +1967,9 @@ mod cuda { |server_key, lhs, rhs, stream| { server_key.$server_key_method(lhs, rhs, stream); }, + |server_key_cpu, lhs, rhs| { + server_key_cpu.$server_key_method_cpu(lhs, rhs); + }, $($rng_fn)* ) } @@ -1866,222 +1982,262 @@ mod cuda { //=========================================== define_cuda_server_key_bench_clean_input_unary_fn!( method_name: unchecked_neg, + method_name_cpu: unchecked_neg, display_name: negation ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_bitand, + method_name_cpu: unchecked_bitand, display_name: bitand ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_bitor, + method_name_cpu: unchecked_bitor, display_name: bitor ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_bitxor, + method_name_cpu: unchecked_bitxor, display_name: bitxor ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: unchecked_bitnot, + method_name_cpu: bitnot, display_name: bitnot ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_mul, + method_name_cpu: unchecked_mul_parallelized, display_name: mul ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_div_rem, + method_name_cpu: unchecked_div_rem_parallelized, display_name: div_mod ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_add, + method_name_cpu: unchecked_add_parallelized, display_name: add ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_sub, + method_name_cpu: unchecked_sub, display_name: sub ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_unsigned_overflowing_sub, + method_name_cpu: unchecked_unsigned_overflowing_sub_parallelized, display_name: overflowing_sub ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_unsigned_overflowing_add, + method_name_cpu: unsigned_overflowing_add_parallelized, display_name: overflowing_add ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_eq, + method_name_cpu: unchecked_eq, display_name: equal ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_ne, + method_name_cpu: unchecked_ne, display_name: not_equal ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_left_shift, + method_name_cpu: unchecked_left_shift_parallelized, display_name: left_shift ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_right_shift, + method_name_cpu: unchecked_right_shift_parallelized, display_name: right_shift ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_rotate_left, + method_name_cpu: unchecked_rotate_left_parallelized, display_name: rotate_left ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_rotate_right, + method_name_cpu: unchecked_rotate_right_parallelized, display_name: rotate_right ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: unchecked_ilog2, + method_name_cpu: unchecked_ilog2_parallelized, display_name: ilog2 ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_bitand, + method_name_cpu: 
unchecked_scalar_bitand_parallelized, display_name: bitand, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_bitor, + method_name_cpu: unchecked_scalar_bitor_parallelized, display_name: bitand, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_bitxor, + method_name_cpu: unchecked_scalar_bitxor_parallelized, display_name: bitand, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_add, + method_name_cpu: unchecked_scalar_add, display_name: add, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_mul, + method_name_cpu: unchecked_scalar_mul_parallelized, display_name: mul, rng_func: mul_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_sub, + method_name_cpu: unchecked_scalar_sub, display_name: sub, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_left_shift, + method_name_cpu: unchecked_scalar_left_shift_parallelized, display_name: left_shift, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_right_shift, + method_name_cpu: unchecked_scalar_right_shift_parallelized, display_name: right_shift, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_rotate_left, + method_name_cpu: unchecked_scalar_rotate_left_parallelized, display_name: rotate_left, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_rotate_right, + method_name_cpu: unchecked_scalar_rotate_right_parallelized, display_name: rotate_right, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_eq, + method_name_cpu: unchecked_scalar_eq_parallelized, display_name: equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_ne, + method_name_cpu: unchecked_scalar_ne_parallelized, display_name: not_equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_gt, + method_name_cpu: unchecked_scalar_gt_parallelized, display_name: greater_than, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_ge, + method_name_cpu: unchecked_scalar_ge_parallelized, display_name: greater_or_equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_lt, + method_name_cpu: unchecked_scalar_lt_parallelized, display_name: less_than, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_le, + method_name_cpu: unchecked_scalar_le_parallelized, display_name: less_or_equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_max, + method_name_cpu: unchecked_scalar_max_parallelized, display_name: max, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_min, + method_name_cpu: unchecked_scalar_min_parallelized, display_name: min, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_div_rem, + method_name_cpu: 
unchecked_scalar_div_rem_parallelized, display_name: div_mod, rng_func: div_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_div, + method_name_cpu: unchecked_scalar_div_parallelized, display_name: div, rng_func: div_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_rem, + method_name_cpu: unchecked_scalar_rem_parallelized, display_name: modulo, rng_func: div_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_unsigned_overflowing_scalar_add, + method_name_cpu: unsigned_overflowing_scalar_add_parallelized, display_name: overflowing_add, rng_func: default_scalar ); @@ -2092,282 +2248,334 @@ mod cuda { define_cuda_server_key_bench_clean_input_unary_fn!( method_name: neg, + method_name_cpu: neg_parallelized, display_name: negation ); define_cuda_server_key_bench_clean_input_fn!( method_name: add, + method_name_cpu: add_parallelized, display_name: add ); define_cuda_server_key_bench_clean_input_fn!( method_name: sub, + method_name_cpu: sub_parallelized, display_name: sub ); define_cuda_server_key_bench_clean_input_fn!( method_name: unsigned_overflowing_sub, + method_name_cpu: unsigned_overflowing_sub_parallelized, display_name: overflowing_sub ); define_cuda_server_key_bench_clean_input_fn!( method_name: unsigned_overflowing_add, + method_name_cpu: unsigned_overflowing_add_parallelized, display_name: overflowing_add ); define_cuda_server_key_bench_clean_input_fn!( method_name: mul, + method_name_cpu: mul_parallelized, display_name: mul ); define_cuda_server_key_bench_clean_input_fn!( method_name: div_rem, + method_name_cpu: div_rem_parallelized, display_name: div_mod ); define_cuda_server_key_bench_clean_input_fn!( method_name: div, + method_name_cpu: div_parallelized, display_name: div ); define_cuda_server_key_bench_clean_input_fn!( method_name: rem, + method_name_cpu: rem_parallelized, display_name: modulo ); define_cuda_server_key_bench_clean_input_fn!( method_name: ne, + method_name_cpu: ne_parallelized, display_name: not_equal ); define_cuda_server_key_bench_clean_input_fn!( method_name: eq, + method_name_cpu: eq_parallelized, display_name: equal ); define_cuda_server_key_bench_clean_input_fn!( method_name: bitand, + method_name_cpu: bitand_parallelized, display_name: bitand ); define_cuda_server_key_bench_clean_input_fn!( method_name: bitor, + method_name_cpu: bitor_parallelized, display_name: bitor ); define_cuda_server_key_bench_clean_input_fn!( method_name: bitxor, + method_name_cpu: bitxor_parallelized, display_name: bitxor ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: bitnot, + method_name_cpu: bitnot, display_name: bitnot ); define_cuda_server_key_bench_clean_input_fn!( method_name: gt, + method_name_cpu: gt_parallelized, display_name: greater_than ); define_cuda_server_key_bench_clean_input_fn!( method_name: ge, + method_name_cpu: ge_parallelized, display_name: greater_or_equal ); define_cuda_server_key_bench_clean_input_fn!( method_name: lt, + method_name_cpu: lt_parallelized, display_name: less_than ); define_cuda_server_key_bench_clean_input_fn!( method_name: le, + method_name_cpu: le_parallelized, display_name: less_or_equal ); define_cuda_server_key_bench_clean_input_fn!( method_name: max, + method_name_cpu: max_parallelized, display_name: max ); define_cuda_server_key_bench_clean_input_fn!( method_name: min, + method_name_cpu: min_parallelized, display_name: min ); define_cuda_server_key_bench_clean_input_fn!( method_name: 
left_shift, + method_name_cpu: left_shift_parallelized, display_name: left_shift ); define_cuda_server_key_bench_clean_input_fn!( method_name: right_shift, + method_name_cpu: right_shift_parallelized, display_name: right_shift ); define_cuda_server_key_bench_clean_input_fn!( method_name: rotate_left, + method_name_cpu: rotate_left_parallelized, display_name: rotate_left ); define_cuda_server_key_bench_clean_input_fn!( method_name: rotate_right, + method_name_cpu: rotate_right_parallelized, display_name: rotate_right ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: leading_zeros, + method_name_cpu: leading_zeros_parallelized, display_name: leading_zeros ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: leading_ones, + method_name_cpu: leading_ones_parallelized, display_name: leading_ones ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: trailing_zeros, + method_name_cpu: trailing_zeros_parallelized, display_name: trailing_zeros ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: trailing_ones, + method_name_cpu: trailing_ones_parallelized, display_name: trailing_ones ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: ilog2, + method_name_cpu: ilog2_parallelized, display_name: ilog2 ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_sub, + method_name_cpu: scalar_sub_parallelized, display_name: sub, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_add, + method_name_cpu: scalar_add_parallelized, display_name: add, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_mul, + method_name_cpu: scalar_mul_parallelized, display_name: mul, rng_func: mul_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_left_shift, + method_name_cpu: scalar_left_shift_parallelized, display_name: left_shift, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_right_shift, + method_name_cpu: scalar_right_shift_parallelized, display_name: right_shift, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_rotate_left, + method_name_cpu: scalar_rotate_left_parallelized, display_name: rotate_left, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_rotate_right, + method_name_cpu: scalar_rotate_right_parallelized, display_name: rotate_right, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_bitand, + method_name_cpu: scalar_bitand_parallelized, display_name: bitand, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_bitor, + method_name_cpu: scalar_bitor_parallelized, display_name: bitor, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_bitxor, + method_name_cpu: scalar_bitxor_parallelized, display_name: bitxor, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_eq, + method_name_cpu: scalar_eq_parallelized, display_name: equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_ne, + method_name_cpu: scalar_ne_parallelized, display_name: not_equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_gt, + method_name_cpu: scalar_gt_parallelized, 
display_name: greater_than, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_ge, + method_name_cpu: scalar_ge_parallelized, display_name: greater_or_equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_lt, + method_name_cpu: scalar_lt_parallelized, display_name: less_than, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_le, + method_name_cpu: scalar_le_parallelized, display_name: less_or_equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_max, + method_name_cpu: scalar_max_parallelized, display_name: max, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_min, + method_name_cpu: scalar_min_parallelized, display_name: min, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_div_rem, + method_name_cpu: scalar_div_rem_parallelized, display_name: div_mod, rng_func: div_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_div, + method_name_cpu: scalar_div_parallelized, display_name: div, rng_func: div_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_rem, + method_name_cpu: scalar_rem_parallelized, display_name: modulo, rng_func: div_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unsigned_overflowing_scalar_add, + method_name_cpu: unsigned_overflowing_scalar_add_parallelized, display_name: overflowing_add, rng_func: default_scalar ); @@ -2597,6 +2805,7 @@ use cuda::{ cuda_cast_ops, default_cuda_dedup_ops, default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops, }; +use tfhe::{get_pbs_count, reset_pbs_count}; criterion_group!( smart_ops, diff --git a/tfhe/benches/integer/glwe_packing_compression.rs b/tfhe/benches/integer/glwe_packing_compression.rs index cb30a63570..de030a8522 100644 --- a/tfhe/benches/integer/glwe_packing_compression.rs +++ b/tfhe/benches/integer/glwe_packing_compression.rs @@ -6,6 +6,7 @@ use crate::utilities::{ }; use criterion::{black_box, criterion_group, Criterion, Throughput}; use rayon::prelude::*; +use std::cmp::max; use tfhe::integer::ciphertext::CompressedCiphertextListBuilder; use tfhe::integer::{ClientKey, RadixCiphertext}; use tfhe::keycache::NamedParam; @@ -77,9 +78,19 @@ fn cpu_glwe_packing(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + // Execute the operation once to know its cost. + let ct = cks.encrypt_radix(0_u32, num_blocks); + let mut builder = CompressedCiphertextListBuilder::new(); + builder.push(ct); + let compressed = builder.build(&compression_key); + + reset_pbs_count(); + let _: RadixCiphertext = compressed.get(0, &decompression_key).unwrap().unwrap(); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0)).ceil() as usize; - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); // FIXME thread usage seemed to be somewhat more "efficient". 
// For example, with bit_size = 2, my laptop is only using around 2/3 of the // available threads Thread usage increases with bit_size = 8 but @@ -150,6 +161,7 @@ fn cpu_glwe_packing(c: &mut Criterion) { #[cfg(feature = "gpu")] mod cuda { use super::*; + use std::cmp::max; use tfhe::core_crypto::gpu::CudaStreams; use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder; use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext; @@ -185,27 +197,26 @@ mod cuda { let bench_id_pack; let bench_id_unpack; + // Generate private compression key + let cks = ClientKey::new(param); + let private_compression_key = cks.new_compression_private_key(comp_param); + + // Generate and convert compression keys + let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); + let (compressed_compression_key, compressed_decompression_key) = + radix_cks.new_compressed_compression_decompression_keys(&private_compression_key); + let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream); + let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda( + radix_cks.parameters().glwe_dimension(), + radix_cks.parameters().polynomial_size(), + radix_cks.parameters().message_modulus(), + radix_cks.parameters().carry_modulus(), + radix_cks.parameters().ciphertext_modulus(), + &stream, + ); + match BENCH_TYPE.get().unwrap() { BenchmarkType::Latency => { - // Generate private compression key - let cks = ClientKey::new(param); - let private_compression_key = cks.new_compression_private_key(comp_param); - - // Generate and convert compression keys - let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); - let (compressed_compression_key, compressed_decompression_key) = radix_cks - .new_compressed_compression_decompression_keys(&private_compression_key); - let cuda_compression_key = - compressed_compression_key.decompress_to_cuda(&stream); - let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda( - radix_cks.parameters().glwe_dimension(), - radix_cks.parameters().polynomial_size(), - radix_cks.parameters().message_modulus(), - radix_cks.parameters().carry_modulus(), - radix_cks.parameters().ciphertext_modulus(), - &stream, - ); - // Encrypt let ct = cks.encrypt_radix(0_u32, num_blocks); let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream); @@ -239,28 +250,25 @@ mod cuda { }); } BenchmarkType::Throughput => { + // Execute the operation once to know its cost. + let (cpu_compression_key, cpu_decompression_key) = + cks.new_compression_decompression_keys(&private_compression_key); + let ct = cks.encrypt_radix(0_u32, num_blocks); + let mut builder = CompressedCiphertextListBuilder::new(); + builder.push(ct); + let compressed = builder.build(&cpu_compression_key); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. 
+ let _: RadixCiphertext = + compressed.get(0, &cpu_decompression_key).unwrap().unwrap(); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0)) .ceil() as usize; - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); - let cks = ClientKey::new(param); - let private_compression_key = cks.new_compression_private_key(comp_param); - - let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); - let (compressed_compression_key, compressed_decompression_key) = radix_cks - .new_compressed_compression_decompression_keys(&private_compression_key); - let cuda_compression_key = - compressed_compression_key.decompress_to_cuda(&stream); - let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda( - radix_cks.parameters().glwe_dimension(), - radix_cks.parameters().polynomial_size(), - radix_cks.parameters().message_modulus(), - radix_cks.parameters().carry_modulus(), - radix_cks.parameters().ciphertext_modulus(), - &stream, - ); - // Encrypt let ct = cks.encrypt_radix(0_u32, num_blocks); let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream); @@ -344,6 +352,7 @@ criterion_group!(cpu_glwe_packing2, cpu_glwe_packing); #[cfg(feature = "gpu")] use cuda::gpu_glwe_packing2; +use tfhe::{get_pbs_count, reset_pbs_count}; fn main() { BENCH_TYPE.get_or_init(|| BenchmarkType::from_env().unwrap()); diff --git a/tfhe/benches/integer/oprf.rs b/tfhe/benches/integer/oprf.rs index 8bdc1e9407..07279f4a53 100644 --- a/tfhe/benches/integer/oprf.rs +++ b/tfhe/benches/integer/oprf.rs @@ -4,9 +4,11 @@ use crate::utilities::{ }; use criterion::{black_box, Criterion, Throughput}; use rayon::prelude::*; +use std::cmp::max; use tfhe::integer::keycache::KEY_CACHE; use tfhe::integer::IntegerKeyKind; use tfhe::keycache::NamedParam; +use tfhe::{get_pbs_count, reset_pbs_count}; use tfhe_csprng::seeders::Seed; pub fn unsigned_oprf(c: &mut Criterion) { @@ -40,12 +42,21 @@ pub fn unsigned_oprf(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ reset_pbs_count(); + sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded( + Seed(0), + bit_size as u64, + num_block as u64, + ); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - b.iter(|| { (0..elements).into_par_iter().for_each(|_| { sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded( diff --git a/tfhe/benches/integer/signed_bench.rs b/tfhe/benches/integer/signed_bench.rs index 9c1cf0ed1b..bb2f001785 100644 --- a/tfhe/benches/integer/signed_bench.rs +++ b/tfhe/benches/integer/signed_bench.rs @@ -8,6 +8,7 @@ use crate::utilities::{ use criterion::{criterion_group, Criterion, Throughput}; use rand::prelude::*; use rayon::prelude::*; +use std::cmp::max; use std::env; use tfhe::integer::keycache::KEY_CACHE; use tfhe::integer::prelude::*; @@ -66,12 +67,20 @@ fn bench_server_key_signed_binary_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_1 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + binary_op(&sks, &ct_0, &ct_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -151,12 +160,21 @@ fn bench_server_key_signed_shift_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let clear_1 = rng.gen_range(0u128..bit_size as u128); + let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_1 = cks.encrypt_radix(clear_1, num_block); + + reset_pbs_count(); + binary_op(&sks, &ct_0, &ct_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -233,12 +251,19 @@ fn bench_server_key_unary_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + unary_fn(&sks, &ct_0); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -307,12 +332,21 @@ fn signed_if_then_else_parallelized(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let cond = sks.create_trivial_boolean_block(rng.gen_bool(0.5)); + let ct_then = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_else = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + sks.if_then_else_parallelized(&cond, &ct_then, &ct_else); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let cts_cond = (0..elements) .map(|_| sks.create_trivial_boolean_block(rng.gen_bool(0.5))) .collect::>(); @@ -830,12 +864,20 @@ fn bench_server_key_binary_scalar_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let clear_1 = rng_func(&mut rng, bit_size); + + reset_pbs_count(); + binary_op(&sks, &mut ct_0, clear_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -1328,6 +1370,7 @@ mod cuda { use super::*; use criterion::criterion_group; use rayon::iter::IntoParallelRefIterator; + use std::cmp::max; use tfhe::core_crypto::gpu::CudaStreams; use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; use tfhe::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext}; @@ -1335,11 +1378,12 @@ mod cuda { /// Base function to bench a server key function that is a binary operation, input ciphertext /// will contain only zero carries - fn bench_cuda_server_key_binary_signed_function_clean_inputs( + fn bench_cuda_server_key_binary_signed_function_clean_inputs( c: &mut Criterion, bench_name: &str, display_name: &str, binary_op: F, + binary_op_cpu: G, ) where F: Fn( &CudaServerKey, @@ -1347,6 +1391,7 @@ mod cuda { &mut CudaSignedRadixCiphertext, &CudaStreams, ) + Sync, + G: Fn(&ServerKey, &SignedRadixCiphertext, &SignedRadixCiphertext) + Sync, { let mut bench_group = c.benchmark_group(bench_name); bench_group @@ -1401,14 +1446,22 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. + let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let mut ct_1 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. + binary_op_cpu(&cpu_sks, &mut ct_0, &mut ct_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1460,7 +1513,7 @@ mod cuda { } macro_rules! 
define_cuda_server_key_bench_clean_input_signed_fn ( - (method_name: $server_key_method:ident, display_name:$name:ident) => { + (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => { ::paste::paste!{ fn [](c: &mut Criterion) { bench_cuda_server_key_binary_signed_function_clean_inputs( @@ -1469,6 +1522,9 @@ mod cuda { stringify!($name), |server_key, lhs, rhs, stream| { server_key.$server_key_method(lhs, rhs, stream); + }, + |server_key_cpu, lhs, rhs| { + server_key_cpu.$server_key_method_cpu(lhs, rhs); } ) } @@ -1478,13 +1534,15 @@ mod cuda { /// Base function to bench a server key function that is a unary operation, input ciphertext /// will contain only zero carries - fn bench_cuda_server_key_unary_signed_function_clean_inputs( + fn bench_cuda_server_key_unary_signed_function_clean_inputs( c: &mut Criterion, bench_name: &str, display_name: &str, unary_op: F, + unary_op_cpu: G, ) where F: Fn(&CudaServerKey, &mut CudaSignedRadixCiphertext, &CudaStreams) + Sync, + G: Fn(&ServerKey, &SignedRadixCiphertext) + Sync, { let mut bench_group = c.benchmark_group(bench_name); bench_group @@ -1527,14 +1585,21 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. + let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. + unary_op_cpu(&cpu_sks, &ct_0); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1572,7 +1637,7 @@ mod cuda { } macro_rules! 
define_cuda_server_key_bench_clean_input_signed_unary_fn ( - (method_name: $server_key_method:ident, display_name:$name:ident) => { + (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => { ::paste::paste!{ fn [](c: &mut Criterion) { bench_cuda_server_key_unary_signed_function_clean_inputs( @@ -1581,6 +1646,9 @@ mod cuda { stringify!($name), |server_key, input, stream| { server_key.$server_key_method(input, stream); + }, + |server_key_cpu, lhs| { + server_key_cpu.$server_key_method_cpu(lhs); } ) } @@ -1588,15 +1656,17 @@ mod cuda { } ); - fn bench_cuda_server_key_binary_scalar_signed_function_clean_inputs( + fn bench_cuda_server_key_binary_scalar_signed_function_clean_inputs( c: &mut Criterion, bench_name: &str, display_name: &str, binary_op: F, - rng_func: G, + binary_op_cpu: G, + rng_func: H, ) where F: Fn(&CudaServerKey, &mut CudaSignedRadixCiphertext, ScalarType, &CudaStreams) + Sync, - G: Fn(&mut ThreadRng, usize) -> ScalarType, + G: Fn(&ServerKey, &mut SignedRadixCiphertext, ScalarType) + Sync, + H: Fn(&mut ThreadRng, usize) -> ScalarType, { let mut bench_group = c.benchmark_group(bench_name); bench_group @@ -1650,16 +1720,24 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. + let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let clear_0 = rng_func(&mut rng, bit_size); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. + binary_op_cpu(&cpu_sks, &mut ct_0, clear_0); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!( "{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}" ); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1702,7 +1780,7 @@ mod cuda { } macro_rules! 
define_cuda_server_key_bench_clean_input_scalar_signed_fn ( - (method_name: $server_key_method:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => { + (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => { ::paste::paste!{ fn [](c: &mut Criterion) { bench_cuda_server_key_binary_scalar_signed_function_clean_inputs( @@ -1712,6 +1790,9 @@ mod cuda { |server_key, lhs, rhs, stream| { server_key.$server_key_method(lhs, rhs, stream); }, + |server_key_cpu, lhs, rhs| { + server_key_cpu.$server_key_method_cpu(lhs, rhs); + }, $($rng_fn)* ) } @@ -1721,11 +1802,12 @@ mod cuda { /// Base function to bench a server key function that is a binary operation for shift/rotate, /// input ciphertext will contain only zero carries - fn bench_cuda_server_key_shift_rotate_signed_function_clean_inputs( + fn bench_cuda_server_key_shift_rotate_signed_function_clean_inputs( c: &mut Criterion, bench_name: &str, display_name: &str, binary_op: F, + binary_op_cpu: G, ) where F: Fn( &CudaServerKey, @@ -1733,6 +1815,7 @@ mod cuda { &mut CudaUnsignedRadixCiphertext, &CudaStreams, ) + Sync, + G: Fn(&ServerKey, &SignedRadixCiphertext, &RadixCiphertext) + Sync, { let mut bench_group = c.benchmark_group(bench_name); bench_group @@ -1786,14 +1869,23 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. + let clear_1 = rng.gen_range(0u128..bit_size as u128); + let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_1 = cks.encrypt_radix(clear_1, num_block); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. + binary_op_cpu(&cpu_sks, &ct_0, &ct_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1843,7 +1935,7 @@ mod cuda { } macro_rules! define_cuda_server_key_bench_clean_input_signed_shift_rotate ( - (method_name: $server_key_method:ident, display_name:$name:ident) => { + (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => { ::paste::paste!{ fn [](c: &mut Criterion) { bench_cuda_server_key_shift_rotate_signed_function_clean_inputs( @@ -1852,6 +1944,9 @@ mod cuda { stringify!($name), |server_key, lhs, rhs, stream| { server_key.$server_key_method(lhs, rhs, stream); + }, + |server_key_cpu, lhs, rhs| { + server_key_cpu.$server_key_method_cpu(lhs, rhs); } ) } @@ -1916,14 +2011,23 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. 
+ let cond = cpu_sks.create_trivial_boolean_block(rng.gen_bool(0.5)); + let ct_then = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_else = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. + cpu_sks.if_then_else_parallelized(&cond, &ct_then, &ct_else); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let cts_cond = (0..elements) .map(|_| { let ct_cond = cks.encrypt_bool(rng.gen::()); @@ -1997,246 +2101,291 @@ mod cuda { define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_add, + method_name_cpu: unchecked_add_parallelized, display_name: add ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_sub, + method_name_cpu: unchecked_sub, display_name: sub ); define_cuda_server_key_bench_clean_input_signed_unary_fn!( method_name: unchecked_neg, + method_name_cpu: unchecked_neg, display_name: neg ); define_cuda_server_key_bench_clean_input_signed_unary_fn!( method_name: unchecked_abs, + method_name_cpu: unchecked_abs_parallelized, display_name: abs ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_mul, + method_name_cpu: unchecked_mul_parallelized, display_name: mul ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_div_rem, + method_name_cpu: unchecked_div_rem_parallelized, display_name: div_mod ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_bitand, + method_name_cpu: unchecked_bitand_parallelized, display_name: bitand ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_bitor, + method_name_cpu: unchecked_bitor_parallelized, display_name: bitor ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_bitxor, + method_name_cpu: unchecked_bitxor_parallelized, display_name: bitxor ); define_cuda_server_key_bench_clean_input_signed_unary_fn!( method_name: unchecked_bitnot, + method_name_cpu: bitnot, display_name: bitnot ); define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: unchecked_rotate_left, + method_name_cpu: unchecked_rotate_left_parallelized, display_name: rotate_left ); define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: unchecked_rotate_right, + method_name_cpu: unchecked_rotate_right_parallelized, display_name: rotate_right ); define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: unchecked_left_shift, + method_name_cpu: unchecked_left_shift_parallelized, display_name: left_shift ); define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: unchecked_right_shift, + method_name_cpu: unchecked_right_shift_parallelized, display_name: right_shift ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_eq, + method_name_cpu: unchecked_eq_parallelized, display_name: eq ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_ne, + method_name_cpu: unchecked_ne_parallelized, 
display_name: ne ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_gt, + method_name_cpu: unchecked_gt_parallelized, display_name: gt ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_ge, + method_name_cpu: unchecked_ge_parallelized, display_name: ge ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_lt, + method_name_cpu: unchecked_lt_parallelized, display_name: lt ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_le, + method_name_cpu: unchecked_le_parallelized, display_name: le ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_min, + method_name_cpu: unchecked_min_parallelized, display_name: min ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_max, + method_name_cpu: unchecked_max_parallelized, display_name: max ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_signed_overflowing_add, + method_name_cpu: unchecked_signed_overflowing_add_parallelized, display_name: overflowing_add ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_signed_overflowing_sub, + method_name_cpu: unchecked_signed_overflowing_sub_parallelized, display_name: overflowing_sub ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_add, + method_name_cpu: unchecked_scalar_add, display_name: add, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_mul, + method_name_cpu: unchecked_scalar_mul_parallelized, display_name: mul, rng_func: mul_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_sub, + method_name_cpu: unchecked_scalar_sub, display_name: sub, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_bitand, + method_name_cpu: unchecked_scalar_bitand_parallelized, display_name: bitand, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_bitor, + method_name_cpu: unchecked_scalar_bitor_parallelized, display_name: bitor, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_bitxor, + method_name_cpu: unchecked_scalar_bitxor_parallelized, display_name: bitxor, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_right_shift, + method_name_cpu: unchecked_scalar_right_shift_parallelized, display_name: right_shift, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_left_shift, + method_name_cpu: unchecked_scalar_left_shift_parallelized, display_name: left_shift, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_rotate_right, + method_name_cpu: unchecked_scalar_rotate_right_parallelized, display_name: rotate_right, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_rotate_left, + method_name_cpu: unchecked_scalar_rotate_left_parallelized, display_name: rotate_left, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_eq, + method_name_cpu: unchecked_scalar_eq_parallelized, display_name: 
eq, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_ne, + method_name_cpu: unchecked_scalar_ne_parallelized, display_name: ne, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_gt, + method_name_cpu: unchecked_scalar_gt_parallelized, display_name: gt, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_ge, + method_name_cpu: unchecked_scalar_ge_parallelized, display_name: ge, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_lt, + method_name_cpu: unchecked_scalar_lt_parallelized, display_name: lt, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_le, + method_name_cpu: unchecked_scalar_le_parallelized, display_name: le, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_min, + method_name_cpu: unchecked_scalar_min_parallelized, display_name: min, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_max, + method_name_cpu: unchecked_scalar_max_parallelized, display_name: max, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: signed_overflowing_scalar_add, + method_name_cpu: signed_overflowing_scalar_add_parallelized, display_name: overflowing_add, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: signed_overflowing_scalar_sub, + method_name_cpu: signed_overflowing_scalar_sub_parallelized, display_name: overflowing_sub, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_signed_scalar_div_rem, + method_name_cpu: unchecked_signed_scalar_div_rem_parallelized, display_name: div_rem, rng_func: div_scalar ); @@ -2247,234 +2396,277 @@ mod cuda { define_cuda_server_key_bench_clean_input_signed_fn!( method_name: add, + method_name_cpu: add_parallelized, display_name: add ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: sub, + method_name_cpu: sub_parallelized, display_name: sub ); define_cuda_server_key_bench_clean_input_signed_unary_fn!( method_name: neg, + method_name_cpu: neg_parallelized, display_name: neg ); define_cuda_server_key_bench_clean_input_signed_unary_fn!( method_name: abs, + method_name_cpu: abs_parallelized, display_name: abs ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: mul, + method_name_cpu: mul_parallelized, display_name: mul ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: div_rem, + method_name_cpu: div_rem_parallelized, display_name: div_mod ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: bitand, + method_name_cpu: bitand_parallelized, display_name: bitand ); define_cuda_server_key_bench_clean_input_signed_unary_fn!( method_name: bitnot, + method_name_cpu: bitnot, display_name: bitnot ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: bitor, + method_name_cpu: bitor_parallelized, display_name: bitor ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: bitxor, + method_name_cpu: bitxor_parallelized, display_name: bitxor ); 
define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: rotate_left, + method_name_cpu: rotate_left_parallelized, display_name: rotate_left ); define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: rotate_right, + method_name_cpu: rotate_right_parallelized, display_name: rotate_right ); define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: left_shift, + method_name_cpu: left_shift_parallelized, display_name: left_shift ); define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: right_shift, + method_name_cpu: right_shift_parallelized, display_name: right_shift ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: eq, + method_name_cpu: eq_parallelized, display_name: eq ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: ne, + method_name_cpu: ne_parallelized, display_name: ne ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: gt, + method_name_cpu: gt_parallelized, display_name: gt ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: ge, + method_name_cpu: ge_parallelized, display_name: ge ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: lt, + method_name_cpu: lt_parallelized, display_name: lt ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: le, + method_name_cpu: le_parallelized, display_name: le ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: min, + method_name_cpu: min_parallelized, display_name: min ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: max, + method_name_cpu: max_parallelized, display_name: max ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: signed_overflowing_add, + method_name_cpu: signed_overflowing_add_parallelized, display_name: overflowing_add ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: signed_overflowing_sub, + method_name_cpu: signed_overflowing_sub_parallelized, display_name: overflowing_sub ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_add, + method_name_cpu: scalar_add_parallelized, display_name: add, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_mul, + method_name_cpu: scalar_mul_parallelized, display_name: mul, rng_func: mul_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_sub, + method_name_cpu: scalar_sub_parallelized, display_name: sub, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_bitand, + method_name_cpu: scalar_bitand_parallelized, display_name: bitand, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_bitor, + method_name_cpu: scalar_bitor_parallelized, display_name: bitor, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_bitxor, + method_name_cpu: scalar_bitxor_parallelized, display_name: bitxor, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_left_shift, + method_name_cpu: scalar_left_shift_parallelized, display_name: left_shift, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_right_shift, + method_name_cpu: scalar_right_shift_parallelized, display_name: right_shift, rng_func: shift_scalar ); 
define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_rotate_left, + method_name_cpu: scalar_rotate_left_parallelized, display_name: rotate_left, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_rotate_right, + method_name_cpu: scalar_rotate_right_parallelized, display_name: rotate_right, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_eq, + method_name_cpu: scalar_eq_parallelized, display_name: eq, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_ne, + method_name_cpu: scalar_ne_parallelized, display_name: ne, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_gt, + method_name_cpu: scalar_gt_parallelized, display_name: gt, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_ge, + method_name_cpu: scalar_ge_parallelized, display_name: ge, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_lt, + method_name_cpu: scalar_lt_parallelized, display_name: lt, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_le, + method_name_cpu: scalar_le_parallelized, display_name: le, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_min, + method_name_cpu: scalar_min_parallelized, display_name: min, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_max, + method_name_cpu: scalar_max_parallelized, display_name: max, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: signed_scalar_div_rem, + method_name_cpu: signed_scalar_div_rem_parallelized, display_name: div_rem, rng_func: div_scalar ); @@ -2697,6 +2889,7 @@ use cuda::{ cuda_cast_ops, default_cuda_dedup_ops, default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops, }; +use tfhe::{get_pbs_count, reset_pbs_count}; #[cfg(feature = "gpu")] fn go_through_gpu_bench_groups(val: &str) { diff --git a/tfhe/benches/integer/zk_pke.rs b/tfhe/benches/integer/zk_pke.rs index 18daf9cbbb..f8a316a304 100644 --- a/tfhe/benches/integer/zk_pke.rs +++ b/tfhe/benches/integer/zk_pke.rs @@ -5,6 +5,7 @@ use crate::utilities::{throughput_num_threads, BenchmarkType, BENCH_TYPE}; use criterion::{criterion_group, Criterion, Throughput}; use rand::prelude::*; use rayon::prelude::*; +use std::cmp::max; use std::fs::{File, OpenOptions}; use std::io::Write; use std::path::Path; @@ -24,6 +25,7 @@ use tfhe::shortint::parameters::key_switching::p_fail_2_minus_64::ks_pbs::{ }; use tfhe::shortint::parameters::PBSParameters; use tfhe::zk::{CompactPkeCrs, ZkComputeLoad}; +use tfhe::{get_pbs_count, reset_pbs_count}; use utilities::{write_to_json, OperatorType}; fn write_result(file: &mut File, name: &str, value: usize) { @@ -112,7 +114,17 @@ fn pke_zk_proof(c: &mut Criterion) { }); } BenchmarkType::Throughput => { - let elements = throughput_num_threads(num_block); + // Execute the operation once to know its cost. 
+                    let input_msg = rng.gen::();
+                    let messages = vec![input_msg; fhe_uint_count];
+
+                    reset_pbs_count();
+                    let _ = tfhe::integer::ProvenCompactCiphertextList::builder(&pk)
+                        .extend(messages.iter().copied())
+                        .build_with_proof_packed(&crs, &metadata, compute_load);
+                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
+                    let elements = throughput_num_threads(num_block, pbs_count);
                     bench_group.throughput(Throughput::Elements(elements));
                     bench_id = format!(
@@ -330,7 +342,27 @@ fn pke_zk_verify(c: &mut Criterion, results_file: &Path) {
                 }
                 BenchmarkType::Throughput => {
                     // In throughput mode object sizes are not recorded.
-                    let elements = throughput_num_threads(num_block);
+
+                    // Execute the operation once to know its cost.
+                    let input_msg = rng.gen::();
+                    let messages = vec![input_msg; fhe_uint_count];
+                    let ct1 = tfhe::integer::ProvenCompactCiphertextList::builder(&pk)
+                        .extend(messages.iter().copied())
+                        .build_with_proof_packed(&crs, &metadata, compute_load)
+                        .unwrap();
+
+                    reset_pbs_count();
+                    let _ = ct1.verify_and_expand(
+                        &crs,
+                        &pk,
+                        &metadata,
+                        IntegerCompactCiphertextListExpansionMode::CastAndUnpackIfNecessary(
+                            casting_key.as_view(),
+                        ),
+                    );
+                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
+                    let elements = throughput_num_threads(num_block, pbs_count);
                     bench_group.throughput(Throughput::Elements(elements));
                     bench_id_verify = format!(
diff --git a/tfhe/benches/utilities.rs b/tfhe/benches/utilities.rs
index 9cd33200cd..5c565e5f78 100644
--- a/tfhe/benches/utilities.rs
+++ b/tfhe/benches/utilities.rs
@@ -393,22 +393,30 @@ pub mod integer_utils {
     /// Generate a number of threads to use to saturate current machine for throughput measurements.
     #[allow(dead_code)]
-    pub fn throughput_num_threads(num_block: usize) -> u64 {
+    pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
         let ref_block_count = 32; // Represent a ciphertext of 64 bits for 2_2 parameters set
-        let block_multiplicator = (ref_block_count as f64 / num_block as f64).ceil();
+        let block_multiplicator = (ref_block_count as f64 / num_block as f64).ceil().min(1.0);
+        // Some operations with a high count of PBS (e.g. division) would yield an operation
+        // loading value so low that the number of elements in the end wouldn't be meaningful.
+        let minimum_loading = if num_block < 64 { 0.2 } else { 0.1 };
         #[cfg(feature = "gpu")]
         {
             // This value is for Nvidia H100 GPU
             let streaming_multiprocessors = 132;
             let num_gpus = unsafe { cuda_get_number_of_gpus() };
-            ((streaming_multiprocessors * num_gpus) as f64 * block_multiplicator) as u64
+            let total_num_sm = streaming_multiprocessors * num_gpus;
+            let operation_loading =
+                ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
+            (total_num_sm as f64 * block_multiplicator * operation_loading) as u64
         }
         #[cfg(not(feature = "gpu"))]
         {
             let num_threads = rayon::current_num_threads() as f64;
+            let operation_loading = (num_threads / (op_pbs_count as f64)).max(minimum_loading);
             // Add 20% more to maximum threads available.
-            ((num_threads + (num_threads * 0.2)) * block_multiplicator) as u64
+            ((num_threads + (num_threads * 0.2)) * block_multiplicator.min(1.0) * operation_loading)
+                as u64
         }
     }
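
The throughput branches added above all follow the same calibration pattern: run the operation once through the CPU server key between reset_pbs_count() and get_pbs_count() (the pbs-stats counters do not track the GPU backend), floor the result at 1, and pass it to throughput_num_threads. Below is a minimal sketch of that pattern, assuming the pbs-stats feature is enabled; the helper name calibrate_pbs_cost and its closure-based shape are illustrative only, since the benches in this diff inline the same steps around the CPU twin of each GPU operation.

use std::cmp::max;
use tfhe::{get_pbs_count, reset_pbs_count};

/// Runs `cpu_op` once and reports how many PBS it triggered.
/// Floored at 1 so that PBS-free operations still produce a usable loading factor.
fn calibrate_pbs_cost(cpu_op: impl FnOnce()) -> u64 {
    reset_pbs_count();
    cpu_op();
    max(get_pbs_count(), 1)
}

In the signed binary benches this corresponds to something like calibrate_pbs_cost(|| { cpu_sks.add_parallelized(&ct_0, &ct_1); }), with the result fed straight into throughput_num_threads(num_block, pbs_count).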
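
With the op_pbs_count argument in place, the number of benchmarked elements scales inversely with the measured operation's PBS cost, clamped below by minimum_loading. The following standalone snippet works through the CPU branch of the formula with hypothetical figures (16 rayon threads, a 32-block 64-bit ciphertext); it is not part of the benchmark code, only an illustration of the arithmetic.

// Worked example of the element-count formula from the CPU branch above.
fn main() {
    let num_threads = 16.0_f64; // hypothetical rayon::current_num_threads()
    let block_multiplicator = 1.0_f64; // (32.0 / 32.0).ceil().min(1.0)
    let minimum_loading = 0.2_f64; // num_block < 64

    for op_pbs_count in [1_u64, 16, 1_000] {
        let operation_loading = (num_threads / op_pbs_count as f64).max(minimum_loading);
        let elements =
            ((num_threads + num_threads * 0.2) * block_multiplicator * operation_loading) as u64;
        println!("{op_pbs_count} PBS -> {elements} elements");
    }
    // Prints: 1 PBS -> 307, 16 PBS -> 19, 1000 PBS -> 3 (the last clamped by minimum_loading)
}

Cheap operations are therefore batched over many more elements than PBS-heavy ones such as division, which is what the minimum_loading comment in the diff guards against.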