diff --git a/test/simd/test_simd_fnt.cpp b/test/simd/test_simd_fnt.cpp index bdbcdf62..a2c44178 100644 --- a/test/simd/test_simd_fnt.cpp +++ b/test/simd/test_simd_fnt.cpp @@ -100,7 +100,8 @@ class SimdTestFnt : public ::testing::Test { return vec[0]; } - void gen_rand_data(std::vector& vec) + template + void gen_rand_data(std::vector& vec) { const size_t len = vec.size(); @@ -234,81 +235,127 @@ class SimdTestFnt : public ::testing::Test { } template - void core_op_perf(const std::string& text, const TFunc& f) + void core_op_perf_single(const std::string& text, const TFunc& f) { - const size_t len = vec_len * simd::countof(); - - std::vector buf_x(len, 0); - std::vector buf_y(len, 0); - gen_rand_data(buf_x); - gen_rand_data(buf_y); + std::cout << text << "\n"; + std::cout << "\tVectors nb\t\tAverage nb of CPU cycles\n"; + simd::VecType x = this->rand_vec(); + simd::VecType y = this->rand_vec(); + for (auto vec_len : arr_vec_len) { + const size_t nb = iters_nb * vec_len; - simd::VecType* data_x = reinterpret_cast(buf_x.data()); - simd::VecType* data_y = reinterpret_cast(buf_y.data()); + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < nb; ++i) { + simd::VecType _x = simd::load_to_reg(&x); + simd::VecType _y = simd::load_to_reg(&y); - uint64_t start = quadiron::hw_timer(); - for (unsigned i = 0; i < iters_nb; ++i) { - for (size_t j = 0; j < vec_len; ++j) { - simd::VecType x = simd::load_to_reg(&data_x[i]); - simd::VecType y = simd::load_to_reg(&data_y[i]); + f(_x, _y); - f(x, y); - - simd::store_to_mem(&data_x[i], x); + simd::store_to_mem(&x, _x); } + uint64_t end = quadiron::hw_timer(); + double avg_cycles_nb = + static_cast(end - start) / static_cast(nb); + std::cout << "\t" << vec_len << "\t\t" << avg_cycles_nb << "\n"; } - uint64_t end = quadiron::hw_timer(); - double avg_cycles_nb = - static_cast(end - start) / static_cast(iters_nb); - std::cout << "Average nb of CPU cycles per operation " << text << ": " - << avg_cycles_nb / vec_len << "\n"; + std::cout << "\n"; } template - void butterfly_perf(const std::string& text, const TFunc& f) + void core_op_perf(const std::string& text, const TFunc& f) { - const size_t len = vec_len * simd::countof(); - - std::vector buf_x(len); - std::vector buf_y(len); - gen_rand_data(buf_x); - gen_rand_data(buf_y); - - simd::VecType* data_x = reinterpret_cast(buf_x.data()); - simd::VecType* data_y = reinterpret_cast(buf_y.data()); - - std::vector data_z(this->vec_len); + std::cout << text << "\n"; + std::cout << "\tVectors nb\t\tAverage nb of CPU cycles\n"; + for (auto vec_len : arr_vec_len) { + const size_t len = vec_len * simd::countof(); + + std::vector buf_x(len, 0); + std::vector buf_y(len, 0); + gen_rand_data(buf_x); + gen_rand_data(buf_y); + + simd::VecType* data_x = + reinterpret_cast(buf_x.data()); + simd::VecType* data_y = + reinterpret_cast(buf_y.data()); + + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < iters_nb; ++i) { + for (size_t j = 0; j < vec_len; ++j) { + simd::VecType x = simd::load_to_reg(&data_x[j]); + simd::VecType y = simd::load_to_reg(&data_y[j]); + + f(x, y); + + simd::store_to_mem(&data_x[j], x); + } + } + uint64_t end = quadiron::hw_timer(); + double avg_cycles_nb = static_cast(end - start) + / static_cast(iters_nb) + / static_cast(vec_len); + ; - T coef = - 1 - + this->distribution->operator()(quadiron::prng()) % (this->q - 2); - const simd::CtGsCase ct_case = simd::get_case(coef, this->q); - const simd::VecType c = simd::set_one(coef); + std::cout << "\t" << vec_len << "\t\t" << avg_cycles_nb << "\n"; + } + std::cout << "\n"; + } - uint64_t start = quadiron::hw_timer(); - for (unsigned i = 0; i < iters_nb; ++i) { - for (size_t j = 0; j < this->vec_len; ++j) { - simd::VecType x = simd::load_to_reg(&data_x[i]); - simd::VecType y = simd::load_to_reg(&data_y[i]); + template + void butterfly_perf(const std::string& text, const TFunc& f) + { + std::cout << text << "\n"; + std::cout << "\tVectors nb\t\tAverage nb of CPU cycles\n"; + for (auto vec_len : arr_vec_len) { + const size_t len = vec_len * simd::countof(); + + std::vector buf_x(len); + std::vector buf_y(len); + gen_rand_data(buf_x); + gen_rand_data(buf_y); + + simd::VecType* data_x = + reinterpret_cast(buf_x.data()); + simd::VecType* data_y = + reinterpret_cast(buf_y.data()); + + std::vector data_z(vec_len); + + T coef = 1 + + this->distribution->operator()(quadiron::prng()) + % (this->q - 2); + const simd::CtGsCase ct_case = simd::get_case(coef, this->q); + const simd::VecType c = simd::set_one(coef); + + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < iters_nb; ++i) { + for (size_t j = 0; j < vec_len; ++j) { + simd::VecType x = simd::load_to_reg(&data_x[j]); + simd::VecType y = simd::load_to_reg(&data_y[j]); + + f(ct_case, c, x, y); + + simd::store_to_mem(&data_x[j], x); + simd::store_to_mem(&data_y[j], y); + } + } + uint64_t end = quadiron::hw_timer(); - f(ct_case, c, x, y); + double avg_cycles_nb = static_cast(end - start) + / static_cast(iters_nb) + / static_cast(vec_len); + ; - simd::store_to_mem(&data_x[i], x); - simd::store_to_mem(&data_y[i], y); - } + std::cout << "\t" << vec_len << "\t\t" << avg_cycles_nb << "\n"; } - uint64_t end = quadiron::hw_timer(); - - double avg_cycles_nb = - static_cast(end - start) / static_cast(iters_nb); - std::cout << "Average nb of CPU cycles per operation " << text << ": " - << avg_cycles_nb / vec_len << "\n"; + std::cout << "\n"; } T q; std::unique_ptr> distribution; - size_t vec_len = 256; - size_t iters_nb = 1e3; + std::vector arr_vec_len = + {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; + size_t iters_nb = 1e5; }; using AllTypes = ::testing::Types; @@ -438,22 +485,34 @@ TYPED_TEST(SimdTestFnt, TestButterflyGs) // NOLINT TYPED_TEST(SimdTestFnt, PerfModMulSingle) // NOLINT { - const size_t iters_nb = 1e5; - simd::VecType x = this->rand_vec(); - simd::VecType y = this->rand_vec(); + const size_t len = simd::countof(); + TypeParam x[len]; + TypeParam y[len]; + + for (unsigned i = 0; i < len; ++i) { + x[i] = 1 + + (this->distribution->operator()(quadiron::prng()) + % (this->q - 1)); + y[i] = 1 + + (this->distribution->operator()(quadiron::prng()) + % (this->q - 1)); + } + + simd::VecType* vec_x = reinterpret_cast(x); + simd::VecType* vec_y = reinterpret_cast(y); uint64_t start = quadiron::hw_timer(); - for (unsigned i = 0; i < iters_nb; ++i) { - simd::VecType _x = simd::load_to_reg(&x); - simd::VecType _y = simd::load_to_reg(&y); + for (unsigned i = 0; i < this->iters_nb; ++i) { + simd::VecType _x = simd::load_to_reg(vec_x); + simd::VecType _y = simd::load_to_reg(vec_y); _x = simd::mod_mul(_x, _y); - simd::store_to_mem(&x, _x); + simd::store_to_mem(vec_x, _x); } uint64_t end = quadiron::hw_timer(); double avg_cycles_nb = - static_cast(end - start) / static_cast(iters_nb); + static_cast(end - start) / static_cast(this->iters_nb); std::cout << "PerfModMulSingle: " << avg_cycles_nb << "\n"; } @@ -491,6 +550,45 @@ TYPED_TEST(SimdTestFnt, PerfModBuf) // NOLINT }); } +TYPED_TEST(SimdTestFnt, PerfPackUnpack) // NOLINT +{ + std::cout << "Pack & Unpack" + << "\n"; + std::cout << "\tVectors nb\t\tAverage nb of CPU cycles\n"; + for (auto vec_len : this->arr_vec_len) { + const size_t len = vec_len * simd::countof(); + + std::vector buf_data(len, 0); + std::vector buf_meta(vec_len, 0); + this->gen_rand_data(buf_data); + this->gen_rand_data(buf_meta); + + simd::VecType* data = reinterpret_cast(buf_data.data()); + simd::MetaType* meta = buf_meta.data(); + + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < this->iters_nb; ++i) { + for (size_t j = 0; j < vec_len; ++j) { + simd::VecType lo, hi; + + simd::VecType x = simd::load_to_reg(&data[j]); + + simd::unpack(meta[j], x, hi, lo); + simd::pack(lo, hi, x, meta[j]); + + simd::store_to_mem(&data[j], x); + } + } + uint64_t end = quadiron::hw_timer(); + double avg_cycles_nb = static_cast(end - start) + / static_cast(this->iters_nb) + / static_cast(vec_len); + + std::cout << "\t" << vec_len << "\t\t" << avg_cycles_nb << "\n"; + } + std::cout << "\n"; +} + TYPED_TEST(SimdTestFnt, PerfButterfly) // NOLINT { this->butterfly_perf(