From c4b3ef992eeb311d55f70ba4bb7b2b714dcb587c Mon Sep 17 00:00:00 2001
From: Vasily Shamporov
Date: Thu, 18 Apr 2024 08:43:42 +0200
Subject: [PATCH] [LLAMA_CPP] Enable batch size > 1 (#905)

* Add tests

* Add implementation

* Apply comments from previous PR
---
 .../llama_cpp_plugin/src/infer_request.cpp    | 39 ++++----
 modules/llama_cpp_plugin/src/plugin.cpp       |  8 +-
 .../tests/common/CMakeLists.txt               |  3 +
 .../tests/common/include/llm_inference.hpp    |  9 +-
 .../tests/common/src/llm_inference.cpp        | 29 +++---
 .../tests/functional/src/batching.cpp         | 88 +++++++++++++++++++
 .../tests/functional/src/reset_state.cpp      | 21 ++---
 .../tests/functional/src/threading.cpp        |  5 +-
 8 files changed, 152 insertions(+), 50 deletions(-)
 create mode 100644 modules/llama_cpp_plugin/tests/functional/src/batching.cpp

diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp
index 5efd868d8..76fba58cd 100644
--- a/modules/llama_cpp_plugin/src/infer_request.cpp
+++ b/modules/llama_cpp_plugin/src/infer_request.cpp
@@ -72,25 +72,29 @@ void LlamaCppSyncInferRequest::infer() {
     // all inputs without hardcode
     OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64);
     OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2);
+    size_t batch_size = input_ids_tensor_ptr->get_shape()[0];
     size_t sequence_length = input_ids_tensor_ptr->get_shape()[1];
 
-    // llama_batch actually contains one sequence
-    llama_batch batch = llama_batch_init(sequence_length, /* embd = */ 0, /* n_seq_max = */ 1);
+    llama_batch batch = llama_batch_init(sequence_length * batch_size, /* embd = */ 0, /* n_seq_max = */ batch_size);
     const int64_t* data_ptr = input_ids_tensor_ptr->data<int64_t>();
 
     const int64_t* sequence_start_ptr = data_ptr /* + seq_idx */;
 
     const int64_t* position_idx_ptr = position_ids_tensor_ptr->data<int64_t>();
 
-    for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) {
-        const int64_t token_id = sequence_start_ptr[tok_idx];
-        const int64_t position_id = position_idx_ptr[tok_idx];
-        llama_batch_add_reimpl(batch,
-                               token_id,
-                               position_id,
-                               {0},
-                               true);  // the last `true` here is a marker that the logits for this
-                                       // token should be computed and returned
+    int num_sequences = batch_size;
+
+    for (int seq_idx = 0; seq_idx < num_sequences; seq_idx++) {
+        for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) {
+            const int64_t token_id = sequence_start_ptr[seq_idx * sequence_length + tok_idx];
+            const int64_t position_id = position_idx_ptr[seq_idx * sequence_length + tok_idx];
+            llama_batch_add_reimpl(batch,
+                                   token_id,
+                                   position_id,
+                                   {seq_idx},
+                                   true);  // the last `true` here is a marker that the logits for this
+                                           // token should be computed and returned
+        }
     }
 
     llama_context* ctx = m_compiled_model_ptr->m_llama_ctx;
@@ -102,12 +106,15 @@ void LlamaCppSyncInferRequest::infer() {
     size_t n_vocab = llama_n_vocab(m_compiled_model_ptr->m_llama_model_ptr);
 
-    ov::Tensor output_tensor{ov::element::Type_t::f32, {1, sequence_length, n_vocab}};
+    ov::Tensor output_tensor{ov::element::Type_t::f32, {batch_size, sequence_length, n_vocab}};
     float* output_tensor_data_ptr = output_tensor.data<float>();
 
-    for (size_t pos = 0; pos < sequence_length; pos++) {
-        float* logits_from_llama = llama_get_logits_ith(ctx, pos);
-        std::copy(logits_from_llama, logits_from_llama + n_vocab, output_tensor_data_ptr + pos * n_vocab);
+    for (size_t batch_idx = 0; batch_idx < batch_size; batch_idx++) {
+        for (size_t seq_idx = 0; seq_idx < sequence_length; seq_idx++) {
+            size_t pos = batch_idx * sequence_length + seq_idx;
+            float* logits_from_llama = llama_get_logits_ith(ctx, pos);
+            std::copy(logits_from_llama, logits_from_llama + n_vocab, output_tensor_data_ptr + pos * n_vocab);
+        }
     }
 
     auto& logit_output = get_outputs()[0];
 
@@ -115,6 +122,8 @@
         allocate_tensor_impl(tensor, output_tensor.get_element_type(), output_tensor.get_shape());
         output_tensor.copy_to(ov::make_tensor(tensor));
     });
+
+    llama_batch_free(batch);
 };
 std::vector<ov::ProfilingInfo> LlamaCppSyncInferRequest::get_profiling_info() const {
     OPENVINO_DEBUG << "llama_cpp_plugin: get_profiling_info() called\n";
diff --git a/modules/llama_cpp_plugin/src/plugin.cpp b/modules/llama_cpp_plugin/src/plugin.cpp
index c96bf22c2..737c326ac 100644
--- a/modules/llama_cpp_plugin/src/plugin.cpp
+++ b/modules/llama_cpp_plugin/src/plugin.cpp
@@ -39,9 +39,7 @@ std::shared_ptr<ov::ICompiledModel> LlamaCppPlugin::compile_model(const std::str
     auto it = properties.find(ov::inference_num_threads.name());
     if (it != properties.end()) {
         num_threads = it->second.as<int>();
-        if (num_threads < 0) {
-            OPENVINO_THROW("INFERENCE_NUM_THREADS cannot be negative");
-        }
+        OPENVINO_ASSERT(num_threads >= 0, "INFERENCE_NUM_THREADS cannot be negative");
     } else {
         num_threads = m_num_threads;
     }
@@ -52,9 +50,7 @@ void LlamaCppPlugin::set_property(const ov::AnyMap& properties) {
     for (const auto& map_entry : properties) {
         if (ov::inference_num_threads == map_entry.first) {
             int num_threads = map_entry.second.as<int>();
-            if (num_threads < 0) {
-                OPENVINO_THROW("INFERENCE_NUM_THREADS cannot be negative");
-            }
+            OPENVINO_ASSERT(num_threads >= 0, "INFERENCE_NUM_THREADS cannot be negative");
             m_num_threads = num_threads;
         }
         OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: setting property ", map_entry.first, "not implemented");
diff --git a/modules/llama_cpp_plugin/tests/common/CMakeLists.txt b/modules/llama_cpp_plugin/tests/common/CMakeLists.txt
index 3d175ddc3..71e9b1e0f 100644
--- a/modules/llama_cpp_plugin/tests/common/CMakeLists.txt
+++ b/modules/llama_cpp_plugin/tests/common/CMakeLists.txt
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 project(llama_cpp_test_common)
 
 add_library(llama_cpp_test_common STATIC
diff --git a/modules/llama_cpp_plugin/tests/common/include/llm_inference.hpp b/modules/llama_cpp_plugin/tests/common/include/llm_inference.hpp
index 445222de0..eb0e7c1e4 100644
--- a/modules/llama_cpp_plugin/tests/common/include/llm_inference.hpp
+++ b/modules/llama_cpp_plugin/tests/common/include/llm_inference.hpp
@@ -7,9 +7,12 @@
 #include "model_fixture.hpp"
 #include "openvino/openvino.hpp"
 
-std::vector<float> infer_logits_for_tokens_with_positions(ov::InferRequest& lm,
-                                                          const std::vector<int64_t>& tokens,
-                                                          int64_t position_ids_start_value);
+ov::InferRequest& infer_logits_for_tokens_with_positions(ov::InferRequest& infer_request,
+                                                         const std::vector<int64_t>& tokens,
+                                                         int64_t position_ids_start_value);
+std::vector<float> infer_and_get_last_logits(ov::InferRequest& lm,
+                                             const std::vector<int64_t>& tokens,
+                                             int64_t position_ids_start_value);
 
 std::vector<int64_t> generate_n_tokens_with_positions(ov::InferRequest& lm,
                                                       int64_t last_token,
diff --git a/modules/llama_cpp_plugin/tests/common/src/llm_inference.cpp b/modules/llama_cpp_plugin/tests/common/src/llm_inference.cpp
index 803d9e999..3bf532249 100644
--- a/modules/llama_cpp_plugin/tests/common/src/llm_inference.cpp
+++ b/modules/llama_cpp_plugin/tests/common/src/llm_inference.cpp
@@ -1,29 +1,36 @@
 #include "llm_inference.hpp"
 
-std::vector<float> infer_logits_for_tokens_with_positions(ov::InferRequest& lm,
-                                                          const std::vector<int64_t>& tokens,
-                                                          int64_t position_ids_start_value) {
+ov::InferRequest& infer_logits_for_tokens_with_positions(ov::InferRequest& infer_request,
+                                                         const std::vector<int64_t>& tokens,
+                                                         int64_t position_ids_start_value) {
     auto input_ids_tensor = ov::Tensor(ov::element::Type_t::i64, {1, tokens.size()});
     std::copy(tokens.begin(), tokens.end(), input_ids_tensor.data<int64_t>());
-    lm.set_tensor("input_ids", input_ids_tensor);
+    infer_request.set_tensor("input_ids", input_ids_tensor);
 
-    ov::Tensor position_ids = lm.get_tensor("position_ids");
+    ov::Tensor position_ids = infer_request.get_tensor("position_ids");
     position_ids.set_shape(input_ids_tensor.get_shape());
     std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), position_ids_start_value);
 
-    CompiledModelTest::fill_unused_inputs(lm, input_ids_tensor.get_shape());
-    lm.infer();
+    CompiledModelTest::fill_unused_inputs(infer_request, input_ids_tensor.get_shape());
+    infer_request.infer();
+    return infer_request;
+}
 
-    size_t vocab_size = lm.get_tensor("logits").get_shape().back();
-    float* logits = lm.get_tensor("logits").data<float>() + (input_ids_tensor.get_size() - 1) * vocab_size;
+// Infers all tokens, but returns only the logits for the last token in `tokens`.
+std::vector<float> infer_and_get_last_logits(ov::InferRequest& infer_request,
+                                             const std::vector<int64_t>& tokens,
+                                             int64_t position_ids_start_value) {
+    infer_request = infer_logits_for_tokens_with_positions(infer_request, tokens, position_ids_start_value);
+    size_t vocab_size = infer_request.get_tensor("logits").get_shape().back();
+    float* logits = infer_request.get_tensor("logits").data<float>() + (tokens.size() - 1) * vocab_size;
     std::vector<float> logits_vector(vocab_size);
     std::copy(logits, logits + vocab_size, logits_vector.begin());
     return logits_vector;
 }
 
-std::vector<int64_t> generate_n_tokens_with_positions(ov::InferRequest& lm,
+std::vector<int64_t> generate_n_tokens_with_positions(ov::InferRequest& infer_request,
                                                       int64_t last_token,
                                                       size_t n_tokens,
                                                       int64_t position_ids_start_value) {
@@ -33,7 +40,7 @@ std::vector<int64_t> generate_n_tokens_with_positions(ov::InferRequest& lm,
 
     while (cnt < n_tokens) {
         std::vector<float> logits_curr =
-            infer_logits_for_tokens_with_positions(lm, {out_token_ids.back()}, cnt + position_ids_start_value);
+            infer_and_get_last_logits(infer_request, {out_token_ids.back()}, cnt + position_ids_start_value);
         int64_t out_token = std::max_element(logits_curr.begin(), logits_curr.end()) - logits_curr.begin();
         out_token_ids.push_back(out_token);
         cnt++;
diff --git a/modules/llama_cpp_plugin/tests/functional/src/batching.cpp b/modules/llama_cpp_plugin/tests/functional/src/batching.cpp
new file mode 100644
index 000000000..12efcb27b
--- /dev/null
+++ b/modules/llama_cpp_plugin/tests/functional/src/batching.cpp
@@ -0,0 +1,88 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <gtest/gtest.h>
+
+#include "llm_inference.hpp"
+
+const std::string MODEL_FILE = ov::test::utils::getCurrentWorkingDir() + SEP + TEST_FILES_DIR + SEP + "gpt2.gguf";
+
+class LlamaCppBatchingDimensionTest : public testing::TestWithParam<ov::Shape> {};
+
+TEST_P(LlamaCppBatchingDimensionTest, BatchedOutputDimensionIsAlignedWithInputDimension) {
+    ov::Core core;
+    auto model = core.compile_model(MODEL_FILE, "LLAMA_CPP");
+    auto infer_request = model.create_infer_request();
+
+    auto batched_shape = GetParam();
+
+    auto input_tensor = ov::Tensor(ov::element::Type_t::i64, batched_shape);
+    std::fill(input_tensor.data<int64_t>(), input_tensor.data<int64_t>() + input_tensor.get_size(), 0);
+    infer_request.set_tensor("input_ids", input_tensor);
+ infer_request.set_tensor("position_ids", input_tensor); + infer_request.infer(); + auto output_shape = infer_request.get_tensor("logits").get_shape(); + ASSERT_EQ(output_shape.size(), 3); // (batch, input token, output logit distribution) + auto output_shape_without_logit_dimension = ov::Shape{output_shape[0], output_shape[1]}; + ASSERT_EQ(batched_shape, output_shape_without_logit_dimension); +} + +INSTANTIATE_TEST_SUITE_P(VariousBatchAndInputShapes, + LlamaCppBatchingDimensionTest, + ::testing::Values(ov::Shape{2, 1}, ov::Shape{3, 12}, ov::Shape{13, 7})); + +TEST(LlamaCppBatchingTest, BatchedResultIsIdenticalToSingleBatchResults) { + ov::Core core; + auto model = core.compile_model(MODEL_FILE, "LLAMA_CPP"); + auto infer_request = model.create_infer_request(); + + std::vector mock_input_1{4, 8, 15, 16, 23, 42}; + std::vector mock_input_2{1, 1, 2, 3, 5, 8}; + + ASSERT_EQ(mock_input_1.size(), mock_input_2.size()); + + infer_request = infer_logits_for_tokens_with_positions(infer_request, mock_input_1, 0); + auto unbatched_output_1_tensor = infer_request.get_tensor("logits"); + size_t vocab_size = unbatched_output_1_tensor.get_shape().back(); + + auto unbatched_output_1 = + std::vector(unbatched_output_1_tensor.data(), + unbatched_output_1_tensor.data() + mock_input_1.size() * vocab_size); + + infer_request.reset_state(); + + infer_request = infer_logits_for_tokens_with_positions(infer_request, mock_input_2, 0); + auto unbatched_output_2_tensor = infer_request.get_tensor("logits"); + auto unbatched_output_2 = + std::vector(unbatched_output_2_tensor.data(), + unbatched_output_2_tensor.data() + mock_input_2.size() * vocab_size); + infer_request.reset_state(); + + auto batched_input_ids = ov::Tensor(ov::element::Type_t::i64, ov::Shape{2, mock_input_1.size()}); + size_t midpoint_offset = mock_input_1.size(); + auto end_offset = midpoint_offset * 2; + + std::copy(mock_input_1.begin(), mock_input_1.end(), batched_input_ids.data()); + std::copy(mock_input_2.begin(), mock_input_2.end(), batched_input_ids.data() + midpoint_offset); + infer_request.set_tensor("input_ids", batched_input_ids); + + auto batched_position_ids = ov::Tensor(ov::element::Type_t::i64, ov::Shape{2, mock_input_1.size()}); + std::iota(batched_position_ids.data(), batched_position_ids.data() + midpoint_offset, 0); + std::iota(batched_position_ids.data() + midpoint_offset, + batched_position_ids.data() + end_offset, + 0); + infer_request.set_tensor("position_ids", batched_position_ids); + infer_request.infer(); + + auto batched_output = infer_request.get_tensor("logits"); + auto batched_output_1 = + std::vector(batched_output.data(), batched_output.data() + midpoint_offset * vocab_size); + auto batched_output_2 = std::vector(batched_output.data() + midpoint_offset * vocab_size, + batched_output.data() + end_offset * vocab_size); + + EXPECT_EQ(unbatched_output_1.size(), batched_output_1.size()); + EXPECT_EQ(unbatched_output_2.size(), batched_output_2.size()); + + EXPECT_EQ(unbatched_output_1, batched_output_1); + EXPECT_EQ(unbatched_output_2, batched_output_2); +} diff --git a/modules/llama_cpp_plugin/tests/functional/src/reset_state.cpp b/modules/llama_cpp_plugin/tests/functional/src/reset_state.cpp index be5bf7986..020867000 100644 --- a/modules/llama_cpp_plugin/tests/functional/src/reset_state.cpp +++ b/modules/llama_cpp_plugin/tests/functional/src/reset_state.cpp @@ -13,7 +13,7 @@ constexpr size_t NUM_TOKENS_TO_GENERATE = 64; TEST_F(CompiledModelTest, ResetStateGPT2) { // collect reference response tokens ov::InferRequest lm = 
     ov::InferRequest lm = model.create_infer_request();
-    std::vector<float> logits_sun_ref = infer_logits_for_tokens_with_positions(lm, GPT2_SUN_PROMPT_TOKEN_IDS, 0);
+    std::vector<float> logits_sun_ref = infer_and_get_last_logits(lm, GPT2_SUN_PROMPT_TOKEN_IDS, 0);
     std::vector<int64_t> out_token_ids_ref = generate_n_tokens_with_positions(lm,
                                                                               get_token_from_logits(logits_sun_ref),
                                                                               NUM_TOKENS_TO_GENERATE,
@@ -24,15 +24,13 @@
 
     SetUp();
     ov::InferRequest lm_reset = model.create_infer_request();
-    std::vector<float> logits_lennon_reset =
-        infer_logits_for_tokens_with_positions(lm, GPT2_LENNON_PROMPT_TOKEN_IDS, 0);
+    std::vector<float> logits_lennon_reset = infer_and_get_last_logits(lm, GPT2_LENNON_PROMPT_TOKEN_IDS, 0);
 
     lm_reset.reset_state();
 
-    std::vector<float> logits_sun_reset =
-        infer_logits_for_tokens_with_positions(lm_reset,
-                                               GPT2_SUN_PROMPT_TOKEN_IDS,
-                                               0);  // GPT2_LENNON_PROMPT_TOKEN_IDS.size());
+    std::vector<float> logits_sun_reset = infer_and_get_last_logits(lm_reset,
+                                                                    GPT2_SUN_PROMPT_TOKEN_IDS,
+                                                                    0);  // GPT2_LENNON_PROMPT_TOKEN_IDS.size());
 
     std::vector<int64_t> out_token_ids_reset = generate_n_tokens_with_positions(lm_reset,
                                                                                 get_token_from_logits(logits_sun_reset),
@@ -44,14 +42,13 @@
 
     SetUp();
     ov::InferRequest lm_bad = model.create_infer_request();
-    std::vector<float> logits_lennon_bad = infer_logits_for_tokens_with_positions(lm, GPT2_LENNON_PROMPT_TOKEN_IDS, 0);
+    std::vector<float> logits_lennon_bad = infer_and_get_last_logits(lm, GPT2_LENNON_PROMPT_TOKEN_IDS, 0);
 
     // no reset_state on purpose
 
-    std::vector<float> logits_sun_bad =
-        infer_logits_for_tokens_with_positions(lm_reset,
-                                               GPT2_SUN_PROMPT_TOKEN_IDS,
-                                               0);  // GPT2_LENNON_PROMPT_TOKEN_IDS.size());
+    std::vector<float> logits_sun_bad = infer_and_get_last_logits(lm_reset,
+                                                                  GPT2_SUN_PROMPT_TOKEN_IDS,
+                                                                  0);  // GPT2_LENNON_PROMPT_TOKEN_IDS.size());
 
     std::vector<int64_t> out_token_ids_bad = generate_n_tokens_with_positions(lm_reset,
                                                                               get_token_from_logits(logits_sun_reset),
diff --git a/modules/llama_cpp_plugin/tests/functional/src/threading.cpp b/modules/llama_cpp_plugin/tests/functional/src/threading.cpp
index c777e6931..09ab3cd5a 100644
--- a/modules/llama_cpp_plugin/tests/functional/src/threading.cpp
+++ b/modules/llama_cpp_plugin/tests/functional/src/threading.cpp
@@ -1,6 +1,5 @@
 // Copyright (C) 2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
-//
 
 #include 
 
@@ -33,7 +32,7 @@ ov::CompiledModel get_model_with_thread_setting(int num_threads, ThreadSettingTy
 }
 
 void infer_one_token_fn(ov::InferRequest& infer_request) {
-    infer_logits_for_tokens_with_positions(infer_request, {1337}, 0);
+    infer_and_get_last_logits(infer_request, {1337}, 0);
 }
 
 double measure_inference_speed_for_thread_count(int num_threads, ThreadSettingType thread_setting_type) {
@@ -55,7 +54,7 @@ TEST_P(LlamaCppThreadSettingTypeTest, NumThreadSettingDoesntFail) {
     auto infer_request = model.create_infer_request();
 
     std::vector<int64_t> mock_input_ids{1337, NUM_THREADS_TO_SET * 10};
-    infer_logits_for_tokens_with_positions(infer_request, mock_input_ids, 0);
+    infer_and_get_last_logits(infer_request, mock_input_ids, 0);
 }
 
 TEST_P(LlamaCppThreadSettingTypeTest, ThreadedExecutionIsFaster) {
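
Reviewer note (not part of the patch): with this change a client can stack several equal-length
prompts along the first dimension of "input_ids"/"position_ids" and read the per-sequence results
back from a {batch, seq_len, vocab}-shaped "logits" tensor, mirroring the new batching tests above.
A minimal usage sketch, assuming a local "gpt2.gguf" file; the model path and token values below
are placeholders for illustration, not taken from the patch:

    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    #include "openvino/openvino.hpp"

    int main() {
        ov::Core core;
        auto model = core.compile_model("gpt2.gguf", "LLAMA_CPP");  // placeholder model path
        auto request = model.create_infer_request();

        const size_t batch = 2, seq_len = 3;
        std::vector<int64_t> tokens = {4, 8, 15, 16, 23, 42};  // two prompts of 3 tokens each

        ov::Tensor input_ids(ov::element::i64, {batch, seq_len});
        std::copy(tokens.begin(), tokens.end(), input_ids.data<int64_t>());

        // Positions restart at 0 for every sequence in the batch.
        ov::Tensor position_ids(ov::element::i64, {batch, seq_len});
        for (size_t b = 0; b < batch; ++b) {
            std::iota(position_ids.data<int64_t>() + b * seq_len,
                      position_ids.data<int64_t>() + (b + 1) * seq_len,
                      int64_t{0});
        }

        request.set_tensor("input_ids", input_ids);
        request.set_tensor("position_ids", position_ids);
        request.infer();

        // One logit row per input token: shape {batch, seq_len, vocab}.
        auto logits_shape = request.get_tensor("logits").get_shape();
        return logits_shape.size() == 3 ? 0 : 1;
    }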