[LLAMA_CPP] Enable batch size > 1 (#905)
* Add tests

* Add implementation

* Apply comments from previous PR
vshampor authored Apr 18, 2024
1 parent 5243f86 commit c4b3ef9
Showing 8 changed files with 152 additions and 50 deletions.
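As context for the diff below, a minimal sketch of how a caller can exercise the new batched path, mirroring the functional test added in batching.cpp (illustrative only; the gpt2.gguf path is a placeholder, and the shapes and token values are arbitrary):

#include <algorithm>
#include <cstdint>
#include <numeric>

#include "openvino/openvino.hpp"

int main() {
    ov::Core core;
    // Placeholder model path; the tests load a local gpt2.gguf the same way.
    auto compiled = core.compile_model("gpt2.gguf", "LLAMA_CPP");
    auto infer_request = compiled.create_infer_request();

    // Two sequences of six tokens each -> input shape (batch_size = 2, sequence_length = 6).
    ov::Tensor input_ids{ov::element::i64, {2, 6}};
    ov::Tensor position_ids{ov::element::i64, {2, 6}};
    std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), 0);
    for (size_t row = 0; row < 2; ++row) {
        // Positions restart at 0 for every sequence in the batch.
        std::iota(position_ids.data<int64_t>() + row * 6, position_ids.data<int64_t>() + (row + 1) * 6, 0);
    }
    infer_request.set_tensor("input_ids", input_ids);
    infer_request.set_tensor("position_ids", position_ids);
    infer_request.infer();

    // With this commit, logits come back as (batch_size, sequence_length, vocab_size).
    ov::Shape logits_shape = infer_request.get_tensor("logits").get_shape();
    return logits_shape.size() == 3 ? 0 : 1;
}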
39 changes: 24 additions & 15 deletions modules/llama_cpp_plugin/src/infer_request.cpp
@@ -72,25 +72,29 @@ void LlamaCppSyncInferRequest::infer() {
// all inputs without hardcode
OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64);
OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2);
size_t batch_size = input_ids_tensor_ptr->get_shape()[0];
size_t sequence_length = input_ids_tensor_ptr->get_shape()[1];

// llama_batch actually contains one sequence
llama_batch batch = llama_batch_init(sequence_length, /* embd = */ 0, /* n_seq_max = */ 1);
llama_batch batch = llama_batch_init(sequence_length * batch_size, /* embd = */ 0, /* n_seq_max = */ batch_size);
const int64_t* data_ptr = input_ids_tensor_ptr->data<int64_t>();

const int64_t* sequence_start_ptr = data_ptr /* + seq_idx */;

const int64_t* position_idx_ptr = position_ids_tensor_ptr->data<int64_t>();

for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) {
const int64_t token_id = sequence_start_ptr[tok_idx];
const int64_t position_id = position_idx_ptr[tok_idx];
llama_batch_add_reimpl(batch,
token_id,
position_id,
{0},
true); // the last `true` here is a marker that the logits for this
// token should be computed and returned
int num_sequences = batch_size;

for (int seq_idx = 0; seq_idx < num_sequences; seq_idx++) {
for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) {
const int64_t token_id = sequence_start_ptr[seq_idx * sequence_length + tok_idx];
const int64_t position_id = position_idx_ptr[seq_idx * sequence_length + tok_idx];
llama_batch_add_reimpl(batch,
token_id,
position_id,
{seq_idx},
true); // the last `true` here is a marker that the logits for this
// token should be computed and returned
}
}

llama_context* ctx = m_compiled_model_ptr->m_llama_ctx;
@@ -102,19 +102,24 @@ void LlamaCppSyncInferRequest::infer() {

size_t n_vocab = llama_n_vocab(m_compiled_model_ptr->m_llama_model_ptr);

ov::Tensor output_tensor{ov::element::Type_t::f32, {1, sequence_length, n_vocab}};
ov::Tensor output_tensor{ov::element::Type_t::f32, {batch_size, sequence_length, n_vocab}};
float* output_tensor_data_ptr = output_tensor.data<float>();

for (size_t pos = 0; pos < sequence_length; pos++) {
float* logits_from_llama = llama_get_logits_ith(ctx, pos);
std::copy(logits_from_llama, logits_from_llama + n_vocab, output_tensor_data_ptr + pos * n_vocab);
for (size_t batch_idx = 0; batch_idx < batch_size; batch_idx++) {
for (size_t seq_idx = 0; seq_idx < sequence_length; seq_idx++) {
size_t pos = batch_idx * sequence_length + seq_idx;
float* logits_from_llama = llama_get_logits_ith(ctx, pos);
std::copy(logits_from_llama, logits_from_llama + n_vocab, output_tensor_data_ptr + pos * n_vocab);
}
}

auto& logit_output = get_outputs()[0];
allocate_tensor(logit_output, [&output_tensor](ov::SoPtr<ov::ITensor>& tensor) {
allocate_tensor_impl(tensor, output_tensor.get_element_type(), output_tensor.get_shape());
output_tensor.copy_to(ov::make_tensor(tensor));
});

llama_batch_free(batch);
};
std::vector<ov::ProfilingInfo> LlamaCppSyncInferRequest::get_profiling_info() const {
OPENVINO_DEBUG << "llama_cpp_plugin: get_profiling_info() called\n";
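A side note on the layout the loops above rely on (sketch only, not part of the commit): tokens are appended to the single llama_batch row by row, so the index passed to llama_get_logits_ith and the offset into the output tensor follow the same flattened, row-major order.

#include <cstddef>

// Sketch of the index arithmetic implied by the loops in infer() above.
inline size_t flat_token_index(size_t batch_idx, size_t tok_idx, size_t sequence_length) {
    // Tokens were added row by row, one llama.cpp sequence id per input row.
    return batch_idx * sequence_length + tok_idx;
}

inline size_t logits_offset(size_t batch_idx, size_t tok_idx, size_t sequence_length, size_t n_vocab) {
    // Start of the n_vocab-wide slice for this token in the (batch, token, vocab) output tensor.
    return flat_token_index(batch_idx, tok_idx, sequence_length) * n_vocab;
}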
8 changes: 2 additions & 6 deletions modules/llama_cpp_plugin/src/plugin.cpp
@@ -39,9 +39,7 @@ std::shared_ptr<ov::ICompiledModel> LlamaCppPlugin::compile_model(const std::str
auto it = properties.find(ov::inference_num_threads.name());
if (it != properties.end()) {
num_threads = it->second.as<int>();
if (num_threads < 0) {
OPENVINO_THROW("INFERENCE_NUM_THREADS cannot be negative");
}
OPENVINO_ASSERT(num_threads >= 0, "INFERENCE_NUM_THREADS cannot be negative");
} else {
num_threads = m_num_threads;
}
@@ -52,9 +50,7 @@ void LlamaCppPlugin::set_property(const ov::AnyMap& properties) {
for (const auto& map_entry : properties) {
if (ov::inference_num_threads == map_entry.first) {
int num_threads = map_entry.second.as<int>();
if (num_threads < 0) {
OPENVINO_THROW("INFERENCE_NUM_THREADS cannot be negative");
}
OPENVINO_ASSERT(num_threads >= 0, "INFERENCE_NUM_THREADS cannot be negative");
m_num_threads = num_threads;
}
OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: setting property ", map_entry.first, "not implemented");
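For reference, a hedged sketch of how the thread-count property handled above might be supplied from user code (the gpt2.gguf path is a placeholder; ov::inference_num_threads is the property compile_model parses):

#include "openvino/openvino.hpp"

int main() {
    ov::Core core;
    // Negative values are rejected by the OPENVINO_ASSERT introduced in this change.
    auto compiled = core.compile_model("gpt2.gguf", "LLAMA_CPP", ov::inference_num_threads(8));
    return 0;
}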
3 changes: 3 additions & 0 deletions modules/llama_cpp_plugin/tests/common/CMakeLists.txt
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

project(llama_cpp_test_common)

add_library(llama_cpp_test_common STATIC
modules/llama_cpp_plugin/tests/common/include/llm_inference.hpp
@@ -7,9 +7,12 @@
#include "model_fixture.hpp"
#include "openvino/openvino.hpp"

std::vector<float> infer_logits_for_tokens_with_positions(ov::InferRequest& lm,
const std::vector<int64_t>& tokens,
int64_t position_ids_start_value);
ov::InferRequest& infer_logits_for_tokens_with_positions(ov::InferRequest& infer_request,
const std::vector<int64_t>& tokens,
int64_t position_ids_start_value);
std::vector<float> infer_and_get_last_logits(ov::InferRequest& lm,
const std::vector<int64_t>& tokens,
int64_t position_ids_start_value);

std::vector<int64_t> generate_n_tokens_with_positions(ov::InferRequest& lm,
int64_t last_token,
29 changes: 18 additions & 11 deletions modules/llama_cpp_plugin/tests/common/src/llm_inference.cpp
@@ -1,29 +1,36 @@
#include "llm_inference.hpp"

std::vector<float> infer_logits_for_tokens_with_positions(ov::InferRequest& lm,
const std::vector<int64_t>& tokens,
int64_t position_ids_start_value) {
ov::InferRequest& infer_logits_for_tokens_with_positions(ov::InferRequest& infer_request,
const std::vector<int64_t>& tokens,
int64_t position_ids_start_value) {
auto input_ids_tensor = ov::Tensor(ov::element::Type_t::i64, {1, tokens.size()});
std::copy(tokens.begin(), tokens.end(), input_ids_tensor.data<int64_t>());
lm.set_tensor("input_ids", input_ids_tensor);
infer_request.set_tensor("input_ids", input_ids_tensor);

ov::Tensor position_ids = lm.get_tensor("position_ids");
ov::Tensor position_ids = infer_request.get_tensor("position_ids");
position_ids.set_shape(input_ids_tensor.get_shape());
std::iota(position_ids.data<int64_t>(),
position_ids.data<int64_t>() + position_ids.get_size(),
position_ids_start_value);

CompiledModelTest::fill_unused_inputs(lm, input_ids_tensor.get_shape());
lm.infer();
CompiledModelTest::fill_unused_inputs(infer_request, input_ids_tensor.get_shape());
infer_request.infer();
return infer_request;
}

size_t vocab_size = lm.get_tensor("logits").get_shape().back();
float* logits = lm.get_tensor("logits").data<float>() + (input_ids_tensor.get_size() - 1) * vocab_size;
// Infers all tokens, but returns only the logits for the last token in `tokens`.
std::vector<float> infer_and_get_last_logits(ov::InferRequest& infer_request,
const std::vector<int64_t>& tokens,
int64_t position_ids_start_value) {
infer_request = infer_logits_for_tokens_with_positions(infer_request, tokens, position_ids_start_value);
size_t vocab_size = infer_request.get_tensor("logits").get_shape().back();
float* logits = infer_request.get_tensor("logits").data<float>() + (tokens.size() - 1) * vocab_size;
std::vector<float> logits_vector(vocab_size);
std::copy(logits, logits + vocab_size, logits_vector.begin());
return logits_vector;
}

std::vector<int64_t> generate_n_tokens_with_positions(ov::InferRequest& lm,
std::vector<int64_t> generate_n_tokens_with_positions(ov::InferRequest& infer_request,
int64_t last_token,
size_t n_tokens,
int64_t position_ids_start_value) {
@@ -33,7 +40,7 @@ std::vector<int64_t> generate_n_tokens_with_positions(ov::InferRequest& lm,

while (cnt < n_tokens) {
std::vector<float> logits_curr =
infer_logits_for_tokens_with_positions(lm, {out_token_ids.back()}, cnt + position_ids_start_value);
infer_and_get_last_logits(infer_request, {out_token_ids.back()}, cnt + position_ids_start_value);
int64_t out_token = std::max_element(logits_curr.begin(), logits_curr.end()) - logits_curr.begin();
out_token_ids.push_back(out_token);
cnt++;
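A brief usage sketch of the two helpers after the split above (illustrative only; `req` is assumed to be an ov::InferRequest created from a model compiled on the "LLAMA_CPP" device, as in the tests):

#include <cstdint>
#include <vector>

#include "llm_inference.hpp"

std::vector<float> example_usage(ov::InferRequest& req) {
    const std::vector<int64_t> prompt{4, 8, 15, 16, 23, 42};

    // Runs inference and hands the request back so the full batched "logits"
    // tensor (batch x tokens x vocab) can be inspected directly.
    ov::InferRequest& same_req = infer_logits_for_tokens_with_positions(req, prompt, /*position_ids_start_value=*/0);
    ov::Tensor all_logits = same_req.get_tensor("logits");
    (void)all_logits;

    // Runs inference again and returns only the last token's logit distribution.
    return infer_and_get_last_logits(req, prompt, 0);
}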
88 changes: 88 additions & 0 deletions modules/llama_cpp_plugin/tests/functional/src/batching.cpp
@@ -0,0 +1,88 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <gtest/gtest.h>

#include "llm_inference.hpp"

const std::string MODEL_FILE = ov::test::utils::getCurrentWorkingDir() + SEP + TEST_FILES_DIR + SEP + "gpt2.gguf";

class LlamaCppBatchingDimensionTest : public testing::TestWithParam<ov::Shape> {};

TEST_P(LlamaCppBatchingDimensionTest, BatchedOutputDimensionIsAlignedWithInputDimenstion) {
ov::Core core;
auto model = core.compile_model(MODEL_FILE, "LLAMA_CPP");
auto infer_request = model.create_infer_request();

auto batched_shape = GetParam();

auto input_tensor = ov::Tensor(ov::element::Type_t::i64, batched_shape);
std::fill(input_tensor.data<int64_t>(), input_tensor.data<int64_t>() + input_tensor.get_size(), 0);
infer_request.set_tensor("input_ids", input_tensor);
infer_request.set_tensor("position_ids", input_tensor);
infer_request.infer();
auto output_shape = infer_request.get_tensor("logits").get_shape();
ASSERT_EQ(output_shape.size(), 3); // (batch, input token, output logit distribution)
auto output_shape_without_logit_dimension = ov::Shape{output_shape[0], output_shape[1]};
ASSERT_EQ(batched_shape, output_shape_without_logit_dimension);
}

INSTANTIATE_TEST_SUITE_P(VariousBatchAndInputShapes,
LlamaCppBatchingDimensionTest,
::testing::Values(ov::Shape{2, 1}, ov::Shape{3, 12}, ov::Shape{13, 7}));

TEST(LlamaCppBatchingTest, BatchedResultIsIdenticalToSingleBatchResults) {
ov::Core core;
auto model = core.compile_model(MODEL_FILE, "LLAMA_CPP");
auto infer_request = model.create_infer_request();

std::vector<int64_t> mock_input_1{4, 8, 15, 16, 23, 42};
std::vector<int64_t> mock_input_2{1, 1, 2, 3, 5, 8};

ASSERT_EQ(mock_input_1.size(), mock_input_2.size());

infer_request = infer_logits_for_tokens_with_positions(infer_request, mock_input_1, 0);
auto unbatched_output_1_tensor = infer_request.get_tensor("logits");
size_t vocab_size = unbatched_output_1_tensor.get_shape().back();

auto unbatched_output_1 =
std::vector<float>(unbatched_output_1_tensor.data<float>(),
unbatched_output_1_tensor.data<float>() + mock_input_1.size() * vocab_size);

infer_request.reset_state();

infer_request = infer_logits_for_tokens_with_positions(infer_request, mock_input_2, 0);
auto unbatched_output_2_tensor = infer_request.get_tensor("logits");
auto unbatched_output_2 =
std::vector<float>(unbatched_output_2_tensor.data<float>(),
unbatched_output_2_tensor.data<float>() + mock_input_2.size() * vocab_size);
infer_request.reset_state();

auto batched_input_ids = ov::Tensor(ov::element::Type_t::i64, ov::Shape{2, mock_input_1.size()});
size_t midpoint_offset = mock_input_1.size();
auto end_offset = midpoint_offset * 2;

std::copy(mock_input_1.begin(), mock_input_1.end(), batched_input_ids.data<int64_t>());
std::copy(mock_input_2.begin(), mock_input_2.end(), batched_input_ids.data<int64_t>() + midpoint_offset);
infer_request.set_tensor("input_ids", batched_input_ids);

auto batched_position_ids = ov::Tensor(ov::element::Type_t::i64, ov::Shape{2, mock_input_1.size()});
std::iota(batched_position_ids.data<int64_t>(), batched_position_ids.data<int64_t>() + midpoint_offset, 0);
std::iota(batched_position_ids.data<int64_t>() + midpoint_offset,
batched_position_ids.data<int64_t>() + end_offset,
0);
infer_request.set_tensor("position_ids", batched_position_ids);
infer_request.infer();

auto batched_output = infer_request.get_tensor("logits");
auto batched_output_1 =
std::vector<float>(batched_output.data<float>(), batched_output.data<float>() + midpoint_offset * vocab_size);
auto batched_output_2 = std::vector<float>(batched_output.data<float>() + midpoint_offset * vocab_size,
batched_output.data<float>() + end_offset * vocab_size);

EXPECT_EQ(unbatched_output_1.size(), batched_output_1.size());
EXPECT_EQ(unbatched_output_2.size(), batched_output_2.size());

EXPECT_EQ(unbatched_output_1, batched_output_1);
EXPECT_EQ(unbatched_output_2, batched_output_2);
}
21 changes: 9 additions & 12 deletions modules/llama_cpp_plugin/tests/functional/src/reset_state.cpp
@@ -13,7 +13,7 @@ constexpr size_t NUM_TOKENS_TO_GENERATE = 64;
TEST_F(CompiledModelTest, ResetStateGPT2) {
// collect reference response tokens
ov::InferRequest lm = model.create_infer_request();
std::vector<float> logits_sun_ref = infer_logits_for_tokens_with_positions(lm, GPT2_SUN_PROMPT_TOKEN_IDS, 0);
std::vector<float> logits_sun_ref = infer_and_get_last_logits(lm, GPT2_SUN_PROMPT_TOKEN_IDS, 0);
std::vector<int64_t> out_token_ids_ref = generate_n_tokens_with_positions(lm,
get_token_from_logits(logits_sun_ref),
NUM_TOKENS_TO_GENERATE,
@@ -24,15 +24,13 @@ TEST_F(CompiledModelTest, ResetStateGPT2) {
SetUp();

ov::InferRequest lm_reset = model.create_infer_request();
std::vector<float> logits_lennon_reset =
infer_logits_for_tokens_with_positions(lm, GPT2_LENNON_PROMPT_TOKEN_IDS, 0);
std::vector<float> logits_lennon_reset = infer_and_get_last_logits(lm, GPT2_LENNON_PROMPT_TOKEN_IDS, 0);

lm_reset.reset_state();

std::vector<float> logits_sun_reset =
infer_logits_for_tokens_with_positions(lm_reset,
GPT2_SUN_PROMPT_TOKEN_IDS,
0); // GPT2_LENNON_PROMPT_TOKEN_IDS.size());
std::vector<float> logits_sun_reset = infer_and_get_last_logits(lm_reset,
GPT2_SUN_PROMPT_TOKEN_IDS,
0); // GPT2_LENNON_PROMPT_TOKEN_IDS.size());

std::vector<int64_t> out_token_ids_reset = generate_n_tokens_with_positions(lm_reset,
get_token_from_logits(logits_sun_reset),
@@ -44,14 +42,13 @@ TEST_F(CompiledModelTest, ResetStateGPT2) {
SetUp();

ov::InferRequest lm_bad = model.create_infer_request();
std::vector<float> logits_lennon_bad = infer_logits_for_tokens_with_positions(lm, GPT2_LENNON_PROMPT_TOKEN_IDS, 0);
std::vector<float> logits_lennon_bad = infer_and_get_last_logits(lm, GPT2_LENNON_PROMPT_TOKEN_IDS, 0);

// no reset_state on purpose

std::vector<float> logits_sun_bad =
infer_logits_for_tokens_with_positions(lm_reset,
GPT2_SUN_PROMPT_TOKEN_IDS,
0); // GPT2_LENNON_PROMPT_TOKEN_IDS.size());
std::vector<float> logits_sun_bad = infer_and_get_last_logits(lm_reset,
GPT2_SUN_PROMPT_TOKEN_IDS,
0); // GPT2_LENNON_PROMPT_TOKEN_IDS.size());

std::vector<int64_t> out_token_ids_bad = generate_n_tokens_with_positions(lm_reset,
get_token_from_logits(logits_sun_reset),
5 changes: 2 additions & 3 deletions modules/llama_cpp_plugin/tests/functional/src/threading.cpp
@@ -1,6 +1,5 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <gtest/gtest.h>

@@ -33,7 +32,7 @@ ov::CompiledModel get_model_with_thread_setting(int num_threads, ThreadSettingType
}

void infer_one_token_fn(ov::InferRequest& infer_request) {
infer_logits_for_tokens_with_positions(infer_request, {1337}, 0);
infer_and_get_last_logits(infer_request, {1337}, 0);
}

double measure_inference_speed_for_thread_count(int num_threads, ThreadSettingType thread_setting_type) {
@@ -55,7 +54,7 @@ TEST_P(LlamaCppThreadSettingTypeTest, NumThreadSettingDoesntFail) {

auto infer_request = model.create_infer_request();
std::vector<int64_t> mock_input_ids{1337, NUM_THREADS_TO_SET * 10};
infer_logits_for_tokens_with_positions(infer_request, mock_input_ids, 0);
infer_and_get_last_logits(infer_request, mock_input_ids, 0);
}

TEST_P(LlamaCppThreadSettingTypeTest, ThreadedExecutionIsFaster) {
