[LLAMA_CPP] Enable batch size > 1 #905

Merged: 3 commits, Apr 18, 2024
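The change itself is contained: `LlamaCppSyncInferRequest::infer()` now walks every row of the `{batch, seq_len}` `input_ids` tensor, tags each row with its own llama.cpp sequence id, and returns logits shaped `{batch, seq_len, vocab}`. As a caller-side illustration (not part of this PR), a minimal sketch of batched inference through the plugin might look like the following; the `gpt2.gguf` path and the zero token ids are placeholders:

```cpp
#include "openvino/openvino.hpp"

int main() {
    ov::Core core;
    // "gpt2.gguf" is a placeholder path; the device name matches the tests below.
    ov::CompiledModel compiled = core.compile_model("gpt2.gguf", "LLAMA_CPP");
    ov::InferRequest request = compiled.create_infer_request();

    const size_t batch_size = 2, seq_len = 4;
    ov::Tensor input_ids{ov::element::i64, ov::Shape{batch_size, seq_len}};
    ov::Tensor position_ids{ov::element::i64, ov::Shape{batch_size, seq_len}};
    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t t = 0; t < seq_len; ++t) {
            input_ids.data<int64_t>()[b * seq_len + t] = 0;      // placeholder token id
            position_ids.data<int64_t>()[b * seq_len + t] = t;   // positions restart per sequence
        }
    }
    request.set_tensor("input_ids", input_ids);
    request.set_tensor("position_ids", position_ids);
    request.infer();

    // With this PR the logits come back per sequence: {batch_size, seq_len, vocab_size}.
    ov::Shape logits_shape = request.get_tensor("logits").get_shape();
    return logits_shape[0] == batch_size ? 0 : 1;
}
```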
Changes from all commits
39 changes: 24 additions & 15 deletions modules/llama_cpp_plugin/src/infer_request.cpp
@@ -72,25 +72,29 @@ void LlamaCppSyncInferRequest::infer() {
// all inputs without hardcode
OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64);
OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2);
size_t batch_size = input_ids_tensor_ptr->get_shape()[0];
size_t sequence_length = input_ids_tensor_ptr->get_shape()[1];

// llama_batch actually contains one sequence
llama_batch batch = llama_batch_init(sequence_length, /* embd = */ 0, /* n_seq_max = */ 1);
llama_batch batch = llama_batch_init(sequence_length * batch_size, /* embd = */ 0, /* n_seq_max = */ batch_size);
const int64_t* data_ptr = input_ids_tensor_ptr->data<int64_t>();

const int64_t* sequence_start_ptr = data_ptr /* + seq_idx */;

const int64_t* position_idx_ptr = position_ids_tensor_ptr->data<int64_t>();

for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) {
const int64_t token_id = sequence_start_ptr[tok_idx];
const int64_t position_id = position_idx_ptr[tok_idx];
llama_batch_add_reimpl(batch,
token_id,
position_id,
{0},
true); // the last `true` here is a marker that the logits for this
// token should be computed and returned
int num_sequences = batch_size;

for (int seq_idx = 0; seq_idx < num_sequences; seq_idx++) {
for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) {
const int64_t token_id = sequence_start_ptr[seq_idx * sequence_length + tok_idx];
const int64_t position_id = position_idx_ptr[seq_idx * sequence_length + tok_idx];
llama_batch_add_reimpl(batch,
token_id,
position_id,
{seq_idx},
true); // the last `true` here is a marker that the logits for this
// token should be computed and returned
}
}

llama_context* ctx = m_compiled_model_ptr->m_llama_ctx;
@@ -102,19 +102,24 @@ void LlamaCppSyncInferRequest::infer() {

size_t n_vocab = llama_n_vocab(m_compiled_model_ptr->m_llama_model_ptr);

ov::Tensor output_tensor{ov::element::Type_t::f32, {1, sequence_length, n_vocab}};
ov::Tensor output_tensor{ov::element::Type_t::f32, {batch_size, sequence_length, n_vocab}};
float* output_tensor_data_ptr = output_tensor.data<float>();

for (size_t pos = 0; pos < sequence_length; pos++) {
float* logits_from_llama = llama_get_logits_ith(ctx, pos);
std::copy(logits_from_llama, logits_from_llama + n_vocab, output_tensor_data_ptr + pos * n_vocab);
for (size_t batch_idx = 0; batch_idx < batch_size; batch_idx++) {
for (size_t seq_idx = 0; seq_idx < sequence_length; seq_idx++) {
size_t pos = batch_idx * sequence_length + seq_idx;
float* logits_from_llama = llama_get_logits_ith(ctx, pos);
std::copy(logits_from_llama, logits_from_llama + n_vocab, output_tensor_data_ptr + pos * n_vocab);
}
}

auto& logit_output = get_outputs()[0];
allocate_tensor(logit_output, [&output_tensor](ov::SoPtr<ov::ITensor>& tensor) {
allocate_tensor_impl(tensor, output_tensor.get_element_type(), output_tensor.get_shape());
output_tensor.copy_to(ov::make_tensor(tensor));
});

llama_batch_free(batch);
};
std::vector<ov::ProfilingInfo> LlamaCppSyncInferRequest::get_profiling_info() const {
OPENVINO_DEBUG << "llama_cpp_plugin: get_profiling_info() called\n";
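`llama_batch_add_reimpl` is the plugin's re-implementation of llama.cpp's batch-append helper and its definition is not shown in this diff. The sketch below is only an assumption of the behavior the loop above relies on, with field names taken from the public `llama_batch` struct:

```cpp
#include <vector>
#include "llama.h"

// Assumed behavior of llama_batch_add_reimpl (not the actual plugin source): append one
// token to the batch, tagging it with the KV-cache sequence(s) it belongs to.
static void llama_batch_add_reimpl(llama_batch& batch,
                                   llama_token id,
                                   llama_pos pos,
                                   const std::vector<llama_seq_id>& seq_ids,
                                   bool logits) {
    batch.token[batch.n_tokens] = id;                  // token to evaluate
    batch.pos[batch.n_tokens] = pos;                   // position within its own sequence
    batch.n_seq_id[batch.n_tokens] = static_cast<int32_t>(seq_ids.size());
    for (size_t i = 0; i < seq_ids.size(); ++i) {
        batch.seq_id[batch.n_tokens][i] = seq_ids[i];  // seq_idx from the loop above
    }
    batch.logits[batch.n_tokens] = logits;             // request logits for this token
    batch.n_tokens++;
}
```

Because each row of the input gets a distinct seq_id, llama.cpp keeps a separate KV-cache stream per sequence, which is what lets the rows of the batched tensor decode independently within a single `llama_decode` call.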
8 changes: 2 additions & 6 deletions modules/llama_cpp_plugin/src/plugin.cpp
@@ -39,9 +39,7 @@ std::shared_ptr<ov::ICompiledModel> LlamaCppPlugin::compile_model(const std::str
auto it = properties.find(ov::inference_num_threads.name());
if (it != properties.end()) {
num_threads = it->second.as<int>();
if (num_threads < 0) {
OPENVINO_THROW("INFERENCE_NUM_THREADS cannot be negative");
}
OPENVINO_ASSERT(num_threads >= 0, "INFERENCE_NUM_THREADS cannot be negative");
} else {
num_threads = m_num_threads;
}
@@ -52,9 +50,7 @@ void LlamaCppPlugin::set_property(const ov::AnyMap& properties) {
for (const auto& map_entry : properties) {
if (ov::inference_num_threads == map_entry.first) {
int num_threads = map_entry.second.as<int>();
if (num_threads < 0) {
OPENVINO_THROW("INFERENCE_NUM_THREADS cannot be negative");
}
OPENVINO_ASSERT(num_threads >= 0, "INFERENCE_NUM_THREADS cannot be negative");
m_num_threads = num_threads;
}
OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: setting property ", map_entry.first, "not implemented");
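The property being validated here is the standard `ov::inference_num_threads`. A minimal caller-side sketch (model path and thread count are placeholders) of passing it at compile time; a negative value would now trip the `OPENVINO_ASSERT` above:

```cpp
#include "openvino/openvino.hpp"

int main() {
    ov::Core core;
    // A negative thread count is rejected by the plugin's assert.
    ov::CompiledModel compiled =
        core.compile_model("gpt2.gguf", "LLAMA_CPP", ov::inference_num_threads(4));
    auto request = compiled.create_infer_request();
    return 0;
}
```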
3 changes: 3 additions & 0 deletions modules/llama_cpp_plugin/tests/common/CMakeLists.txt
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

project(llama_cpp_test_common)

add_library(llama_cpp_test_common STATIC
@@ -7,9 +7,12 @@
#include "model_fixture.hpp"
#include "openvino/openvino.hpp"

std::vector<float> infer_logits_for_tokens_with_positions(ov::InferRequest& lm,
const std::vector<int64_t>& tokens,
int64_t position_ids_start_value);
ov::InferRequest& infer_logits_for_tokens_with_positions(ov::InferRequest& infer_request,
const std::vector<int64_t>& tokens,
int64_t position_ids_start_value);
std::vector<float> infer_and_get_last_logits(ov::InferRequest& lm,
const std::vector<int64_t>& tokens,
int64_t position_ids_start_value);

std::vector<int64_t> generate_n_tokens_with_positions(ov::InferRequest& lm,
int64_t last_token,
29 changes: 18 additions & 11 deletions modules/llama_cpp_plugin/tests/common/src/llm_inference.cpp
@@ -1,29 +1,36 @@
#include "llm_inference.hpp"

std::vector<float> infer_logits_for_tokens_with_positions(ov::InferRequest& lm,
const std::vector<int64_t>& tokens,
int64_t position_ids_start_value) {
ov::InferRequest& infer_logits_for_tokens_with_positions(ov::InferRequest& infer_request,
const std::vector<int64_t>& tokens,
int64_t position_ids_start_value) {
auto input_ids_tensor = ov::Tensor(ov::element::Type_t::i64, {1, tokens.size()});
std::copy(tokens.begin(), tokens.end(), input_ids_tensor.data<int64_t>());
lm.set_tensor("input_ids", input_ids_tensor);
infer_request.set_tensor("input_ids", input_ids_tensor);

ov::Tensor position_ids = lm.get_tensor("position_ids");
ov::Tensor position_ids = infer_request.get_tensor("position_ids");
position_ids.set_shape(input_ids_tensor.get_shape());
std::iota(position_ids.data<int64_t>(),
position_ids.data<int64_t>() + position_ids.get_size(),
position_ids_start_value);

CompiledModelTest::fill_unused_inputs(lm, input_ids_tensor.get_shape());
lm.infer();
CompiledModelTest::fill_unused_inputs(infer_request, input_ids_tensor.get_shape());
infer_request.infer();
return infer_request;
}

size_t vocab_size = lm.get_tensor("logits").get_shape().back();
float* logits = lm.get_tensor("logits").data<float>() + (input_ids_tensor.get_size() - 1) * vocab_size;
// Infers all tokens, but returns only the logits for the last token in `tokens`.
std::vector<float> infer_and_get_last_logits(ov::InferRequest& infer_request,
const std::vector<int64_t>& tokens,
int64_t position_ids_start_value) {
infer_request = infer_logits_for_tokens_with_positions(infer_request, tokens, position_ids_start_value);
size_t vocab_size = infer_request.get_tensor("logits").get_shape().back();
float* logits = infer_request.get_tensor("logits").data<float>() + (tokens.size() - 1) * vocab_size;
std::vector<float> logits_vector(vocab_size);
std::copy(logits, logits + vocab_size, logits_vector.begin());
return logits_vector;
}

std::vector<int64_t> generate_n_tokens_with_positions(ov::InferRequest& lm,
std::vector<int64_t> generate_n_tokens_with_positions(ov::InferRequest& infer_request,
int64_t last_token,
size_t n_tokens,
int64_t position_ids_start_value) {
@@ -33,7 +40,7 @@ std::vector<int64_t> generate_n_tokens_with_positions(ov::InferRequest& lm,

while (cnt < n_tokens) {
std::vector<float> logits_curr =
infer_logits_for_tokens_with_positions(lm, {out_token_ids.back()}, cnt + position_ids_start_value);
infer_and_get_last_logits(infer_request, {out_token_ids.back()}, cnt + position_ids_start_value);
int64_t out_token = std::max_element(logits_curr.begin(), logits_curr.end()) - logits_curr.begin();
out_token_ids.push_back(out_token);
cnt++;
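A usage sketch (hypothetical snippet, not from the PR) of the renamed helper: one greedy decoding step built on `infer_and_get_last_logits`, mirroring the loop in `generate_n_tokens_with_positions`; the prompt token ids are placeholders.

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

#include "llm_inference.hpp"  // the test helpers declared earlier in this PR

// One greedy decoding step: run the prompt, take the argmax of the last token's logits.
int64_t greedy_next_token(ov::InferRequest& lm) {
    std::vector<int64_t> prompt{4, 8, 15, 16, 23, 42};  // placeholder token ids
    std::vector<float> last_logits =
        infer_and_get_last_logits(lm, prompt, /* position_ids_start_value = */ 0);
    return std::max_element(last_logits.begin(), last_logits.end()) - last_logits.begin();
}
```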
88 changes: 88 additions & 0 deletions modules/llama_cpp_plugin/tests/functional/src/batching.cpp
@@ -0,0 +1,88 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <gtest/gtest.h>

#include "llm_inference.hpp"

const std::string MODEL_FILE = ov::test::utils::getCurrentWorkingDir() + SEP + TEST_FILES_DIR + SEP + "gpt2.gguf";

class LlamaCppBatchingDimensionTest : public testing::TestWithParam<ov::Shape> {};

TEST_P(LlamaCppBatchingDimensionTest, BatchedOutputDimensionIsAlignedWithInputDimenstion) {
ov::Core core;
auto model = core.compile_model(MODEL_FILE, "LLAMA_CPP");
auto infer_request = model.create_infer_request();

auto batched_shape = GetParam();

auto input_tensor = ov::Tensor(ov::element::Type_t::i64, batched_shape);
std::fill(input_tensor.data<int64_t>(), input_tensor.data<int64_t>() + input_tensor.get_size(), 0);
infer_request.set_tensor("input_ids", input_tensor);
infer_request.set_tensor("position_ids", input_tensor);
infer_request.infer();
auto output_shape = infer_request.get_tensor("logits").get_shape();
ASSERT_EQ(output_shape.size(), 3); // (batch, input token, output logit distribution)
auto output_shape_without_logit_dimension = ov::Shape{output_shape[0], output_shape[1]};
ASSERT_EQ(batched_shape, output_shape_without_logit_dimension);
}

INSTANTIATE_TEST_SUITE_P(VariousBatchAndInputShapes,
LlamaCppBatchingDimensionTest,
::testing::Values(ov::Shape{2, 1}, ov::Shape{3, 12}, ov::Shape{13, 7}));

TEST(LlamaCppBatchingTest, BatchedResultIsIdenticalToSingleBatchResults) {
ov::Core core;
auto model = core.compile_model(MODEL_FILE, "LLAMA_CPP");
auto infer_request = model.create_infer_request();

std::vector<int64_t> mock_input_1{4, 8, 15, 16, 23, 42};
std::vector<int64_t> mock_input_2{1, 1, 2, 3, 5, 8};

ASSERT_EQ(mock_input_1.size(), mock_input_2.size());

infer_request = infer_logits_for_tokens_with_positions(infer_request, mock_input_1, 0);
auto unbatched_output_1_tensor = infer_request.get_tensor("logits");
size_t vocab_size = unbatched_output_1_tensor.get_shape().back();

auto unbatched_output_1 =
std::vector<float>(unbatched_output_1_tensor.data<float>(),
unbatched_output_1_tensor.data<float>() + mock_input_1.size() * vocab_size);

infer_request.reset_state();

infer_request = infer_logits_for_tokens_with_positions(infer_request, mock_input_2, 0);
auto unbatched_output_2_tensor = infer_request.get_tensor("logits");
auto unbatched_output_2 =
std::vector<float>(unbatched_output_2_tensor.data<float>(),
unbatched_output_2_tensor.data<float>() + mock_input_2.size() * vocab_size);
infer_request.reset_state();

auto batched_input_ids = ov::Tensor(ov::element::Type_t::i64, ov::Shape{2, mock_input_1.size()});
size_t midpoint_offset = mock_input_1.size();
auto end_offset = midpoint_offset * 2;

std::copy(mock_input_1.begin(), mock_input_1.end(), batched_input_ids.data<int64_t>());
std::copy(mock_input_2.begin(), mock_input_2.end(), batched_input_ids.data<int64_t>() + midpoint_offset);
infer_request.set_tensor("input_ids", batched_input_ids);

auto batched_position_ids = ov::Tensor(ov::element::Type_t::i64, ov::Shape{2, mock_input_1.size()});
std::iota(batched_position_ids.data<int64_t>(), batched_position_ids.data<int64_t>() + midpoint_offset, 0);
std::iota(batched_position_ids.data<int64_t>() + midpoint_offset,
batched_position_ids.data<int64_t>() + end_offset,
0);
infer_request.set_tensor("position_ids", batched_position_ids);
infer_request.infer();

auto batched_output = infer_request.get_tensor("logits");
auto batched_output_1 =
std::vector<float>(batched_output.data<float>(), batched_output.data<float>() + midpoint_offset * vocab_size);
auto batched_output_2 = std::vector<float>(batched_output.data<float>() + midpoint_offset * vocab_size,
batched_output.data<float>() + end_offset * vocab_size);

EXPECT_EQ(unbatched_output_1.size(), batched_output_1.size());
EXPECT_EQ(unbatched_output_2.size(), batched_output_2.size());

EXPECT_EQ(unbatched_output_1, batched_output_1);
EXPECT_EQ(unbatched_output_2, batched_output_2);
}
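For readers following the pointer arithmetic in this test: the batched output is row-major `{batch, seq_len, vocab}`, so the logits for one sequence form a contiguous block. A small helper sketch (not part of the PR) makes the layout explicit:

```cpp
#include <cstddef>
#include <vector>

#include "openvino/openvino.hpp"

// Assumes the "logits" tensor has shape {batch_size, seq_len, vocab_size}, as written by
// the batched infer() implementation earlier in this PR.
std::vector<float> logits_for_sequence(ov::InferRequest& request, size_t batch_idx) {
    ov::Tensor logits = request.get_tensor("logits");
    const ov::Shape shape = logits.get_shape();          // {batch, seq, vocab}
    const size_t seq_len = shape[1], vocab_size = shape[2];
    const float* begin = logits.data<float>() + batch_idx * seq_len * vocab_size;
    return std::vector<float>(begin, begin + seq_len * vocab_size);
}
```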
21 changes: 9 additions & 12 deletions modules/llama_cpp_plugin/tests/functional/src/reset_state.cpp
@@ -13,7 +13,7 @@ constexpr size_t NUM_TOKENS_TO_GENERATE = 64;
TEST_F(CompiledModelTest, ResetStateGPT2) {
// collect reference response tokens
ov::InferRequest lm = model.create_infer_request();
std::vector<float> logits_sun_ref = infer_logits_for_tokens_with_positions(lm, GPT2_SUN_PROMPT_TOKEN_IDS, 0);
std::vector<float> logits_sun_ref = infer_and_get_last_logits(lm, GPT2_SUN_PROMPT_TOKEN_IDS, 0);
std::vector<int64_t> out_token_ids_ref = generate_n_tokens_with_positions(lm,
get_token_from_logits(logits_sun_ref),
NUM_TOKENS_TO_GENERATE,
@@ -24,15 +24,13 @@ TEST_F(CompiledModelTest, ResetStateGPT2) {
SetUp();

ov::InferRequest lm_reset = model.create_infer_request();
std::vector<float> logits_lennon_reset =
infer_logits_for_tokens_with_positions(lm, GPT2_LENNON_PROMPT_TOKEN_IDS, 0);
std::vector<float> logits_lennon_reset = infer_and_get_last_logits(lm, GPT2_LENNON_PROMPT_TOKEN_IDS, 0);

lm_reset.reset_state();

std::vector<float> logits_sun_reset =
infer_logits_for_tokens_with_positions(lm_reset,
GPT2_SUN_PROMPT_TOKEN_IDS,
0); // GPT2_LENNON_PROMPT_TOKEN_IDS.size());
std::vector<float> logits_sun_reset = infer_and_get_last_logits(lm_reset,
GPT2_SUN_PROMPT_TOKEN_IDS,
0); // GPT2_LENNON_PROMPT_TOKEN_IDS.size());

std::vector<int64_t> out_token_ids_reset = generate_n_tokens_with_positions(lm_reset,
get_token_from_logits(logits_sun_reset),
@@ -44,14 +42,13 @@ TEST_F(CompiledModelTest, ResetStateGPT2) {
SetUp();

ov::InferRequest lm_bad = model.create_infer_request();
std::vector<float> logits_lennon_bad = infer_logits_for_tokens_with_positions(lm, GPT2_LENNON_PROMPT_TOKEN_IDS, 0);
std::vector<float> logits_lennon_bad = infer_and_get_last_logits(lm, GPT2_LENNON_PROMPT_TOKEN_IDS, 0);

// no reset_state on purpose

std::vector<float> logits_sun_bad =
infer_logits_for_tokens_with_positions(lm_reset,
GPT2_SUN_PROMPT_TOKEN_IDS,
0); // GPT2_LENNON_PROMPT_TOKEN_IDS.size());
std::vector<float> logits_sun_bad = infer_and_get_last_logits(lm_reset,
GPT2_SUN_PROMPT_TOKEN_IDS,
0); // GPT2_LENNON_PROMPT_TOKEN_IDS.size());

std::vector<int64_t> out_token_ids_bad = generate_n_tokens_with_positions(lm_reset,
get_token_from_logits(logits_sun_reset),
5 changes: 2 additions & 3 deletions modules/llama_cpp_plugin/tests/functional/src/threading.cpp
@@ -1,6 +1,5 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <gtest/gtest.h>

@@ -33,7 +32,7 @@ ov::CompiledModel get_model_with_thread_setting(int num_threads, ThreadSettingType
}

void infer_one_token_fn(ov::InferRequest& infer_request) {
infer_logits_for_tokens_with_positions(infer_request, {1337}, 0);
infer_and_get_last_logits(infer_request, {1337}, 0);
}

double measure_inference_speed_for_thread_count(int num_threads, ThreadSettingType thread_setting_type) {
@@ -55,7 +54,7 @@ TEST_P(LlamaCppThreadSettingTypeTest, NumThreadSettingDoesntFail) {

auto infer_request = model.create_infer_request();
std::vector<int64_t> mock_input_ids{1337, NUM_THREADS_TO_SET * 10};
infer_logits_for_tokens_with_positions(infer_request, mock_input_ids, 0);
infer_and_get_last_logits(infer_request, mock_input_ids, 0);
}

TEST_P(LlamaCppThreadSettingTypeTest, ThreadedExecutionIsFaster) {