Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update vision_encoder.cpp #7

Merged
merged 1 commit into from
Dec 6, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 51 additions & 10 deletions src/cpp/src/visual_language/vision_encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -309,35 +309,76 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
std::vector<std::vector<clip_image_u8>> imgs = ::slice_image(source, max_slice_nums, scale_resolution, patch_size, never_split);
std::vector<std::vector<ov::Tensor>> results;
std::vector<std::vector<ImageSize>> sizes;
const size_t channels = 3;

std::vector<std::vector<clip_image_f32>> preprocessed{imgs.size()};
size_t max_h = 0, max_w = 0, n_images = 0;
std::transform(imgs.begin(), imgs.end(), preprocessed.begin(), [&ctx_clip, &max_h, &max_w, &n_images](const std::vector<clip_image_u8>& row) {
size_t max_h = 0, max_w = 0, n_images = 0, max_size = 0;
std::transform(imgs.begin(), imgs.end(), preprocessed.begin(), [&ctx_clip, &max_h, &max_w, &max_size, &n_images](const std::vector<clip_image_u8>& row) {
std::vector<clip_image_f32> processed_row{row.size()};
std::transform(row.begin(), row.end(), processed_row.begin(), [&ctx_clip, &max_h, &max_w, &n_images](const clip_image_u8& raw) {
std::transform(row.begin(), row.end(), processed_row.begin(), [&ctx_clip, &max_h, &max_w, &max_size, &n_images](const clip_image_u8& raw) {
clip_image_f32 im = clip_image_preprocess(ctx_clip, raw);
max_h = std::max(size_t(im.ny), max_h);
max_w = std::max(size_t(im.nx), max_w);
if (size_t(im.ny) * size_t(im.nx) > max_size) {
max_size = size_t(im.ny) * size_t(im.nx);
max_h = size_t(im.ny);
max_w = size_t(im.nx);
}
++n_images;
return im;
});
return processed_row;
});

ov::Tensor batched_images{ov::element::f32, {n_images, 3, max_h, max_w}};
float* batched_data = batched_images.data<float>();
ov::Tensor pixel_values{ov::element::f32, {n_images, channels, patch_size, max_size / patch_size}};
size_t d3_all_pixel = pixel_values.get_shape().at(3);
float* pixel_value_data = pixel_values.data<float>();

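// Repack each CHW image into a [1, channels, patch_size, h * w / patch_size] layout; images smaller than max_size fill only the start of each row, and the remainder is intended to be zero padding (NOTE(review): confirm the tensor is zero-initialized).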
const clip_image_f32& resized_preprocessed = preprocessed.at(0).at(0);
std::copy(resized_preprocessed.buf.begin(), resized_preprocessed.buf.end(), batched_data);
size_t img_h = resized_preprocessed.ny;
size_t img_w = resized_preprocessed.nx;
ov::Tensor clip_img{ov::element::f32, {1, channels, img_h, img_w}};
float* clip_data = clip_img.data<float>();
std::copy(resized_preprocessed.buf.begin(), resized_preprocessed.buf.end(), clip_data);
ov::Tensor clip_pixel_values = preprocess_for_encoder(clip_img, patch_size);

float* clip_value_data = clip_pixel_values.data<float>();
size_t batch_pixel = 1;
size_t d3_clip_pixel = clip_pixel_values.get_shape().at(3);
for (size_t c_idx = 0; c_idx < channels; ++c_idx) {
for (size_t k_idx = 0; k_idx < patch_size; k_idx++) {
std::copy(clip_value_data, clip_value_data + d3_clip_pixel, pixel_value_data);
clip_value_data += d3_clip_pixel;
pixel_value_data += d3_all_pixel;
}
}

if (1 < preprocessed.size()) {
for (size_t row = 1; row < preprocessed.size(); ++row) {
size_t n_slices = preprocessed.at(row).size();
for (size_t col = 0; col < n_slices; ++col) {
const clip_image_f32& elem = preprocessed.at(row).at(col);
std::copy(elem.buf.begin(), elem.buf.end(), batched_data + ((row - 1) * n_slices + col + 1) * 3 * max_h * max_w);
img_h = elem.ny;
img_w = elem.nx;
ov::Tensor clip_img{ov::element::f32, {1, channels, img_h, img_w}};
clip_data = clip_img.data<float>();
std::copy(elem.buf.begin(), elem.buf.end(), clip_data);

ov::Tensor clip_pixel_values = preprocess_for_encoder(clip_img, patch_size);

d3_clip_pixel = clip_pixel_values.get_shape().at(3);
clip_value_data = clip_pixel_values.data<float>();
pixel_value_data = pixel_values.data<float>() + batch_pixel * channels * patch_size * d3_all_pixel;
for (size_t c_idx = 0; c_idx < channels; ++c_idx) {
for (size_t k_idx = 0; k_idx < patch_size; k_idx++) {
std::copy(clip_value_data, clip_value_data + d3_clip_pixel, pixel_value_data);
clip_value_data += d3_clip_pixel;
pixel_value_data += d3_all_pixel;
}
}
batch_pixel++;
}
}
}
ov::Tensor pixel_values = preprocess_for_encoder(batched_images, patch_size);
encoder.set_tensor("pixel_values", pixel_values);

ov::Tensor patch_attention_mask{ov::element::f32, {pixel_values.get_shape().at(0), 1, max_h / patch_size * max_w / patch_size}};
Expand Down
Loading