Skip to content

Commit

Permalink
Update vision_encoder.cpp
Browse files Browse the repository at this point in the history
Modified the visual image functions that convert NCHW tensors to the N x C x kernel x (H*W/kernel) layout, so that they match the Python implementation
  • Loading branch information
wenyi5608 authored Dec 6, 2024
1 parent c2a8f97 commit 866985b
Showing 1 changed file with 51 additions and 10 deletions.
61 changes: 51 additions & 10 deletions src/cpp/src/visual_language/vision_encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -309,35 +309,76 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
std::vector<std::vector<clip_image_u8>> imgs = ::slice_image(source, max_slice_nums, scale_resolution, patch_size, never_split);
std::vector<std::vector<ov::Tensor>> results;
std::vector<std::vector<ImageSize>> sizes;
const size_t channels = 3;

std::vector<std::vector<clip_image_f32>> preprocessed{imgs.size()};
size_t max_h = 0, max_w = 0, n_images = 0;
std::transform(imgs.begin(), imgs.end(), preprocessed.begin(), [&ctx_clip, &max_h, &max_w, &n_images](const std::vector<clip_image_u8>& row) {
size_t max_h = 0, max_w = 0, n_images = 0, max_size = 0;
std::transform(imgs.begin(), imgs.end(), preprocessed.begin(), [&ctx_clip, &max_h, &max_w, &max_size, &n_images](const std::vector<clip_image_u8>& row) {
std::vector<clip_image_f32> processed_row{row.size()};
std::transform(row.begin(), row.end(), processed_row.begin(), [&ctx_clip, &max_h, &max_w, &n_images](const clip_image_u8& raw) {
std::transform(row.begin(), row.end(), processed_row.begin(), [&ctx_clip, &max_h, &max_w, &max_size, &n_images](const clip_image_u8& raw) {
clip_image_f32 im = clip_image_preprocess(ctx_clip, raw);
max_h = std::max(size_t(im.ny), max_h);
max_w = std::max(size_t(im.nx), max_w);
if (size_t(im.ny) * size_t(im.nx) > max_size) {
max_size = size_t(im.ny) * size_t(im.nx);
max_h = size_t(im.ny);
max_w = size_t(im.nx);
}
++n_images;
return im;
});
return processed_row;
});

ov::Tensor batched_images{ov::element::f32, {n_images, 3, max_h, max_w}};
float* batched_data = batched_images.data<float>();
ov::Tensor pixel_values{ov::element::f32, {n_images, channels, patch_size, max_size / patch_size}};
size_t d3_all_pixel = pixel_values.get_shape().at(3);
float* pixel_value_data = pixel_values.data<float>();

// Convert each CHW image to the 1 x C x kernel x (H*W/kernel) layout and pad with zeros
const clip_image_f32& resized_preprocessed = preprocessed.at(0).at(0);
std::copy(resized_preprocessed.buf.begin(), resized_preprocessed.buf.end(), batched_data);
size_t img_h = resized_preprocessed.ny;
size_t img_w = resized_preprocessed.nx;
ov::Tensor clip_img{ov::element::f32, {1, channels, img_h, img_w}};
float* clip_data = clip_img.data<float>();
std::copy(resized_preprocessed.buf.begin(), resized_preprocessed.buf.end(), clip_data);
ov::Tensor clip_pixel_values = preprocess_for_encoder(clip_img, patch_size);

float* clip_value_data = clip_pixel_values.data<float>();
size_t batch_pixel = 1;
size_t d3_clip_pixel = clip_pixel_values.get_shape().at(3);
for (size_t c_idx = 0; c_idx < channels; ++c_idx) {
for (size_t k_idx = 0; k_idx < patch_size; k_idx++) {
std::copy(clip_value_data, clip_value_data + d3_clip_pixel, pixel_value_data);
clip_value_data += d3_clip_pixel;
pixel_value_data += d3_all_pixel;
}
}

if (1 < preprocessed.size()) {
for (size_t row = 1; row < preprocessed.size(); ++row) {
size_t n_slices = preprocessed.at(row).size();
for (size_t col = 0; col < n_slices; ++col) {
const clip_image_f32& elem = preprocessed.at(row).at(col);
std::copy(elem.buf.begin(), elem.buf.end(), batched_data + ((row - 1) * n_slices + col + 1) * 3 * max_h * max_w);
img_h = elem.ny;
img_w = elem.nx;
ov::Tensor clip_img{ov::element::f32, {1, channels, img_h, img_w}};
clip_data = clip_img.data<float>();
std::copy(elem.buf.begin(), elem.buf.end(), clip_data);

ov::Tensor clip_pixel_values = preprocess_for_encoder(clip_img, patch_size);

d3_clip_pixel = clip_pixel_values.get_shape().at(3);
clip_value_data = clip_pixel_values.data<float>();
pixel_value_data = pixel_values.data<float>() + batch_pixel * channels * patch_size * d3_all_pixel;
for (size_t c_idx = 0; c_idx < channels; ++c_idx) {
for (size_t k_idx = 0; k_idx < patch_size; k_idx++) {
std::copy(clip_value_data, clip_value_data + d3_clip_pixel, pixel_value_data);
clip_value_data += d3_clip_pixel;
pixel_value_data += d3_all_pixel;
}
}
batch_pixel++;
}
}
}
ov::Tensor pixel_values = preprocess_for_encoder(batched_images, patch_size);
encoder.set_tensor("pixel_values", pixel_values);

ov::Tensor patch_attention_mask{ov::element::f32, {pixel_values.get_shape().at(0), 1, max_h / patch_size * max_w / patch_size}};
Expand Down

0 comments on commit 866985b

Please sign in to comment.