Skip to content

Commit

Permalink
Merge branch 'master' into tokenizer_pack_fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
mryzhov authored Dec 8, 2023
2 parents 9a69f33 + 11b8b72 commit 04e123f
Show file tree
Hide file tree
Showing 11 changed files with 461 additions and 70 deletions.
3 changes: 2 additions & 1 deletion modules/custom_operations/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@ dependencies = [
dev = [
"ruff",
"pytest",
"pytest_harvest"
]
transformers = [
"transformers[sentencepiece,tiktoken]"
"transformers[sentencepiece]"
]
tiktoken = [
"tiktoken"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ namespace openvino_extensions {
// Tensor destination will be reshaped according to the input data
template <typename BatchOfStrings>
void pack_strings(const BatchOfStrings& strings, ov::Tensor& destination) {
auto batch_size = strings.size();
size_t batch_size = strings.size();

// First run over all elements: calculate total memory required to hold all strings
size_t symbols_size = std::accumulate(
Expand All @@ -25,7 +25,7 @@ void pack_strings(const BatchOfStrings& strings, ov::Tensor& destination) {
destination.set_shape({total_size});

int32_t* pindices = reinterpret_cast<int32_t*>(destination.data<uint8_t>());
pindices[0] = batch_size;
pindices[0] = int32_t(batch_size);
pindices[1] = 0;
pindices += 2;
char* psymbols = reinterpret_cast<char*>(pindices + batch_size);
Expand All @@ -34,25 +34,25 @@ void pack_strings(const BatchOfStrings& strings, ov::Tensor& destination) {
for (const auto& str: strings) {
psymbols = std::copy(str.begin(), str.end(), psymbols);
current_symbols_pos += str.size();
*pindices = current_symbols_pos;
++pindices;
*pindices = int32_t(current_symbols_pos);
++pindices;
}
}

std::vector<std::string> unpack_strings(const ov::Tensor& source) {
int32_t length = source.get_byte_size();
size_t length = source.get_byte_size();
// check the format of the input bitstream representing the string tensor
OPENVINO_ASSERT(length >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor");
const int32_t* pindices = reinterpret_cast<const int32_t*>(source.data<const uint8_t>());
int32_t batch_size = pindices[0];
OPENVINO_ASSERT(length >= 4 + 4 + 4 * batch_size,
OPENVINO_ASSERT(int32_t(length) >= 4 + 4 + 4 * batch_size,
"Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices");
const int32_t* begin_ids = pindices + 1;
const int32_t* end_ids = pindices + 2;
const char* symbols = reinterpret_cast<const char*>(pindices + 2 + batch_size);

std::vector<std::string> result;
result.reserve(batch_size);
result.reserve(size_t(batch_size));
for (int32_t idx = 0; idx < batch_size; ++idx) {
result.emplace_back(symbols + begin_ids[idx], symbols + end_ids[idx]);
}
Expand Down
Loading

0 comments on commit 04e123f

Please sign in to comment.