From fba5836ba7a8f2a8d0ba31654fa993bd369811a5 Mon Sep 17 00:00:00 2001 From: Tal Davidi Date: Thu, 23 Feb 2023 10:18:06 -0800 Subject: [PATCH] Add interface to define a Row based serializer (#492) Summary: Pull Request resolved: https://github.com/facebookresearch/fbpcf/pull/492 # Background: Currently, in order to use UDP successfully, you must write carefully crafted code that takes all the rows of metadata for one side and packages them into a collection of bytes. Afterwards, the caller gets back a `SecString` object, which is a bit representation of all the bytes they passed in, minus the filtered-out rows. The user must then extract the corresponding bits for each column into separate MPC types. This process is cumbersome and error-prone: the two steps must be kept carefully matched, and any change to one of them can introduce a bug. # This Diff This diff defines the interface that the caller will use to pass in all their data for serialization before, and deserialization after, UDP. Step 1. Put all the data into an unordered map of column name to column data. Note that the variant alternative holding the data must match the type expected for the column (e.g. a uint32 column expects `std::vector<uint32_t>`, and an int64 vec column expects `std::vector<std::vector<int64_t>>`). Call `serializeDataAsBytesForUDP` to get a vector of byte vectors ready for UDP consumption. Step 2. Pass this data into the data processor portion of the UDP protocol. Get back a `SecString` that has the same per-row structure but no longer contains the filtered-out rows. Step 3. Call `deserializeUDPOutputIntoMPCTypes`. This returns an unordered map keyed by the same column names, whose values are the private MPC values deserialized from the `SecString`. The caller is in charge of unboxing the variants to the expected types. 
Reviewed By: haochenuw Differential Revision: D43366172 fbshipit-source-id: 93ac9751c77883e6ddddbecf950b36f7bf60c97d --- .../serialization/IRowStructureDefinition.h | 53 +++++++++++++++++++ ...onTest.cpp => ColumnSerializationTest.cpp} | 7 +-- 2 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 fbpcf/mpc_std_lib/unified_data_process/serialization/IRowStructureDefinition.h rename fbpcf/mpc_std_lib/unified_data_process/serialization/test/{SerializationTest.cpp => ColumnSerializationTest.cpp} (98%) diff --git a/fbpcf/mpc_std_lib/unified_data_process/serialization/IRowStructureDefinition.h b/fbpcf/mpc_std_lib/unified_data_process/serialization/IRowStructureDefinition.h new file mode 100644 index 00000000..4f7460f9 --- /dev/null +++ b/fbpcf/mpc_std_lib/unified_data_process/serialization/IRowStructureDefinition.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include "IColumnDefinition.h" +#include "fbpcf/frontend/BitString.h" + +namespace fbpcf::mpc_std_lib::unified_data_process::serialization { + +template +class IRowStructureDefinition { + public: + using SecString = frontend::BitString; + + using InputColumnDataType = std::variant< + std::vector, + std::vector, + std::vector, + std::vector, + std::vector>, + std::vector>, + std::vector>, + std::vector>>; + + virtual ~IRowStructureDefinition() = default; + + /* Returns the number of bytes to serialize a single row */ + virtual size_t getRowSizeBytes() const = 0; + + // Serialize each column's worth of data according to the structure + // definition. 
Each key must match the name of a column in the definition and + // the value contains the data for that column + virtual std::vector> serializeDataAsBytesForUDP( + const std::unordered_map& data, + int numRows) const = 0; + + // Following a run of the UDP protocol, deserialize the batched BitString + // containing encrypted columns into private MPC types. + virtual std::unordered_map< + std::string, + typename IColumnDefinition::DeserializeType> + deserializeUDPOutputIntoMPCTypes(const SecString& secretSharedData) const = 0; +}; + +} // namespace fbpcf::mpc_std_lib::unified_data_process::serialization diff --git a/fbpcf/mpc_std_lib/unified_data_process/serialization/test/SerializationTest.cpp b/fbpcf/mpc_std_lib/unified_data_process/serialization/test/ColumnSerializationTest.cpp similarity index 98% rename from fbpcf/mpc_std_lib/unified_data_process/serialization/test/SerializationTest.cpp rename to fbpcf/mpc_std_lib/unified_data_process/serialization/test/ColumnSerializationTest.cpp index 701bb3fc..f3866f29 100644 --- a/fbpcf/mpc_std_lib/unified_data_process/serialization/test/SerializationTest.cpp +++ b/fbpcf/mpc_std_lib/unified_data_process/serialization/test/ColumnSerializationTest.cpp @@ -103,7 +103,7 @@ static std::vector> deserializeAndRevealPackedBits( return rst; } -TEST(SerializationTest, IntegerColumnTest) { +TEST(ColumnSerializationTest, IntegerColumnTest) { auto factories = fbpcf::engine::communication::getInMemoryAgentFactory(2); auto schedulerFactory0 = @@ -160,7 +160,7 @@ TEST(SerializationTest, IntegerColumnTest) { testVectorEq(vals, rst); } -TEST(SerializationTest, ArrayColumnTest) { +TEST(ColumnSerializationTest, ArrayColumnTest) { auto factories = fbpcf::engine::communication::getInMemoryAgentFactory(2); auto schedulerFactory0 = @@ -235,7 +235,7 @@ TEST(SerializationTest, ArrayColumnTest) { } } -TEST(SerializationTest, PackedBitFieldColumnTest) { +TEST(ColumnSerializationTest, PackedBitFieldColumnTest) { auto factories = 
fbpcf::engine::communication::getInMemoryAgentFactory(2); auto schedulerFactory0 = @@ -334,4 +334,5 @@ TEST(erializationTest, ColumnTypeTest) { "col4", std::make_unique>("test"), 4); EXPECT_EQ(col6->getColumnType(), ColType::UInt32Vec); } + } // namespace fbpcf::mpc_std_lib::unified_data_process::serialization