Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Format] Add Opaque canonical extension type #41823

Closed
wants to merge 27 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -906,6 +906,7 @@ endif()
if(ARROW_JSON)
arrow_add_object_library(ARROW_JSON
extension/fixed_shape_tensor.cc
extension/opaque.cc
json/options.cc
json/chunked_builder.cc
json/chunker.cc
Expand Down
24 changes: 24 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "arrow/compute/kernels/scalar_cast_internal.h"
#include "arrow/compute/kernels/util_internal.h"
#include "arrow/scalar.h"
#include "arrow/type_fwd.h"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is "arrow/type_fwd.h" included here?

#include "arrow/util/bit_block_counter.h"
#include "arrow/util/float16.h"
#include "arrow/util/int_util.h"
Expand Down Expand Up @@ -865,6 +866,25 @@ std::shared_ptr<CastFunction> GetCastToHalfFloat() {
return func;
}

/// \brief TypeMatcher accepting any extension type whose storage type is null.
///
/// Used to register a cast kernel that only applies to extension types backed
/// by a null storage array.
struct NullExtensionTypeMatcher : public TypeMatcher {
  ~NullExtensionTypeMatcher() override = default;

  /// \brief Return true if `type` is an extension type with Type::NA storage.
  bool Matches(const DataType& type) const override {
    if (type.id() != Type::EXTENSION) {
      return false;
    }
    // The id check above is what makes this downcast valid.
    const auto& ext_type = static_cast<const ExtensionType&>(type);
    return ext_type.storage_id() == Type::NA;
  }

  /// \brief Human-readable description of what this matcher accepts.
  std::string ToString() const override { return "extension<storage_type: null>"; }

  /// \brief Two matchers are equal when both are NullExtensionTypeMatcher;
  /// the matcher carries no state to compare.
  bool Equals(const TypeMatcher& other) const override {
    return this == &other ||
           dynamic_cast<const NullExtensionTypeMatcher*>(&other) != nullptr;
  }
};

} // namespace

std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
Expand All @@ -875,6 +895,10 @@ std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
auto cast_null = std::make_shared<CastFunction>("cast_null", Type::NA);
DCHECK_OK(cast_null->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, null(),
OutputAllNull));
// Explicitly allow casting extension type with null backing array to null
paleolimbot marked this conversation as resolved.
Show resolved Hide resolved
DCHECK_OK(cast_null->AddKernel(
Type::EXTENSION, {InputType(std::make_shared<NullExtensionTypeMatcher>())}, null(),
OutputAllNull));
functions.push_back(cast_null);

functions.push_back(GetCastToInteger<Int8Type>("cast_int8"));
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/extension/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,10 @@ add_arrow_test(test
PREFIX
"arrow-fixed-shape-tensor")

# Test target built from opaque_test.cc, registered under the
# "arrow-extension-opaque" prefix.
add_arrow_test(test
SOURCES
opaque_test.cc
PREFIX
"arrow-extension-opaque")

arrow_install_all_headers("arrow/extension")
109 changes: 109 additions & 0 deletions cpp/src/arrow/extension/opaque.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/extension/opaque.h"

#include <sstream>

#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep
#include "arrow/util/logging.h"

#include <rapidjson/document.h>
#include <rapidjson/error/en.h>
#include <rapidjson/writer.h>

namespace arrow::extension {

std::string OpaqueType::ToString(bool show_metadata) const {
std::stringstream ss;
ss << "extension<" << this->extension_name()
<< "[storage_type=" << storage_type_->ToString() << ", type_name=" << type_name_
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

show_metadata passing into storage_type_->ToString()?

<< ", vendor_name=" << vendor_name_ << "]>";
return ss.str();
}

/// \brief Compare against another extension type.
///
/// Equal only if `other` has the same extension name and its storage type,
/// type name and vendor name all match this type's.
bool OpaqueType::ExtensionEquals(const ExtensionType& other) const {
  // The name check also guards the downcast below.
  if (other.extension_name() != extension_name()) {
    return false;
  }
  const auto& rhs = internal::checked_cast<const OpaqueType&>(other);
  if (!storage_type()->Equals(*rhs.storage_type())) {
    return false;
  }
  if (type_name() != rhs.type_name()) {
    return false;
  }
  return vendor_name() == rhs.vendor_name();
}

std::string OpaqueType::Serialize() const {
rapidjson::Document document;
document.SetObject();
rapidjson::Document::AllocatorType& allocator = document.GetAllocator();

rapidjson::Value type_name(rapidjson::StringRef(type_name_));
document.AddMember(rapidjson::Value("type_name", allocator), type_name, allocator);
rapidjson::Value vendor_name(rapidjson::StringRef(vendor_name_));
document.AddMember(rapidjson::Value("vendor_name", allocator), vendor_name, allocator);

rapidjson::StringBuffer buffer;
rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
document.Accept(writer);
return buffer.GetString();
}

/// \brief Rebuild an OpaqueType from serialized JSON metadata.
///
/// \param[in] storage_type The storage type to wrap.
/// \param[in] serialized_data JSON text; must be an object with string members
/// "type_name" and "vendor_name".
/// \return The reconstructed type, or Status::Invalid if the text is not valid
/// JSON, is not an object, or either member is missing or not a string.
Result<std::shared_ptr<DataType>> OpaqueType::Deserialize(
    std::shared_ptr<DataType> storage_type, const std::string& serialized_data) const {
  rapidjson::Document document;
  document.Parse(serialized_data.data(), serialized_data.length());
  if (document.HasParseError()) {
    return Status::Invalid("Invalid serialized JSON data for OpaqueType: ",
                           rapidjson::GetParseError_En(document.GetParseError()), ": ",
                           serialized_data);
  }
  if (!document.IsObject()) {
    return Status::Invalid("Invalid serialized JSON data for OpaqueType: not an object");
  }

  // Look each member up once; presence is checked for both before types so the
  // error precedence is: missing type_name, missing vendor_name, then type errors.
  const auto type_name_it = document.FindMember("type_name");
  if (type_name_it == document.MemberEnd()) {
    return Status::Invalid(
        "Invalid serialized JSON data for OpaqueType: missing type_name");
  }
  const auto vendor_name_it = document.FindMember("vendor_name");
  if (vendor_name_it == document.MemberEnd()) {
    return Status::Invalid(
        "Invalid serialized JSON data for OpaqueType: missing vendor_name");
  }

  const auto& type_name = type_name_it->value;
  const auto& vendor_name = vendor_name_it->value;
  if (!type_name.IsString()) {
    return Status::Invalid(
        "Invalid serialized JSON data for OpaqueType: type_name is not a string");
  }
  if (!vendor_name.IsString()) {
    return Status::Invalid(
        "Invalid serialized JSON data for OpaqueType: vendor_name is not a string");
  }

  // Build the strings with their explicit length: going through a bare
  // `const char*` would truncate at an embedded NUL (JSON "\u0000") and break
  // the round trip with Serialize().
  return opaque(std::move(storage_type),
                std::string(type_name.GetString(), type_name.GetStringLength()),
                std::string(vendor_name.GetString(), vendor_name.GetStringLength()));
}

/// \brief Wrap `data` in an OpaqueArray.
///
/// `data->type` is expected to be an "arrow.opaque" extension type; this is
/// verified with DCHECK assertions only.
std::shared_ptr<Array> OpaqueType::MakeArray(std::shared_ptr<ArrayData> data) const {
  // Check the type id first: it is what makes the cast to ExtensionType valid.
  DCHECK_EQ(data->type->id(), Type::EXTENSION);
  DCHECK_EQ("arrow.opaque",
            internal::checked_cast<const ExtensionType&>(*data->type).extension_name());
  std::shared_ptr<Array> array = std::make_shared<OpaqueArray>(data);
  return array;
}

/// \brief Create an OpaqueType wrapping `storage_type`, tagged with the given
/// external type name and vendor name.
std::shared_ptr<DataType> opaque(std::shared_ptr<DataType> storage_type,
                                 std::string type_name, std::string vendor_name) {
  // All three arguments are taken by value and moved into the new type.
  std::shared_ptr<DataType> type = std::make_shared<OpaqueType>(
      std::move(storage_type), std::move(type_name), std::move(vendor_name));
  return type;
}

} // namespace arrow::extension
69 changes: 69 additions & 0 deletions cpp/src/arrow/extension/opaque.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/extension_type.h"
#include "arrow/type.h"

namespace arrow::extension {

/// \brief Placeholder extension type ("arrow.opaque") for a type coming from
/// an external (usually non-Arrow) system that could not be interpreted.
///
/// Besides the storage type, it records the name of the type in the external
/// system and the name of that system.
class ARROW_EXPORT OpaqueType : public ExtensionType {
 public:
  /// \brief Construct an OpaqueType.
  ///
  /// \param[in] storage_type The underlying storage type. Should be
  /// arrow::null if there is no data.
  /// \param[in] type_name The name of the type in the external system.
  /// \param[in] vendor_name The name of the external system.
  explicit OpaqueType(std::shared_ptr<DataType> storage_type, std::string type_name,
                      std::string vendor_name)
      : ExtensionType(std::move(storage_type)),
        type_name_(std::move(type_name)),
        vendor_name_(std::move(vendor_name)) {}

  /// \brief The extension name, always "arrow.opaque".
  std::string extension_name() const override { return "arrow.opaque"; }

  /// \brief Human-readable description including the storage type, type name
  /// and vendor name.
  std::string ToString(bool show_metadata) const override;

  /// \brief True if `other` is an opaque type with equal storage type, type
  /// name and vendor name.
  bool ExtensionEquals(const ExtensionType& other) const override;

  /// \brief Serialize the type name and vendor name as a JSON object.
  std::string Serialize() const override;

  /// \brief Rebuild an OpaqueType from `storage_type` and JSON produced by
  /// Serialize(); returns an Invalid status on malformed input.
  Result<std::shared_ptr<DataType>> Deserialize(
      std::shared_ptr<DataType> storage_type,
      const std::string& serialized_data) const override;

  /// \brief Create an OpaqueArray from ArrayData.
  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;

  /// \brief The name of the type in the external system. The view refers to
  /// storage owned by this object.
  std::string_view type_name() const { return std::string_view(type_name_); }

  /// \brief The name of the external system. The view refers to storage owned
  /// by this object.
  std::string_view vendor_name() const { return std::string_view(vendor_name_); }

 private:
  std::string type_name_;
  std::string vendor_name_;
};

/// \brief Array class produced by OpaqueType::MakeArray: a wrapper for
/// (usually binary) data from an external (often non-Arrow) system that could
/// not be interpreted.
class ARROW_EXPORT OpaqueArray : public ExtensionArray {
public:
// Inherit the ExtensionArray constructors; this class adds no members.
using ExtensionArray::ExtensionArray;
};

/// \brief Return an OpaqueType instance.
///
/// \param[in] storage_type The underlying storage type.
/// \param[in] type_name The name of the type in the external system.
/// \param[in] vendor_name The name of the external system.
ARROW_EXPORT std::shared_ptr<DataType> opaque(std::shared_ptr<DataType> storage_type,
std::string type_name,
std::string vendor_name);

} // namespace arrow::extension
Loading
Loading