Skip to content

Commit

Permalink
Add invalid utf8 test
Browse files Browse the repository at this point in the history
  • Loading branch information
rok committed Aug 30, 2024
1 parent 680197e commit 5bbbd5c
Show file tree
Hide file tree
Showing 9 changed files with 59 additions and 29 deletions.
6 changes: 3 additions & 3 deletions cpp/src/arrow/extension/fixed_shape_tensor_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ TEST_F(TestExtensionType, RoundtripBatch) {
std::shared_ptr<RecordBatch> read_batch;
auto ext_field = field(/*name=*/"f0", /*type=*/ext_type_);
auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr});
RoundtripBatch(batch, &read_batch);
ASSERT_OK(RoundtripBatch(batch, &read_batch));
CompareBatch(*batch, *read_batch, /*compare_metadata=*/true);

// Pass extension metadata and storage array, expect getting back extension array
Expand All @@ -216,7 +216,7 @@ TEST_F(TestExtensionType, RoundtripBatch) {
ext_field = field(/*name=*/"f0", /*type=*/element_type_, /*nullable=*/true,
/*metadata=*/ext_metadata);
auto batch2 = RecordBatch::Make(schema({ext_field}), fsla_arr->length(), {fsla_arr});
RoundtripBatch(batch2, &read_batch2);
ASSERT_OK(RoundtripBatch(batch2, &read_batch2));
CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true);
}

Expand Down Expand Up @@ -469,7 +469,7 @@ TEST_F(TestExtensionType, RoundtripBatchFromTensor) {
auto ext_field = field("f0", ext_type_, true, ext_metadata);
auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr});
std::shared_ptr<RecordBatch> read_batch;
RoundtripBatch(batch, &read_batch);
ASSERT_OK(RoundtripBatch(batch, &read_batch));
CompareBatch(*batch, *read_batch, /*compare_metadata=*/true);
}

Expand Down
35 changes: 32 additions & 3 deletions cpp/src/arrow/extension/json_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "arrow/ipc/test_common.h"
#include "arrow/record_batch.h"
#include "arrow/testing/gtest_util.h"
#include "parquet/exception.h"

namespace arrow {

Expand All @@ -43,14 +44,28 @@ std::shared_ptr<Array> ExampleJson(const std::shared_ptr<DataType>& storage_type
return ExtensionType::WrapArray(arrow::extension::json(storage_type), arr);
}

static std::shared_ptr<Array> ExampleJsonInvalidUTF8(
const std::shared_ptr<DataType>& storage_type) {
return ArrayFromJSON(storage_type,
"["
R"(
"Hi",
"olá mundo",
"你好世界",
"",
)"
"\"\xa0\xa1\""
"]");
}

TEST_F(TestJsonExtensionType, JsonRoundtrip) {
for (const auto& storage_type : {utf8(), large_utf8(), utf8_view()}) {
auto ext_arr = ExampleJson(storage_type);

std::shared_ptr<Array> ext_arr = ExampleJson(storage_type);
auto batch =
RecordBatch::Make(schema({field("f0", json(storage_type))}), 8, {ext_arr});

std::shared_ptr<RecordBatch> read_batch;
RoundtripBatch(batch, &read_batch);
ASSERT_OK(RoundtripBatch(batch, &read_batch));
ASSERT_OK(read_batch->ValidateFull());
CompareBatch(*batch, *read_batch, /*compare_metadata*/ true);

Expand All @@ -60,4 +75,18 @@ TEST_F(TestJsonExtensionType, JsonRoundtrip) {
}
}

TEST_F(TestJsonExtensionType, InvalidUTF8) {
for (const auto& storage_type : {utf8(), large_utf8(), utf8_view()}) {
std::shared_ptr<Array> ext_arr = ExampleJsonInvalidUTF8(storage_type);
auto batch =
RecordBatch::Make(schema({field("f0", json(storage_type))}), 8, {ext_arr});
;

std::shared_ptr<RecordBatch> read_batch;
ASSERT_RAISES_WITH_MESSAGE(IOError,
"IOError: Array length did not match record batch length",
RoundtripBatch(batch, &read_batch));
}
}

} // namespace arrow
4 changes: 2 additions & 2 deletions cpp/src/arrow/extension/uuid_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ TEST(TestUuuidExtensionType, RoundtripBatch) {
std::shared_ptr<RecordBatch> read_batch;
auto ext_field = field(/*name=*/"f0", /*type=*/ext_type);
auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr});
RoundtripBatch(batch, &read_batch);
ASSERT_OK(RoundtripBatch(batch, &read_batch));
CompareBatch(*batch, *read_batch, /*compare_metadata=*/true);

// Pass extension metadata and storage array, expect getting back extension array
Expand All @@ -65,7 +65,7 @@ TEST(TestUuuidExtensionType, RoundtripBatch) {
ext_field = field(/*name=*/"f0", /*type=*/exact_ext_type->storage_type(),
/*nullable=*/true, /*metadata=*/ext_metadata);
auto batch2 = RecordBatch::Make(schema({ext_field}), arr->length(), {arr});
RoundtripBatch(batch2, &read_batch2);
ASSERT_OK(RoundtripBatch(batch2, &read_batch2));
CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true);
}

Expand Down
6 changes: 3 additions & 3 deletions cpp/src/arrow/extension_type_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -219,14 +219,14 @@ TEST_F(TestExtensionType, IpcRoundtrip) {
auto batch = RecordBatch::Make(schema({field("f0", uuid())}), 4, {ext_arr});

std::shared_ptr<RecordBatch> read_batch;
RoundtripBatch(batch, &read_batch);
ASSERT_OK(RoundtripBatch(batch, &read_batch));
CompareBatch(*batch, *read_batch, false /* compare_metadata */);

// Wrap type in a ListArray and ensure it also makes it
auto offsets_arr = ArrayFromJSON(int32(), "[0, 0, 2, 4]");
ASSERT_OK_AND_ASSIGN(auto list_arr, ListArray::FromArrays(*offsets_arr, *ext_arr));
batch = RecordBatch::Make(schema({field("f0", list(uuid()))}), 3, {list_arr});
RoundtripBatch(batch, &read_batch);
ASSERT_OK(RoundtripBatch(batch, &read_batch));
CompareBatch(*batch, *read_batch, false /* compare_metadata */);
}

Expand Down Expand Up @@ -289,7 +289,7 @@ TEST_F(TestExtensionType, ParametricTypes) {
4, {p1, p2, p3, p4});

std::shared_ptr<RecordBatch> read_batch;
RoundtripBatch(batch, &read_batch);
ASSERT_OK(RoundtripBatch(batch, &read_batch));
CompareBatch(*batch, *read_batch, false /* compare_metadata */);
}

Expand Down
17 changes: 9 additions & 8 deletions cpp/src/arrow/ipc/test_common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1236,18 +1236,19 @@ Status MakeRandomTensor(const std::shared_ptr<DataType>& type,
return Tensor::Make(type, buf, shape, strides).Value(out);
}

void RoundtripBatch(const std::shared_ptr<RecordBatch>& batch,
std::shared_ptr<RecordBatch>* out) {
ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create());
ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(),
out_stream.get()));
Status RoundtripBatch(const std::shared_ptr<RecordBatch>& batch,
std::shared_ptr<RecordBatch>* out) {
ARROW_ASSIGN_OR_RAISE(auto out_stream, io::BufferOutputStream::Create());
RETURN_NOT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(),
out_stream.get()));

ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish());
ARROW_ASSIGN_OR_RAISE(auto complete_ipc_stream, out_stream->Finish());

io::BufferReader reader(complete_ipc_stream);
std::shared_ptr<RecordBatchReader> batch_reader;
ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader));
ASSERT_OK(batch_reader->ReadNext(out));
ARROW_ASSIGN_OR_RAISE(batch_reader, ipc::RecordBatchStreamReader::Open(&reader));
RETURN_NOT_OK(batch_reader->ReadNext(out));
return Status::OK();
}

} // namespace test
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/ipc/test_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,8 @@ Status MakeRandomTensor(const std::shared_ptr<DataType>& type,
const std::vector<int64_t>& shape, bool row_major_p,
std::shared_ptr<Tensor>* out, uint32_t seed = 0);

ARROW_TESTING_EXPORT void RoundtripBatch(const std::shared_ptr<RecordBatch>& batch,
std::shared_ptr<RecordBatch>* out);
ARROW_TESTING_EXPORT Status RoundtripBatch(const std::shared_ptr<RecordBatch>& batch,
std::shared_ptr<RecordBatch>* out);

} // namespace test
} // namespace ipc
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1463,8 +1463,8 @@ TEST_F(TestJsonParquetIO, JsonExtension) {
::parquet::ArrowReaderProperties reader_properties;
reader_properties.set_arrow_extensions_enabled(true);
// TODO: should be json_array, json_array
this->RoundTripSingleColumn(json_array, json_string_array,
default_arrow_writer_properties(), reader_properties);
this->RoundTripSingleColumn(json_array, json_array, default_arrow_writer_properties(),
reader_properties);

// When the original Arrow schema is stored, the stored Arrow type is always respected.
const auto writer_properties =
Expand Down
5 changes: 2 additions & 3 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -783,7 +783,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {

std::shared_ptr<KeyValueMetadata> metadata;
ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata));
ASSERT_OK(ConvertSchema(parquet_fields, metadata));
ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
CheckFlatSchema(arrow_schema, true /* check_metadata */);
}

Expand All @@ -803,8 +803,7 @@ TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {
std::shared_ptr<KeyValueMetadata> metadata;
ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata));
ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
// TODO
// CheckFlatSchema(arrow_schema, true /* check_metadata */);
CheckFlatSchema(arrow_schema, true /* check_metadata */);
}
}

Expand Down
7 changes: 4 additions & 3 deletions cpp/src/parquet/arrow/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -430,8 +430,10 @@ Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
auto ext_type = std::static_pointer_cast<::arrow::ExtensionType>(field->type());
// Built-in JSON extension is handled differently.
if (ext_type->extension_name() == std::string("arrow.json")) {
// Set physical and logical types and instantiate primitive node.
type = ParquetType::BYTE_ARRAY;
logical_type = LogicalType::JSON();
break;
}
std::shared_ptr<::arrow::Field> storage_field = ::arrow::field(
name, ext_type->storage_type(), field->nullable(), field->metadata());
Expand Down Expand Up @@ -994,7 +996,7 @@ Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer
if (origin_type->id() == ::arrow::Type::EXTENSION) {
const auto& ex_type = checked_cast<const ::arrow::ExtensionType&>(*origin_type);
if (inferred_type->id() != ::arrow::Type::EXTENSION &&
ex_type.extension_name() == ::arrow::extension::JsonExtensionType::type_name() &&
ex_type.extension_name() == std::string("arrow.json") &&
(inferred_type->id() == ::arrow::Type::STRING ||
inferred_type->id() == ::arrow::Type::LARGE_STRING ||
inferred_type->id() == ::arrow::Type::STRING_VIEW)) {
Expand Down Expand Up @@ -1023,8 +1025,7 @@ Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer
} else {
if (inferred_type->id() == ::arrow::Type::EXTENSION) {
const auto& ex_type = checked_cast<const ::arrow::ExtensionType&>(*inferred_type);
if (ex_type.extension_name() ==
::arrow::extension::JsonExtensionType::type_name()) {
if (ex_type.extension_name() == std::string("arrow.json")) {
// Schema mismatch.
//
// Arrow extensions are ENABLED in Parquet.
Expand Down

0 comments on commit 5bbbd5c

Please sign in to comment.