Add support for de/serializing list-encoded JSON structs [#6558]
Currently, a StructArray can only be deserialized from or serialized to
a JSON object (e.g. `{a: 1, b: "c"}`), but some services (e.g. Presto
and Trino) encode ROW types as JSON lists (e.g. `[1, "c"]`), which is
more compact because the schema is already known.

This PR adds the ability to encode StructArrays to, and decode them from,
JSON lists when StructMode is set to ListOnly. In ListOnly mode,
object-encoded structs raise an error. Setting ObjectOnly (the default)
preserves the original parsing behavior. A short usage sketch follows the
notes below.

Some notes/questions/points for discussion:
1. I've made a JsonParseMode struct instead of a bool flag for two
   reasons.  One is that it's self-descriptive (what would `true` be?),
   and the other is that it allows a future Mixed mode that could
   deserialize either.  The latter isn't currently requested by anyone.
2. I kept the error messages as similar to the old messages as possible.
   I considered having more specific error messages (like "Encountered a
   '[' when parsing a Struct, but the StructParseMode is ObjectOnly" or
   similar), but wanted to hear opinions before I went that route.
3. I'm not attached to any name/code-style/etc, so happy to modify to
   fit local conventions.

Fixes #6558
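
As a quick illustration of the feature described above (not part of the commit), here is a minimal sketch built around the `ReaderBuilder::with_struct_mode` and `WriterBuilder::with_struct_mode` calls exercised by the new test in `arrow-json/src/lib.rs`; the schema and rows are invented for the example:

```rust
use std::sync::Arc;

use arrow_json::writer::LineDelimited;
use arrow_json::{ReaderBuilder, StructMode, WriterBuilder};
use arrow_schema::{ArrowError, DataType, Field, Fields, Schema};

fn main() -> Result<(), ArrowError> {
    // Hypothetical schema: an Int32 column and a nested struct column.
    let schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Int32, true),
        Field::new(
            "r",
            DataType::Struct(Fields::from(vec![
                Field::new("b", DataType::Boolean, true),
                Field::new("c", DataType::Utf8, true),
            ])),
            true,
        ),
    ]));

    // List-encoded rows: values appear in schema order instead of as key/value pairs.
    let input = r#"[1,[true,"cat"]]
[2,[false,"dog"]]
"#;

    let reader = ReaderBuilder::new(schema)
        .with_struct_mode(StructMode::ListOnly)
        .build(input.as_bytes())?;

    // Write the decoded batches back out with the same list encoding.
    let mut out = Vec::new();
    let mut writer = WriterBuilder::new()
        .with_struct_mode(StructMode::ListOnly)
        .build::<_, LineDelimited>(&mut out);
    for batch in reader {
        writer.write(&batch?)?;
    }
    writer.finish()?;

    // Mirroring the new round-trip test, the output should reproduce the compact input.
    assert_eq!(std::str::from_utf8(&out).unwrap(), input);
    Ok(())
}
```
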
jagill committed Jan 11, 2025
1 parent 88fb923 commit e6e4031
Showing 7 changed files with 557 additions and 46 deletions.
arrow-json/src/lib.rs (96 additions, 0 deletions)
@@ -74,6 +74,34 @@ pub use self::writer::{ArrayWriter, LineDelimitedWriter, Writer, WriterBuilder};
use half::f16;
use serde_json::{Number, Value};

/// Specifies what is considered valid JSON when reading or writing
/// RecordBatches or StructArrays.
///
/// This enum controls which form(s) the Reader will accept and which form the
/// Writer will produce. For example, if the RecordBatch Schema is
/// `[("a", Int32), ("r", Struct([("b", Boolean), ("c", Utf8)]))]`
/// then a Reader with [`StructMode::ObjectOnly`] would read rows of the form
/// `{"a": 1, "r": {"b": true, "c": "cat"}}`, while one with [`StructMode::ListOnly`]
/// would read rows of the form `[1, [true, "cat"]]`. A Writer would produce
/// rows formatted similarly.
///
/// The list encoding is more compact if the schema is known, and is used by
/// tools such as
/// [Presto](https://prestodb.io/docs/current/develop/client-protocol.html#important-queryresults-attributes)
/// and [Trino](https://trino.io/docs/current/develop/client-protocol.html#important-queryresults-attributes).
///
/// When reading objects, the order of the keys does not matter. When reading
/// lists, the entries must match the number and order of the struct fields.
/// Map columns are not affected by this option.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
pub enum StructMode {
    #[default]
    /// Encode/decode structs as objects (e.g., {"a": 1, "b": "c"})
    ObjectOnly,
    /// Encode/decode structs as lists (e.g., [1, "c"])
    ListOnly,
}

/// Trait declaring any type that is serializable to JSON. This includes all primitive types (bool, i32, etc.).
pub trait JsonSerializable: 'static {
    /// Converts self into json value if its possible
@@ -156,4 +184,72 @@ mod tests {
        );
        assert_eq!(None, f32::NAN.into_json_value());
    }

    #[test]
    fn test_json_roundtrip_structs() {
        use crate::writer::LineDelimited;
        use arrow_schema::DataType;
        use arrow_schema::Field;
        use arrow_schema::Fields;
        use arrow_schema::Schema;
        use std::sync::Arc;

        let schema = Arc::new(Schema::new(vec![
            Field::new(
                "c1",
                DataType::Struct(Fields::from(vec![
                    Field::new("c11", DataType::Int32, true),
                    Field::new(
                        "c12",
                        DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)].into()),
                        false,
                    ),
                ])),
                false,
            ),
            Field::new("c2", DataType::Utf8, false),
        ]));

        {
            let object_input = r#"{"c1":{"c11":1,"c12":{"c121":"e"}},"c2":"a"}
{"c1":{"c12":{"c121":"f"}},"c2":"b"}
{"c1":{"c11":5,"c12":{"c121":"g"}},"c2":"c"}
"#
            .as_bytes();
            let object_reader = ReaderBuilder::new(schema.clone())
                .with_struct_mode(StructMode::ObjectOnly)
                .build(object_input)
                .unwrap();

            let mut object_output: Vec<u8> = Vec::new();
            let mut object_writer = WriterBuilder::new()
                .with_struct_mode(StructMode::ObjectOnly)
                .build::<_, LineDelimited>(&mut object_output);
            for batch_res in object_reader {
                object_writer.write(&batch_res.unwrap()).unwrap();
            }
            assert_eq!(object_input, &object_output);
        }

        {
            let list_input = r#"[[1,["e"]],"a"]
[[null,["f"]],"b"]
[[5,["g"]],"c"]
"#
            .as_bytes();
            let list_reader = ReaderBuilder::new(schema.clone())
                .with_struct_mode(StructMode::ListOnly)
                .build(list_input)
                .unwrap();

            let mut list_output: Vec<u8> = Vec::new();
            let mut list_writer = WriterBuilder::new()
                .with_struct_mode(StructMode::ListOnly)
                .build::<_, LineDelimited>(&mut list_output);
            for batch_res in list_reader {
                list_writer.write(&batch_res.unwrap()).unwrap();
            }
            assert_eq!(list_input, &list_output);
        }
    }
}
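
For contrast with the round-trip test above, a hypothetical sketch (not part of the commit) of the error case mentioned in the commit message: in ListOnly mode, an object-encoded row should be rejected. The exact error message is not asserted here.

```rust
use std::sync::Arc;

use arrow_json::{ReaderBuilder, StructMode};
use arrow_schema::{DataType, Field, Schema};

fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));

    // Object-encoded row, but the reader is configured to accept only list-encoded rows.
    let reader = ReaderBuilder::new(schema)
        .with_struct_mode(StructMode::ListOnly)
        .build(r#"{"a": 1}"#.as_bytes())
        .unwrap();

    // Per the commit message, decoding should fail rather than silently accept the object form.
    let result: Result<Vec<_>, _> = reader.collect();
    assert!(result.is_err());
}
```
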
arrow-json/src/reader/list_array.rs (3 additions, 0 deletions)
@@ -17,6 +17,7 @@

use crate::reader::tape::{Tape, TapeElement};
use crate::reader::{make_decoder, ArrayDecoder};
use crate::StructMode;
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
use arrow_array::OffsetSizeTrait;
use arrow_buffer::buffer::NullBuffer;
@@ -37,6 +38,7 @@ impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
        coerce_primitive: bool,
        strict_mode: bool,
        is_nullable: bool,
        struct_mode: StructMode,
    ) -> Result<Self, ArrowError> {
        let field = match &data_type {
            DataType::List(f) if !O::IS_LARGE => f,
@@ -48,6 +50,7 @@ impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
            coerce_primitive,
            strict_mode,
            field.is_nullable(),
            struct_mode,
        )?;

        Ok(Self {
arrow-json/src/reader/map_array.rs (4 additions, 0 deletions)
@@ -17,6 +17,7 @@

use crate::reader::tape::{Tape, TapeElement};
use crate::reader::{make_decoder, ArrayDecoder};
use crate::StructMode;
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
use arrow_buffer::buffer::NullBuffer;
use arrow_buffer::ArrowNativeType;
@@ -36,6 +37,7 @@ impl MapArrayDecoder {
        coerce_primitive: bool,
        strict_mode: bool,
        is_nullable: bool,
        struct_mode: StructMode,
    ) -> Result<Self, ArrowError> {
        let fields = match &data_type {
            DataType::Map(_, true) => {
@@ -59,12 +61,14 @@ impl MapArrayDecoder {
            coerce_primitive,
            strict_mode,
            fields[0].is_nullable(),
            struct_mode,
        )?;
        let values = make_decoder(
            fields[1].data_type().clone(),
            coerce_primitive,
            strict_mode,
            fields[1].is_nullable(),
            struct_mode,
        )?;

        Ok(Self {
