From e5f8dd33d78a2c964f8d6bac895deb73a9be7aa6 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 12 Aug 2024 16:52:52 -0500 Subject: [PATCH] Update the java code to properly deal with lists being returned as strings (#16536) Recently some JSON parsing was updated so lists could be returned as strings. This updates the Java code so that, when cleaning up the results to match the desired schema, it can properly handle corner cases associated with lists and structs. Tests are covered in the Spark plugin, but I am happy to add some here if we really want to validate that part of this. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16536 --- java/src/main/java/ai/rapids/cudf/Table.java | 29 +++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 4e737451ed6..36e342cae13 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1084,7 +1084,12 @@ private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.Nest // The types don't match so just return the input unchanged... 
return DidViewChange.no(); } else { - String[] foundNames = children.getNames(); + String[] foundNames; + if (children == null) { + foundNames = new String[0]; + } else { + foundNames = children.getNames(); + } HashMap indices = new HashMap<>(); for (int i = 0; i < foundNames.length; i++) { indices.put(foundNames[i], i); @@ -1101,8 +1106,9 @@ private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.Nest for (int i = 0; i < columns.length; i++) { String neededColumnName = neededNames[i]; Integer index = indices.get(neededColumnName); + Schema childSchema = schema.getChild(i); if (index != null) { - if (schema.getChild(i).isStructOrHasStructDescendant()) { + if (childSchema.isStructOrHasStructDescendant()) { ColumnView child = cv.getChildColumnView(index); boolean shouldCloseChild = true; try { @@ -1131,8 +1137,23 @@ private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.Nest } } else { somethingChanged = true; - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); + if (types[i] == DType.LIST) { + try (Scalar s = Scalar.listFromNull(childSchema.getChild(0).asHostDataType())) { + columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); + } + } else if (types[i] == DType.STRUCT) { + int numStructChildren = childSchema.getNumChildren(); + HostColumnVector.DataType[] structChildren = new HostColumnVector.DataType[numStructChildren]; + for (int structChildIndex = 0; structChildIndex < numStructChildren; structChildIndex++) { + structChildren[structChildIndex] = childSchema.getChild(structChildIndex).asHostDataType(); + } + try (Scalar s = Scalar.structFromNull(structChildren)) { + columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); + } + } else { + try (Scalar s = Scalar.fromNull(types[i])) { + columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); + } } } }