From dfb7c113b9a0e931d40a65b8f3e51ab0295c2284 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Wed, 18 Dec 2024 22:35:43 -0500 Subject: [PATCH] Detect mismatches in begin and end tokens returned by JSON tokenizer FST (#17471) Addresses #15820 Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17471 --- cpp/src/io/fst/logical_stack.cuh | 6 ++++++ cpp/src/io/json/nested_json_gpu.cu | 10 ++++------ cpp/tests/io/json/json_test.cpp | 11 +++++++++++ 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index 0f1fc7d572b..98641f2c893 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -513,6 +513,12 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, stream)); } + // Check if the last element of d_kv_operations is 0. If not, then we have a problem. + if (num_symbols_in && !supports_reset_op) { + StackOpT last_symbol = d_kv_ops_current.element(num_symbols_in - 1, stream); + CUDF_EXPECTS(last_symbol.stack_level == 0, "The logical stack is not empty!"); + } + // Stable radix sort, sorting by stack level of the operations d_kv_operations_unsigned = cub::DoubleBuffer{ reinterpret_cast(d_kv_operations.Current()), diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index f1c2826c62a..30a28a1cf98 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1473,10 +1473,11 @@ void get_stack_context(device_span json_in, to_stack_op::start_state, stream); - auto stack_ops_bufsize = d_num_stack_ops.value(stream); + // Copy back to actual number of stack operations + auto num_stack_ops = d_num_stack_ops.value(stream); // Sequence of stack symbols and their position in the original input (sparse representation) - rmm::device_uvector stack_ops{stack_ops_bufsize, stream}; - rmm::device_uvector stack_op_indices{stack_ops_bufsize, stream}; + rmm::device_uvector stack_ops{num_stack_ops, stream}; + rmm::device_uvector stack_op_indices{num_stack_ops, stream}; // Run bracket-brace FST to retrieve starting positions of structs and lists json_to_stack_ops_fst.Transduce(json_in.begin(), @@ -1487,9 +1488,6 @@ void get_stack_context(device_span json_in, to_stack_op::start_state, stream); - // Copy back to actual number of stack operations - auto const num_stack_ops = d_num_stack_ops.value(stream); - // Stack operations with indices are converted to top of the stack for each character in the input if (stack_behavior == stack_behavior_t::ResetOnDelimiter) { fst::sparse_stack_op_to_top_of_stack( diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 37a750330fa..23ca5734ded 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -3450,4 +3450,15 @@ TEST_P(JsonCompressedIOTest, BasicJsonLines) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2, 3.3}}); } +TEST_F(JsonReaderTest, MismatchedBeginEndTokens) +{ + std::string data = R"({"not_valid": "json)"; + auto opts = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::FAIL) + .build(); + EXPECT_THROW(cudf::io::read_json(opts), cudf::logic_error); +} + CUDF_TEST_PROGRAM_MAIN()